# Burrows-Wheeler index

• Burrows-Wheeler変換(BWT)はBWTに基づくインデックス方法を用いる
• 大規模テキストの検索に向く
• 少メモリで実行可能なことを特徴とする
• 圧縮技術・塩基配列処理に応用されている
• BWインデックスを説明する
• 文字列"acaacg"にBWインデックスを付ける
• "a,t,g,c"のうち、"t"が入っていない
• "acaacg"の末尾に"$"をつけて"acaacg$"を作る
• "acaacg$"という７文字列の先頭を"$->a->c->a->a->c->g"と変えてやって、文字列の長さの数だけ異なる配列を作る

• 文字列長ｘ文字列長の行列になっている
• この行列を行に関して、辞書的に並べ替える。ただし、"$"は、どの文字よりも先頭に来るような順序で扱う • この行列の最右列の文字列"gc$aaac"が「作成された文字列」とする
• "acaacg$"から"gc$aaac"がBW変換
#適当にatgc文字列を作る
L<-c("a","t","g","c")
n<-6
T<-sample(L,n,replace=TRUE)
T
# "$"を付加する T2<-c(T,"$")
# T2の巡回文字列を作る
M<-matrix(0,n+1,n+1)
T2<-rep(T2,2)

for(i in 1:length(M[,1])){
M[i,]<-T2[i:(n+i)]
}
M
# 辞書式ソートをするために、文字列扱いにする
M2<-NULL
for(i in 1:length(M[,1])){
M2[i]<-""
for(j in 1:(n+1)){
M2[i]<-paste(M2[i],M[i,j],sep="")
}
}
M2
# 辞書式順序を作る
sortlist<-order(M2)
# 辞書式順序で並び替える
sortlist<-order(M2)
sortedM<-M[sortlist,]
sortedM
# 必要なのは、左端と右端
L<-sortedM[,1]
R<-sortedM[,n+1]
L
R

> T
[1] "g" "c" "a" "c" "c" "c"
> M
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] "g"  "c"  "a"  "c"  "c"  "c"  "$" [2,] "c" "a" "c" "c" "c" "$"  "g"
[3,] "a"  "c"  "c"  "c"  "$" "g" "c" [4,] "c" "c" "c" "$"  "g"  "c"  "a"
[5,] "c"  "c"  "$" "g" "c" "a" "c" [6,] "c" "$"  "g"  "c"  "a"  "c"  "c"
[7,] "$" "g" "c" "a" "c" "c" "c" > M2 [1] "gcaccc$" "caccc$g" "accc$gc" "ccc$gca" "cc$gcac" "c$gcacc" "$gcaccc"
> sortedM
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] "$" "g" "c" "a" "c" "c" "c" [2,] "a" "c" "c" "c" "$"  "g"  "c"
[3,] "c"  "$" "g" "c" "a" "c" "c" [4,] "c" "a" "c" "c" "c" "$"  "g"
[5,] "c"  "c"  "$" "g" "c" "a" "c" [6,] "c" "c" "c" "$"  "g"  "c"  "a"
[7,] "g"  "c"  "a"  "c"  "c"  "c"  "$" > L [1] "$" "a" "c" "c" "c" "c" "g"
> R
[1] "c" "c" "c" "g" "c" "a" "\$"