尤度比(kinship relationのために) - ryamadaの遺伝学・遺伝統計学メモ

911WTCでも使われたソフトウェア→こちら
kinshipパッケージのPDF解説文書→こちら
forensicパッケージのPDF解説文書→こちら
paramlinkパッケージのPDF解説文書→こちら

MakeFamilyGraph<-function(d){
	
}
# kinship パッケージで家系図を描く
library(kinship)
par(mfcol=c(1,2))
test1 <- data.frame(id  =c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), mom =c(0, 0, 0, 0, 2, 2, 4, 4, 6,  2,  0,  0, 12, 13), dad =c(0, 0, 0, 0, 1, 1, 3, 3, 3,  7,  0,  0, 11, 10),sex =c(0, 1, 0, 1, 0, 1, 0, 1, 0,  0,  0,  1,  1,  1))
affected<-sample(c(0,1),length(test1$sex),replace=TRUE)
ptemp<-pedigree(id=test1$id,dadid=test1$dad,momid=test1$mom,sex=test1$sex,affected=affected)
plot(ptemp)

# 親が特定できていない個人の両親を架空のペアの子として並べて登録しなおす

test1<-as.matrix(test1)
maxid<-max(test1[,1])
fakem<-maxid+1
fakef<-fakem+1
test2<-matrix(0,length(test1[,1])+2,length(test1[1,]))
test2[,1]<-c(test1[,1],fakem,fakef)
test2[,2]<-c(test1[,2],0,0)
test2[which(test1[,2]==0),2]<-fakem
test2[,3]<-c(test1[,3],0,0)
test2[which(test1[,3]==0),3]<-fakef
test2[,4]<-c(test1[,4],1,0)

affected<-c(affected,0,0)
ptemp2<-pedigree(id=test2[,1],dadid=test2[,3],momid=test2[,2],sex=test2[,4],affected=affected)
plot(ptemp2)

par(mfcol=c(1,1))

# NL個のローカスに、リピート数アレルとその確率を適当に与える
NL<-15

Alleles<-NULL
Probs<-NULL
library(MCMCpack)
for(i in 1:NL){
	na<-sample(3:8,1)
	st<-sample(10:20,1)
	tmp<-NULL
	for(j in 1:na){
		a<-paste("",st+j,sep="")
		tmp<-c(tmp,a)
	}
	Alleles[[i]]<-tmp
	Probs[[i]]<-c(rdirichlet(1,rep(1,na)))
	print(sum(Probs[[i]]))
}
Alleles
Probs

家系図を元に適当にジェノタイプを与えてみよう

# 家系図を見ながら、ジェノタイプを割り当てよう

RandomGenotypeFamily<-function(d,n,Alleles,Probs){
	ns<-length(d[,1])
	g<-array(0,c(ns,2,n))
	noparents<-which(d[,2]==0,d[,3]==0)
	for(i in noparents){
		for(j in 1:n){
			g[i,,j]<-sample(Alleles[[j]],2,replace=TRUE,Probs[[j]])
		}
	}
	given<-rep(0,ns)
	given[noparents]<-1
	loop<-TRUE
	if(sum(given)==ns)loop<-FALSE
	while(loop){
		yet<-which(given==0)
		for(i in yet){
			prt1<-d[i,2]
			prt2<-d[i,3]
			if(given[prt1]+given[prt2]==2){
				for(j in 1:n){
					g[i,1,j]<-sample(g[prt1,,j],1)
					g[i,2,j]<-sample(g[prt2,,j],1)
				}
				given[i]<-1
			}
		}
		if(sum(given)==ns)loop<-FALSE
	}
	
	#g[noparents,,]<--9
	g
}
G<-RandomGenotypeFamily(test1,NL,Alleles,Probs)

> G
, , 1

      [,1] [,2]
 [1,] "13" "15"
 [2,] "15" "15"
 [3,] "13" "15"
 [4,] "15" "15"
 [5,] "15" "13"
 [6,] "15" "15"
 [7,] "15" "15"
 [8,] "15" "13"
 [9,] "15" "15"
[10,] "15" "15"
[11,] "15" "15"
[12,] "15" "15"
[13,] "15" "15"
[14,] "15" "15"

, , 2

      [,1] [,2]
 [1,] "17" "17"
 [2,] "16" "17"
 [3,] "17" "16"
 [4,] "18" "16"
 [5,] "17" "17"
 [6,] "16" "17"
...

ランダムにHWE集団のジェノタイプのサンプルプールを作成しよう

# ランダムにジェノタイプを発生しよう
RandomGenotype<-function(N,n,Alleles,Probs){
	ns<-N
	g<-array(0,c(ns,2,n))
	noparents<-1:ns
	for(i in noparents){
		for(j in 1:n){
			g[i,,j]<-sample(Alleles[[j]],2,replace=TRUE,Probs[[j]])
		}
	}
	g
}
Np<-20
Gpool<-RandomGenotype(Np,NL,Alleles,Probs)

サンプルプールに全部対応するとは限らない

# 調べたい個人が何人か、いる
# そのうち、１人ずつだけをサンプルプールに対応づけるか
# 2,3,...人ずつ対応付けるかを考慮する必要がある
# そのパターンを決める
MakeSearchPattern<-function(v){
	library(gtools)
	nt<-length(v)
	x<-0
	X<-NULL
	for(i in 1:nt){
		tmp<-combinations(nt,i)
		tmp2<-tmp
		tmp2[1:length(tmp2)]<-c(v[tmp])
		X[[i]]<-tmp2
		x<-sum(x,length(X[[i]][,1]))
	}
	list(X=X,x=x)
}



tobeSearched<-which(Type==2)
SearchPattern<-MakeSearchPattern(tobeSearched)

> SearchPattern
$X
$X[[1]]
     [,1]
[1,]    3
[2,]    7

$X[[2]]
     [,1] [,2]
[1,]    3    7


$x
[1] 3

探索パターン
- 複数の探索個人をプールしたサンプルに対応付けようとするとき、
- 探索個人はサンプルプールに複数の候補があるとする
- 複数の探索個人が、同一のサンプルを候補とする場合もあるとする
- そのときに、対応付けにあっては、探索個人は異なるサンプルに対応付ける必要があるので、
- 探索パターンは、複数の探索個人の候補に関する積パターンのうち、同一サンプルを候補に挙げている場合を除いたものとなる
- それを作る関数が以下

# それぞれのtargetsには候補者がいる
targets<-c(2,6)
candidates<-NULL
for(i in 1:length(targets)){
	candidates[[i]]<-sample(1:Npool,3)
}
candidates[[1]][1]<-candidates[[2]][1]
# ある候補者が複数から指名されていたら、テストしない
MakeTestPattern<-function(candidates){
	gr<-expand.grid(candidates)
	n<-length(gr[,1])
	ok<-rep(1,n)
	for(i in 1:n){
		tmp<-unlist(gr[i,])
		tmp2<-outer(tmp,tmp,"-")
		diag(tmp2)<-1
		if(prod(tmp2)==0){
			ok[i]<-0
		}
	}
	gr[which(ok==1),]
}
MakeTestPattern(candidates)

実際には、探索したい個人のリストがあり、それぞれの個人をプールのどれかに対応付けたいのでMakeSearchPattern()とMakeTestPattern()を組み合わせて探索することになる

# SearchPatternごとに
# TestPatternを決めて、それぞれテストする

for(i in 1:length(SearchPattern$X)){
	for(j in 1:length(SearchPattern$X[[i]][,1])){
		numtested<-length(SearchPattern$X[[i]][j,])
		tmpcandidates<-NULL
		ToTestCurrent<-SearchPattern$X[[i]][j,]
		for(k in 1:numtested){
			selectedToBeTested<-which(tobeSearched==SearchPattern$X[[i]][j,k])
			
			tmpcandidates[[k]]<-candidates[[selectedToBeTested]]
		}
		tmpTestPattern<-MakeTestPattern(tmpcandidates)
		print("to be tested IDs in pedigree")
		print(ToTestCurrent)
		print("to be tested sampleIDs in pool")
		print(tmpTestPattern)
	}
}

これを実行すると

[1] "to be tested IDs in pedigree"
[1] 3
[1] "to be tested sampleIDs in pool"
     [,1]
[1,]   14
[2,]   27
[3,]   26
[1] "to be tested IDs in pedigree"
[1] 7
[1] "to be tested sampleIDs in pool"
     [,1]
[1,]   14
[2,]   34
[3,]   30
[1] "to be tested IDs in pedigree"
[1] 3 7
[1] "to be tested sampleIDs in pool"
  Var1 Var2
2   27   14
3   26   14
4   14   34
5   27   34
6   26   34
7   14   30
8   27   30
9   26   30

library(kinship)
# kinship パッケージで家系図を描く
p<-matrix(
c(1:8,
  0,0,1,0,3,3,3,3,
  0,0,2,0,4,4,4,4,
  1,0,1,0,1,1,0,0,
  1,1,2,1,1,1,2,1),
  ncol=5)

MakePedigreeFromFamilyInfo<-function(p){
	ns<-length(p[,1])
	affected<-status<-rep(1,ns)
	affected[which(p[,5]==3)]<-0
	status[which(p[,5]==1)]<-0
	ptemp<-pedigree(id=p[,1],dadid=p[,3],momid=p[,2],sex=p[,4],affected=affected,status=status)
	if(sum(ptemp$affected)==0)ptemp$affected<-affected
	ptemp
}

ptemp<-MakePedigreeFromFamilyInfo(p)
plot(ptemp)

# NL個のローカスに、リピート数アレルとその確率を適当に与える
NL<-15

Alleles<-NULL
Probs<-NULL
library(MCMCpack)
for(i in 1:NL){
	na<-sample(3:8,1)
	st<-sample(10:20,1)
	tmp<-NULL
	for(j in 1:na){
		a<-paste("",st+j,sep="")
		tmp<-c(tmp,a)
	}
	Alleles[[i]]<-tmp
	Probs[[i]]<-c(rdirichlet(1,rep(1,na)))
	#print(sum(Probs[[i]]))
}
Alleles
Probs
# ランダムにジェノタイプを発生しよう
RandomGenotype<-function(N,n,Alleles,Probs){
	ns<-N
	g<-array(0,c(ns,2,n))
	noparents<-1:ns
	for(i in noparents){
		for(j in 1:n){
			g[i,,j]<-sample(Alleles[[j]],2,replace=TRUE,Probs[[j]])
		}
	}
	g
}
Np<-3
Gpool<-RandomGenotype(Np,NL,Alleles,Probs)

# 家系図を見ながら、ジェノタイプを割り当てよう

RandomGenotypeFamily<-function(d,n,Alleles,Probs){
	ns<-length(d[,1])
	g<-array(0,c(ns,2,n))
	noparents<-which(d[,2]==0,d[,3]==0)
	for(i in noparents){
		for(j in 1:n){
			g[i,,j]<-sample(Alleles[[j]],2,replace=TRUE,Probs[[j]])
		}
	}
	given<-rep(0,ns)
	given[noparents]<-1
	loop<-TRUE
	if(sum(given)==ns)loop<-FALSE
	while(loop){
		yet<-which(given==0)
		for(i in yet){
			prt1<-d[i,2]
			prt2<-d[i,3]
			if(given[prt1]+given[prt2]==2){
				for(j in 1:n){
					g[i,1,j]<-sample(g[prt1,,j],1)
					g[i,2,j]<-sample(g[prt2,,j],1)
				}
				given[i]<-1
			}
		}
		if(sum(given)==ns)loop<-FALSE
	}
	
	#g[noparents,,]<--9
	g
}
G<-RandomGenotypeFamily(p,NL,Alleles,Probs)

# さて尤度を計算しよう

# ランダムな集団ジェノタイプと家系を元に作ったのを、『仮の』ジェノタイププールとしよう
# これは身元不明者のジェノタイププールを想定している

Gpool2<-array(0,c(length(Gpool[,1,1])+length(G[,1,1]),2,NL))
Gpool2[1:length(G[,1,1]),,]<-G
Gpool2[(length(G[,1,1])+1):length(Gpool2[,1,1]),,]<-Gpool
#Gpool2[1:length(Gpool[,1,1]),,]<-Gpool
#Gpool2[(length(Gpool[,1,1])+1):length(Gpool2[,1,1]),,]<-G
Gpool2


# 家系図構成個人は３種類に分けられる
# (1) 家系図上の位置が確定していて、ジェノタイプが確定している個人
# (2) 家系図上の位置が確定していて、ジェノタイプがないことも確定している個人
# (3) 家系図上の位置が確定しているが、あるジェノタイププールのどのサンプルが対応するか、または、しないかが未確定の個人

# 家系図のうち、ジェノタイプのある個人のジェノタイプのみのデータにする
# ジェノタイプのある個人のうち、確定している人と
# 確定していなくて、そうなのかそうでないのかをテストしたい個人の２種類がある

# Typeが1は(1)
# 2は(3)
# 3は(2)

Type<-p[,5]

Npool<-length(Gpool2[,1,1])

#targets<-sample(1:Npool,Uchiwake[2])


G2<-array(0,c(length(p[,1]),2,NL))

G2[which(Type==1),,]<-G[which(Type==1),,]

# 調べたい個人が何人か、いる
# そのうち、１人ずつだけをサンプルプールに対応づけるか
# 2,3,...人ずつ対応付けるかを考慮する必要がある
# そのパターンを決める

MakeSearchPattern<-function(v){
	library(gtools)
	nt<-length(v)
	x<-0
	X<-NULL
	for(i in 1:nt){
		tmp<-combinations(nt,i)
		tmp2<-tmp
		tmp2[1:length(tmp2)]<-c(v[tmp])
		#X[[i]]<-matrix(tmp2,nrow=length(tmp[,1]))
		X[[i]]<-tmp2
		x<-sum(x,length(as.matrix(X[[i]])[,1]))
	}
	list(X=X,x=x)
}


# それぞれのtargetsには候補者がいる
candidates<-NULL
tobeSearched<-which(p[,5]==2)
for(i in 1:length(tobeSearched)){
	#candidates[[i]]<-sample(1:Npool,5)
	#candidates[[i]][1]<-which(Type==2)[1]
	candidates[[i]]<-1:Npool

}
#candidates[[1]]<-1:Npool
#candidates[[1]][1]<-candidates[[2]][1]
#candidates2[[targets[1]]][1]<-candidates2[[targets[2]]][1]
# ある候補者が複数から指名されていたら、テストしない
MakeTestPattern<-function(candidates){
	gr<-expand.grid(candidates)
	n<-length(gr[,1])
	ok<-rep(1,n)
	for(i in 1:n){
		tmp<-unlist(gr[i,])
		tmp2<-outer(tmp,tmp,"-")
		diag(tmp2)<-1
		if(prod(tmp2)==0){
			ok[i]<-0
		}
	}
	as.matrix(gr[which(ok==1),])
}


OffspringGenotypeProb<-function(p1,p2){
	V<-p1%*%t(p2)
	U<-V+t(V)
	U[lower.tri(U)]<-0
	diag(U)<-diag(U)/2
	U
}

LogLikelihoodTrio<-function(p1,p2,c){
	V<-p1%*%t(p2)
	U<-V+t(V)
	U[lower.tri(U)]<-0
	diag(U)<-diag(U)/2
}

CalcLogLikelihoodFamilyLocus<-function(p,g,A,P){
	unknown<-"0"
	ns<-length(p[,1])
	na<-length(A)
	gvector<-array(0,c(ns,na,2))
	given<-rep(0,ns)
	for(i in 1:ns){
		tmp<-0
		if(g[i,1]!=unknown){
			gvector[i,which(A==g[i,1]),1]<-1
			tmp<-tmp+1
		}
		if(g[i,2]!=unknown){
			gvector[i,which(A==g[i,2]),2]<-1
			tmp<-tmp+1
		}
		if(tmp==2)given[i]<-1
	}
	loop<-TRUE
	if(sum(given)==ns)loop<-FALSE
	while(loop){
		yet<-which(given==0)
		for(i in yet){
			if(p[i,2]+p[i,3]==0){
				gvector[i,,1]<-gvector[i,,2]<-P
				given[i]<-1
			}else{
				tmp1<-p[i,2]
				tmp2<-p[i,3]
				if(given[tmp1]+given[tmp2]==2){
					gvector[i,,1]<-(gvector[tmp1,,1]+gvector[tmp1,,2])/2
					gvector[i,,2]<-(gvector[tmp2,,1]+gvector[tmp2,,2])/2
					given[i]<-1
				}
			}
		}
		if(sum(given)==ns)loop<-FALSE
	}
	# 捜索される人でそれがジェノタイプ未定ということは、
	# ランダムな個人であったときと比べたいということ
	h<-which(p[,5]==2)
	for(i in h){
		if(g[i,1]==unknown){
			gvector[i,,1]<-gvector[i,,2]<-P
		}
	}
	# すべての親子トリオについて計算
	ret<-rep(0,ns)
	for(i in 1:ns){
		tmp1<-p[i,2]
		tmp2<-p[i,3]
		if(tmp1*tmp2>0){
			P1<-(gvector[tmp1,,1]+gvector[tmp1,,2])/2
			P2<-(gvector[tmp2,,1]+gvector[tmp2,,2])/2
			FromParents<-OffspringGenotypeProb(P1,P2)
			Offspring<-OffspringGenotypeProb(gvector[i,,1],gvector[i,,2])
			ret[i]<-log(sum(FromParents*Offspring))
		}
	}
	list(loglikelist=ret,loglikesum=sum(ret))
}

CalcLogLikelihoodFamily<-function(p,g,Alleles,Probs){
	nl<-length(g[1,1,])
	ret<-rep(0,nl)
	
	for(i in 1:length(g[1,1,])){
		tmp<-CalcLogLikelihoodFamilyLocus(p,g[,,i],Alleles[[i]],Probs[[i]])
		ret[i]<-tmp$loglikesum
	}
	list(loglikelist=ret,loglikesum=sum(ret))
}

CalcLogLikelihoodFamily(p,G2,Alleles,Probs)


# SearchPattern$X,test1,tmpG2,Alleles,Probs
# SearchPattern<-MakeSearchPattern(tobeSearched)
# p 
# 第１カラム：ＩＤ
# 第２カラム：母
# 第３カラム：父
# 第４カラム：性別　0:男 1:女
# 第５カラム：タイプ (1はジェノタイプ提供者、2は探されている人、3は情報なしの人)
TestAllPatterns<-function(p,G,Gpool,candidates,Alleles,Probs){
	ptemp<-MakePedigreeFromFamilyInfo(p)

	tobeSearched<-which(p[,5]==2)
	SearchPattern<-MakeSearchPattern(tobeSearched)
	ret<-NULL
	ret2<-NULL
	ret3<-NULL
	counter<-1
	for(i in 1:length(SearchPattern$X)){
		spx<-as.matrix(SearchPattern$X[[i]])
		
		for(j in 1:length(spx[,1])){
			
			ToTestCurrent<-spx[j,]
			tmpG2<-G
			#print(tmpG2[,,1])
			tobeEvaluated<-rep(0,length(p[,1]))
			tobeEvaluated[which(p[,5]==1)]<-1
			tobeEvaluated[which(p[,5]==3)]<-1
			tobeEvaluated[ToTestCurrent]<-1
			tmpp<-p[which(tobeEvaluated==1),]
			tmptmpG2<-tmpG2[which(tobeEvaluated==1),,]
			print("---")
			print(tobeEvaluated)
			print(tmpp)
			print(tmptmpG2)
			ret[[counter]]<-CalcLogLikelihoodFamily(tmpp,tmptmpG2,Alleles,Probs)
			ret2[[counter]]<-list(ToTestCurrent)
			ret3[[counter]]<-list()
			#print(ret[[counter]])
		
			counter<-counter+1
			
			numtested<-length(spx[j,])
			tmpcandidates<-NULL

			for(k in 1:numtested){
				selectedToBeTested<-which(tobeSearched==spx[j,k])
				
				tmpcandidates[[k]]<-candidates[[selectedToBeTested]]
			}
			tmpTestPattern<-MakeTestPattern(tmpcandidates)
			for(k in 1:length(tmpTestPattern[,1])){
				#print("to be tested IDs in pedigree")
				#print(ToTestCurrent)
				#print("to be tested sampleIDs in pool")
				#print(tmpTestPattern[k,])

				tmpG2<-G
				tmpG2[ToTestCurrent,,]<-Gpool[tmpTestPattern[k,],,]
				tobeEvaluated<-rep(0,length(p[,1]))
				tobeEvaluated[which(p[,5]==1)]<-1
				tobeEvaluated[which(p[,5]==3)]<-1
				tobeEvaluated[ToTestCurrent]<-1
				tmpp<-p[which(tobeEvaluated==1),]
				tmptmpG2<-tmpG2[which(tobeEvaluated==1),,]
				print("---")
				print(tobeEvaluated)
				print(tmpp)
				print(tmptmpG2)
				ret[[counter]]<-CalcLogLikelihoodFamily(tmpp,tmptmpG2,Alleles,Probs)
				
				#ret[[counter]]<-CalcLogLikelihoodFamily(p,tmpG2,Alleles,Probs)
				ret2[[counter]]<-list(ToTestCurrent)
				ret3[[counter]]<-list(tmpTestPattern[k,])

				#print(ret[[counter]])
				counter<-counter+1
			}
		}
	}
	
	list(LogLike=ret,SearchedID=ret2,CandidateID=ret3,familyinfo=p,pedigree=ptemp,tobeSearched=tobeSearched,candidates=candidates,offeredGenotype=G,poolGenotype=Gpool,Alleles=Alleles,Probs=Probs)
}
pwithType<-cbind(p,Type)
pwithType

tap<-TestAllPatterns(p=p,G=G2,Gpool=Gpool2,candidates=candidates,Alleles=Alleles,Probs=Probs)

printTAPout<-function(tap){
	n<-length(tap$LogLike)
	
	for(i in 1:n){
		tmp<-paste("LogLike",tap$LogLike[[i]][2],"SearchedID",c(tap$SearchedID[[i]]),"CandidateID",c(tap$CandidateID[[i]]))
		print(tmp)
	}
}

printTAPout(tap)
G[,,1]
Gpool2[,,1]