３分布 - ryamadaの遺伝学・遺伝統計学メモ

３つの多次元分布がある
- X,Y,Zとする
標本がたくさんあって、X,Y,Zが形成されるが、標本は３分布を横串で貫いているのでX,Y,Zの間には何かしらの関係がありえる
X,Yの関係とX,Zの関係は既知として、Y,Z間の関係があるのかないのかをデータから読み取りたいとする
- 例えば、こんな(濃淡は標本の対応関係を『見せる』ため)

X <- rbind(matrix(rnorm(1000*2),ncol=2), matrix(rnorm(500*2,1,0.3),ncol=2),matrix(rnorm(500*2,2,0.6),ncol=2))
par(mfcol=c(1,3))
plot(X)

Y <- X + X[,1] * rnorm(length(X[,1])) + X[,2] * rnorm(length(X[,2]),2)
#Y<-Y[sample(1:length(X[,1])),]
plot(Y)

#Z <- cbind(sin(Y[,1]*0.2),cos(Y[,1])) + 0.1*(X-min(X))^0.5
tmp.X <- X[sample(1:length(X[,1])),]
Z <- cbind(sin(Y[,1]*0.2),cos(Y[,1])) + 0.1*(tmp.X-min(tmp.X))^0.5

plot(Z)
par(mfcol=c(1,1))

cols <- gray((max(Y[,1])-Y[,1])/(max(Y[,1])-min(Y[,1])))
par(mfcol=c(1,3))

plot(X,col=cols)
#dev.new()
plot(Y,col=cols)
#dev.new()
plot(Z,col=cols)
par(mfcol=c(1,1))

X,Y,Zの３つの点に適当に「距離」を入れる。最小全域木を使って入れてみる
- 距離を入れるのに、ちょっと関数を作っておく

###############
# 使用関数
# ベクトルのノルム
nrm<-function(A){
	sqrt(sum(A^2))
}
# AからXsが作る「hyperplane」への垂線の足を計算する
# ２点が作る「hyperplane」は直線
# 垂線の足がXsが作る凸包(２点の場合は線分)の上にあるかどうかを判定する
MinDistance2<-function(A,Xs,bothside=TRUE){
 df<-length(A)
 N<-length(Xs[,1])
 Xs1<-t(t(Xs)-A)
 C<-apply(Xs1,2,sum)/N
 Xs2<-t(t(Xs1)-C)
# \sum_{i=1}^N a_i v_i と表したときに \sum_{i=1}^N a_i=1という制約がある
# v_iは与えられている凸包の頂点座標(ただし、重心を原点にとりなおしている
# 垂線の足をbとすると
# Ma=bという連立方程式を解いてaを求めるたい
# ただし凸包はN次元ではなくてN-1次元なので、N次元のうち１次元分は取り除いて
# その代わり係数の和が１という制約等式を入れれば
# あとは行列を使って連立方程式を解くだけなので、Rに任せる

 Xs3<-Xs2 %*% t(Xs1)
 Xs4<-rbind(Xs3[1:(N-1),],rep(1,N))
 coefs<-solve(Xs4,c(rep(0,(N-1)),1))
 Apr<-t(Xs1) %*% coefs
 L<-nrm(Apr)
 Apr<-Apr+A
 inside<-FALSE
 if(length(coefs[coefs<0])==0 || (bothside & length(coefs[coefs>0])==0) ){# すべての要素が0以上の条件
  inside<-TRUE
 }
 list(A=A,Xs=Xs,distance=L,Apr=Apr,coefs=coefs,inside=inside)

}

# function to get foot of points to the graph
# 点から線分への足を計算するには、足が線分上にあるかその外側にあるかの判定が必要
calc.foot2<-function(X,x,m){
	difference<-t(x)-X
	distance<-sqrt(apply(difference^2,2,sum))
	m2<-m
	m2[lower.tri(m2)] <- 0
	edges<-which(m2==1,arr.ind=TRUE)
	min.dist.per.edge<-rep(0,length(edges[,1]))
	t.per.edge<-rep(0,length(edges[,1]))
	closest<-matrix(0,length(edges[,1]),length(X))
	for(i in 1:length(edges[,1])){
		tmp.out<-MinDistance2(X,x[edges[i,],])
		if(tmp.out$inside){
			t.per.edge[i]<-tmp.out$coefs[1]
			min.dist.per.edge[i]<-tmp.out$distance
			closest[i,]<-tmp.out$Apr
		}else{
			if(distance[edges[i,1]]<distance[edges[i,2]]){
				t.per.edge[i]<-1
				min.dist.per.edge[i]<-distance[edges[i,1]]
				closest[i,]<-x[edges[i,1],]
			}else{
				min.dist.per.edge[i]<-distance[edges[i,2]]
				closest[i,]<-x[edges[i,2],]
			}
		}
	}
	min.pt<-which(min.dist.per.edge==min(min.dist.per.edge))
	if(length(min.pt)==1){
		return(list(coods=matrix(closest[min.pt,],nrow=1),edge=min.pt,vs = edges[min.pt,]))
	}else{
		return(list(coods=closest[min.pt[1],],edge=min.pt[1],vs = edges[min.pt[1],]))
	}
}

# 木グラフ(木構造がg,点の座標がX,木における最短距離行列がs.x)に対して、任意の点xから垂線をおろし、その垂線を加えた上で、点xと木gのすべてのノードとのグラフ上の距離を返す
getD.onGraph <- function(x,X,g,s.x){
	m.x <- get.adjacency(g)
	foot <- calc.foot2(x,X,m.x)
	dx2foot <- sqrt(sum(x-foot$coods)^2)
	s.x.1 <- s.x[foot$vs[1],]
	s.x.2 <- s.x[foot$vs[2],]
	dv1foot <- sqrt(sum((foot$coods-X[foot$vs[1],])^2))
	dv2foot <- sqrt(sum((foot$coods-X[foot$vs[2],])^2))
	D <- apply(cbind(s.x.1+dv1foot,s.x.2+dv2foot),1,min) +dx2foot
	return(list(D=D,foot=foot,ds=c(dx2foot,dv1foot,dv2foot)))
}
# 木グラフのノードとの距離Dに応じて、グラフ外の点xが被説明変数の最小全域木グラフg.yの各点に相当する値(セット)を観測する相対尤度が求められる
# そのグラフのノードに与えられた相対尤度について、次の手順で「信頼区間的くくり」を作る
# 最大尤度のノードを第１のくくりとする
# このくくりの「信頼区間」はこの最大尤度とする
# 次のくくりは第２最大尤度のノードで、第２のくくりは、第１のくくりと第２最大尤度の点とを含み、第１のくくりと第２最大尤度点とにはさまれたノードはくくられるものとする
# このくくりの「信頼区間」はくくりに入るノードの尤度の和とする
# 以下、第i+1のくくりを作るに当たり、第iのくくりにおいて、まだくくられていないノードの中で最大尤度のノードを加え、添加ノードと第iくくりとに挟まれたノードはくくられるものとする、という手順で拡大する
# 最終的には、すべてのノードがくくられる。このとき、「信頼区間」は100% である
CI.tree <- function(D,s.y){
	L <- calc.RelativeLike(D)
	order.L <- order(L,decreasing = TRUE)
	tmp.list <- tmp.value.list <- list()
	tmp.list[[1]] <- order.L[1]
	tmp.value.list[[1]] <- L[tmp.list[[1]]]
	#print(tmp.list)
	resids <- order.L[-1]
	cnt <- 2
	while(length(resids)>0){
		#print(resids)
		tmp <- resids[1]
		resids <- resids[-1]
		#print(resids)
		if(length(resids)>0){
			d.1 <- matrix(s.y[resids,tmp.list[[cnt-1]]],nrow=length(resids))
			d.2 <- s.y[resids,tmp]
			d.3 <- s.y[tmp,tmp.list[[cnt-1]]]
			#print(d.1)
			#print(d.2)
			#print(d.3)
			#print("---")
			dd23 <- outer(d.2, d.3, "-")
			#print(d.1)
			#print(dd12)
			#print(d.1+dd23)
			insiders <- which(abs(apply(matrix(d.1+dd23,nrow = length(resids)),1,prod)) < 10000*.Machine$double.eps^0.5)
			#print(insiders)
			tmp.list[[cnt]] <- c(tmp.list[[cnt-1]],c(tmp,resids[insiders]))
			tmp.value.list[[cnt]] <- tmp.value.list[[cnt-1]] + sum(L[c(tmp,resids[insiders])])
			cnt <- cnt +1
			if(length(insiders)>0){
				resids <- resids[-insiders]
			}
		}else{
			tmp.list[[cnt]] <- c(tmp.list[[cnt-1]],c(tmp))
			tmp.value.list[[cnt]] <- tmp.value.list[[cnt-1]] + sum(L[c(tmp)])
		}

		
		#print(resids)
	}
	return(list(nodes = tmp.list,prob = tmp.value.list))
}

# 相対尤度
calc.RelativeLike <- function(D=D.out$D){
	tmp <- exp(-D^2/2)
	tmp/sum(tmp)
}

# MSTオブジェクトを経由してグラフオブジェクトを作る
make.mst.graph <- function(X){
	mst.x <- spantree(dist(X))
	e.x <- cbind(1:(length(mst.x$kid)),mst.x$kid-1)
	#graph.edgelist(e.x)
	return(list(g = graph.edgelist(e.x),mst=mst.x))
}
dist.mst <- function(mst.x,mst.y, para = "non.para"){
	e.x <- cbind(1:(length(mst.x$kid)),mst.x$kid-1)
	g.x <- graph.edgelist(e.x)
	w.x <- mst.x$dist
	e.y <- cbind(1:(length(mst.x$kid)),mst.x$kid-1)
	g.y <- graph.edgelist(e.y)
	w.y <- mst.y$dist
	todo <- c(1,1)
	if(para == "non.para"){
		todo <- c(0,1)
	}else if (para == "para"){
		todo <- c(1,0)
	}else if(para == "both"){
		todo <- c(1,1)
	}
	s.x.p <- s.y.p <- s.x.np <- s.y.np <- NULL
	st.p <- st.np <- NULL
	if(todo[1] == 1){
		s.x.p <- shortest.paths(g.x,weight=w.x)
		s.y.p <- shortest.paths(g.y,weight=w.y)
		st.p <- sum((s.x.p-s.y.p)^2)
	}
	if(todo[2] == 1){
		s.x.np <- shortest.paths(g.x)
		s.y.np <- shortest.paths(g.y)
		st.np <- sum((s.x.np-s.y.np)^2)

	}

	return(list(st.p = st.p, st.np = st.np, s.x.p = s.x.p, s.y.p = s.y.p, s.x.np = s.x.np, s.y.np = s.y.np))
}

dist.mst.permutation2 <- function(s.x,s.y,n.iter=1000){
	n <- length(s.x[,1])
	ret <- rep(0,n.iter)
	for(i in 1:n.iter){
		t <- sample(1:n)
		ret[i] <- sum((s.x-s.y[t,t])^2)
	}
	ret
}

- 距離を入れる

gout.x <- make.mst.graph(X)
gout.y <- make.mst.graph(Y)
gout.z <- make.mst.graph(Z)

sp.x <- shortest.paths(gout.x$g)
sp.y <- shortest.paths(gout.y$g)
sp.z <- shortest.paths(gout.z$g)

X,Y間、X,Z間の関係を「維持」した上で、この標本からリサンプリングする
- X,Y間、X,Z間の関係は、『X上での距離の二乗について正規分布的に確率が減じるものとして、Xの値に対応して、重み付き確率でY,Zの値を発生させる』ことにして維持する
- 以下のソースは、試行錯誤の過程を反映したゴミがたくさん入っているけれども…
- この話とこちらの話は実はつながる
- もちろん、こちらも
- 実際、「距離が定まった空間の分布」ならなんでもよいので、木のように、連結関係に制約を入れる必要もなくなって、閉じた空間とかでも、行けそう。そうすると、こちらとも、閉じた空間での分布という点でつながる

st.ori <- list()
st.ori[[1]] <- (sp.x-sp.y)^2
st.ori[[2]] <- (sp.y-sp.z)^2
st.ori[[3]] <- (sp.z-sp.x)^2

plot(c(sp.x),c(sp.y))
plot(c(sp.y),c(sp.z))
plot(c(sp.z),c(sp.x))

xy.z <- c(abs(sp.x-sp.y)+abs(sp.y-sp.z)-abs(sp.z-sp.x))
yz.x <- c(abs(sp.y-sp.z)+abs(sp.z-sp.x)-abs(sp.x-sp.y))
zx.y <- c(abs(sp.z-sp.x)+abs(sp.x-sp.y)-abs(sp.y-sp.z))
length(which(xy.z==0))
length(which(yz.x==0))
length(which(zx.y==0))


plot(sort(c(abs(sp.x-sp.y)+abs(sp.y-sp.z)-abs(sp.z-sp.x))))

st.st <- cbind(c(st.ori[[1]]),c(st.ori[[2]]),c(st.ori[[3]]))
apply(st.st,1,function(v)which(v==min(v))) -> which.min

plot(unlist(which.min))


n.perm <- 100
st.perm <- matrix(0,n.perm,3)
prob.perm <- rep(0,n.perm)
tmp.type.y <- tmp.type.z <- matrix(0,n.perm,length(X[,1]))
for(i in 1:length(X[,1])){
	tmp <- sample(1:length(X[,1]),2*n.perm,replace=TRUE,prob=exp(-sp.x[i,]^2/2))
	tmp.type.y[,i] <- tmp[1:n.perm]
	tmp.type.z[,i] <- tmp[(n.perm+1):(n.perm*2)]
}
for(i in 1:n.perm){
	#sh.y <- sample(1:length(X[,1]))
	#sh.z <- sample(1:length(X[,1]))
	sh.y <- tmp.type.y[i,]
	sh.z <- tmp.type.z[i,]

	
	#tmp.st.xy <- sum((sp.x-sp.y[sh.y,sh.y])^2)
	#tmp.st.xz <- sum((sp.x-sp.z[sh.z,sh.z])^2)
	#tmp.st.yz <- sum((sp.y[sh.y,sh.y]-sp.z[sh.z,sh.z])^2)
	tmp.xy.z <- c(abs(sp.x-sp.y[sh.y,sh.y])+abs(sp.y[sh.y,sh.y]-sp.z[sh.z,sh.z])-abs(sp.z[sh.z,sh.z]-sp.x))
	tmp.yz.x <- c(abs(sp.y[sh.y,sh.y]-sp.z[sh.z,sh.z])+abs(sp.z[sh.z,sh.z]-sp.x)-abs(sp.x-sp.y[sh.y,sh.y]))
	tmp.zx.y <- c(abs(sp.z[sh.z,sh.z]-sp.x)+abs(sp.x-sp.y[sh.y,sh.y])-abs(sp.y[sh.y,sh.y]-sp.z[sh.z,sh.z]))
	st.perm[i,] <- c(length(which(tmp.xy.z==0)),length(which(tmp.yz.x==0)),length(which(tmp.zx.y==0)))
	#par(ask=TRUE)
	par(mfcol=c(1,3))
	plot(c(sp.x),c(sp.y[sh.y,sh.y]))
	#par(ask=FALSE)
	plot(c(sp.y[sh.y,sh.y]),c(sp.z[sh.z,sh.z]))
	plot(c(sp.z[sh.z,sh.z]),c(sp.x))
	par(mfcol=c(1,1))
	#st.perm[i, ] <- c(tmp.st.xy,tmp.st.xz,tmp.st.yz)
	for(j in 1:length(sp.x[,1])){
		prob.perm[i] <- prob.perm[i] - sp.x[j,sh.y[j]]^2/2 - sp.x[j,sh.z[j]]^2/2
	}
}