2013-06-24
n次元のx軸 vs. m次元のy軸で関連の有無を検定する？

メモソース

library(FNN)

# X is a matrix or a list.
# In case of matrix, each row of X represents a sample and a row vector is a point in the dimensional space. The function returns a distance matrix with specifeid distance measure ("method" argument).
# In case of list, each sample has a matrix, representing point clouds. The function returns a Kullback-Leibler distance in pair-wise.
# Xは行列かリスト
# Xが行列なら、返り値はペアワイズ距離行列。距離の定義はmethodで指定
# Xがリストなら、リストの個々の要素がサンプルのデータで、それはpoint cloud型データ(行列)。返り値はペアワイズの距離行列。ただし、距離の定義は、KL距離(k-nearest neighborを利用したKL距離でk=10をデフォルトにしている)
my.dist.KL <- function(X,method="euclidean",k=10){
	if(is.matrix(X)){
		ret <- as.matrix(dist(X,method=method))
		return(ret)
	}else{
		n <- length(X)
		num.sample <- sapply(X,nrow)
		K <- min(k,min(num.sample)-1)
		ret <- array(0,c(n,n,K))
		for(i in 1:n){
			for(j in 1:n){
				ret[i,j,] <- KL.divergence(X[[i]],X[[j]],k=K)
			}
		}
		return(ret)
	}
}

# This function performs permutation test as follow.
# n.iter is the number of iterations.
# Statistics for the original data, X, Y, and statistics for shuffled X,Y are k-nearest neighbor distance with the specified distance measure ("method" argument).
# Statistics is something I extended the idea of kNN-based KL distance.
# Because k-NN's k can be from k1 to k2, it returns (k2-k1+1) p-values.
# この関数は、以下の要領でパーミュテーションテストするもの
# n.iter回数のモンテカルロ・シャッフル
# 説明変数群X,従属変数群Yとして、統計量を出し、その後、シャッフルして統計量を出す。
# 統計量は、kNNで出すKL距離を異なる次元の分布間に拡張したもの(拡張したのは自分(のつもり)。誰かがやっているかもしれないし、やっていなければ、これの正当性も確かめる必要がある)。
# kNNベースなので、統計量が複数種類出る→p値も複数種類出る
my.perm.KL <- function(X,Y,n.iter=100,method="euclidean",k1=10,k2=10){
	D.X <- my.dist.KL(X,k=k1)
	log.D.X <- log(D.X)
	D.Y <- my.dist.KL(Y,k=k1)
	log.D.Y <- log(D.Y)
	#k-NN.X
	k.NN.X <- t(apply(D.X,1,order))
	k.NN.Y <- t(apply(D.Y,1,order))
	K <- k2
	n <- length(D.X[,1])
	St.K <- rep(0,K)
	for(i in 1:K){
		St.K[i] <- sum(log.D.Y[cbind(1:n,k.NN.X[,i+1])])
		St.K[i] <- St.K[i] + sum(log.D.X[cbind(1:n,k.NN.Y[,i+1])])
	}
	
	St.Perm <- matrix(0,n.iter,K)
	for(i in 1:n.iter){
		sh <- sample(1:n)
		sh.log.D.Y <- log.D.Y[sh,]
		sh.log.D.Y <- sh.log.D.Y[,sh]
		for(j in 1:K){
			St.Perm[i,j] <- sum(sh.log.D.Y[cbind(1:n,k.NN.X[,j+1])])
		}
		sh <- order(sh)
		sh.log.D.X <- log.D.X[sh,]
		sh.log.D.X <- sh.log.D.X[,sh]
		for(j in 1:K){
			St.Perm[i,j] <- St.Perm[i,j] + sum(sh.log.D.X[cbind(1:n,k.NN.Y[,j+1])])
		}

	}

	plot(c(sort(St.Perm[,1]),St.K[1]))
	abline(h = St.K[1])
	ps <- rep(0,K)
	for(i in 1:K){
		ps[i] <- length(which(St.Perm[,i]< St.K[i]))/n.iter
	}
	
	return(list(St.K=St.K,St.Perm=St.Perm,p.value=ps))
}
# Sample data
# N: sample size
# Pre-medication conditions: D-dimensional
# X
# Pre-post^(t-1) medicational conditions: d-dimensional * t time points
# Y
# 説明変数をD種類測定
# 従属変数をd種類測定。時系列でt回測定
N <- 100
D <- 100
d <- 70
t <- 3

# 説明変数は正規乱数
X <- matrix(rnorm(N*D),ncol=D)
# 従属変数
Y <- array(0,c(N,d,t))

# model
# YはXのu因子とY(t=0)の値の関数とする。いい加減
# Y[,,1]をXに含めるのもよいのだろうけれど、それがどういう影響をもたらすか、とか検討が済んでいないので、今のところは保留

# 治療前は説明変数と独立とする
Y[,,1] <- rnorm(d)
for(i in 1:N){
	# 治療後の２回の測定は、何かしら適当に
	Y[i,,2] <- Y[i,,1] + (Y[i,,1]+1)*(X[i,1]*X[i,2]+X[i,3]) + sum(X[i,4:7])
	Y[i,,3] <- Y[i,,1] + (Y[i,,1]+1)*(X[i,1]*X[i,2]+X[i,3]) + sum(X[i,4:7])*3
}
# 誤差項
er <- 0.1
Y <- Y + rnorm(length(Y),0,var(Y)*er)
# 誤差項(これを上下させて、パーミュテーションpが連動して変化するようなら、方法としてはうまく行っていることになる(と思います)

# 時系列データは「対応のあるデータ」なので、先日の勉強会のように、線分を点に置き換えます(先日は、観察項目次元が１で時刻数が２だったので、２次元座標になりました。今回は、観察項目次元がdで時刻数がtなので、d*t次元座標になります)
Y2 <- matrix(0,N,d*t)
for(i in 1:N){
	Y2[i,] <- Y[i,,]
}



my.perm.out <- my.perm.KL(X,Y2)
my.perm.out
par(ask=TRUE)
for(i in 1:length(my.perm.out$St.K)){
	plot(c(sort(my.perm.out$St.Perm[,i]),my.perm.out$St.K[i]))
	abline(h=my.perm.out$St.K[i])
}
par(ask=FALSE)




n.x <- 30
n.y <- 30
X <- list()
Y <- list()

for(i in 1:n.x){
	X[[i]] <- matrix(rnorm(50),ncol=2)
}
for(i in 1:n.y){
	Y[[i]] <- matrix(rnorm(75),ncol=3)
}

my.perm.out <- my.perm.KL(X,Y)
par(ask=TRUE)
for(i in 1:length(my.perm.out$St.K)){
	plot(c(sort(my.perm.out$St.Perm[,i]),my.perm.out$St.K[i]))

}
par(ask=FALSE)