ポアソン分布か、ポアソン分布としたら、どんなのか

  • ポアソン分布らしい観測データがある
  • ポアソン分布にフィットするかどうかは、こちらのような適合度検定でできる
    • 期待値からポアソン分布のパラメタを求めて、そのパラメタ値の下での期待度数と観測度数のずれをカイ二乗統計量にして検定している
  • 分布パラメタの値を推定するには、上のように期待値から出すのもよいのだと思うけれど、最適化関数とかを使うとどうなるだろう?
  • optim()関数をつかってやってみる
  • 期待値の小さいセルの合併は省略してあることに注意

# tmp.X が観測度数
# xがポアソン分布のパラメタ
# optim()関数に渡す最適化用関数を3つ作る
# 1つ目
# 期待度数と観測度数の差の二乗の和を最小化する(最小二乗法)
my.pois <- function(x){
	sum((dpois(0:M,x) * sum(tmp.X) - tmp.X)^2)
}
# 2つ目
# 期待度数と観測度数の差の二乗を期待度数で割ったものを最小にする(??カイ二乗検定風??)
my.pois2 <- function(x){
	exps <- dpois(0:M,x) * sum(tmp.X)
	sum((exps - tmp.X)^2/exps
}
# 3つ目
# 尤度を最大にする
my.pois3 <- function(x){
	p <- dpois(0:M,x)
	abs(sum(tmp.X*log(p)))
}
  • 期待値からの算術計算の結果が最尤推定値になっているようだ
M <- 500
G <- 100
L <- sample(500:1000,G,replace=TRUE)
mus <- rexp(G,3)
X <- matrix(0,G,M+1)

for(i in 1:G){
	tmp.prob <- dpois(0:M,mus[i])
	tmp.prob <- tmp.prob * (1+runif(M+1)*1)
	tmp.prob <- tmp.prob/sum(tmp.prob)
	tmp <- sample(0:M,L[i],replace=TRUE,prob = tmp.prob)
	#if(i == 1)tmp <- sample(0:M,L[i],replace=TRUE,prob = dpois(0:M,0.01))
	X[i,] <- c(length(which(tmp==0)),tabulate(tmp,nbins=M))
}


# tmp.X が観測度数
# xがポアソン分布のパラメタ
# optim()関数に渡す最適化用関数を3つ作る
# 1つ目
# 期待度数と観測度数の差の二乗の和を最小化する(最小二乗法)
my.pois <- function(x){
	sum((dpois(0:M,x) * sum(tmp.X) - tmp.X)^2)
}
# 2つ目
# 期待度数と観測度数の差の二乗を期待度数で割ったものを最小にする(??カイ二乗検定風??)
my.pois2 <- function(x){
	exps <- dpois(0:M,x) * sum(tmp.X)
	non.zero <- which(exps!=0)
	sum((exps[non.zero] - tmp.X[non.zero])^2/exps[non.zero])
}
# 3つ目
# 尤度を最大にする
my.pois3 <- function(x){
	p <- dpois(0:M,x)
	non.zero <- which(p!=0)
	abs(sum(tmp.X[non.zero]*log(p[non.zero])))
}

pois.par <- pois.par2 <- pois.par3 <- pois.par4 <- rep(0,G)
for(i in 1:G){
	tmp.X <- X[i,]
	pois.par[i] <- optim(sum(tmp.X*(0:M))/sum(tmp.X),my.pois,lower=0,method="L-BFGS-B")$par
	pois.par2[i] <- optim(sum(tmp.X*(0:M))/sum(tmp.X),my.pois2,lower=0,method="L-BFGS-B")$par
	pois.par3[i] <- optim(sum(tmp.X*(0:M))/sum(tmp.X),my.pois3,lower=0,method="L-BFGS-B")$par
	pois.par4[i] <- sum(tmp.X*(0:M))/sum(tmp.X)
	
}
matplot(t(X),type="l")

plot(data.frame(pois.par,pois.par2,pois.par3,pois.par4))
# ポアソン分布の対数尤度関数mlpoiを定義
# nlm()関数の都合で符号は逆
mlpoi<-function(p){
  -(sum(x*log(p))-length(x)*p-sum(log(factorial(x))))
}
# 最尤法(ニュートン=ラフソン法)
r_ml <- nlm(mlpoi, 0.5, hessian=TRUE)
# 推定結果を表示
print(sprintf("lambda:%1.5f variance:%1.5f iterations:%d", r_ml$estimate, 1/r_ml$hessian, r_ml$iterations))
# 算術的に推定
sum(xtabs(~x)*0:4)/length(x)
  • これを(も)使って、ちょっと作業
# M人
# G個の遺伝子(または集計単位)
# 集計単位ごとの座位数(塩基数):Li; i = 1,2,...,G
# そのカウント表:X

# Simulate a sample data matrix X
M <- 40
G <- 100
L <- sample(1000:50000,G,replace=TRUE)
mus <- rep(1,G)
X <- matrix(0,G,M+1)

for(i in 1:G){
	tmp.prob <- dpois(0:M,mus[i])
	#tmp.prob <- tmp.prob * (1+runif(M+1)*1)
	tmp.prob <- tmp.prob/sum(tmp.prob)
	tmp <- sample(0:M,L[i],replace=TRUE,prob = tmp.prob)
	#if(i == 1)tmp <- sample(0:M,L[i],replace=TRUE,prob = dpois(0:M,0.01))
	X[i,] <- c(length(which(tmp==0)),tabulate(tmp,nbins=M))
}

# Here, statistical calculation starts.

# Likelihood ratio test : You have to know "http://en.wikipedia.org/wiki/Likelihood-ratio_test"

# Summing-up the whole genes.
Xw <- apply(X,2,sum)
# Now assume the distribution is in poisson distribution. 
# Poisson distribution is determined by one parameter.
# The calculation below will give you the maximum likelihood estimation of the parameter.
# This value is the estimated value for the whole genes.
pois.par <- sum(0:M * Xw)/sum(Xw)

# In the same way, we estimate the poisson distribution parameter for individual genes.
pois.pars <- rep(0,G)
for(i in 1:G){
	pois.pars[i] <- sum(0:M * X[i,])/sum(X[i,])
}

# Below, we calculate likelihood based on various models for individual genes.
# Likelihood is the probability to observe something when you assume a model/hypothesis.
# Four models are assumed and four likelihoods are calculated below.
# The 1st model: Each gene's distribution is in the poisson distribution with its parameter being the estimated value based on the whole genes observatin (pois.par).
# The 2nd model: Each gene's distribution is in the poisson distribution with its parameter being the estimated value based on the gene's own observation (pois.pars[i]).
# The 3rd model: Each gene's distribution is in accordance with the observed frequency of the whole genes.
# The 4th model: Each gene's distribution is in accordance with the observed frequency of the gene's own observation.
# Actually log(Likelihood) is calculated.
Lw <- Ls <- L.chisq <- p <- rep(0,G)
Lxw <- Lxs <- Lx.chisq <- px <- rep(0,G)
N.non.zero <- length(which(Xw != 0))
for(i in 1:G){
	prob <- dpois(0:M,pois.par)
	non.zero <- which(prob!=0)
	Lw[i] <- sum(X[i,non.zero] * log(prob[non.zero]))
	prob2 <- dpois(0:M,pois.pars[i])
	non.zero2 <- which(prob2!=0)
	Ls[i] <- sum(X[i,non.zero2] * log(prob2[non.zero2]))
	probx <- Xw/sum(Xw)
	non.zerox <- which(probx!=0)
	Lxw[i] <- sum(X[i,non.zerox] * log(probx[non.zerox]))
	prob2x <- X[i,]/sum(X[i,])
	non.zero2x <- which(prob2x!=0)
	Lxs[i] <- sum(X[i,non.zero2x] * log(prob2x[non.zero2x]))
}

# Let's make chisquare statistics from log(Likelihood) values.
# When you compare thw models ,calculate the difference between two log(Likelihood) and multiply it by 2.
# They are chi-square value.
# Below, the 1st and 2nd models are compared and the 3rd and 4th models are compared.
# p-values are calculated. When you calculat p-value from chi-square value, you have to specify "degree of freedom".
# When you compare 1st and 2nd model, you only manipulate the parameter of poisson distribution. The manipulated parameter is only 1, therefore degree of freedom is 1.
# When you compare 3rd and 4th model, many cells can be manipulated. Actually the manipulatble cells are the non-zero cells in the whole.
L.chisq <- 2*(Ls - Lw)
# degree of freedom is 1.
p <- pchisq(L.chisq,1,lower.tail=FALSE)
Lx.chisq <- 2*(Lxs - Lxw)
# degree of freedom is the number of non-zero cells in the distribution of the whole genes.
px <- pchisq(Lx.chisq,N.non.zero,lower.tail=FALSE)
par(mfcol=c(2,3))
plot(Ls,Lw,main = "Log-Like two Hx")
plot(sort(L.chisq), main = "Chisq")
plot(sort(p), main ="p")
plot(L,L.chisq, main ="Length of genes vs. chisq")
plot(L,p, main = "Length of genes vs. p")
par(mfcol=c(1,1))

par(mfcol=c(2,3))
plot(Lxs,Lxw,main = "Log-Like two Hx")
plot(sort(Lx.chisq), main = "Chisq")
plot(sort(px), main ="p")
plot(L,Lx.chisq, main ="Length of genes vs. chisq")
plot(L,px, main = "Length of genes vs. p")
par(mfcol=c(1,1))