CNV多型をはじめとする多(3以上)アレルのHWE検定

アレルタイプ数をNaとする。DiploidではジェノタイプはNg=\frac{Na\times(Na+1)}{2}。アレルをA=\{a_1,a_2,...,a_{Na}\}とし、その頻度をP_{A}=\{p_{a_1},p_{a_2},...,p_{a_{Na}}\}、観測本数をN_{A}=\{n_{a_1},n_{a_2},...,n_{a_{Na}}\}すると、ジェノタイプG=\{g_{a_1,a_1},g_{a_1,a_2},...,g_{a_{Na},a_{Na}}の観測人数がP_{G}=\{n_{g_{a_1,a_1}},n_{g_{a_1,a_2}},...,n_{g_{a_{Na},a_{Na}}}\}と表せる。
今、HWEにおける期待人数ne_{G}=\{pe_{g_{a_1,a_1}},ne_{g_{a_1,a_2}},...,ne_{g_{a_{Na},a_{Na}}}\}ne_{g_{a_i,a_i}}=(n_{a_i})^2/N,ne_{g_{a_i,a_j}}=2\times n_{a_i}\times n_{a_j}(ただしNを観測人数、i\not{=} j)と表される。
カイ自乗検定をするとすれば、\chi^2(HWE)=\sum_{g_k \in G} \frac{(n_{g_k}-ne_{g_k})^2}{ne_{g_k}}、として表せる。
これを自由度df=Ng-Naで評価すれば、漸近検定になる。
他方、正確検定はどうなるか。
N人、2\times N本の染色体。ホモの個体数はN_{homo}=\sum_{i}n_{g_{a_i,a_i}}ヘテロの個体数はN_{hetero}=\sum_{i\not{=}j}n_{g_{a_i,a_j}}
観測ジェノタイプテーブルの生起確率はPr=\frac{(2!)^N\times \prod_{i=1}^{Na} (n_i)!}{(2\times N)!(2!)^{N_{homo}}}\times \frac{N!}{\prod_{i,j} n_{g_{a_i,a_j}}}
書き換えて
Pr=\frac{(2!)^{N_{hetero}\times \prod_{i=1}^{Na} (n_i)!}{(2\times N)!}}\times \frac{N!}{\prod_{i,j} n_{g_{a_i,a_j}}}
これを、すべてのテーブルについて算出し、大小比較して累積することで正確確率が出る。・・・問題は、可能なテーブルすべてを網羅するための計算量ではあるが。昨日のソースにHWE検定を加えたらこう・・・(遅い)

package CNV;

public class CNVcalculator {
	/*
	 * アレル単位での最大コピー数をK
	 * 最小コピー数をkとしたとき
	 * Na=K-k+1
	 * Ng=Na*(Na+1)/2
	 * とすること!
	 */

	public int Na; //No. allele
	public int Ng; //No. genotype
	public double[][] t;//contingency table
	public int[][] t2;//CopyNumber別ジェノタイプテーブル
	public int[][] t3;//allele本数テーブル
	//public double[] w;//weight vector
	public double N;//No.samples
	public double R;//No.cases
	public double S;//No.controls
	public double[] n;//number vector
	int df=1;//自由度
	public org.apache.commons.math.distribution.ChiSquaredDistributionImpl chidf1 = 
		new org.apache.commons.math.distribution.ChiSquaredDistributionImpl(df);
	public org.apache.commons.math.distribution.ChiSquaredDistributionImpl chidfX;
	public double[][] hweChi;
	public double[][] hweExact;
	public double[][] assoc;//[0] trend,[1] MWW,[2] CNpool
	/**
	 * @param args
	 */
	public static void main(String[] args) {
		// TODO 自動生成されたメソッド・スタブ
		int cnvN=1;
		int caseN=500;
		int contN=500;
		int numallele = 2;
		int numgenotype = (int)(numallele*(numallele+1)/2);
		int x=10000000;
		int seed=(int)(Math.random()*x);
		Utils.MersenneTwisterFast mz= new Utils.MersenneTwisterFast(seed);
		double[] sl = StatUtilsX.Fisher.serialLogfact((caseN+contN)*2);
		
		for(int i=0;i<cnvN;i++){
			int[][] table=new int[2][numgenotype];
			double[] weight = new double[numgenotype];
			for(int j=0;j<weight.length;j++){
				weight[j]=j;
			}
			double[] af=new double[numallele];
			for(int j=0;j<af.length;j++){
				af[j]=mz.nextDouble();
				
			}
			StatUtilsX.MiscUtil.standard(af);
			//af=StatUtilsX.MiscUtil.accumstandard(af);
			/*
			for(int j=0;j<af.length;j++){
				System.out.println(af[j]);
			}
			*/
			//System.out.println("##");
			double[] gf = new double[numgenotype];
			//System.out.println("numgenotype="+numgenotype);
			int counter=0;
			for(int j=0;j<af.length;j++){
				for(int k=j;k<af.length;k++){
					double tmp = af[j]*af[k];
					if(j==k){
						gf[counter]=tmp;
					}else{
						gf[counter]=tmp*2;
					}
					
					counter++;
				}
			}
			/*
			for(int j=0;j<gf.length;j++){
				System.out.println(gf[j]+"\t");
			}
			*/
			
			gf=StatUtilsX.MiscUtil.accumstandard(gf);
			/*
			for(int j=0;j<gf.length;j++){
				System.out.println(gf[j]+"\t");
			}
			*/
			for(int j=0;j<caseN;j++){
				double r=mz.nextDouble();
				for(int k=0;k<gf.length;k++){
					if(r<gf[k]){
						table[0][k]++;
						break;
					}
				}
			}
			for(int j=0;j<contN;j++){
				double r=mz.nextDouble();
				for(int k=0;k<gf.length;k++){
					if(r<gf[k]){
						table[1][k]++;
						break;
					}
				}
			}
			double hweexactpCase=StatUtilsX.Fisher.hweFisherAbecasis(table[0]);
			double hweexactpCont=StatUtilsX.Fisher.hweFisherAbecasis(table[1]);
			//double hweexactpCase=StatUtilsX.Fisher.hweFisherAbecasis(table[0]);
			//System.out.print("casehwe\t"+hweexactpCase+"\t");
			//System.out.print("conthwe\t"+hweexactpCont+"\t");
			
			CNVcalculator cnvc=new CNVcalculator(table);
			
			cnvc.assoc[0]= cnvc.testcnvTrend(weight);
			cnvc.assoc[2] = cnvc.testCNpool();
			boolean cor=false;
			cnvc.assoc[1] = cnvc.testMWW(cor);
			cnvc.hweChi = cnvc.hweChicasecontall();
			cnvc.hweExact=cnvc.hweExactcasecontall(sl);
			cnvc.Print("\t", "\t");
			
		}
		/*
		int[][] t = {{1,2,3,4,5,6},{2,3,4,5,6,7}};
		//int[][] t = {{5,10,3},{8,9,2}};
		CNVcalculator cnv = new CNVcalculator(t);

		double[] w = {1,2,3,4,5,6};
		cnv.assoc[0]= cnv.testcnvTrend(w);
		cnv.assoc[2] = cnv.testCNpool();
		boolean cor=false;
		cnv.assoc[1] = cnv.testMWW(cor);
		cnv.hweChi = cnv.hweChicasecontall();
		cnv.Print("\t", "\n");
		*/
		/*
		for(int i=0;i<hwe.length;i++){
			for(int j=0;j<hwe[i].length;j++){
				System.out.print(hwe[i][j]+"\t");
			}
			System.out.println();
			
		}
		*/
	}
	
	public CNVcalculator(int[][] t_){
		t = new double[t_.length][t_[0].length];
		for(int i=0;i<t.length;i++){
			for(int j=0;j<t[i].length;j++){
				t[i][j]=(double)t_[i][j];
			}
		}
		//t = StatUtilsX.MiscUtil.DeepCopyInt2(t_);
		Ng=t[0].length;
		
		Na=(int)(Math.round((Math.sqrt(1+8*(double)Ng)-1)/2));
		//System.out.println(""+Na);
		N=0;
		R=0;
		S=0;
		n = new double[t[0].length];
		for(int i=0;i<t.length;i++){
			for(int j=0;j<t[i].length;j++){
				N+=t[i][j];
				if(i==0){
					R+=t[i][j];
				}else if(i==1){
					S+=t[i][j];
				}
				n[j]+=t[i][j];
			}
		}
		int maxCN=(Na-1)*2;
		
		t2= new int[2][maxCN+1];
		t3=new int[2][Na];
		int a1=0;
		int a2=0;
		
		for(int i=0;i<t[0].length;i++){
			int numcp=a1+a2;
			t2[0][numcp]+=t[0][i];
			t2[1][numcp]+=t[1][i];
			t3[0][a1]+=t[0][i];
			t3[0][a2]+=t[0][i];
			t3[1][a1]+=t[1][i];
			t3[1][a2]+=t[1][i];
			a2++;
			if(a2==Na){
				
				a1++;
				a2=a1;
			}
		}
		int dfX=Ng-Na;
		chidfX = 
			new org.apache.commons.math.distribution.ChiSquaredDistributionImpl(dfX);
		
		hweChi = new double[3][2];
		hweExact = new double[3][2];
		assoc = new double[3][2];
		/*
		for(int i=0;i<t2[0].length;i++){
			System.out.println(t2[0][i]+"\t"+t2[1][i]);
		}
		System.out.println("");
		for(int i=0;i<t3[0].length;i++){
			System.out.println(t3[0][i]+"\t"+t3[1][i]);
		}
		*/
		//w = new double[Ng];
	}

	public CNVcalculator(int[][] t_,String st){
		/*
		 * trendテストをするため
		 * t_はアレル数・ジェノタイプ数の基準を満たしていなくてもよい
		 */
		t = new double[t_.length][t_[0].length];
		for(int i=0;i<t.length;i++){
			for(int j=0;j<t[i].length;j++){
				t[i][j]=(double)t_[i][j];
			}
		}
		//t = StatUtilsX.MiscUtil.DeepCopyInt2(t_);
		
		Ng=t[0].length;
		/*
		Na=(int)(Math.round((Math.sqrt(1+8*(double)Ng)-1)/2));
		//System.out.println(""+Na);
		 */
		N=0;
		R=0;
		S=0;
		n = new double[t[0].length];
		for(int i=0;i<t.length;i++){
			for(int j=0;j<t[i].length;j++){
				N+=t[i][j];
				if(i==0){
					R+=t[i][j];
				}else if(i==1){
					S+=t[i][j];
				}
				n[j]+=t[i][j];
			}
		}
		/*
		int maxCN=(Na-1)*2;
		
		t2= new int[2][maxCN+1];
		t3=new int[2][Na];
		int a1=0;
		int a2=0;
		
		for(int i=0;i<t[0].length;i++){
			int numcp=a1+a2;
			t2[0][numcp]+=t[0][i];
			t2[1][numcp]+=t[1][i];
			t3[0][a1]+=t[0][i];
			t3[0][a2]+=t[0][i];
			t3[1][a1]+=t[1][i];
			t3[1][a2]+=t[1][i];
			a2++;
			if(a2==Na){
				
				a1++;
				a2=a1;
			}
		}
		chidfX = 
			new org.apache.commons.math.distribution.ChiSquaredDistributionImpl(t3[0].length-1);
		
		hweChi = new double[3][2];
		assoc = new double[3][2];
		*/
		/*
		for(int i=0;i<t2[0].length;i++){
			System.out.println(t2[0][i]+"\t"+t2[1][i]);
		}
		System.out.println("");
		for(int i=0;i<t3[0].length;i++){
			System.out.println(t3[0][i]+"\t"+t3[1][i]);
		}
		*/
		//w = new double[Ng];
	}
	public void Print(String sep1,String sep2){
		System.out.println(StringCNVcalculator(sep1,sep2));
	}
	public String StringCNVcalculator(String sep1,String sep2){
		String ret ="";
		ret += "contTable"+sep2;
		for(int i=0;i<t.length;i++){
			for(int j=0;j<t[i].length;j++){
				ret+=t[i][j]+sep1;
			}
			ret+=sep2;
		}
		for(int i=0;i<n.length;i++){
			ret+=n[i]+sep1;
		}
		ret +=sep2;
		ret+="t2"+sep2;
		for(int i=0;i<t2.length;i++){
			for(int j=0;j<t2[i].length;j++){
				ret+=t2[i][j]+sep1;
			}
			ret+=sep2;
		}
		ret+="t3"+sep2;
		for(int i=0;i<t3.length;i++){
			for(int j=0;j<t3[i].length;j++){
				ret+=t3[i][j]+sep1;
			}
			ret+=sep2;
		}
		ret += "R"+sep1+R+sep1+"S"+sep1+S+sep1+"N"+sep1+N+sep2;
		ret += "Na"+sep1+Na+sep2;
		ret += "Ng"+sep1+Ng+sep2;
		
		ret += "hweChi"+sep2;
		for(int i=0;i<hweChi.length;i++){
			for(int j=0;j<hweChi[i].length;j++){
				ret += hweChi[i][j]+sep1;
			}
			ret +=sep2;
		}
		ret += "hweExact"+sep2;
		for(int i=0;i<hweExact.length;i++){
			for(int j=0;j<hweExact[i].length;j++){
				ret += hweExact[i][j]+sep1;
			}
			ret +=sep2;
		}
		ret+="assoc"+sep2;
		for(int i=0;i<assoc.length;i++){
			for(int j=0;j<assoc[i].length;j++){
				ret+=assoc[i][j]+sep1;
			}
			ret+=sep2;
		}
		return ret;
	}
	public double[] testcnvTrend(double[] w){
		double[] ret = new double[2];
		if(N==0){
			return ret;
		}
		
		
		double[] p = new double[Ng];
		double P = R/N;
		for(int i=0;i<p.length;i++){
			if(n[i]!=0){
				p[i]=t[0][i]/n[i];
			}
			
		}
		double[] wn = new double[Ng];
		double W = 0;
		for(int i=0;i<wn.length;i++){
			wn[i]=n[i]*w[i];
			W+=wn[i];
		}
		double Wav=W/N;
		double[] wcor=new double[Ng];
		for(int i=0;i<wcor.length;i++){
			wcor[i]=w[i]-Wav;
		}
		double[] rsqn=new double[Ng];
		double Rsqn=0;
		for(int i=0;i<rsqn.length;i++){
			rsqn[i]=t[0][i]*t[0][i]/n[i];
			Rsqn+=rsqn[i];
		}
		double[] numer=new double[Ng];
		double Numer=0;
		for(int i=0;i<numer.length;i++){
			numer[i]=n[i]*(p[i]-P)*wcor[i];
			Numer+=numer[i];
		}
		double[] denom=new double[Ng];
		double Denom=0;
		for(int i=0;i<denom.length;i++){
			denom[i]=n[i]*wcor[i]*wcor[i];
			Denom+=denom[i];
		}
		ret[0]=Numer*Numer/(Denom*P*(1-P));
		/*
		ret[0]=(N*Math.pow((N*(t[0][1]+2*t[0][2])-R*(n[1]+2*n[2])),2))/
		(R*S*(N*(n[1]+4*n[2])-Math.pow((n[1]+2*n[2]),2)));
	*/
		try{
			ret[1]=1-chidf1.cumulativeProbability(ret[0]);
		} catch (org.apache.commons.math.MathException ex) {
			
		}
		//double s=Math.pow(3,2);
		//System.out.println("cnvtrend\t"+ret[0]+"\t"+ret[1]);
		return ret;
	}
	public double[] testMWW(boolean correct){
		/*
		 * Mann-Whiteney-Willcoxon
		 */
		double[] ret = new double[2];
		/*
		int maxCN=(Na-1)*2;
		
		int[][] t2= new int[2][maxCN+1];
		int a1=Na-1;
		int a2=Na-1;
		
		for(int i=0;i<t[0].length;i++){
			int numcp=a1+a2;
			t2[0][numcp]+=t[0][i];
			t2[1][numcp]+=t[1][i];
			a2--;
			if(a2<0){
				
				a1--;
				a2=a1;
			}
		}
		*/
		int count=0;
		int counted=1;
		double countedNum=S;
		if(R>S){
			count=1;
			counted=0;
			countedNum=R;
		}
		
		double U=0;
		//System.out.println("countedNum="+countedNum);
		for(int i=0;i<t2[count].length;i++){
			countedNum-=t2[counted][i];
			//System.out.println("i="+i+"\t"+countedNum+"\t"+t2[count][i]);
			U+=t2[count][i]*(countedNum+(double)t2[counted][i]/2);
		}
		double tmpU=R*S-U;
		U=Math.min(U,tmpU);
		double[] tie=new double[t2[0].length];
		double Tsum=0;
		for(int i=0;i<t2[0].length;i++){
			tie[i]=t2[0][i]+t2[1][i];
			Tsum+=tie[i]*tie[i]*tie[i]-tie[i];
		}
		double V=R*S*(N*N*N-N-Tsum)/12/(N*N-N);
		double E=R*S/2;
		double cor = 0;
		if(correct){
			cor=0.5;
		}else{
			
		}
		double Z=(Math.abs(U-E)-cor)/Math.sqrt(V);
		ret[0]=Z;
		System.out.print("U=\t"+U+"\t");
		System.out.print("V=\t"+V+"\t");
		System.out.print("E=\t"+E+"\t");
		org.apache.commons.math.distribution.NormalDistributionImpl norm = 
			new org.apache.commons.math.distribution.NormalDistributionImpl();
		try{
			ret[1]= (1-norm.cumulativeProbability(Z))*2;
		} catch (org.apache.commons.math.MathException ex) {
			
		}
		//System.out.println("Z\t"+Z+"\t"+"p\t"+ret[1]);
		return ret;
	}
	public double[] testCNpool(){
		/*
		int maxCN=(Na-1)*2;
		
		int[][] t2= new int[2][maxCN+1];
		int a1=Na-1;
		int a2=Na-1;
		for(int i=0;i<t[0].length;i++){
			int numcp=a1+a2;
			t2[0][numcp]+=t[0][i];
			t2[1][numcp]+=t[1][i];
			a2--;
			if(a2<0){
				
				a1--;
				a2=a1;
			}
		}
		*/
		String st="";
		CNVcalculator cnv=new CNVcalculator(t2,st);
		double[] w= new double[t2[0].length];
		w[0]=2*(Na-1);
		for(int i=1;i<w.length;i++){
			w[i]=w[i-1]-1;
		}
		double[] ret = cnv.testcnvTrend(w);
		
		
		
		return ret;
	}
	public double[][] hweChicasecontall(){
		double[][] ret = new double[3][2];
		
		if(N==0){
			return ret;
		}
		if(R!=0){
			ret[0]=hweChi(t[0],t3[0]);	
		}
		if(S!=0){
			ret[1]=hweChi(t[1],t3[1]);
		}
		int[] n3 = new int[t3[0].length];
		for(int i=0;i<n3.length;i++){
			n3[i]=t3[0][i]+t3[1][i];
		}
		ret[2]=hweChi(n,n3);
		
		return ret;
	}

	public double[][] hweExactcasecontall(double[] sl){
		double[][] ret = new double[3][2];
		
		if(N==0){
			return ret;
		}
		if(R!=0){
			ret[0]=hweExact(t[0],t3[0],sl);	
		}
		if(S!=0){
			ret[1]=hweExact(t[1],t3[1],sl);
		}
		int[] n3 = new int[t3[0].length];
		for(int i=0;i<n3.length;i++){
			n3[i]=t3[0][i]+t3[1][i];
		}
		ret[2]=hweExact(n,n3,sl);
		
		return ret;
	}
	
	public double[] hweChi(double[] t,int[] t3){
		
		double[] ret = new double[2];
		double[] exp=new double[t.length];
		double sum =0;
		for(int i=0;i<t3.length;i++){
			sum+=t3[i];
		}
		int counter=0;
		for(int i=0;i<t3.length;i++){
			for(int j=i;j<t3.length;j++){
				double tmp = t3[i]*t3[j]/(2*sum);
				if(i==j){
					exp[counter]=tmp;
				}else{
					exp[counter]=tmp*2;
				}
				if(exp[counter]!=0){
					ret[0]+=(t[counter]-exp[counter])*(t[counter]-exp[counter])/exp[counter];
					
				}
				counter++;
			}
		}
		/*
		for(int i=0;i<exp.length;i++){
			System.out.println(t[i]+"\t"+exp[i]);
		}
		*/
		try{
			ret[1]=1-chidfX.cumulativeProbability(ret[0]);
		} catch (org.apache.commons.math.MathException ex) {
			
		}
		return ret;
	}

	public double[] hweExact(double[] t,int[] t3,double[] sl){
		int numhetero=0;
		int numchrom=0;
		int numsubj=0;
		int[] limit = new int[t.length];
		int a1=0;
		int a2=0;
		for(int i=0;i<limit.length;i++){
			numchrom+=2*t[i];
			numsubj+=t[i];
			if(a1==a2){
				limit[i]=t3[a1]/2;
			}else{
				limit[i]=Math.min(t3[a1], t3[a2]);
				numhetero+=t[i];
			}
			a2++;
			if(a2==t3.length){
				a1++;
				a2=a1;
			}
		}
		double prob=0;
		double commune=-sl[numchrom]+sl[numsubj];
		for(int i=0;i<t3.length;i++){
			commune+=sl[t3[i]];
		}
		double originalpart=numhetero*sl[2];
		for(int i=0;i<t.length;i++){
			originalpart-=sl[(int)(t[i])];
		}
		//System.out.print(originalpart+"\t");
		/*
		for(int i=0;i<limit.length;i++){
			System.out.println("limit\t"+limit[i]);
		}
		*/
		int[] data=new int[t.length];
		boolean yes=true;
		
		while(yes){
			int tmpnumhetero=0;
			double tablepart=0;
			boolean ok=true;
			int[] allelesum=new int[t3.length];
			int b1=0;
			int b2=0;
			for(int i=0;i<data.length;i++){
				allelesum[b1]+=data[i];
				allelesum[b2]+=data[i];
				if(allelesum[b1]>t3[b1]){
					ok=false;
					break;
				}
				if(allelesum[b2]>t3[b2]){
					ok=false;
					break;
				}
				if(b1!=b2){
					tmpnumhetero+=data[i];
				}
				b2++;
				if(b2>=t3.length){
					b1++;
					b2=b1;
				}
			}
			if(ok){
				for(int i=0;i<t3.length;i++){
					if(allelesum[i]!=t3[i]){
						ok=false;
					}
				}
			}
			//ok=true;
			if(ok){
				/*
				for(int i=0;i<data.length;i++){
					System.out.print(data[i]+"\t");
				}
				*/
				tablepart = tmpnumhetero*sl[2];
				for(int i=0;i<data.length;i++){
					tablepart-=sl[(int)(data[i])];
				}
				//System.out.print(tablepart+"\t");
				double tmpPr=Math.exp(commune+tablepart);
				//System.out.print(tmpPr+"\t");
				//System.out.println();
				if(tablepart<=originalpart){
					prob+=Math.exp(commune+tablepart);
				}
			}
			data[0]++;
			for(int i=0;i<limit.length-1;i++){
				if(data[i]>limit[i]){
					data[i]=0;
					data[i+1]++;
				}
			}
			if(data[data.length-1]==limit[data.length-1]+1){
				yes=false;
			}
		}
		double[] ret = new double[2];
		ret[1]=prob;
		ret[0]=Math.exp(commune+originalpart);
		//System.out.println("Prob\t="+ret[0]+"\t"+ret[1]);
		return ret;
	}

}