2011-10-22

ちょっといろいろ調べてみる

R HWE LD SNP plink GWAViewer

こちらの講義に出席してみる
RのGenetics関係のパッケージのページ(こちら)
RでHWE

library(HardyWeinberg)
x <- c(100, 50, 10)
HW.test <- HWChisq(x, verbose = TRUE)

HW.lrtest <- HWLratio(x, verbose = TRUE)

HW.exacttest <- HWExact(x, verbose = TRUE)

> library(HardyWeinberg)
> x <- c(100, 50, 10)
> HW.test <- HWChisq(x, verbose = TRUE)
Chi-square test with continuity correction for Hardy-Weinberg equilibrium
Chi2 =  0.799458 p-value =  0.3712555 D =  -2.34375 
> 
> HW.lrtest <- HWLratio(x, verbose = TRUE)
G2 = 1.123345 p-value = 0.2891993 
> 
> HW.exacttest <- HWExact(x, verbose = TRUE)
Haldane's Exact test for Hardy-Weinberg equilibrium
sample counts: nAA =  100 nAB =  50 nBB =  10 
H0: HWE (D==0), H1: D <> 0 
D =  -2.34375 p =  0.3671761 
>

# make random compositions that are in HWE

set.seed(123)

m <- 100 # number of markers
n <- 100 # sample size

out <- HWData(n,m)
Xc <- out$Xc
out <- HWTernaryPlot(Xc,100,region=1,vertex.cex=2,signifcolour=TRUE)

library(mapLD)
data(SNPdata)
getLD <- mapLD(SNPdata = SNPdata,
locusID.col = 'markerID',
subjectID.col = 'subjectID',
allele.cols = 1:2,
WhichGene = NA,
outgraph = NA)

plinkの出力ファイル一覧(こちら)

Filename	Main associated command(s)	Description
plink.adjust	 --adjust	 Adjusted significance values (multiple testing)
plink.assoc	 --assoc	 Association results
plink.assoc.hap	 --hap-assoc	 Haplotype-based association results
plink.assoc.linear	 --linear	 Linear regression model
plink.assoc.logistic	 --logistic	 Logistic regression model
plink.assoc.mperm	 --assoc --mperm	 maxT permutation empirical p-values
plink.assoc.perm	 --assoc --perm	 Adaptive permutation empirical p-values
plink.assoc.proxy	 --proxy-assoc	 Proxy association results
plink.assoc.set	 --assoc --set	 Set-based association results
plink.bed	 --make-bed	 Binary PED file
plink.bim	 --make-bed	 Binary MAP file
plink.chap	 --chap	 Conditional haplotype tests
plink.cov	 --write-covar	 Ordered, filtered covariate file
plink.clumped	 --clump	 LD-based results clumping
plink.clumped.best	 --clump-best	 Single best LD-based clumping
plink.clumped.ranges	 --clump-range	 Gene/region report for clumps
plink.cluster0	 --cluster	 Progress of IBS clustering
plink.cluster1	 --cluster	 IBS cluster solution, format 1
plink.cluster2	 --cluster	 IBS cluster solution, format 2
plink.cluster3	 --cluster	 IBS cluster solution, format 3
plink.cluster3.missing	 --cluster-missing	 IBM cluster solution, format 3
plink.cmh	 --mh	 Cochran-Mantel-Haenszel test 1
plink.cmh2	 --mh2	 Cochran-Mantel-Haenszel test 2
plink.cnv.indiv	 --cnv-list	 Copy number variant per individual summary
plink.cnv.overlap	 --cnv-list	 Copy number variant overlap
plink.cnv.summary	 --cnv-list	 Copy number variant summary
plink.cnv.summary.mperm	 --cnv-list	 Copy number variant test
plink.diff	 --merge-mode 6/7	 Difference file
plink.epi-cc1	 --epistasis	 Epistasis: case/control pairwise results
plink.epi-cc2	 --epistasis	 Epistasis: case/control summary results
plink.epi-co1	 --epistasis --case-only	 Epistasis: case-only pairwise results
plink.epi-co2	 --epistasis --case-only	 Epistasis: case-only summary results
plink.fam	 --make-bed	 Binary FAM file
plink.fmendel	 --mendel	 Mendel errors, per family
plink.frq	 --freq	 Allele frequency table
plink.frq.count	 --freq --counts	 Allele counts table
plink.frq.hap	 --hap-freq	 Allele frequency table
plink.genepi.dat	 --genepi	 Gene-based epistasis R dataset
plink.genepi.R	 --genepi	 Gene-based epistasis R script
plink.genome	 --genome	 Genome-wide IBD/IBS pairwise measures
plink.het	 --het	 Individual inbreeding coefficients
plink.hh		 List of heterozygous haploid genotypes (SNPs/individuals)
plink.hom	 --homozyg-snp --homozyg-kb	 Runs of homozygosity
plink.hom.overlap	 --homozyg-group	 Pools of overlapping runs of homozygosity
plink.homog	 --homog	 Between strata homogeneity test
plink.hwe	 --hardy	 Hardy-Weinberg test statistics
plink.imendel	 --mendel	 Mendel errors, per individual
plink.imiss	 --missing	 Missing rates, per individual
plink.info	 --recodeHV	 Info file for Haploview filesets
plink.irem	 --mind	 List of individuals removed for low genotyping
plink.imputed.map	 --hap-impute	 Imputed from multi-marker predictors
plink.impute.ped	 --hap-impute	 Imputed from multi-marker predictors
plink.list	 --list	 Recoded LIST file
plink.lmendel	 --mendel	 Mendel errors, per locus
plink.lmiss	 --missing	 Missing rates, per locus
plink.log		 Log file (always generated)
plink.map	 --recode	 Recoded MAP file
plink.mdist	 --cluster --matrix	 IBS distance matrix
plink.mdist.missing	 --cluster-missing	 IBM distance matrix
plink.mendel	 --mendel	 Mendel errors, per error
plink.mishap	--hap	 List of SNPs that show problem phasing (could not be found or on wrong chromosome)
plink.missing	 --test-missing	 Test of differences in C/C missing rates
plink.missing.hap	 --test-mishap	 Haplotype-based test of non-random genotyping failure
plink.missnp	--merge	 List of SNPs that show strand problems when merging files (more than 2 alleles)
plink.model	 --model	 Full-model association results
plink.model.best.mperm	 --model --mperm	 Best full-model association max(T) permutation results
plink.model.best.perm	 --model --perm	 Best full-model association adaptive permutation results
plink.model.gen.mperm	 --model --mperm --model-gen	 Genotypic association max(T) permutation results
plink.model.gen.perm	 --model --perm --model-gen	 Genotypic association adaptive permutation results
plink.model.dom.mperm	 --model --mperm --model-dom	 Dominant association max(T) permutation results
plink.model.dom.perm	 --model --perm --model-dom	 Dominant association adaptive permutation results
plink.model.trend.mperm	 --model --mperm --model-trend	 Trend test association max(T) permutation results
plink.model.trend.perm	 --model --perm --model-trend	 Trend test association adaptive permutation results
plink.model.rec.mperm	 --model --mperm --model-rec	 Recessive association max(T) permutation results
plink.model.rec.perm	 --model --perm --model-rec	 Recessive association adaptive permutation results
plink.nof		 List of SNPs with no observed founders
plink.nosex		 List of individuals with ambiguous sex code
plink.nearest	--cluster --neighbour	 Nearest neighbour (IBS) statistics
plink.pdump	 --pedigree	 Information on pedigree structure
plink.ped	 --recode	 Recoded PED file
plink.phase-*	 --hap --phase	 Haplotype phases (one file per locus)
plink.plist	 --plist	 Pairwise list of two people's genotypes
plink.proxy.impute	 --proxy-impute	 Proxy imputation output
plink.proxy.impute.dosage	 --proxy-impute --proxy-dosage	 Proxy imputation dosage output
plink.proxy.report	 --proxy-assoc	 Verbose proxy association output
plink.prune.in	 --indep --indep-pairwise	 List of remaining SNPs (i.e. not pruned)
plink.prune.out	 --indep --indep-pairwise	 List of pruned-out SNPs
plink.qassoc	 --assoc	 Quantitative trait association results
plink.qassoc.gxe	 --gxe	 Quantitative trait interaction results
plink.range.report	 --cnv-verbose-report-regions	 Listing of CNVs by genes/regions
plink.raw	 --recodeAD	 Recoded additive/dominance format file
plink.snplist	 --write-snplist	 List of SNPs in the dataset
plink.T2	 --T2	 Hotelling's T(2) test results
plink.tdt	 --tdt	 TDT/parenTDT asymptotic results
plink.tdt.hap	 --tdt	 TDT/parenTDT permutaion results
plink.tdt.mperm	 --tdt	 TDT/parenTDT max(T) permutation results
plink.tdt.perm	 --tdt	 TDT/parenTDT adaptive permutation results
plink.tdt.poo	 --tdt --poo	 TDT parent-of-origin results
plink.tdt.poo.mperm	 --tdt --poo --mperm	 TDT parent-of-origin max(T) permutation results
plink.tdt.poo.perm	 --tdt --poo --perm	 TDT parent-of-origin adaptive permutation results
plink.tdt.poo.set	 --tdt --poo --set --mperm	 TDT parent-of-origin set-based results
plink.tdt.set	 --tdt --set --mperm	 TDT/parenTDT set-based results
plink.tfam	 --transpose / --tfile	 FAM for for transposed fileset
plink.tped	 --transpose / --tfile	 PED file for transposed fileset
plink.twolocus	 --twolocus	 SNP x SNP contingency table

GWAViewer(こちら)

2011-04-03

同じのとり方はいくつある？２

R LD 連鎖不平衡

昨日、「同じ $r^2$ のマーカーの取り方について書いた
$r^2$ とは、マーカー間の関係を数値化したものであるので、それを角度ととらえて、幾何的に考えることにする
球と単体とは、「相互関係がそろっている点」を扱う仕組みであるのは、こちらの記事の通りなので、それを使う
k次元空間でk-1次元多様体であるk次元球面上の点がマーカーに相当するとする
このとき、あるマーカーを含んだN個のマーカーが相互に等しい関係であるようにN個のマーカーを取ることを考える
これは、あるマーカーに相当する点を頂点の一つとするk-1正単体の頂点を決めることに相当するので、 $N=k-1$ がその最大個数である
またあるマーカーと等しい関係にあるマーカーは、そのマーカーを中心としたk-1次元球面上の点に対応する
そのようにしてとる点は、中心マーカーとの関係は同じだが、取られたマーカー同士の関係は遠近取り混ぜられている
以下のＲのソースではgは正単体を作るマーカー、g2はそれに「中心」を置いたもの
以下のソースでは、染色体の本数を自然数で作るためにs,mをはじめとするパラメタの値の与え方に条件があるが、頻度でよければその制約を減らして、同様な方法でハプロタイプ頻度分布を算出することもできる

#N 染色体本数
#p 0アレルの頻度
#P pの逆数
#n 0アレル染色体の本数
#r 組換え染色体の割合
#R rの逆数
#x 0アレル染色体のうちの組換え染色体の本数
#m マーカー数
#s 「中心染色体」と同じアレルを持ち、かつ組換え体である本数

s<-2
m<-3
x<-s*m
R<-6
r<-1/R
n<-x*R
P<-3
p<-1/P
N<-P*n
center<-c(rep(0,N*p),rep(1,N*(1-p)))
g<-rbind(matrix(0,N*p,m),matrix(1,N*(1-p),m))

for(i in 1:m){
	g[((i-1)*x+1):(i*x),i]<-1
	g[(N-(i-1)*x):(N-i*x+1),i]<-0
	center[((i-1)*x+1):((i-1)*x+s)]<-1
	center[(N-(i-1)*x):(N-(i-1)*x-(s-1))]<-1
}
cor(g)
g2<-cbind(center,g)
cor(g2)

2011-04-01

連鎖不平衡にあるマーカーのバリエーション

R LD

サンプルを集めたらアレル頻度0.5のSNPマーカーがいくつかあったとする
今、あるマーカーとのLD関係が $r^2$ で等しいようなマーカーを複数作ることを考える
２通りで作ろう
- １つ目
  - 次のように、すべてのマーカー同士の関係が均等
  - $\frac{\frac{Nh}{2}}{step}$ 個のマーカーが作れる

Nh<-40
step<-2
Nm<-round(Nh/2/step)
hcenter<-c(rep(0,Nh/2),rep(1,Nh/2))

H1<-rbind(matrix(0,Nh/2,Nm),matrix(1,Nh/2,Nm))
for(i in 1:Nm){
	H1[((i-1)*step+1):(i*step),i]<-1
	H1[(((i-1)*step+1)+Nh/2):((i*step)+Nh/2),i]<-0
}

cor(H1)

- ２つ目
  - 中心となるマーカーとの関係が等しいようなマーカーを作る
  - 中心以外のマーカー同士は近かったり遠かったりする
  - $(_{Nh/2}C_{step})^2$ 個のマーカーが作れる

Nm<-6

H2<-hcenter
for(i in 2:Nm){
	t1<-sample(1:(Nh/2),step*2)
	t2<-sample((1+Nh/2):Nh,step*2)
	tmp<-hcenter
	tmp[t1]<-1
	tmp[t2]<-0
	H2<-cbind(H2,tmp)
}

cor(H2)

2009-05-14

真の関連と間接的関連

２ＳＮＰがある。アリル頻度が、 $P(1),P(2)$ とする。この２ＳＮＰのＬＤ関係は $r(1,2)^2$ で表されるとする。２ＳＮＰの作る４ハプロタイプの頻度は、
$h1=P(1)P(2)+r(1,2)\sqrt{P(1)(1-P(1))P(2)(1-P(2))}$
$h2=P(1)(1-P(2))-r(1,2)\sqrt{P(1)(1-P(1))P(2)(1-P(2))}$
$h3=(1-P(1))P(2)-r(1,2)\sqrt{P(1)(1-P(1))P(2)(1-P(2))}$
$h4=(1-P(1))(1-P(2))+r(1,2)\sqrt{P(1)(1-P(1))P(2)(1-P(2))}$
今、有病率P(3)の疾患があって、この疾患と第一のＳＮＰのアリルとに関連があるとする。
もう片方のＳＮＰは、疾患とは関係がないが、ＬＤがあるために、間接的な関連が観察されるものとする。
関連の強さは、相対危険度として $g$ とする。
ケースとコントロールのサンプリングを有病率に忠実に、総サンプル数 $N$ で行ったときの、カイ自乗統計量を $\chi^2$ としたときに、 $N\times r(1,3)^2=\chi^2$ とすると、 $r(1,3)$ を用いて $g$ を表すことができて、それは、
$g=1+\frac{1}{P(1)\frac{\sqrt{\frac{P(3)(1-P(1))}{(1-P(3))P(1)}}}{r(1,3)}-1}$
このとき $r(2,3)=r(1,2)\times r(1,3)$ となる。
その上で、
$h(1,1,1)=\frac{g\times h1}{K}P(3)$
$h(1,2,1)=\frac{g\times h2}{K}P(3)$
$h(2,1,1)=\frac{\times h3}{K}P(3)$
$h(2,2,1)=\frac{\times h4}{K}P(3)$
$h(1,1,2)=h1-\frac{g\times h1}{K}P(3)$
$h(1,2,2)=h2-\frac{g\times h2}{K}P(3)$
$h(2,1,2)=h3-\frac{\times h3}{K}P(3)$
$h(2,2,2)=h4-\frac{\times h4}{K}P(3)$
ただし、
$K=gP(1)+P(2)$