small n large p

  • 100 samples x 25000 genes
  • You can predict the response perfectly once you are allowed to use as many explanatory variables as there are samples.
# Toy example: regress a random binary response on as many random binary
# explanatory variables as there are samples.
n <- 10  # number of samples
m <- 10  # number of explanatory variables

# Draw the response first and the predictors second, both uniformly from {0, 1}.
p <- sample(0:1, size = n, replace = TRUE)
g <- matrix(
  sample(0:1, size = n * m, replace = TRUE),
  nrow = n,
  ncol = m
)

# Show the simulated data.
p
g

# Ordinary least squares with every column of g as a predictor, followed by
# the fitted values for the same samples.
lm.out <- lm(p ~ g)
p.est <- predict(lm.out)

# Observed versus fitted response: as a scatter plot, as raw numbers, and with
# the fitted values rounded to the nearest integer.
plot(p, p.est)
print(cbind(p, p.est))
print(cbind(p, round(p.est)))

# Rounded coefficient estimates.
print(round(coef(lm.out)))
> p
 [1] 1 1 0 0 0 1 1 0 0 0
> g
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
 [1,]    0    1    1    0    1    1    0    1    1     0
 [2,]    1    0    0    0    0    1    1    0    0     1
 [3,]    0    0    0    0    1    1    1    0    0     1
 [4,]    0    1    1    0    1    1    1    0    0     1
 [5,]    0    1    0    0    0    0    1    0    0     1
 [6,]    1    1    1    0    0    0    1    1    0     0
 [7,]    1    0    0    0    0    0    1    0    0     0
 [8,]    1    0    1    0    0    0    1    1    0     0
 [9,]    0    1    1    1    1    0    0    0    0     0
[10,]    1    0    0    0    1    1    1    0    0     0
> print(cbind(p,p.est))
   p         p.est
1  1  1.000000e+00
2  1  1.000000e+00
3  0 -3.333333e-01
4  0  8.881784e-16
5  0  3.333333e-01
6  1  6.666667e-01
7  1  6.666667e-01
8  0  3.333333e-01
9  0  1.110223e-16
10 0  3.333333e-01
> print(cbind(p,round(p.est)))
   p  
1  1 1
2  1 1
3  0 0
4  0 0
5  0 0
6  1 1
7  1 1
8  0 0
9  0 0
10 0 0
> 
> print(round(lm.out$coefficients))
(Intercept)          g1          g2          g3          g4          g5          g6          g7          g8 
          1           1           0           0          -1          -1           0          -1           0 
         g9         g10 
         NA          NA 
  • When you have p >> n explanatory variables, you will find many sets of variables, each of which perfectly predicts the dependent variable.