- データのベクトル化と重み付け
- TF*IDF(term frequency-inverse document frequency)
/**
 * Applies TF-IDF weighting to a matrix whose rows are vectors and whose
 * columns are terms.
 *
 * <p>For each column {@code j}, {@code n} is the number of rows with a
 * non-zero value in that column and {@code N} is the number of rows. Every
 * cell is multiplied by the column's IDF factor:
 * <ul>
 *   <li>{@code type == 0}: {@code log(N / n)}</li>
 *   <li>{@code type == 1}: {@code log((N + 1) / n)}</li>
 * </ul>
 * A column that is zero in every row ({@code n == 0}) gets weight 0, so its
 * cells stay 0 instead of becoming {@code NaN} from {@code 0 * log(N / 0)}.
 *
 * @param vs   input matrix; the column count is taken from the first row
 * @param type IDF variant (0 or 1); any other value yields an all-zero matrix
 * @return a new matrix of weighted values with {@code vs.length} rows and
 *         {@code vs[0].length} columns; an empty matrix when {@code vs} has
 *         no rows
 */
public static double[][] tfidf(double[][] vs, int type){
    // Nothing to weight; also avoids indexing vs[0] on an empty matrix.
    if (vs.length == 0) {
        return new double[0][0];
    }

    final int rows = vs.length;
    final int cols = vs[0].length;
    double[][] ret = new double[rows][cols];

    // Unknown variants leave the result all zeros.
    if (type != 0 && type != 1) {
        return ret;
    }

    // Count, per column, how many rows have a non-zero entry.
    int[] counter = new int[cols];
    for (double[] row : vs) {
        for (int j = 0; j < row.length; j++) {
            if (row[j] != 0) {
                counter[j]++;
            }
        }
    }

    // The IDF factor depends only on the column, so compute it once per column.
    final double numerator = (type == 0) ? rows : rows + 1;
    double[] idf = new double[cols];
    for (int j = 0; j < cols; j++) {
        // counter[j] == 0 means the column is entirely zero: keep weight 0.
        if (counter[j] > 0) {
            idf[j] = Math.log(numerator / (double) counter[j]);
        }
    }

    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < vs[i].length; j++) {
            ret[i][j] = vs[i][j] * idf[j];
        }
    }
    return ret;
}
/**
 * Computes the cosine similarity of two vectors:
 * {@code (a . b) / (|a| * |b|)}.
 *
 * <p>The loop runs over the indices of {@code a}, so {@code b} must have at
 * least {@code a.length} elements; any extra elements of {@code b} are ignored.
 * If either vector has zero norm the similarity is undefined and 0 is
 * returned instead of {@code NaN}.
 *
 * @param a first vector
 * @param b second vector
 * @return the cosine similarity, or 0 when either vector has zero norm
 */
public static double cosmeasure(double[] a, double[] b){
    double dot = 0;
    double sqNormA = 0;
    double sqNormB = 0;
    for (int i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        sqNormA += a[i] * a[i];
        sqNormB += b[i] * b[i];
    }
    // A zero vector has no direction; avoid 0/0 = NaN.
    if (sqNormA == 0 || sqNormB == 0) {
        return 0.0;
    }
    // Take the roots separately so the product of the squared norms cannot
    // overflow before the square root is applied.
    return dot / (Math.sqrt(sqNormA) * Math.sqrt(sqNormB));
}
-
- Jaccard係数のn次元拡張(たぶん、これで合っている)
/**
 * Computes the extended (Tanimoto) Jaccard coefficient of two real-valued
 * vectors: {@code (a . b) / (|a|^2 + |b|^2 - a . b)}.
 *
 * <p>For 0/1 vectors this equals the set Jaccard coefficient
 * {@code |A and B| / |A or B|}, and identical non-zero vectors score exactly 1.
 * The loop runs over the indices of {@code a}, so {@code b} must have at
 * least {@code a.length} elements. When both vectors are all zero the
 * denominator is 0 and 0 is returned instead of {@code NaN}.
 *
 * @param a first vector
 * @param b second vector
 * @return the extended Jaccard coefficient, or 0 when both vectors are zero
 */
public static double jaccardN(double[] a,double[] b){
    double dot = 0;
    double sqNormA = 0;
    double sqNormB = 0;
    for (int i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        sqNormA += a[i] * a[i];
        sqNormB += b[i] * b[i];
    }
    // |a|^2 + |b|^2 - a.b is zero only when both vectors are zero.
    double denominator = sqNormA + sqNormB - dot;
    if (denominator == 0) {
        return 0.0;
    }
    return dot / denominator;
}
- 疎の行列の書式:Matrix market coordinate format
- 0が多い行列を効率よく表現するフォーマット(こちらに記事)