2008-09-02

多次元ベクトルデータの超球面表現とその先

多次元ベクトルデータを持つサンプルがある
サンプル間の関係を相関係数で表した、相関係数行列(サンプル数xサンプル数の正方対象行列、対角成分は1)ができる
この相関係数行列は、サンプル数次元の単位超球面上の点を表現しているとみなせる(A Spherical Representation of a Correlation Matrix, by Bruno Falissard)
- サンプルは、球表面の点
- サンプル間の相関係数〜近さの係数は球面上の大円距離(超球の中心とサンプルを結ぶベクトルのなす角でもある)
多次元データの低次元表現
- ユークリッド低次元座標への射影
  - Rの特異値分解によって、主座標分析的に射影される
- 低次元超球面座標への射影も可能(方法については、『要、調べ物』レベル
密度推定
- サンプルは、超球面上において、また、ユークリッド座標空間において、疎密を持って存在している
  - この疎密から、母集団の密度分布を推定することが可能
    - ユークリッド空間におけるサンプルの場合には、カーネル推定が可能であり、このとき、カーネルに正規分布を仮定することはたいていの場合、悪くない
    - 超球面座標空間における密度分布も可能である(はず)。『要、調べ物(やや難)』のレベル
- 密度分布は、サンプルについても可能。また、サンプルがなにか属性を持っている場合には、その属性についても可能
  - 属性が、疎密のあるサンプル集団の中で、どのような濃度で分布しているかを知りたい場合には、属性の分布をサンプルの分布にて除したものの、空間中での分布を知ればよい
参考サイト
- カーネル推定
  - Rの記事
- 超球面上の点列への小超球面あてはめ：球面最小二乗法
- 球面最小二乗法による球面上の曲線あてはめ
- 超球面探索法による水素結合クラスター構造の系統的探索と熱力学解析に基づく構造転移
- 多次元密度分布推定のためのＲ関数は、おそらくMASS library のkde2d関数(これは、２次元用だが、適宜、拡張できる構造におそらくなっている・・・)
  - ２次元版は以下の通り

function (x, y, h, n = 25, lims = c(range(x), range(y))) 
{
    nx <- length(x)
    if (length(y) != nx) 
        stop("data vectors must be the same length")
    if (any(!is.finite(x)) || any(!is.finite(y))) 
        stop("missing or infinite values in the data are not allowed")
    if (any(!is.finite(lims))) 
        stop("only finite values are allowed in 'lims'")
    gx <- seq.int(lims[1], lims[2], length.out = n)
    gy <- seq.int(lims[3], lims[4], length.out = n)
    if (missing(h)) 
        h <- c(bandwidth.nrd(x), bandwidth.nrd(y))
    h <- h/4
    ax <- outer(gx, x, "-")/h[1]
    ay <- outer(gy, y, "-")/h[2]
    z <- matrix(dnorm(ax), n, nx) %*% t(matrix(dnorm(ay), n, 
        nx))/(nx * h[1] * h[2])
    return(list(x = gx, y = gy, z = z))
}

- - 参考の参考、Ｒのdensity関数

> getS3method("density","default")
function (x, bw = "nrd0", adjust = 1, kernel = c("gaussian", 
    "epanechnikov", "rectangular", "triangular", "biweight", 
    "cosine", "optcosine"), weights = NULL, window = kernel, 
    width, give.Rkern = FALSE, n = 512, from, to, cut = 3, na.rm = FALSE, 
    ...) 
{
    if (length(list(...)) > 0) 
        warning("non-matched further arguments are disregarded")
    if (!missing(window) && missing(kernel)) 
        kernel <- window
    kernel <- match.arg(kernel)
    if (give.Rkern) 
        return(switch(kernel, gaussian = 1/(2 * sqrt(pi)), rectangular = sqrt(3)/6, 
            triangular = sqrt(6)/9, epanechnikov = 3/(5 * sqrt(5)), 
            biweight = 5 * sqrt(7)/49, cosine = 3/4 * sqrt(1/3 - 
                2/pi^2), optcosine = sqrt(1 - 8/pi^2) * pi^2/16))
    if (!is.numeric(x)) 
        stop("argument 'x' must be numeric")
    name <- deparse(substitute(x))
    x <- as.vector(x)
    x.na <- is.na(x)
    if (any(x.na)) {
        if (na.rm) 
            x <- x[!x.na]
        else stop("'x' contains missing values")
    }
    N <- nx <- length(x)
    x.finite <- is.finite(x)
    if (any(!x.finite)) {
        x <- x[x.finite]
        nx <- length(x)
    }
    if (is.null(weights)) {
        weights <- rep.int(1/nx, nx)
        totMass <- nx/N
    }
    else {
        if (length(weights) != N) 
            stop("'x' and 'weights' have unequal length")
        if (!all(is.finite(weights))) 
            stop("'weights' must all be finite")
        if (any(weights < 0)) 
            stop("'weights' must not be negative")
        wsum <- sum(weights)
        if (any(!x.finite)) {
            weights <- weights[x.finite]
            totMass <- sum(weights)/wsum
        }
        else totMass <- 1
        if (!isTRUE(all.equal(1, wsum))) 
            warning("sum(weights) != 1  -- will not get true density")
    }
    n.user <- n
    n <- max(n, 512)
    if (n > 512) 
        n <- 2^ceiling(log2(n))
    if (missing(bw) && !missing(width)) {
        if (is.numeric(width)) {
            fac <- switch(kernel, gaussian = 4, rectangular = 2 * 
                sqrt(3), triangular = 2 * sqrt(6), epanechnikov = 2 * 
                sqrt(5), biweight = 2 * sqrt(7), cosine = 2/sqrt(1/3 - 
                2/pi^2), optcosine = 2/sqrt(1 - 8/pi^2))
            bw <- width/fac
        }
        if (is.character(width)) 
            bw <- width
    }
    if (is.character(bw)) {
        if (nx < 2) 
            stop("need at least 2 points to select a bandwidth automatically")
        bw <- switch(tolower(bw), nrd0 = bw.nrd0(x), nrd = bw.nrd(x), 
            ucv = bw.ucv(x), bcv = bw.bcv(x), sj = , "sj-ste" = bw.SJ(x, 
                method = "ste"), "sj-dpi" = bw.SJ(x, method = "dpi"), 
            stop("unknown bandwidth rule"))
    }
    if (!is.finite(bw)) 
        stop("non-finite 'bw'")
    bw <- adjust * bw
    if (bw <= 0) 
        stop("'bw' is not positive.")
    if (missing(from)) 
        from <- min(x) - cut * bw
    if (missing(to)) 
        to <- max(x) + cut * bw
    if (!is.finite(from)) 
        stop("non-finite 'from'")
    if (!is.finite(to)) 
        stop("non-finite 'to'")
    lo <- from - 4 * bw
    up <- to + 4 * bw
    y <- .C("massdist", x = as.double(x), xmass = as.double(weights), 
        nx = nx, xlo = as.double(lo), xhi = as.double(up), y = double(2 * 
            n), ny = as.integer(n), PACKAGE = "stats")$y * totMass
    kords <- seq.int(0, 2 * (up - lo), length.out = 2 * n)
    kords[(n + 2):(2 * n)] <- -kords[n:2]
    kords <- switch(kernel, gaussian = dnorm(kords, sd = bw), 
        rectangular = {
            a <- bw * sqrt(3)
            ifelse(abs(kords) < a, 0.5/a, 0)
        }, triangular = {
            a <- bw * sqrt(6)
            ax <- abs(kords)
            ifelse(ax < a, (1 - ax/a)/a, 0)
        }, epanechnikov = {
            a <- bw * sqrt(5)
            ax <- abs(kords)
            ifelse(ax < a, 3/4 * (1 - (ax/a)^2)/a, 0)
        }, biweight = {
            a <- bw * sqrt(7)
            ax <- abs(kords)
            ifelse(ax < a, 15/16 * (1 - (ax/a)^2)^2/a, 0)
        }, cosine = {
            a <- bw/sqrt(1/3 - 2/pi^2)
            ifelse(abs(kords) < a, (1 + cos(pi * kords/a))/(2 * 
                a), 0)
        }, optcosine = {
            a <- bw/sqrt(1 - 8/pi^2)
            ifelse(abs(kords) < a, pi/4 * cos(pi * kords/(2 * 
                a))/a, 0)
        })
    kords <- fft(fft(y) * Conj(fft(kords)), inverse = TRUE)
    kords <- pmax.int(0, Re(kords)[1:n]/length(y))
    xords <- seq.int(lo, up, length.out = n)
    x <- seq.int(from, to, length.out = n.user)
    structure(list(x = x, y = approx(xords, kords, x)$y, bw = bw, 
        n = N, call = match.call(), data.name = name, has.na = FALSE), 
        class = "density")
}
<environment: namespace:stats>
>