arulesパッケージ

  • 2値の疎なデータにあって、項目間の包含関係を単純な比として数値化する手法
  • 大規模なデータであるから、演算自体を非常に単純化してある
  • 大規模なデータであるから、項目の組み合わせを限定する基準が必要
  • arulesパッケージを使ってみる
  • arulesパッケージ
  • transactionとは1度の買い物と言う行為のこと。全項目に関する0,1の値のベクトル
  • ruleとはX \Rightarrow Y \text{where} X, Y \subseteq I, X \cap Y = \empty のことで、Xは、ルールの antecedent (left-hand-side or LHS) Yは consequent (right-hand-side or RHS) と言う
  • 単純な例

    • support
      • \text{support}(X) = x+z
      • \text{support}(Y) = y+z
      • \text{support}(X \cup Y)=z
    • \text{confidence} (X \Rightarrow Y = \frac{\text{support}(X \cup Y)}{\text{support}(X)}=\frac{z}{x+z}
    • \text{confidence} (Y \Rightarrow X = \frac{\text{support}(X \cup Y)}{\text{support}(Y)}=\frac{z}{y+z}
    • lift
      • \text{lift}(X \Rightarrow Y)=\frac{\text{support}(X \cup Y)}{\text{support}(X)\text{support}(Y)}
      • \text{lift}(Y \Rightarrow X)=\frac{\text{support}(X \cup Y)}{\text{support}(X)\text{support}(Y)}=\text{lift}(X \Rightarrow Y)
install.packages("arules")
library(arules)
  • データは"transactions"型
# 買い物情報
data("Epub")
Epub
str(Epub)
summary(Epub)
> str(Epub)
Formal class 'transactions' [package "arules"] with 4 slots
  ..@ transactionInfo:'data.frame':     15729 obs. of  2 variables:
  .. ..$ transactionID: Factor w/ 15732 levels "session_10002",..: 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 ...
  .. ..$ TimeStamp    : POSIXct[1:15729], format: "2003-01-02 10:59:00" ...
  ..@ data           :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
  .. .. ..@ i       : int [1:25893] 7 199 31 0 64 935 422 0 194 0 ...
  .. .. ..@ p       : int [1:15730] 0 1 2 3 6 7 8 9 11 12 ...
  .. .. ..@ Dim     : int [1:2] 936 15729
  .. .. ..@ Dimnames:List of 2
  .. .. .. ..$ : NULL
  .. .. .. ..$ : NULL
  .. .. ..@ factors : list()
  ..@ itemInfo       :'data.frame':     936 obs. of  1 variable:
  .. ..$ labels:Class 'AsIs'  chr [1:936] "doc_11d" "doc_13d" "doc_14c" "doc_14e" ...
  ..@ itemsetInfo    :'data.frame':     15729 obs. of  1 variable:
  .. ..$ itemsetID: Factor w/ 15732 levels "session_10002",..: 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 ...
> 
head(Epub@transactionInfo)
head(Epub@data,1)
head(Epub@itemInfo)
head(Epub@itemsetInfo)
# transactionはそのID と名前と時刻情報からなっている
> head(Epub@transactionInfo)
      transactionID           TimeStamp
10792  session_4795 2003-01-02 10:59:00
10793  session_4797 2003-01-02 21:46:01
10794  session_479a 2003-01-03 00:50:38
10795  session_47b7 2003-01-03 08:55:50
10796  session_47bb 2003-01-03 11:27:44
10797  session_47c2 2003-01-04 00:18:04
# dataの中身は.と|とで描かれている
> head(Epub@data,1)
1 x 15729 sparse Matrix of class "ngCMatrix"
                                                                        
[1,] . . . | . | . | . . . . . . . . | . | . . | | | | | | | | | | . . .
                                                                        
[1,] . | . . . | . . . . . . . . . . . . . . . . . . . | . . . . . . . .
> head(Epub@itemInfo)
   labels
1 doc_11d
2 doc_13d
3 doc_14c
4 doc_14e
5 doc_150
6 doc_151
> head(Epub@itemsetInfo)
         itemsetID
10792 session_4795
10793 session_4797
10794 session_479a
10795 session_47b7
10796 session_47bb
10797 session_47c2
# タイムスタンプから年の情報を取り出して、年ごとのtransaction数を集計
year <- strftime(as.POSIXlt(transactionInfo(Epub)[["TimeStamp"]]), "%Y")
table(year)
# 特定の年 2003 年を取り出して格納する
# Epub[1],...,Epub[n]はそれぞれ、第1,...,n番のtransactionである
Epub2003 <- Epub[year == "2003"]
length(Epub2003)
image(Epub2003)
> year <- strftime(as.POSIXlt(transactionInfo(Epub)[["TimeStamp"]]), "%Y")
> table(year)
year
2003 2004 2005 2006 2007 2008 2009 
 986 1376 1610 3010 4051 4693    3 
> # 特定の年 2003 年を取り出して格納する
> Epub2003 <- Epub[year == "2003"]
> length(Epub2003)
[1] 986
> image(Epub2003)
  • 2003年の購買の01パターン

# 一度の買い物に20アイテムよりたくさん買ったものを取り出す
transactionInfo(Epub2003[size(Epub2003) > 20])
# 1st-5th transactionsを示してみる
inspect(Epub2003[1:5])
# 1st-5th transactionsをリストにしてみる
as(Epub2003[1:5], "list")
> inspect(Epub2003[1:5])
  items     transactionID           TimeStamp
1 {doc_154}  session_4795 2003-01-02 10:59:00
2 {doc_3d6}  session_4797 2003-01-02 21:46:01
3 {doc_16f}  session_479a 2003-01-03 00:50:38
4 {doc_11d,                                  
   doc_1a7,                                  
   doc_f4}   session_47b7 2003-01-03 08:55:50
5 {doc_83}   session_47bb 2003-01-03 11:27:44
> # 1st-5th transactionsをリストにしてみる
> as(Epub2003[1:5], "list")
$session_4795
[1] "doc_154"

$session_4797
[1] "doc_3d6"

$session_479a
[1] "doc_16f"

$session_47b7
[1] "doc_11d" "doc_1a7" "doc_f4" 

$session_47bb
[1] "doc_83"
# tidListsというのは"transactionIDLists"というarulesパッケージのオブジェクトの名称
# アイテムごとに、どのtransactionの対象になっているかで扱っている
# それをさらにlist化している
EpubTidLists <- as(Epub, "tidLists")
EpubTidLists
head(as(EpubTidLists[1:3], "list"))
> EpubTidLists <- as(Epub, "tidLists")
> EpubTidLists
tidLists in sparse format with
 936 items/itemsets (rows) and
 15729 transactions (columns)
> head(as(EpubTidLists[1:3], "list"))
$doc_11d
  [1] "session_47b7"    "session_47c2"    "session_47d8"   
  [4] "session_4855"    "session_488d"    "session_4898"   
  [7] "session_489b"    "session_489c"    "session_48a1"   
 [10] "session_4897"    "session_48a0"    "session_489d"   
 [13] "session_48a5"    "session_489a"    "session_4896"   
 [16] "session_48aa"    "session_48d0"    "session_49de"   
 [19] "session_4b35"    "session_4bac"    
# ある名簿の性別・年齢・収入などの情報
data("AdultUCI")
dim(AdultUCI)
AdultUCI[1:2,]
> data("AdultUCI")
> dim(AdultUCI)
[1] 48842    15
> AdultUCI[1:2,]
  age        workclass fnlwgt education education-num     marital-status
1  39        State-gov  77516 Bachelors            13      Never-married
2  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
       occupation  relationship  race  sex capital-gain capital-loss
1    Adm-clerical Not-in-family White Male         2174            0
2 Exec-managerial       Husband White Male            0            0
  hours-per-week native-country income
1             40  United-States  small
2             13  United-States  small
# 0,1データに整形する
# "fnlwgt","education-num"はなかったことにする(なかったことにしたかったらしい)
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL
# 年齢・仕事時間・capital-gain・capital-lossを順序カテゴリにする
AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),labels = c("Young", "Middle-aged", "Senior", "Old"))
AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],c(0,25,40,60,168)),labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))
AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),Inf)),labels = c("None", "Low", "High"))
AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],c(-Inf,0,median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),Inf)),labels = c("none", "low", "high"))
# transactionsに適したデータ構成にしたので、型変更する
# それぞれのカテゴリ(none,low,highなど)を1アイテムとして与える
Adult <- as(AdultUCI, "transactions")
Adult
summary(Adult)
# アイテムごとに、出現頻度を示す
# 該当transactionsが全transactionsのsupportを下限としてプロットする
par(mfcol=c(1,2))
itemFrequencyPlot(Adult, support = 0.1, cex.names=0.8)
# 下限supportを変更すする
itemFrequencyPlot(Adult, support = 0.0, cex.names=0.1)
par(mfcol=c(1,1))

# support と confidenceを与えて、その条件での全ルールを取り出す関数apriori()
# apriori algorithm([http://en.wikipedia.org/wiki/Apriori_algorithm:title=Wiki])というアルゴリズムを使っている
# defaultの条件は support 0.1, confidence 0.8, and maxlen 10
rules <- apriori(Adult, parameter = list(support = 0.01, confidence = 0.6,maxlen=10))
rules
summary(rules)
> rules <- apriori(Adult, parameter = list(support = 0.01, confidence = 0.6))

parameter specification:
 confidence minval smax arem  aval originalSupport support minlen maxlen
        0.6    0.1    1 none FALSE            TRUE    0.01      1     10
 target   ext
  rules FALSE

algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

apriori - find association rules with the apriori algorithm
version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[115 item(s), 48842 transaction(s)] done [0.03s].
sorting and recoding items ... [67 item(s)] done [0.01s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 4 5 6 7 8 9 10 done [0.84s].
writing ... [276443 rule(s)] done [0.08s].
creating S4 object  ... done [0.16s].
> rules
set of 276443 rules 
> summary(rules)
set of 276443 rules

rule length distribution (lhs + rhs):sizes
    1     2     3     4     5     6     7     8     9    10 
    6   432  4981 22127 52669 75104 67198 38094 13244  2588 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   5.000   6.000   6.289   7.000  10.000 

summary of quality measures:
    support          confidence          lift        
 Min.   :0.01001   Min.   :0.6000   Min.   : 0.7171  
 1st Qu.:0.01253   1st Qu.:0.7691   1st Qu.: 1.0100  
 Median :0.01701   Median :0.9051   Median : 1.0554  
 Mean   :0.02679   Mean   :0.8600   Mean   : 1.3109  
 3rd Qu.:0.02741   3rd Qu.:0.9542   3rd Qu.: 1.2980  
 Max.   :0.95328   Max.   :1.0000   Max.   :20.6826  

mining info:
  data ntransactions support confidence
 Adult         48842    0.01        0.6
# rulesのうち、ある条件を満たすもののみを取り出す
rulesIncomeSmall <- subset(rules, subset = rhs %in% "income=small" & lift > 1.2)
rulesIncomeLarge <- subset(rules, subset = rhs %in% "income=large" & lift > 1.2)
inspect(head(SORT(rulesIncomeSmall, by = "confidence"), n = 3))
inspect(head(SORT(rulesIncomeLarge, by = "confidence"), n = 3))
> inspect(head(SORT(rulesIncomeSmall, by = "confidence"), n = 3))
  lhs                               rhs               support confidence     lift
1 {workclass=Private,                                                            
   marital-status=Never-married,                                                 
   relationship=Own-child,                                                       
   sex=Male,                                                                     
   hours-per-week=Part-time,                                                     
   native-country=United-States} => {income=small} 0.01074895  0.7104195 1.403653
2 {workclass=Private,                                                            
   marital-status=Never-married,                                                 
   relationship=Own-child,                                                       
   sex=Male,                                                                     
   hours-per-week=Part-time}     => {income=small} 0.01144507  0.7102922 1.403402
3 {workclass=Private,                                                            
   marital-status=Never-married,                                                 
   relationship=Own-child,                                                       
   sex=Male,                                                                     
   capital-gain=None,                                                            
   hours-per-week=Part-time,                                                     
   native-country=United-States} => {income=small} 0.01046231  0.7097222 1.402276
 警告メッセージ: 
In .local(x, ...) : arules: SORT is deprecated use sort instead.
> inspect(head(SORT(rulesIncomeLarge, by = "confidence"), n = 3))
  lhs                                    rhs               support confidence     lift
1 {marital-status=Married-civ-spouse,                                                 
   capital-gain=High,                                                                 
   native-country=United-States}      => {income=large} 0.01562180  0.6849192 4.266398
2 {marital-status=Married-civ-spouse,                                                 
   capital-gain=High,                                                                 
   capital-loss=None,                                                                 
   native-country=United-States}      => {income=large} 0.01562180  0.6849192 4.266398
3 {relationship=Husband,                                                              
   race=White,                                                                        
   capital-gain=High,                                                                 
   native-country=United-States}      => {income=large} 0.01302158  0.6846071 4.264454
 警告メッセージ: 
In .local(x, ...) : arules: SORT is deprecated use sort instead.
  • support, confidence, liftの分布をみる
plot(sort(rules@quality$confidence))
plot(sort(rules@quality$support))
plot(sort(rules@quality$lift))

pairs(rules@quality,cex=0.1)




# ルールを書き出す
WRITE(rulesIncomeSmall, file = "data.txt", sep = "\t", col.names = NA)
  • PMML(Predictive Model Markup Language(Wiki)を使った、探索されたモデルの表記方法で出力する
install.packages("pmml")
library("pmml")
rules_pmml <- pmml(rulesIncomeSmall)
saveXML(rules_pmml, file = "data.xml")
  • eclatアルゴリズムを使う
    • \text{conf}(I \Rightarrow X|I) \ge \text{all-confidence}(X) for all I \subset X
    • \text{all-confidence}(X)=\frac{\text{support}(X)}{\text{max}_{I \subset X} \text{support}(I)}
    • 実際には\text{max}_{I \subset X} \text{support}(I)は単一アイテムのときに最大値を取るから、それを探せばよい
data("Adult")
fsets <- eclat(Adult, parameter = list(support = 0.05), control = list(verbose=FALSE))

singleItems <- fsets[size(items(fsets)) == 1]

singleSupport <- quality(singleItems)$support
names(singleSupport) <- unlist(LIST(items(singleItems),decode = FALSE))
head(singleSupport, n = 5)
    • そのような単一アイテムを探した結果、66,63,...というアイテムが高い分母をもたらすことがわかる
> head(singleSupport, n = 5)
       66        63       111        60         8 
0.9532779 0.9173867 0.8974243 0.8550428 0.6941976 
    • すべてのXについて、all-confidence(X)を計算する
itemsetList <- LIST(items(fsets), decode = FALSE)
allConfidence <- quality(fsets)$support /sapply(itemsetList, function(x) max(singleSupport[as.character(x)]))
quality(fsets) <- cbind(quality(fsets), allConfidence)
summary(fsets)

fsetsEducation <- subset(fsets, subset = items %pin% "education")
inspect(SORT(fsetsEducation[size(fsetsEducation)>1], by = "allConfidence")[1 : 3])
itemsetList <- LIST(items(fsets), decode = FALSE)
allConfidence <- quality(fsets)$support /sapply(itemsetList, function(x) max(singleSupport[as.character(x)]))
quality(fsets) <- cbind(quality(fsets), allConfidence)
summary(fsets)

fsetsEducation <- subset(fsets, subset = items %pin% "education")
inspect(SORT(fsetsEducation[size(fsetsEducation)>1], by = "allConfidence")[1 : 3])
> summary(fsets)
set of 8496 itemsets

most frequent items:
           capital-loss=None native-country=United-States 
                        4082                         3973 
           capital-gain=None                   race=White 
                        3962                         3781 
           workclass=Private                      (Other) 
                        3142                        21931 

element (itemset/transaction) length distribution:sizes
   1    2    3    4    5    6    7    8    9   10 
  36  303 1078 2103 2388 1689  706  171   21    1 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   4.000   5.000   4.811   6.000  10.000 

summary of quality measures:
    support        allConfidence    
 Min.   :0.05002   Min.   :0.05247  
 1st Qu.:0.06038   1st Qu.:0.06597  
 Median :0.07546   Median :0.08428  
 Mean   :0.10124   Mean   :0.11667  
 3rd Qu.:0.11279   3rd Qu.:0.12711  
 Max.   :0.95328   Max.   :1.00000  

includes transaction ID lists: FALSE 

mining info:
  data ntransactions support
 Adult         48842    0.05
> 
  • 出力がsupportとallConfidenceになってくる
> fsetsEducation <- subset(fsets, subset = items %pin% "education")
> inspect(SORT(fsetsEducation[size(fsetsEducation)>1], by = "allConfidence")[1 : 3])
  items                        support allConfidence
1 {education=HS-grad,                               
   hours-per-week=Full-time} 0.2090209     0.3572453
2 {education=HS-grad,                               
   income=small}             0.1807051     0.3570388
3 {workclass=Private,                               
   education=HS-grad}        0.2391794     0.3445408
 警告メッセージ: 
In .local(x, ...) : arules: SORT is deprecated use sort instead.
  • サンプリング
# 適当なサンプル数を求める
data("Adult")
Adult
supp <- 0.05
epsilon <- 0.1
c <- 0.1
n <- -2 * log(c)/ (supp * epsilon^2)
n
> n
[1] 9210.34
# サンプリングしている
AdultSample <- sample(Adult, n, replace = TRUE)
# オリジナルデータセットと抽出セットとの異同を2種類のプロットで示す
par(mfcol=c(1,2))
# 頻度で
itemFrequencyPlot(AdultSample, population = Adult, support = supp, cex.names = 0.7)
# lift ratioで
itemFrequencyPlot(AdultSample, population = Adult, support = supp, lift = TRUE, cex.names = 0.9)
par(mfcol=c(1,1))

  • 全レコードを使うかサンプルを使うかで、処理時間を比較する
time <- system.time(itemsets <- eclat(Adult, parameter = list(support = supp), control = list(verbose = FALSE)))
time

timeSample <- system.time(itemsetsSample <- eclat(AdultSample,parameter = list(support = supp), control = list(verbose = FALSE)))
timeSample
time[1] / timeSample[1]
> time <- system.time(itemsets <- eclat(Adult, parameter = list(support = supp), control = list(verbose = FALSE)))
> time
   user  system elapsed 
   0.32    0.00    0.33 
> 
> timeSample <- system.time(itemsetsSample <- eclat(AdultSample,parameter = list(support = supp), control = list(verbose = FALSE)))
> timeSample
   user  system elapsed 
   0.07    0.00    0.08 
> time[1] / timeSample[1]
user.self 
 4.571429 
# データ全体 vs. サンプルで、見つけた買い物のアイテムセットの数を比較する
itemsets
itemsetsSample
# データ全体 vs. サンプルで、見つかったアイテムセットの一致具合を比較する
match <- match(itemsets, itemsetsSample, nomatch = 0)
# よく登場するアイテムに関しては、サンプルでの計算で代用できることがわかる
> itemsets
set of 8496 itemsets 
> itemsetsSample
set of 8352 itemsets 
## remove no matches
sum(match>0) / length(itemsets)
summary(quality(itemsets[which(!match)])$support)
summary(quality(itemsetsSample[-match])$support)

# 見つけ損ねるのは、サンプルにすることで登場頻度で足切りされる場合らしいことを示す
supportItemsets <- quality(itemsets[which(match > 0)])$support
supportSample <- quality(itemsetsSample[match])$support
accuracy <- 1 - abs(supportSample - supportItemsets) / supportItemsets
summary(accuracy)
> summary(accuracy)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.8684  0.9610  0.9784  0.9734  0.9901  1.0000