- 2値の疎なデータにあって、項目間の包含関係を単純な比として数値化する手法
- 大規模なデータであるから、演算自体を非常に単純化してある
- 大規模なデータであるから、項目の組み合わせを限定する基準が必要
- arulesパッケージを使ってみる
- arulesパッケージ
- transactionとは1度の買い物と言う行為のこと。全項目に関する0,1の値のベクトル
- ruleとは のことで、Xは、ルールの antecedent (left-hand-side or LHS) Yは consequent (right-hand-side or RHS) と言う
- 単純な例
-
- support
- lift
install.packages("arules")
library(arules)
data("Epub")
Epub
str(Epub)
summary(Epub)
> str(Epub)
Formal class 'transactions' [package "arules"] with 4 slots
..@ transactionInfo:'data.frame': 15729 obs. of 2 variables:
.. ..$ transactionID: Factor w/ 15732 levels "session_10002",..: 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 ...
.. ..$ TimeStamp : POSIXct[1:15729], format: "2003-01-02 10:59:00" ...
..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
.. .. ..@ i : int [1:25893] 7 199 31 0 64 935 422 0 194 0 ...
.. .. ..@ p : int [1:15730] 0 1 2 3 6 7 8 9 11 12 ...
.. .. ..@ Dim : int [1:2] 936 15729
.. .. ..@ Dimnames:List of 2
.. .. .. ..$ : NULL
.. .. .. ..$ : NULL
.. .. ..@ factors : list()
..@ itemInfo :'data.frame': 936 obs. of 1 variable:
.. ..$ labels:Class 'AsIs' chr [1:936] "doc_11d" "doc_13d" "doc_14c" "doc_14e" ...
..@ itemsetInfo :'data.frame': 15729 obs. of 1 variable:
.. ..$ itemsetID: Factor w/ 15732 levels "session_10002",..: 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 ...
>
head(Epub@transactionInfo)
head(Epub@data,1)
head(Epub@itemInfo)
head(Epub@itemsetInfo)
> head(Epub@transactionInfo)
transactionID TimeStamp
10792 session_4795 2003-01-02 10:59:00
10793 session_4797 2003-01-02 21:46:01
10794 session_479a 2003-01-03 00:50:38
10795 session_47b7 2003-01-03 08:55:50
10796 session_47bb 2003-01-03 11:27:44
10797 session_47c2 2003-01-04 00:18:04
> head(Epub@data,1)
1 x 15729 sparse Matrix of class "ngCMatrix"
[1,] . . . | . | . | . . . . . . . . | . | . . | | | | | | | | | | . . .
[1,] . | . . . | . . . . . . . . . . . . . . . . . . . | . . . . . . . .
> head(Epub@itemInfo)
labels
1 doc_11d
2 doc_13d
3 doc_14c
4 doc_14e
5 doc_150
6 doc_151
> head(Epub@itemsetInfo)
itemsetID
10792 session_4795
10793 session_4797
10794 session_479a
10795 session_47b7
10796 session_47bb
10797 session_47c2
year <- strftime(as.POSIXlt(transactionInfo(Epub)[["TimeStamp"]]), "%Y")
table(year)
Epub2003 <- Epub[year == "2003"]
length(Epub2003)
image(Epub2003)
> year <- strftime(as.POSIXlt(transactionInfo(Epub)[["TimeStamp"]]), "%Y")
> table(year)
year
2003 2004 2005 2006 2007 2008 2009
986 1376 1610 3010 4051 4693 3
>
> Epub2003 <- Epub[year == "2003"]
> length(Epub2003)
[1] 986
> image(Epub2003)
transactionInfo(Epub2003[size(Epub2003) > 20])
inspect(Epub2003[1:5])
as(Epub2003[1:5], "list")
> inspect(Epub2003[1:5])
items transactionID TimeStamp
1 {doc_154} session_4795 2003-01-02 10:59:00
2 {doc_3d6} session_4797 2003-01-02 21:46:01
3 {doc_16f} session_479a 2003-01-03 00:50:38
4 {doc_11d,
doc_1a7,
doc_f4} session_47b7 2003-01-03 08:55:50
5 {doc_83} session_47bb 2003-01-03 11:27:44
>
> as(Epub2003[1:5], "list")
$session_4795
[1] "doc_154"
$session_4797
[1] "doc_3d6"
$session_479a
[1] "doc_16f"
$session_47b7
[1] "doc_11d" "doc_1a7" "doc_f4"
$session_47bb
[1] "doc_83"
EpubTidLists <- as(Epub, "tidLists")
EpubTidLists
head(as(EpubTidLists[1:3], "list"))
> EpubTidLists <- as(Epub, "tidLists")
> EpubTidLists
tidLists in sparse format with
936 items/itemsets (rows) and
15729 transactions (columns)
> head(as(EpubTidLists[1:3], "list"))
$doc_11d
[1] "session_47b7" "session_47c2" "session_47d8"
[4] "session_4855" "session_488d" "session_4898"
[7] "session_489b" "session_489c" "session_48a1"
[10] "session_4897" "session_48a0" "session_489d"
[13] "session_48a5" "session_489a" "session_4896"
[16] "session_48aa" "session_48d0" "session_49de"
[19] "session_4b35" "session_4bac"
data("AdultUCI")
dim(AdultUCI)
AdultUCI[1:2,]
> data("AdultUCI")
> dim(AdultUCI)
[1] 48842 15
> AdultUCI[1:2,]
age workclass fnlwgt education education-num marital-status
1 39 State-gov 77516 Bachelors 13 Never-married
2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
occupation relationship race sex capital-gain capital-loss
1 Adm-clerical Not-in-family White Male 2174 0
2 Exec-managerial Husband White Male 0 0
hours-per-week native-country income
1 40 United-States small
2 13 United-States small
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL
AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),labels = c("Young", "Middle-aged", "Senior", "Old"))
AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],c(0,25,40,60,168)),labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))
AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),Inf)),labels = c("None", "Low", "High"))
AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],c(-Inf,0,median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),Inf)),labels = c("none", "low", "high"))
Adult <- as(AdultUCI, "transactions")
Adult
summary(Adult)
par(mfcol=c(1,2))
itemFrequencyPlot(Adult, support = 0.1, cex.names=0.8)
itemFrequencyPlot(Adult, support = 0.0, cex.names=0.1)
par(mfcol=c(1,1))
rules <- apriori(Adult, parameter = list(support = 0.01, confidence = 0.6,maxlen=10))
rules
summary(rules)
> rules <- apriori(Adult, parameter = list(support = 0.01, confidence = 0.6))
parameter specification:
confidence minval smax arem aval originalSupport support minlen maxlen
0.6 0.1 1 none FALSE TRUE 0.01 1 10
target ext
rules FALSE
algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
apriori - find association rules with the apriori algorithm
version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[115 item(s), 48842 transaction(s)] done [0.03s].
sorting and recoding items ... [67 item(s)] done [0.01s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 4 5 6 7 8 9 10 done [0.84s].
writing ... [276443 rule(s)] done [0.08s].
creating S4 object ... done [0.16s].
> rules
set of 276443 rules
> summary(rules)
set of 276443 rules
rule length distribution (lhs + rhs):sizes
1 2 3 4 5 6 7 8 9 10
6 432 4981 22127 52669 75104 67198 38094 13244 2588
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 5.000 6.000 6.289 7.000 10.000
summary of quality measures:
support confidence lift
Min. :0.01001 Min. :0.6000 Min. : 0.7171
1st Qu.:0.01253 1st Qu.:0.7691 1st Qu.: 1.0100
Median :0.01701 Median :0.9051 Median : 1.0554
Mean :0.02679 Mean :0.8600 Mean : 1.3109
3rd Qu.:0.02741 3rd Qu.:0.9542 3rd Qu.: 1.2980
Max. :0.95328 Max. :1.0000 Max. :20.6826
mining info:
data ntransactions support confidence
Adult 48842 0.01 0.6
rulesIncomeSmall <- subset(rules, subset = rhs %in% "income=small" & lift > 1.2)
rulesIncomeLarge <- subset(rules, subset = rhs %in% "income=large" & lift > 1.2)
inspect(head(SORT(rulesIncomeSmall, by = "confidence"), n = 3))
inspect(head(SORT(rulesIncomeLarge, by = "confidence"), n = 3))
> inspect(head(SORT(rulesIncomeSmall, by = "confidence"), n = 3))
lhs rhs support confidence lift
1 {workclass=Private,
marital-status=Never-married,
relationship=Own-child,
sex=Male,
hours-per-week=Part-time,
native-country=United-States} => {income=small} 0.01074895 0.7104195 1.403653
2 {workclass=Private,
marital-status=Never-married,
relationship=Own-child,
sex=Male,
hours-per-week=Part-time} => {income=small} 0.01144507 0.7102922 1.403402
3 {workclass=Private,
marital-status=Never-married,
relationship=Own-child,
sex=Male,
capital-gain=None,
hours-per-week=Part-time,
native-country=United-States} => {income=small} 0.01046231 0.7097222 1.402276
警告メッセージ:
In .local(x, ...) : arules: SORT is deprecated use sort instead.
> inspect(head(SORT(rulesIncomeLarge, by = "confidence"), n = 3))
lhs rhs support confidence lift
1 {marital-status=Married-civ-spouse,
capital-gain=High,
native-country=United-States} => {income=large} 0.01562180 0.6849192 4.266398
2 {marital-status=Married-civ-spouse,
capital-gain=High,
capital-loss=None,
native-country=United-States} => {income=large} 0.01562180 0.6849192 4.266398
3 {relationship=Husband,
race=White,
capital-gain=High,
native-country=United-States} => {income=large} 0.01302158 0.6846071 4.264454
警告メッセージ:
In .local(x, ...) : arules: SORT is deprecated use sort instead.
- support, confidence, liftの分布をみる
plot(sort(rules@quality$confidence))
plot(sort(rules@quality$support))
plot(sort(rules@quality$lift))
pairs(rules@quality,cex=0.1)
WRITE(rulesIncomeSmall, file = "data.txt", sep = "\t", col.names = NA)
- PMML(Predictive Model Markup Language(Wiki)を使った、探索されたモデルの表記方法で出力する
install.packages("pmml")
library("pmml")
rules_pmml <- pmml(rulesIncomeSmall)
saveXML(rules_pmml, file = "data.xml")
- eclatアルゴリズムを使う
- for all
- 実際にはは単一アイテムのときに最大値を取るから、それを探せばよい
data("Adult")
fsets <- eclat(Adult, parameter = list(support = 0.05), control = list(verbose=FALSE))
singleItems <- fsets[size(items(fsets)) == 1]
singleSupport <- quality(singleItems)$support
names(singleSupport) <- unlist(LIST(items(singleItems),decode = FALSE))
head(singleSupport, n = 5)
-
- そのような単一アイテムを探した結果、66,63,...というアイテムが高い分母をもたらすことがわかる
> head(singleSupport, n = 5)
66 63 111 60 8
0.9532779 0.9173867 0.8974243 0.8550428 0.6941976
-
- すべてのについて、all-confidence(X)を計算する
itemsetList <- LIST(items(fsets), decode = FALSE)
allConfidence <- quality(fsets)$support /sapply(itemsetList, function(x) max(singleSupport[as.character(x)]))
quality(fsets) <- cbind(quality(fsets), allConfidence)
summary(fsets)
fsetsEducation <- subset(fsets, subset = items %pin% "education")
inspect(SORT(fsetsEducation[size(fsetsEducation)>1], by = "allConfidence")[1 : 3])
itemsetList <- LIST(items(fsets), decode = FALSE)
allConfidence <- quality(fsets)$support /sapply(itemsetList, function(x) max(singleSupport[as.character(x)]))
quality(fsets) <- cbind(quality(fsets), allConfidence)
summary(fsets)
fsetsEducation <- subset(fsets, subset = items %pin% "education")
inspect(SORT(fsetsEducation[size(fsetsEducation)>1], by = "allConfidence")[1 : 3])
> summary(fsets)
set of 8496 itemsets
most frequent items:
capital-loss=None native-country=United-States
4082 3973
capital-gain=None race=White
3962 3781
workclass=Private (Other)
3142 21931
element (itemset/transaction) length distribution:sizes
1 2 3 4 5 6 7 8 9 10
36 303 1078 2103 2388 1689 706 171 21 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 4.000 5.000 4.811 6.000 10.000
summary of quality measures:
support allConfidence
Min. :0.05002 Min. :0.05247
1st Qu.:0.06038 1st Qu.:0.06597
Median :0.07546 Median :0.08428
Mean :0.10124 Mean :0.11667
3rd Qu.:0.11279 3rd Qu.:0.12711
Max. :0.95328 Max. :1.00000
includes transaction ID lists: FALSE
mining info:
data ntransactions support
Adult 48842 0.05
>
- 出力がsupportとallConfidenceになってくる
> fsetsEducation <- subset(fsets, subset = items %pin% "education")
> inspect(SORT(fsetsEducation[size(fsetsEducation)>1], by = "allConfidence")[1 : 3])
items support allConfidence
1 {education=HS-grad,
hours-per-week=Full-time} 0.2090209 0.3572453
2 {education=HS-grad,
income=small} 0.1807051 0.3570388
3 {workclass=Private,
education=HS-grad} 0.2391794 0.3445408
警告メッセージ:
In .local(x, ...) : arules: SORT is deprecated use sort instead.
data("Adult")
Adult
supp <- 0.05
epsilon <- 0.1
c <- 0.1
n <- -2 * log(c)/ (supp * epsilon^2)
n
> n
[1] 9210.34
AdultSample <- sample(Adult, n, replace = TRUE)
par(mfcol=c(1,2))
itemFrequencyPlot(AdultSample, population = Adult, support = supp, cex.names = 0.7)
itemFrequencyPlot(AdultSample, population = Adult, support = supp, lift = TRUE, cex.names = 0.9)
par(mfcol=c(1,1))
- 全レコードを使うかサンプルを使うかで、処理時間を比較する
time <- system.time(itemsets <- eclat(Adult, parameter = list(support = supp), control = list(verbose = FALSE)))
time
timeSample <- system.time(itemsetsSample <- eclat(AdultSample,parameter = list(support = supp), control = list(verbose = FALSE)))
timeSample
time[1] / timeSample[1]
> time <- system.time(itemsets <- eclat(Adult, parameter = list(support = supp), control = list(verbose = FALSE)))
> time
user system elapsed
0.32 0.00 0.33
>
> timeSample <- system.time(itemsetsSample <- eclat(AdultSample,parameter = list(support = supp), control = list(verbose = FALSE)))
> timeSample
user system elapsed
0.07 0.00 0.08
> time[1] / timeSample[1]
user.self
4.571429
itemsets
itemsetsSample
match <- match(itemsets, itemsetsSample, nomatch = 0)
> itemsets
set of 8496 itemsets
> itemsetsSample
set of 8352 itemsets
sum(match>0) / length(itemsets)
summary(quality(itemsets[which(!match)])$support)
summary(quality(itemsetsSample[-match])$support)
supportItemsets <- quality(itemsets[which(match > 0)])$support
supportSample <- quality(itemsetsSample[match])$support
accuracy <- 1 - abs(supportSample - supportItemsets) / supportItemsets
summary(accuracy)
> summary(accuracy)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8684 0.9610 0.9784 0.9734 0.9901 1.0000