Code examples for the paper:
Bing Liu, Wynne Hsu and Yiming Ma, Integrating Classification and Association Rule Mining, ACM SIGKDD Conference on Knowledge Discovery and Data Mining, 1998.
# Fix the RNG seed so the random train/test splits drawn by sample() below
# are reproducible.
set.seed(1234)
library("arulesCBA")
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: discretization
## Loading required package: glmnet
## Loading required package: foreach
## Loaded glmnet 2.0-16
library("caret") # for confusionMatrix
## Loading required package: lattice
## Loading required package: ggplot2
data(iris)
Discretize using Fayyad and Irani’s MDL method
# Supervised discretization: every predictor on the right-hand side of the
# formula is cut into intervals, with Species as the class variable.
iris_d <- discretizeDF.supervised(Species ~ ., data = iris)
head(iris_d) # the four measurements now hold intervals instead of numbers
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 [-Inf,5.55) [3.35, Inf] [-Inf,2.45) [-Inf,0.8) setosa
## 2 [-Inf,5.55) [2.95,3.35) [-Inf,2.45) [-Inf,0.8) setosa
## 3 [-Inf,5.55) [2.95,3.35) [-Inf,2.45) [-Inf,0.8) setosa
## 4 [-Inf,5.55) [2.95,3.35) [-Inf,2.45) [-Inf,0.8) setosa
## 5 [-Inf,5.55) [3.35, Inf] [-Inf,2.45) [-Inf,0.8) setosa
## 6 [-Inf,5.55) [3.35, Inf] [-Inf,2.45) [-Inf,0.8) setosa
Train classifier on 90% of the data
# Draw a random 90% of the row indices for training; the rows that are not
# drawn serve as the test set. seq_len() is used instead of 1:nrow() because
# 1:nrow() would yield c(1, 0) for a zero-row data frame.
train <- sample(seq_len(nrow(iris_d)), size = as.integer(nrow(iris_d) * 0.9))
5 / length(train) # support: fraction of the training set that 5 rows make up
## [1] 0.03703704
# Build the CBA classifier from the training rows only, with support = 0.03
# and confidence = 0.5; printing the object shows a short model summary.
cba_model <- CBA(Species ~ ., data = iris_d[train,], supp = 0.03, conf=0.5)
cba_model
## CBA Classifier Object
## Class: Species=setosa, Species=versicolor, Species=virginica
## Default Class: Species=setosa
## Number of rules: 10
## Classification method: first
## Description: CBA algorithm by Liu, et al. 1998 with support=0.03
## and confidence=0.5
inspect(rules(cba_model)) # list the rules stored in the classifier
## lhs rhs support confidence lift count
## [1] {Petal.Length=[-Inf,2.45)} => {Species=setosa} 0.34074074 1.0000000 2.934783 46
## [2] {Sepal.Length=[6.15, Inf],
## Petal.Width=[1.75, Inf]} => {Species=virginica} 0.24444444 1.0000000 3.068182 33
## [3] {Sepal.Length=[5.55,6.15),
## Petal.Length=[2.45,4.75)} => {Species=versicolor} 0.13333333 1.0000000 3.000000 18
## [4] {Sepal.Width=[-Inf,2.95),
## Petal.Width=[1.75, Inf]} => {Species=virginica} 0.11111111 1.0000000 3.068182 15
## [5] {Sepal.Width=[2.95,3.35),
## Petal.Width=[0.8,1.75)} => {Species=versicolor} 0.08888889 1.0000000 3.000000 12
## [6] {Sepal.Length=[6.15, Inf],
## Petal.Length=[2.45,4.75)} => {Species=versicolor} 0.07407407 1.0000000 3.000000 10
## [7] {Petal.Width=[1.75, Inf]} => {Species=virginica} 0.29629630 0.9756098 2.993348 40
## [8] {Petal.Length=[2.45,4.75)} => {Species=versicolor} 0.28888889 0.9750000 2.925000 39
## [9] {Sepal.Length=[6.15, Inf],
## Petal.Width=[0.8,1.75)} => {Species=versicolor} 0.10370370 0.9333333 2.800000 14
## [10] {Petal.Width=[0.8,1.75)} => {Species=versicolor} 0.32592593 0.9166667 2.750000 44
Make predictions for the test data
# iris_d[-train, ] selects exactly the rows that were not sampled for training.
pr <- predict(cba_model, newdata = iris_d[-train,])
pr
## [1] setosa setosa setosa setosa versicolor versicolor
## [7] versicolor versicolor versicolor virginica virginica virginica
## [13] virginica versicolor virginica
## Levels: setosa versicolor virginica
# Cross-tabulate the predictions (data) against the true labels (reference).
confusionMatrix(reference = iris_d[-train,]$Species, data = pr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 4 0 0
## versicolor 0 5 1
## virginica 0 0 5
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.6805, 0.9983)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 2.523e-05
##
## Kappa : 0.8993
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8333
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9000
## Prevalence 0.2667 0.3333 0.4000
## Detection Rate 0.2667 0.3333 0.3333
## Detection Prevalence 0.2667 0.4000 0.3333
## Balanced Accuracy 1.0000 0.9500 0.9167
# Load the Zoo data set from the mlbench package without attaching the package.
data(Zoo, package = "mlbench")
head(Zoo)
## hair feathers eggs milk airborne aquatic predator toothed
## aardvark TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## antelope TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## bass FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## bear TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## boar TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## buffalo TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## backbone breathes venomous fins legs tail domestic catsize
## aardvark TRUE TRUE FALSE FALSE 4 FALSE FALSE TRUE
## antelope TRUE TRUE FALSE FALSE 4 TRUE FALSE TRUE
## bass TRUE FALSE FALSE TRUE 0 TRUE FALSE FALSE
## bear TRUE TRUE FALSE FALSE 4 FALSE FALSE TRUE
## boar TRUE TRUE FALSE FALSE 4 TRUE FALSE TRUE
## buffalo TRUE TRUE FALSE FALSE 4 TRUE FALSE TRUE
## type
## aardvark mammal
## antelope mammal
## bass fish
## bear mammal
## boar mammal
## buffalo mammal
Discretize legs: replace the number of legs with a logical indicator (does the animal have legs or not)
# Replace the leg count by a logical flag: TRUE when the animal has any legs.
Zoo[["legs"]] <- Zoo[["legs"]] > 0
Train classifier on 90% of the data
# Draw a random 90% of the row indices for training; the rows that are not
# drawn serve as the test set. seq_len() is used instead of 1:nrow() because
# 1:nrow() would yield c(1, 0) for a zero-row data frame.
train <- sample(seq_len(nrow(Zoo)), size = as.integer(nrow(Zoo) * 0.9))
3 / length(train) # support: fraction of the training set that 3 rows make up
## [1] 0.03333333
# Build the CBA classifier for the animal type from the training rows only,
# with support = 0.03 and confidence = 0.5.
cba_model <- CBA(type ~ ., data = Zoo[train,], supp = 0.03, conf=0.5)
cba_model
## CBA Classifier Object
## Class: type=mammal, type=bird, type=reptile, type=fish, type=amphibian, type=insect, type=mollusc.et.al
## Default Class: type=amphibian
## Number of rules: 5
## Classification method: first
## Description: CBA algorithm by Liu, et al. 1998 with support=0.03
## and confidence=0.5
inspect(rules(cba_model)) # list the rules stored in the classifier
## lhs rhs support confidence
## [1] {milk} => {type=mammal} 0.42222222 1
## [2] {feathers} => {type=bird} 0.18888889 1
## [3] {eggs,fins} => {type=fish} 0.12222222 1
## [4] {eggs,aquatic,toothed,legs} => {type=amphibian} 0.04444444 1
## [5] {hair,eggs} => {type=insect} 0.03333333 1
## lift count
## [1] 2.368421 38
## [2] 5.294118 17
## [3] 8.181818 11
## [4] 22.500000 4
## [5] 15.000000 3
Make predictions for the test data
# Zoo[-train, ] selects exactly the rows that were not sampled for training.
pr <- predict(cba_model, newdata = Zoo[-train,])
pr
## [1] fish amphibian bird amphibian mammal mammal fish
## [8] insect bird mammal bird
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al
# Keep the confusion matrix object so its overall accuracy can be reused later.
cm <- confusionMatrix(reference = Zoo[-train,]$type, data = pr)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction mammal bird reptile fish amphibian insect mollusc.et.al
## mammal 3 0 0 0 0 0 0
## bird 0 3 0 0 0 0 0
## reptile 0 0 0 0 0 0 0
## fish 0 0 0 2 0 0 0
## amphibian 0 0 0 0 0 1 1
## insect 0 0 0 0 0 1 0
## mollusc.et.al 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8182
## 95% CI : (0.4822, 0.9772)
## No Information Rate : 0.2727
## P-Value [Acc > NIR] : 0.0002617
##
## Kappa : 0.7732
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity 1.0000 1.0000 NA 1.0000
## Specificity 1.0000 1.0000 1 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.0000
## Prevalence 0.2727 0.2727 0 0.1818
## Detection Rate 0.2727 0.2727 0 0.1818
## Detection Prevalence 0.2727 0.2727 0 0.1818
## Balanced Accuracy 1.0000 1.0000 NA 1.0000
## Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity NA 0.50000 0.00000
## Specificity 0.8182 1.00000 1.00000
## Pos Pred Value NA 1.00000 NaN
## Neg Pred Value NA 0.90000 0.90909
## Prevalence 0.0000 0.18182 0.09091
## Detection Rate 0.0000 0.09091 0.00000
## Detection Prevalence 0.1818 0.09091 0.00000
## Balanced Accuracy NA 0.75000 0.50000
# Start a list that collects the test-set accuracy of each classifier.
accuracy <- list(CBA = cm$overall["Accuracy"])
C5.0 is an improved version of C4.5, which builds a decision tree using information gain for recursive partitioning. Rules are extracted from the tree.
library("C50")
# Work on a copy of Zoo in which every logical column is recoded as numeric
# (FALSE -> 0, TRUE -> 1); all other columns are left untouched.
Zoo_numeric <- Zoo
# seq_along() is used instead of 1:ncol() because 1:ncol() would iterate over
# c(1, 0) for a data frame without columns.
for (i in seq_along(Zoo_numeric)) {
  if (is.logical(Zoo_numeric[[i]])) {
    Zoo_numeric[[i]] <- as.numeric(Zoo_numeric[[i]])
  }
}
# Fit C5.0 on the numeric training rows; rules = TRUE requests a rule-based
# model instead of a tree.
C5.0_model <- C5.0(type ~ ., data = Zoo_numeric[train, ], rules = TRUE)
C5.0_model
##
## Call:
## C5.0.formula(formula = type ~ ., data = Zoo_numeric[train, ], rules = TRUE)
##
## Rule-Based Model
## Number of samples: 90
## Number of predictors: 16
##
## Number of Rules: 7
##
## Non-standard options: attempt to group attributes
summary(C5.0_model) # prints the rules, training-set errors and attribute usage
##
## Call:
## C5.0.formula(formula = type ~ ., data = Zoo_numeric[train, ], rules = TRUE)
##
##
## C5.0 [Release 2.07 GPL Edition] Wed Apr 3 13:27:40 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 90 cases (17 attributes) from undefined.data
##
## Rules:
##
## Rule 1: (38, lift 2.3)
## milk > 0
## -> class mammal [0.975]
##
## Rule 2: (17, lift 5.0)
## feathers > 0
## -> class bird [0.947]
##
## Rule 3: (6/1, lift 13.5)
## feathers <= 0
## milk <= 0
## backbone > 0
## fins <= 0
## tail > 0
## -> class reptile [0.750]
##
## Rule 4: (11, lift 7.6)
## milk <= 0
## fins > 0
## -> class fish [0.923]
##
## Rule 5: (3, lift 18.0)
## milk <= 0
## backbone > 0
## tail <= 0
## -> class amphibian [0.800]
##
## Rule 6: (5, lift 12.9)
## feathers <= 0
## milk <= 0
## airborne > 0
## -> class insect [0.857]
##
## Rule 7: (10/1, lift 8.3)
## airborne <= 0
## backbone <= 0
## -> class mollusc.et.al [0.833]
##
## Default class: mammal
##
##
## Evaluation on training data (90 cases):
##
## Rules
## ----------------
## No Errors
##
## 7 2( 2.2%) <<
##
##
## (a) (b) (c) (d) (e) (f) (g) <-classified as
## ---- ---- ---- ---- ---- ---- ----
## 38 (a): class mammal
## 17 (b): class bird
## 5 (c): class reptile
## 11 (d): class fish
## 1 3 (e): class amphibian
## 5 1 (f): class insect
## 9 (g): class mollusc.et.al
##
##
## Attribute usage:
##
## 70.00% milk
## 31.11% feathers
## 21.11% backbone
## 18.89% fins
## 16.67% airborne
## 10.00% tail
##
##
## Time: 0.0 secs
# Predict on the held-out rows, using the same numeric recoding as in training.
pr <- predict(C5.0_model, newdata = Zoo_numeric[-train,])
pr
## [1] fish mollusc.et.al bird mollusc.et.al mammal
## [6] mammal fish insect bird mammal
## [11] bird
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al
# Zoo_numeric differs from Zoo only in its logical columns, so the reference
# labels can be taken from Zoo directly.
cm <- confusionMatrix(reference = Zoo[-train,]$type, data = pr)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction mammal bird reptile fish amphibian insect mollusc.et.al
## mammal 3 0 0 0 0 0 0
## bird 0 3 0 0 0 0 0
## reptile 0 0 0 0 0 0 0
## fish 0 0 0 2 0 0 0
## amphibian 0 0 0 0 0 0 0
## insect 0 0 0 0 0 1 0
## mollusc.et.al 0 0 0 0 0 1 1
##
## Overall Statistics
##
## Accuracy : 0.9091
## 95% CI : (0.5872, 0.9977)
## No Information Rate : 0.2727
## P-Value [Acc > NIR] : 1.883e-05
##
## Kappa : 0.8842
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity 1.0000 1.0000 NA 1.0000
## Specificity 1.0000 1.0000 1 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.0000
## Prevalence 0.2727 0.2727 0 0.1818
## Detection Rate 0.2727 0.2727 0 0.1818
## Detection Prevalence 0.2727 0.2727 0 0.1818
## Balanced Accuracy 1.0000 1.0000 NA 1.0000
## Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity NA 0.50000 1.00000
## Specificity 1 1.00000 0.90000
## Pos Pred Value NA 1.00000 0.50000
## Neg Pred Value NA 0.90000 1.00000
## Prevalence 0 0.18182 0.09091
## Detection Rate 0 0.09091 0.09091
## Detection Prevalence 0 0.09091 0.18182
## Balanced Accuracy NA 0.75000 0.95000
accuracy[["C5.0_rules"]] <- cm$overall["Accuracy"] # record C5.0's test accuracy
RIPPER extracts rules directly without building a decision tree first.
library("RWeka")
# Fit JRip on the training rows of the original Zoo data (logical predictors
# are passed as-is); printing the model lists the learned rules.
ripper_model <- JRip(type ~ ., data = Zoo[train,])
ripper_model
## JRIP rules:
## ===========
##
## (aquatic = TRUE) and (breathes = TRUE) and (tail = FALSE) => type=amphibian (4.0/1.0)
## (hair = FALSE) and (aquatic = FALSE) and (toothed = TRUE) => type=reptile (3.0/0.0)
## (backbone = FALSE) and (airborne = TRUE) => type=insect (5.0/0.0)
## (backbone = FALSE) => type=mollusc.et.al (10.0/1.0)
## (breathes = FALSE) => type=fish (12.0/1.0)
## (feathers = TRUE) => type=bird (17.0/0.0)
## => type=mammal (39.0/2.0)
##
## Number of Rules : 7
# Zoo[-train, ] selects exactly the rows that were not sampled for training.
pr <- predict(ripper_model, newdata = Zoo[-train,])
pr
## [1] fish mollusc.et.al bird mollusc.et.al mammal
## [6] mammal fish insect bird mammal
## [11] bird
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al
# Cross-tabulate the predictions (data) against the true labels (reference).
cm <- confusionMatrix(reference = Zoo[-train,]$type, data = pr)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction mammal bird reptile fish amphibian insect mollusc.et.al
## mammal 3 0 0 0 0 0 0
## bird 0 3 0 0 0 0 0
## reptile 0 0 0 0 0 0 0
## fish 0 0 0 2 0 0 0
## amphibian 0 0 0 0 0 0 0
## insect 0 0 0 0 0 1 0
## mollusc.et.al 0 0 0 0 0 1 1
##
## Overall Statistics
##
## Accuracy : 0.9091
## 95% CI : (0.5872, 0.9977)
## No Information Rate : 0.2727
## P-Value [Acc > NIR] : 1.883e-05
##
## Kappa : 0.8842
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity 1.0000 1.0000 NA 1.0000
## Specificity 1.0000 1.0000 1 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.0000
## Prevalence 0.2727 0.2727 0 0.1818
## Detection Rate 0.2727 0.2727 0 0.1818
## Detection Prevalence 0.2727 0.2727 0 0.1818
## Balanced Accuracy 1.0000 1.0000 NA 1.0000
## Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity NA 0.50000 1.00000
## Specificity 1 1.00000 0.90000
## Pos Pred Value NA 1.00000 0.50000
## Neg Pred Value NA 0.90000 1.00000
## Prevalence 0 0.18182 0.09091
## Detection Rate 0 0.09091 0.09091
## Detection Prevalence 0 0.09091 0.18182
## Balanced Accuracy NA 0.75000 0.95000
accuracy[["Ripper"]] <- cm$overall["Accuracy"] # record JRip's test accuracy
# Flatten the list into one named vector to compare the three classifiers.
unlist(accuracy)
## CBA.Accuracy C5.0_rules.Accuracy Ripper.Accuracy
## 0.8181818 0.9090909 0.9090909