Code for: Bing Liu, Wynne Hsu, and Yiming Ma. 1998. Integrating classification and association rule mining. In Proceedings of the Fourth International Conference on Knowledge Discovery and Data Mining (KDD’98), Rakesh Agrawal and Paul Stolorz (Eds.). AAAI Press 80-86.

Note: This uses the development version of arulesCBA

# devtools::install_github('ianjjohnson/arulesCBA')

library("arulesCBA")
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library("RWeka")  # for Discretize
library("caret")  # for confusionMatrix
## Loading required package: lattice
## Loading required package: ggplot2

Iris dataset

# Load the built-in iris data set into the workspace.
data("iris")

Discretize using Fayyad and Irani’s MDL method

# Supervised discretization of the predictor columns, with Species as the
# class attribute (Discretize comes from RWeka).
iris_d <- Discretize(Species ~ ., data = iris)
# Preview the first six discretized rows.
head(iris_d, n = 6L)
##    Sepal.Length   Sepal.Width  Petal.Length  Petal.Width Species
## 1 '(-inf-5.55]'  '(3.35-inf)' '(-inf-2.45]' '(-inf-0.8]'  setosa
## 2 '(-inf-5.55]' '(2.95-3.35]' '(-inf-2.45]' '(-inf-0.8]'  setosa
## 3 '(-inf-5.55]' '(2.95-3.35]' '(-inf-2.45]' '(-inf-0.8]'  setosa
## 4 '(-inf-5.55]' '(2.95-3.35]' '(-inf-2.45]' '(-inf-0.8]'  setosa
## 5 '(-inf-5.55]'  '(3.35-inf)' '(-inf-2.45]' '(-inf-0.8]'  setosa
## 6 '(-inf-5.55]'  '(3.35-inf)' '(-inf-2.45]' '(-inf-0.8]'  setosa

Train classifier on 80% of the data

# Draw a random 80% of the row indices for training; the remaining rows are
# held out for testing. seq_len() replaces 1:nrow() so the index vector is
# well-defined even when the data frame has no rows.
# NOTE(review): no set.seed() call is visible, so the split (and every result
# derived from it) presumably changes from run to run.
train <- sample(
  seq_len(nrow(iris_d)),
  size = as.integer(nrow(iris_d) * 0.8)
)

# Relative support that corresponds to 5 training rows.
5 / length(train) # support
## [1] 0.04166667
# Fit the CBA classifier on the training rows only, then print its summary.
cl <- CBA(Species ~ ., data = iris_d[train, ], supp = 0.05, conf = 0.5)
cl
## CBA Classifier Object
## Class: Species (labels: setosa, versicolor, virginica )
## Default Class: Species=virginica
## Number of rules: 12
## Classification method: first 
## Description: CBA algorithm by Liu, et al. 1998 with support=0.05
##      and confidence=0.5
# List the rules kept in the classifier together with their quality measures.
inspect(cl[["rules"]])
##      lhs                             rhs                  lhs_support    support confidence     lift
## [1]  {Sepal.Width='(-inf-2.95]',                                                                    
##       Petal.Width='(0.8-1.75]'}   => {Species=versicolor}  0.20000000 0.25833333  1.0000000 2.727273
## [2]  {Sepal.Length='(5.55-6.15]',                                                                   
##       Petal.Width='(0.8-1.75]'}   => {Species=versicolor}  0.16666667 0.16666667  1.0000000 2.727273
## [3]  {Sepal.Length='(5.55-6.15]',                                                                   
##       Sepal.Width='(-inf-2.95]',                                                                    
##       Petal.Width='(0.8-1.75]'}   => {Species=versicolor}  0.12500000 0.13333333  1.0000000 2.727273
## [4]  {Sepal.Length='(6.15-inf)',                                                                    
##       Sepal.Width='(-inf-2.95]',                                                                    
##       Petal.Width='(0.8-1.75]'}   => {Species=versicolor}  0.13333333 0.05833333  1.0000000 2.727273
## [5]  {Petal.Width='(0.8-1.75]'}   => {Species=versicolor}  0.23333333 0.35833333  0.9772727 2.665289
## [6]  {Sepal.Length='(6.15-inf)',                                                                    
##       Petal.Width='(0.8-1.75]'}   => {Species=versicolor}  0.20000000 0.11666667  0.9333333 2.545455
## [7]  {Sepal.Length='(-inf-5.55]',                                                                   
##       Sepal.Width='(2.95-3.35]'}  => {Species=setosa}      0.39166667 0.10000000  0.9230769 3.076923
## [8]  {Sepal.Length='(6.15-inf)',                                                                    
##       Sepal.Width='(2.95-3.35]',                                                                    
##       Petal.Length='(4.75-inf)'}  => {Species=virginica}   0.06666667 0.14166667  0.8947368 2.684211
## [9]  {Petal.Length='(4.75-inf)',                                                                    
##       Petal.Width='(0.8-1.75]'}   => {Species=versicolor}  0.20000000 0.04166667  0.8333333 2.272727
## [10] {Sepal.Length='(5.55-6.15]',                                                                   
##       Sepal.Width='(-inf-2.95]'}  => {Species=versicolor}  0.33333333 0.13333333  0.8000000 2.181818
## [11] {Sepal.Length='(-inf-5.55]'} => {Species=setosa}      0.30000000 0.29166667  0.7954545 2.651515
## [12] {Sepal.Width='(3.35-inf)'}   => {Species=setosa}      0.24166667 0.19166667  0.7931034 2.643678

Make predictions for the test data

# Classify the held-out rows (everything that was not sampled into `train`).
pr <- predict(cl, newdata = iris_d[-train, ])
pr
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     versicolor versicolor versicolor versicolor
## [19] versicolor versicolor versicolor virginica  virginica  virginica 
## [25] versicolor virginica  versicolor virginica  versicolor versicolor
## Levels: setosa versicolor virginica
# Cross-tabulate the predictions against the true species of the held-out rows.
confusionMatrix(data = pr, reference = iris_d$Species[-train])
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         14          0         0
##   versicolor      0          6         5
##   virginica       0          0         5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8333          
##                  95% CI : (0.6528, 0.9436)
##     No Information Rate : 0.4667          
##     P-Value [Acc > NIR] : 3.894e-05       
##                                           
##                   Kappa : 0.7449          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.5000
## Specificity                 1.0000            0.7917           1.0000
## Pos Pred Value              1.0000            0.5455           1.0000
## Neg Pred Value              1.0000            1.0000           0.8000
## Prevalence                  0.4667            0.2000           0.3333
## Detection Rate              0.4667            0.2000           0.1667
## Detection Prevalence        0.4667            0.3667           0.1667
## Balanced Accuracy           1.0000            0.8958           0.7500

Zoo dataset

# Load the Zoo data set from the mlbench package and preview the first six rows.
data("Zoo", package = "mlbench")
head(Zoo, n = 6L)
##           hair feathers  eggs  milk airborne aquatic predator toothed
## aardvark  TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## antelope  TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
## bass     FALSE    FALSE  TRUE FALSE    FALSE    TRUE     TRUE    TRUE
## bear      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## boar      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## buffalo   TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
##          backbone breathes venomous  fins legs  tail domestic catsize
## aardvark     TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## antelope     TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## bass         TRUE    FALSE    FALSE  TRUE    0  TRUE    FALSE   FALSE
## bear         TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## boar         TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## buffalo      TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
##            type
## aardvark mammal
## antelope mammal
## bass       fish
## bear     mammal
## boar     mammal
## buffalo  mammal

Discretize legs

# Replace the leg count with a logical flag: TRUE when the count is above zero.
Zoo[["legs"]] <- Zoo[["legs"]] > 0

Train classifier on 80% of the data

# Draw a random 80% of the row indices for training; the remaining rows are
# held out for testing. seq_len() replaces 1:nrow() so the index vector is
# well-defined even when the data frame has no rows.
# NOTE(review): no set.seed() call is visible, so the split (and every result
# derived from it) presumably changes from run to run.
train <- sample(
  seq_len(nrow(Zoo)),
  size = as.integer(nrow(Zoo) * 0.8)
)

# Relative support that corresponds to 5 training rows.
5 / length(train) # support
## [1] 0.0625
# Fit the CBA classifier for `type` on the training rows, then print its summary.
cl <- CBA(type ~ ., data = Zoo[train, ], supp = 0.05, conf = 0.5)
cl
## CBA Classifier Object
## Class: type (labels: mammal, bird, reptile, fish, amphibian, insect, mollusc.et.al )
## Default Class: type=mammal
## Number of rules: 5
## Classification method: first 
## Description: CBA algorithm by Liu, et al. 1998 with support=0.05
##      and confidence=0.5
# List the rules kept in the classifier together with their quality measures.
inspect(cl[["rules"]])
##     lhs                             rhs              lhs_support support
## [1] {eggs,legs,tail}             => {type=bird}      0.1875      0.2125 
## [2] {airborne,tail}              => {type=bird}      0.1125      0.1750 
## [3] {eggs,toothed,legs}          => {type=amphibian} 0.0750      0.0375 
## [4] {fins}                       => {type=fish}      0.0750      0.1000 
## [5] {eggs,aquatic,predator,tail} => {type=fish}      0.0625      0.0875 
##     confidence lift     
## [1] 0.8947368   4.210526
## [2] 0.8750000   4.117647
## [3] 0.7500000  20.000000
## [4] 0.7272727   7.272727
## [5] 0.6363636   6.363636

Make predictions for the test data

# Classify the held-out rows (everything that was not sampled into `train`).
pr <- predict(cl, newdata = Zoo[-train, ])
pr
##  [1] mammal bird   mammal mammal mammal mammal fish   bird   fish   mammal
## [11] bird   bird   mammal mammal mammal fish   fish   fish   mammal fish  
## [21] bird  
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al
# Cross-tabulate the predictions against the true animal type of the held-out rows.
confusionMatrix(data = pr, reference = Zoo$type[-train])
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      mammal bird reptile fish amphibian insect mollusc.et.al
##   mammal             7    0       0    0         0      1             2
##   bird               0    3       1    0         1      0             0
##   reptile            0    0       0    0         0      0             0
##   fish               1    0       0    5         0      0             0
##   amphibian          0    0       0    0         0      0             0
##   insect             0    0       0    0         0      0             0
##   mollusc.et.al      0    0       0    0         0      0             0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7143          
##                  95% CI : (0.4782, 0.8872)
##     No Information Rate : 0.381           
##     P-Value [Acc > NIR] : 0.002017        
##                                           
##                   Kappa : 0.6013          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity                 0.8750      1.0000        0.00000      1.0000
## Specificity                 0.7692      0.8889        1.00000      0.9375
## Pos Pred Value              0.7000      0.6000            NaN      0.8333
## Neg Pred Value              0.9091      1.0000        0.95238      1.0000
## Prevalence                  0.3810      0.1429        0.04762      0.2381
## Detection Rate              0.3333      0.1429        0.00000      0.2381
## Detection Prevalence        0.4762      0.2381        0.00000      0.2857
## Balanced Accuracy           0.8221      0.9444        0.50000      0.9688
##                      Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity                   0.00000       0.00000              0.00000
## Specificity                   1.00000       1.00000              1.00000
## Pos Pred Value                    NaN           NaN                  NaN
## Neg Pred Value                0.95238       0.95238              0.90476
## Prevalence                    0.04762       0.04762              0.09524
## Detection Rate                0.00000       0.00000              0.00000
## Detection Prevalence          0.00000       0.00000              0.00000
## Balanced Accuracy             0.50000       0.50000              0.50000

Compare to C4.5 and CART

Use C4.5 (=J48 from Weka)

# Fit a J48 decision tree on the same training rows and print the tree.
J48(formula = type ~ ., data = Zoo[train, ])
## J48 pruned tree
## ------------------
## 
## feathers = FALSE
## |   milk = FALSE
## |   |   toothed = FALSE
## |   |   |   airborne = FALSE
## |   |   |   |   predator = FALSE
## |   |   |   |   |   legs = FALSE: mollusc.et.al (2.0)
## |   |   |   |   |   legs = TRUE: insect (2.0)
## |   |   |   |   predator = TRUE: mollusc.et.al (6.0)
## |   |   |   airborne = TRUE: insect (5.0)
## |   |   toothed = TRUE
## |   |   |   fins = FALSE
## |   |   |   |   tail = FALSE: amphibian (3.0)
## |   |   |   |   tail = TRUE: reptile (4.0)
## |   |   |   fins = TRUE: fish (8.0)
## |   milk = TRUE: mammal (33.0)
## feathers = TRUE: bird (17.0)
## 
## Number of Leaves  :  9
## 
## Size of the tree :   17

To compare this tree with the CBA rule list, we would have to extract rules from it…

Use CART (= rpart in R)

library("rpart")
# Fit an rpart tree on the same training rows and print its node listing.
rpart(formula = type ~ ., data = Zoo[train, ])
## n= 80 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 80 47 mammal (0.41 0.21 0.05 0.1 0.037 0.087 0.1)  
##    2) milk>=0.5 33  0 mammal (1 0 0 0 0 0 0) *
##    3) milk< 0.5 47 30 bird (0 0.36 0.085 0.17 0.064 0.15 0.17)  
##      6) feathers>=0.5 17  0 bird (0 1 0 0 0 0 0) *
##      7) feathers< 0.5 30 22 fish (0 0 0.13 0.27 0.1 0.23 0.27)  
##       14) fins>=0.5 8  0 fish (0 0 0 1 0 0 0) *
##       15) fins< 0.5 22 14 mollusc.et.al (0 0 0.18 0 0.14 0.32 0.36)  
##         30) toothed>=0.5 7  3 reptile (0 0 0.57 0 0.43 0 0) *
##         31) toothed< 0.5 15  7 mollusc.et.al (0 0 0 0 0 0.47 0.53) *

To compare this tree with the CBA rule list, we would have to extract rules from it…