Code examples for the paper:

Bing Liu, Wynne Hsu and Yiming Ma, Integrating Classification and Association Rule Mining, ACM SIGKDD Conference on Knowledge Discovery and Data Mining, 1998.

# Fix the random seed so that the sample() calls used for the train/test
# splits further down select the same rows on every run.
set.seed(1234)
library("arulesCBA")

## Loading required package: Matrix

## Loading required package: arules

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## Loading required package: discretization

## Loading required package: glmnet

## Loading required package: foreach

## Loaded glmnet 2.0-16

library("caret")  # for confusionMatrix

## Loading required package: lattice

## Loading required package: ggplot2

Example for the Iris Data Set

# Load the iris data set into the workspace.
data(iris)

Discretize using Fayyad and Irani’s MDL method

# Supervised discretization: Species is the class variable (left-hand side of
# the formula) and all remaining columns are cut into intervals.
iris_d <- discretizeDF.supervised(Species ~ ., data = iris)
# Show the first rows to check the resulting interval labels.
head(iris_d)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1  [-Inf,5.55) [3.35, Inf]  [-Inf,2.45)  [-Inf,0.8)  setosa
## 2  [-Inf,5.55) [2.95,3.35)  [-Inf,2.45)  [-Inf,0.8)  setosa
## 3  [-Inf,5.55) [2.95,3.35)  [-Inf,2.45)  [-Inf,0.8)  setosa
## 4  [-Inf,5.55) [2.95,3.35)  [-Inf,2.45)  [-Inf,0.8)  setosa
## 5  [-Inf,5.55) [3.35, Inf]  [-Inf,2.45)  [-Inf,0.8)  setosa
## 6  [-Inf,5.55) [3.35, Inf]  [-Inf,2.45)  [-Inf,0.8)  setosa

Train classifier on 90% of the data

# Randomly select 90% of the row indices (rounded down) as the training set.
# seq_len() replaces 1:nrow(): for an empty data frame 1:0 would yield c(1, 0),
# whereas seq_len(0) is an empty index vector. For non-empty data the sampled
# rows are unchanged.
train <- sample(seq_len(nrow(iris_d)), size = as.integer(nrow(iris_d) * .9))

# Relative support of a rule that covers 5 training rows.
5 / length(train) # support

## [1] 0.03703704

Build a CBA Model

# Build the CBA classifier on the training rows only.
# supp = 0.03 lies just below the 5/length(train) value computed above;
# conf = 0.5 is the confidence setting reported in the model description.
cba_model <- CBA(Species ~ ., data = iris_d[train,], supp = 0.03, conf=0.5)
# Print a summary of the fitted classifier.
cba_model

## CBA Classifier Object
## Class: Species=setosa, Species=versicolor, Species=virginica
## Default Class: Species=setosa
## Number of rules: 10
## Classification method: first 
## Description: CBA algorithm by Liu, et al. 1998 with support=0.03
##      and confidence=0.5

# List the rules kept in the classifier together with their quality measures.
inspect(rules(cba_model))

##      lhs                           rhs                     support confidence     lift count
## [1]  {Petal.Length=[-Inf,2.45)} => {Species=setosa}     0.34074074  1.0000000 2.934783    46
## [2]  {Sepal.Length=[6.15, Inf],                                                             
##       Petal.Width=[1.75, Inf]}  => {Species=virginica}  0.24444444  1.0000000 3.068182    33
## [3]  {Sepal.Length=[5.55,6.15),                                                             
##       Petal.Length=[2.45,4.75)} => {Species=versicolor} 0.13333333  1.0000000 3.000000    18
## [4]  {Sepal.Width=[-Inf,2.95),                                                              
##       Petal.Width=[1.75, Inf]}  => {Species=virginica}  0.11111111  1.0000000 3.068182    15
## [5]  {Sepal.Width=[2.95,3.35),                                                              
##       Petal.Width=[0.8,1.75)}   => {Species=versicolor} 0.08888889  1.0000000 3.000000    12
## [6]  {Sepal.Length=[6.15, Inf],                                                             
##       Petal.Length=[2.45,4.75)} => {Species=versicolor} 0.07407407  1.0000000 3.000000    10
## [7]  {Petal.Width=[1.75, Inf]}  => {Species=virginica}  0.29629630  0.9756098 2.993348    40
## [8]  {Petal.Length=[2.45,4.75)} => {Species=versicolor} 0.28888889  0.9750000 2.925000    39
## [9]  {Sepal.Length=[6.15, Inf],                                                             
##       Petal.Width=[0.8,1.75)}   => {Species=versicolor} 0.10370370  0.9333333 2.800000    14
## [10] {Petal.Width=[0.8,1.75)}   => {Species=versicolor} 0.32592593  0.9166667 2.750000    44

Make predictions for the test data

# Classify the held-out rows: -train selects every row that was not sampled
# into the training set.
pr <- predict(cba_model, newdata = iris_d[-train,])
pr

##  [1] setosa     setosa     setosa     setosa     versicolor versicolor
##  [7] versicolor versicolor versicolor virginica  virginica  virginica 
## [13] virginica  versicolor virginica 
## Levels: setosa versicolor virginica

# Cross-tabulate the predictions against the true species of the held-out
# rows and report the accuracy statistics.
confusionMatrix(data = pr, reference = iris_d$Species[-train])

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa          4          0         0
##   versicolor      0          5         1
##   virginica       0          0         5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.6805, 0.9983)
##     No Information Rate : 0.4             
##     P-Value [Acc > NIR] : 2.523e-05       
##                                           
##                   Kappa : 0.8993          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.8333
## Specificity                 1.0000            0.9000           1.0000
## Pos Pred Value              1.0000            0.8333           1.0000
## Neg Pred Value              1.0000            1.0000           0.9000
## Prevalence                  0.2667            0.3333           0.4000
## Detection Rate              0.2667            0.3333           0.3333
## Detection Prevalence        0.2667            0.4000           0.3333
## Balanced Accuracy           1.0000            0.9500           0.9167

Example for the Zoo Data Set

# Load the Zoo data set from the mlbench package (the package itself is not
# attached) and look at the first rows.
data(Zoo, package = "mlbench")
head(Zoo)

##           hair feathers  eggs  milk airborne aquatic predator toothed
## aardvark  TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## antelope  TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
## bass     FALSE    FALSE  TRUE FALSE    FALSE    TRUE     TRUE    TRUE
## bear      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## boar      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## buffalo   TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
##          backbone breathes venomous  fins legs  tail domestic catsize
## aardvark     TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## antelope     TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## bass         TRUE    FALSE    FALSE  TRUE    0  TRUE    FALSE   FALSE
## bear         TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## boar         TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## buffalo      TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
##            type
## aardvark mammal
## antelope mammal
## bass       fish
## bear     mammal
## boar     mammal
## buffalo  mammal

Discretize legs (replace the number of legs with a logical "has legs" indicator)

# Turn the leg count into a logical flag: TRUE when the animal has any legs.
Zoo[["legs"]] <- Zoo[["legs"]] > 0

Train classifier on 90% of the data

# Randomly select 90% of the row indices (rounded down) as the training set.
# seq_len() replaces 1:nrow(): for an empty data frame 1:0 would yield c(1, 0),
# whereas seq_len(0) is an empty index vector. For non-empty data the sampled
# rows are unchanged.
train <- sample(seq_len(nrow(Zoo)), size = as.integer(nrow(Zoo) * .9))

# Relative support of a rule that covers 3 training rows.
3 / length(train) # support

## [1] 0.03333333

# Build the CBA classifier for the animal type on the training rows only.
# supp = 0.03 lies just below the 3/length(train) value computed above.
cba_model <- CBA(type ~ ., data = Zoo[train,], supp = 0.03, conf=0.5)
# Print a summary of the fitted classifier.
cba_model

## CBA Classifier Object
## Class: type=mammal, type=bird, type=reptile, type=fish, type=amphibian, type=insect, type=mollusc.et.al
## Default Class: type=amphibian
## Number of rules: 5
## Classification method: first 
## Description: CBA algorithm by Liu, et al. 1998 with support=0.03
##      and confidence=0.5

# List the rules kept in the classifier together with their quality measures.
inspect(rules(cba_model))

##     lhs                            rhs              support    confidence
## [1] {milk}                      => {type=mammal}    0.42222222 1         
## [2] {feathers}                  => {type=bird}      0.18888889 1         
## [3] {eggs,fins}                 => {type=fish}      0.12222222 1         
## [4] {eggs,aquatic,toothed,legs} => {type=amphibian} 0.04444444 1         
## [5] {hair,eggs}                 => {type=insect}    0.03333333 1         
##     lift      count
## [1]  2.368421 38   
## [2]  5.294118 17   
## [3]  8.181818 11   
## [4] 22.500000  4   
## [5] 15.000000  3

Make predictions for the test data

# Classify the held-out animals (all rows that are not in the training sample).
pr <- predict(cba_model, newdata = Zoo[-train,])
pr

##  [1] fish      amphibian bird      amphibian mammal    mammal    fish     
##  [8] insect    bird      mammal    bird     
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al

# Compare the CBA predictions with the true animal types of the held-out rows;
# the result is stored so that its overall accuracy can be extracted later.
cm <- confusionMatrix(data = pr, reference = Zoo$type[-train])
cm

## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      mammal bird reptile fish amphibian insect mollusc.et.al
##   mammal             3    0       0    0         0      0             0
##   bird               0    3       0    0         0      0             0
##   reptile            0    0       0    0         0      0             0
##   fish               0    0       0    2         0      0             0
##   amphibian          0    0       0    0         0      1             1
##   insect             0    0       0    0         0      1             0
##   mollusc.et.al      0    0       0    0         0      0             0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8182          
##                  95% CI : (0.4822, 0.9772)
##     No Information Rate : 0.2727          
##     P-Value [Acc > NIR] : 0.0002617       
##                                           
##                   Kappa : 0.7732          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity                 1.0000      1.0000             NA      1.0000
## Specificity                 1.0000      1.0000              1      1.0000
## Pos Pred Value              1.0000      1.0000             NA      1.0000
## Neg Pred Value              1.0000      1.0000             NA      1.0000
## Prevalence                  0.2727      0.2727              0      0.1818
## Detection Rate              0.2727      0.2727              0      0.1818
## Detection Prevalence        0.2727      0.2727              0      0.1818
## Balanced Accuracy           1.0000      1.0000             NA      1.0000
##                      Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity                        NA       0.50000              0.00000
## Specificity                    0.8182       1.00000              1.00000
## Pos Pred Value                     NA       1.00000                  NaN
## Neg Pred Value                     NA       0.90000              0.90909
## Prevalence                     0.0000       0.18182              0.09091
## Detection Rate                 0.0000       0.09091              0.00000
## Detection Prevalence           0.1818       0.09091              0.00000
## Balanced Accuracy                  NA       0.75000              0.50000

# Start the collection of test accuracies, one named entry per classifier,
# beginning with the CBA result.
accuracy <- list()
accuracy[["CBA"]] <- cm$overall["Accuracy"]

Compare to C5.0 rules and Ripper

Compare to C5.0 rules

C5.0 is an improved version of C4.5 which builds a decision tree using information gain for recursive partitioning. Rules are extracted from the tree.

library("C50")
# C5.0 is fitted on a copy of the data in which every logical column is
# recoded as numeric 0/1; non-logical columns (such as the class `type`) are
# left unchanged.
Zoo_numeric <- Zoo
# seq_along() replaces 1:ncol() so the loop is skipped for a zero-column data
# frame, and the nested for/if is braced instead of being chained on one line.
for (i in seq_along(Zoo_numeric)) {
  if (is.logical(Zoo_numeric[[i]])) {
    Zoo_numeric[[i]] <- as.numeric(Zoo_numeric[[i]])
  }
}
# Fit on the training rows; rules = TRUE yields a rule-based model.
C5.0_model <- C5.0(type ~ ., data = Zoo_numeric[train, ], rules = TRUE)
C5.0_model

## 
## Call:
## C5.0.formula(formula = type ~ ., data = Zoo_numeric[train, ], rules = TRUE)
## 
## Rule-Based Model
## Number of samples: 90 
## Number of predictors: 16 
## 
## Number of Rules: 7 
## 
## Non-standard options: attempt to group attributes

# Print the individual rules, the evaluation on the training data and the
# attribute usage.
summary(C5.0_model)

## 
## Call:
## C5.0.formula(formula = type ~ ., data = Zoo_numeric[train, ], rules = TRUE)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Apr  3 13:27:40 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 90 cases (17 attributes) from undefined.data
## 
## Rules:
## 
## Rule 1: (38, lift 2.3)
##  milk > 0
##  ->  class mammal  [0.975]
## 
## Rule 2: (17, lift 5.0)
##  feathers > 0
##  ->  class bird  [0.947]
## 
## Rule 3: (6/1, lift 13.5)
##  feathers <= 0
##  milk <= 0
##  backbone > 0
##  fins <= 0
##  tail > 0
##  ->  class reptile  [0.750]
## 
## Rule 4: (11, lift 7.6)
##  milk <= 0
##  fins > 0
##  ->  class fish  [0.923]
## 
## Rule 5: (3, lift 18.0)
##  milk <= 0
##  backbone > 0
##  tail <= 0
##  ->  class amphibian  [0.800]
## 
## Rule 6: (5, lift 12.9)
##  feathers <= 0
##  milk <= 0
##  airborne > 0
##  ->  class insect  [0.857]
## 
## Rule 7: (10/1, lift 8.3)
##  airborne <= 0
##  backbone <= 0
##  ->  class mollusc.et.al  [0.833]
## 
## Default class: mammal
## 
## 
## Evaluation on training data (90 cases):
## 
##          Rules     
##    ----------------
##      No      Errors
## 
##       7    2( 2.2%)   <<
## 
## 
##     (a)   (b)   (c)   (d)   (e)   (f)   (g)    <-classified as
##    ----  ----  ----  ----  ----  ----  ----
##      38                                        (a): class mammal
##            17                                  (b): class bird
##                   5                            (c): class reptile
##                        11                      (d): class fish
##                   1           3                (e): class amphibian
##                                     5     1    (f): class insect
##                                           9    (g): class mollusc.et.al
## 
## 
##  Attribute usage:
## 
##   70.00% milk
##   31.11% feathers
##   21.11% backbone
##   18.89% fins
##   16.67% airborne
##   10.00% tail
## 
## 
## Time: 0.0 secs

# Predict the held-out rows from the numeric-coded copy, i.e. the same coding
# the model was trained on.
pr <- predict(C5.0_model, newdata = Zoo_numeric[-train,])
pr

##  [1] fish          mollusc.et.al bird          mollusc.et.al mammal       
##  [6] mammal        fish          insect        bird          mammal       
## [11] bird         
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al

# Compare the C5.0 predictions with the true animal types of the held-out
# rows; the result is stored so that its overall accuracy can be extracted.
cm <- confusionMatrix(data = pr, reference = Zoo$type[-train])
cm

## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      mammal bird reptile fish amphibian insect mollusc.et.al
##   mammal             3    0       0    0         0      0             0
##   bird               0    3       0    0         0      0             0
##   reptile            0    0       0    0         0      0             0
##   fish               0    0       0    2         0      0             0
##   amphibian          0    0       0    0         0      0             0
##   insect             0    0       0    0         0      1             0
##   mollusc.et.al      0    0       0    0         0      1             1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9091          
##                  95% CI : (0.5872, 0.9977)
##     No Information Rate : 0.2727          
##     P-Value [Acc > NIR] : 1.883e-05       
##                                           
##                   Kappa : 0.8842          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity                 1.0000      1.0000             NA      1.0000
## Specificity                 1.0000      1.0000              1      1.0000
## Pos Pred Value              1.0000      1.0000             NA      1.0000
## Neg Pred Value              1.0000      1.0000             NA      1.0000
## Prevalence                  0.2727      0.2727              0      0.1818
## Detection Rate              0.2727      0.2727              0      0.1818
## Detection Prevalence        0.2727      0.2727              0      0.1818
## Balanced Accuracy           1.0000      1.0000             NA      1.0000
##                      Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity                        NA       0.50000              1.00000
## Specificity                         1       1.00000              0.90000
## Pos Pred Value                     NA       1.00000              0.50000
## Neg Pred Value                     NA       0.90000              1.00000
## Prevalence                          0       0.18182              0.09091
## Detection Rate                      0       0.09091              0.09091
## Detection Prevalence                0       0.09091              0.18182
## Balanced Accuracy                  NA       0.75000              0.95000

# Record the C5.0 test accuracy alongside the CBA result.
accuracy[["C5.0_rules"]] <- cm$overall["Accuracy"]

Compare with Ripper (Repeated Incremental Pruning to Produce Error Reduction)

Ripper extracts rules directly without building a decision tree first.

library("RWeka")
# Fit the Ripper rule learner on the same training rows, using the Zoo data
# frame directly (not the numeric-coded copy).
ripper_model <- JRip(type ~ ., data = Zoo[train,])
# Print the learned rule list.
ripper_model

## JRIP rules:
## ===========
## 
## (aquatic = TRUE) and (breathes = TRUE) and (tail = FALSE) => type=amphibian (4.0/1.0)
## (hair = FALSE) and (aquatic = FALSE) and (toothed = TRUE) => type=reptile (3.0/0.0)
## (backbone = FALSE) and (airborne = TRUE) => type=insect (5.0/0.0)
## (backbone = FALSE) => type=mollusc.et.al (10.0/1.0)
## (breathes = FALSE) => type=fish (12.0/1.0)
## (feathers = TRUE) => type=bird (17.0/0.0)
##  => type=mammal (39.0/2.0)
## 
## Number of Rules : 7

# Classify the held-out animals (all rows that are not in the training sample).
pr <- predict(ripper_model, newdata = Zoo[-train,])
pr

##  [1] fish          mollusc.et.al bird          mollusc.et.al mammal       
##  [6] mammal        fish          insect        bird          mammal       
## [11] bird         
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al

# Compare the Ripper predictions with the true animal types of the held-out
# rows; the result is stored so that its overall accuracy can be extracted.
cm <- confusionMatrix(data = pr, reference = Zoo$type[-train])
cm

## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      mammal bird reptile fish amphibian insect mollusc.et.al
##   mammal             3    0       0    0         0      0             0
##   bird               0    3       0    0         0      0             0
##   reptile            0    0       0    0         0      0             0
##   fish               0    0       0    2         0      0             0
##   amphibian          0    0       0    0         0      0             0
##   insect             0    0       0    0         0      1             0
##   mollusc.et.al      0    0       0    0         0      1             1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9091          
##                  95% CI : (0.5872, 0.9977)
##     No Information Rate : 0.2727          
##     P-Value [Acc > NIR] : 1.883e-05       
##                                           
##                   Kappa : 0.8842          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity                 1.0000      1.0000             NA      1.0000
## Specificity                 1.0000      1.0000              1      1.0000
## Pos Pred Value              1.0000      1.0000             NA      1.0000
## Neg Pred Value              1.0000      1.0000             NA      1.0000
## Prevalence                  0.2727      0.2727              0      0.1818
## Detection Rate              0.2727      0.2727              0      0.1818
## Detection Prevalence        0.2727      0.2727              0      0.1818
## Balanced Accuracy           1.0000      1.0000             NA      1.0000
##                      Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity                        NA       0.50000              1.00000
## Specificity                         1       1.00000              0.90000
## Pos Pred Value                     NA       1.00000              0.50000
## Neg Pred Value                     NA       0.90000              1.00000
## Prevalence                          0       0.18182              0.09091
## Detection Rate                      0       0.09091              0.09091
## Detection Prevalence                0       0.09091              0.18182
## Balanced Accuracy                  NA       0.75000              0.95000

# Record the Ripper test accuracy alongside the other two results.
accuracy[["Ripper"]] <- cm$overall["Accuracy"]

# Flatten the list into a named vector to show the three accuracies together.
unlist(accuracy)

##        CBA.Accuracy C5.0_rules.Accuracy     Ripper.Accuracy 
##           0.8181818           0.9090909           0.9090909

EMIS/CSE 8331: Code for Classification based on Association Rules (CBA)