Code examples for the paper:

Mohamed Azmia, George C. Runger, Abdelaziz Berrado. Interpretable regularized class association rules algorithm for classification in a categorical data space, Information Sciences, Vol 483, pp. 313-331, 2019.

The paper uses regularized logistic regression to find a subset of classification rules.

set.seed(1234)

Implemented in package arulesCBA

library("arulesCBA")

## Loading required package: Matrix

## Loading required package: arules

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## 
## Attaching package: 'arulesCBA'

## The following object is masked from 'package:arules':
## 
##     rules

library("arulesViz")

Votes data used in the paper

data("Votes", package = "cba")
dim(Votes)

## [1] 435  17

Votes[1,]

##   handicapped-infants water-project-cost-sharing
## 1                   n                          y
##   adoption-of-the-budget-resolution physician-fee-freeze el-salvador-aid
## 1                                 n                    y               y
##   religious-groups-in-schools anti-satellite-test-ban aid-to-nicaraguan-contras
## 1                           y                       n                         n
##   mx-missile immigration synfuels-corporation-cutback education-spending
## 1          n           y                         <NA>                  y
##   superfund-right-to-sue crime duty-free-exports
## 1                      y     y                 n
##   export-administration-act-south-africa      Class
## 1                                      y republican

RCAR

Example from the paper

rcar_model <- RCAR(Class ~ ., data = Votes, support = 0.1, confidence = .5, lambda = 0.01)
rcar_model

## CBA Classifier Object
## Class: 
## Default Class: NA
## Number of rules: 33
## Classification method: logit  
## Description: RCAR+ based on RCAR (Azmi et al., 2019)

rcar_model$weights

##          democrat  republican
##  [1,]  0.37583671 -0.37583671
##  [2,]  0.10134298 -0.10134298
##  [3,]  0.14102149 -0.14102149
##  [4,]  1.13314895 -1.13314895
##  [5,]  0.32856343 -0.32856343
##  [6,]  0.15493657 -0.15493657
##  [7,] -0.29555020  0.29555020
##  [8,] -0.21016988  0.21016988
##  [9,] -0.64269337  0.64269337
## [10,] -0.13823185  0.13823185
## [11,] -0.11763806  0.11763806
## [12,]  0.02064650 -0.02064650
## [13,]  0.08267695 -0.08267695
## [14,] -0.25150238  0.25150238
## [15,]  0.48347145 -0.48347145
## [16,] -0.01180879  0.01180879
## [17,] -0.05967489  0.05967489
## [18,] -0.04140535  0.04140535
## [19,] -0.40120279  0.40120279
## [20,] -0.15873456  0.15873456
## [21,] -0.06386612  0.06386612
## [22,] -0.15827677  0.15827677
## [23,]  0.01722490 -0.01722490
## [24,]  0.19749264 -0.19749264
## [25,]  0.35295423 -0.35295423
## [26,] -0.01905085  0.01905085
## [27,] -0.45908817  0.45908817
## [28,] -0.56633813  0.56633813
## [29,] -0.10279917  0.10279917
## [30,] -0.01586533  0.01586533
## [31,] -0.01085850  0.01085850
## [32,] -0.06015053  0.06015053
## [33,]  0.28230523 -0.28230523

inspectDT(rules(rcar_model))

Votes[1:5,]

##   handicapped-infants water-project-cost-sharing
## 1                   n                          y
## 2                   n                          y
## 3                <NA>                          y
## 4                   n                          y
## 5                   y                          y
##   adoption-of-the-budget-resolution physician-fee-freeze el-salvador-aid
## 1                                 n                    y               y
## 2                                 n                    y               y
## 3                                 y                 <NA>               y
## 4                                 y                    n            <NA>
## 5                                 y                    n               y
##   religious-groups-in-schools anti-satellite-test-ban aid-to-nicaraguan-contras
## 1                           y                       n                         n
## 2                           y                       n                         n
## 3                           y                       n                         n
## 4                           y                       n                         n
## 5                           y                       n                         n
##   mx-missile immigration synfuels-corporation-cutback education-spending
## 1          n           y                         <NA>                  y
## 2          n           n                            n                  y
## 3          n           n                            y                  n
## 4          n           n                            y                  n
## 5          n           n                            y               <NA>
##   superfund-right-to-sue crime duty-free-exports
## 1                      y     y                 n
## 2                      y     y                 n
## 3                      y     y                 n
## 4                      y     n                 n
## 5                      y     y                 y
##   export-administration-act-south-africa      Class
## 1                                      y republican
## 2                                   <NA> republican
## 3                                      n   democrat
## 4                                      y   democrat
## 5                                      y   democrat

predict(rcar_model, Votes[1:5,])

## [1] republican republican democrat   democrat   democrat  
## Levels: democrat republican

Implementation of RCAR

arulesCBA::RCAR

## function (formula, data, lambda = NULL, alpha = 1, glmnet.args = NULL, 
##     cv.glmnet.args = NULL, parameter = NULL, control = NULL, 
##     balanceSupport = FALSE, disc.method = "mdlp", verbose = FALSE, 
##     ...) 
## {
##     trans <- prepareTransactions(formula, data, disc.method)
##     formula <- as.formula(formula)
##     form <- .parseformula(formula, trans)
##     if (verbose) {
##         glmnet.args$trace.it <- TRUE
##         cv.glmnet.args$trace.it <- TRUE
##     }
##     if (verbose) 
##         cat("* Mining CARs...\n")
##     cars <- mineCARs(formula, trans, parameter = parameter, control = control, 
##         balanceSupport = balanceSupport, verbose = verbose, ...)
##     if (verbose) 
##         cat("* Creating model matrix\n")
##     X <- is.superset(trans, lhs(cars))
##     y <- response(formula, trans)
##     cv <- NULL
##     if (is.null(lambda)) {
##         if (verbose) 
##             cat("* Determine lambda using cross-validation: ")
##         cv <- do.call(glmnet::cv.glmnet, c(list(x = X, y = y, 
##             family = "multinomial", alpha = alpha), cv.glmnet.args))
##         lambda <- cv$lambda.1se
##         if (verbose) 
##             cat(lambda, "\n")
##     }
##     if (verbose) 
##         cat("* Fitting glmnet\n")
##     model <- do.call(glmnet::glmnet, c(list(x = X, y = y, family = "multinomial", 
##         alpha = alpha, lambda = lambda), glmnet.args))
##     weights <- sapply(model$beta, as.vector)
##     remove <- apply(weights, MARGIN = 1, FUN = function(x) all(x == 
##         0))
##     quality(cars)$weight <- apply(weights, MARGIN = 1, max)
##     quality(cars)$oddsratio <- exp(quality(cars)$weight)
##     rulebase <- cars[!remove]
##     weights <- weights[!remove, ]
##     biases <- model$a0
##     if (verbose) 
##         cat("* CARs left:", length(rulebase), "\n")
##     structure(list(formula = formula, discretization = attr(trans, 
##         "disc_info"), rules = rulebase, default = NA, weights = weights, 
##         biases = biases, method = "logit", model = list(all_rules = cars, 
##             reg_model = model, cv = cv), description = paste("RCAR+ based on RCAR (Azmi et al., 2019)")), 
##         class = "CBA")
## }
## <bytecode: 0x55a356da4900>
## <environment: namespace:arulesCBA>

Do it manually

1. create transactions

trans <- prepareTransactions(Class ~ ., data = Votes)

2. mine CARs

cars <- mineCARs(Class ~ ., trans, support = 0.1, confidence = .5)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##       5  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 43 
## 
## set item appearances ...[34 item(s)] done [0.00s].
## set transactions ...[34 item(s), 435 transaction(s)] done [0.00s].
## sorting and recoding items ... [34 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.01s].
## writing ... [6620 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

inspectDT(cars)

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

3. Create the “Rule Space” which is a model matrix for the regression.

Note that cars can be seen as interesting input variables and interactions between variables. The dataset has

nitems(trans)

## [1] 34

variables and if we would like to use all interaction effects up to 5 variables, we would have to consider a model matrix with

choose(nitems(trans), 1:5)

## [1]     34    561   5984  46376 278256

sum(choose(nitems(trans), 1:5))

## [1] 331211

columns. RCAR only considers the car:

length(cars)

## [1] 6620

The model matrix X is a sparse \(|T| \times |Cars|\) logical matrix indicating if a car covers (applies to) a transaction.

X <- is.superset(trans, lhs(cars))
length(trans)

## [1] 435

length(cars)

## [1] 6620

dim(X)

## [1]  435 6620

lets look at the first 10 transactions (rows) and look what rules cover each transaction.

m <- as(X[1:10, 1:15], "matrix")
dimnames(m) <- NULL
m

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12]
##  [1,] TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [2,] TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [3,] TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [4,] TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
##  [5,] TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [6,] TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [7,] TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [8,] TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [9,] TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [10,] TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##       [,13] [,14] [,15]
##  [1,]  TRUE  TRUE  TRUE
##  [2,]  TRUE  TRUE  TRUE
##  [3,]  TRUE FALSE  TRUE
##  [4,]  TRUE  TRUE FALSE
##  [5,] FALSE FALSE  TRUE
##  [6,] FALSE  TRUE  TRUE
##  [7,] FALSE  TRUE  TRUE
##  [8,] FALSE  TRUE  TRUE
##  [9,]  TRUE  TRUE  TRUE
## [10,] FALSE FALSE FALSE

For example, transaction 1 is covered by car 5.

inspect(trans[1])

##     items                                       transactionID
## [1] {handicapped-infants=n,                                  
##      water-project-cost-sharing=y,                           
##      adoption-of-the-budget-resolution=n,                    
##      physician-fee-freeze=y,                                 
##      el-salvador-aid=y,                                      
##      religious-groups-in-schools=y,                          
##      anti-satellite-test-ban=n,                              
##      aid-to-nicaraguan-contras=n,                            
##      mx-missile=n,                                           
##      immigration=y,                                          
##      education-spending=y,                                   
##      superfund-right-to-sue=y,                               
##      crime=y,                                                
##      duty-free-exports=n,                                    
##      export-administration-act-south-africa=y,               
##      Class=republican}                                      1

inspect(cars[5])

##     lhs                                      rhs                  support confidence  coverage     lift count
## [1] {adoption-of-the-budget-resolution=n} => {Class=republican} 0.3264368  0.8304094 0.3931034 2.150167   142

y is a vector indicating what the correct class label for each transaction is

y <- response(Class ~ ., trans)
head(y)

## [1] republican republican democrat   democrat   democrat   democrat  
## Levels: democrat republican

4. Do logistic regression (with LASSO)

fit <- glmnet::glmnet(x = X, y = y, family = "multinomial", alpha = 1)

Each curve corresponds to a variable. It shows the path of its coefficient against the L1-norm of the whole coefficient vector.

plot(fit)

We can get the coefficients for a lambda of 0.01.

cs <- coef(fit, s = 0.01)
str(cs$democrat)

## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:34] 0 3 24 28 31 32 55 91 122 126 ...
##   ..@ p       : int [1:2] 0 34
##   ..@ Dim     : int [1:2] 6621 1
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:6621] "(Intercept)" "{}" "{export-administration-act-south-africa=n}" "{synfuels-corporation-cutback=y}" ...
##   .. ..$ : chr "1"
##   ..@ x       : num [1:34] 0.0045 0.376 0.1005 0.1411 1.1348 ...
##   ..@ factors : list()

str(cs$democrat)

## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:34] 0 3 24 28 31 32 55 91 122 126 ...
##   ..@ p       : int [1:2] 0 34
##   ..@ Dim     : int [1:2] 6621 1
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:6621] "(Intercept)" "{}" "{export-administration-act-south-africa=n}" "{synfuels-corporation-cutback=y}" ...
##   .. ..$ : chr "1"
##   ..@ x       : num [1:34] 0.0045 0.376 0.1005 0.1411 1.1348 ...
##   ..@ factors : list()

Note that we get 34 beta values (an intercept plus 33 rules).

CBA

cba_model <- CBA(Class ~ ., data = Votes, support = 0.1, confidence = .5)
cba_model

## CBA Classifier Object
## Class: 
## Default Class: NA
## Number of rules: 44
## Classification method: first  
## Description: CBA algorithm (Liu et al., 1998)

inspectDT(rules(cba_model))

Votes[1:5,]

##   handicapped-infants water-project-cost-sharing
## 1                   n                          y
## 2                   n                          y
## 3                <NA>                          y
## 4                   n                          y
## 5                   y                          y
##   adoption-of-the-budget-resolution physician-fee-freeze el-salvador-aid
## 1                                 n                    y               y
## 2                                 n                    y               y
## 3                                 y                 <NA>               y
## 4                                 y                    n            <NA>
## 5                                 y                    n               y
##   religious-groups-in-schools anti-satellite-test-ban aid-to-nicaraguan-contras
## 1                           y                       n                         n
## 2                           y                       n                         n
## 3                           y                       n                         n
## 4                           y                       n                         n
## 5                           y                       n                         n
##   mx-missile immigration synfuels-corporation-cutback education-spending
## 1          n           y                         <NA>                  y
## 2          n           n                            n                  y
## 3          n           n                            y                  n
## 4          n           n                            y                  n
## 5          n           n                            y               <NA>
##   superfund-right-to-sue crime duty-free-exports
## 1                      y     y                 n
## 2                      y     y                 n
## 3                      y     y                 n
## 4                      y     n                 n
## 5                      y     y                 y
##   export-administration-act-south-africa      Class
## 1                                      y republican
## 2                                   <NA> republican
## 3                                      n   democrat
## 4                                      y   democrat
## 5                                      y   democrat

predict(cba_model, Votes[1:5,])

## [1] republican republican democrat   democrat   democrat  
## Levels: democrat republican

rulefit (in package pre)

library(pre)

# the code does not like missing values and "-" in column names!
Votes2 <- Votes[complete.cases(Votes),]
colnames(Votes2) <- make.names(colnames(Votes2))

rulefit_model <- pre(Class ~ ., data = Votes2, family = "binomial")
rulefit_model

## 
## Final ensemble with cv error within 1se of minimum: 
##   lambda =  0.02759246
##   number of terms = 7
##   mean cv error (se) = 0.2893241 (0.05065471)
## 
##   cv error type : Binomial Deviance
## 
##                    rule    coefficient
##             (Intercept)  -2.721566e+00
##   physician.fee.freezey   3.515079e+00
##                   rule2   1.502050e+00
##                  rule59   1.315425e-01
##                  rule22   1.055060e-01
##                  rule16   1.044860e-01
##                  rule39   3.691169e-03
##                   rule1  -2.024924e-15
##                                                                          description
##                                                                                    1
##                                                                physician.fee.freezey
##          physician.fee.freeze %in% c("y") & synfuels.corporation.cutback %in% c("n")
##   adoption.of.the.budget.resolution %in% c("n") & superfund.right.to.sue %in% c("y")
##                            physician.fee.freeze %in% c("y") & mx.missile %in% c("n")
##     physician.fee.freeze %in% c("y") & adoption.of.the.budget.resolution %in% c("n")
##        adoption.of.the.budget.resolution %in% c("n") & duty.free.exports %in% c("n")
##                                                     physician.fee.freeze %in% c("n")

Votes2[1:5,]

##    handicapped.infants water.project.cost.sharing
## 6                    n                          y
## 9                    n                          y
## 20                   y                          y
## 24                   y                          y
## 26                   y                          n
##    adoption.of.the.budget.resolution physician.fee.freeze el.salvador.aid
## 6                                  y                    n               y
## 9                                  n                    y               y
## 20                                 y                    n               n
## 24                                 y                    n               n
## 26                                 y                    n               n
##    religious.groups.in.schools anti.satellite.test.ban
## 6                            y                       n
## 9                            y                       n
## 20                           n                       y
## 24                           n                       y
## 26                           n                       y
##    aid.to.nicaraguan.contras mx.missile immigration
## 6                          n          n           n
## 9                          n          n           n
## 20                         y          y           n
## 24                         y          y           n
## 26                         y          y           y
##    synfuels.corporation.cutback education.spending superfund.right.to.sue crime
## 6                             n                  n                      y     y
## 9                             n                  y                      y     y
## 20                            y                  n                      n     n
## 24                            n                  n                      n     n
## 26                            n                  n                      n     n
##    duty.free.exports export.administration.act.south.africa      Class
## 6                  y                                      y   democrat
## 9                  n                                      y republican
## 20                 y                                      y   democrat
## 24                 y                                      y   democrat
## 26                 y                                      y   democrat

predict(rulefit_model, Votes2[1:5,], type = "class")

##            6            9           20           24           26 
##   "democrat" "republican"   "democrat"   "democrat"   "democrat"

CS 8331: Code for Regularized Class Association Rules

Michael Hahsler

2021-11-17