Code examples for the paper:
Mohamed Azmia, George C. Runger, Abdelaziz Berrado. Interpretable regularized class association rules algorithm for classification in a categorical data space, Information Sciences, Vol 483, pp. 313-331, 2019.
The paper uses regularized logistic regression to find a subset of classification rules.
set.seed(1234)
Implemented in package arulesCBA
library("arulesCBA")
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
##
## Attaching package: 'arulesCBA'
## The following object is masked from 'package:arules':
##
## rules
library("arulesViz")
Votes data used in the paper
data("Votes", package = "cba")
dim(Votes)
## [1] 435 17
Votes[1,]
## handicapped-infants water-project-cost-sharing
## 1 n y
## adoption-of-the-budget-resolution physician-fee-freeze el-salvador-aid
## 1 n y y
## religious-groups-in-schools anti-satellite-test-ban aid-to-nicaraguan-contras
## 1 y n n
## mx-missile immigration synfuels-corporation-cutback education-spending
## 1 n y <NA> y
## superfund-right-to-sue crime duty-free-exports
## 1 y y n
## export-administration-act-south-africa Class
## 1 y republican
rcar_model <- RCAR(Class ~ ., data = Votes, support = 0.1, confidence = .5, lambda = 0.01)
rcar_model
## CBA Classifier Object
## Class:
## Default Class: NA
## Number of rules: 33
## Classification method: logit
## Description: RCAR+ based on RCAR (Azmi et al., 2019)
rcar_model$weights
## democrat republican
## [1,] 0.37583671 -0.37583671
## [2,] 0.10134298 -0.10134298
## [3,] 0.14102149 -0.14102149
## [4,] 1.13314895 -1.13314895
## [5,] 0.32856343 -0.32856343
## [6,] 0.15493657 -0.15493657
## [7,] -0.29555020 0.29555020
## [8,] -0.21016988 0.21016988
## [9,] -0.64269337 0.64269337
## [10,] -0.13823185 0.13823185
## [11,] -0.11763806 0.11763806
## [12,] 0.02064650 -0.02064650
## [13,] 0.08267695 -0.08267695
## [14,] -0.25150238 0.25150238
## [15,] 0.48347145 -0.48347145
## [16,] -0.01180879 0.01180879
## [17,] -0.05967489 0.05967489
## [18,] -0.04140535 0.04140535
## [19,] -0.40120279 0.40120279
## [20,] -0.15873456 0.15873456
## [21,] -0.06386612 0.06386612
## [22,] -0.15827677 0.15827677
## [23,] 0.01722490 -0.01722490
## [24,] 0.19749264 -0.19749264
## [25,] 0.35295423 -0.35295423
## [26,] -0.01905085 0.01905085
## [27,] -0.45908817 0.45908817
## [28,] -0.56633813 0.56633813
## [29,] -0.10279917 0.10279917
## [30,] -0.01586533 0.01586533
## [31,] -0.01085850 0.01085850
## [32,] -0.06015053 0.06015053
## [33,] 0.28230523 -0.28230523
inspectDT(rules(rcar_model))
Votes[1:5,]
## handicapped-infants water-project-cost-sharing
## 1 n y
## 2 n y
## 3 <NA> y
## 4 n y
## 5 y y
## adoption-of-the-budget-resolution physician-fee-freeze el-salvador-aid
## 1 n y y
## 2 n y y
## 3 y <NA> y
## 4 y n <NA>
## 5 y n y
## religious-groups-in-schools anti-satellite-test-ban aid-to-nicaraguan-contras
## 1 y n n
## 2 y n n
## 3 y n n
## 4 y n n
## 5 y n n
## mx-missile immigration synfuels-corporation-cutback education-spending
## 1 n y <NA> y
## 2 n n n y
## 3 n n y n
## 4 n n y n
## 5 n n y <NA>
## superfund-right-to-sue crime duty-free-exports
## 1 y y n
## 2 y y n
## 3 y y n
## 4 y n n
## 5 y y y
## export-administration-act-south-africa Class
## 1 y republican
## 2 <NA> republican
## 3 n democrat
## 4 y democrat
## 5 y democrat
predict(rcar_model, Votes[1:5,])
## [1] republican republican democrat democrat democrat
## Levels: democrat republican
arulesCBA::RCAR
## function (formula, data, lambda = NULL, alpha = 1, glmnet.args = NULL,
## cv.glmnet.args = NULL, parameter = NULL, control = NULL,
## balanceSupport = FALSE, disc.method = "mdlp", verbose = FALSE,
## ...)
## {
## trans <- prepareTransactions(formula, data, disc.method)
## formula <- as.formula(formula)
## form <- .parseformula(formula, trans)
## if (verbose) {
## glmnet.args$trace.it <- TRUE
## cv.glmnet.args$trace.it <- TRUE
## }
## if (verbose)
## cat("* Mining CARs...\n")
## cars <- mineCARs(formula, trans, parameter = parameter, control = control,
## balanceSupport = balanceSupport, verbose = verbose, ...)
## if (verbose)
## cat("* Creating model matrix\n")
## X <- is.superset(trans, lhs(cars))
## y <- response(formula, trans)
## cv <- NULL
## if (is.null(lambda)) {
## if (verbose)
## cat("* Determine lambda using cross-validation: ")
## cv <- do.call(glmnet::cv.glmnet, c(list(x = X, y = y,
## family = "multinomial", alpha = alpha), cv.glmnet.args))
## lambda <- cv$lambda.1se
## if (verbose)
## cat(lambda, "\n")
## }
## if (verbose)
## cat("* Fitting glmnet\n")
## model <- do.call(glmnet::glmnet, c(list(x = X, y = y, family = "multinomial",
## alpha = alpha, lambda = lambda), glmnet.args))
## weights <- sapply(model$beta, as.vector)
## remove <- apply(weights, MARGIN = 1, FUN = function(x) all(x ==
## 0))
## quality(cars)$weight <- apply(weights, MARGIN = 1, max)
## quality(cars)$oddsratio <- exp(quality(cars)$weight)
## rulebase <- cars[!remove]
## weights <- weights[!remove, ]
## biases <- model$a0
## if (verbose)
## cat("* CARs left:", length(rulebase), "\n")
## structure(list(formula = formula, discretization = attr(trans,
## "disc_info"), rules = rulebase, default = NA, weights = weights,
## biases = biases, method = "logit", model = list(all_rules = cars,
## reg_model = model, cv = cv), description = paste("RCAR+ based on RCAR (Azmi et al., 2019)")),
## class = "CBA")
## }
## <bytecode: 0x55a356da4900>
## <environment: namespace:arulesCBA>
trans <- prepareTransactions(Class ~ ., data = Votes)
cars <- mineCARs(Class ~ ., trans, support = 0.1, confidence = .5)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 43
##
## set item appearances ...[34 item(s)] done [0.00s].
## set transactions ...[34 item(s), 435 transaction(s)] done [0.00s].
## sorting and recoding items ... [34 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.01s].
## writing ... [6620 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspectDT(cars)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
Note that cars can be seen as interesting input variables and interactions between variables. The dataset has
nitems(trans)
## [1] 34
variables and if we would like to use all interaction effects up to 5 variables, we would have to consider a model matrix with
choose(nitems(trans), 1:5)
## [1] 34 561 5984 46376 278256
sum(choose(nitems(trans), 1:5))
## [1] 331211
columns. RCAR only considers the car:
length(cars)
## [1] 6620
The model matrix X
is a sparse \(|T| \times |Cars|\) logical matrix indicating if a car covers (applies to) a transaction.
X <- is.superset(trans, lhs(cars))
length(trans)
## [1] 435
length(cars)
## [1] 6620
dim(X)
## [1] 435 6620
lets look at the first 10 transactions (rows) and look what rules cover each transaction.
m <- as(X[1:10, 1:15], "matrix")
dimnames(m) <- NULL
m
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [2,] TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [4,] TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE
## [5,] TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [6,] TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [7,] TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [8,] TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [9,] TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [10,] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [,13] [,14] [,15]
## [1,] TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE
## [3,] TRUE FALSE TRUE
## [4,] TRUE TRUE FALSE
## [5,] FALSE FALSE TRUE
## [6,] FALSE TRUE TRUE
## [7,] FALSE TRUE TRUE
## [8,] FALSE TRUE TRUE
## [9,] TRUE TRUE TRUE
## [10,] FALSE FALSE FALSE
For example, transaction 1 is covered by car 5.
inspect(trans[1])
## items transactionID
## [1] {handicapped-infants=n,
## water-project-cost-sharing=y,
## adoption-of-the-budget-resolution=n,
## physician-fee-freeze=y,
## el-salvador-aid=y,
## religious-groups-in-schools=y,
## anti-satellite-test-ban=n,
## aid-to-nicaraguan-contras=n,
## mx-missile=n,
## immigration=y,
## education-spending=y,
## superfund-right-to-sue=y,
## crime=y,
## duty-free-exports=n,
## export-administration-act-south-africa=y,
## Class=republican} 1
inspect(cars[5])
## lhs rhs support confidence coverage lift count
## [1] {adoption-of-the-budget-resolution=n} => {Class=republican} 0.3264368 0.8304094 0.3931034 2.150167 142
y is a vector indicating what the correct class label for each transaction is
y <- response(Class ~ ., trans)
head(y)
## [1] republican republican democrat democrat democrat democrat
## Levels: democrat republican
fit <- glmnet::glmnet(x = X, y = y, family = "multinomial", alpha = 1)
Each curve corresponds to a variable. It shows the path of its coefficient against the L1-norm of the whole coefficient vector.
plot(fit)
We can get the coefficients for a lambda of 0.01.
cs <- coef(fit, s = 0.01)
str(cs$democrat)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:34] 0 3 24 28 31 32 55 91 122 126 ...
## ..@ p : int [1:2] 0 34
## ..@ Dim : int [1:2] 6621 1
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:6621] "(Intercept)" "{}" "{export-administration-act-south-africa=n}" "{synfuels-corporation-cutback=y}" ...
## .. ..$ : chr "1"
## ..@ x : num [1:34] 0.0045 0.376 0.1005 0.1411 1.1348 ...
## ..@ factors : list()
str(cs$democrat)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:34] 0 3 24 28 31 32 55 91 122 126 ...
## ..@ p : int [1:2] 0 34
## ..@ Dim : int [1:2] 6621 1
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:6621] "(Intercept)" "{}" "{export-administration-act-south-africa=n}" "{synfuels-corporation-cutback=y}" ...
## .. ..$ : chr "1"
## ..@ x : num [1:34] 0.0045 0.376 0.1005 0.1411 1.1348 ...
## ..@ factors : list()
Note that we get 34 beta values (an intercept plus 33 rules).
cba_model <- CBA(Class ~ ., data = Votes, support = 0.1, confidence = .5)
cba_model
## CBA Classifier Object
## Class:
## Default Class: NA
## Number of rules: 44
## Classification method: first
## Description: CBA algorithm (Liu et al., 1998)
inspectDT(rules(cba_model))
Votes[1:5,]
## handicapped-infants water-project-cost-sharing
## 1 n y
## 2 n y
## 3 <NA> y
## 4 n y
## 5 y y
## adoption-of-the-budget-resolution physician-fee-freeze el-salvador-aid
## 1 n y y
## 2 n y y
## 3 y <NA> y
## 4 y n <NA>
## 5 y n y
## religious-groups-in-schools anti-satellite-test-ban aid-to-nicaraguan-contras
## 1 y n n
## 2 y n n
## 3 y n n
## 4 y n n
## 5 y n n
## mx-missile immigration synfuels-corporation-cutback education-spending
## 1 n y <NA> y
## 2 n n n y
## 3 n n y n
## 4 n n y n
## 5 n n y <NA>
## superfund-right-to-sue crime duty-free-exports
## 1 y y n
## 2 y y n
## 3 y y n
## 4 y n n
## 5 y y y
## export-administration-act-south-africa Class
## 1 y republican
## 2 <NA> republican
## 3 n democrat
## 4 y democrat
## 5 y democrat
predict(cba_model, Votes[1:5,])
## [1] republican republican democrat democrat democrat
## Levels: democrat republican
library(pre)
# the code does not like missing values and "-" in column names!
Votes2 <- Votes[complete.cases(Votes),]
colnames(Votes2) <- make.names(colnames(Votes2))
rulefit_model <- pre(Class ~ ., data = Votes2, family = "binomial")
rulefit_model
##
## Final ensemble with cv error within 1se of minimum:
## lambda = 0.02759246
## number of terms = 7
## mean cv error (se) = 0.2893241 (0.05065471)
##
## cv error type : Binomial Deviance
##
## rule coefficient
## (Intercept) -2.721566e+00
## physician.fee.freezey 3.515079e+00
## rule2 1.502050e+00
## rule59 1.315425e-01
## rule22 1.055060e-01
## rule16 1.044860e-01
## rule39 3.691169e-03
## rule1 -2.024924e-15
## description
## 1
## physician.fee.freezey
## physician.fee.freeze %in% c("y") & synfuels.corporation.cutback %in% c("n")
## adoption.of.the.budget.resolution %in% c("n") & superfund.right.to.sue %in% c("y")
## physician.fee.freeze %in% c("y") & mx.missile %in% c("n")
## physician.fee.freeze %in% c("y") & adoption.of.the.budget.resolution %in% c("n")
## adoption.of.the.budget.resolution %in% c("n") & duty.free.exports %in% c("n")
## physician.fee.freeze %in% c("n")
Votes2[1:5,]
## handicapped.infants water.project.cost.sharing
## 6 n y
## 9 n y
## 20 y y
## 24 y y
## 26 y n
## adoption.of.the.budget.resolution physician.fee.freeze el.salvador.aid
## 6 y n y
## 9 n y y
## 20 y n n
## 24 y n n
## 26 y n n
## religious.groups.in.schools anti.satellite.test.ban
## 6 y n
## 9 y n
## 20 n y
## 24 n y
## 26 n y
## aid.to.nicaraguan.contras mx.missile immigration
## 6 n n n
## 9 n n n
## 20 y y n
## 24 y y n
## 26 y y y
## synfuels.corporation.cutback education.spending superfund.right.to.sue crime
## 6 n n y y
## 9 n y y y
## 20 y n n n
## 24 n n n n
## 26 n n n n
## duty.free.exports export.administration.act.south.africa Class
## 6 y y democrat
## 9 n y republican
## 20 y y democrat
## 24 y y democrat
## 26 y y democrat
predict(rulefit_model, Votes2[1:5,], type = "class")
## 6 9 20 24 26
## "democrat" "republican" "democrat" "democrat" "democrat"