library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
options(digits=3)
Load the cleaned data (my data is not really clean!)
load("data_clean.rda")
Select some features
data_sub <- data_clean[,c("UCROffDesc", "PCClass", "Beat", "CompRace",
"CompSex", "CompAge","VictimCond", "Status", "Time1")]
All features need to be discrete!
Fix age
data_sub$CompAge[data_sub$CompAge > 100] <- NA
data_sub$CompAge <- discretize(data_sub$CompAge, method = "frequency")
summary(data_sub$CompAge)
## [ 0, 31) [31, 46) [46,100] NA's
## 26253 26453 25260 41067
Fix time
data_sub$Time1 <- as.factor(as.numeric(gsub(":\\d\\d", "", data_sub$Time1)))
summary(data_sub$Time1)
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 7997 4097 3631 2792 2301 2126 2588 3364 4929 4104 4746 4489 5727 4520 4656
## 15 16 17 18 19 20 21 22 23 NA's
## 5211 5517 6799 7488 6414 6641 6510 7009 5374 3
Replace all “” with NA (NAs are not translated into items and then get rid of empty factor levels
data_sub[data_sub==""] <- NA
for(i in 1:ncol(data_sub)) data_sub[[i]] <- factor(data_sub[[i]])
trans <- as(data_sub, "transactions")
trans
## transactions in sparse format with
## 119033 transactions (rows) and
## 329 items (columns)
summary(trans)
## transactions as itemMatrix in sparse format with
## 119033 rows (elements/itemsets/transactions) and
## 329 columns (items) and a density of 0.021
##
## most frequent items:
## Status=Suspended CompSex=M CompSex=F UCROffDesc=THEFT
## 101738 41940 37255 33114
## CompRace=B (Other)
## 26882 580538
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 5 6 7 8 9
## 2 295 8804 30654 198 5918 68980 4182
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 5.0 8.0 6.9 8.0 9.0
##
## includes extended item information - examples:
## labels variables levels
## 1 UCROffDesc=ACCIDENTAL INJURY UCROffDesc ACCIDENTAL INJURY
## 2 UCROffDesc=AGGRAVATED ASSAULT UCROffDesc AGGRAVATED ASSAULT
## 3 UCROffDesc=AIRPLANE UCROffDesc AIRPLANE
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
itemFrequencyPlot(trans, topN = 20)
This is how many transactions need to contain an itemset to make it frequent
nrow(trans) * 0.01
## [1] 1190
rules <- apriori(trans, parameter = list(supp = .01, conf = .8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.8 0.1 1 none FALSE TRUE 0.01 1 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 1190
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[329 item(s), 119033 transaction(s)] done [0.05s].
## sorting and recoding items ... [64 item(s)] done [0.01s].
## creating transaction tree ... done [0.07s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [640 rule(s)] done [0.00s].
## creating S4 object ... done [0.02s].
rules
## set of 640 rules
summary(rules)
## set of 640 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6
## 1 53 234 252 92 8
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 4.00 3.63 4.00 6.00
##
## summary of quality measures:
## support confidence lift
## Min. :0.010 Min. :0.800 Min. : 0.94
## 1st Qu.:0.012 1st Qu.:0.866 1st Qu.: 1.05
## Median :0.019 Median :0.905 Median : 1.10
## Mean :0.031 Mean :0.903 Mean : 2.21
## 3rd Qu.:0.031 3rd Qu.:0.944 3rd Qu.: 1.14
## Max. :0.855 Max. :0.979 Max. :25.20
##
## mining info:
## data ntransactions support confidence
## trans 119033 0.01 0.8
inspect(head(sort(rules, by = "lift"), n =10))
## lhs rhs support confidence lift
## 1 {PCClass=F1,
## Status=Suspended} => {UCROffDesc=ROBBERY} 0.0242 0.956 25.20
## 2 {PCClass=F1,
## CompSex=M,
## Status=Suspended} => {UCROffDesc=ROBBERY} 0.0128 0.951 25.08
## 3 {PCClass=F1} => {UCROffDesc=ROBBERY} 0.0282 0.926 24.41
## 4 {PCClass=F1,
## CompSex=M} => {UCROffDesc=ROBBERY} 0.0143 0.909 23.96
## 5 {PCClass=MC,
## Status=Clear by Arrest} => {UCROffDesc=ASSAULT} 0.0152 0.891 21.78
## 6 {UCROffDesc=ASSAULT,
## Status=Clear by Arrest} => {PCClass=MC} 0.0152 0.937 10.04
## 7 {UCROffDesc=BURGLARY,
## CompSex=F,
## CompAge=[ 0, 31)} => {PCClass=F2} 0.0136 0.933 8.76
## 8 {UCROffDesc=BURGLARY,
## CompSex=F,
## CompAge=[ 0, 31),
## Status=Suspended} => {PCClass=F2} 0.0126 0.933 8.76
## 9 {UCROffDesc=BURGLARY,
## CompAge=[ 0, 31)} => {PCClass=F2} 0.0231 0.926 8.69
## 10 {UCROffDesc=BURGLARY,
## CompAge=[ 0, 31),
## Status=Suspended} => {PCClass=F2} 0.0213 0.925 8.68
Too many rules with suspended! Get rid of the item by subsetting the transactions. You could also get rid of the level in the original data and recreate the transactions.
trans2 <- trans[, colnames(trans)!="Status=Suspended"]
Let’s also lower the support to find rules that are not so common.
nrow(trans) * 0.001
## [1] 119
rules <- apriori(trans2, parameter = list(supp = 0.001, conf = .8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.8 0.1 1 none FALSE TRUE 0.001 1 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 119
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[328 item(s), 119033 transaction(s)] done [0.04s].
## sorting and recoding items ... [306 item(s)] done [0.01s].
## creating transaction tree ... done [0.07s].
## checking subsets of size 1 2 3 4 5 6 done [0.10s].
## writing ... [1375 rule(s)] done [0.00s].
## creating S4 object ... done [0.03s].
rules
## set of 1375 rules
summary(rules)
## set of 1375 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6
## 11 282 652 409 21
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 4.00 4.00 4.11 5.00 6.00
##
## summary of quality measures:
## support confidence lift
## Min. :0.0010 Min. :0.800 Min. : 2.3
## 1st Qu.:0.0013 1st Qu.:0.839 1st Qu.: 3.3
## Median :0.0017 Median :0.871 Median : 5.2
## Mean :0.0034 Mean :0.883 Mean : 11.7
## 3rd Qu.:0.0029 3rd Qu.:0.925 3rd Qu.: 8.6
## Max. :0.0858 Max. :1.000 Max. :115.8
##
## mining info:
## data ntransactions support confidence
## trans2 119033 0.001 0.8
inspect(head(sort(rules, by = "lift"), n =5))
## lhs rhs support confidence lift
## 1 {PCClass=FS,
## CompRace=B,
## CompSex=M,
## Status=Clear by Exceptional Arrest,
## Time1=18} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00105 1.000 116
## 2 {PCClass=FS,
## CompSex=M,
## Status=Clear by Exceptional Arrest,
## Time1=18} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00219 0.996 115
## 3 {PCClass=FS,
## CompAge=[31, 46),
## Status=Clear by Exceptional Arrest,
## Time1=18} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00154 0.995 115
## 4 {PCClass=FS,
## CompRace=B,
## Status=Clear by Exceptional Arrest,
## Time1=18} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00152 0.995 115
## 5 {PCClass=FS,
## CompRace=L,
## Status=Clear by Exceptional Arrest,
## Time1=18} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00144 0.994 115
Only look at rules with an item containing THEFT
inspect(head(sort(subset(rules, rhs %pin% "THEFT"), by = "lift")))
## lhs rhs support confidence lift
## 1 {PCClass=MB,
## Beat=252,
## Status=Clear by Arrest} => {UCROffDesc=THEFT} 0.00189 0.974 3.50
## 2 {PCClass=MA,
## Beat=143,
## CompSex=M} => {UCROffDesc=THEFT} 0.00124 0.949 3.41
## 3 {PCClass=MB,
## Beat=416,
## Status=Clear by Arrest} => {UCROffDesc=THEFT} 0.00127 0.938 3.37
## 4 {PCClass=MA,
## Beat=143,
## CompRace=W} => {UCROffDesc=THEFT} 0.00160 0.936 3.36
## 5 {PCClass=MA,
## Beat=143,
## CompAge=[ 0, 31)} => {UCROffDesc=THEFT} 0.00129 0.928 3.33
## 6 {PCClass=MA,
## Beat=521,
## CompSex=M} => {UCROffDesc=THEFT} 0.00147 0.926 3.33
Visualization
plot(rules)
#plot(rules, interactive = TRUE)
Represent top 100 lift rules as a graph
plot(head(sort(rules, by = "lift"), n = 100), method = "graph")
# plot(head(sort(rules, by = "lift"), n = 100), method = "graph", interactive = TRUE)
this needs the latest version of arules (resd ? is.redundant
)
nr_rules <- rules[!is.redundant(rules)]
nr_rules
## set of 570 rules
inspect(head(nr_rules, by = "lift"))
## lhs rhs support confidence lift
## 1 {PCClass=FS,
## CompSex=M,
## CompAge=[31, 46),
## Status=Clear by Exceptional Arrest,
## Time1=18} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00108 0.992 114.9
## 2 {PCClass=FS,
## CompRace=L,
## CompAge=[ 0, 31),
## Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00123 0.942 109.1
## 3 {PCClass=FS,
## CompSex=F,
## CompAge=[31, 46),
## Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00108 0.908 105.1
## 4 {PCClass=FS,
## CompRace=B,
## CompAge=[ 0, 31),
## Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00116 0.868 100.5
## 5 {PCClass=FS,
## CompSex=F,
## CompAge=[ 0, 31),
## Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00105 0.833 96.5
## 6 {PCClass=FS,
## CompRace=B,
## CompSex=F,
## Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00120 0.817 94.6
#plot(nr_rules, method = "graph", interactive = TRUE)