Mining Association Rules for the Dallas Police Data

library(arules)

## Loading required package: Matrix
## 
## Attaching package: 'arules'
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz)

## Loading required package: grid

options(digits=3)

Load the cleaned data (my data is not really clean!)

load("data_clean.rda")

Select some features

data_sub <- data_clean[,c("UCROffDesc", "PCClass", "Beat", "CompRace",
  "CompSex", "CompAge","VictimCond", "Status", "Time1")]

All features need to be discrete!

Fix age

data_sub$CompAge[data_sub$CompAge > 100] <- NA
data_sub$CompAge <- discretize(data_sub$CompAge, method = "frequency")
summary(data_sub$CompAge)

## [ 0, 31) [31, 46) [46,100]     NA's 
##    26253    26453    25260    41067

Fix time

data_sub$Time1 <- as.factor(as.numeric(gsub(":\\d\\d", "", data_sub$Time1)))
summary(data_sub$Time1)

##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 7997 4097 3631 2792 2301 2126 2588 3364 4929 4104 4746 4489 5727 4520 4656 
##   15   16   17   18   19   20   21   22   23 NA's 
## 5211 5517 6799 7488 6414 6641 6510 7009 5374    3

Replace all “” with NA (NAs are not translated into items and then get rid of empty factor levels

data_sub[data_sub==""] <- NA
for(i in 1:ncol(data_sub)) data_sub[[i]] <- factor(data_sub[[i]])

trans <- as(data_sub, "transactions")
trans

## transactions in sparse format with
##  119033 transactions (rows) and
##  329 items (columns)

summary(trans)

## transactions as itemMatrix in sparse format with
##  119033 rows (elements/itemsets/transactions) and
##  329 columns (items) and a density of 0.021 
## 
## most frequent items:
## Status=Suspended        CompSex=M        CompSex=F UCROffDesc=THEFT 
##           101738            41940            37255            33114 
##       CompRace=B          (Other) 
##            26882           580538 
## 
## element (itemset/transaction) length distribution:
## sizes
##     2     3     4     5     6     7     8     9 
##     2   295  8804 30654   198  5918 68980  4182 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0     5.0     8.0     6.9     8.0     9.0 
## 
## includes extended item information - examples:
##                          labels  variables             levels
## 1  UCROffDesc=ACCIDENTAL INJURY UCROffDesc  ACCIDENTAL INJURY
## 2 UCROffDesc=AGGRAVATED ASSAULT UCROffDesc AGGRAVATED ASSAULT
## 3           UCROffDesc=AIRPLANE UCROffDesc           AIRPLANE
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3

itemFrequencyPlot(trans, topN = 20)

This is how many transactions need to contain an itemset to make it frequent

nrow(trans) * 0.01

## [1] 1190

rules <- apriori(trans, parameter = list(supp = .01, conf = .8))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE    0.01      1     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1190 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[329 item(s), 119033 transaction(s)] done [0.05s].
## sorting and recoding items ... [64 item(s)] done [0.01s].
## creating transaction tree ... done [0.07s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [640 rule(s)] done [0.00s].
## creating S4 object  ... done [0.02s].

rules

## set of 640 rules

summary(rules)

## set of 640 rules
## 
## rule length distribution (lhs + rhs):sizes
##   1   2   3   4   5   6 
##   1  53 234 252  92   8 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    4.00    3.63    4.00    6.00 
## 
## summary of quality measures:
##     support        confidence         lift      
##  Min.   :0.010   Min.   :0.800   Min.   : 0.94  
##  1st Qu.:0.012   1st Qu.:0.866   1st Qu.: 1.05  
##  Median :0.019   Median :0.905   Median : 1.10  
##  Mean   :0.031   Mean   :0.903   Mean   : 2.21  
##  3rd Qu.:0.031   3rd Qu.:0.944   3rd Qu.: 1.14  
##  Max.   :0.855   Max.   :0.979   Max.   :25.20  
## 
## mining info:
##   data ntransactions support confidence
##  trans        119033    0.01        0.8

inspect(head(sort(rules, by = "lift"), n =10))

##    lhs                         rhs                  support confidence  lift
## 1  {PCClass=F1,                                                             
##     Status=Suspended}       => {UCROffDesc=ROBBERY}  0.0242      0.956 25.20
## 2  {PCClass=F1,                                                             
##     CompSex=M,                                                              
##     Status=Suspended}       => {UCROffDesc=ROBBERY}  0.0128      0.951 25.08
## 3  {PCClass=F1}             => {UCROffDesc=ROBBERY}  0.0282      0.926 24.41
## 4  {PCClass=F1,                                                             
##     CompSex=M}              => {UCROffDesc=ROBBERY}  0.0143      0.909 23.96
## 5  {PCClass=MC,                                                             
##     Status=Clear by Arrest} => {UCROffDesc=ASSAULT}  0.0152      0.891 21.78
## 6  {UCROffDesc=ASSAULT,                                                     
##     Status=Clear by Arrest} => {PCClass=MC}          0.0152      0.937 10.04
## 7  {UCROffDesc=BURGLARY,                                                    
##     CompSex=F,                                                              
##     CompAge=[ 0, 31)}       => {PCClass=F2}          0.0136      0.933  8.76
## 8  {UCROffDesc=BURGLARY,                                                    
##     CompSex=F,                                                              
##     CompAge=[ 0, 31),                                                       
##     Status=Suspended}       => {PCClass=F2}          0.0126      0.933  8.76
## 9  {UCROffDesc=BURGLARY,                                                    
##     CompAge=[ 0, 31)}       => {PCClass=F2}          0.0231      0.926  8.69
## 10 {UCROffDesc=BURGLARY,                                                    
##     CompAge=[ 0, 31),                                                       
##     Status=Suspended}       => {PCClass=F2}          0.0213      0.925  8.68

Too many rules with suspended! Get rid of the item by subsetting the transactions. You could also get rid of the level in the original data and recreate the transactions.

trans2 <- trans[, colnames(trans)!="Status=Suspended"]

Let’s also lower the support to find rules that are not so common.

nrow(trans) * 0.001

## [1] 119

rules <- apriori(trans2, parameter = list(supp = 0.001, conf = .8))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE   0.001      1     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 119 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[328 item(s), 119033 transaction(s)] done [0.04s].
## sorting and recoding items ... [306 item(s)] done [0.01s].
## creating transaction tree ... done [0.07s].
## checking subsets of size 1 2 3 4 5 6 done [0.10s].
## writing ... [1375 rule(s)] done [0.00s].
## creating S4 object  ... done [0.03s].

rules

## set of 1375 rules

summary(rules)

## set of 1375 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3   4   5   6 
##  11 282 652 409  21 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    4.00    4.00    4.11    5.00    6.00 
## 
## summary of quality measures:
##     support         confidence         lift      
##  Min.   :0.0010   Min.   :0.800   Min.   :  2.3  
##  1st Qu.:0.0013   1st Qu.:0.839   1st Qu.:  3.3  
##  Median :0.0017   Median :0.871   Median :  5.2  
##  Mean   :0.0034   Mean   :0.883   Mean   : 11.7  
##  3rd Qu.:0.0029   3rd Qu.:0.925   3rd Qu.:  8.6  
##  Max.   :0.0858   Max.   :1.000   Max.   :115.8  
## 
## mining info:
##    data ntransactions support confidence
##  trans2        119033   0.001        0.8

inspect(head(sort(rules, by = "lift"), n =5))

##   lhs                                     rhs                                   support confidence lift
## 1 {PCClass=FS,                                                                                         
##    CompRace=B,                                                                                         
##    CompSex=M,                                                                                          
##    Status=Clear by Exceptional Arrest,                                                                 
##    Time1=18}                           => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00105      1.000  116
## 2 {PCClass=FS,                                                                                         
##    CompSex=M,                                                                                          
##    Status=Clear by Exceptional Arrest,                                                                 
##    Time1=18}                           => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00219      0.996  115
## 3 {PCClass=FS,                                                                                         
##    CompAge=[31, 46),                                                                                   
##    Status=Clear by Exceptional Arrest,                                                                 
##    Time1=18}                           => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00154      0.995  115
## 4 {PCClass=FS,                                                                                         
##    CompRace=B,                                                                                         
##    Status=Clear by Exceptional Arrest,                                                                 
##    Time1=18}                           => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00152      0.995  115
## 5 {PCClass=FS,                                                                                         
##    CompRace=L,                                                                                         
##    Status=Clear by Exceptional Arrest,                                                                 
##    Time1=18}                           => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00144      0.994  115

Only look at rules with an item containing THEFT

inspect(head(sort(subset(rules, rhs %pin% "THEFT"), by = "lift")))

##   lhs                         rhs                support confidence lift
## 1 {PCClass=MB,                                                          
##    Beat=252,                                                            
##    Status=Clear by Arrest} => {UCROffDesc=THEFT} 0.00189      0.974 3.50
## 2 {PCClass=MA,                                                          
##    Beat=143,                                                            
##    CompSex=M}              => {UCROffDesc=THEFT} 0.00124      0.949 3.41
## 3 {PCClass=MB,                                                          
##    Beat=416,                                                            
##    Status=Clear by Arrest} => {UCROffDesc=THEFT} 0.00127      0.938 3.37
## 4 {PCClass=MA,                                                          
##    Beat=143,                                                            
##    CompRace=W}             => {UCROffDesc=THEFT} 0.00160      0.936 3.36
## 5 {PCClass=MA,                                                          
##    Beat=143,                                                            
##    CompAge=[ 0, 31)}       => {UCROffDesc=THEFT} 0.00129      0.928 3.33
## 6 {PCClass=MA,                                                          
##    Beat=521,                                                            
##    CompSex=M}              => {UCROffDesc=THEFT} 0.00147      0.926 3.33

Visualization

plot(rules)

#plot(rules, interactive = TRUE)

Represent top 100 lift rules as a graph

plot(head(sort(rules, by = "lift"), n = 100), method = "graph")

# plot(head(sort(rules, by = "lift"), n = 100), method = "graph", interactive = TRUE)

this needs the latest version of arules (resd ? is.redundant)

nr_rules <- rules[!is.redundant(rules)]
nr_rules

## set of 570 rules

inspect(head(nr_rules, by = "lift"))

##   lhs                                     rhs                                   support confidence  lift
## 1 {PCClass=FS,                                                                                          
##    CompSex=M,                                                                                           
##    CompAge=[31, 46),                                                                                    
##    Status=Clear by Exceptional Arrest,                                                                  
##    Time1=18}                           => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00108      0.992 114.9
## 2 {PCClass=FS,                                                                                          
##    CompRace=L,                                                                                          
##    CompAge=[ 0, 31),                                                                                    
##    Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00123      0.942 109.1
## 3 {PCClass=FS,                                                                                          
##    CompSex=F,                                                                                           
##    CompAge=[31, 46),                                                                                    
##    Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00108      0.908 105.1
## 4 {PCClass=FS,                                                                                          
##    CompRace=B,                                                                                          
##    CompAge=[ 0, 31),                                                                                    
##    Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00116      0.868 100.5
## 5 {PCClass=FS,                                                                                          
##    CompSex=F,                                                                                           
##    CompAge=[ 0, 31),                                                                                    
##    Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00105      0.833  96.5
## 6 {PCClass=FS,                                                                                          
##    CompRace=B,                                                                                          
##    CompSex=F,                                                                                           
##    Status=Clear by Exceptional Arrest} => {UCROffDesc=CHILD (OFFENSES AGAINST)} 0.00120      0.817  94.6

#plot(nr_rules, method = "graph", interactive = TRUE)

crime3.R

hahsler

Thu Mar 24 15:41:44 2016

Mining Association Rules for the Dallas Police Data