load("Employment_2009.rda")
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
## AgencyName Fulltime
## VETERANS HEALTH ADMINISTRATION :260206 Mode :logical
## INTERNAL REVENUE SERVICE :104069 FALSE:103243
## SOCIAL SECURITY ADMINISTRATION : 63229 TRUE :1167367
## TRANSPORTATION SECURITY ADMINISTRATION: 61324
## CUSTOMS AND BORDER PROTECTION : 55006
## (Other) :726714
## NA's : 62
## Seasonal
## Mode :logical
## FALSE:1234680
## TRUE :35930
##
##
##
##
Important Note: I did not clean the data to remove duplicates, etc. You need to do this first if you have not done it for project 1! Remove attributes not useful for predictive modeling
dat2 <- dat
dat2$PseudoID <- NULL
dat2$Name <- NULL
dat2$Date <- NULL
Make some attributes measured on an ordinal scale continuous
Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.
dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
dat2$LOS <- as.numeric(sub("-.*|\\+|<", "", dat2$LOS))
## Warning: NAs introduced by coercion
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
Discretize two subsets (two agencies) in the same way.
dat_set1 <- dat2[dat2$Agency == "VATA",]
dat_set2 <- dat2[dat2$Agency == "TR93",]
dat_1_2 <- rbind(dat_set1, dat_set2)
dat_1_2$Pay <- discretize(dat_1_2$Pay, method = "frequency")
dat_1_2$LOS <- discretize(dat_1_2$LOS, method = "frequency")
dat_1_2$Education <- discretize(dat_1_2$Education, method = "frequency")
dat_1_2$Age <- discretize(dat_1_2$Age, method = "frequency")
dat_set1 <- dat_1_2[dat_1_2$Agency == "VATA",]
dat_set2 <- dat_1_2[dat_1_2$Agency == "TR93",]
trans1 <- as(dat_set1, "transactions")
trans2 <- as(dat_set2, "transactions")
rules1 <- apriori(trans1)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 26020
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1144 item(s), 260206 transaction(s)] done [0.30s].
## sorting and recoding items ... [31 item(s)] done [0.04s].
## creating transaction tree ... done [0.28s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans1): Mining stopped (maxlen reached). Only patterns
## up to a length of 10 returned!
## done [0.12s].
## writing ... [33150 rule(s)] done [0.01s].
## creating S4 object ... done [0.07s].
rules2 <- apriori(trans2)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 10406
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[678 item(s), 104069 transaction(s)] done [0.11s].
## sorting and recoding items ... [34 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans2): Mining stopped (maxlen reached). Only patterns
## up to a length of 10 returned!
## done [0.17s].
## writing ... [68231 rule(s)] done [0.01s].
## creating S4 object ... done [0.03s].
inspect(head(rules1, by = "lift", n=5))
## lhs rhs support confidence lift count
## [1] {Occupation=0610,
## Appointment=38,
## NSFTP=1} => {PayPlan=VN} 0.1676441 0.9914767 5.63293 43622
## [2] {Occupation=0610,
## Appointment=38,
## Fulltime} => {PayPlan=VN} 0.1676441 0.9914767 5.63293 43622
## [3] {Occupation=0610,
## Appointment=38,
## Schedule=F} => {PayPlan=VN} 0.1676441 0.9914767 5.63293 43622
## [4] {Occupation=0610,
## Category=P,
## Appointment=38,
## NSFTP=1} => {PayPlan=VN} 0.1676441 0.9914767 5.63293 43622
## [5] {Occupation=0610,
## Category=P,
## Appointment=38,
## Fulltime} => {PayPlan=VN} 0.1676441 0.9914767 5.63293 43622
inspect(head(rules2, by = "lift", n=5))
## lhs rhs support confidence lift count
## [1] {Station=#########,
## Category=P,
## Appointment=10} => {Occupation=0512} 0.1021342 0.9748693 7.842133 10629
## [2] {Agency=TR93,
## Station=#########,
## Category=P,
## Appointment=10} => {Occupation=0512} 0.1021342 0.9748693 7.842133 10629
## [3] {Station=#########,
## Category=P,
## Appointment=10,
## AgencyName=INTERNAL REVENUE SERVICE} => {Occupation=0512} 0.1021342 0.9748693 7.842133 10629
## [4] {Agency=TR93,
## Station=#########,
## Category=P,
## Appointment=10,
## AgencyName=INTERNAL REVENUE SERVICE} => {Occupation=0512} 0.1021342 0.9748693 7.842133 10629
## [5] {Station=#########,
## PayPlan=GS,
## Category=P,
## SupervisoryStatus=8,
## NSFTP=1} => {Occupation=0512} 0.1069675 0.9682526 7.788906 11132
how many rules (%) are in both sets?
m <- match(rules1, rules2)
sum(!is.na(m))/min(length(rules1),length(rules2))
## [1] 0.05499246
r <- sample(rules1, 100)
Note: this takes a while
q <- interestMeasure(r, measure = c("supp", "confidence", "lift"),
transactions = trans2, reuse = FALSE)
% change in measure. Many do not occur at all (difference of 1 or NaN)
diff <- (quality(r)[,-4] - q)/quality(r)[,-4]
diff
## support confidence lift
## 25518 1.00000000 NaN NaN
## 17700 1.00000000 1.0000000000 NaN
## 23105 0.99749127 0.9917998417 -1.05018559
## 18890 1.00000000 NaN NaN
## 8177 1.00000000 NaN NaN
## 23706 1.00000000 NaN NaN
## 9859 0.99340706 0.0000000000 -0.09498339
## 1060 1.00000000 1.0000000000 NaN
## 14707 1.00000000 NaN NaN
## 1086 1.00000000 1.0000000000 NaN
## 33147 1.00000000 NaN NaN
## 15516 1.00000000 NaN NaN
## 5375 0.46830201 0.3909889615 0.29196083
## 29086 1.00000000 NaN NaN
## 22471 1.00000000 NaN NaN
## 8269 1.00000000 NaN NaN
## 30426 1.00000000 NaN NaN
## 14733 1.00000000 NaN NaN
## 25989 1.00000000 NaN NaN
## 10387 1.00000000 NaN NaN
## 26613 -0.45159884 0.0000000000 -0.16260482
## 12678 1.00000000 NaN NaN
## 16688 1.00000000 NaN NaN
## 13903 1.00000000 NaN NaN
## 11593 1.00000000 NaN NaN
## 15544 1.00000000 NaN NaN
## 6723 1.00000000 NaN NaN
## 3685 1.00000000 1.0000000000 NaN
## 2916 1.00000000 1.0000000000 NaN
## 28482 1.00000000 NaN NaN
## 25644 1.00000000 NaN NaN
## 9863 1.00000000 1.0000000000 NaN
## 21674 1.00000000 1.0000000000 NaN
## 15057 1.00000000 NaN NaN
## 25553 1.00000000 NaN NaN
## 15225 1.00000000 NaN NaN
## 29348 1.00000000 NaN NaN
## 2253 1.00000000 NaN NaN
## 12183 0.02092592 0.0618553350 0.02166727
## 25375 1.00000000 NaN NaN
## 25557 1.00000000 NaN NaN
## 2865 1.00000000 NaN NaN
## 30677 1.00000000 NaN NaN
## 16737 1.00000000 NaN NaN
## 13054 1.00000000 1.0000000000 NaN
## 24614 1.00000000 NaN NaN
## 13984 0.69406369 0.0000000000 0.05219498
## 19928 1.00000000 NaN NaN
## 6075 1.00000000 1.0000000000 NaN
## 28420 1.00000000 NaN NaN
## 11331 1.00000000 NaN NaN
## 14467 1.00000000 NaN NaN
## 8360 -0.29458823 0.0000000000 0.05219498
## 24963 1.00000000 NaN NaN
## 26286 1.00000000 NaN NaN
## 13208 0.99545879 -0.0235253541 -0.06737094
## 23244 1.00000000 NaN NaN
## 19743 1.00000000 1.0000000000 NaN
## 32976 1.00000000 NaN NaN
## 5418 1.00000000 NaN NaN
## 17036 1.00000000 NaN NaN
## 6262 1.00000000 NaN NaN
## 3682 1.00000000 NaN NaN
## 8864 -0.42170914 -0.0007166566 0.36195893
## 1235 1.00000000 NaN NaN
## 12750 -1.54771252 -0.0783185870 -0.12451140
## 28794 1.00000000 NaN NaN
## 13313 1.00000000 1.0000000000 NaN
## 7695 1.00000000 NaN NaN
## 29470 1.00000000 NaN NaN
## 9948 1.00000000 NaN NaN
## 3036 1.00000000 1.0000000000 NaN
## 2656 1.00000000 NaN NaN
## 22460 1.00000000 NaN NaN
## 26513 1.00000000 NaN NaN
## 11481 1.00000000 NaN NaN
## 25326 1.00000000 NaN NaN
## 24567 1.00000000 NaN NaN
## 15109 1.00000000 NaN NaN
## 32533 1.00000000 NaN NaN
## 31507 1.00000000 NaN NaN
## 21786 1.00000000 NaN NaN
## 31626 1.00000000 NaN NaN
## 1497 1.00000000 NaN NaN
## 7097 1.00000000 NaN NaN
## 6610 1.00000000 NaN NaN
## 27155 1.00000000 NaN NaN
## 19506 1.00000000 NaN NaN
## 9979 1.00000000 NaN NaN
## 14192 0.99697817 -0.0145656718 -0.05802745
## 8768 0.02134061 0.0778211160 0.41203336
## 500 1.00000000 NaN NaN
## 9728 0.99372567 0.0000000000 -0.16260482
## 13578 0.99853633 0.1111111111 0.02668143
## 415 1.00000000 NaN NaN
## 9141 -0.75458194 0.0000000000 -0.16260482
## 24932 1.00000000 NaN NaN
## 14965 1.00000000 NaN NaN
## 4087 1.00000000 1.0000000000 NaN
## 29040 1.00000000 NaN NaN
for which rules did support increase/decrease by 10%
inspect(r[which(diff$supp > 0.2 & diff$supp!=1)])
## lhs rhs support confidence lift count
## [1] {LOS=[ 1,10),
## Category=P,
## SupervisoryStatus=8,
## NSFTP=1,
## Fulltime} => {Appointment=38} 0.1532094 0.9226319 2.087186 39866
## [2] {Education=[14,22],
## Appointment=38,
## Schedule=F,
## Fulltime} => {NSFTP=1} 0.1442895 1.0000000 1.182110 37545
## [3] {Age=[15,45),
## LOS=[ 1,10),
## Fulltime} => {Schedule=F} 0.2442834 1.0000000 1.108665 63564
## [4] {LOS=[ 1,10),
## Category=P,
## SupervisoryStatus=8,
## NSFTP=1} => {Fulltime} 0.1660569 1.0000000 1.108665 43209
## [5] {Age=[15,45),
## Appointment=38,
## NSFTP=1,
## Fulltime} => {SupervisoryStatus=8} 0.1375372 0.9478507 1.036919 35788
## [6] {LOS=[ 1,10),
## Appointment=38,
## Schedule=F,
## NSFTP=1} => {SupervisoryStatus=8} 0.2194108 0.9445750 1.033336 57092
## [7] {Education=[14,22],
## Category=P,
## Appointment=38,
## Fulltime} => {Schedule=F} 0.1378331 1.0000000 1.108665 35865
## [8] {Education=[ 8,14),
## LOS=[ 1,10),
## Appointment=38,
## Fulltime} => {NSFTP=1} 0.1050399 1.0000000 1.182110 27332
inspect(r[which(diff$supp < -0.1)])
## lhs rhs support confidence lift count
## [1] {PayPlan=GS,
## Category=T,
## SupervisoryStatus=8,
## Appointment=10,
## NSFTP=1,
## Fulltime} => {Schedule=F} 0.1203047 1.0000000 1.1086655 31304
## [2] {PayPlan=GS,
## LOS=[10,20),
## SupervisoryStatus=8,
## NSFTP=1} => {Fulltime} 0.1100897 1.0000000 1.1086655 28646
## [3] {Category=T,
## Pay=[43201, 70401),
## SupervisoryStatus=8,
## NSFTP=1} => {PayPlan=GS} 0.1018885 0.9992839 1.7202301 26512
## [4] {Education=[ 8,14),
## PayPlan=GS,
## Appointment=10,
## Schedule=F} => {SupervisoryStatus=8} 0.1080721 0.8861753 0.9694483 28121
## [5] {Category=T,
## Appointment=10,
## NSFTP=1,
## Fulltime} => {Schedule=F} 0.1239556 1.0000000 1.1086655 32254
for which rules did lift increase/decrease by 10%
inspect(r[which(diff$lift > 0.1)])
## lhs rhs support confidence lift count
## [1] {Age=[15,45),
## LOS=[ 1,10),
## Fulltime} => {Schedule=F} 0.2442834 1.0000000 1.108665 63564
## [2] {Category=T,
## Pay=[43201, 70401),
## SupervisoryStatus=8,
## NSFTP=1} => {PayPlan=GS} 0.1018885 0.9992839 1.720230 26512
## [3] {Education=[ 1, 8),
## Category=T,
## Schedule=F,
## Fulltime} => {PayPlan=GS} 0.1364188 0.9966588 1.715711 35497
inspect(r[which(diff$lift < -0.1)])
## lhs rhs support confidence lift count
## [1] {LOS=[ 1,10),
## Category=P,
## SupervisoryStatus=8,
## NSFTP=1,
## Fulltime} => {Appointment=38} 0.1532094 0.9226319 2.0871856 39866
## [2] {PayPlan=GS,
## Category=T,
## SupervisoryStatus=8,
## Appointment=10,
## NSFTP=1,
## Fulltime} => {Schedule=F} 0.1203047 1.0000000 1.1086655 31304
## [3] {Education=[ 8,14),
## PayPlan=GS,
## Appointment=10,
## Schedule=F} => {SupervisoryStatus=8} 0.1080721 0.8861753 0.9694483 28121
## [4] {Education=[14,22],
## Category=P,
## Appointment=38,
## Fulltime} => {Schedule=F} 0.1378331 1.0000000 1.1086655 35865
## [5] {Category=T,
## Appointment=10,
## NSFTP=1,
## Fulltime} => {Schedule=F} 0.1239556 1.0000000 1.1086655 32254