# Restore the objects stored in the .rda file into the workspace; the rest of
# the script relies on it providing an object named `dat`.
load("Employment_2009.rda")
# Per-column overview of `dat`: most frequent levels for categorical columns,
# quantiles for numeric ones, and NA counts.
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
## AgencyName Fulltime
## VETERANS HEALTH ADMINISTRATION :260206 Mode :logical
## INTERNAL REVENUE SERVICE :104069 FALSE:103243
## SOCIAL SECURITY ADMINISTRATION : 63229 TRUE :1167367
## TRANSPORTATION SECURITY ADMINISTRATION: 61324
## CUSTOMS AND BORDER PROTECTION : 55006
## (Other) :726714
## NA's : 62
## Seasonal
## Mode :logical
## FALSE:1234680
## TRUE :35930
##
##
##
##
Important note: I did not clean the data (e.g., remove duplicates). You need to do this first if you have not already done it for project 1!

Remove attributes that are not useful for predictive modeling:
# Build the working copy `dat2` from `dat`, keeping every column except the
# identifier-like ones: PseudoID and Name identify individual records, and
# Date holds the same value (20090331) for every row in the summary above.
dat2 <- dat[setdiff(names(dat), c("PseudoID", "Name", "Date"))]
Convert some attributes measured on an ordinal scale to continuous (numeric) values
Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.
# Education: convert the code labels to numbers. Going through as.character()
# yields the code itself rather than an internal level index if the column is
# a factor. Labels that are not numeric become NA (hence the warning).
dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
# Age: reduce a bracket such as "50-54" to its two-digit lower bound. Labels
# that do not start with two digits become NA (hence the warning).
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
# LOS (length of service): reduce each bracket to its lower bound.
# The open-ended "< 1" bracket is mapped to 0 first. Stripping only the "<"
# would leave " 1", which parses as 1 and makes "< 1" indistinguishable from
# the "1-2" bracket. After that, "-upper" and a trailing "+" are removed.
# Labels that are still not numeric become NA.
dat2$LOS <- as.numeric(
  sub("-.*|\\+", "", sub("^[[:space:]]*<.*$", "0", dat2$LOS))
)
## Warning: NAs introduced by coercion
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
Discretize continuous (numeric) variables
# Find the positions of the numeric columns once and reuse them below.
# vapply() guarantees a logical vector result regardless of the input, which
# sapply() does not (it returns a list for a data frame without columns).
numeric_cols <- which(vapply(dat2, is.numeric, logical(1)))
numeric_cols
## Age Education LOS Pay
## 3 4 7 10
# Replace every numeric column by its equal-frequency binned version
# (arules::discretize with the default number of bins), because the
# transaction coercion further down works on categorical items.
for (i in numeric_cols) {
  dat2[[i]] <- discretize(dat2[[i]], method = "frequency")
}
Get rid of some more columns
# Drop three more columns before building transactions:
#   Agency   - redundant, AgencyName is kept
#   Schedule - redundant, the Fulltime flag is kept
#   NSFTP    - dropped without a stated reason (original note: "???")
dat2 <- dat2[!(names(dat2) %in% c("Agency", "Schedule", "NSFTP"))]
# Coerce the data frame to an arules "transactions" object. As the summary
# below shows, each variable/level pair becomes one item
# (e.g. "Station=010000003") and a logical column becomes a single item
# (e.g. "Fulltime").
trans <- as(dat2, "transactions")
# Report dimensions, density, the most frequent items and the distribution of
# transaction sizes.
summary(trans)
## transactions as itemMatrix in sparse format with
## 1270610 rows (elements/itemsets/transactions) and
## 47255 columns (items) and a density of 0.0002730179
##
## most frequent items:
## Fulltime SupervisoryStatus=8 PayPlan=GS
## 1167367 1095555 885287
## Appointment=10 LOS=[ 1,10) (Other)
## 673958 589592 11980968
##
## element (itemset/transaction) length distribution:
## sizes
## 9 10 11 12 13 14
## 5 499 24424 103560 1113400 28722
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.0 13.0 13.0 12.9 13.0 14.0
##
## includes extended item information - examples:
## labels variables levels
## 1 Station=********* Station *********
## 2 Station=######### Station #########
## 3 Station=010000003 Station 010000003
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
# Bar chart of the 50 most frequent items.
itemFrequencyPlot(trans, topN = 50)
# Mine association rules that reach at least 1% support and 80% confidence;
# every other mining parameter is left at its default.
rules <- apriori(
  trans,
  parameter = list(support = 0.01, confidence = 0.8)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 12706
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13238 item(s), 1270610 transaction(s)] done [1.40s].
## sorting and recoding items ... [106 item(s)] done [0.16s].
## creating transaction tree ... done [1.59s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans, parameter = list(supp = 0.01, conf = 0.8)):
## Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
## done [2.01s].
## writing ... [72374 rule(s)] done [0.02s].
## creating S4 object ... done [0.42s].
# Overview of the mined rule set: number of rules, rule-length distribution
# and the ranges of the quality measures (support, confidence, lift, count).
summary(rules)
## set of 72374 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8 9 10
## 2 337 3176 11347 20002 19726 11734 4570 1252 228
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 5.613 6.000 10.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.01000 Min. :0.8000 Min. : 0.8712 Min. : 12707
## 1st Qu.:0.01197 1st Qu.:0.9049 1st Qu.: 1.0806 1st Qu.: 15206
## Median :0.01406 Median :0.9701 Median : 1.5274 Median : 17867
## Mean :0.01922 Mean :0.9463 Mean : 4.7226 Mean : 24423
## 3rd Qu.:0.02050 3rd Qu.:0.9967 3rd Qu.: 3.3206 3rd Qu.: 26054
## Max. :0.91875 Max. :1.0000 Max. :94.8069 Max. :1167367
##
## mining info:
## data ntransactions support confidence
## trans 1270610 0.01 0.8
# Print the six rules with the highest lift.
inspect(head(rules, n = 6L, by = "lift"))
## lhs rhs support confidence lift count
## [1] {Station=#########,
## Category=P,
## AgencyName=INTERNAL REVENUE SERVICE} => {Occupation=0512} 0.01013608 0.9652976 94.80689 12879
## [2] {Category=P,
## AgencyName=INTERNAL REVENUE SERVICE} => {Occupation=0512} 0.01018172 0.8133409 79.88243 12937
## [3] {Station=#########,
## Category=O,
## AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM,
## Fulltime} => {Occupation=0007} 0.01326686 0.9997628 74.80764 16857
## [4] {Station=#########,
## PayPlan=GL,
## Category=O,
## AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM} => {Occupation=0007} 0.01241372 0.9997465 74.80642 15773
## [5] {Station=#########,
## PayPlan=GL,
## Category=O,
## AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM,
## Fulltime} => {Occupation=0007} 0.01241372 0.9997465 74.80642 15773
## [6] {Station=#########,
## Category=O,
## SupervisoryStatus=8,
## AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM,
## Fulltime} => {Occupation=0007} 0.01218627 0.9997417 74.80607 15484
library(arulesViz)
## Loading required package: grid
# Interactive HTML rendering of the rules using the default plot method.
# As the warning below states, only the 1000 rules with the highest lift are
# drawn because the full rule set is too large.
plot(rules, engine = "html")
## Warning: plot: Too many rules supplied. Only plotting the best 1000 rules
## using measure lift (change parameter max if needed)
# Interactive HTML graph view of the rules; this method also truncates the
# input (see the warning that follows this call).
plot(rules, method = "graph", engine = "html")
## Warning: Too many rules supplied. Only plotting the best 100 rules using
## lift (change control parameter max if needed)