load("Employment_2009.rda")
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
## AgencyName Fulltime
## VETERANS HEALTH ADMINISTRATION :260206 Mode :logical
## INTERNAL REVENUE SERVICE :104069 FALSE:103243
## SOCIAL SECURITY ADMINISTRATION : 63229 TRUE :1167367
## TRANSPORTATION SECURITY ADMINISTRATION: 61324
## CUSTOMS AND BORDER PROTECTION : 55006
## (Other) :726714
## NA's : 62
## Seasonal
## Mode :logical
## FALSE:1234680
## TRUE :35930
##
##
##
##
Important Note: I did not clean the data to remove duplicates, etc. You need to do this first if you have not done it for project 1! remove attributes not useful for predictive modeling
dat2 <- dat
dat2$PseudoID <- NULL
dat2$Name <- NULL
dat2$Date <- NULL
make some attributes measured on an ordinal scale continuous.
Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.
dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
dat2$LOS <- as.numeric(sub("-.*|\\+|<", "", dat2$LOS))
## Warning: NAs introduced by coercion
prepare class variable for classification (we need a discrete class label)
dat2$Pay <- cut(dat2$Pay,
breaks = c(0, 50000, 75000, 100000 ,Inf),
labels = c("<50k", "50-75k", "75k-100k", ">100k"))
summary(dat2$Pay)
## <50k 50-75k 75k-100k >100k NA's
## 398569 348533 241641 279320 2547
Only consider very large agencies with 10000+ employees
agencies <- names(which(table(dat2$AgencyName)>10000))
agencies
## [1] "BUREAU OF LAND MANAGEMENT"
## [2] "BUREAU OF PRISONS/FEDERAL PRISON SYSTEM"
## [3] "BUREAU OF THE CENSUS"
## [4] "CITIZENSHIP AND IMMIGRATION SERVICES"
## [5] "CUSTOMS AND BORDER PROTECTION"
## [6] "DEPARTMENT OF ENERGY"
## [7] "DEPARTMENT OF STATE"
## [8] "ENVIRONMENTAL PROTECTION AGENCY"
## [9] "EXEC OFC US ATTORNEY AND OFF US ATTORNEY"
## [10] "FEDERAL AVIATION ADMINISTRATION"
## [11] "FEDERAL BUREAU OF INVESTIGATION"
## [12] "FEDERAL EMERGENCY MANAGEMENT AGENCY"
## [13] "FOOD AND DRUG ADMINISTRATION"
## [14] "FOREST SERVICE"
## [15] "IMMIGRATION AND CUSTOMS ENFORCEMENT"
## [16] "INDIAN HEALTH SERVICE"
## [17] "INTERNAL REVENUE SERVICE"
## [18] "NAT OCEANIC AND ATMOSPHERIC ADMIN"
## [19] "NATIONAL INSTITUTES OF HEALTH"
## [20] "NATIONAL PARK SERVICE"
## [21] "NATURAL RESOURCES CONSERVATION SERVICE"
## [22] "OFC SEC HEALTH AND HUMAN SERVICES"
## [23] "SOCIAL SECURITY ADMINISTRATION"
## [24] "TRANSPORTATION SECURITY ADMINISTRATION"
## [25] "VETERANS BENEFITS ADMINISTRATION"
## [26] "VETERANS HEALTH ADMINISTRATION"
dat3 <- dat2[dat2$AgencyName %in% agencies, ]
# get rid of unused agency levels
dat3$AgencyName <- factor(dat3$AgencyName)
dat3$Agency <- factor(dat3$Agency)
dim(dat3)
## [1] 941291 17
Sample to make the algorithm faster
sample_ID <- sample(1:nrow(dat3), size = 10000)
dat4 <- dat3[sample_ID, ]
Learn CART model
library(rpart)
model <- rpart(Pay ~ Age + Education, data = dat4)
model
## n=9979 (21 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9979 6323 <50k (0.36636938 0.28890670 0.16735144 0.17737248)
## 2) Education< 12.5 5744 2772 <50k (0.51740947 0.29979109 0.11507660 0.06772284) *
## 3) Education>=12.5 4235 2854 >100k (0.16151122 0.27414404 0.23825266 0.32609209)
## 6) Age< 32.5 816 499 50-75k (0.37132353 0.38848039 0.16299020 0.07720588) *
## 7) Age>=32.5 3419 2101 >100k (0.11143609 0.24685581 0.25621527 0.38549283) *
library("rpart.plot")
rpart.plot(model, extra = 2, under = TRUE, varlen=0, faclen=0)
model <- rpart(Pay ~ Age + Education + LOS, data = dat4)
rpart.plot(model, extra = 2, under = TRUE, varlen=0, faclen=0)
Note: Beware of nominal/ordinal variable with many different values!!! These will take forever, because it will try always all possible combinations to split them into two subsets.
model <- rpart(Pay ~ Age + Education + LOS + Agency, data = dat4)
model
## n=9979 (21 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9979 6323 <50k (0.366369376 0.288906704 0.167351438 0.177372482)
## 2) Education< 12.5 5700 2774 <50k (0.513333333 0.302982456 0.115789474 0.067894737)
## 4) Agency=AG11,AG16,CM63,HE10,HE37,HSBC,TR93,VATA 3375 1063 <50k (0.685037037 0.214222222 0.072296296 0.028444444) *
## 5) Agency=CM54,DJ02,DJ03,DJ09,DN00,EP00,HE36,HE38,HSAB,HSBB,HSBD,HSCB,IN05,IN10,ST00,SZ00,TD03,VALA 2325 1321 50-75k (0.264086022 0.431827957 0.178924731 0.125161290)
## 10) LOS< 4 615 227 <50k (0.630894309 0.294308943 0.058536585 0.016260163) *
## 11) LOS>=4 1710 887 50-75k (0.132163743 0.481286550 0.222222222 0.164327485)
## 22) Agency=CM54,DJ02,DJ03,DJ09,DN00,EP00,HE36,HE38,HSAB,HSBB,HSBD,HSCB,IN05,IN10,SZ00,VALA 1441 669 50-75k (0.147814018 0.535739070 0.231783484 0.084663428) *
## 23) Agency=ST00,TD03 269 110 >100k (0.048327138 0.189591078 0.171003717 0.591078067) *
## 3) Education>=12.5 4279 2896 >100k (0.170600608 0.270156579 0.236036457 0.323206357)
## 6) LOS< 7.5 2054 1380 50-75k (0.300876339 0.328140214 0.185004869 0.185978578)
## 12) Education< 14.5 1167 673 <50k (0.423307626 0.361610968 0.165381320 0.049700086)
## 24) LOS< 2 502 183 <50k (0.635458167 0.268924303 0.067729084 0.027888446) *
## 25) LOS>=2 665 378 50-75k (0.263157895 0.431578947 0.239097744 0.066165414) *
## 13) Education>=14.5 887 563 >100k (0.139797069 0.284103720 0.210822999 0.365276212)
## 26) Education>=15.5 657 429 50-75k (0.161339422 0.347031963 0.263318113 0.228310502) *
## 27) Education< 15.5 230 56 >100k (0.078260870 0.104347826 0.060869565 0.756521739) *
## 7) LOS>=7.5 2225 1224 >100k (0.050337079 0.216629213 0.283146067 0.449887640)
## 14) Agency=AG11,AG16,CM54,DJ03,HE37,HSAB,HSBB,HSBD,IN05,IN10,SZ00,TR93,VALA,VATA 1552 1051 >100k (0.070231959 0.293814433 0.313144330 0.322809278)
## 28) Education< 14.5 1055 690 50-75k (0.093838863 0.345971564 0.336492891 0.223696682) *
## 29) Education>=14.5 497 232 >100k (0.020120724 0.183098592 0.263581489 0.533199195) *
## 15) Agency=CM63,DJ02,DJ09,DN00,EP00,HE10,HE36,HE38,HSBC,HSCB,ST00,TD03 673 173 >100k (0.004457652 0.038632987 0.213967311 0.742942051) *
rpart.plot(model, extra = 2, under = TRUE, varlen=0,
faclen=0)
Try an alternative encoding for Agency (binary indicator variables using dummyVars from caret)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
mm <- dummyVars(~ Agency + Age + Education + LOS, dat4)
dat5 <- as.data.frame(predict(mm, dat4))
dat5$Pay <- dat4$Pay
summary(dat5)
## Agency.AG11 Agency.AG16 Agency.CM54 Agency.CM63
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0334 Mean :0.0117 Mean :0.0123 Mean :0.0176
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Agency.DJ02 Agency.DJ03 Agency.DJ09 Agency.DN00
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0341 Mean :0.0379 Mean :0.0142 Mean :0.0159
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Agency.EP00 Agency.HE10 Agency.HE36 Agency.HE37
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.0203 Mean :0.014 Mean :0.0134 Mean :0.0143
## 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
##
## Agency.HE38 Agency.HSAB Agency.HSBB Agency.HSBC
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.021 Mean :0.012 Mean :0.0211 Mean :0.0671
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.000 Max. :1.0000 Max. :1.0000
##
## Agency.HSBD Agency.HSCB Agency.IN05 Agency.IN10
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.000
## Mean :0.0605 Mean :0.0196 Mean :0.0086 Mean :0.023
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000
##
## Agency.ST00 Agency.SZ00 Agency.TD03 Agency.TR93
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0123 Mean :0.0678 Mean :0.0463 Mean :0.1111
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Agency.VALA Agency.VATA Age Education
## Min. :0.0000 Min. :0.0000 Min. :15.00 Min. : 1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:35.00 1st Qu.: 4.000
## Median :0.0000 Median :0.0000 Median :45.00 Median :10.000
## Mean :0.0188 Mean :0.2717 Mean :43.34 Mean : 9.981
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:50.00 3rd Qu.:13.000
## Max. :1.0000 Max. :1.0000 Max. :75.00 Max. :22.000
## NA's :342
## LOS Pay
## Min. : 1.00 <50k :3656
## 1st Qu.: 3.00 50-75k :2883
## Median :10.00 75k-100k:1670
## Mean :11.27 >100k :1770
## 3rd Qu.:20.00 NA's : 21
## Max. :35.00
## NA's :99
model <- rpart(Pay ~ ., data = dat5)
model
## n=9979 (21 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9979 6323 <50k (0.36636938 0.28890670 0.16735144 0.17737248)
## 2) Education< 12.5 5762 2782 <50k (0.51718153 0.30041652 0.11489066 0.06751128)
## 4) LOS< 7.5 2953 842 <50k (0.71486624 0.21672875 0.04740941 0.02099560) *
## 5) LOS>=7.5 2809 1718 50-75k (0.30936276 0.38839445 0.18583126 0.11641153)
## 10) Agency.VATA>=0.5 751 311 <50k (0.58588549 0.29294274 0.09587217 0.02529960) *
## 11) Agency.VATA< 0.5 2058 1187 50-75k (0.20845481 0.42322643 0.21865889 0.14965986)
## 22) Agency.TD03< 0.5 1865 1022 50-75k (0.22841823 0.45201072 0.22466488 0.09490617) *
## 23) Agency.TD03>=0.5 193 62 >100k (0.01554404 0.14507772 0.16062176 0.67875648) *
## 3) Education>=12.5 4217 2836 >100k (0.16030353 0.27317999 0.23903249 0.32748399)
## 6) LOS< 7.5 1997 1326 50-75k (0.28442664 0.33600401 0.18928393 0.19028543)
## 12) Education< 14.5 1124 672 <50k (0.40213523 0.37366548 0.17259786 0.05160142)
## 24) LOS< 2 469 182 <50k (0.61194030 0.28571429 0.07249467 0.02985075) *
## 25) LOS>=2 655 369 50-75k (0.25190840 0.43664122 0.24427481 0.06717557) *
## 13) Education>=14.5 873 551 >100k (0.13287514 0.28751432 0.21076747 0.36884307)
## 26) Education>=15.5 654 426 50-75k (0.16055046 0.34862385 0.26299694 0.22782875) *
## 27) Education< 15.5 219 46 >100k (0.05022831 0.10502283 0.05479452 0.78995434) *
## 7) LOS>=7.5 2220 1219 >100k (0.04864865 0.21666667 0.28378378 0.45090090) *
rpart.plot(model, extra = 2, under = TRUE, varlen=0,
faclen=0)
library(caret)
Note: There are missing values. rpart can deal with missing values, so I pass them on to rpart using na.pass
. caret recodes Agency automatically with dummy variables.
fit <- train(Pay ~ Age + Education + LOS + Agency, data = dat4 , method = "rpart",
na.action = na.pass,
trControl = trainControl(method = "cv", number = 10),
tuneLength=10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
fit
## CART
##
## 9545 samples
## 4 predictor
## 4 classes: '<50k', '50-75k', '75k-100k', '>100k'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9979, 9003, 9002, 9002, 9003, 9001, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.004006537 0.5665914 0.3883035
## 0.005614424 0.5589744 0.3793249
## 0.008223944 0.5529604 0.3697632
## 0.012494069 0.5446434 0.3610230
## 0.016289736 0.5333184 0.3462535
## 0.017713111 0.5187853 0.3224940
## 0.034793611 0.4752924 0.2516402
## 0.035109916 0.4709838 0.2438488
## 0.046022458 0.4473433 0.2134250
## 0.111497707 0.3985398 0.1022196
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.004006537.
rpart.plot(fit$finalModel, extra = 2, under = TRUE, varlen=0, faclen=0)
variable importance
varImp(fit)
## rpart variable importance
##
## only 20 most important variables shown (out of 28)
##
## Overall
## LOS 100.0000
## Education 77.6562
## Age 56.7109
## AgencyTD03 42.4123
## AgencyHSBC 25.8027
## AgencyVATA 17.2976
## AgencyDJ03 8.6975
## AgencyHSBD 8.4228
## AgencyHSCB 3.7583
## AgencyEP00 3.1140
## AgencyTR93 2.7408
## AgencyDJ02 2.7394
## AgencySZ00 1.0972
## AgencyHE36 0.5085
## AgencyHE38 0.0000
## AgencyIN10 0.0000
## AgencyDJ09 0.0000
## AgencyIN05 0.0000
## AgencyAG16 0.0000
## AgencyVALA 0.0000
test using a test set that was not seen by the algorithm
testing <- dat3[-sample_ID, ]
testing <- testing[sample(1:nrow(testing), size = 1000), ]
Note: There are missing values. I pass them on to rpart.
pred <- predict(fit, newdata = testing, na.action = na.pass)
head(pred)
## [1] <50k >100k >100k <50k <50k 50-75k
## Levels: <50k 50-75k 75k-100k >100k
confusionMatrix(data = pred, testing$Pay)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <50k 50-75k 75k-100k >100k
## <50k 279 111 27 9
## 50-75k 66 136 64 29
## 75k-100k 0 0 0 0
## >100k 11 60 70 136
##
## Overall Statistics
##
## Accuracy : 0.5521
## 95% CI : (0.5206, 0.5833)
## No Information Rate : 0.3567
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3678
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: <50k Class: 50-75k Class: 75k-100k
## Sensitivity 0.7837 0.4430 0.0000
## Specificity 0.7710 0.7699 1.0000
## Pos Pred Value 0.6549 0.4610 NaN
## Neg Pred Value 0.8654 0.7568 0.8387
## Prevalence 0.3567 0.3076 0.1613
## Detection Rate 0.2796 0.1363 0.0000
## Detection Prevalence 0.4269 0.2956 0.0000
## Balanced Accuracy 0.7774 0.6064 0.5000
## Class: >100k
## Sensitivity 0.7816
## Specificity 0.8289
## Pos Pred Value 0.4910
## Neg Pred Value 0.9473
## Prevalence 0.1743
## Detection Rate 0.1363
## Detection Prevalence 0.2776
## Balanced Accuracy 0.8052