Load data file from project 1

load("Employment_2009.rda")

summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73              
##                                   AgencyName      Fulltime      
##  VETERANS HEALTH ADMINISTRATION        :260206   Mode :logical  
##  INTERNAL REVENUE SERVICE              :104069   FALSE:103243   
##  SOCIAL SECURITY ADMINISTRATION        : 63229   TRUE :1167367  
##  TRANSPORTATION SECURITY ADMINISTRATION: 61324                  
##  CUSTOMS AND BORDER PROTECTION         : 55006                  
##  (Other)                               :726714                  
##  NA's                                  :    62                  
##   Seasonal      
##  Mode :logical  
##  FALSE:1234680  
##  TRUE :35930    
##                 
##                 
##                 
## 

Important Note: I did not clean the data to remove duplicates, etc. You need to do this first if you have not done it for project 1! remove attributes not useful for predictive modeling

dat2 <- dat
dat2$PseudoID <- NULL
dat2$Name <- NULL
dat2$Date <- NULL

make some attributes measured on an ordinal scale continuous.

Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.

dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
dat2$LOS <- as.numeric(sub("-.*|\\+|<", "", dat2$LOS))
## Warning: NAs introduced by coercion

prepare class variable for classification (we need a discrete class label)

dat2$Pay <- cut(dat2$Pay,
  breaks = c(0, 50000, 75000, 100000 ,Inf),
  labels = c("<50k", "50-75k", "75k-100k", ">100k"))

summary(dat2$Pay)
##     <50k   50-75k 75k-100k    >100k     NA's 
##   398569   348533   241641   279320     2547

Only consider very large agencies with 10000+ employees

agencies <- names(which(table(dat2$AgencyName)>10000))
agencies
##  [1] "BUREAU OF LAND MANAGEMENT"               
##  [2] "BUREAU OF PRISONS/FEDERAL PRISON SYSTEM" 
##  [3] "BUREAU OF THE CENSUS"                    
##  [4] "CITIZENSHIP AND IMMIGRATION SERVICES"    
##  [5] "CUSTOMS AND BORDER PROTECTION"           
##  [6] "DEPARTMENT OF ENERGY"                    
##  [7] "DEPARTMENT OF STATE"                     
##  [8] "ENVIRONMENTAL PROTECTION AGENCY"         
##  [9] "EXEC OFC US ATTORNEY AND OFF US ATTORNEY"
## [10] "FEDERAL AVIATION ADMINISTRATION"         
## [11] "FEDERAL BUREAU OF INVESTIGATION"         
## [12] "FEDERAL EMERGENCY MANAGEMENT AGENCY"     
## [13] "FOOD AND DRUG ADMINISTRATION"            
## [14] "FOREST SERVICE"                          
## [15] "IMMIGRATION AND CUSTOMS ENFORCEMENT"     
## [16] "INDIAN HEALTH SERVICE"                   
## [17] "INTERNAL REVENUE SERVICE"                
## [18] "NAT OCEANIC AND ATMOSPHERIC ADMIN"       
## [19] "NATIONAL INSTITUTES OF HEALTH"           
## [20] "NATIONAL PARK SERVICE"                   
## [21] "NATURAL RESOURCES CONSERVATION SERVICE"  
## [22] "OFC SEC HEALTH AND HUMAN SERVICES"       
## [23] "SOCIAL SECURITY ADMINISTRATION"          
## [24] "TRANSPORTATION SECURITY ADMINISTRATION"  
## [25] "VETERANS BENEFITS ADMINISTRATION"        
## [26] "VETERANS HEALTH ADMINISTRATION"
dat3 <- dat2[dat2$AgencyName %in% agencies, ]
# get rid of unused agency levels
dat3$AgencyName <- factor(dat3$AgencyName)
dat3$Agency <- factor(dat3$Agency)

dim(dat3)
## [1] 941291     17

Sample to make the algorithm faster

sample_ID <- sample(1:nrow(dat3), size = 10000)
dat4 <- dat3[sample_ID, ]

Learn CART model

library(rpart)

model <- rpart(Pay ~ Age + Education, data = dat4)
model
## n=9979 (21 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 9979 6323 <50k (0.36636938 0.28890670 0.16735144 0.17737248)  
##   2) Education< 12.5 5744 2772 <50k (0.51740947 0.29979109 0.11507660 0.06772284) *
##   3) Education>=12.5 4235 2854 >100k (0.16151122 0.27414404 0.23825266 0.32609209)  
##     6) Age< 32.5 816  499 50-75k (0.37132353 0.38848039 0.16299020 0.07720588) *
##     7) Age>=32.5 3419 2101 >100k (0.11143609 0.24685581 0.25621527 0.38549283) *
library("rpart.plot")
rpart.plot(model, extra = 2, under = TRUE, varlen=0, faclen=0)

model <- rpart(Pay ~ Age + Education + LOS, data = dat4)
rpart.plot(model, extra = 2, under = TRUE, varlen=0, faclen=0)

Include Agency

Note: Beware of nominal/ordinal variable with many different values!!! These will take forever, because it will try always all possible combinations to split them into two subsets.

model <- rpart(Pay ~ Age + Education + LOS + Agency, data = dat4)
model
## n=9979 (21 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 9979 6323 <50k (0.366369376 0.288906704 0.167351438 0.177372482)  
##    2) Education< 12.5 5700 2774 <50k (0.513333333 0.302982456 0.115789474 0.067894737)  
##      4) Agency=AG11,AG16,CM63,HE10,HE37,HSBC,TR93,VATA 3375 1063 <50k (0.685037037 0.214222222 0.072296296 0.028444444) *
##      5) Agency=CM54,DJ02,DJ03,DJ09,DN00,EP00,HE36,HE38,HSAB,HSBB,HSBD,HSCB,IN05,IN10,ST00,SZ00,TD03,VALA 2325 1321 50-75k (0.264086022 0.431827957 0.178924731 0.125161290)  
##       10) LOS< 4 615  227 <50k (0.630894309 0.294308943 0.058536585 0.016260163) *
##       11) LOS>=4 1710  887 50-75k (0.132163743 0.481286550 0.222222222 0.164327485)  
##         22) Agency=CM54,DJ02,DJ03,DJ09,DN00,EP00,HE36,HE38,HSAB,HSBB,HSBD,HSCB,IN05,IN10,SZ00,VALA 1441  669 50-75k (0.147814018 0.535739070 0.231783484 0.084663428) *
##         23) Agency=ST00,TD03 269  110 >100k (0.048327138 0.189591078 0.171003717 0.591078067) *
##    3) Education>=12.5 4279 2896 >100k (0.170600608 0.270156579 0.236036457 0.323206357)  
##      6) LOS< 7.5 2054 1380 50-75k (0.300876339 0.328140214 0.185004869 0.185978578)  
##       12) Education< 14.5 1167  673 <50k (0.423307626 0.361610968 0.165381320 0.049700086)  
##         24) LOS< 2 502  183 <50k (0.635458167 0.268924303 0.067729084 0.027888446) *
##         25) LOS>=2 665  378 50-75k (0.263157895 0.431578947 0.239097744 0.066165414) *
##       13) Education>=14.5 887  563 >100k (0.139797069 0.284103720 0.210822999 0.365276212)  
##         26) Education>=15.5 657  429 50-75k (0.161339422 0.347031963 0.263318113 0.228310502) *
##         27) Education< 15.5 230   56 >100k (0.078260870 0.104347826 0.060869565 0.756521739) *
##      7) LOS>=7.5 2225 1224 >100k (0.050337079 0.216629213 0.283146067 0.449887640)  
##       14) Agency=AG11,AG16,CM54,DJ03,HE37,HSAB,HSBB,HSBD,IN05,IN10,SZ00,TR93,VALA,VATA 1552 1051 >100k (0.070231959 0.293814433 0.313144330 0.322809278)  
##         28) Education< 14.5 1055  690 50-75k (0.093838863 0.345971564 0.336492891 0.223696682) *
##         29) Education>=14.5 497  232 >100k (0.020120724 0.183098592 0.263581489 0.533199195) *
##       15) Agency=CM63,DJ02,DJ09,DN00,EP00,HE10,HE36,HE38,HSBC,HSCB,ST00,TD03 673  173 >100k (0.004457652 0.038632987 0.213967311 0.742942051) *
rpart.plot(model, extra = 2, under = TRUE, varlen=0,
  faclen=0)

Try an alternative encoding for Agency (binary indicator variables using dummyVars from caret)

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
mm <- dummyVars(~ Agency + Age + Education + LOS, dat4)
dat5 <- as.data.frame(predict(mm, dat4))

dat5$Pay <- dat4$Pay
summary(dat5)
##   Agency.AG11      Agency.AG16      Agency.CM54      Agency.CM63    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0334   Mean   :0.0117   Mean   :0.0123   Mean   :0.0176  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##   Agency.DJ02      Agency.DJ03      Agency.DJ09      Agency.DN00    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0341   Mean   :0.0379   Mean   :0.0142   Mean   :0.0159  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##   Agency.EP00      Agency.HE10     Agency.HE36      Agency.HE37    
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.0203   Mean   :0.014   Mean   :0.0134   Mean   :0.0143  
##  3rd Qu.:0.0000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##                                                                    
##   Agency.HE38     Agency.HSAB     Agency.HSBB      Agency.HSBC    
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.021   Mean   :0.012   Mean   :0.0211   Mean   :0.0671  
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##                                                                   
##   Agency.HSBD      Agency.HSCB      Agency.IN05      Agency.IN10   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.000  
##  Mean   :0.0605   Mean   :0.0196   Mean   :0.0086   Mean   :0.023  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##                                                                    
##   Agency.ST00      Agency.SZ00      Agency.TD03      Agency.TR93    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0123   Mean   :0.0678   Mean   :0.0463   Mean   :0.1111  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##   Agency.VALA      Agency.VATA          Age          Education     
##  Min.   :0.0000   Min.   :0.0000   Min.   :15.00   Min.   : 1.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:35.00   1st Qu.: 4.000  
##  Median :0.0000   Median :0.0000   Median :45.00   Median :10.000  
##  Mean   :0.0188   Mean   :0.2717   Mean   :43.34   Mean   : 9.981  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:50.00   3rd Qu.:13.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :75.00   Max.   :22.000  
##                                                    NA's   :342     
##       LOS              Pay      
##  Min.   : 1.00   <50k    :3656  
##  1st Qu.: 3.00   50-75k  :2883  
##  Median :10.00   75k-100k:1670  
##  Mean   :11.27   >100k   :1770  
##  3rd Qu.:20.00   NA's    :  21  
##  Max.   :35.00                  
##  NA's   :99
model <- rpart(Pay ~ ., data = dat5)
model
## n=9979 (21 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 9979 6323 <50k (0.36636938 0.28890670 0.16735144 0.17737248)  
##    2) Education< 12.5 5762 2782 <50k (0.51718153 0.30041652 0.11489066 0.06751128)  
##      4) LOS< 7.5 2953  842 <50k (0.71486624 0.21672875 0.04740941 0.02099560) *
##      5) LOS>=7.5 2809 1718 50-75k (0.30936276 0.38839445 0.18583126 0.11641153)  
##       10) Agency.VATA>=0.5 751  311 <50k (0.58588549 0.29294274 0.09587217 0.02529960) *
##       11) Agency.VATA< 0.5 2058 1187 50-75k (0.20845481 0.42322643 0.21865889 0.14965986)  
##         22) Agency.TD03< 0.5 1865 1022 50-75k (0.22841823 0.45201072 0.22466488 0.09490617) *
##         23) Agency.TD03>=0.5 193   62 >100k (0.01554404 0.14507772 0.16062176 0.67875648) *
##    3) Education>=12.5 4217 2836 >100k (0.16030353 0.27317999 0.23903249 0.32748399)  
##      6) LOS< 7.5 1997 1326 50-75k (0.28442664 0.33600401 0.18928393 0.19028543)  
##       12) Education< 14.5 1124  672 <50k (0.40213523 0.37366548 0.17259786 0.05160142)  
##         24) LOS< 2 469  182 <50k (0.61194030 0.28571429 0.07249467 0.02985075) *
##         25) LOS>=2 655  369 50-75k (0.25190840 0.43664122 0.24427481 0.06717557) *
##       13) Education>=14.5 873  551 >100k (0.13287514 0.28751432 0.21076747 0.36884307)  
##         26) Education>=15.5 654  426 50-75k (0.16055046 0.34862385 0.26299694 0.22782875) *
##         27) Education< 15.5 219   46 >100k (0.05022831 0.10502283 0.05479452 0.78995434) *
##      7) LOS>=7.5 2220 1219 >100k (0.04864865 0.21666667 0.28378378 0.45090090) *
rpart.plot(model, extra = 2, under = TRUE, varlen=0,
  faclen=0)

Use Caret

library(caret)

Note: There are missing values. rpart can deal with missing values, so I pass them on to rpart using na.pass. caret recodes Agency automatically with dummy variables.

fit <- train(Pay ~ Age + Education + LOS + Agency, data = dat4 , method = "rpart",
  na.action = na.pass,
  trControl = trainControl(method = "cv", number = 10),
  tuneLength=10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
fit
## CART 
## 
## 9545 samples
##    4 predictor
##    4 classes: '<50k', '50-75k', '75k-100k', '>100k' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9979, 9003, 9002, 9002, 9003, 9001, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.004006537  0.5665914  0.3883035
##   0.005614424  0.5589744  0.3793249
##   0.008223944  0.5529604  0.3697632
##   0.012494069  0.5446434  0.3610230
##   0.016289736  0.5333184  0.3462535
##   0.017713111  0.5187853  0.3224940
##   0.034793611  0.4752924  0.2516402
##   0.035109916  0.4709838  0.2438488
##   0.046022458  0.4473433  0.2134250
##   0.111497707  0.3985398  0.1022196
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.004006537.
rpart.plot(fit$finalModel, extra = 2, under = TRUE, varlen=0, faclen=0)

variable importance

varImp(fit)
## rpart variable importance
## 
##   only 20 most important variables shown (out of 28)
## 
##             Overall
## LOS        100.0000
## Education   77.6562
## Age         56.7109
## AgencyTD03  42.4123
## AgencyHSBC  25.8027
## AgencyVATA  17.2976
## AgencyDJ03   8.6975
## AgencyHSBD   8.4228
## AgencyHSCB   3.7583
## AgencyEP00   3.1140
## AgencyTR93   2.7408
## AgencyDJ02   2.7394
## AgencySZ00   1.0972
## AgencyHE36   0.5085
## AgencyHE38   0.0000
## AgencyIN10   0.0000
## AgencyDJ09   0.0000
## AgencyIN05   0.0000
## AgencyAG16   0.0000
## AgencyVALA   0.0000

test using a test set that was not seen by the algorithm

testing <- dat3[-sample_ID, ]
testing <- testing[sample(1:nrow(testing), size = 1000), ]

Note: There are missing values. I pass them on to rpart.

pred <- predict(fit, newdata = testing, na.action = na.pass)
head(pred)
## [1] <50k   >100k  >100k  <50k   <50k   50-75k
## Levels: <50k 50-75k 75k-100k >100k
confusionMatrix(data = pred, testing$Pay)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <50k 50-75k 75k-100k >100k
##   <50k      279    111       27     9
##   50-75k     66    136       64    29
##   75k-100k    0      0        0     0
##   >100k      11     60       70   136
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5521          
##                  95% CI : (0.5206, 0.5833)
##     No Information Rate : 0.3567          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3678          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: <50k Class: 50-75k Class: 75k-100k
## Sensitivity               0.7837        0.4430          0.0000
## Specificity               0.7710        0.7699          1.0000
## Pos Pred Value            0.6549        0.4610             NaN
## Neg Pred Value            0.8654        0.7568          0.8387
## Prevalence                0.3567        0.3076          0.1613
## Detection Rate            0.2796        0.1363          0.0000
## Detection Prevalence      0.4269        0.2956          0.0000
## Balanced Accuracy         0.7774        0.6064          0.5000
##                      Class: >100k
## Sensitivity                0.7816
## Specificity                0.8289
## Pos Pred Value             0.4910
## Neg Pred Value             0.9473
## Prevalence                 0.1743
## Detection Rate             0.1363
## Detection Prevalence       0.2776
## Balanced Accuracy          0.8052

Todo