Load data file from project 1

load("Employment_2009.rda")

summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73              
##                                   AgencyName      Fulltime      
##  VETERANS HEALTH ADMINISTRATION        :260206   Mode :logical  
##  INTERNAL REVENUE SERVICE              :104069   FALSE:103243   
##  SOCIAL SECURITY ADMINISTRATION        : 63229   TRUE :1167367  
##  TRANSPORTATION SECURITY ADMINISTRATION: 61324                  
##  CUSTOMS AND BORDER PROTECTION         : 55006                  
##  (Other)                               :726714                  
##  NA's                                  :    62                  
##   Seasonal      
##  Mode :logical  
##  FALSE:1234680  
##  TRUE :35930    
##                 
##                 
##                 
## 

Important Note: I did not clean the data to remove duplicates, etc. You need to do this first if you have not done it for project 1! Remove attributes not useful for predictive modeling

dat2 <- dat

dat2$PseudoID <- NULL
dat2$Name <- NULL
dat2$Date <- NULL

Make some attributes measured on an ordinal scale continuous

Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.

dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
dat2$LOS <- as.numeric(sub("-.*|\\+|<", "", dat2$LOS))
## Warning: NAs introduced by coercion
dat2$SupervisoryStatus <- as.numeric(as.character(dat2$SupervisoryStatus))
## Warning: NAs introduced by coercion

Select some features for clustering

dat3 <- dat2[,c("Age", "LOS", "Education", "SupervisoryStatus", "Pay")]
dim(dat3)
## [1] 1270610       5
summary(dat3)
##       Age             LOS          Education     SupervisoryStatus
##  Min.   :15.00   Min.   : 1.00   Min.   : 1.00   Min.   :2.000    
##  1st Qu.:35.00   1st Qu.: 3.00   1st Qu.: 6.00   1st Qu.:8.000    
##  Median :45.00   Median :10.00   Median :11.00   Median :8.000    
##  Mean   :43.71   Mean   :11.83   Mean   :10.45   Mean   :7.252    
##  3rd Qu.:55.00   3rd Qu.:20.00   3rd Qu.:13.00   3rd Qu.:8.000    
##  Max.   :75.00   Max.   :35.00   Max.   :22.00   Max.   :8.000    
##  NA's   :108     NA's   :9644    NA's   :45857   NA's   :35       
##       Pay        
##  Min.   :     0  
##  1st Qu.: 45693  
##  Median : 65810  
##  Mean   : 74122  
##  3rd Qu.: 95620  
##  Max.   :393411  
##  NA's   :2219

Note: k-means does not like missing values

take <- complete.cases(dat3)

dat3 <- dat3[take,]
dim(dat3)
## [1] 1213929       5
summary(dat3)
##       Age             LOS          Education     SupervisoryStatus
##  Min.   :15.00   Min.   : 1.00   Min.   : 1.00   Min.   :2.000    
##  1st Qu.:35.00   1st Qu.: 3.00   1st Qu.: 6.00   1st Qu.:8.000    
##  Median :45.00   Median :10.00   Median :11.00   Median :8.000    
##  Mean   :43.87   Mean   :12.15   Mean   :10.45   Mean   :7.232    
##  3rd Qu.:55.00   3rd Qu.:20.00   3rd Qu.:13.00   3rd Qu.:8.000    
##  Max.   :75.00   Max.   :35.00   Max.   :22.00   Max.   :8.000    
##       Pay        
##  Min.   :     0  
##  1st Qu.: 46625  
##  Median : 67613  
##  Mean   : 75166  
##  3rd Qu.: 96484  
##  Max.   :393411

fix supervisory Status (0-7; more is better)

dat3$SupervisoryStatus <- 8-dat3$SupervisoryStatus

Cluster using k-means

dat3 <- scale(dat3)
summary(dat3)
##       Age                LOS            Education       SupervisoryStatus
##  Min.   :-2.50515   Min.   :-1.1128   Min.   :-1.9191   Min.   :-0.3913  
##  1st Qu.:-0.76988   1st Qu.:-0.9132   1st Qu.:-0.9041   1st Qu.:-0.3913  
##  Median : 0.09776   Median :-0.2147   Median : 0.1109   Median :-0.3913  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.96539   3rd Qu.: 0.7832   3rd Qu.: 0.5169   3rd Qu.:-0.3913  
##  Max.   : 2.70066   Max.   : 2.2800   Max.   : 2.3438   Max.   : 2.6664  
##       Pay         
##  Min.   :-2.0194  
##  1st Qu.:-0.7668  
##  Median :-0.2029  
##  Mean   : 0.0000  
##  3rd Qu.: 0.5727  
##  Max.   : 8.5501
set.seed(1000)

k <- 3
km <- kmeans(dat3, centers = k)
str(km)
## List of 9
##  $ cluster     : Named int [1:1213929] 2 2 2 2 2 2 2 2 1 2 ...
##   ..- attr(*, "names")= chr [1:1213929] "1" "2" "3" "4" ...
##  $ centers     : num [1:3, 1:5] -0.651 0.298 0.663 -0.743 0.526 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:5] "Age" "LOS" "Education" "SupervisoryStatus" ...
##  $ totss       : num 6069640
##  $ withinss    : num [1:3] 1289075 570374 1568056
##  $ tot.withinss: num 3427505
##  $ betweenss   : num 2642135
##  $ size        : int [1:3] 569904 153833 490192
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
km$centers
##          Age        LOS  Education SupervisoryStatus        Pay
## 1 -0.6508947 -0.7426088 -0.1931255        -0.3776919 -0.5478810
## 2  0.2978375  0.5264870  0.2572201         2.5916395  0.8593497
## 3  0.6632713  0.6981441  0.1438090        -0.3742035  0.3672912

ggparcoord is better than parcoord in MASS

library("GGally")
ggparcoord(km$centers)

ggparcoord(cbind(data.frame(km$centers), data.frame(id = as.character(1:k))), columns = 1:ncol(km$centers), groupColumn = 'id')

Find original observations for clusters

dat_used <- dat2[take,]
dat_cluster_1 <- dat_used[km$cluster == 1,]
dat_cluster_2 <- dat_used[km$cluster == 2,]
dat_cluster_3 <- dat_used[km$cluster == 3,]

summary(dat_cluster_1[,c("Age", "Education", "LOS", "Agency", "Pay")])
##       Age          Education           LOS            Agency      
##  Min.   :15.00   Min.   : 1.000   Min.   : 1.00   VATA   :134106  
##  1st Qu.:30.00   1st Qu.: 4.000   1st Qu.: 1.00   HSBC   : 48962  
##  Median :35.00   Median :10.000   Median : 5.00   TR93   : 46978  
##  Mean   :36.37   Mean   : 9.502   Mean   : 4.71   HSBD   : 34792  
##  3rd Qu.:45.00   3rd Qu.:13.000   3rd Qu.: 5.00   SZ00   : 28306  
##  Max.   :75.00   Max.   :22.000   Max.   :25.00   DJ03   : 19283  
##                                                   (Other):257477  
##       Pay        
##  Min.   :     0  
##  1st Qu.: 37706  
##  Median : 49651  
##  Mean   : 54773  
##  3rd Qu.: 67613  
##  Max.   :215221  
## 
summary(dat_cluster_2[,c("Age", "Education", "LOS", "Agency", "Pay")])
##       Age          Education          LOS            Agency     
##  Min.   :15.00   Min.   : 1.00   Min.   : 1.00   VATA   :21217  
##  1st Qu.:40.00   1st Qu.: 8.00   1st Qu.:10.00   TR93   : 9196  
##  Median :50.00   Median :13.00   Median :20.00   DJ03   : 8717  
##  Mean   :47.31   Mean   :11.72   Mean   :17.43   HSBD   : 8526  
##  3rd Qu.:55.00   3rd Qu.:15.00   3rd Qu.:25.00   HSBC   : 7779  
##  Max.   :75.00   Max.   :22.00   Max.   :35.00   SZ00   : 6453  
##                                                  (Other):91945  
##       Pay        
##  Min.   :     0  
##  1st Qu.: 76072  
##  Median :104525  
##  Mean   :107152  
##  3rd Qu.:133543  
##  Max.   :389975  
## 
summary(dat_cluster_3[,c("Age", "Education", "LOS", "Agency", "Pay")])
##       Age          Education          LOS            Agency      
##  Min.   :25.00   Min.   : 1.00   Min.   : 1.00   VATA   :101924  
##  1st Qu.:45.00   1st Qu.: 7.00   1st Qu.:15.00   TR93   : 45817  
##  Median :50.00   Median :13.00   Median :20.00   SZ00   : 28140  
##  Mean   :51.52   Mean   :11.16   Mean   :19.15   TD03   : 24360  
##  3rd Qu.:55.00   3rd Qu.:14.00   3rd Qu.:25.00   AG11   : 12117  
##  Max.   :75.00   Max.   :22.00   Max.   :35.00   DJ02   : 11883  
##                                                  (Other):265951  
##       Pay        
##  Min.   :     0  
##  1st Qu.: 60490  
##  Median : 85281  
##  Mean   : 88837  
##  3rd Qu.:109570  
##  Max.   :393411  
## 

Where do peope in different clusters work?

head(sort(table(dat_cluster_1$Agency)/nrow(dat_cluster_1), decreasing = TRUE))
## 
##       VATA       HSBC       TR93       HSBD       SZ00       DJ03 
## 0.23531332 0.08591272 0.08243143 0.06104888 0.04966801 0.03383552
head(sort(table(dat_cluster_2$Agency)/nrow(dat_cluster_2), decreasing = TRUE))
## 
##       VATA       TR93       DJ03       HSBD       HSBC       SZ00 
## 0.13792229 0.05977911 0.05666534 0.05542374 0.05056782 0.04194809
head(sort(table(dat_cluster_3$Agency)/nrow(dat_cluster_3), decreasing = TRUE))
## 
##       VATA       TR93       SZ00       TD03       AG11       DJ02 
## 0.20792669 0.09346746 0.05740608 0.04969481 0.02471889 0.02424152
lift_cluster_1 <- sort((table(dat_cluster_1$Agency)/nrow(dat_cluster_1)) /
    (table(dat_used$Agency)/nrow(dat_used)), decreasing = TRUE)
head(lift_cluster_1, n = 10)
## 
##     HSBC     FQ02     HE10     AW00     EDEX     EO00     HSBD     HSAD 
## 1.718522 1.478000 1.422878 1.420039 1.420039 1.389169 1.359374 1.339088 
##     ZU00     FQ01 
## 1.331287 1.320479
tail(lift_cluster_1, n = 10)
## 
##      AB00      BW00      RE00      CX00      DB00      PU00      UJ00 
## 0.1183366 0.1014314 0.1014314 0.0000000 0.0000000 0.0000000 0.0000000 
##      VADA      VAHD      VAKA 
## 0.0000000 0.0000000 0.0000000
barplot(rev(head(lift_cluster_1, n=20)), horiz = TRUE, las = 2, xlab = "Lift")

lift_cluster_2 <- sort((table(dat_cluster_2$Agency)/nrow(dat_cluster_2)) /
    (table(dat_used$Agency)/nrow(dat_used)), decreasing = TRUE)
head(lift_cluster_2, n = 10)
## 
##     DB00     AB00     UJ00     GS31     NK00     EDED     GS16     GO00 
## 6.904812 6.795211 6.677180 5.523849 5.366025 4.592427 4.509265 4.384007 
##     GS26     GS04 
## 4.314620 4.304298
tail(lift_cluster_2, n = 10)
## 
##      VABD      VAAD      TR35      AP00      CX00      DLCA      EDEX 
## 0.4641890 0.3722270 0.3470323 0.0000000 0.0000000 0.0000000 0.0000000 
##      HE31      VAHD      VAKA 
## 0.0000000 0.0000000 0.0000000
barplot(rev(head(lift_cluster_2, n=20)), horiz = TRUE, las = 2, xlab = "Lift")

lift_cluster_3 <- sort((table(dat_cluster_3$Agency)/nrow(dat_cluster_3)) /
    (table(dat_used$Agency)/nrow(dat_used)), decreasing = TRUE)
head(lift_cluster_3, n = 10)
## 
##     CX00     VAHD     VAKA     RE00     PU00     FM00     GY00     HUJJ 
## 2.476436 2.476436 2.476436 2.063696 1.981149 1.950825 1.893745 1.857327 
##     VABA     NN23 
## 1.857327 1.850810
tail(lift_cluster_3, n = 10)
## 
##      GX00      UJ00      DF00      DB00      EO00      ZS00      AB00 
## 0.4127393 0.3809901 0.3537765 0.3095545 0.2691778 0.2653324 0.2063696 
##      HSBC      AW00      ZU00 
## 0.1610232 0.0000000 0.0000000
barplot(rev(head(lift_cluster_3, n=20)), horiz = TRUE, las = 2, xlab = "Lift")

Cluster agencies

agency_data <-
  aggregate(cbind(Age, Education, Grade, LOS, Pay, SupervisoryStatus, Fulltime, Seasonal) ~ Agency, data = dat_used, FUN = mean)
head(agency_data)
##   Agency      Age Education     Grade      LOS       Pay SupervisoryStatus
## 1   AB00 53.61111  9.583333 15.166667 11.97222  81720.33          2.916667
## 2   AG01 45.86957  9.217391  9.086957 17.82609 102045.78          6.173913
## 3   AG02 46.13782  9.971154 11.732601 14.40980  67538.08          7.119963
## 4   AG03 46.34811 13.835414 12.005453 13.49261  74518.65          7.502655
## 5   AG07 46.26789  9.623164 11.751813 16.41197  65072.94          7.363636
## 6   AG08 46.14979 10.970464 13.759494 17.99367  87031.31          7.227848
##    Fulltime    Seasonal
## 1 1.0000000 0.000000000
## 2 1.0000000 0.000000000
## 3 0.9271978 0.003205128
## 4 0.9809155 0.001147941
## 5 0.9888455 0.000000000
## 6 0.9873418 0.000000000

Note: Does averaging make sense for these variables? Why? Why not?

Add agency size

agency_size <- as.data.frame(table(dat_used$Agency))
colnames(agency_size) <- c("Agency", "size")
head(agency_size)
##   Agency size
## 1   AB00   36
## 2   AG01   23
## 3   AG02 2184
## 4   AG03 6969
## 5   AG07 5379
## 6   AG08  474
agency_data <- merge(agency_data, agency_size)
head(agency_data)
##   Agency      Age Education     Grade      LOS       Pay SupervisoryStatus
## 1   AB00 53.61111  9.583333 15.166667 11.97222  81720.33          2.916667
## 2   AG01 45.86957  9.217391  9.086957 17.82609 102045.78          6.173913
## 3   AG02 46.13782  9.971154 11.732601 14.40980  67538.08          7.119963
## 4   AG03 46.34811 13.835414 12.005453 13.49261  74518.65          7.502655
## 5   AG07 46.26789  9.623164 11.751813 16.41197  65072.94          7.363636
## 6   AG08 46.14979 10.970464 13.759494 17.99367  87031.31          7.227848
##    Fulltime    Seasonal size
## 1 1.0000000 0.000000000   36
## 2 1.0000000 0.000000000   23
## 3 0.9271978 0.003205128 2184
## 4 0.9809155 0.001147941 6969
## 5 0.9888455 0.000000000 5379
## 6 0.9873418 0.000000000  474
rownames(agency_data) <- agency_data$Agency
agency_data <- agency_data[,-1]

d <- dist(scale(agency_data))
cl <- hclust(d)
plot(cl)

Larger agencies only

agency_large <- subset(agency_data, subset = size > 5000)

d <- dist(scale(agency_large))
cl <- hclust(d)
plot(cl)

Ignoring size

d <- dist(scale(agency_large[,colnames(agency_large) != "size"]))
cl <- hclust(d)
plot(cl)

Look at the structure of the data using PCA

pc <- prcomp(scale(agency_large[,colnames(agency_large) != "size"]))
biplot(pc, col = c("grey", "red"))