load("Employment_2009.rda")
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
## AgencyName Fulltime
## VETERANS HEALTH ADMINISTRATION :260206 Mode :logical
## INTERNAL REVENUE SERVICE :104069 FALSE:103243
## SOCIAL SECURITY ADMINISTRATION : 63229 TRUE :1167367
## TRANSPORTATION SECURITY ADMINISTRATION: 61324
## CUSTOMS AND BORDER PROTECTION : 55006
## (Other) :726714
## NA's : 62
## Seasonal
## Mode :logical
## FALSE:1234680
## TRUE :35930
##
##
##
##
Important Note: I did not clean the data to remove duplicates, etc. You need to do this first if you have not done it for project 1! Remove attributes not useful for predictive modeling
dat2 <- dat
dat2$PseudoID <- NULL
dat2$Name <- NULL
dat2$Date <- NULL
Make some attributes measured on an ordinal scale continuous
Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.
dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
dat2$LOS <- as.numeric(sub("-.*|\\+|<", "", dat2$LOS))
## Warning: NAs introduced by coercion
dat2$SupervisoryStatus <- as.numeric(as.character(dat2$SupervisoryStatus))
## Warning: NAs introduced by coercion
dat3 <- dat2[,c("Age", "LOS", "Education", "SupervisoryStatus", "Pay")]
dim(dat3)
## [1] 1270610 5
summary(dat3)
## Age LOS Education SupervisoryStatus
## Min. :15.00 Min. : 1.00 Min. : 1.00 Min. :2.000
## 1st Qu.:35.00 1st Qu.: 3.00 1st Qu.: 6.00 1st Qu.:8.000
## Median :45.00 Median :10.00 Median :11.00 Median :8.000
## Mean :43.71 Mean :11.83 Mean :10.45 Mean :7.252
## 3rd Qu.:55.00 3rd Qu.:20.00 3rd Qu.:13.00 3rd Qu.:8.000
## Max. :75.00 Max. :35.00 Max. :22.00 Max. :8.000
## NA's :108 NA's :9644 NA's :45857 NA's :35
## Pay
## Min. : 0
## 1st Qu.: 45693
## Median : 65810
## Mean : 74122
## 3rd Qu.: 95620
## Max. :393411
## NA's :2219
Note: k-means does not like missing values
take <- complete.cases(dat3)
dat3 <- dat3[take,]
dim(dat3)
## [1] 1213929 5
summary(dat3)
## Age LOS Education SupervisoryStatus
## Min. :15.00 Min. : 1.00 Min. : 1.00 Min. :2.000
## 1st Qu.:35.00 1st Qu.: 3.00 1st Qu.: 6.00 1st Qu.:8.000
## Median :45.00 Median :10.00 Median :11.00 Median :8.000
## Mean :43.87 Mean :12.15 Mean :10.45 Mean :7.232
## 3rd Qu.:55.00 3rd Qu.:20.00 3rd Qu.:13.00 3rd Qu.:8.000
## Max. :75.00 Max. :35.00 Max. :22.00 Max. :8.000
## Pay
## Min. : 0
## 1st Qu.: 46625
## Median : 67613
## Mean : 75166
## 3rd Qu.: 96484
## Max. :393411
fix supervisory Status (0-7; more is better)
dat3$SupervisoryStatus <- 8-dat3$SupervisoryStatus
dat3 <- scale(dat3)
summary(dat3)
## Age LOS Education SupervisoryStatus
## Min. :-2.50515 Min. :-1.1128 Min. :-1.9191 Min. :-0.3913
## 1st Qu.:-0.76988 1st Qu.:-0.9132 1st Qu.:-0.9041 1st Qu.:-0.3913
## Median : 0.09776 Median :-0.2147 Median : 0.1109 Median :-0.3913
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.96539 3rd Qu.: 0.7832 3rd Qu.: 0.5169 3rd Qu.:-0.3913
## Max. : 2.70066 Max. : 2.2800 Max. : 2.3438 Max. : 2.6664
## Pay
## Min. :-2.0194
## 1st Qu.:-0.7668
## Median :-0.2029
## Mean : 0.0000
## 3rd Qu.: 0.5727
## Max. : 8.5501
set.seed(1000)
k <- 3
km <- kmeans(dat3, centers = k)
str(km)
## List of 9
## $ cluster : Named int [1:1213929] 2 2 2 2 2 2 2 2 1 2 ...
## ..- attr(*, "names")= chr [1:1213929] "1" "2" "3" "4" ...
## $ centers : num [1:3, 1:5] -0.651 0.298 0.663 -0.743 0.526 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:5] "Age" "LOS" "Education" "SupervisoryStatus" ...
## $ totss : num 6069640
## $ withinss : num [1:3] 1289075 570374 1568056
## $ tot.withinss: num 3427505
## $ betweenss : num 2642135
## $ size : int [1:3] 569904 153833 490192
## $ iter : int 4
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
km$centers
## Age LOS Education SupervisoryStatus Pay
## 1 -0.6508947 -0.7426088 -0.1931255 -0.3776919 -0.5478810
## 2 0.2978375 0.5264870 0.2572201 2.5916395 0.8593497
## 3 0.6632713 0.6981441 0.1438090 -0.3742035 0.3672912
ggparcoord is better than parcoord in MASS
library("GGally")
ggparcoord(km$centers)
ggparcoord(cbind(data.frame(km$centers), data.frame(id = as.character(1:k))), columns = 1:ncol(km$centers), groupColumn = 'id')
dat_used <- dat2[take,]
dat_cluster_1 <- dat_used[km$cluster == 1,]
dat_cluster_2 <- dat_used[km$cluster == 2,]
dat_cluster_3 <- dat_used[km$cluster == 3,]
summary(dat_cluster_1[,c("Age", "Education", "LOS", "Agency", "Pay")])
## Age Education LOS Agency
## Min. :15.00 Min. : 1.000 Min. : 1.00 VATA :134106
## 1st Qu.:30.00 1st Qu.: 4.000 1st Qu.: 1.00 HSBC : 48962
## Median :35.00 Median :10.000 Median : 5.00 TR93 : 46978
## Mean :36.37 Mean : 9.502 Mean : 4.71 HSBD : 34792
## 3rd Qu.:45.00 3rd Qu.:13.000 3rd Qu.: 5.00 SZ00 : 28306
## Max. :75.00 Max. :22.000 Max. :25.00 DJ03 : 19283
## (Other):257477
## Pay
## Min. : 0
## 1st Qu.: 37706
## Median : 49651
## Mean : 54773
## 3rd Qu.: 67613
## Max. :215221
##
summary(dat_cluster_2[,c("Age", "Education", "LOS", "Agency", "Pay")])
## Age Education LOS Agency
## Min. :15.00 Min. : 1.00 Min. : 1.00 VATA :21217
## 1st Qu.:40.00 1st Qu.: 8.00 1st Qu.:10.00 TR93 : 9196
## Median :50.00 Median :13.00 Median :20.00 DJ03 : 8717
## Mean :47.31 Mean :11.72 Mean :17.43 HSBD : 8526
## 3rd Qu.:55.00 3rd Qu.:15.00 3rd Qu.:25.00 HSBC : 7779
## Max. :75.00 Max. :22.00 Max. :35.00 SZ00 : 6453
## (Other):91945
## Pay
## Min. : 0
## 1st Qu.: 76072
## Median :104525
## Mean :107152
## 3rd Qu.:133543
## Max. :389975
##
summary(dat_cluster_3[,c("Age", "Education", "LOS", "Agency", "Pay")])
## Age Education LOS Agency
## Min. :25.00 Min. : 1.00 Min. : 1.00 VATA :101924
## 1st Qu.:45.00 1st Qu.: 7.00 1st Qu.:15.00 TR93 : 45817
## Median :50.00 Median :13.00 Median :20.00 SZ00 : 28140
## Mean :51.52 Mean :11.16 Mean :19.15 TD03 : 24360
## 3rd Qu.:55.00 3rd Qu.:14.00 3rd Qu.:25.00 AG11 : 12117
## Max. :75.00 Max. :22.00 Max. :35.00 DJ02 : 11883
## (Other):265951
## Pay
## Min. : 0
## 1st Qu.: 60490
## Median : 85281
## Mean : 88837
## 3rd Qu.:109570
## Max. :393411
##
Where do peope in different clusters work?
head(sort(table(dat_cluster_1$Agency)/nrow(dat_cluster_1), decreasing = TRUE))
##
## VATA HSBC TR93 HSBD SZ00 DJ03
## 0.23531332 0.08591272 0.08243143 0.06104888 0.04966801 0.03383552
head(sort(table(dat_cluster_2$Agency)/nrow(dat_cluster_2), decreasing = TRUE))
##
## VATA TR93 DJ03 HSBD HSBC SZ00
## 0.13792229 0.05977911 0.05666534 0.05542374 0.05056782 0.04194809
head(sort(table(dat_cluster_3$Agency)/nrow(dat_cluster_3), decreasing = TRUE))
##
## VATA TR93 SZ00 TD03 AG11 DJ02
## 0.20792669 0.09346746 0.05740608 0.04969481 0.02471889 0.02424152
lift_cluster_1 <- sort((table(dat_cluster_1$Agency)/nrow(dat_cluster_1)) /
(table(dat_used$Agency)/nrow(dat_used)), decreasing = TRUE)
head(lift_cluster_1, n = 10)
##
## HSBC FQ02 HE10 AW00 EDEX EO00 HSBD HSAD
## 1.718522 1.478000 1.422878 1.420039 1.420039 1.389169 1.359374 1.339088
## ZU00 FQ01
## 1.331287 1.320479
tail(lift_cluster_1, n = 10)
##
## AB00 BW00 RE00 CX00 DB00 PU00 UJ00
## 0.1183366 0.1014314 0.1014314 0.0000000 0.0000000 0.0000000 0.0000000
## VADA VAHD VAKA
## 0.0000000 0.0000000 0.0000000
barplot(rev(head(lift_cluster_1, n=20)), horiz = TRUE, las = 2, xlab = "Lift")
lift_cluster_2 <- sort((table(dat_cluster_2$Agency)/nrow(dat_cluster_2)) /
(table(dat_used$Agency)/nrow(dat_used)), decreasing = TRUE)
head(lift_cluster_2, n = 10)
##
## DB00 AB00 UJ00 GS31 NK00 EDED GS16 GO00
## 6.904812 6.795211 6.677180 5.523849 5.366025 4.592427 4.509265 4.384007
## GS26 GS04
## 4.314620 4.304298
tail(lift_cluster_2, n = 10)
##
## VABD VAAD TR35 AP00 CX00 DLCA EDEX
## 0.4641890 0.3722270 0.3470323 0.0000000 0.0000000 0.0000000 0.0000000
## HE31 VAHD VAKA
## 0.0000000 0.0000000 0.0000000
barplot(rev(head(lift_cluster_2, n=20)), horiz = TRUE, las = 2, xlab = "Lift")
lift_cluster_3 <- sort((table(dat_cluster_3$Agency)/nrow(dat_cluster_3)) /
(table(dat_used$Agency)/nrow(dat_used)), decreasing = TRUE)
head(lift_cluster_3, n = 10)
##
## CX00 VAHD VAKA RE00 PU00 FM00 GY00 HUJJ
## 2.476436 2.476436 2.476436 2.063696 1.981149 1.950825 1.893745 1.857327
## VABA NN23
## 1.857327 1.850810
tail(lift_cluster_3, n = 10)
##
## GX00 UJ00 DF00 DB00 EO00 ZS00 AB00
## 0.4127393 0.3809901 0.3537765 0.3095545 0.2691778 0.2653324 0.2063696
## HSBC AW00 ZU00
## 0.1610232 0.0000000 0.0000000
barplot(rev(head(lift_cluster_3, n=20)), horiz = TRUE, las = 2, xlab = "Lift")
agency_data <-
aggregate(cbind(Age, Education, Grade, LOS, Pay, SupervisoryStatus, Fulltime, Seasonal) ~ Agency, data = dat_used, FUN = mean)
head(agency_data)
## Agency Age Education Grade LOS Pay SupervisoryStatus
## 1 AB00 53.61111 9.583333 15.166667 11.97222 81720.33 2.916667
## 2 AG01 45.86957 9.217391 9.086957 17.82609 102045.78 6.173913
## 3 AG02 46.13782 9.971154 11.732601 14.40980 67538.08 7.119963
## 4 AG03 46.34811 13.835414 12.005453 13.49261 74518.65 7.502655
## 5 AG07 46.26789 9.623164 11.751813 16.41197 65072.94 7.363636
## 6 AG08 46.14979 10.970464 13.759494 17.99367 87031.31 7.227848
## Fulltime Seasonal
## 1 1.0000000 0.000000000
## 2 1.0000000 0.000000000
## 3 0.9271978 0.003205128
## 4 0.9809155 0.001147941
## 5 0.9888455 0.000000000
## 6 0.9873418 0.000000000
Note: Does averaging make sense for these variables? Why? Why not?
Add agency size
agency_size <- as.data.frame(table(dat_used$Agency))
colnames(agency_size) <- c("Agency", "size")
head(agency_size)
## Agency size
## 1 AB00 36
## 2 AG01 23
## 3 AG02 2184
## 4 AG03 6969
## 5 AG07 5379
## 6 AG08 474
agency_data <- merge(agency_data, agency_size)
head(agency_data)
## Agency Age Education Grade LOS Pay SupervisoryStatus
## 1 AB00 53.61111 9.583333 15.166667 11.97222 81720.33 2.916667
## 2 AG01 45.86957 9.217391 9.086957 17.82609 102045.78 6.173913
## 3 AG02 46.13782 9.971154 11.732601 14.40980 67538.08 7.119963
## 4 AG03 46.34811 13.835414 12.005453 13.49261 74518.65 7.502655
## 5 AG07 46.26789 9.623164 11.751813 16.41197 65072.94 7.363636
## 6 AG08 46.14979 10.970464 13.759494 17.99367 87031.31 7.227848
## Fulltime Seasonal size
## 1 1.0000000 0.000000000 36
## 2 1.0000000 0.000000000 23
## 3 0.9271978 0.003205128 2184
## 4 0.9809155 0.001147941 6969
## 5 0.9888455 0.000000000 5379
## 6 0.9873418 0.000000000 474
rownames(agency_data) <- agency_data$Agency
agency_data <- agency_data[,-1]
d <- dist(scale(agency_data))
cl <- hclust(d)
plot(cl)
Larger agencies only
agency_large <- subset(agency_data, subset = size > 5000)
d <- dist(scale(agency_large))
cl <- hclust(d)
plot(cl)
Ignoring size
d <- dist(scale(agency_large[,colnames(agency_large) != "size"]))
cl <- hclust(d)
plot(cl)
Look at the structure of the data using PCA
pc <- prcomp(scale(agency_large[,colnames(agency_large) != "size"]))
biplot(pc, col = c("grey", "red"))