https://www.buzzfeed.com/jsvine/sharing-hundreds-of-millions-of-federal-payroll-records?utm_term=.jbk33NbAx#.uwVQQ8Vad
We use employment after 1 year in office (1Q 2009)
Data file from: https://archive.org/download/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/status/
dat_raw <- readLines("Status_Non_DoD_2009_03.txt")
head(dat_raw)
## [1] "000172152THOMAS,R SOLE 20090331AB0051010001350-54 17GS1515-19 0801P140969210F1"
## [2] "000299511NEESE,DANIEL L 20090331AB00FR750000055-59 13GS1425-29 1630A091791220F2"
## [3] "000343471NAME WITHHELD BY AGENCY20090331AB00#########45-49 10GS0910-14 1630A045044210F1"
## [4] "000547233GREEN,MICHAEL W 20090331AB00BE550000050-54 04GS1210-14 1630A067299210F1"
## [5] "000627711MICHAEL,R COONCE 20090331AB00TS300000045-49 04GS095-9 1630A******210F1"
## [6] "000806215GROSSO,RONALD A 20090331AB00IT700000055-59 13GS145-9 0301A******210F1"
Manually edited from https://archive.org/download/opm-federal-employment-data/docs/2015-02-11-opm-foia-response.pdf
Used file: header.csv
dat_header <- read.csv("headers.csv", header = TRUE)
dat <- t(sapply(dat_raw, FUN = function(x) trimws(substring(x, dat_header[,2], dat_header[,3]))))
dimnames(dat) <- NULL
dat <- as.data.frame(dat)
colnames(dat) <- dat_header[,1]
head(dat)
## PseudoID Name Date Agency Station Age
## 1 000172152 THOMAS,R SOLE 20090331 AB00 510100013 50-54
## 2 000299511 NEESE,DANIEL L 20090331 AB00 FR7500000 55-59
## 3 000343471 NAME WITHHELD BY AGENCY 20090331 AB00 ######### 45-49
## 4 000547233 GREEN,MICHAEL W 20090331 AB00 BE5500000 50-54
## 5 000627711 MICHAEL,R COONCE 20090331 AB00 TS3000000 45-49
## 6 000806215 GROSSO,RONALD A 20090331 AB00 IT7000000 55-59
## Education PayPlan Grade LOS Occupation Category Pay
## 1 17 GS 15 15-19 0801 P 140969
## 2 13 GS 14 25-29 1630 A 091791
## 3 10 GS 09 10-14 1630 A 045044
## 4 04 GS 12 10-14 1630 A 067299
## 5 04 GS 09 5-9 1630 A ******
## 6 13 GS 14 5-9 0301 A ******
## SupervisoryStatus Appointment Schedule NSFTP
## 1 2 10 F 1
## 2 2 20 F 2
## 3 2 10 F 1
## 4 2 10 F 1
## 5 2 10 F 1
## 6 2 10 F 1
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 030772 : 12688 *: 35 10 :673958
## A:474254 153200 : 10594 2: 147715 38 :273766
## B: 58487 027504 : 10453 4: 5132 15 :125912
## C:116111 038117 : 9306 5: 5343 48 : 63080
## O: 50943 056411 : 8421 6: 10615 32 : 41408
## P:325197 062050 : 7717 7: 6215 30 : 34454
## T:245395 (Other):1211431 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
Fix Pay (convert to a number by first making the factor into a character)
dat$Pay <- as.numeric(as.character(dat$Pay))
## Warning: NAs introduced by coercion
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
Data explanations: https://archive.org/download/opm-federal-employment-data/data/2014-09-to-2016-09/non-dod/translations/
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
hist(dat$Pay, breaks = 50)
barplot(table(dat$Education), las = 2)
boxplot(Pay~Education, data = dat)
med_pay_by_agency <- aggregate(Pay~Agency, data = dat, median)
head(med_pay_by_agency)
## Agency Pay
## 1 AB00 67299
## 2 AG01 82845
## 3 AG02 40320
## 4 AG03 57079
## 5 AG07 60612
## 6 AG08 88443
head(med_pay_by_agency[head(order(med_pay_by_agency$Pay)),])
## Agency Pay
## 47 BZ00 0
## 145 GE00 0
## 126 EDEZ 26088
## 194 HSBC 37011
## 61 CM63 37566
## 3 AG02 40320
head(med_pay_by_agency[head(order(med_pay_by_agency$Pay,
decreasing = TRUE)),])
## Agency Pay
## 254 NK00 153207.0
## 46 BW00 153200.0
## 39 AW00 153199.0
## 250 MA00 153126.5
## 281 SE00 149318.0
## 41 BF00 148893.0
translate Agency: https://archive.org/download/opm-federal-employment-data/data/1973-09-to-2014-06/SCTFILE.TXT
agency_trans <- readLines("SCTFILE.TXT")
## Warning in readLines("SCTFILE.TXT"): incomplete final line found on
## 'SCTFILE.TXT'
agency_ID <- sapply(agency_trans, FUN = function(x) substring(x, 3,6))
agency_name <- trimws(sapply(agency_trans, FUN = function(x) substring(x, 36,75)))
agency_trans_table <- data.frame(agency_ID = agency_ID, agency_name = agency_name)
head(agency_trans_table)
## agency_ID agency_name
## 1 AA00 ADMIN CONFERENCE OF THE UNITED STATES
## 2 AB00 AMERICAN BATTLE MONUMENTS COMMISSION
## 3 AC00 ADV CMSN ON INTERGOVERNMENTAL RELATIONS
## 4 AD00 U.S. ARMS CONTROL AND DISARMAMENT AGENCY
## 5 AE00 ATOMIC ENERGY COMMISSION
## 6 AF DEPARTMENT OF THE AIR FORCE
m <- match(dat$Agency, agency_trans_table$agency_ID)
dat$AgencyName <- agency_trans_table$agency_name[m]
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
## AgencyName
## VETERANS HEALTH ADMINISTRATION :260206
## INTERNAL REVENUE SERVICE :104069
## SOCIAL SECURITY ADMINISTRATION : 63229
## TRANSPORTATION SECURITY ADMINISTRATION: 61324
## CUSTOMS AND BORDER PROTECTION : 55006
## (Other) :726714
## NA's : 62
#" # Age
boxplot(Pay~Age, data = dat)
boxplot(Pay~LOS, data = dat)
Fix order od levels
levels(dat$LOS)
## [1] "< 1" "1-2" "10-14" "15-19" "20-24" "25-29" "3-4" "30-34"
## [9] "35+" "5-9" "UNSP"
dat$LOS <- ordered(dat$LOS, levels = c("< 1", "1-2", "3-4", "5-9",
"10-14", "15-19", "20-24", "25-29", "30-34", "35+", "UNSP"))
boxplot(Pay~LOS, data = dat)
Work Schedule
Full-time F, G Seasonal: G, J, Q, T
dat$Fulltime <- FALSE
dat$Fulltime[dat$Schedule == "F" | dat$Schedule == "G"] <- TRUE
dat$Seasonal <- FALSE
dat$Seasonal[dat$Schedule %in% c("G", "J", "Q", "T")] <- TRUE
summary(dat)
## PseudoID Name Date
## 004101442: 4 NAME WITHHELD BY OPM :304408 20090331:1270610
## 003167685: 3 NAME WITHHELD BY AGENCY:169859
## 003327886: 3 NAME UNKNOWN : 52
## 003509194: 3 SMITH,PATRICIA A : 20
## 004023994: 3 SMITH,JAMES E : 18
## 004188607: 3 SMITH,MICHAEL A : 17
## (Other) :1270591 (Other) :796236
## Agency Station Age Education
## VATA :260206 #########:381500 50-54 :203671 13 :319393
## TR93 :104069 110010001:103144 45-49 :194857 04 :271752
## SZ00 : 63229 240130031: 13718 55-59 :173459 17 :130966
## HSBC : 61324 241360031: 13179 40-44 :167649 07 : 74839
## HSBD : 55006 426540101: 12122 35-39 :137531 10 : 73869
## TD03 : 47034 241698005: 11452 30-34 :112498 15 : 57892
## (Other):679742 (Other) :735495 (Other):280945 (Other):341899
## PayPlan Grade LOS Occupation
## GS :885287 13 :145501 5-9 :236641 0301 : 61709
## SV : 61193 12 :136265 20-24 :160063 0303 : 56891
## VN : 45801 11 :128610 15-19 :151277 0610 : 56257
## AD : 45316 07 :101042 1-2 :147091 1802 : 55960
## WG : 39701 09 : 92546 10-14 :139189 2210 : 41421
## GL : 38389 14 : 81942 < 1 :105342 1811 : 39845
## (Other):154923 (Other):584704 (Other):331007 (Other):958527
## Category Pay SupervisoryStatus Appointment
## *: 223 Min. : 0 *: 35 10 :673958
## A:474254 1st Qu.: 45693 2: 147715 38 :273766
## B: 58487 Median : 65810 4: 5132 15 :125912
## C:116111 Mean : 74122 5: 5343 48 : 63080
## O: 50943 3rd Qu.: 95620 6: 10615 32 : 41408
## P:325197 Max. :393411 7: 6215 30 : 34454
## T:245395 NA's :2219 8:1095555 (Other): 58032
## Schedule NSFTP
## F :1136271 1:1081701
## P : 52803 2: 188909
## I : 45535
## G : 31096
## J : 2810
## Q : 2022
## (Other): 73
## AgencyName Fulltime
## VETERANS HEALTH ADMINISTRATION :260206 Mode :logical
## INTERNAL REVENUE SERVICE :104069 FALSE:103243
## SOCIAL SECURITY ADMINISTRATION : 63229 TRUE :1167367
## TRANSPORTATION SECURITY ADMINISTRATION: 61324
## CUSTOMS AND BORDER PROTECTION : 55006
## (Other) :726714
## NA's : 62
## Seasonal
## Mode :logical
## FALSE:1234680
## TRUE :35930
##
##
##
##
Note: You need to clean/translate the data a lot more!
save(dat, file = "Employment_2009.rda")
#load("Employment_2009.rda")
tsa <- dat[dat$AgencyName=="TRANSPORTATION SECURITY ADMINISTRATION", ]
summary(tsa)
## PseudoID Name Date
## 000000170: 1 NAME WITHHELD BY OPM :56710 20090331:61324
## 000000314: 1 NAME WITHHELD BY AGENCY: 2611 NA's : 62
## 000000360: 1 JACKSON,ERICA : 2
## 000000418: 1 ABBOTT,JACK D : 1
## 000000576: 1 ABERMAN,DARA : 1
## (Other) :61319 (Other) : 1999
## NA's : 62 NA's : 62
## Agency Station Age Education
## HSBC :61324 #########:59156 25-29 : 7999 04 :37844
## AB00 : 0 510100013: 932 45-49 : 7904 07 : 5201
## AG01 : 0 511220059: 83 40-44 : 7689 13 : 4824
## AG02 : 0 *********: 69 50-54 : 7034 09 : 3052
## AG03 : 0 110010001: 59 35-39 : 6901 08 : 2743
## (Other): 0 (Other) : 1025 (Other):23797 (Other): 7660
## NA's : 62 NA's : 62 NA's : 62 NA's : 62
## PayPlan Grade LOS Occupation
## SV :61193 E :19396 5-9 :26614 1802 :49672
## SW : 131 D :16788 1-2 :13574 1801 : 7593
## ** : 0 F : 8561 3-4 : 6697 0343 : 894
## AA : 0 G : 6421 < 1 : 5044 0301 : 661
## AD : 0 I : 5149 10-14 : 5002 0340 : 380
## (Other): 0 (Other): 5009 (Other): 4393 (Other): 2124
## NA's : 62 NA's : 62 NA's : 62 NA's : 62
## Category Pay SupervisoryStatus Appointment
## C :29026 Min. : 21839 8 :52025 38 :61010
## T :21113 1st Qu.: 32170 2 : 7937 48 : 311
## A :10785 Median : 37011 6 : 1352 46 : 2
## P : 391 Mean : 45724 7 : 10 32 : 1
## O : 9 3rd Qu.: 48368 * : 0 ** : 0
## (Other): 0 Max. :177000 (Other): 0 (Other): 0
## NA's : 62 NA's :62 NA's : 62 NA's : 62
## Schedule NSFTP
## F :49549 1 :49244
## P :11764 2 :12080
## G : 5 NA's: 62
## I : 3
## S : 2
## (Other): 1
## NA's : 62
## AgencyName Fulltime
## TRANSPORTATION SECURITY ADMINISTRATION:61324 Mode :logical
## : 0 FALSE:11770
## 11TH AVIATION GROUP (USAREUR) : 0 TRUE :49554
## 11TH WING : 0 NA's :62
## 132975121 : 0
## (Other) : 0
## NA's : 62
## Seasonal
## Mode :logical
## FALSE:61318
## TRUE :6
## NA's :62
##
##
##
barplot(table(tsa$Education), las = 2)
hist(tsa$Pay)
boxplot(Pay~Education, data = tsa)
tsa_full_nonseason <- tsa[tsa$Fulltime & !tsa$Seasonal,]
barplot(table(tsa_full_nonseason$Education), las = 2)
hist(tsa_full_nonseason$Pay)
boxplot(Pay~Education, data = tsa_full_nonseason)
look at only education levels with more than 100 employees
cnt <- table(tsa_full_nonseason$Education)
take <- names(cnt)[cnt > 100]
tsa_fn_cnt100 <- tsa_full_nonseason[tsa_full_nonseason$Education %in% take,]
boxplot(Pay~Education, data = tsa_fn_cnt100)
boxplot(Pay~LOS, data = tsa)
boxplot(Pay~Age, data = tsa)