https://www.buzzfeed.com/jsvine/sharing-hundreds-of-millions-of-federal-payroll-records?utm_term=.jbk33NbAx#.uwVQQ8Vad

Barack Obama

We use employment after 1 year in office (1Q 2009)

Data file from: https://archive.org/download/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/status/

dat_raw <- readLines("Status_Non_DoD_2009_03.txt")
head(dat_raw)
## [1] "000172152THOMAS,R SOLE          20090331AB0051010001350-54 17GS1515-19 0801P140969210F1"
## [2] "000299511NEESE,DANIEL L         20090331AB00FR750000055-59 13GS1425-29 1630A091791220F2"
## [3] "000343471NAME WITHHELD BY AGENCY20090331AB00#########45-49 10GS0910-14 1630A045044210F1"
## [4] "000547233GREEN,MICHAEL W        20090331AB00BE550000050-54 04GS1210-14 1630A067299210F1"
## [5] "000627711MICHAEL,R COONCE       20090331AB00TS300000045-49 04GS095-9   1630A******210F1"
## [6] "000806215GROSSO,RONALD A        20090331AB00IT700000055-59 13GS145-9   0301A******210F1"

Manually edited from https://archive.org/download/opm-federal-employment-data/docs/2015-02-11-opm-foia-response.pdf

Used file: header.csv

dat_header <- read.csv("headers.csv", header = TRUE)

dat <- t(sapply(dat_raw, FUN = function(x) trimws(substring(x, dat_header[,2], dat_header[,3]))))
dimnames(dat) <- NULL
dat <- as.data.frame(dat)

colnames(dat) <- dat_header[,1]
head(dat)
##    PseudoID                    Name     Date Agency   Station   Age
## 1 000172152           THOMAS,R SOLE 20090331   AB00 510100013 50-54
## 2 000299511          NEESE,DANIEL L 20090331   AB00 FR7500000 55-59
## 3 000343471 NAME WITHHELD BY AGENCY 20090331   AB00 ######### 45-49
## 4 000547233         GREEN,MICHAEL W 20090331   AB00 BE5500000 50-54
## 5 000627711        MICHAEL,R COONCE 20090331   AB00 TS3000000 45-49
## 6 000806215         GROSSO,RONALD A 20090331   AB00 IT7000000 55-59
##   Education PayPlan Grade   LOS Occupation Category    Pay
## 1        17      GS    15 15-19       0801        P 140969
## 2        13      GS    14 25-29       1630        A 091791
## 3        10      GS    09 10-14       1630        A 045044
## 4        04      GS    12 10-14       1630        A 067299
## 5        04      GS    09   5-9       1630        A ******
## 6        13      GS    14   5-9       0301        A ******
##   SupervisoryStatus Appointment Schedule NSFTP
## 1                 2          10        F     1
## 2                 2          20        F     2
## 3                 2          10        F     1
## 4                 2          10        F     1
## 5                 2          10        F     1
## 6                 2          10        F     1
summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay          SupervisoryStatus  Appointment    
##  *:   223   030772 :  12688   *:     35         10     :673958  
##  A:474254   153200 :  10594   2: 147715         38     :273766  
##  B: 58487   027504 :  10453   4:   5132         15     :125912  
##  C:116111   038117 :   9306   5:   5343         48     : 63080  
##  O: 50943   056411 :   8421   6:  10615         32     : 41408  
##  P:325197   062050 :   7717   7:   6215         30     : 34454  
##  T:245395   (Other):1211431   8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73

Fix Pay (convert to a number by first making the factor into a character)

dat$Pay <- as.numeric(as.character(dat$Pay))
## Warning: NAs introduced by coercion
summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73

Data explanations: https://archive.org/download/opm-federal-employment-data/data/2014-09-to-2016-09/non-dod/translations/

summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73
hist(dat$Pay, breaks = 50)

Education

https://ia800608.us.archive.org/16/items/opm-federal-employment-data/data/2014-09-to-2016-09/non-dod/translations/Education%20Translation.txt

barplot(table(dat$Education), las = 2)

boxplot(Pay~Education, data = dat)

med_pay_by_agency <- aggregate(Pay~Agency, data = dat, median)
head(med_pay_by_agency)
##   Agency   Pay
## 1   AB00 67299
## 2   AG01 82845
## 3   AG02 40320
## 4   AG03 57079
## 5   AG07 60612
## 6   AG08 88443
head(med_pay_by_agency[head(order(med_pay_by_agency$Pay)),])
##     Agency   Pay
## 47    BZ00     0
## 145   GE00     0
## 126   EDEZ 26088
## 194   HSBC 37011
## 61    CM63 37566
## 3     AG02 40320
head(med_pay_by_agency[head(order(med_pay_by_agency$Pay,
  decreasing = TRUE)),])
##     Agency      Pay
## 254   NK00 153207.0
## 46    BW00 153200.0
## 39    AW00 153199.0
## 250   MA00 153126.5
## 281   SE00 149318.0
## 41    BF00 148893.0

Agency

translate Agency: https://archive.org/download/opm-federal-employment-data/data/1973-09-to-2014-06/SCTFILE.TXT

agency_trans <- readLines("SCTFILE.TXT")
## Warning in readLines("SCTFILE.TXT"): incomplete final line found on
## 'SCTFILE.TXT'
agency_ID <- sapply(agency_trans, FUN = function(x) substring(x, 3,6))
agency_name <- trimws(sapply(agency_trans, FUN = function(x) substring(x, 36,75)))

agency_trans_table <- data.frame(agency_ID = agency_ID, agency_name = agency_name)
head(agency_trans_table)
##   agency_ID                              agency_name
## 1      AA00    ADMIN CONFERENCE OF THE UNITED STATES
## 2      AB00     AMERICAN BATTLE MONUMENTS COMMISSION
## 3      AC00  ADV CMSN ON INTERGOVERNMENTAL RELATIONS
## 4      AD00 U.S. ARMS CONTROL AND DISARMAMENT AGENCY
## 5      AE00                 ATOMIC ENERGY COMMISSION
## 6      AF                DEPARTMENT OF THE AIR FORCE
m <- match(dat$Agency, agency_trans_table$agency_ID)
dat$AgencyName <-  agency_trans_table$agency_name[m]

summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73              
##                                   AgencyName    
##  VETERANS HEALTH ADMINISTRATION        :260206  
##  INTERNAL REVENUE SERVICE              :104069  
##  SOCIAL SECURITY ADMINISTRATION        : 63229  
##  TRANSPORTATION SECURITY ADMINISTRATION: 61324  
##  CUSTOMS AND BORDER PROTECTION         : 55006  
##  (Other)                               :726714  
##  NA's                                  :    62
#" # Age
boxplot(Pay~Age, data = dat)

Length of service

boxplot(Pay~LOS, data = dat)

Fix order od levels

levels(dat$LOS)
##  [1] "< 1"   "1-2"   "10-14" "15-19" "20-24" "25-29" "3-4"   "30-34"
##  [9] "35+"   "5-9"   "UNSP"
dat$LOS <- ordered(dat$LOS, levels = c("< 1", "1-2", "3-4", "5-9",
  "10-14", "15-19", "20-24", "25-29", "30-34", "35+", "UNSP"))

boxplot(Pay~LOS, data = dat)

Work Schedule

Full-time F, G Seasonal: G, J, Q, T

dat$Fulltime <- FALSE
dat$Fulltime[dat$Schedule == "F" | dat$Schedule == "G"] <- TRUE
dat$Seasonal <- FALSE
dat$Seasonal[dat$Schedule %in% c("G", "J", "Q", "T")] <- TRUE

summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73              
##                                   AgencyName      Fulltime      
##  VETERANS HEALTH ADMINISTRATION        :260206   Mode :logical  
##  INTERNAL REVENUE SERVICE              :104069   FALSE:103243   
##  SOCIAL SECURITY ADMINISTRATION        : 63229   TRUE :1167367  
##  TRANSPORTATION SECURITY ADMINISTRATION: 61324                  
##  CUSTOMS AND BORDER PROTECTION         : 55006                  
##  (Other)                               :726714                  
##  NA's                                  :    62                  
##   Seasonal      
##  Mode :logical  
##  FALSE:1234680  
##  TRUE :35930    
##                 
##                 
##                 
## 

Note: You need to clean/translate the data a lot more!

Save the data

save(dat, file = "Employment_2009.rda")

#load("Employment_2009.rda")

Look at a specific agency (TSA)

tsa <- dat[dat$AgencyName=="TRANSPORTATION SECURITY ADMINISTRATION", ]
summary(tsa)
##       PseudoID                          Name             Date      
##  000000170:    1   NAME WITHHELD BY OPM   :56710   20090331:61324  
##  000000314:    1   NAME WITHHELD BY AGENCY: 2611   NA's    :   62  
##  000000360:    1   JACKSON,ERICA          :    2                   
##  000000418:    1   ABBOTT,JACK D          :    1                   
##  000000576:    1   ABERMAN,DARA           :    1                   
##  (Other)  :61319   (Other)                : 1999                   
##  NA's     :   62   NA's                   :   62                   
##      Agency           Station           Age          Education    
##  HSBC   :61324   #########:59156   25-29  : 7999   04     :37844  
##  AB00   :    0   510100013:  932   45-49  : 7904   07     : 5201  
##  AG01   :    0   511220059:   83   40-44  : 7689   13     : 4824  
##  AG02   :    0   *********:   69   50-54  : 7034   09     : 3052  
##  AG03   :    0   110010001:   59   35-39  : 6901   08     : 2743  
##  (Other):    0   (Other)  : 1025   (Other):23797   (Other): 7660  
##  NA's   :   62   NA's     :   62   NA's   :   62   NA's   :   62  
##     PayPlan          Grade            LOS          Occupation   
##  SV     :61193   E      :19396   5-9    :26614   1802   :49672  
##  SW     :  131   D      :16788   1-2    :13574   1801   : 7593  
##  **     :    0   F      : 8561   3-4    : 6697   0343   :  894  
##  AA     :    0   G      : 6421   < 1    : 5044   0301   :  661  
##  AD     :    0   I      : 5149   10-14  : 5002   0340   :  380  
##  (Other):    0   (Other): 5009   (Other): 4393   (Other): 2124  
##  NA's   :   62   NA's   :   62   NA's   :   62   NA's   :   62  
##     Category          Pay         SupervisoryStatus  Appointment   
##  C      :29026   Min.   : 21839   8      :52025     38     :61010  
##  T      :21113   1st Qu.: 32170   2      : 7937     48     :  311  
##  A      :10785   Median : 37011   6      : 1352     46     :    2  
##  P      :  391   Mean   : 45724   7      :   10     32     :    1  
##  O      :    9   3rd Qu.: 48368   *      :    0     **     :    0  
##  (Other):    0   Max.   :177000   (Other):    0     (Other):    0  
##  NA's   :   62   NA's   :62       NA's   :   62     NA's   :   62  
##     Schedule      NSFTP      
##  F      :49549   1   :49244  
##  P      :11764   2   :12080  
##  G      :    5   NA's:   62  
##  I      :    3               
##  S      :    2               
##  (Other):    1               
##  NA's   :   62               
##                                   AgencyName     Fulltime      
##  TRANSPORTATION SECURITY ADMINISTRATION:61324   Mode :logical  
##                                        :    0   FALSE:11770    
##  11TH AVIATION GROUP (USAREUR)         :    0   TRUE :49554    
##  11TH WING                             :    0   NA's :62       
##  132975121                             :    0                  
##  (Other)                               :    0                  
##  NA's                                  :   62                  
##   Seasonal      
##  Mode :logical  
##  FALSE:61318    
##  TRUE :6        
##  NA's :62       
##                 
##                 
## 
barplot(table(tsa$Education), las = 2)

hist(tsa$Pay)

boxplot(Pay~Education, data = tsa)

tsa_full_nonseason <- tsa[tsa$Fulltime & !tsa$Seasonal,]

barplot(table(tsa_full_nonseason$Education), las = 2)

hist(tsa_full_nonseason$Pay)

boxplot(Pay~Education, data = tsa_full_nonseason)

look at only education levels with more than 100 employees

cnt <- table(tsa_full_nonseason$Education)
take <- names(cnt)[cnt > 100]
tsa_fn_cnt100 <- tsa_full_nonseason[tsa_full_nonseason$Education %in% take,]
boxplot(Pay~Education, data = tsa_fn_cnt100)

boxplot(Pay~LOS, data = tsa)

boxplot(Pay~Age, data = tsa)