Load data file from project 1

load("Employment_2009.rda")

summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73              
##                                   AgencyName      Fulltime      
##  VETERANS HEALTH ADMINISTRATION        :260206   Mode :logical  
##  INTERNAL REVENUE SERVICE              :104069   FALSE:103243   
##  SOCIAL SECURITY ADMINISTRATION        : 63229   TRUE :1167367  
##  TRANSPORTATION SECURITY ADMINISTRATION: 61324                  
##  CUSTOMS AND BORDER PROTECTION         : 55006                  
##  (Other)                               :726714                  
##  NA's                                  :    62                  
##   Seasonal      
##  Mode :logical  
##  FALSE:1234680  
##  TRUE :35930    
##                 
##                 
##                 
## 

Important Note: I did not clean the data to remove duplicates, etc. You need to do this first if you have not done it for project 1! Remove attributes not useful for predictive modeling

dat2 <- dat

dat2$PseudoID <- NULL
dat2$Name <- NULL
dat2$Date <- NULL

Make some attributes measured on an ordinal scale continuous

Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.

dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
dat2$LOS <- as.numeric(sub("-.*|\\+|<", "", dat2$LOS))
## Warning: NAs introduced by coercion

Prepare data for association rule mining

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

Discretize two subsets (two agencies) in the same way.

dat_set1 <- dat2[dat2$Agency == "VATA",]
dat_set2 <- dat2[dat2$Agency == "TR93",]

1) combine the subsets

dat_1_2 <- rbind(dat_set1, dat_set2)

2) discretize

dat_1_2$Pay <- discretize(dat_1_2$Pay, method = "frequency")
dat_1_2$LOS <- discretize(dat_1_2$LOS, method = "frequency")
dat_1_2$Education <- discretize(dat_1_2$Education, method = "frequency")
dat_1_2$Age <- discretize(dat_1_2$Age, method = "frequency")

3) split again

dat_set1 <- dat_1_2[dat_1_2$Agency == "VATA",]
dat_set2 <- dat_1_2[dat_1_2$Agency == "TR93",]

Mine 2 rule sets

trans1 <- as(dat_set1, "transactions")
trans2 <- as(dat_set2, "transactions")

rules1 <- apriori(trans1)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 26020 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1144 item(s), 260206 transaction(s)] done [0.30s].
## sorting and recoding items ... [31 item(s)] done [0.04s].
## creating transaction tree ... done [0.28s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans1): Mining stopped (maxlen reached). Only patterns
## up to a length of 10 returned!
##  done [0.12s].
## writing ... [33150 rule(s)] done [0.01s].
## creating S4 object  ... done [0.07s].
rules2 <- apriori(trans2)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 10406 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[678 item(s), 104069 transaction(s)] done [0.11s].
## sorting and recoding items ... [34 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans2): Mining stopped (maxlen reached). Only patterns
## up to a length of 10 returned!
##  done [0.17s].
## writing ... [68231 rule(s)] done [0.01s].
## creating S4 object  ... done [0.03s].
inspect(head(rules1, by = "lift", n=5))
##     lhs                  rhs            support confidence    lift count
## [1] {Occupation=0610,                                                   
##      Appointment=38,                                                    
##      NSFTP=1}         => {PayPlan=VN} 0.1676441  0.9914767 5.63293 43622
## [2] {Occupation=0610,                                                   
##      Appointment=38,                                                    
##      Fulltime}        => {PayPlan=VN} 0.1676441  0.9914767 5.63293 43622
## [3] {Occupation=0610,                                                   
##      Appointment=38,                                                    
##      Schedule=F}      => {PayPlan=VN} 0.1676441  0.9914767 5.63293 43622
## [4] {Occupation=0610,                                                   
##      Category=P,                                                        
##      Appointment=38,                                                    
##      NSFTP=1}         => {PayPlan=VN} 0.1676441  0.9914767 5.63293 43622
## [5] {Occupation=0610,                                                   
##      Category=P,                                                        
##      Appointment=38,                                                    
##      Fulltime}        => {PayPlan=VN} 0.1676441  0.9914767 5.63293 43622
inspect(head(rules2, by = "lift", n=5))
##     lhs                                      rhs                 support confidence     lift count
## [1] {Station=#########,                                                                           
##      Category=P,                                                                                  
##      Appointment=10}                      => {Occupation=0512} 0.1021342  0.9748693 7.842133 10629
## [2] {Agency=TR93,                                                                                 
##      Station=#########,                                                                           
##      Category=P,                                                                                  
##      Appointment=10}                      => {Occupation=0512} 0.1021342  0.9748693 7.842133 10629
## [3] {Station=#########,                                                                           
##      Category=P,                                                                                  
##      Appointment=10,                                                                              
##      AgencyName=INTERNAL REVENUE SERVICE} => {Occupation=0512} 0.1021342  0.9748693 7.842133 10629
## [4] {Agency=TR93,                                                                                 
##      Station=#########,                                                                           
##      Category=P,                                                                                  
##      Appointment=10,                                                                              
##      AgencyName=INTERNAL REVENUE SERVICE} => {Occupation=0512} 0.1021342  0.9748693 7.842133 10629
## [5] {Station=#########,                                                                           
##      PayPlan=GS,                                                                                  
##      Category=P,                                                                                  
##      SupervisoryStatus=8,                                                                         
##      NSFTP=1}                             => {Occupation=0512} 0.1069675  0.9682526 7.788906 11132

how many rules (%) are in both sets?

m <- match(rules1, rules2)
sum(!is.na(m))/min(length(rules1),length(rules2))
## [1] 0.05499246

caluclate quality for some of rules1 in trans2 and look at the difference

r <- sample(rules1, 100)

Note: this takes a while

q <- interestMeasure(r, measure = c("supp", "confidence", "lift"),
  transactions = trans2, reuse = FALSE)

% change in measure. Many do not occur at all (difference of 1 or NaN)

diff <- (quality(r)[,-4] - q)/quality(r)[,-4]
diff
##           support    confidence        lift
## 25518  1.00000000           NaN         NaN
## 17700  1.00000000  1.0000000000         NaN
## 23105  0.99749127  0.9917998417 -1.05018559
## 18890  1.00000000           NaN         NaN
## 8177   1.00000000           NaN         NaN
## 23706  1.00000000           NaN         NaN
## 9859   0.99340706  0.0000000000 -0.09498339
## 1060   1.00000000  1.0000000000         NaN
## 14707  1.00000000           NaN         NaN
## 1086   1.00000000  1.0000000000         NaN
## 33147  1.00000000           NaN         NaN
## 15516  1.00000000           NaN         NaN
## 5375   0.46830201  0.3909889615  0.29196083
## 29086  1.00000000           NaN         NaN
## 22471  1.00000000           NaN         NaN
## 8269   1.00000000           NaN         NaN
## 30426  1.00000000           NaN         NaN
## 14733  1.00000000           NaN         NaN
## 25989  1.00000000           NaN         NaN
## 10387  1.00000000           NaN         NaN
## 26613 -0.45159884  0.0000000000 -0.16260482
## 12678  1.00000000           NaN         NaN
## 16688  1.00000000           NaN         NaN
## 13903  1.00000000           NaN         NaN
## 11593  1.00000000           NaN         NaN
## 15544  1.00000000           NaN         NaN
## 6723   1.00000000           NaN         NaN
## 3685   1.00000000  1.0000000000         NaN
## 2916   1.00000000  1.0000000000         NaN
## 28482  1.00000000           NaN         NaN
## 25644  1.00000000           NaN         NaN
## 9863   1.00000000  1.0000000000         NaN
## 21674  1.00000000  1.0000000000         NaN
## 15057  1.00000000           NaN         NaN
## 25553  1.00000000           NaN         NaN
## 15225  1.00000000           NaN         NaN
## 29348  1.00000000           NaN         NaN
## 2253   1.00000000           NaN         NaN
## 12183  0.02092592  0.0618553350  0.02166727
## 25375  1.00000000           NaN         NaN
## 25557  1.00000000           NaN         NaN
## 2865   1.00000000           NaN         NaN
## 30677  1.00000000           NaN         NaN
## 16737  1.00000000           NaN         NaN
## 13054  1.00000000  1.0000000000         NaN
## 24614  1.00000000           NaN         NaN
## 13984  0.69406369  0.0000000000  0.05219498
## 19928  1.00000000           NaN         NaN
## 6075   1.00000000  1.0000000000         NaN
## 28420  1.00000000           NaN         NaN
## 11331  1.00000000           NaN         NaN
## 14467  1.00000000           NaN         NaN
## 8360  -0.29458823  0.0000000000  0.05219498
## 24963  1.00000000           NaN         NaN
## 26286  1.00000000           NaN         NaN
## 13208  0.99545879 -0.0235253541 -0.06737094
## 23244  1.00000000           NaN         NaN
## 19743  1.00000000  1.0000000000         NaN
## 32976  1.00000000           NaN         NaN
## 5418   1.00000000           NaN         NaN
## 17036  1.00000000           NaN         NaN
## 6262   1.00000000           NaN         NaN
## 3682   1.00000000           NaN         NaN
## 8864  -0.42170914 -0.0007166566  0.36195893
## 1235   1.00000000           NaN         NaN
## 12750 -1.54771252 -0.0783185870 -0.12451140
## 28794  1.00000000           NaN         NaN
## 13313  1.00000000  1.0000000000         NaN
## 7695   1.00000000           NaN         NaN
## 29470  1.00000000           NaN         NaN
## 9948   1.00000000           NaN         NaN
## 3036   1.00000000  1.0000000000         NaN
## 2656   1.00000000           NaN         NaN
## 22460  1.00000000           NaN         NaN
## 26513  1.00000000           NaN         NaN
## 11481  1.00000000           NaN         NaN
## 25326  1.00000000           NaN         NaN
## 24567  1.00000000           NaN         NaN
## 15109  1.00000000           NaN         NaN
## 32533  1.00000000           NaN         NaN
## 31507  1.00000000           NaN         NaN
## 21786  1.00000000           NaN         NaN
## 31626  1.00000000           NaN         NaN
## 1497   1.00000000           NaN         NaN
## 7097   1.00000000           NaN         NaN
## 6610   1.00000000           NaN         NaN
## 27155  1.00000000           NaN         NaN
## 19506  1.00000000           NaN         NaN
## 9979   1.00000000           NaN         NaN
## 14192  0.99697817 -0.0145656718 -0.05802745
## 8768   0.02134061  0.0778211160  0.41203336
## 500    1.00000000           NaN         NaN
## 9728   0.99372567  0.0000000000 -0.16260482
## 13578  0.99853633  0.1111111111  0.02668143
## 415    1.00000000           NaN         NaN
## 9141  -0.75458194  0.0000000000 -0.16260482
## 24932  1.00000000           NaN         NaN
## 14965  1.00000000           NaN         NaN
## 4087   1.00000000  1.0000000000         NaN
## 29040  1.00000000           NaN         NaN

for which rules did support increase/decrease by 10%

inspect(r[which(diff$supp > 0.2 & diff$supp!=1)])
##     lhs                      rhs                     support confidence     lift count
## [1] {LOS=[ 1,10),                                                                     
##      Category=P,                                                                      
##      SupervisoryStatus=8,                                                             
##      NSFTP=1,                                                                         
##      Fulltime}            => {Appointment=38}      0.1532094  0.9226319 2.087186 39866
## [2] {Education=[14,22],                                                               
##      Appointment=38,                                                                  
##      Schedule=F,                                                                      
##      Fulltime}            => {NSFTP=1}             0.1442895  1.0000000 1.182110 37545
## [3] {Age=[15,45),                                                                     
##      LOS=[ 1,10),                                                                     
##      Fulltime}            => {Schedule=F}          0.2442834  1.0000000 1.108665 63564
## [4] {LOS=[ 1,10),                                                                     
##      Category=P,                                                                      
##      SupervisoryStatus=8,                                                             
##      NSFTP=1}             => {Fulltime}            0.1660569  1.0000000 1.108665 43209
## [5] {Age=[15,45),                                                                     
##      Appointment=38,                                                                  
##      NSFTP=1,                                                                         
##      Fulltime}            => {SupervisoryStatus=8} 0.1375372  0.9478507 1.036919 35788
## [6] {LOS=[ 1,10),                                                                     
##      Appointment=38,                                                                  
##      Schedule=F,                                                                      
##      NSFTP=1}             => {SupervisoryStatus=8} 0.2194108  0.9445750 1.033336 57092
## [7] {Education=[14,22],                                                               
##      Category=P,                                                                      
##      Appointment=38,                                                                  
##      Fulltime}            => {Schedule=F}          0.1378331  1.0000000 1.108665 35865
## [8] {Education=[ 8,14),                                                               
##      LOS=[ 1,10),                                                                     
##      Appointment=38,                                                                  
##      Fulltime}            => {NSFTP=1}             0.1050399  1.0000000 1.182110 27332
inspect(r[which(diff$supp < -0.1)])
##     lhs                      rhs                     support confidence      lift count
## [1] {PayPlan=GS,                                                                       
##      Category=T,                                                                       
##      SupervisoryStatus=8,                                                              
##      Appointment=10,                                                                   
##      NSFTP=1,                                                                          
##      Fulltime}            => {Schedule=F}          0.1203047  1.0000000 1.1086655 31304
## [2] {PayPlan=GS,                                                                       
##      LOS=[10,20),                                                                      
##      SupervisoryStatus=8,                                                              
##      NSFTP=1}             => {Fulltime}            0.1100897  1.0000000 1.1086655 28646
## [3] {Category=T,                                                                       
##      Pay=[43201, 70401),                                                               
##      SupervisoryStatus=8,                                                              
##      NSFTP=1}             => {PayPlan=GS}          0.1018885  0.9992839 1.7202301 26512
## [4] {Education=[ 8,14),                                                                
##      PayPlan=GS,                                                                       
##      Appointment=10,                                                                   
##      Schedule=F}          => {SupervisoryStatus=8} 0.1080721  0.8861753 0.9694483 28121
## [5] {Category=T,                                                                       
##      Appointment=10,                                                                   
##      NSFTP=1,                                                                          
##      Fulltime}            => {Schedule=F}          0.1239556  1.0000000 1.1086655 32254

for which rules did lift increase/decrease by 10%

inspect(r[which(diff$lift > 0.1)])
##     lhs                      rhs            support confidence     lift count
## [1] {Age=[15,45),                                                            
##      LOS=[ 1,10),                                                            
##      Fulltime}            => {Schedule=F} 0.2442834  1.0000000 1.108665 63564
## [2] {Category=T,                                                             
##      Pay=[43201, 70401),                                                     
##      SupervisoryStatus=8,                                                    
##      NSFTP=1}             => {PayPlan=GS} 0.1018885  0.9992839 1.720230 26512
## [3] {Education=[ 1, 8),                                                      
##      Category=T,                                                             
##      Schedule=F,                                                             
##      Fulltime}            => {PayPlan=GS} 0.1364188  0.9966588 1.715711 35497
inspect(r[which(diff$lift < -0.1)])
##     lhs                      rhs                     support confidence      lift count
## [1] {LOS=[ 1,10),                                                                      
##      Category=P,                                                                       
##      SupervisoryStatus=8,                                                              
##      NSFTP=1,                                                                          
##      Fulltime}            => {Appointment=38}      0.1532094  0.9226319 2.0871856 39866
## [2] {PayPlan=GS,                                                                       
##      Category=T,                                                                       
##      SupervisoryStatus=8,                                                              
##      Appointment=10,                                                                   
##      NSFTP=1,                                                                          
##      Fulltime}            => {Schedule=F}          0.1203047  1.0000000 1.1086655 31304
## [3] {Education=[ 8,14),                                                                
##      PayPlan=GS,                                                                       
##      Appointment=10,                                                                   
##      Schedule=F}          => {SupervisoryStatus=8} 0.1080721  0.8861753 0.9694483 28121
## [4] {Education=[14,22],                                                                
##      Category=P,                                                                       
##      Appointment=38,                                                                   
##      Fulltime}            => {Schedule=F}          0.1378331  1.0000000 1.1086655 35865
## [5] {Category=T,                                                                       
##      Appointment=10,                                                                   
##      NSFTP=1,                                                                          
##      Fulltime}            => {Schedule=F}          0.1239556  1.0000000 1.1086655 32254