Load data file from project 1

# Load the project 1 data set. load() restores the saved objects into the
# workspace and (invisibly) returns their names; everything below expects
# one of them to be called `dat`, so fail early if it is missing instead
# of silently reusing a stale `dat` from an earlier session.
loaded_objects <- load("Employment_2009.rda")
if (!"dat" %in% loaded_objects) {
  stop("Employment_2009.rda does not contain an object named 'dat'",
       call. = FALSE)
}

summary(dat)
##       PseudoID                            Name              Date        
##  004101442:      4   NAME WITHHELD BY OPM   :304408   20090331:1270610  
##  003167685:      3   NAME WITHHELD BY AGENCY:169859                     
##  003327886:      3   NAME UNKNOWN           :    52                     
##  003509194:      3   SMITH,PATRICIA A       :    20                     
##  004023994:      3   SMITH,JAMES E          :    18                     
##  004188607:      3   SMITH,MICHAEL A        :    17                     
##  (Other)  :1270591   (Other)                :796236                     
##      Agency            Station            Age           Education     
##  VATA   :260206   #########:381500   50-54  :203671   13     :319393  
##  TR93   :104069   110010001:103144   45-49  :194857   04     :271752  
##  SZ00   : 63229   240130031: 13718   55-59  :173459   17     :130966  
##  HSBC   : 61324   241360031: 13179   40-44  :167649   07     : 74839  
##  HSBD   : 55006   426540101: 12122   35-39  :137531   10     : 73869  
##  TD03   : 47034   241698005: 11452   30-34  :112498   15     : 57892  
##  (Other):679742   (Other)  :735495   (Other):280945   (Other):341899  
##     PayPlan           Grade             LOS           Occupation    
##  GS     :885287   13     :145501   5-9    :236641   0301   : 61709  
##  SV     : 61193   12     :136265   20-24  :160063   0303   : 56891  
##  VN     : 45801   11     :128610   15-19  :151277   0610   : 56257  
##  AD     : 45316   07     :101042   1-2    :147091   1802   : 55960  
##  WG     : 39701   09     : 92546   10-14  :139189   2210   : 41421  
##  GL     : 38389   14     : 81942   < 1    :105342   1811   : 39845  
##  (Other):154923   (Other):584704   (Other):331007   (Other):958527  
##  Category        Pay         SupervisoryStatus  Appointment    
##  *:   223   Min.   :     0   *:     35         10     :673958  
##  A:474254   1st Qu.: 45693   2: 147715         38     :273766  
##  B: 58487   Median : 65810   4:   5132         15     :125912  
##  C:116111   Mean   : 74122   5:   5343         48     : 63080  
##  O: 50943   3rd Qu.: 95620   6:  10615         32     : 41408  
##  P:325197   Max.   :393411   7:   6215         30     : 34454  
##  T:245395   NA's   :2219     8:1095555         (Other): 58032  
##     Schedule       NSFTP      
##  F      :1136271   1:1081701  
##  P      :  52803   2: 188909  
##  I      :  45535              
##  G      :  31096              
##  J      :   2810              
##  Q      :   2022              
##  (Other):     73              
##                                   AgencyName      Fulltime      
##  VETERANS HEALTH ADMINISTRATION        :260206   Mode :logical  
##  INTERNAL REVENUE SERVICE              :104069   FALSE:103243   
##  SOCIAL SECURITY ADMINISTRATION        : 63229   TRUE :1167367  
##  TRANSPORTATION SECURITY ADMINISTRATION: 61324                  
##  CUSTOMS AND BORDER PROTECTION         : 55006                  
##  (Other)                               :726714                  
##  NA's                                  :    62                  
##   Seasonal      
##  Mode :logical  
##  FALSE:1234680  
##  TRUE :35930    
##                 
##                 
##                 
## 

Important Note: I did not clean the data to remove duplicates, etc. You need to do this first if you have not done it for project 1!

Remove attributes not useful for predictive modeling

# Work on a copy so that `dat` stays untouched, then drop the columns that
# carry no predictive information: PseudoID and Name identify individual
# records, and Date holds a single value (20090331) for every row.
dat2 <- dat
dat2[c("PseudoID", "Name", "Date")] <- NULL

Make some attributes measured on an ordinal scale continuous

Reason: Decision trees are very slow with nominal/ordinal attributes with many different values.

# Education is a factor of numeric codes ("04", "13", ...); go through
# character so the code itself, not the factor's internal index, is used.
# Codes that are not numbers become NA (hence the warning).
dat2$Education <- as.numeric(as.character(dat2$Education))
## Warning: NAs introduced by coercion
# Age is a band label such as "50-54"; keep the first two characters, i.e.
# the lower bound of the band. Labels that do not start with digits become NA.
dat2$Age <- as.numeric(substr(dat2$Age, start = 1, stop = 2))
## Warning: NAs introduced by coercion
# LOS (length of service) is a band label such as "5-9", "< 1" or "35+".
# Strip the upper bound / "+" / "<" to get the lower bound of the band.
los_label <- as.character(dat2$LOS)
los_lower <- as.numeric(sub("-.*|\\+|<", "", los_label))
## Warning: NAs introduced by coercion
# Stripping "<" from "< 1" leaves " 1", which parses as 1 and would make
# "< 1" indistinguishable from "1-2". The lower bound of an open "< k" band
# is 0, so set it explicitly. grepl() returns FALSE for NA labels, so rows
# that failed to parse stay NA.
los_lower[grepl("^\\s*<", los_label)] <- 0
dat2$LOS <- los_lower

Prepare data for association rule mining

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

Discretize continuous (numeric) variables

# Locate the numeric columns once, print them, and replace each one with
# an equal-frequency discretization (discretize() from arules) so that it
# can be turned into items. vapply() always yields a logical vector,
# whereas sapply()'s return type depends on its input.
numeric_cols <- which(vapply(dat2, is.numeric, logical(1)))
numeric_cols
##       Age Education       LOS       Pay 
##         3         4         7        10
for (col in numeric_cols) {
  dat2[[col]] <- discretize(dat2[[col]], method = "frequency")
}

Get rid of some more columns

# Drop columns that duplicate information kept elsewhere:
#   Agency   - the code behind AgencyName, which is kept
#   Schedule - work schedule; the Fulltime flag is kept instead
#   NSFTP    - purpose unclear in the original notes ("???"); presumably
#              overlaps with the Fulltime/Seasonal flags - TODO confirm
dat2[c("Agency", "Schedule", "NSFTP")] <- NULL


# Coerce the prepared data to an arules "transactions" object (one
# transaction per row). As the summary shows, factor columns become
# "variable=level" items (e.g. Station=#########) and logical columns
# become a single item named after the column (e.g. Fulltime).
trans <- as(dat2, "transactions")
summary(trans)
## transactions as itemMatrix in sparse format with
##  1270610 rows (elements/itemsets/transactions) and
##  47255 columns (items) and a density of 0.0002730179 
## 
## most frequent items:
##            Fulltime SupervisoryStatus=8          PayPlan=GS 
##             1167367             1095555              885287 
##      Appointment=10         LOS=[ 1,10)             (Other) 
##              673958              589592            11980968 
## 
## element (itemset/transaction) length distribution:
## sizes
##       9      10      11      12      13      14 
##       5     499   24424  103560 1113400   28722 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     9.0    13.0    13.0    12.9    13.0    14.0 
## 
## includes extended item information - examples:
##              labels variables    levels
## 1 Station=*********   Station *********
## 2 Station=#########   Station #########
## 3 Station=010000003   Station 010000003
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3

Look at transactions

# Plot the relative frequency of the 50 most frequent items.
itemFrequencyPlot(trans, topN = 50)

Mine and inspect rules

# Mine association rules with minimum support 0.01 and minimum confidence
# 0.8 (`supp`/`conf` are matched to `support`/`confidence`, see the echoed
# parameter specification). All other parameters keep their defaults; note
# that the default maxlen of 10 is reached, as the warning below reports.
rules <- apriori(trans, parameter = list(supp = .01, conf = .8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 12706 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13238 item(s), 1270610 transaction(s)] done [1.40s].
## sorting and recoding items ... [106 item(s)] done [0.16s].
## creating transaction tree ... done [1.59s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans, parameter = list(supp = 0.01, conf = 0.8)):
## Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
##  done [2.01s].
## writing ... [72374 rule(s)] done [0.02s].
## creating S4 object  ... done [0.42s].
# Overview of the mined rule set: rule-length distribution and the range
# of the quality measures (support, confidence, lift, count).
summary(rules)
## set of 72374 rules
## 
## rule length distribution (lhs + rhs):sizes
##     1     2     3     4     5     6     7     8     9    10 
##     2   337  3176 11347 20002 19726 11734  4570  1252   228 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.000   6.000   5.613   6.000  10.000 
## 
## summary of quality measures:
##     support          confidence          lift             count        
##  Min.   :0.01000   Min.   :0.8000   Min.   : 0.8712   Min.   :  12707  
##  1st Qu.:0.01197   1st Qu.:0.9049   1st Qu.: 1.0806   1st Qu.:  15206  
##  Median :0.01406   Median :0.9701   Median : 1.5274   Median :  17867  
##  Mean   :0.01922   Mean   :0.9463   Mean   : 4.7226   Mean   :  24423  
##  3rd Qu.:0.02050   3rd Qu.:0.9967   3rd Qu.: 3.3206   3rd Qu.:  26054  
##  Max.   :0.91875   Max.   :1.0000   Max.   :94.8069   Max.   :1167367  
## 
## mining info:
##   data ntransactions support confidence
##  trans       1270610    0.01        0.8
# Print the rules with the highest lift (head() returns its default of
# six rules after sorting by the given measure).
inspect(head(rules, by = "lift"))
##     lhs                                                     rhs                  support confidence     lift count
## [1] {Station=#########,                                                                                           
##      Category=P,                                                                                                  
##      AgencyName=INTERNAL REVENUE SERVICE}                => {Occupation=0512} 0.01013608  0.9652976 94.80689 12879
## [2] {Category=P,                                                                                                  
##      AgencyName=INTERNAL REVENUE SERVICE}                => {Occupation=0512} 0.01018172  0.8133409 79.88243 12937
## [3] {Station=#########,                                                                                           
##      Category=O,                                                                                                  
##      AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM,                                                          
##      Fulltime}                                           => {Occupation=0007} 0.01326686  0.9997628 74.80764 16857
## [4] {Station=#########,                                                                                           
##      PayPlan=GL,                                                                                                  
##      Category=O,                                                                                                  
##      AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM} => {Occupation=0007} 0.01241372  0.9997465 74.80642 15773
## [5] {Station=#########,                                                                                           
##      PayPlan=GL,                                                                                                  
##      Category=O,                                                                                                  
##      AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM,                                                          
##      Fulltime}                                           => {Occupation=0007} 0.01241372  0.9997465 74.80642 15773
## [6] {Station=#########,                                                                                           
##      Category=O,                                                                                                  
##      SupervisoryStatus=8,                                                                                         
##      AgencyName=BUREAU OF PRISONS/FEDERAL PRISON SYSTEM,                                                          
##      Fulltime}                                           => {Occupation=0007} 0.01218627  0.9997417 74.80607 15484
library(arulesViz)
## Loading required package: grid
# Default rule plot rendered with the HTML engine. With 72374 rules only
# the 1000 with the highest lift are drawn (see warning; raise `max` to
# change this).
plot(rules, engine = "html")
## Warning: plot: Too many rules supplied. Only plotting the best 1000 rules
## using measure lift (change parameter max if needed)
# Graph-based view of the rules; here the cap is the 100 rules with the
# highest lift (see warning).
plot(rules, method = "graph", engine = "html")
## Warning: Too many rules supplied. Only plotting the best 100 rules using
## lift (change control parameter max if needed)

Todo