Read “clean” data

(see code for Project 1)

# Load the saved R data file; the next call relies on it providing `h1b`.
load("h1b_clean.rda")
# Per-column overview: value distributions, factor level counts and NA counts.
summary(h1b)
##        X                       CASE_STATUS   
##  Min.   :     1   CERTIFIED          :84522  
##  1st Qu.: 25001   CERTIFIED-WITHDRAWN: 9075  
##  Median : 50000   DENIED             : 2190  
##  Mean   : 50000   WITHDRAWN          : 4213  
##  3rd Qu.: 75000                              
##  Max.   :100000                              
##                                              
##                                          EMPLOYER_NAME  
##  ERNST & YOUNG U.S. LLP                         : 3428  
##  COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482  
##  INFOSYS LIMITED                                : 2317  
##  DELOITTE CONSULTING LLP                        : 1745  
##  DELOITTE & TOUCHE LLP                          : 1146  
##  CAPGEMINI AMERICA INC                          : 1121  
##  (Other)                                        :87761  
##                                                SOC_NAME    
##  COMPUTER SYSTEMS ANALYSTS                         :18600  
##  MANAGEMENT ANALYSTS                               :15010  
##  ACCOUNTANTS AND AUDITORS                          :11204  
##  FINANCIAL ANALYSTS                                : 9605  
##  MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089  
##  (Other)                                           :37491  
##  NA's                                              :    1  
##               JOB_TITLE     FULL_TIME_POSITION PREVAILING_WAGE 
##  BUSINESS ANALYST  : 4246   N:51569            Min.   :     0  
##  ACCOUNTANT        : 2331   Y:48431            1st Qu.: 55370  
##  PROGRAMMER ANALYST: 2096                      Median : 68848  
##  SYSTEMS ANALYST   : 1941                      Mean   : 77767  
##  FINANCIAL ANALYST : 1570                      3rd Qu.: 92976  
##  ASSOCIATE         : 1519                      Max.   :960000  
##  (Other)           :86297                      NA's   :19      
##       YEAR                           WORKSITE          lon         
##  Min.   :2016   NEW YORK, NEW YORK       :14424   Min.   :-157.86  
##  1st Qu.:2016   SAN FRANCISCO, CALIFORNIA: 3598   1st Qu.:-112.07  
##  Median :2016   HOUSTON, TEXAS           : 3038   Median : -84.39  
##  Mean   :2016   CHICAGO, ILLINOIS        : 2780   Mean   : -91.22  
##  3rd Qu.:2016   LOS ANGELES, CALIFORNIA  : 1850   3rd Qu.: -74.08  
##  Max.   :2016   ATLANTA, GEORGIA         : 1719   Max.   : 145.73  
##                 (Other)                  :72591   NA's   :3170     
##       lat           STATE          
##  Min.   :13.44   Length:100000     
##  1st Qu.:34.15   Class :character  
##  Median :39.64   Mode  :character  
##  Mean   :38.09                     
##  3rd Qu.:40.73                     
##  Max.   :64.84                     
##  NA's   :3170

Prepare data

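Remove the row-number column (X) and the other columns not used for mining (YEAR, lat, lon), and convert STATE to a factor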

# Keep every column except the row number, the year and the coordinates,
# then store STATE as a factor so it is treated as a categorical column.
h1b <- h1b[, setdiff(names(h1b), c("X", "YEAR", "lat", "lon")), drop = FALSE]
h1b[["STATE"]] <- factor(h1b[["STATE"]])

library("arules")
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

Discretize continuous data

# Discretize the numeric columns with discretizeDF()'s default settings
# (no per-column methods are supplied here).
h1b_disc <- discretizeDF(h1b)
# Check the result, e.g. which intervals PREVAILING_WAGE was cut into.
summary(h1b_disc)
##               CASE_STATUS   
##  CERTIFIED          :84522  
##  CERTIFIED-WITHDRAWN: 9075  
##  DENIED             : 2190  
##  WITHDRAWN          : 4213  
##                             
##                             
##                             
##                                          EMPLOYER_NAME  
##  ERNST & YOUNG U.S. LLP                         : 3428  
##  COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482  
##  INFOSYS LIMITED                                : 2317  
##  DELOITTE CONSULTING LLP                        : 1745  
##  DELOITTE & TOUCHE LLP                          : 1146  
##  CAPGEMINI AMERICA INC                          : 1121  
##  (Other)                                        :87761  
##                                                SOC_NAME    
##  COMPUTER SYSTEMS ANALYSTS                         :18600  
##  MANAGEMENT ANALYSTS                               :15010  
##  ACCOUNTANTS AND AUDITORS                          :11204  
##  FINANCIAL ANALYSTS                                : 9605  
##  MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089  
##  (Other)                                           :37491  
##  NA's                                              :    1  
##               JOB_TITLE     FULL_TIME_POSITION            PREVAILING_WAGE 
##  BUSINESS ANALYST  : 4246   N:51569            [0,5.92e+04)       :33303  
##  ACCOUNTANT        : 2331   Y:48431            [5.92e+04,8.27e+04):33338  
##  PROGRAMMER ANALYST: 2096                      [8.27e+04,9.6e+05] :33340  
##  SYSTEMS ANALYST   : 1941                      NA's               :   19  
##  FINANCIAL ANALYST : 1570                                                 
##  ASSOCIATE         : 1519                                                 
##  (Other)           :86297                                                 
##                       WORKSITE            STATE      
##  NEW YORK, NEW YORK       :14424   CALIFORNIA:20479  
##  SAN FRANCISCO, CALIFORNIA: 3598   NEW YORK  :17736  
##  HOUSTON, TEXAS           : 3038   TEXAS     : 8914  
##  CHICAGO, ILLINOIS        : 2780   NEW JERSEY: 5905  
##  LOS ANGELES, CALIFORNIA  : 1850   ILLINOIS  : 5246  
##  ATLANTA, GEORGIA         : 1719   FLORIDA   : 3949  
##  (Other)                  :72591   (Other)   :37771

Note: I am lazy and just use the default discretization. I am sure you can do better!

# Coerce the discretized data frame to arules transactions; each
# column=value combination becomes one item (see the item labels below).
trans <- as(object = h1b_disc, Class = "transactions")
dim(trans)
## [1] 100000  58427
# Bar chart of the ten most frequent items.
itemFrequencyPlot(x = trans, topN = 10)

Mine some rules

# Mine rules over all items: at least 5% support and 90% confidence.
rules <- apriori(
  data = trans,
  parameter = list(confidence = 0.9, support = 0.05)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5    0.05      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 5000 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[58427 item(s), 100000 transaction(s)] done [0.08s].
## sorting and recoding items ... [19 item(s)] done [0.01s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [43 rule(s)] done [0.00s].
## creating S4 object  ... done [0.02s].
# Show the six rules with the highest lift.
inspect(head(rules, n = 6, by = "lift"))
##     lhs                                     rhs              support confidence    lift count
## [1] {WORKSITE=NEW YORK, NEW YORK}        => {STATE=NEW YORK} 0.14424          1 5.63825 14424
## [2] {PREVAILING_WAGE=[8.27e+04,9.6e+05],                                                     
##      WORKSITE=NEW YORK, NEW YORK}        => {STATE=NEW YORK} 0.06234          1 5.63825  6234
## [3] {FULL_TIME_POSITION=Y,                                                                   
##      WORKSITE=NEW YORK, NEW YORK}        => {STATE=NEW YORK} 0.08140          1 5.63825  8140
## [4] {FULL_TIME_POSITION=N,                                                                   
##      WORKSITE=NEW YORK, NEW YORK}        => {STATE=NEW YORK} 0.06284          1 5.63825  6284
## [5] {CASE_STATUS=CERTIFIED,                                                                  
##      WORKSITE=NEW YORK, NEW YORK}        => {STATE=NEW YORK} 0.12335          1 5.63825 12335
## [6] {FULL_TIME_POSITION=Y,                                                                   
##      PREVAILING_WAGE=[8.27e+04,9.6e+05],                                                     
##      WORKSITE=NEW YORK, NEW YORK}        => {STATE=NEW YORK} 0.06234          1 5.63825  6234

Try again without the state items, since the state is already implied by the worksite. I could also try dropping the city instead.

# Remove every item whose label starts with "STATE=" and mine again with the
# same thresholds as before (5% support, 90% confidence).
trans_no_state <- trans[, !startsWith(colnames(trans), "STATE=")]
rules <- apriori(
  data = trans_no_state,
  parameter = list(confidence = 0.9, support = 0.05)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5    0.05      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 5000 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[58374 item(s), 100000 transaction(s)] done [0.07s].
## sorting and recoding items ... [14 item(s)] done [0.01s].
## creating transaction tree ... done [0.05s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [25 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].
# Show the six rules with the highest lift.
inspect(head(rules, n = 6, by = "lift"))
##     lhs                                                     rhs                                  support confidence     lift count
## [1] {SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS,                                                                          
##      FULL_TIME_POSITION=Y}                               => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05683  0.9337825 2.800787  5683
## [2] {CASE_STATUS=CERTIFIED,                                                                                                       
##      SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS,                                                                          
##      FULL_TIME_POSITION=Y}                               => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05082  0.9336763 2.800469  5082
## [3] {CASE_STATUS=CERTIFIED,                                                                                                       
##      SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS} => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05082  0.9198190 2.758905  5082
## [4] {SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS} => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05683  0.9180937 2.753730  5683
## [5] {PREVAILING_WAGE=[8.27e+04,9.6e+05]}                 => {FULL_TIME_POSITION=Y}               0.33340  1.0000000 2.064793 33340
## [6] {SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS,                                                                          
##      PREVAILING_WAGE=[8.27e+04,9.6e+05]}                 => {FULL_TIME_POSITION=Y}               0.05683  1.0000000 2.064793  5683
library(arulesViz)
## Loading required package: grid
# Draw the current rule set as a graph; engine = "html" asks arulesViz for
# HTML output (presumably an interactive widget - confirm for your version).
plot(rules, method = "graph", engine = "html")

Mine rules with a denied case status (CASE_STATUS=DENIED) in the RHS

# Only allow CASE_STATUS=DENIED as the consequent; confidence is lowered to
# 0.3 while the 5% support threshold is kept.
rules <- apriori(
  data = trans_no_state,
  parameter = list(confidence = 0.3, support = 0.05),
  appearance = list(rhs = "CASE_STATUS=DENIED")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.3    0.1    1 none FALSE            TRUE       5    0.05      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 5000 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[58374 item(s), 100000 transaction(s)] done [0.08s].
## sorting and recoding items ... [14 item(s)] done [0.01s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].

Find a reasonably low support: require a rule to be supported by at least 5 transactions

# Relative support that corresponds to 5 transactions.
5 / nrow(trans)
## [1] 5e-05

Try again with the lower support

# Mine rules with CASE_STATUS=DENIED as the consequent again, this time with a
# minimum support equivalent to 5 transactions. The threshold is derived from
# the number of transactions instead of hard-coding 0.00005, so it stays
# correct if the data set size changes.
rules <- apriori(
  trans_no_state,
  parameter = list(
    support = 5 / nrow(trans_no_state),
    confidence = 0.3
  ),
  appearance = list(rhs = "CASE_STATUS=DENIED")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.3    0.1    1 none FALSE            TRUE       5   5e-05      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 5 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[58374 item(s), 100000 transaction(s)] done [0.07s].
## sorting and recoding items ... [5881 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.33s].
## writing ... [135 rule(s)] done [0.13s].
## creating S4 object  ... done [0.04s].
# Show the six rules with the highest lift.
inspect(head(rules, n = 6, by = "lift"))
##     lhs                                              rhs                  support confidence     lift count
## [1] {EMPLOYER_NAME=BOSTON INNOVATION GATEWAY LLC} => {CASE_STATUS=DENIED}   9e-05  0.7500000 34.24658     9
## [2] {EMPLOYER_NAME=BOSTON INNOVATION GATEWAY LLC,                                                          
##      WORKSITE=CAMBRIDGE, MASSACHUSETTS}           => {CASE_STATUS=DENIED}   9e-05  0.7500000 34.24658     9
## [3] {EMPLOYER_NAME=SNAPRAYS, LLC,                                                                          
##      WORKSITE=PROVO, UTAH}                        => {CASE_STATUS=DENIED}   5e-05  0.7142857 32.61579     5
## [4] {JOB_TITLE=CHIEF EXECUTIVE OFFICER,                                                                    
##      WORKSITE=PROVO, UTAH}                        => {CASE_STATUS=DENIED}   5e-05  0.7142857 32.61579     5
## [5] {SOC_NAME=CHIEF EXECUTIVES,                                                                            
##      WORKSITE=PROVO, UTAH}                        => {CASE_STATUS=DENIED}   5e-05  0.7142857 32.61579     5
## [6] {EMPLOYER_NAME=SNAPRAYS, LLC,                                                                          
##      JOB_TITLE=CHIEF EXECUTIVE OFFICER,                                                                    
##      WORKSITE=PROVO, UTAH}                        => {CASE_STATUS=DENIED}   5e-05  0.7142857 32.61579     5
# Browse the mined rules in a table view.
inspectDT(rules)
# Default plot of the rules; the message below reports that jitter is added.
plot(rules, engine = "html")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Graph view of the same rules.
plot(rules, method = "graph", engine = "html")
## Warning: Too many rules supplied. Only plotting the best 100 rules using
## lift (change control parameter max if needed)

Use rule explorer

# Keep only the items that occur in at least 5 transactions before handing the
# data to the rule explorer. Comparing absolute counts with 5 avoids a
# floating-point comparison against the hard-coded relative support 0.00005.
trans2 <- trans[, itemFrequency(trans, type = "absolute") >= 5]
# ruleExplorer(trans2)