(see code for Project 1)
# Restore the saved objects from the .rda file; the rest of this script
# expects it to provide a data frame named `h1b`.
load("h1b_clean.rda")
# Per-column overview of the loaded data.
summary(h1b)
## X CASE_STATUS
## Min. : 1 CERTIFIED :84522
## 1st Qu.: 25001 CERTIFIED-WITHDRAWN: 9075
## Median : 50000 DENIED : 2190
## Mean : 50000 WITHDRAWN : 4213
## 3rd Qu.: 75000
## Max. :100000
##
## EMPLOYER_NAME
## ERNST & YOUNG U.S. LLP : 3428
## COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482
## INFOSYS LIMITED : 2317
## DELOITTE CONSULTING LLP : 1745
## DELOITTE & TOUCHE LLP : 1146
## CAPGEMINI AMERICA INC : 1121
## (Other) :87761
## SOC_NAME
## COMPUTER SYSTEMS ANALYSTS :18600
## MANAGEMENT ANALYSTS :15010
## ACCOUNTANTS AND AUDITORS :11204
## FINANCIAL ANALYSTS : 9605
## MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089
## (Other) :37491
## NA's : 1
## JOB_TITLE FULL_TIME_POSITION PREVAILING_WAGE
## BUSINESS ANALYST : 4246 N:51569 Min. : 0
## ACCOUNTANT : 2331 Y:48431 1st Qu.: 55370
## PROGRAMMER ANALYST: 2096 Median : 68848
## SYSTEMS ANALYST : 1941 Mean : 77767
## FINANCIAL ANALYST : 1570 3rd Qu.: 92976
## ASSOCIATE : 1519 Max. :960000
## (Other) :86297 NA's :19
## YEAR WORKSITE lon
## Min. :2016 NEW YORK, NEW YORK :14424 Min. :-157.86
## 1st Qu.:2016 SAN FRANCISCO, CALIFORNIA: 3598 1st Qu.:-112.07
## Median :2016 HOUSTON, TEXAS : 3038 Median : -84.39
## Mean :2016 CHICAGO, ILLINOIS : 2780 Mean : -91.22
## 3rd Qu.:2016 LOS ANGELES, CALIFORNIA : 1850 3rd Qu.: -74.08
## Max. :2016 ATLANTA, GEORGIA : 1719 Max. : 145.73
## (Other) :72591 NA's :3170
## lat STATE
## Min. :13.44 Length:100000
## 1st Qu.:34.15 Class :character
## Median :39.64 Mode :character
## Mean :38.09
## 3rd Qu.:40.73
## Max. :64.84
## NA's :3170
Get rid of the row number, the constant year, and the coordinates, and convert STATE to a factor.
# Drop the columns that are not used for rule mining: the row index (X),
# YEAR (2016 for every row in the summary above) and the raw coordinates.
h1b[c("X", "YEAR", "lat", "lon")] <- NULL
# STATE is stored as character; convert it to a factor.
h1b$STATE <- factor(h1b$STATE)
library("arules")
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Discretize the data frame with discretizeDF()'s default settings; in the
# summary output the column that ends up binned is PREVAILING_WAGE.
h1b_disc <- discretizeDF(h1b)
summary(h1b_disc)
## CASE_STATUS
## CERTIFIED :84522
## CERTIFIED-WITHDRAWN: 9075
## DENIED : 2190
## WITHDRAWN : 4213
##
##
##
## EMPLOYER_NAME
## ERNST & YOUNG U.S. LLP : 3428
## COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482
## INFOSYS LIMITED : 2317
## DELOITTE CONSULTING LLP : 1745
## DELOITTE & TOUCHE LLP : 1146
## CAPGEMINI AMERICA INC : 1121
## (Other) :87761
## SOC_NAME
## COMPUTER SYSTEMS ANALYSTS :18600
## MANAGEMENT ANALYSTS :15010
## ACCOUNTANTS AND AUDITORS :11204
## FINANCIAL ANALYSTS : 9605
## MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089
## (Other) :37491
## NA's : 1
## JOB_TITLE FULL_TIME_POSITION PREVAILING_WAGE
## BUSINESS ANALYST : 4246 N:51569 [0,5.92e+04) :33303
## ACCOUNTANT : 2331 Y:48431 [5.92e+04,8.27e+04):33338
## PROGRAMMER ANALYST: 2096 [8.27e+04,9.6e+05] :33340
## SYSTEMS ANALYST : 1941 NA's : 19
## FINANCIAL ANALYST : 1570
## ASSOCIATE : 1519
## (Other) :86297
## WORKSITE STATE
## NEW YORK, NEW YORK :14424 CALIFORNIA:20479
## SAN FRANCISCO, CALIFORNIA: 3598 NEW YORK :17736
## HOUSTON, TEXAS : 3038 TEXAS : 8914
## CHICAGO, ILLINOIS : 2780 NEW JERSEY: 5905
## LOS ANGELES, CALIFORNIA : 1850 ILLINOIS : 5246
## ATLANTA, GEORGIA : 1719 FLORIDA : 3949
## (Other) :72591 (Other) :37771
Note: I am lazy and just use the default discretization. I am sure you can do better!
# Coerce the discretized data frame to arules transactions; items are
# labelled "COLUMN=value" (e.g. "STATE=NEW YORK").
trans <- as(h1b_disc, "transactions")
# Number of transactions and number of distinct items.
dim(trans)
## [1] 100000 58427
# Plot the ten most frequent items.
itemFrequencyPlot(trans, topN = 10)
# Mine association rules with at least 5% support and 90% confidence.
rules <- apriori(
  trans,
  parameter = list(support = 0.05, confidence = 0.9)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 5000
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[58427 item(s), 100000 transaction(s)] done [0.08s].
## sorting and recoding items ... [19 item(s)] done [0.01s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [43 rule(s)] done [0.00s].
## creating S4 object ... done [0.02s].
# Show the six rules with the highest lift.
inspect(head(rules, n = 6L, by = "lift"))
## lhs rhs support confidence lift count
## [1] {WORKSITE=NEW YORK, NEW YORK} => {STATE=NEW YORK} 0.14424 1 5.63825 14424
## [2] {PREVAILING_WAGE=[8.27e+04,9.6e+05],
## WORKSITE=NEW YORK, NEW YORK} => {STATE=NEW YORK} 0.06234 1 5.63825 6234
## [3] {FULL_TIME_POSITION=Y,
## WORKSITE=NEW YORK, NEW YORK} => {STATE=NEW YORK} 0.08140 1 5.63825 8140
## [4] {FULL_TIME_POSITION=N,
## WORKSITE=NEW YORK, NEW YORK} => {STATE=NEW YORK} 0.06284 1 5.63825 6284
## [5] {CASE_STATUS=CERTIFIED,
## WORKSITE=NEW YORK, NEW YORK} => {STATE=NEW YORK} 0.12335 1 5.63825 12335
## [6] {FULL_TIME_POSITION=Y,
## PREVAILING_WAGE=[8.27e+04,9.6e+05],
## WORKSITE=NEW YORK, NEW YORK} => {STATE=NEW YORK} 0.06234 1 5.63825 6234
# Remove every "STATE=..." item and mine again with the same thresholds, so
# that WORKSITE => STATE rules no longer dominate the result.
trans_no_state <- trans[, !startsWith(itemLabels(trans), "STATE=")]
rules <- apriori(
  trans_no_state,
  parameter = list(support = 0.05, confidence = 0.9)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 5000
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[58374 item(s), 100000 transaction(s)] done [0.07s].
## sorting and recoding items ... [14 item(s)] done [0.01s].
## creating transaction tree ... done [0.05s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [25 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
# Show the six rules with the highest lift.
inspect(head(rules, n = 6L, by = "lift"))
## lhs rhs support confidence lift count
## [1] {SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS,
## FULL_TIME_POSITION=Y} => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05683 0.9337825 2.800787 5683
## [2] {CASE_STATUS=CERTIFIED,
## SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS,
## FULL_TIME_POSITION=Y} => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05082 0.9336763 2.800469 5082
## [3] {CASE_STATUS=CERTIFIED,
## SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS} => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05082 0.9198190 2.758905 5082
## [4] {SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS} => {PREVAILING_WAGE=[8.27e+04,9.6e+05]} 0.05683 0.9180937 2.753730 5683
## [5] {PREVAILING_WAGE=[8.27e+04,9.6e+05]} => {FULL_TIME_POSITION=Y} 0.33340 1.0000000 2.064793 33340
## [6] {SOC_NAME=COMPUTER AND INFORMATION SYSTEMS MANAGERS,
## PREVAILING_WAGE=[8.27e+04,9.6e+05]} => {FULL_TIME_POSITION=Y} 0.05683 1.0000000 2.064793 5683
library(arulesViz)
## Loading required package: grid
# Draw the current rule set as a graph using the HTML engine.
plot(rules, method = "graph", engine = "html")
# Look for rules that predict a denied application: only
# CASE_STATUS=DENIED may appear on the right-hand side, and the confidence
# threshold is relaxed to 0.3.
rules <- apriori(
  trans_no_state,
  parameter = list(support = 0.05, confidence = 0.3),
  appearance = list(rhs = "CASE_STATUS=DENIED")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 5000
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[58374 item(s), 100000 transaction(s)] done [0.08s].
## sorting and recoding items ... [14 item(s)] done [0.01s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
Find a reasonably low support: require only 5 supporting transactions.
# Relative support that corresponds to an absolute count of 5 transactions.
5 / nrow(trans)
## [1] 5e-05
Try again with this lower support threshold.
# Mine rules for CASE_STATUS=DENIED again, now requiring only five
# supporting transactions. The relative support is derived from the
# transaction count instead of being hard-coded, so it stays correct if the
# size of the data set changes (for 100000 transactions it equals 5e-05).
rules <- apriori(
  trans_no_state,
  parameter = list(
    support = 5 / nrow(trans_no_state),
    confidence = 0.3
  ),
  appearance = list(rhs = "CASE_STATUS=DENIED")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 5e-05 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 5
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[58374 item(s), 100000 transaction(s)] done [0.07s].
## sorting and recoding items ... [5881 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.33s].
## writing ... [135 rule(s)] done [0.13s].
## creating S4 object ... done [0.04s].
# Show the six rules with the highest lift.
inspect(head(rules, n = 6L, by = "lift"))
## lhs rhs support confidence lift count
## [1] {EMPLOYER_NAME=BOSTON INNOVATION GATEWAY LLC} => {CASE_STATUS=DENIED} 9e-05 0.7500000 34.24658 9
## [2] {EMPLOYER_NAME=BOSTON INNOVATION GATEWAY LLC,
## WORKSITE=CAMBRIDGE, MASSACHUSETTS} => {CASE_STATUS=DENIED} 9e-05 0.7500000 34.24658 9
## [3] {EMPLOYER_NAME=SNAPRAYS, LLC,
## WORKSITE=PROVO, UTAH} => {CASE_STATUS=DENIED} 5e-05 0.7142857 32.61579 5
## [4] {JOB_TITLE=CHIEF EXECUTIVE OFFICER,
## WORKSITE=PROVO, UTAH} => {CASE_STATUS=DENIED} 5e-05 0.7142857 32.61579 5
## [5] {SOC_NAME=CHIEF EXECUTIVES,
## WORKSITE=PROVO, UTAH} => {CASE_STATUS=DENIED} 5e-05 0.7142857 32.61579 5
## [6] {EMPLOYER_NAME=SNAPRAYS, LLC,
## JOB_TITLE=CHIEF EXECUTIVE OFFICER,
## WORKSITE=PROVO, UTAH} => {CASE_STATUS=DENIED} 5e-05 0.7142857 32.61579 5
# Display the mined rules with inspectDT() (table view; presumably
# interactive -- confirm against the arulesViz documentation).
inspectDT(rules)
# Plot the rules with the default plot method, rendered by the HTML engine.
plot(rules, engine = "html")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Graph view of the rules; per the warning this call emits, only the best
# 100 rules by lift are drawn.
plot(rules, method = "graph", engine = "html")
## Warning: Too many rules supplied. Only plotting the best 100 rules using
## lift (change control parameter max if needed)
# Keep only the items whose relative frequency is at least 5e-05, as input
# for the (currently disabled) rule explorer below.
# NOTE(review): this filters `trans`, which still contains the STATE items;
# confirm that `trans_no_state` was not intended here.
trans2 <- trans[, itemFrequency(trans) >= 5e-05]
# ruleExplorer(trans2)