(see code for Project 1)
# Restore the objects stored in the .rda file (the rest of this script
# relies on a data frame called `h1b`), then show per-column summaries.
load(file = "h1b_clean.rda")
summary(object = h1b)
## X CASE_STATUS
## Min. : 1 CERTIFIED :84522
## 1st Qu.: 25001 CERTIFIED-WITHDRAWN: 9075
## Median : 50000 DENIED : 2190
## Mean : 50000 WITHDRAWN : 4213
## 3rd Qu.: 75000
## Max. :100000
##
## EMPLOYER_NAME
## ERNST & YOUNG U.S. LLP : 3428
## COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482
## INFOSYS LIMITED : 2317
## DELOITTE CONSULTING LLP : 1745
## DELOITTE & TOUCHE LLP : 1146
## CAPGEMINI AMERICA INC : 1121
## (Other) :87761
## SOC_NAME
## COMPUTER SYSTEMS ANALYSTS :18600
## MANAGEMENT ANALYSTS :15010
## ACCOUNTANTS AND AUDITORS :11204
## FINANCIAL ANALYSTS : 9605
## MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089
## (Other) :37491
## NA's : 1
## JOB_TITLE FULL_TIME_POSITION PREVAILING_WAGE
## BUSINESS ANALYST : 4246 N:51569 Min. : 0
## ACCOUNTANT : 2331 Y:48431 1st Qu.: 55370
## PROGRAMMER ANALYST: 2096 Median : 68848
## SYSTEMS ANALYST : 1941 Mean : 77767
## FINANCIAL ANALYST : 1570 3rd Qu.: 92976
## ASSOCIATE : 1519 Max. :960000
## (Other) :86297 NA's :19
## YEAR WORKSITE lon
## Min. :2016 NEW YORK, NEW YORK :14424 Min. :-157.86
## 1st Qu.:2016 SAN FRANCISCO, CALIFORNIA: 3598 1st Qu.:-112.07
## Median :2016 HOUSTON, TEXAS : 3038 Median : -84.39
## Mean :2016 CHICAGO, ILLINOIS : 2780 Mean : -91.22
## 3rd Qu.:2016 LOS ANGELES, CALIFORNIA : 1850 3rd Qu.: -74.08
## Max. :2016 ATLANTA, GEORGIA : 1719 Max. : 145.73
## (Other) :72591 NA's :3170
## lat STATE
## Min. :13.44 Length:100000
## 1st Qu.:34.15 Class :character
## Median :39.64 Mode :character
## Mean :38.09
## 3rd Qu.:40.73
## Max. :64.84
## NA's :3170
Get rid of row number
# X only holds the running row index (1..100000), so remove that column.
h1b[["X"]] <- NULL
Make state a factor
# STATE is loaded as a character vector; store it as a factor instead.
h1b[["STATE"]] <- factor(h1b[["STATE"]])
Create a logical variable for certified
# Logical outcome flag: TRUE only for the exact status "CERTIFIED"
# (so "CERTIFIED-WITHDRAWN" counts as not certified).
h1b[["CERTIFIED"]] <- h1b[["CASE_STATUS"]] == "CERTIFIED"
Fix SOC_NAME somewhat (take the first 6 characters to reduce the number of different values)
# Collapse occupations by keeping only the first six characters of each
# name and treating the resulting prefixes as categories.
h1b[["SOC_NAME_short"]] <- factor(
  substr(h1b[["SOC_NAME"]], start = 1, stop = 6)
)
# Number of distinct occupation names before collapsing.
nlevels(h1b[["SOC_NAME"]])
## [1] 199
# Number of distinct categories left after truncating the names.
nlevels(h1b[["SOC_NAME_short"]])
## [1] 73
# Summarise every column again to inspect the data frame in its current state.
summary(object = h1b)
## CASE_STATUS
## CERTIFIED :84522
## CERTIFIED-WITHDRAWN: 9075
## DENIED : 2190
## WITHDRAWN : 4213
##
##
##
## EMPLOYER_NAME
## ERNST & YOUNG U.S. LLP : 3428
## COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482
## INFOSYS LIMITED : 2317
## DELOITTE CONSULTING LLP : 1745
## DELOITTE & TOUCHE LLP : 1146
## CAPGEMINI AMERICA INC : 1121
## (Other) :87761
## SOC_NAME
## COMPUTER SYSTEMS ANALYSTS :18600
## MANAGEMENT ANALYSTS :15010
## ACCOUNTANTS AND AUDITORS :11204
## FINANCIAL ANALYSTS : 9605
## MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089
## (Other) :37491
## NA's : 1
## JOB_TITLE FULL_TIME_POSITION PREVAILING_WAGE
## BUSINESS ANALYST : 4246 N:51569 Min. : 0
## ACCOUNTANT : 2331 Y:48431 1st Qu.: 55370
## PROGRAMMER ANALYST: 2096 Median : 68848
## SYSTEMS ANALYST : 1941 Mean : 77767
## FINANCIAL ANALYST : 1570 3rd Qu.: 92976
## ASSOCIATE : 1519 Max. :960000
## (Other) :86297 NA's :19
## YEAR WORKSITE lon
## Min. :2016 NEW YORK, NEW YORK :14424 Min. :-157.86
## 1st Qu.:2016 SAN FRANCISCO, CALIFORNIA: 3598 1st Qu.:-112.07
## Median :2016 HOUSTON, TEXAS : 3038 Median : -84.39
## Mean :2016 CHICAGO, ILLINOIS : 2780 Mean : -91.22
## 3rd Qu.:2016 LOS ANGELES, CALIFORNIA : 1850 3rd Qu.: -74.08
## Max. :2016 ATLANTA, GEORGIA : 1719 Max. : 145.73
## (Other) :72591 NA's :3170
## lat STATE CERTIFIED SOC_NAME_short
## Min. :13.44 CALIFORNIA:20479 Mode :logical COMPUT :27561
## 1st Qu.:34.15 NEW YORK :17736 FALSE:15478 MANAGE :16025
## Median :39.64 TEXAS : 8914 TRUE :84522 FINANC :13190
## Mean :38.09 NEW JERSEY: 5905 ACCOUN :11256
## 3rd Qu.:40.73 ILLINOIS : 5246 MARKET :11251
## Max. :64.84 FLORIDA : 3949 (Other):20716
## NA's :3170 (Other) :37771 NA's : 1
# Modelling subset: the candidate predictors plus the CERTIFIED label.
h1b_use <- h1b[c(
  "FULL_TIME_POSITION", "PREVAILING_WAGE", "STATE", "SOC_NAME_short",
  "CERTIFIED"
)]
# Check how the label is currently stored before fitting anything.
str(h1b_use[["CERTIFIED"]])
## logi [1:100000] FALSE FALSE FALSE FALSE FALSE FALSE ...
The class variable needs to be a factor for classification!
# Convert the logical label to a two-level factor so that rpart treats the
# problem as classification, then confirm the new structure.
h1b_use[["CERTIFIED"]] <- as.factor(h1b_use[["CERTIFIED"]])
str(h1b_use[["CERTIFIED"]])
## Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
# Fit a tree with rpart's default settings, offering every other column of
# h1b_use as a candidate predictor, then print the fitted splits.
tree_default <- rpart(formula = CERTIFIED ~ ., data = h1b_use)
print(tree_default)
## n= 100000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 100000 15478 TRUE (0.1547800 0.8452200) *
library("rpart.plot")
# Draw the tree. extra = 2 labels each node with correct/total counts,
# under = TRUE places that text below the box, and varlen/faclen = 0
# keep variable and factor-level names unabbreviated.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)
The dataset is highly imbalanced
# Class counts for the outcome.
table(h1b_use[["CERTIFIED"]])
##
## FALSE TRUE
## 15478 84522
Resample with replacement, stratified by class, to obtain a balanced dataset (50,000 rows per class)
library(sampling)
# Fix the RNG state so the random resample below (and every tree fitted on
# it) can be reproduced exactly when the script is rerun.
set.seed(2016)
# Draw 50,000 rows per CERTIFIED class, sampling with replacement ("srswr")
# because the FALSE class has fewer than 50,000 rows. Both stratum sizes are
# equal, so the order in which strata() pairs sizes with classes does not
# matter here.
id <- strata(
  h1b_use,
  stratanames = "CERTIFIED",
  size = c(50000, 50000),
  method = "srswr"
)
# ID_unit identifies the selected units; it is used here as row positions
# into h1b_use (rows drawn more than once appear more than once).
h1b_use_balanced <- h1b_use[id$ID_unit, ]
# Verify that the two classes are now equally represented.
table(h1b_use_balanced$CERTIFIED)
##
## FALSE TRUE
## 50000 50000
# Refit the default tree, this time on the class-balanced resample.
tree_default <- rpart(
  formula = CERTIFIED ~ .,
  data = h1b_use_balanced
)
print(tree_default)
## n= 100000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 100000 50000 FALSE (0.5000000 0.5000000)
## 2) SOC_NAME_short=ADMINI,ADVERT,AGENTS,APPLIC,APPRAI,BIOCHE,BUDGET,BUSINE,CHIEF ,COMPEN,COMPLI,COMPUT,CONSTR,EDUCAT,ENGINE,FINANC,FOOD S,FUNDRA,GENERA,GEOGRA,HUMAN ,INFORM,INSURA,INVEST,LOGIST,MANAGM,MARKET,MEETIN,NATURA,NETWOR,PERSON,PROPER,PUBLIC,PURCHA,QUALIT,RISK M,SOCIAL,SOFTWA,SUPPLY,TAX PR,TRAINI,TREASU 71969 31873 FALSE (0.5571288 0.4428712)
## 4) SOC_NAME_short=ADMINI,AGENTS,APPLIC,APPRAI,BIOCHE,BUSINE,CHIEF ,COMPLI,COMPUT,ENGINE,GEOGRA,INVEST,MANAGM,NETWOR,PROPER,RISK M,SOFTWA,SUPPLY,TAX PR,TREASU 37321 14347 FALSE (0.6155784 0.3844216) *
## 5) SOC_NAME_short=ADVERT,BUDGET,COMPEN,CONSTR,EDUCAT,FINANC,FOOD S,FUNDRA,GENERA,HUMAN ,INFORM,INSURA,LOGIST,MARKET,MEETIN,NATURA,PERSON,PUBLIC,PURCHA,QUALIT,SOCIAL,TRAINI 34648 17122 TRUE (0.4941699 0.5058301)
## 10) STATE=COLORADO,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,HAWAII,IDAHO,INDIANA,IOWA,LOUISIANA,MARYLAND,MICHIGAN,MINNESOTA,MISSISSIPPI,NA,NEW HAMPSHIRE,NEW YORK,NORTH DAKOTA,OKLAHOMA,OREGON,RHODE ISLAND,SOUTH DAKOTA,TENNESSEE,TEXAS,VERMONT,WEST VIRGINIA,WISCONSIN 16656 7760 FALSE (0.5341018 0.4658982) *
## 11) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,FLORIDA,GEORGIA,ILLINOIS,KANSAS,KENTUCKY,MAINE,MASSACHUSETTS,MISSOURI,MONTANA,NEBRASKA,NEVADA,NEW JERSEY,NEW MEXICO,NORTH CAROLINA,OHIO,PENNSYLVANIA,PUERTO RICO,SOUTH CAROLINA,UTAH,VIRGINIA,WASHINGTON,WYOMING 17992 8226 TRUE (0.4572032 0.5427968) *
## 3) SOC_NAME_short=ACCOUN,ARCHIT,AUDITO,BUYERS,CLAIMS,CLINIC,COST E,CREDIT,DATABA,EMERGE,FARMER,FITNES,GAMING,INDUST,LABOR ,LOAN C,LOAN O,LODGIN,MANAGE,MANGEM,MEDICA,REGULA,SALES ,SUSTAI,TAX EX,TRANSP,VICE P,WHOLES 28031 9904 TRUE (0.3533231 0.6466769) *
# Plot the balanced-data tree: per-node correct/total counts shown under
# each box, with full (unabbreviated) variable and level names.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)
Make a larger tree
# Lower the complexity parameter to 0.001 (below rpart's default) so that
# splits with smaller improvements are kept, yielding a deeper tree.
tree_default <- rpart(
  formula = CERTIFIED ~ .,
  data = h1b_use_balanced,
  control = rpart.control(cp = 0.001)
)
print(tree_default)
## n= 100000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 100000 50000 FALSE (0.5000000 0.5000000)
## 2) SOC_NAME_short=ADMINI,ADVERT,AGENTS,APPLIC,APPRAI,BIOCHE,BUDGET,BUSINE,CHIEF ,COMPEN,COMPLI,COMPUT,CONSTR,EDUCAT,ENGINE,FINANC,FOOD S,FUNDRA,GENERA,GEOGRA,HUMAN ,INFORM,INSURA,INVEST,LOGIST,MANAGM,MARKET,MEETIN,NATURA,NETWOR,PERSON,PROPER,PUBLIC,PURCHA,QUALIT,RISK M,SOCIAL,SOFTWA,SUPPLY,TAX PR,TRAINI,TREASU 71969 31873 FALSE (0.5571288 0.4428712)
## 4) SOC_NAME_short=ADMINI,AGENTS,APPLIC,APPRAI,BIOCHE,BUSINE,CHIEF ,COMPLI,COMPUT,ENGINE,GEOGRA,INVEST,MANAGM,NETWOR,PROPER,RISK M,SOFTWA,SUPPLY,TAX PR,TREASU 37321 14347 FALSE (0.6155784 0.3844216)
## 8) PREVAILING_WAGE< 72935.5 19477 5789 FALSE (0.7027776 0.2972224)
## 16) STATE=ALABAMA,ALASKA,ARIZONA,CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,ILLINOIS,LOUISIANA,MAINE,MASSACHUSETTS,MINNESOTA,MISSISSIPPI,MISSOURI,MONTANA,NA,NEBRASKA,NEVADA,NEW HAMPSHIRE,NEW JERSEY,NEW MEXICO,NORTH CAROLINA,NORTH DAKOTA,OHIO,OREGON,PUERTO RICO,SOUTH CAROLINA,SOUTH DAKOTA,TENNESSEE,TEXAS,UTAH,VIRGINIA,WEST VIRGINIA,WISCONSIN,WYOMING 13471 3391 FALSE (0.7482741 0.2517259) *
## 17) STATE=ARKANSAS,FLORIDA,GEORGIA,HAWAII,IDAHO,INDIANA,IOWA,KANSAS,KENTUCKY,MARYLAND,MICHIGAN,NEW YORK,OKLAHOMA,PENNSYLVANIA,RHODE ISLAND,VERMONT,WASHINGTON 6006 2398 FALSE (0.6007326 0.3992674)
## 34) STATE=FLORIDA,GEORGIA,IDAHO,INDIANA,IOWA,MARYLAND,MICHIGAN,NEW YORK,OKLAHOMA,WASHINGTON 4571 1723 FALSE (0.6230584 0.3769416)
## 68) PREVAILING_WAGE< 61781.5 2565 847 FALSE (0.6697856 0.3302144) *
## 69) PREVAILING_WAGE>=61781.5 2006 876 FALSE (0.5633101 0.4366899)
## 138) PREVAILING_WAGE>=62618 1697 649 FALSE (0.6175604 0.3824396) *
## 139) PREVAILING_WAGE< 62618 309 82 TRUE (0.2653722 0.7346278) *
## 35) STATE=ARKANSAS,HAWAII,KANSAS,KENTUCKY,PENNSYLVANIA,RHODE ISLAND,VERMONT 1435 675 FALSE (0.5296167 0.4703833)
## 70) PREVAILING_WAGE>=60372 559 191 FALSE (0.6583184 0.3416816) *
## 71) PREVAILING_WAGE< 60372 876 392 TRUE (0.4474886 0.5525114)
## 142) PREVAILING_WAGE< 59716.9 593 246 FALSE (0.5851602 0.4148398) *
## 143) PREVAILING_WAGE>=59716.9 283 45 TRUE (0.1590106 0.8409894) *
## 9) PREVAILING_WAGE>=72935.5 17844 8558 FALSE (0.5203990 0.4796010)
## 18) STATE=CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,FLORIDA,HAWAII,IDAHO,ILLINOIS,IOWA,LOUISIANA,MAINE,MARYLAND,MICHIGAN,MISSISSIPPI,MISSOURI,MONTANA,NA,NEBRASKA,NEVADA,NEW HAMPSHIRE,NEW JERSEY,NEW MEXICO,NORTH DAKOTA,RHODE ISLAND,SOUTH CAROLINA,UTAH,WEST VIRGINIA,WYOMING 9890 4205 FALSE (0.5748231 0.4251769)
## 36) PREVAILING_WAGE< 85946.5 3549 1139 FALSE (0.6790645 0.3209355) *
## 37) PREVAILING_WAGE>=85946.5 6341 3066 FALSE (0.5164801 0.4835199)
## 74) PREVAILING_WAGE< 104062.5 2732 1163 FALSE (0.5743045 0.4256955) *
## 75) PREVAILING_WAGE>=104062.5 3609 1706 TRUE (0.4727071 0.5272929)
## 150) STATE=MAINE,NA,NEBRASKA,NEW HAMPSHIRE,NEW MEXICO,WEST VIRGINIA,WYOMING 71 10 FALSE (0.8591549 0.1408451) *
## 151) STATE=CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,FLORIDA,HAWAII,IDAHO,ILLINOIS,IOWA,MARYLAND,MICHIGAN,MISSOURI,NEVADA,NEW JERSEY,RHODE ISLAND,SOUTH CAROLINA,UTAH 3538 1645 TRUE (0.4649520 0.5350480) *
## 19) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,DISTRICT OF COLUMBIA,GEORGIA,INDIANA,KANSAS,KENTUCKY,MASSACHUSETTS,MINNESOTA,NEW YORK,NORTH CAROLINA,OHIO,OKLAHOMA,OREGON,PENNSYLVANIA,PUERTO RICO,SOUTH DAKOTA,TENNESSEE,TEXAS,VERMONT,VIRGINIA,WASHINGTON,WISCONSIN 7954 3601 TRUE (0.4527282 0.5472718)
## 38) STATE=ALABAMA,DISTRICT OF COLUMBIA,KENTUCKY,MASSACHUSETTS,NEW YORK,OREGON,PENNSYLVANIA,TENNESSEE,TEXAS,VIRGINIA,WASHINGTON,WISCONSIN 6085 2915 TRUE (0.4790468 0.5209532)
## 76) PREVAILING_WAGE< 100454 3741 1782 FALSE (0.5236568 0.4763432)
## 152) PREVAILING_WAGE>=99777.5 87 9 FALSE (0.8965517 0.1034483) *
## 153) PREVAILING_WAGE< 99777.5 3654 1773 FALSE (0.5147783 0.4852217)
## 306) PREVAILING_WAGE< 84063 2237 1005 FALSE (0.5507376 0.4492624) *
## 307) PREVAILING_WAGE>=84063 1417 649 TRUE (0.4580099 0.5419901) *
## 77) PREVAILING_WAGE>=100454 2344 956 TRUE (0.4078498 0.5921502)
## 154) SOC_NAME_short=BUSINE,CHIEF ,ENGINE,PROPER,TREASU 185 65 FALSE (0.6486486 0.3513514) *
## 155) SOC_NAME_short=ADMINI,COMPLI,COMPUT,SOFTWA 2159 836 TRUE (0.3872163 0.6127837) *
## 39) STATE=ALASKA,ARIZONA,ARKANSAS,GEORGIA,INDIANA,KANSAS,MINNESOTA,NORTH CAROLINA,OHIO,OKLAHOMA,PUERTO RICO,SOUTH DAKOTA,VERMONT 1869 686 TRUE (0.3670412 0.6329588) *
## 5) SOC_NAME_short=ADVERT,BUDGET,COMPEN,CONSTR,EDUCAT,FINANC,FOOD S,FUNDRA,GENERA,HUMAN ,INFORM,INSURA,LOGIST,MARKET,MEETIN,NATURA,PERSON,PUBLIC,PURCHA,QUALIT,SOCIAL,TRAINI 34648 17122 TRUE (0.4941699 0.5058301)
## 10) STATE=COLORADO,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,HAWAII,IDAHO,INDIANA,IOWA,LOUISIANA,MARYLAND,MICHIGAN,MINNESOTA,MISSISSIPPI,NA,NEW HAMPSHIRE,NEW YORK,NORTH DAKOTA,OKLAHOMA,OREGON,RHODE ISLAND,SOUTH DAKOTA,TENNESSEE,TEXAS,VERMONT,WEST VIRGINIA,WISCONSIN 16656 7760 FALSE (0.5341018 0.4658982)
## 20) STATE=DELAWARE,IDAHO,INDIANA,LOUISIANA,MINNESOTA,MISSISSIPPI,NORTH DAKOTA,OREGON,RHODE ISLAND,SOUTH DAKOTA,VERMONT 1305 439 FALSE (0.6636015 0.3363985) *
## 21) STATE=COLORADO,CONNECTICUT,DISTRICT OF COLUMBIA,HAWAII,IOWA,MARYLAND,MICHIGAN,NA,NEW HAMPSHIRE,NEW YORK,OKLAHOMA,TENNESSEE,TEXAS,WEST VIRGINIA,WISCONSIN 15351 7321 FALSE (0.5230930 0.4769070)
## 42) PREVAILING_WAGE>=65805.5 8195 3681 FALSE (0.5508237 0.4491763) *
## 43) PREVAILING_WAGE< 65805.5 7156 3516 TRUE (0.4913359 0.5086641)
## 86) PREVAILING_WAGE< 64271.9 5820 2814 FALSE (0.5164948 0.4835052)
## 172) PREVAILING_WAGE>=63838.5 150 30 FALSE (0.8000000 0.2000000) *
## 173) PREVAILING_WAGE< 63838.5 5670 2784 FALSE (0.5089947 0.4910053)
## 346) STATE=COLORADO,CONNECTICUT,DISTRICT OF COLUMBIA,MICHIGAN,NA,OKLAHOMA,WEST VIRGINIA 933 375 FALSE (0.5980707 0.4019293) *
## 347) STATE=HAWAII,IOWA,MARYLAND,NEW HAMPSHIRE,NEW YORK,TENNESSEE,TEXAS,WISCONSIN 4737 2328 TRUE (0.4914503 0.5085497)
## 694) SOC_NAME_short=ADVERT,EDUCAT,FOOD S,MEETIN,NATURA,PUBLIC,SOCIAL 341 121 FALSE (0.6451613 0.3548387) *
## 695) SOC_NAME_short=BUDGET,COMPEN,CONSTR,FINANC,FUNDRA,GENERA,HUMAN ,INSURA,LOGIST,MARKET,PERSON,PURCHA,TRAINI 4396 2108 TRUE (0.4795268 0.5204732)
## 1390) PREVAILING_WAGE< 60901 3820 1897 TRUE (0.4965969 0.5034031)
## 2780) PREVAILING_WAGE>=59371 322 51 FALSE (0.8416149 0.1583851) *
## 2781) PREVAILING_WAGE< 59371 3498 1626 TRUE (0.4648370 0.5351630)
## 5562) PREVAILING_WAGE< 44844.8 869 387 FALSE (0.5546605 0.4453395)
## 11124) PREVAILING_WAGE>=41997.5 269 58 FALSE (0.7843866 0.2156134) *
## 11125) PREVAILING_WAGE< 41997.5 600 271 TRUE (0.4516667 0.5483333) *
## 5563) PREVAILING_WAGE>=44844.8 2629 1144 TRUE (0.4351464 0.5648536) *
## 1391) PREVAILING_WAGE>=60901 576 211 TRUE (0.3663194 0.6336806) *
## 87) PREVAILING_WAGE>=64271.9 1336 510 TRUE (0.3817365 0.6182635) *
## 11) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,FLORIDA,GEORGIA,ILLINOIS,KANSAS,KENTUCKY,MAINE,MASSACHUSETTS,MISSOURI,MONTANA,NEBRASKA,NEVADA,NEW JERSEY,NEW MEXICO,NORTH CAROLINA,OHIO,PENNSYLVANIA,PUERTO RICO,SOUTH CAROLINA,UTAH,VIRGINIA,WASHINGTON,WYOMING 17992 8226 TRUE (0.4572032 0.5427968)
## 22) STATE=ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,FLORIDA,GEORGIA,ILLINOIS,KANSAS,KENTUCKY,MASSACHUSETTS,NEVADA,NEW JERSEY,NORTH CAROLINA,OHIO,UTAH 15598 7347 TRUE (0.4710219 0.5289781)
## 44) PREVAILING_WAGE< 39339 1065 460 FALSE (0.5680751 0.4319249) *
## 45) PREVAILING_WAGE>=39339 14533 6742 TRUE (0.4639097 0.5360903) *
## 23) STATE=ALABAMA,MAINE,MISSOURI,MONTANA,NEBRASKA,NEW MEXICO,PENNSYLVANIA,PUERTO RICO,SOUTH CAROLINA,VIRGINIA,WASHINGTON,WYOMING 2394 879 TRUE (0.3671679 0.6328321) *
## 3) SOC_NAME_short=ACCOUN,ARCHIT,AUDITO,BUYERS,CLAIMS,CLINIC,COST E,CREDIT,DATABA,EMERGE,FARMER,FITNES,GAMING,INDUST,LABOR ,LOAN C,LOAN O,LODGIN,MANAGE,MANGEM,MEDICA,REGULA,SALES ,SUSTAI,TAX EX,TRANSP,VICE P,WHOLES 28031 9904 TRUE (0.3533231 0.6466769)
## 6) SOC_NAME_short=ARCHIT,COST E,CREDIT,FARMER,INDUST,LOAN O,MEDICA,REGULA,SALES ,WHOLES 5254 2258 TRUE (0.4297678 0.5702322)
## 12) STATE=COLORADO,HAWAII,IOWA,KENTUCKY,MASSACHUSETTS,MINNESOTA,NA,NEW HAMPSHIRE,NORTH CAROLINA,PUERTO RICO,WEST VIRGINIA 527 212 FALSE (0.5977230 0.4022770) *
## 13) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,FLORIDA,GEORGIA,IDAHO,ILLINOIS,INDIANA,KANSAS,LOUISIANA,MAINE,MARYLAND,MICHIGAN,MISSISSIPPI,MISSOURI,NEBRASKA,NEVADA,NEW JERSEY,NEW MEXICO,NEW YORK,NORTH DAKOTA,OHIO,OKLAHOMA,OREGON,PENNSYLVANIA,RHODE ISLAND,SOUTH CAROLINA,TENNESSEE,TEXAS,UTAH,VERMONT,VIRGINIA,WASHINGTON,WISCONSIN,WYOMING 4727 1943 TRUE (0.4110429 0.5889571) *
## 7) SOC_NAME_short=ACCOUN,AUDITO,BUYERS,CLAIMS,CLINIC,DATABA,EMERGE,FITNES,GAMING,LABOR ,LOAN C,LODGIN,MANAGE,MANGEM,SUSTAI,TAX EX,TRANSP,VICE P 22777 7646 TRUE (0.3356895 0.6643105) *
# Plot the deeper tree with the same display options as before
# (counts under each node, no abbreviation of names).
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
The larger tree does not seem much more helpful: it is too big to plot legibly. Try a different target instead.
# Distribution of the prevailing wage, used to pick a cut-off for a new target.
summary(h1b[["PREVAILING_WAGE"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 55370 68848 77767 92976 960000 19
# New target: TRUE when the prevailing wage is below 68,000 (close to the
# median reported above); rows with a missing wage get NA.
h1b_use[["LOW_WAGE"]] <- factor(h1b[["PREVAILING_WAGE"]] < 68000)
# Every other column, including PREVAILING_WAGE itself, is a predictor here.
tree_default <- rpart(formula = LOW_WAGE ~ ., data = h1b_use)
print(tree_default)
## n=99981 (19 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 99981 48993 FALSE (0.5099769 0.4900231)
## 2) PREVAILING_WAGE>=67997.6 50988 0 FALSE (1.0000000 0.0000000) *
## 3) PREVAILING_WAGE< 67997.6 48993 0 TRUE (0.0000000 1.0000000) *
# Plot the LOW_WAGE tree (counts under each node, full names).
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)
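The algorithm is not stupid! LOW_WAGE was computed directly from PREVAILING_WAGE, so a single split on that variable predicts it perfectly. Remove PREVAILING_WAGE from the predictors and try again.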
# Drop the wage column so the tree can no longer read the target off it,
# then refit with the remaining columns as predictors.
h1b_use[["PREVAILING_WAGE"]] <- NULL
tree_default <- rpart(formula = LOW_WAGE ~ ., data = h1b_use)
print(tree_default)
## n=99981 (19 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 99981 48993 FALSE (0.50997690 0.49002310)
## 2) FULL_TIME_POSITION=Y 48412 0 FALSE (1.00000000 0.00000000) *
## 3) FULL_TIME_POSITION=N 51569 2576 TRUE (0.04995249 0.95004751) *
# Plot the tree fitted without the wage column.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)
# Grow a much deeper tree for the same target by lowering cp to 0.0001.
tree_default <- rpart(
  formula = LOW_WAGE ~ .,
  data = h1b_use,
  control = rpart.control(cp = 0.0001)
)
print(tree_default)
## n=99981 (19 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 99981 48993 FALSE (0.50997690 0.49002310)
## 2) FULL_TIME_POSITION=Y 48412 0 FALSE (1.00000000 0.00000000) *
## 3) FULL_TIME_POSITION=N 51569 2576 TRUE (0.04995249 0.95004751)
## 6) STATE=CALIFORNIA,DELAWARE,IDAHO,ILLINOIS,IOWA,KANSAS,MAINE,MINNESOTA,MONTANA,NORTH CAROLINA,TENNESSEE,WASHINGTON,WISCONSIN 15774 1663 TRUE (0.10542665 0.89457335)
## 12) SOC_NAME_short=COMPUT,GENERA,MANAGE,NATURA,PUBLIC 5850 1321 TRUE (0.22581197 0.77418803)
## 24) STATE=WASHINGTON 365 115 FALSE (0.68493151 0.31506849)
## 48) SOC_NAME_short=COMPUT 274 31 FALSE (0.88686131 0.11313869) *
## 49) SOC_NAME_short=GENERA,MANAGE 91 7 TRUE (0.07692308 0.92307692) *
## 25) STATE=CALIFORNIA,DELAWARE,IDAHO,ILLINOIS,IOWA,KANSAS,MAINE,MINNESOTA,MONTANA,NORTH CAROLINA,TENNESSEE,WISCONSIN 5485 1071 TRUE (0.19525980 0.80474020) *
## 13) SOC_NAME_short=ACCOUN,ADMINI,ADVERT,AGENTS,APPLIC,ARCHIT,AUDITO,BUDGET,BUSINE,BUYERS,CHIEF ,CLAIMS,COMPEN,COMPLI,CONSTR,COST E,CREDIT,DATA W,DATABA,EDUCAT,EMERGE,FARMER,FINANC,FOOD S,FUNDRA,HUMAN ,INDUST,INFORM,INSURA,LABOR ,LOAN O,LODGIN,LOGIST,MARKET,MEDICA,MEETIN,NETWOR,PERSON,PROPER,PURCHA,QUALIT,REGULA,SALES ,SOCIAL,SOFTWA,SUPPLY,TAX PR,TRAINI,TRANSP,TREASU,WHOLES 9924 342 TRUE (0.03446191 0.96553809) *
## 7) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,COLORADO,CONNECTICUT,DISTRICT OF COLUMBIA,FLORIDA,GEORGIA,HAWAII,INDIANA,KENTUCKY,LOUISIANA,MARYLAND,MASSACHUSETTS,MICHIGAN,MISSISSIPPI,MISSOURI,NA,NEBRASKA,NEVADA,NEW HAMPSHIRE,NEW JERSEY,NEW MEXICO,NEW YORK,NORTH DAKOTA,OHIO,OKLAHOMA,OREGON,PENNSYLVANIA,PUERTO RICO,RHODE ISLAND,SOUTH CAROLINA,SOUTH DAKOTA,TEXAS,UTAH,VERMONT,VIRGINIA,WEST VIRGINIA,WYOMING 35795 913 TRUE (0.02550636 0.97449364)
## 14) SOC_NAME_short=APPLIC,GEOGRA,MANGEM 42 0 FALSE (1.00000000 0.00000000) *
## 15) SOC_NAME_short=ACCOUN,ADMINI,ADVERT,AGENTS,APPRAI,ARCHIT,AUDITO,BIOCHE,BUDGET,BUSINE,BUYERS,CHIEF ,CLAIMS,CLINIC,COMPEN,COMPLI,COMPUT,CONSTR,COST E,CREDIT,DATABA,EDUCAT,EMERGE,ENVIRO,FARMER,FINANC,FITNES,FOOD S,FRAUD ,FUNDRA,GAMING,GENERA,HUMAN ,INDUST,INSURA,LABOR ,LOAN C,LOAN O,LODGIN,LOGIST,MANAGE,MANAGM,MARKET,MEDICA,MEETIN,NATURA,NETWOR,PERSON,PROPER,PUBLIC,PURCHA,REGULA,RISK M,SALES ,SOCIAL,SOFTWA,SUPPLY,SUSTAI,TAX EX,TAX PR,TRAINI,TRANSP,WHOLES 35753 871 TRUE (0.02436159 0.97563841) *
# Plot the deeper LOW_WAGE tree (counts under each node, full names).
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)