Read “clean” data

(see code for Project 1)

# Restore the objects saved in the cleaned-data file; the next call relies on
# this bringing the data frame `h1b` into the workspace.
load(file = "h1b_clean.rda")
# Column-by-column overview of the loaded data.
summary(object = h1b)
##        X                       CASE_STATUS   
##  Min.   :     1   CERTIFIED          :84522  
##  1st Qu.: 25001   CERTIFIED-WITHDRAWN: 9075  
##  Median : 50000   DENIED             : 2190  
##  Mean   : 50000   WITHDRAWN          : 4213  
##  3rd Qu.: 75000                              
##  Max.   :100000                              
##                                              
##                                          EMPLOYER_NAME  
##  ERNST & YOUNG U.S. LLP                         : 3428  
##  COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482  
##  INFOSYS LIMITED                                : 2317  
##  DELOITTE CONSULTING LLP                        : 1745  
##  DELOITTE & TOUCHE LLP                          : 1146  
##  CAPGEMINI AMERICA INC                          : 1121  
##  (Other)                                        :87761  
##                                                SOC_NAME    
##  COMPUTER SYSTEMS ANALYSTS                         :18600  
##  MANAGEMENT ANALYSTS                               :15010  
##  ACCOUNTANTS AND AUDITORS                          :11204  
##  FINANCIAL ANALYSTS                                : 9605  
##  MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089  
##  (Other)                                           :37491  
##  NA's                                              :    1  
##               JOB_TITLE     FULL_TIME_POSITION PREVAILING_WAGE 
##  BUSINESS ANALYST  : 4246   N:51569            Min.   :     0  
##  ACCOUNTANT        : 2331   Y:48431            1st Qu.: 55370  
##  PROGRAMMER ANALYST: 2096                      Median : 68848  
##  SYSTEMS ANALYST   : 1941                      Mean   : 77767  
##  FINANCIAL ANALYST : 1570                      3rd Qu.: 92976  
##  ASSOCIATE         : 1519                      Max.   :960000  
##  (Other)           :86297                      NA's   :19      
##       YEAR                           WORKSITE          lon         
##  Min.   :2016   NEW YORK, NEW YORK       :14424   Min.   :-157.86  
##  1st Qu.:2016   SAN FRANCISCO, CALIFORNIA: 3598   1st Qu.:-112.07  
##  Median :2016   HOUSTON, TEXAS           : 3038   Median : -84.39  
##  Mean   :2016   CHICAGO, ILLINOIS        : 2780   Mean   : -91.22  
##  3rd Qu.:2016   LOS ANGELES, CALIFORNIA  : 1850   3rd Qu.: -74.08  
##  Max.   :2016   ATLANTA, GEORGIA         : 1719   Max.   : 145.73  
##                 (Other)                  :72591   NA's   :3170     
##       lat           STATE          
##  Min.   :13.44   Length:100000     
##  1st Qu.:34.15   Class :character  
##  Median :39.64   Mode  :character  
##  Mean   :38.09                     
##  3rd Qu.:40.73                     
##  Max.   :64.84                     
##  NA's   :3170

Prepare data

Get rid of row number

# Assigning NULL removes the row-number column `X` from the data frame.
h1b[["X"]] <- NULL

Make state a factor

# STATE is stored as character (see the summary above); convert it to a
# factor in place.
h1b[["STATE"]] <- factor(h1b[["STATE"]])

Create a logical variable for certified

# TRUE only for the exact status "CERTIFIED"; every other status
# (including "CERTIFIED-WITHDRAWN") becomes FALSE.
h1b[["CERTIFIED"]] <- h1b[["CASE_STATUS"]] == "CERTIFIED"

Fix SOC_NAME somewhat (take the first 6 letters to reduce the number of different values)

# Collapse occupation names to their first six characters and store the
# result as a new factor column.
h1b$SOC_NAME_short <- factor(substr(h1b$SOC_NAME, start = 1, stop = 6))
# Number of distinct occupation names before shortening.
nlevels(h1b$SOC_NAME)
## [1] 199
# Number of distinct values left after shortening to six characters.
nlevels(h1b$SOC_NAME_short)
## [1] 73
# Re-check the data after the preparation steps (X removed, STATE as factor,
# CERTIFIED and SOC_NAME_short added).
summary(h1b)
##               CASE_STATUS   
##  CERTIFIED          :84522  
##  CERTIFIED-WITHDRAWN: 9075  
##  DENIED             : 2190  
##  WITHDRAWN          : 4213  
##                             
##                             
##                             
##                                          EMPLOYER_NAME  
##  ERNST & YOUNG U.S. LLP                         : 3428  
##  COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482  
##  INFOSYS LIMITED                                : 2317  
##  DELOITTE CONSULTING LLP                        : 1745  
##  DELOITTE & TOUCHE LLP                          : 1146  
##  CAPGEMINI AMERICA INC                          : 1121  
##  (Other)                                        :87761  
##                                                SOC_NAME    
##  COMPUTER SYSTEMS ANALYSTS                         :18600  
##  MANAGEMENT ANALYSTS                               :15010  
##  ACCOUNTANTS AND AUDITORS                          :11204  
##  FINANCIAL ANALYSTS                                : 9605  
##  MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089  
##  (Other)                                           :37491  
##  NA's                                              :    1  
##               JOB_TITLE     FULL_TIME_POSITION PREVAILING_WAGE 
##  BUSINESS ANALYST  : 4246   N:51569            Min.   :     0  
##  ACCOUNTANT        : 2331   Y:48431            1st Qu.: 55370  
##  PROGRAMMER ANALYST: 2096                      Median : 68848  
##  SYSTEMS ANALYST   : 1941                      Mean   : 77767  
##  FINANCIAL ANALYST : 1570                      3rd Qu.: 92976  
##  ASSOCIATE         : 1519                      Max.   :960000  
##  (Other)           :86297                      NA's   :19      
##       YEAR                           WORKSITE          lon         
##  Min.   :2016   NEW YORK, NEW YORK       :14424   Min.   :-157.86  
##  1st Qu.:2016   SAN FRANCISCO, CALIFORNIA: 3598   1st Qu.:-112.07  
##  Median :2016   HOUSTON, TEXAS           : 3038   Median : -84.39  
##  Mean   :2016   CHICAGO, ILLINOIS        : 2780   Mean   : -91.22  
##  3rd Qu.:2016   LOS ANGELES, CALIFORNIA  : 1850   3rd Qu.: -74.08  
##  Max.   :2016   ATLANTA, GEORGIA         : 1719   Max.   : 145.73  
##                 (Other)                  :72591   NA's   :3170     
##       lat               STATE       CERTIFIED       SOC_NAME_short 
##  Min.   :13.44   CALIFORNIA:20479   Mode :logical   COMPUT :27561  
##  1st Qu.:34.15   NEW YORK  :17736   FALSE:15478     MANAGE :16025  
##  Median :39.64   TEXAS     : 8914   TRUE :84522     FINANC :13190  
##  Mean   :38.09   NEW JERSEY: 5905                   ACCOUN :11256  
##  3rd Qu.:40.73   ILLINOIS  : 5246                   MARKET :11251  
##  Max.   :64.84   FLORIDA   : 3949                   (Other):20716  
##  NA's   :3170    (Other)   :37771                   NA's   :    1

Create a simple classification model for certification

# Keep only the predictors and the class label used for modelling; all rows
# are retained, in their original order.
h1b_use <- h1b[c(
  "FULL_TIME_POSITION",
  "PREVAILING_WAGE",
  "STATE",
  "SOC_NAME_short",
  "CERTIFIED"
)]

# The label is still a logical vector at this point.
str(h1b_use$CERTIFIED)
##  logi [1:100000] FALSE FALSE FALSE FALSE FALSE FALSE ...

The class variable needs to be a factor for classification!

# Turn the logical label into a two-level factor ("FALSE", "TRUE").
h1b_use$CERTIFIED <- as.factor(h1b_use$CERTIFIED)
# Confirm the conversion.
str(h1b_use$CERTIFIED)
##  Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
library("rpart")
# Fit a classification tree for CERTIFIED on all remaining columns, using
# rpart's default settings, then print the fitted tree.
tree_default <- rpart(formula = CERTIFIED ~ ., data = h1b_use)
print(tree_default)
## n= 100000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 100000 15478 TRUE (0.1547800 0.8452200) *
library(rpart.plot)
# Plot the tree. extra = 2 adds correct/total counts per node, under = TRUE
# places that text below the box, and varlen/faclen = 0 keep full variable
# and factor-level names.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)

The dataset is highly imbalanced

# Count rows per class to show how skewed the label is.
table(h1b_use$CERTIFIED)
## 
## FALSE  TRUE 
## 15478 84522

Resample

library(sampling)

# Rebalance the classes: draw 50,000 rows per CERTIFIED level with
# replacement ("srswr"). The FALSE class has fewer than 50,000 rows, so it is
# over-sampled; the TRUE class is sampled down.
#
# The draw is random, so fix the RNG seed first; otherwise the balanced
# sample, and every tree fitted on it, changes on each run.
set.seed(1234)
# NOTE(review): strata() reads `size` in the order the strata occur in the
# data. Both sizes are equal here, so the order does not matter, but confirm
# it before using unequal sizes.
id <- strata(h1b_use, stratanames = "CERTIFIED", size = c(50000, 50000),
             method = "srswr")
# ID_unit holds the selected row indices (repeated when drawn more than once).
h1b_use_balanced <- h1b_use[id$ID_unit, ]
# Verify that both classes now have the same number of rows.
table(h1b_use_balanced$CERTIFIED)
## 
## FALSE  TRUE 
## 50000 50000
# Refit the default tree, this time on the class-balanced resample, and
# print it.
tree_default <- rpart(formula = CERTIFIED ~ ., data = h1b_use_balanced)
print(tree_default)
## n= 100000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 100000 50000 FALSE (0.5000000 0.5000000)  
##    2) SOC_NAME_short=ADMINI,ADVERT,AGENTS,APPLIC,APPRAI,BIOCHE,BUDGET,BUSINE,CHIEF ,COMPEN,COMPLI,COMPUT,CONSTR,EDUCAT,ENGINE,FINANC,FOOD S,FUNDRA,GENERA,GEOGRA,HUMAN ,INFORM,INSURA,INVEST,LOGIST,MANAGM,MARKET,MEETIN,NATURA,NETWOR,PERSON,PROPER,PUBLIC,PURCHA,QUALIT,RISK M,SOCIAL,SOFTWA,SUPPLY,TAX PR,TRAINI,TREASU 71969 31873 FALSE (0.5571288 0.4428712)  
##      4) SOC_NAME_short=ADMINI,AGENTS,APPLIC,APPRAI,BIOCHE,BUSINE,CHIEF ,COMPLI,COMPUT,ENGINE,GEOGRA,INVEST,MANAGM,NETWOR,PROPER,RISK M,SOFTWA,SUPPLY,TAX PR,TREASU 37321 14347 FALSE (0.6155784 0.3844216) *
##      5) SOC_NAME_short=ADVERT,BUDGET,COMPEN,CONSTR,EDUCAT,FINANC,FOOD S,FUNDRA,GENERA,HUMAN ,INFORM,INSURA,LOGIST,MARKET,MEETIN,NATURA,PERSON,PUBLIC,PURCHA,QUALIT,SOCIAL,TRAINI 34648 17122 TRUE (0.4941699 0.5058301)  
##       10) STATE=COLORADO,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,HAWAII,IDAHO,INDIANA,IOWA,LOUISIANA,MARYLAND,MICHIGAN,MINNESOTA,MISSISSIPPI,NA,NEW HAMPSHIRE,NEW YORK,NORTH DAKOTA,OKLAHOMA,OREGON,RHODE ISLAND,SOUTH DAKOTA,TENNESSEE,TEXAS,VERMONT,WEST VIRGINIA,WISCONSIN 16656  7760 FALSE (0.5341018 0.4658982) *
##       11) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,FLORIDA,GEORGIA,ILLINOIS,KANSAS,KENTUCKY,MAINE,MASSACHUSETTS,MISSOURI,MONTANA,NEBRASKA,NEVADA,NEW JERSEY,NEW MEXICO,NORTH CAROLINA,OHIO,PENNSYLVANIA,PUERTO RICO,SOUTH CAROLINA,UTAH,VIRGINIA,WASHINGTON,WYOMING 17992  8226 TRUE (0.4572032 0.5427968) *
##    3) SOC_NAME_short=ACCOUN,ARCHIT,AUDITO,BUYERS,CLAIMS,CLINIC,COST E,CREDIT,DATABA,EMERGE,FARMER,FITNES,GAMING,INDUST,LABOR ,LOAN C,LOAN O,LODGIN,MANAGE,MANGEM,MEDICA,REGULA,SALES ,SUSTAI,TAX EX,TRANSP,VICE P,WHOLES 28031  9904 TRUE (0.3533231 0.6466769) *
# Plot the tree fitted on the balanced data, with per-node correct/total
# counts shown under each box and full variable/level names.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)

Make a larger tree

# Grow a larger tree on the balanced data by passing a small complexity
# parameter (cp = 0.001), so splits with smaller improvements are kept.
# Note that the result overwrites `tree_default`.
tree_default <- rpart(
  formula = CERTIFIED ~ .,
  data = h1b_use_balanced,
  control = rpart.control(cp = 0.001)
)
print(tree_default)
## n= 100000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##     1) root 100000 50000 FALSE (0.5000000 0.5000000)  
##       2) SOC_NAME_short=ADMINI,ADVERT,AGENTS,APPLIC,APPRAI,BIOCHE,BUDGET,BUSINE,CHIEF ,COMPEN,COMPLI,COMPUT,CONSTR,EDUCAT,ENGINE,FINANC,FOOD S,FUNDRA,GENERA,GEOGRA,HUMAN ,INFORM,INSURA,INVEST,LOGIST,MANAGM,MARKET,MEETIN,NATURA,NETWOR,PERSON,PROPER,PUBLIC,PURCHA,QUALIT,RISK M,SOCIAL,SOFTWA,SUPPLY,TAX PR,TRAINI,TREASU 71969 31873 FALSE (0.5571288 0.4428712)  
##         4) SOC_NAME_short=ADMINI,AGENTS,APPLIC,APPRAI,BIOCHE,BUSINE,CHIEF ,COMPLI,COMPUT,ENGINE,GEOGRA,INVEST,MANAGM,NETWOR,PROPER,RISK M,SOFTWA,SUPPLY,TAX PR,TREASU 37321 14347 FALSE (0.6155784 0.3844216)  
##           8) PREVAILING_WAGE< 72935.5 19477  5789 FALSE (0.7027776 0.2972224)  
##            16) STATE=ALABAMA,ALASKA,ARIZONA,CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,ILLINOIS,LOUISIANA,MAINE,MASSACHUSETTS,MINNESOTA,MISSISSIPPI,MISSOURI,MONTANA,NA,NEBRASKA,NEVADA,NEW HAMPSHIRE,NEW JERSEY,NEW MEXICO,NORTH CAROLINA,NORTH DAKOTA,OHIO,OREGON,PUERTO RICO,SOUTH CAROLINA,SOUTH DAKOTA,TENNESSEE,TEXAS,UTAH,VIRGINIA,WEST VIRGINIA,WISCONSIN,WYOMING 13471  3391 FALSE (0.7482741 0.2517259) *
##            17) STATE=ARKANSAS,FLORIDA,GEORGIA,HAWAII,IDAHO,INDIANA,IOWA,KANSAS,KENTUCKY,MARYLAND,MICHIGAN,NEW YORK,OKLAHOMA,PENNSYLVANIA,RHODE ISLAND,VERMONT,WASHINGTON 6006  2398 FALSE (0.6007326 0.3992674)  
##              34) STATE=FLORIDA,GEORGIA,IDAHO,INDIANA,IOWA,MARYLAND,MICHIGAN,NEW YORK,OKLAHOMA,WASHINGTON 4571  1723 FALSE (0.6230584 0.3769416)  
##                68) PREVAILING_WAGE< 61781.5 2565   847 FALSE (0.6697856 0.3302144) *
##                69) PREVAILING_WAGE>=61781.5 2006   876 FALSE (0.5633101 0.4366899)  
##                 138) PREVAILING_WAGE>=62618 1697   649 FALSE (0.6175604 0.3824396) *
##                 139) PREVAILING_WAGE< 62618 309    82 TRUE (0.2653722 0.7346278) *
##              35) STATE=ARKANSAS,HAWAII,KANSAS,KENTUCKY,PENNSYLVANIA,RHODE ISLAND,VERMONT 1435   675 FALSE (0.5296167 0.4703833)  
##                70) PREVAILING_WAGE>=60372 559   191 FALSE (0.6583184 0.3416816) *
##                71) PREVAILING_WAGE< 60372 876   392 TRUE (0.4474886 0.5525114)  
##                 142) PREVAILING_WAGE< 59716.9 593   246 FALSE (0.5851602 0.4148398) *
##                 143) PREVAILING_WAGE>=59716.9 283    45 TRUE (0.1590106 0.8409894) *
##           9) PREVAILING_WAGE>=72935.5 17844  8558 FALSE (0.5203990 0.4796010)  
##            18) STATE=CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,FLORIDA,HAWAII,IDAHO,ILLINOIS,IOWA,LOUISIANA,MAINE,MARYLAND,MICHIGAN,MISSISSIPPI,MISSOURI,MONTANA,NA,NEBRASKA,NEVADA,NEW HAMPSHIRE,NEW JERSEY,NEW MEXICO,NORTH DAKOTA,RHODE ISLAND,SOUTH CAROLINA,UTAH,WEST VIRGINIA,WYOMING 9890  4205 FALSE (0.5748231 0.4251769)  
##              36) PREVAILING_WAGE< 85946.5 3549  1139 FALSE (0.6790645 0.3209355) *
##              37) PREVAILING_WAGE>=85946.5 6341  3066 FALSE (0.5164801 0.4835199)  
##                74) PREVAILING_WAGE< 104062.5 2732  1163 FALSE (0.5743045 0.4256955) *
##                75) PREVAILING_WAGE>=104062.5 3609  1706 TRUE (0.4727071 0.5272929)  
##                 150) STATE=MAINE,NA,NEBRASKA,NEW HAMPSHIRE,NEW MEXICO,WEST VIRGINIA,WYOMING 71    10 FALSE (0.8591549 0.1408451) *
##                 151) STATE=CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,FLORIDA,HAWAII,IDAHO,ILLINOIS,IOWA,MARYLAND,MICHIGAN,MISSOURI,NEVADA,NEW JERSEY,RHODE ISLAND,SOUTH CAROLINA,UTAH 3538  1645 TRUE (0.4649520 0.5350480) *
##            19) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,DISTRICT OF COLUMBIA,GEORGIA,INDIANA,KANSAS,KENTUCKY,MASSACHUSETTS,MINNESOTA,NEW YORK,NORTH CAROLINA,OHIO,OKLAHOMA,OREGON,PENNSYLVANIA,PUERTO RICO,SOUTH DAKOTA,TENNESSEE,TEXAS,VERMONT,VIRGINIA,WASHINGTON,WISCONSIN 7954  3601 TRUE (0.4527282 0.5472718)  
##              38) STATE=ALABAMA,DISTRICT OF COLUMBIA,KENTUCKY,MASSACHUSETTS,NEW YORK,OREGON,PENNSYLVANIA,TENNESSEE,TEXAS,VIRGINIA,WASHINGTON,WISCONSIN 6085  2915 TRUE (0.4790468 0.5209532)  
##                76) PREVAILING_WAGE< 100454 3741  1782 FALSE (0.5236568 0.4763432)  
##                 152) PREVAILING_WAGE>=99777.5 87     9 FALSE (0.8965517 0.1034483) *
##                 153) PREVAILING_WAGE< 99777.5 3654  1773 FALSE (0.5147783 0.4852217)  
##                   306) PREVAILING_WAGE< 84063 2237  1005 FALSE (0.5507376 0.4492624) *
##                   307) PREVAILING_WAGE>=84063 1417   649 TRUE (0.4580099 0.5419901) *
##                77) PREVAILING_WAGE>=100454 2344   956 TRUE (0.4078498 0.5921502)  
##                 154) SOC_NAME_short=BUSINE,CHIEF ,ENGINE,PROPER,TREASU 185    65 FALSE (0.6486486 0.3513514) *
##                 155) SOC_NAME_short=ADMINI,COMPLI,COMPUT,SOFTWA 2159   836 TRUE (0.3872163 0.6127837) *
##              39) STATE=ALASKA,ARIZONA,ARKANSAS,GEORGIA,INDIANA,KANSAS,MINNESOTA,NORTH CAROLINA,OHIO,OKLAHOMA,PUERTO RICO,SOUTH DAKOTA,VERMONT 1869   686 TRUE (0.3670412 0.6329588) *
##         5) SOC_NAME_short=ADVERT,BUDGET,COMPEN,CONSTR,EDUCAT,FINANC,FOOD S,FUNDRA,GENERA,HUMAN ,INFORM,INSURA,LOGIST,MARKET,MEETIN,NATURA,PERSON,PUBLIC,PURCHA,QUALIT,SOCIAL,TRAINI 34648 17122 TRUE (0.4941699 0.5058301)  
##          10) STATE=COLORADO,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,HAWAII,IDAHO,INDIANA,IOWA,LOUISIANA,MARYLAND,MICHIGAN,MINNESOTA,MISSISSIPPI,NA,NEW HAMPSHIRE,NEW YORK,NORTH DAKOTA,OKLAHOMA,OREGON,RHODE ISLAND,SOUTH DAKOTA,TENNESSEE,TEXAS,VERMONT,WEST VIRGINIA,WISCONSIN 16656  7760 FALSE (0.5341018 0.4658982)  
##            20) STATE=DELAWARE,IDAHO,INDIANA,LOUISIANA,MINNESOTA,MISSISSIPPI,NORTH DAKOTA,OREGON,RHODE ISLAND,SOUTH DAKOTA,VERMONT 1305   439 FALSE (0.6636015 0.3363985) *
##            21) STATE=COLORADO,CONNECTICUT,DISTRICT OF COLUMBIA,HAWAII,IOWA,MARYLAND,MICHIGAN,NA,NEW HAMPSHIRE,NEW YORK,OKLAHOMA,TENNESSEE,TEXAS,WEST VIRGINIA,WISCONSIN 15351  7321 FALSE (0.5230930 0.4769070)  
##              42) PREVAILING_WAGE>=65805.5 8195  3681 FALSE (0.5508237 0.4491763) *
##              43) PREVAILING_WAGE< 65805.5 7156  3516 TRUE (0.4913359 0.5086641)  
##                86) PREVAILING_WAGE< 64271.9 5820  2814 FALSE (0.5164948 0.4835052)  
##                 172) PREVAILING_WAGE>=63838.5 150    30 FALSE (0.8000000 0.2000000) *
##                 173) PREVAILING_WAGE< 63838.5 5670  2784 FALSE (0.5089947 0.4910053)  
##                   346) STATE=COLORADO,CONNECTICUT,DISTRICT OF COLUMBIA,MICHIGAN,NA,OKLAHOMA,WEST VIRGINIA 933   375 FALSE (0.5980707 0.4019293) *
##                   347) STATE=HAWAII,IOWA,MARYLAND,NEW HAMPSHIRE,NEW YORK,TENNESSEE,TEXAS,WISCONSIN 4737  2328 TRUE (0.4914503 0.5085497)  
##                     694) SOC_NAME_short=ADVERT,EDUCAT,FOOD S,MEETIN,NATURA,PUBLIC,SOCIAL 341   121 FALSE (0.6451613 0.3548387) *
##                     695) SOC_NAME_short=BUDGET,COMPEN,CONSTR,FINANC,FUNDRA,GENERA,HUMAN ,INSURA,LOGIST,MARKET,PERSON,PURCHA,TRAINI 4396  2108 TRUE (0.4795268 0.5204732)  
##                      1390) PREVAILING_WAGE< 60901 3820  1897 TRUE (0.4965969 0.5034031)  
##                        2780) PREVAILING_WAGE>=59371 322    51 FALSE (0.8416149 0.1583851) *
##                        2781) PREVAILING_WAGE< 59371 3498  1626 TRUE (0.4648370 0.5351630)  
##                          5562) PREVAILING_WAGE< 44844.8 869   387 FALSE (0.5546605 0.4453395)  
##                           11124) PREVAILING_WAGE>=41997.5 269    58 FALSE (0.7843866 0.2156134) *
##                           11125) PREVAILING_WAGE< 41997.5 600   271 TRUE (0.4516667 0.5483333) *
##                          5563) PREVAILING_WAGE>=44844.8 2629  1144 TRUE (0.4351464 0.5648536) *
##                      1391) PREVAILING_WAGE>=60901 576   211 TRUE (0.3663194 0.6336806) *
##                87) PREVAILING_WAGE>=64271.9 1336   510 TRUE (0.3817365 0.6182635) *
##          11) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,FLORIDA,GEORGIA,ILLINOIS,KANSAS,KENTUCKY,MAINE,MASSACHUSETTS,MISSOURI,MONTANA,NEBRASKA,NEVADA,NEW JERSEY,NEW MEXICO,NORTH CAROLINA,OHIO,PENNSYLVANIA,PUERTO RICO,SOUTH CAROLINA,UTAH,VIRGINIA,WASHINGTON,WYOMING 17992  8226 TRUE (0.4572032 0.5427968)  
##            22) STATE=ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,FLORIDA,GEORGIA,ILLINOIS,KANSAS,KENTUCKY,MASSACHUSETTS,NEVADA,NEW JERSEY,NORTH CAROLINA,OHIO,UTAH 15598  7347 TRUE (0.4710219 0.5289781)  
##              44) PREVAILING_WAGE< 39339 1065   460 FALSE (0.5680751 0.4319249) *
##              45) PREVAILING_WAGE>=39339 14533  6742 TRUE (0.4639097 0.5360903) *
##            23) STATE=ALABAMA,MAINE,MISSOURI,MONTANA,NEBRASKA,NEW MEXICO,PENNSYLVANIA,PUERTO RICO,SOUTH CAROLINA,VIRGINIA,WASHINGTON,WYOMING 2394   879 TRUE (0.3671679 0.6328321) *
##       3) SOC_NAME_short=ACCOUN,ARCHIT,AUDITO,BUYERS,CLAIMS,CLINIC,COST E,CREDIT,DATABA,EMERGE,FARMER,FITNES,GAMING,INDUST,LABOR ,LOAN C,LOAN O,LODGIN,MANAGE,MANGEM,MEDICA,REGULA,SALES ,SUSTAI,TAX EX,TRANSP,VICE P,WHOLES 28031  9904 TRUE (0.3533231 0.6466769)  
##         6) SOC_NAME_short=ARCHIT,COST E,CREDIT,FARMER,INDUST,LOAN O,MEDICA,REGULA,SALES ,WHOLES 5254  2258 TRUE (0.4297678 0.5702322)  
##          12) STATE=COLORADO,HAWAII,IOWA,KENTUCKY,MASSACHUSETTS,MINNESOTA,NA,NEW HAMPSHIRE,NORTH CAROLINA,PUERTO RICO,WEST VIRGINIA 527   212 FALSE (0.5977230 0.4022770) *
##          13) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,FLORIDA,GEORGIA,IDAHO,ILLINOIS,INDIANA,KANSAS,LOUISIANA,MAINE,MARYLAND,MICHIGAN,MISSISSIPPI,MISSOURI,NEBRASKA,NEVADA,NEW JERSEY,NEW MEXICO,NEW YORK,NORTH DAKOTA,OHIO,OKLAHOMA,OREGON,PENNSYLVANIA,RHODE ISLAND,SOUTH CAROLINA,TENNESSEE,TEXAS,UTAH,VERMONT,VIRGINIA,WASHINGTON,WISCONSIN,WYOMING 4727  1943 TRUE (0.4110429 0.5889571) *
##         7) SOC_NAME_short=ACCOUN,AUDITO,BUYERS,CLAIMS,CLINIC,DATABA,EMERGE,FITNES,GAMING,LABOR ,LOAN C,LODGIN,MANAGE,MANGEM,SUSTAI,TAX EX,TRANSP,VICE P 22777  7646 TRUE (0.3356895 0.6643105) *
# Plot the larger tree; with this many nodes the labels may overlap (see the
# warning printed below).
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

Does not seem so helpful

Try to predict “Low wage”

# Inspect the wage distribution (including the NA count) before choosing a
# cut-off for the "low wage" label.
summary(h1b$PREVAILING_WAGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   55370   68848   77767   92976  960000      19
# Label rows whose prevailing wage is below 68,000 as low wage. `h1b_use` is
# a column subset of `h1b` with the same rows, so the values line up row by
# row. Rows with a missing wage get an NA label.
h1b_use$LOW_WAGE <- as.factor(h1b$PREVAILING_WAGE < 68000)

# Fit a default tree for the new label. PREVAILING_WAGE is still among the
# predictors at this point.
tree_default <- rpart(formula = LOW_WAGE ~ ., data = h1b_use)
print(tree_default)
## n=99981 (19 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 99981 48993 FALSE (0.5099769 0.4900231)  
##   2) PREVAILING_WAGE>=67997.6 50988     0 FALSE (1.0000000 0.0000000) *
##   3) PREVAILING_WAGE< 67997.6 48993     0 TRUE (0.0000000 1.0000000) *
# Plot the low-wage tree with per-node correct/total counts under each box
# and full variable/level names.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)

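The algorithm is not stupid! It splits on PREVAILING_WAGE, the very variable LOW_WAGE was computed from, so remove that column and fit the tree again.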

# Remove the wage column so the tree can no longer read the label straight
# off the variable it was derived from.
h1b_use[["PREVAILING_WAGE"]] <- NULL

# Refit on the remaining columns (CERTIFIED is still included as a
# predictor) and print the tree.
tree_default <- rpart(formula = LOW_WAGE ~ ., data = h1b_use)
print(tree_default)
## n=99981 (19 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 99981 48993 FALSE (0.50997690 0.49002310)  
##   2) FULL_TIME_POSITION=Y 48412     0 FALSE (1.00000000 0.00000000) *
##   3) FULL_TIME_POSITION=N 51569  2576 TRUE (0.04995249 0.95004751) *
# Plot the tree fitted without the wage column, with per-node correct/total
# counts under each box and full variable/level names.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)

# Grow a deeper low-wage tree by passing a very small complexity parameter
# (cp = 0.0001), then print it.
tree_default <- rpart(
  formula = LOW_WAGE ~ .,
  data = h1b_use,
  control = rpart.control(cp = 0.0001)
)
print(tree_default)
## n=99981 (19 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 99981 48993 FALSE (0.50997690 0.49002310)  
##    2) FULL_TIME_POSITION=Y 48412     0 FALSE (1.00000000 0.00000000) *
##    3) FULL_TIME_POSITION=N 51569  2576 TRUE (0.04995249 0.95004751)  
##      6) STATE=CALIFORNIA,DELAWARE,IDAHO,ILLINOIS,IOWA,KANSAS,MAINE,MINNESOTA,MONTANA,NORTH CAROLINA,TENNESSEE,WASHINGTON,WISCONSIN 15774  1663 TRUE (0.10542665 0.89457335)  
##       12) SOC_NAME_short=COMPUT,GENERA,MANAGE,NATURA,PUBLIC 5850  1321 TRUE (0.22581197 0.77418803)  
##         24) STATE=WASHINGTON 365   115 FALSE (0.68493151 0.31506849)  
##           48) SOC_NAME_short=COMPUT 274    31 FALSE (0.88686131 0.11313869) *
##           49) SOC_NAME_short=GENERA,MANAGE 91     7 TRUE (0.07692308 0.92307692) *
##         25) STATE=CALIFORNIA,DELAWARE,IDAHO,ILLINOIS,IOWA,KANSAS,MAINE,MINNESOTA,MONTANA,NORTH CAROLINA,TENNESSEE,WISCONSIN 5485  1071 TRUE (0.19525980 0.80474020) *
##       13) SOC_NAME_short=ACCOUN,ADMINI,ADVERT,AGENTS,APPLIC,ARCHIT,AUDITO,BUDGET,BUSINE,BUYERS,CHIEF ,CLAIMS,COMPEN,COMPLI,CONSTR,COST E,CREDIT,DATA W,DATABA,EDUCAT,EMERGE,FARMER,FINANC,FOOD S,FUNDRA,HUMAN ,INDUST,INFORM,INSURA,LABOR ,LOAN O,LODGIN,LOGIST,MARKET,MEDICA,MEETIN,NETWOR,PERSON,PROPER,PURCHA,QUALIT,REGULA,SALES ,SOCIAL,SOFTWA,SUPPLY,TAX PR,TRAINI,TRANSP,TREASU,WHOLES 9924   342 TRUE (0.03446191 0.96553809) *
##      7) STATE=ALABAMA,ALASKA,ARIZONA,ARKANSAS,COLORADO,CONNECTICUT,DISTRICT OF COLUMBIA,FLORIDA,GEORGIA,HAWAII,INDIANA,KENTUCKY,LOUISIANA,MARYLAND,MASSACHUSETTS,MICHIGAN,MISSISSIPPI,MISSOURI,NA,NEBRASKA,NEVADA,NEW HAMPSHIRE,NEW JERSEY,NEW MEXICO,NEW YORK,NORTH DAKOTA,OHIO,OKLAHOMA,OREGON,PENNSYLVANIA,PUERTO RICO,RHODE ISLAND,SOUTH CAROLINA,SOUTH DAKOTA,TEXAS,UTAH,VERMONT,VIRGINIA,WEST VIRGINIA,WYOMING 35795   913 TRUE (0.02550636 0.97449364)  
##       14) SOC_NAME_short=APPLIC,GEOGRA,MANGEM 42     0 FALSE (1.00000000 0.00000000) *
##       15) SOC_NAME_short=ACCOUN,ADMINI,ADVERT,AGENTS,APPRAI,ARCHIT,AUDITO,BIOCHE,BUDGET,BUSINE,BUYERS,CHIEF ,CLAIMS,CLINIC,COMPEN,COMPLI,COMPUT,CONSTR,COST E,CREDIT,DATABA,EDUCAT,EMERGE,ENVIRO,FARMER,FINANC,FITNES,FOOD S,FRAUD ,FUNDRA,GAMING,GENERA,HUMAN ,INDUST,INSURA,LABOR ,LOAN C,LOAN O,LODGIN,LOGIST,MANAGE,MANAGM,MARKET,MEDICA,MEETIN,NATURA,NETWOR,PERSON,PROPER,PUBLIC,PURCHA,REGULA,RISK M,SALES ,SOCIAL,SOFTWA,SUPPLY,SUSTAI,TAX EX,TAX PR,TRAINI,TRANSP,WHOLES 35753   871 TRUE (0.02436159 0.97563841) *
# Plot the deeper low-wage tree with per-node correct/total counts under each
# box and full variable/level names.
rpart.plot(
  tree_default,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)