Read Data

#h1b <- read.csv(unz("h-1b-visa.zip", filename="h1b_kaggle.csv"))

Read only the first 100,000 rows to keep loading fast.

# Load the H-1B applications directly from the zip archive, reading only the
# first 100,000 rows.
# stringsAsFactors = TRUE is spelled out because the per-level counts printed
# by summary() further down require factor columns, and read.csv() stopped
# converting strings to factors by default in R 4.0.0.
h1b <- read.csv(
  unz("h-1b-visa.zip", filename = "h1b_kaggle.csv"),
  nrows = 100000,
  stringsAsFactors = TRUE
)
# Rows and columns actually loaded.
dim(h1b)
## [1] 100000     11
# Per-column overview of the loaded data frame.
summary(h1b)
##        X                       CASE_STATUS   
##  Min.   :     1   CERTIFIED          :84522  
##  1st Qu.: 25001   CERTIFIED-WITHDRAWN: 9075  
##  Median : 50000   DENIED             : 2190  
##  Mean   : 50000   WITHDRAWN          : 4213  
##  3rd Qu.: 75000                              
##  Max.   :100000                              
##                                              
##                                          EMPLOYER_NAME  
##  ERNST & YOUNG U.S. LLP                         : 3428  
##  COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482  
##  INFOSYS LIMITED                                : 2317  
##  DELOITTE CONSULTING LLP                        : 1745  
##  DELOITTE & TOUCHE LLP                          : 1146  
##  CAPGEMINI AMERICA INC                          : 1121  
##  (Other)                                        :87761  
##                                                SOC_NAME    
##  COMPUTER SYSTEMS ANALYSTS                         :18600  
##  MANAGEMENT ANALYSTS                               :15010  
##  ACCOUNTANTS AND AUDITORS                          :11204  
##  FINANCIAL ANALYSTS                                : 9605  
##  MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089  
##  (Other)                                           :37491  
##  NA's                                              :    1  
##               JOB_TITLE     FULL_TIME_POSITION PREVAILING_WAGE    
##  BUSINESS ANALYST  : 4246   N:51569            Min.   :        0  
##  ACCOUNTANT        : 2331   Y:48431            1st Qu.:    55370  
##  PROGRAMMER ANALYST: 2096                      Median :    68865  
##  SYSTEMS ANALYST   : 1941                      Mean   :   100643  
##  FINANCIAL ANALYST : 1570                      3rd Qu.:    92976  
##  ASSOCIATE         : 1519                      Max.   :329139200  
##  (Other)           :86297                                         
##       YEAR                           WORKSITE          lon         
##  Min.   :2016   NEW YORK, NEW YORK       :14424   Min.   :-157.86  
##  1st Qu.:2016   SAN FRANCISCO, CALIFORNIA: 3598   1st Qu.:-112.07  
##  Median :2016   HOUSTON, TEXAS           : 3038   Median : -84.39  
##  Mean   :2016   CHICAGO, ILLINOIS        : 2780   Mean   : -91.22  
##  3rd Qu.:2016   LOS ANGELES, CALIFORNIA  : 1850   3rd Qu.: -74.08  
##  Max.   :2016   ATLANTA, GEORGIA         : 1719   Max.   : 145.73  
##                 (Other)                  :72591   NA's   :3170     
##       lat       
##  Min.   :13.44  
##  1st Qu.:34.15  
##  Median :39.64  
##  Mean   :38.09  
##  3rd Qu.:40.73  
##  Max.   :64.84  
##  NA's   :3170

Look at Wage

# Five-number summary (plus mean) of the prevailing wage column.
summary(h1b[["PREVAILING_WAGE"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0     55370     68865    100643     92976 329139200

Some wages look implausibly high (above one million):

# Row positions whose prevailing wage exceeds one million.
which(h1b[["PREVAILING_WAGE"]] > 1e6)
##  [1]   398  1702  5580 10955 16046 19773 24145 25929 31202 34846 59538
## [12] 64831 74508 77278 80398 89217 94020 94652 99315
# Full records for the rows whose prevailing wage exceeds one million.
h1b[which(h1b[["PREVAILING_WAGE"]] > 1e6), ]
##           X CASE_STATUS                          EMPLOYER_NAME
## 398     398      DENIED DON IGNACIO CULINARY ARTS SCHOOL, INC.
## 1702   1702      DENIED                                   GVHB
## 5580   5580      DENIED                     E AND D MEDIA INC.
## 10955 10955      DENIED             TERRALOGIC SOLUTIONS, INC.
## 16046 16046      DENIED                     ASTEC AMERICA, LLC
## 19773 19773      DENIED                             3M COMPANY
## 24145 24145      DENIED               LEVEL CONSTRUCTION, INC.
## 25929 25929      DENIED                            CELINE INC.
## 31202 31202      DENIED          TECH MAHINDRA (AMERICAS),INC.
## 34846 34846   WITHDRAWN                    SLRI SOLUTIONS, LLC
## 59538 59538      DENIED                        DMS AMERICA LLC
## 64831 64831      DENIED                 G.C. CONSULTANTS, INC.
## 74508 74508      DENIED                DELOITTE CONSULTING LLP
## 77278 77278      DENIED              PANTHEON VENTURES (US) LP
## 80398 80398      DENIED             CARNEGIE MELLON UNIVERSITY
## 89217 89217      DENIED      BAPTIST HEALTH SOUTH FLORIDA INC.
## 94020 94020      DENIED                    SIERRA INFOSYS, INC
## 94652 94652      DENIED                       MINDTREE LIMITED
## 99315 99315      DENIED                               KPMG LLP
##                                           SOC_NAME
## 398                               CHIEF EXECUTIVES
## 1702               GENERAL AND OPERATIONS MANAGERS
## 5580                            MARKETING MANAGERS
## 10955    COMPUTER AND INFORMATION SYSTEMS MANAGERS
## 16046             QUALITY CONTROL SYSTEMS MANAGERS
## 19773       ARCHITECTURAL AND ENGINEERING MANAGERS
## 24145                              COST ESTIMATORS
## 25929                  HUMAN RESOURCES SPECIALISTS
## 31202                          MANAGEMENT ANALYSTS
## 34846                          MANAGEMENT ANALYSTS
## 59538                     ACCOUNTANTS AND AUDITORS
## 64831                                  ACCOUNTANTS
## 74508                           FINANCIAL ANALYSTS
## 77278             FINANCIAL SPECIALISTS, ALL OTHER
## 80398 COMPUTER AND INFORMATION RESEARCH SCIENTISTS
## 89217                    COMPUTER SYSTEMS ANALYSTS
## 94020                    COMPUTER SYSTEMS ANALYSTS
## 94652                    COMPUTER SYSTEMS ANALYSTS
## 99315                    COMPUTER SYSTEMS ANALYSTS
##                                   JOB_TITLE FULL_TIME_POSITION
## 398   PRESIDENT AND CHIEF EXECUTIVE OFFICER                  Y
## 1702                     PROPOSED JOB TITLE                  Y
## 5580     DIRECTOR, SOCIAL AND DIGITAL MEDIA                  Y
## 10955                            QA MANAGER                  Y
## 16046              PRODUCT SUPPORT ENGINEER                  Y
## 19773               GLOBAL BUSINESS MANAGER                  Y
## 24145                             ESTIMATOR                  Y
## 25929            HUMAN RESOURCES GENERALIST                  Y
## 31202                    MANAGEMENT ANALYST                  Y
## 34846                      BUSINESS ANALYST                  Y
## 59538                            ACCOUNTANT                  Y
## 64831                            ACCOUNTANT                  Y
## 74508                            CONSULTANT                  Y
## 77278                  INVESTMENT ASSOCIATE                  Y
## 80398                    RESEARCH ASSOCIATE                  Y
## 89217                    ERP BI DEVELOPER 3                  Y
## 94020              BUSINESS SYSTEMS ANALYST                  Y
## 94652                      BUSINESS ANALYST                  Y
## 99315                      SENIOR ASSOCIATE                  Y
##       PREVAILING_WAGE YEAR                   WORKSITE        lon      lat
## 398           2648400 2016             DORAL, FLORIDA  -80.35533 25.81954
## 1702         72800000 2016           CITY, CALIFORNIA         NA       NA
## 5580        329139200 2016   SANTA MONICA, CALIFORNIA -118.49119 34.01945
## 10955       259496640 2016       SAN JOSE, CALIFORNIA -121.88633 37.33821
## 16046         1016748 2016 MARLBOROUGH, MASSACHUSETTS  -71.55229 42.34593
## 19773       251796480 2016        ST. PAUL, MINNESOTA  -93.08996 44.95370
## 24145        88431200 2016  HARWOOD HEIGHTS, ILLINOIS         NA       NA
## 25929         3081468 2016         NEW YORK, NEW YORK  -74.00594 40.71278
## 31202       128970400 2016         ST LOUIS, MISSOURI  -90.19940 38.62700
## 34846       115385920 2016             HOUSTON, TEXAS  -95.36980 29.76043
## 59538        88647520 2016             MIAMI, FLORIDA  -80.19179 25.76168
## 64831       119192320 2016  SAN FRANCISCO, CALIFORNIA -122.41942 37.77493
## 74508       136757920 2016         EDISON, NEW JERSEY  -74.41210 40.51872
## 77278       182185120 2016         NEW YORK, NEW YORK  -74.00594 40.71278
## 80398         6044480 2016   PITTSBURGH, PENNSYLVANIA  -79.99589 40.44062
## 89217       115211200 2016      CORAL GABLES, FLORIDA  -80.26838 25.72149
## 94020       114000640 2016               BOISE, IDAHO -116.21461 43.61871
## 94652       122048160 2016          CHANDLER, ARIZONA -111.84125 33.30616
## 99315       152201920 2016              DALLAS, TEXAS  -96.79699 32.77666
# Mark every wage above one million as missing so the implausibly large
# values no longer dominate the distribution.
is.na(h1b$PREVAILING_WAGE) <- which(h1b$PREVAILING_WAGE > 1e6)

# Distribution of the remaining wages.
hist(h1b$PREVAILING_WAGE, breaks = 100)

Look at Wage by status

# Number of applications in each case status.
summary(h1b[["CASE_STATUS"]])
##           CERTIFIED CERTIFIED-WITHDRAWN              DENIED 
##               84522                9075                2190 
##           WITHDRAWN 
##                4213
# Overlay the wage distribution of denied applications (red) on top of the
# distribution of certified applications (green).
# NOTE(review): the two layers request different numbers of breaks, so their
# bin widths may differ and bar heights may not be directly comparable --
# confirm this is intended.
hist(
  h1b$PREVAILING_WAGE[h1b$CASE_STATUS == "CERTIFIED"],
  breaks = 50, col = "green", main = "",
  ylab = "Number of Applications", xlab = "Income"
)
hist(
  h1b$PREVAILING_WAGE[h1b$CASE_STATUS == "DENIED"],
  breaks = 100, col = "red", add = TRUE
)

# The green layer holds certified applications only (not all applications),
# so the legend names it accordingly.
legend(
  "topright",
  legend = c("Certified", "Denied"),
  col = c("green", "red"), pch = 15
)

Simple Cross-tabulation

Use a regular expression to parse the state name

# Keep only the text after the last ", " of WORKSITE, i.e. the state name.
h1b[["STATE"]] <- sub(
  pattern = ".*, (.*)",
  replacement = "\\1",
  x = h1b$WORKSITE
)
# Applications per state.
# NOTE(review): the tabulation contains a literal "NA" state -- presumably
# worksites with an unknown state; confirm whether they should be excluded.
table(h1b$STATE)
## 
##              ALABAMA               ALASKA              ARIZONA 
##                  310                   49                  990 
##             ARKANSAS           CALIFORNIA             COLORADO 
##                  415                20479                  858 
##          CONNECTICUT             DELAWARE DISTRICT OF COLUMBIA 
##                 1472                  461                  825 
##              FLORIDA              GEORGIA               HAWAII 
##                 3949                 3005                  150 
##                IDAHO             ILLINOIS              INDIANA 
##                   92                 5246                  801 
##                 IOWA               KANSAS             KENTUCKY 
##                  330                  250                  303 
##            LOUISIANA                MAINE             MARYLAND 
##                  190                   86                 1421 
##        MASSACHUSETTS             MICHIGAN            MINNESOTA 
##                 3815                 1851                 1185 
##          MISSISSIPPI             MISSOURI              MONTANA 
##                   78                  804                   30 
##                   NA             NEBRASKA               NEVADA 
##                  211                  172                  363 
##        NEW HAMPSHIRE           NEW JERSEY           NEW MEXICO 
##                  244                 5905                   92 
##             NEW YORK       NORTH CAROLINA         NORTH DAKOTA 
##                17736                 2118                   42 
##                 OHIO             OKLAHOMA               OREGON 
##                 1864                  286                  489 
##         PENNSYLVANIA          PUERTO RICO         RHODE ISLAND 
##                 3368                   66                  266 
##       SOUTH CAROLINA         SOUTH DAKOTA            TENNESSEE 
##                  384                   33                  857 
##                TEXAS                 UTAH              VERMONT 
##                 8914                  564                   53 
##             VIRGINIA           WASHINGTON        WEST VIRGINIA 
##                 2651                 2929                   67 
##            WISCONSIN              WYOMING 
##                  859                   22
# Cross-tabulate state (rows) against case status (columns).
tbl <- table(h1b[["STATE"]], h1b[["CASE_STATUS"]])
# Preview the first rows.
head(tbl, n = 6L)
##             
##              CERTIFIED CERTIFIED-WITHDRAWN DENIED WITHDRAWN
##   ALABAMA          267                  26      7        10
##   ALASKA            40                   5      3         1
##   ARIZONA          865                  76     17        32
##   ARKANSAS         352                  44      5        14
##   CALIFORNIA     17235                1950    484       810
##   COLORADO         673                 112     25        48

What are the most successful states?

# Total number of applications per state.
rs <- rowSums(tbl)

# Divide each state's row by its total to get the share of every status.
tbl_relative <- sweep(tbl, MARGIN = 1, STATS = rs, FUN = "/")
head(tbl_relative, n = 6L)
##             
##               CERTIFIED CERTIFIED-WITHDRAWN     DENIED  WITHDRAWN
##   ALABAMA    0.86129032          0.08387097 0.02258065 0.03225806
##   ALASKA     0.81632653          0.10204082 0.06122449 0.02040816
##   ARIZONA    0.87373737          0.07676768 0.01717172 0.03232323
##   ARKANSAS   0.84819277          0.10602410 0.01204819 0.03373494
##   CALIFORNIA 0.84159383          0.09521949 0.02363397 0.03955271
##   COLORADO   0.78438228          0.13053613 0.02913753 0.05594406
# Rank the states from the highest to the lowest share of certified
# applications.
tbl_relative <- tbl_relative[
  order(tbl_relative[, "CERTIFIED"], decreasing = TRUE),
]

print(tbl_relative)
##                       
##                          CERTIFIED CERTIFIED-WITHDRAWN      DENIED
##   PENNSYLVANIA         0.886876485         0.065617577 0.010688836
##   WASHINGTON           0.876066917         0.083304882 0.013997952
##   OKLAHOMA             0.874125874         0.066433566 0.027972028
##   ARIZONA              0.873737374         0.076767677 0.017171717
##   VERMONT              0.867924528         0.056603774 0.037735849
##   KANSAS               0.864000000         0.068000000 0.016000000
##   MASSACHUSETTS        0.863433814         0.079161206 0.020183486
##   KENTUCKY             0.861386139         0.059405941 0.029702970
##   ALABAMA              0.861290323         0.083870968 0.022580645
##   GEORGIA              0.856239601         0.086522463 0.021297837
##   NEW YORK             0.855209743         0.084517366 0.023060442
##   FLORIDA              0.849835401         0.074702456 0.034692327
##   ILLINOIS             0.849409074         0.085589020 0.018109035
##   ARKANSAS             0.848192771         0.106024096 0.012048193
##   CONNECTICUT          0.847826087         0.095108696 0.020380435
##   NEW MEXICO           0.847826087         0.076086957 0.010869565
##   MISSOURI             0.845771144         0.060945274 0.026119403
##   OHIO                 0.845493562         0.081545064 0.013948498
##   LOUISIANA            0.842105263         0.073684211 0.031578947
##   CALIFORNIA           0.841593828         0.095219493 0.023633967
##   MINNESOTA            0.841350211         0.084388186 0.018565401
##   UTAH                 0.840425532         0.085106383 0.042553191
##   MICHIGAN             0.840086440         0.099945975 0.017287952
##   TEXAS                0.839578192         0.104330267 0.020529504
##   MARYLAND             0.838845883         0.085855032 0.022519353
##   VIRGINIA             0.837042625         0.112033195 0.018106375
##   NEW JERSEY           0.836748518         0.092294666 0.016596105
##   TENNESSEE            0.836639440         0.095682614 0.016336056
##   HAWAII               0.833333333         0.066666667 0.086666667
##   IDAHO                0.826086957         0.086956522 0.021739130
##   NORTH CAROLINA       0.823418319         0.122757318 0.016525024
##   DISTRICT OF COLUMBIA 0.823030303         0.104242424 0.035151515
##   NA                   0.819905213         0.000000000 0.123222749
##   RHODE ISLAND         0.819548872         0.090225564 0.026315789
##   ALASKA               0.816326531         0.102040816 0.061224490
##   NEBRASKA             0.813953488         0.110465116 0.011627907
##   WISCONSIN            0.811408615         0.115250291 0.013969732
##   NEVADA               0.809917355         0.035812672 0.052341598
##   SOUTH CAROLINA       0.809895833         0.127604167 0.031250000
##   MAINE                0.802325581         0.104651163 0.034883721
##   IOWA                 0.796969697         0.109090909 0.042424242
##   DELAWARE             0.791757050         0.121475054 0.013015184
##   NEW HAMPSHIRE        0.790983607         0.127049180 0.008196721
##   OREGON               0.787321063         0.126789366 0.053169734
##   COLORADO             0.784382284         0.130536131 0.029137529
##   INDIANA              0.781523096         0.088639201 0.007490637
##   PUERTO RICO          0.757575758         0.000000000 0.015151515
##   MONTANA              0.733333333         0.000000000 0.000000000
##   SOUTH DAKOTA         0.727272727         0.121212121 0.060606061
##   MISSISSIPPI          0.717948718         0.115384615 0.038461538
##   WYOMING              0.681818182         0.090909091 0.090909091
##   NORTH DAKOTA         0.666666667         0.190476190 0.023809524
##   WEST VIRGINIA        0.641791045         0.119402985 0.104477612
##                       
##                          WITHDRAWN
##   PENNSYLVANIA         0.036817102
##   WASHINGTON           0.026630249
##   OKLAHOMA             0.031468531
##   ARIZONA              0.032323232
##   VERMONT              0.037735849
##   KANSAS               0.052000000
##   MASSACHUSETTS        0.037221494
##   KENTUCKY             0.049504950
##   ALABAMA              0.032258065
##   GEORGIA              0.035940100
##   NEW YORK             0.037212449
##   FLORIDA              0.040769815
##   ILLINOIS             0.046892871
##   ARKANSAS             0.033734940
##   CONNECTICUT          0.036684783
##   NEW MEXICO           0.065217391
##   MISSOURI             0.067164179
##   OHIO                 0.059012876
##   LOUISIANA            0.052631579
##   CALIFORNIA           0.039552713
##   MINNESOTA            0.055696203
##   UTAH                 0.031914894
##   MICHIGAN             0.042679633
##   TEXAS                0.035562037
##   MARYLAND             0.052779733
##   VIRGINIA             0.032817805
##   NEW JERSEY           0.054360711
##   TENNESSEE            0.051341890
##   HAWAII               0.013333333
##   IDAHO                0.065217391
##   NORTH CAROLINA       0.037299339
##   DISTRICT OF COLUMBIA 0.037575758
##   NA                   0.056872038
##   RHODE ISLAND         0.063909774
##   ALASKA               0.020408163
##   NEBRASKA             0.063953488
##   WISCONSIN            0.059371362
##   NEVADA               0.101928375
##   SOUTH CAROLINA       0.031250000
##   MAINE                0.058139535
##   IOWA                 0.051515152
##   DELAWARE             0.073752711
##   NEW HAMPSHIRE        0.073770492
##   OREGON               0.032719836
##   COLORADO             0.055944056
##   INDIANA              0.122347066
##   PUERTO RICO          0.227272727
##   MONTANA              0.266666667
##   SOUTH DAKOTA         0.090909091
##   MISSISSIPPI          0.128205128
##   WYOMING              0.136363636
##   NORTH DAKOTA         0.119047619
##   WEST VIRGINIA        0.134328358
# Widen the left margin for the horizontal state labels; par() returns the
# previous settings, which are restored once the plot is drawn.
oldpar <- par(mar = 0.1 + c(5, 9, 4, 2))
# Reverse the ranking so the first (highest-share) state is drawn at the top
# of the horizontal bar chart, and express shares as percentages.
barplot(
  height = 100 * rev(tbl_relative[, "CERTIFIED"]),
  horiz = TRUE,
  las = 2,
  cex.names = 0.6,
  xlim = c(0, 100),
  xlab = "Percentage Certified"
)

par(oldpar)