#h1b <- read.csv(unz("h-1b-visa.zip", filename="h1b_kaggle.csv"))
To keep loading fast, I read only the first 100,000 rows.
# Load only the first 100,000 rows of the CSV stored inside the zip archive.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the per-level counts printed by
# the summary() calls further down rely on the text columns being factors.
h1b <- read.csv(
  unz("h-1b-visa.zip", filename = "h1b_kaggle.csv"),
  nrows = 100000,
  stringsAsFactors = TRUE
)
# Number of rows and columns that were actually loaded.
dim(h1b)
## [1] 100000 11
# Per-column overview of the loaded data frame.
summary(h1b)
## X CASE_STATUS
## Min. : 1 CERTIFIED :84522
## 1st Qu.: 25001 CERTIFIED-WITHDRAWN: 9075
## Median : 50000 DENIED : 2190
## Mean : 50000 WITHDRAWN : 4213
## 3rd Qu.: 75000
## Max. :100000
##
## EMPLOYER_NAME
## ERNST & YOUNG U.S. LLP : 3428
## COGNIZANT TECHNOLOGY SOLUTIONS U.S. CORPORATION: 2482
## INFOSYS LIMITED : 2317
## DELOITTE CONSULTING LLP : 1745
## DELOITTE & TOUCHE LLP : 1146
## CAPGEMINI AMERICA INC : 1121
## (Other) :87761
## SOC_NAME
## COMPUTER SYSTEMS ANALYSTS :18600
## MANAGEMENT ANALYSTS :15010
## ACCOUNTANTS AND AUDITORS :11204
## FINANCIAL ANALYSTS : 9605
## MARKET RESEARCH ANALYSTS AND MARKETING SPECIALISTS: 8089
## (Other) :37491
## NA's : 1
## JOB_TITLE FULL_TIME_POSITION PREVAILING_WAGE
## BUSINESS ANALYST : 4246 N:51569 Min. : 0
## ACCOUNTANT : 2331 Y:48431 1st Qu.: 55370
## PROGRAMMER ANALYST: 2096 Median : 68865
## SYSTEMS ANALYST : 1941 Mean : 100643
## FINANCIAL ANALYST : 1570 3rd Qu.: 92976
## ASSOCIATE : 1519 Max. :329139200
## (Other) :86297
## YEAR WORKSITE lon
## Min. :2016 NEW YORK, NEW YORK :14424 Min. :-157.86
## 1st Qu.:2016 SAN FRANCISCO, CALIFORNIA: 3598 1st Qu.:-112.07
## Median :2016 HOUSTON, TEXAS : 3038 Median : -84.39
## Mean :2016 CHICAGO, ILLINOIS : 2780 Mean : -91.22
## 3rd Qu.:2016 LOS ANGELES, CALIFORNIA : 1850 3rd Qu.: -74.08
## Max. :2016 ATLANTA, GEORGIA : 1719 Max. : 145.73
## (Other) :72591 NA's :3170
## lat
## Min. :13.44
## 1st Qu.:34.15
## Median :39.64
## Mean :38.09
## 3rd Qu.:40.73
## Max. :64.84
## NA's :3170
# Quartiles, mean and extremes of the prevailing wage column alone.
summary(h1b$PREVAILING_WAGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 55370 68865 100643 92976 329139200
The maximum prevailing wage (over 329 million) is implausible, so let us look at the entries above one million.
# Row positions whose prevailing wage exceeds one million.
which(h1b[["PREVAILING_WAGE"]] > 1e6)
## [1] 398 1702 5580 10955 16046 19773 24145 25929 31202 34846 59538
## [12] 64831 74508 77278 80398 89217 94020 94652 99315
# Show the complete records for the wages above one million
# (rows with a missing wage are not selected).
subset(h1b, PREVAILING_WAGE > 1e6)
## X CASE_STATUS EMPLOYER_NAME
## 398 398 DENIED DON IGNACIO CULINARY ARTS SCHOOL, INC.
## 1702 1702 DENIED GVHB
## 5580 5580 DENIED E AND D MEDIA INC.
## 10955 10955 DENIED TERRALOGIC SOLUTIONS, INC.
## 16046 16046 DENIED ASTEC AMERICA, LLC
## 19773 19773 DENIED 3M COMPANY
## 24145 24145 DENIED LEVEL CONSTRUCTION, INC.
## 25929 25929 DENIED CELINE INC.
## 31202 31202 DENIED TECH MAHINDRA (AMERICAS),INC.
## 34846 34846 WITHDRAWN SLRI SOLUTIONS, LLC
## 59538 59538 DENIED DMS AMERICA LLC
## 64831 64831 DENIED G.C. CONSULTANTS, INC.
## 74508 74508 DENIED DELOITTE CONSULTING LLP
## 77278 77278 DENIED PANTHEON VENTURES (US) LP
## 80398 80398 DENIED CARNEGIE MELLON UNIVERSITY
## 89217 89217 DENIED BAPTIST HEALTH SOUTH FLORIDA INC.
## 94020 94020 DENIED SIERRA INFOSYS, INC
## 94652 94652 DENIED MINDTREE LIMITED
## 99315 99315 DENIED KPMG LLP
## SOC_NAME
## 398 CHIEF EXECUTIVES
## 1702 GENERAL AND OPERATIONS MANAGERS
## 5580 MARKETING MANAGERS
## 10955 COMPUTER AND INFORMATION SYSTEMS MANAGERS
## 16046 QUALITY CONTROL SYSTEMS MANAGERS
## 19773 ARCHITECTURAL AND ENGINEERING MANAGERS
## 24145 COST ESTIMATORS
## 25929 HUMAN RESOURCES SPECIALISTS
## 31202 MANAGEMENT ANALYSTS
## 34846 MANAGEMENT ANALYSTS
## 59538 ACCOUNTANTS AND AUDITORS
## 64831 ACCOUNTANTS
## 74508 FINANCIAL ANALYSTS
## 77278 FINANCIAL SPECIALISTS, ALL OTHER
## 80398 COMPUTER AND INFORMATION RESEARCH SCIENTISTS
## 89217 COMPUTER SYSTEMS ANALYSTS
## 94020 COMPUTER SYSTEMS ANALYSTS
## 94652 COMPUTER SYSTEMS ANALYSTS
## 99315 COMPUTER SYSTEMS ANALYSTS
## JOB_TITLE FULL_TIME_POSITION
## 398 PRESIDENT AND CHIEF EXECUTIVE OFFICER Y
## 1702 PROPOSED JOB TITLE Y
## 5580 DIRECTOR, SOCIAL AND DIGITAL MEDIA Y
## 10955 QA MANAGER Y
## 16046 PRODUCT SUPPORT ENGINEER Y
## 19773 GLOBAL BUSINESS MANAGER Y
## 24145 ESTIMATOR Y
## 25929 HUMAN RESOURCES GENERALIST Y
## 31202 MANAGEMENT ANALYST Y
## 34846 BUSINESS ANALYST Y
## 59538 ACCOUNTANT Y
## 64831 ACCOUNTANT Y
## 74508 CONSULTANT Y
## 77278 INVESTMENT ASSOCIATE Y
## 80398 RESEARCH ASSOCIATE Y
## 89217 ERP BI DEVELOPER 3 Y
## 94020 BUSINESS SYSTEMS ANALYST Y
## 94652 BUSINESS ANALYST Y
## 99315 SENIOR ASSOCIATE Y
## PREVAILING_WAGE YEAR WORKSITE lon lat
## 398 2648400 2016 DORAL, FLORIDA -80.35533 25.81954
## 1702 72800000 2016 CITY, CALIFORNIA NA NA
## 5580 329139200 2016 SANTA MONICA, CALIFORNIA -118.49119 34.01945
## 10955 259496640 2016 SAN JOSE, CALIFORNIA -121.88633 37.33821
## 16046 1016748 2016 MARLBOROUGH, MASSACHUSETTS -71.55229 42.34593
## 19773 251796480 2016 ST. PAUL, MINNESOTA -93.08996 44.95370
## 24145 88431200 2016 HARWOOD HEIGHTS, ILLINOIS NA NA
## 25929 3081468 2016 NEW YORK, NEW YORK -74.00594 40.71278
## 31202 128970400 2016 ST LOUIS, MISSOURI -90.19940 38.62700
## 34846 115385920 2016 HOUSTON, TEXAS -95.36980 29.76043
## 59538 88647520 2016 MIAMI, FLORIDA -80.19179 25.76168
## 64831 119192320 2016 SAN FRANCISCO, CALIFORNIA -122.41942 37.77493
## 74508 136757920 2016 EDISON, NEW JERSEY -74.41210 40.51872
## 77278 182185120 2016 NEW YORK, NEW YORK -74.00594 40.71278
## 80398 6044480 2016 PITTSBURGH, PENNSYLVANIA -79.99589 40.44062
## 89217 115211200 2016 CORAL GABLES, FLORIDA -80.26838 25.72149
## 94020 114000640 2016 BOISE, IDAHO -116.21461 43.61871
## 94652 122048160 2016 CHANDLER, ARIZONA -111.84125 33.30616
## 99315 152201920 2016 DALLAS, TEXAS -96.79699 32.77666
# Mark the wages above one million as missing so they do not distort the
# plot, then draw the distribution of the remaining wages.
is.na(h1b$PREVAILING_WAGE) <- which(h1b$PREVAILING_WAGE > 1e6)
hist(h1b$PREVAILING_WAGE, breaks = 100)
# Count applications per case status. as.factor() keeps the per-level counts
# even when the column was read as plain character (the read.csv() default
# since R 4.0.0); on a column that already is a factor it changes nothing.
summary(as.factor(h1b$CASE_STATUS))
## CERTIFIED CERTIFIED-WITHDRAWN DENIED
## 84522 9075 2190
## WITHDRAWN
## 4213
# Overlay the wage distribution of denied applications (red) on top of the
# certified ones (green).
# NOTE(review): the two calls request different numbers of breaks (50 vs 100),
# so their bin widths may differ and bar heights are not directly comparable.
hist(h1b$PREVAILING_WAGE[h1b$CASE_STATUS == "CERTIFIED"], breaks = 50,
     col = "green", main = "", ylab = "Number of Applications",
     xlab = "Income")
hist(h1b$PREVAILING_WAGE[h1b$CASE_STATUS == "DENIED"], breaks = 100,
     add = TRUE, col = "red")
# The green histogram contains only CERTIFIED applications, so it is labelled
# as such rather than "Total".
legend("topright", legend = c("Certified", "Denied"),
       col = c("green", "red"), pch = 15)
Use a regular expression to extract the state name from the WORKSITE column.
# Keep only the text that follows the last ", " in WORKSITE and store it as
# STATE, then count the applications per state.
# NOTE(review): the tabulation printed below contains a level named "NA", so
# some worksites yield the literal string "NA" rather than a missing value.
h1b[["STATE"]] <- sub(".*, (.*)", "\\1", h1b[["WORKSITE"]])
table(h1b[["STATE"]])
##
## ALABAMA ALASKA ARIZONA
## 310 49 990
## ARKANSAS CALIFORNIA COLORADO
## 415 20479 858
## CONNECTICUT DELAWARE DISTRICT OF COLUMBIA
## 1472 461 825
## FLORIDA GEORGIA HAWAII
## 3949 3005 150
## IDAHO ILLINOIS INDIANA
## 92 5246 801
## IOWA KANSAS KENTUCKY
## 330 250 303
## LOUISIANA MAINE MARYLAND
## 190 86 1421
## MASSACHUSETTS MICHIGAN MINNESOTA
## 3815 1851 1185
## MISSISSIPPI MISSOURI MONTANA
## 78 804 30
## NA NEBRASKA NEVADA
## 211 172 363
## NEW HAMPSHIRE NEW JERSEY NEW MEXICO
## 244 5905 92
## NEW YORK NORTH CAROLINA NORTH DAKOTA
## 17736 2118 42
## OHIO OKLAHOMA OREGON
## 1864 286 489
## PENNSYLVANIA PUERTO RICO RHODE ISLAND
## 3368 66 266
## SOUTH CAROLINA SOUTH DAKOTA TENNESSEE
## 384 33 857
## TEXAS UTAH VERMONT
## 8914 564 53
## VIRGINIA WASHINGTON WEST VIRGINIA
## 2651 2929 67
## WISCONSIN WYOMING
## 859 22
# Cross-tabulate state (rows) against case status (columns) and preview the
# first six rows.
tbl <- table(h1b[["STATE"]], h1b[["CASE_STATUS"]])
head(tbl, n = 6L)
##
## CERTIFIED CERTIFIED-WITHDRAWN DENIED WITHDRAWN
## ALABAMA 267 26 7 10
## ALASKA 40 5 3 1
## ARIZONA 865 76 17 32
## ARKANSAS 352 44 5 14
## CALIFORNIA 17235 1950 484 810
## COLORADO 673 112 25 48
Which states have the highest share of certified applications?
# Turn the counts into per-state shares by dividing every row by its own
# total, then preview the first six rows.
rs <- rowSums(tbl)
tbl_relative <- sweep(tbl, MARGIN = 1, STATS = rs, FUN = "/")
head(tbl_relative, n = 6L)
##
## CERTIFIED CERTIFIED-WITHDRAWN DENIED WITHDRAWN
## ALABAMA 0.86129032 0.08387097 0.02258065 0.03225806
## ALASKA 0.81632653 0.10204082 0.06122449 0.02040816
## ARIZONA 0.87373737 0.07676768 0.01717172 0.03232323
## ARKANSAS 0.84819277 0.10602410 0.01204819 0.03373494
## CALIFORNIA 0.84159383 0.09521949 0.02363397 0.03955271
## COLORADO 0.78438228 0.13053613 0.02913753 0.05594406
# Rank the states from the highest to the lowest certified share and print
# the full table.
certified_share <- tbl_relative[, "CERTIFIED"]
tbl_relative <- tbl_relative[order(certified_share, decreasing = TRUE), ]
tbl_relative
##
## CERTIFIED CERTIFIED-WITHDRAWN DENIED
## PENNSYLVANIA 0.886876485 0.065617577 0.010688836
## WASHINGTON 0.876066917 0.083304882 0.013997952
## OKLAHOMA 0.874125874 0.066433566 0.027972028
## ARIZONA 0.873737374 0.076767677 0.017171717
## VERMONT 0.867924528 0.056603774 0.037735849
## KANSAS 0.864000000 0.068000000 0.016000000
## MASSACHUSETTS 0.863433814 0.079161206 0.020183486
## KENTUCKY 0.861386139 0.059405941 0.029702970
## ALABAMA 0.861290323 0.083870968 0.022580645
## GEORGIA 0.856239601 0.086522463 0.021297837
## NEW YORK 0.855209743 0.084517366 0.023060442
## FLORIDA 0.849835401 0.074702456 0.034692327
## ILLINOIS 0.849409074 0.085589020 0.018109035
## ARKANSAS 0.848192771 0.106024096 0.012048193
## CONNECTICUT 0.847826087 0.095108696 0.020380435
## NEW MEXICO 0.847826087 0.076086957 0.010869565
## MISSOURI 0.845771144 0.060945274 0.026119403
## OHIO 0.845493562 0.081545064 0.013948498
## LOUISIANA 0.842105263 0.073684211 0.031578947
## CALIFORNIA 0.841593828 0.095219493 0.023633967
## MINNESOTA 0.841350211 0.084388186 0.018565401
## UTAH 0.840425532 0.085106383 0.042553191
## MICHIGAN 0.840086440 0.099945975 0.017287952
## TEXAS 0.839578192 0.104330267 0.020529504
## MARYLAND 0.838845883 0.085855032 0.022519353
## VIRGINIA 0.837042625 0.112033195 0.018106375
## NEW JERSEY 0.836748518 0.092294666 0.016596105
## TENNESSEE 0.836639440 0.095682614 0.016336056
## HAWAII 0.833333333 0.066666667 0.086666667
## IDAHO 0.826086957 0.086956522 0.021739130
## NORTH CAROLINA 0.823418319 0.122757318 0.016525024
## DISTRICT OF COLUMBIA 0.823030303 0.104242424 0.035151515
## NA 0.819905213 0.000000000 0.123222749
## RHODE ISLAND 0.819548872 0.090225564 0.026315789
## ALASKA 0.816326531 0.102040816 0.061224490
## NEBRASKA 0.813953488 0.110465116 0.011627907
## WISCONSIN 0.811408615 0.115250291 0.013969732
## NEVADA 0.809917355 0.035812672 0.052341598
## SOUTH CAROLINA 0.809895833 0.127604167 0.031250000
## MAINE 0.802325581 0.104651163 0.034883721
## IOWA 0.796969697 0.109090909 0.042424242
## DELAWARE 0.791757050 0.121475054 0.013015184
## NEW HAMPSHIRE 0.790983607 0.127049180 0.008196721
## OREGON 0.787321063 0.126789366 0.053169734
## COLORADO 0.784382284 0.130536131 0.029137529
## INDIANA 0.781523096 0.088639201 0.007490637
## PUERTO RICO 0.757575758 0.000000000 0.015151515
## MONTANA 0.733333333 0.000000000 0.000000000
## SOUTH DAKOTA 0.727272727 0.121212121 0.060606061
## MISSISSIPPI 0.717948718 0.115384615 0.038461538
## WYOMING 0.681818182 0.090909091 0.090909091
## NORTH DAKOTA 0.666666667 0.190476190 0.023809524
## WEST VIRGINIA 0.641791045 0.119402985 0.104477612
##
## WITHDRAWN
## PENNSYLVANIA 0.036817102
## WASHINGTON 0.026630249
## OKLAHOMA 0.031468531
## ARIZONA 0.032323232
## VERMONT 0.037735849
## KANSAS 0.052000000
## MASSACHUSETTS 0.037221494
## KENTUCKY 0.049504950
## ALABAMA 0.032258065
## GEORGIA 0.035940100
## NEW YORK 0.037212449
## FLORIDA 0.040769815
## ILLINOIS 0.046892871
## ARKANSAS 0.033734940
## CONNECTICUT 0.036684783
## NEW MEXICO 0.065217391
## MISSOURI 0.067164179
## OHIO 0.059012876
## LOUISIANA 0.052631579
## CALIFORNIA 0.039552713
## MINNESOTA 0.055696203
## UTAH 0.031914894
## MICHIGAN 0.042679633
## TEXAS 0.035562037
## MARYLAND 0.052779733
## VIRGINIA 0.032817805
## NEW JERSEY 0.054360711
## TENNESSEE 0.051341890
## HAWAII 0.013333333
## IDAHO 0.065217391
## NORTH CAROLINA 0.037299339
## DISTRICT OF COLUMBIA 0.037575758
## NA 0.056872038
## RHODE ISLAND 0.063909774
## ALASKA 0.020408163
## NEBRASKA 0.063953488
## WISCONSIN 0.059371362
## NEVADA 0.101928375
## SOUTH CAROLINA 0.031250000
## MAINE 0.058139535
## IOWA 0.051515152
## DELAWARE 0.073752711
## NEW HAMPSHIRE 0.073770492
## OREGON 0.032719836
## COLORADO 0.055944056
## INDIANA 0.122347066
## PUERTO RICO 0.227272727
## MONTANA 0.266666667
## SOUTH DAKOTA 0.090909091
## MISSISSIPPI 0.128205128
## WYOMING 0.136363636
## NORTH DAKOTA 0.119047619
## WEST VIRGINIA 0.134328358
# Horizontal bar chart of the certified percentage per state. The left margin
# is widened before plotting and the previous graphics settings are restored
# afterwards.
oldpar <- par(mar = c(5, 9, 4, 2) + 0.1)
barplot(
  100 * rev(tbl_relative[, "CERTIFIED"]),
  horiz = TRUE,
  las = 2,
  cex.names = 0.6,
  xlim = c(0, 100),
  xlab = "Percentage Certified"
)
par(oldpar)