Slides, code and about the author

CC This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

Obtaining the data

This demo uses data from the Stop-Question-and-Frisk program in New York City. “Stop-Question-and-Frisk is a practice of the New York City Police Department by which police officers stop and question hundreds of thousands of pedestrians annually, and frisk them for weapons and other contraband.” (Wikipedia)

The data can be obtained from http://www.nyclu.org/content/stop-and-frisk-data

Download files

# Fetch the SQF codebook (variable descriptions) once; skip if already present.
if (!file.exists("SQF_Codebook.pdf")) {
  # mode = "wb": a PDF is binary, and a text-mode transfer can corrupt it
  # on Windows.
  download.file("http://www.nyclu.org/files/SQF_Codebook.pdf",
    "SQF_Codebook.pdf", mode = "wb")
}

See the downloaded SQF_Codebook.pdf for a description of the data.

# Download and unpack the 2012 data set unless the CSV is already present.
if (!file.exists("SQF 2012.csv")) {
  # mode = "wb" forces a binary transfer so the archive cannot be damaged by
  # text-mode newline translation (relevant on Windows).
  download.file("http://www.nyclu.org/files/stopandfrisk/Stop-and-Frisk-2012.zip",
    "Stop-and-Frisk-2012.zip", mode = "wb")
  unzip("Stop-and-Frisk-2012.zip")
}

# Read the raw records. stringsAsFactors = TRUE is set explicitly because the
# read.csv() default changed to FALSE in R 4.0.0, while the output shown in
# this document (factor counts, "87 Levels" for perstop) comes from text
# columns loaded as factors.
dat <- read.csv("SQF 2012.csv", stringsAsFactors = TRUE)
dim(dat)
## [1] 532911    101
summary(dat)
##       year           pct           ser_num         datestop       
##  Min.   :2012   Min.   :  1.0   Min.   :    1   Min.   : 1012012  
##  1st Qu.:2012   1st Qu.: 41.0   1st Qu.: 1790   1st Qu.: 2292012  
##  Median :2012   Median : 70.0   Median : 3790   Median : 5052012  
##  Mean   :2012   Mean   : 66.5   Mean   : 4980   Mean   : 5563631  
##  3rd Qu.:2012   3rd Qu.: 94.0   3rd Qu.: 6982   3rd Qu.: 8252012  
##  Max.   :2012   Max.   :123.0   Max.   :24652   Max.   :12312012  
##                                                                   
##     timestop         city            sex             race      
##  Min.   :   0   Min.   :1.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:1002   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1615   Median :2.000   Median :1.000   Median :1.000  
##  Mean   :1415   Mean   :2.518   Mean   :0.928   Mean   :2.012  
##  3rd Qu.:2030   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.:3.000  
##  Max.   :2359   Max.   :5.000   Max.   :1.000   Max.   :6.000  
##                 NA's   :5       NA's   :7784    NA's   :13861  
##       dob                age             height         weight     
##  Min.   : 1011001   Min.   :  0.00   Min.   :36.0   Min.   :  0.0  
##  1st Qu.: 6051983   1st Qu.: 19.00   1st Qu.:67.0   1st Qu.:150.0  
##  Median :10311996   Median : 24.00   Median :69.0   Median :165.0  
##  Mean   : 8947660   Mean   : 28.77   Mean   :68.6   Mean   :169.3  
##  3rd Qu.:12311900   3rd Qu.: 34.00   3rd Qu.:71.0   3rd Qu.:180.0  
##  Max.   :12312012   Max.   :999.00   Max.   :95.0   Max.   :999.0  
##                                                                    
##     haircolr         eyecolor         build          othfeatr     
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000   Min.   :0.0     
##  1st Qu.: 1.000   1st Qu.:2.000   1st Qu.:3.000   1st Qu.:2.0     
##  Median : 1.000   Median :2.000   Median :3.000   Median :5.0     
##  Mean   : 1.406   Mean   :1.992   Mean   :3.161   Mean   :3.9     
##  3rd Qu.: 1.000   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:5.0     
##  Max.   :10.000   Max.   :8.000   Max.   :4.000   Max.   :8.0     
##  NA's   :4449     NA's   :3315    NA's   :5398    NA's   :532896  
##     frisked          searched          contrabn           pistol        
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :1.0000   Median :0.00000   Median :0.00000   Median :0.000000  
##  Mean   :0.5578   Mean   :0.08303   Mean   :0.01733   Mean   :0.001242  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.000000  
##                                                                         
##     riflshot           asltweap           knifcuti       
##  Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.000000  
##  1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.000000  
##  Median :0.00e+00   Median :0.00e+00   Median :0.000000  
##  Mean   :2.44e-05   Mean   :9.57e-05   Mean   :0.008829  
##  3rd Qu.:0.00e+00   3rd Qu.:0.00e+00   3rd Qu.:0.000000  
##  Max.   :1.00e+00   Max.   :1.00e+00   Max.   :1.000000  
##                                                          
##     machgun           othrweap           arstmade          arstoffn     
##  Min.   :0.0e+00   Min.   :0.000000   Min.   :0.00000          :500600  
##  1st Qu.:0.0e+00   1st Qu.:0.000000   1st Qu.:0.00000   CPM    :  3080  
##  Median :0.0e+00   Median :0.000000   Median :0.00000   CPW    :  2513  
##  Mean   :5.6e-06   Mean   :0.002481   Mean   :0.06064   CPCS   :  1369  
##  3rd Qu.:0.0e+00   3rd Qu.:0.000000   3rd Qu.:0.00000   ROBBERY:  1186  
##  Max.   :1.0e+00   Max.   :1.000000   Max.   :1.00000   221.1  :  1029  
##                                                         (Other): 23134  
##     sumissue                 sumoffen          crimsusp     
##  Min.   :0.00000                 :512042   FEL     :104949  
##  1st Qu.:0.00000   OPEN CONTAINER:  1613   MISD    : 81113  
##  Median :0.00000   DISCON        :  1289   FELONY  : 69204  
##  Mean   :0.05149   DIS CON       :   690   CPW     : 55974  
##  3rd Qu.:0.00000   *             :   673   ROBBERY : 48670  
##  Max.   :1.00000   (Other)       : 16603   BURGLARY: 19940  
##                    NA's          :     1   (Other) :153061  
##     detailcm          perobs           perstop          pf_hands     
##  Min.   :  1.00   Min.   :  0.000   5      :236912   Min.   :0.0000  
##  1st Qu.: 20.00   1st Qu.:  1.000   2      : 66583   1st Qu.:0.0000  
##  Median : 28.00   Median :  1.000   3      : 65098   Median :0.0000  
##  Mean   : 41.23   Mean   :  2.446   10     : 63996   Mean   :0.1319  
##  3rd Qu.: 68.00   3rd Qu.:  2.000   1      : 27924   3rd Qu.:0.0000  
##  Max.   :113.00   Max.   :955.000   4      : 22344   Max.   :1.0000  
##  NA's   :2                          (Other): 50054                   
##     pf_wall           pf_grnd            pf_drwep       
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.000000   Median :0.000000  
##  Mean   :0.02369   Mean   :0.003419   Mean   :0.002439  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :1.00000   Max.   :1.000000   Max.   :1.000000  
##                                                         
##     pf_ptwep           pf_baton           pf_hcuff      
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Median :0.000000   Median :0.000000   Median :0.00000  
##  Mean   :0.002314   Mean   :0.000107   Mean   :0.03611  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.000000   Max.   :1.00000  
##                                                         
##     pf_pepsp           pf_other           cs_objcs          cs_descr    
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :0.000000   Median :0.000000   Median :0.00000   Median :0.000  
##  Mean   :0.000107   Mean   :0.006549   Mean   :0.02524   Mean   :0.165  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.000  
##  Max.   :1.000000   Max.   :1.000000   Max.   :1.00000   Max.   :1.000  
##                                                                         
##     cs_casng         cs_lkout         cs_cloth          cs_drgtr      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.3563   Mean   :0.1888   Mean   :0.04691   Mean   :0.07314  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##                                                                       
##     cs_furtv         cs_vcrim         cs_bulge          cs_other     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.5154   Mean   :0.1105   Mean   :0.07304   Mean   :0.1563  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##                                                                      
##     rf_vcrim         rf_othsw          rf_attir          rf_vcact      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.1294   Mean   :0.03357   Mean   :0.07405   Mean   :0.08189  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##                                                                        
##     rf_rfcmp          rf_verbl           rf_knowl          rf_furt      
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.000000   Median :0.00000   Median :0.0000  
##  Mean   :0.07933   Mean   :0.006309   Mean   :0.01875   Mean   :0.3931  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.000000   Max.   :1.00000   Max.   :1.0000  
##                                                                         
##     rf_bulg           sb_hdobj          sb_outln           sb_admis       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.00000   Median :0.000000   Median :0.000000  
##  Mean   :0.07005   Mean   :0.04354   Mean   :0.005245   Mean   :0.002742  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.000000   Max.   :1.000000  
##                                                                           
##     sb_other          ac_proxm         ac_evasv       ac_assoc      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:0.00000  
##  Median :0.00000   Median :0.0000   Median :0.00   Median :0.00000  
##  Mean   :0.03555   Mean   :0.2259   Mean   :0.19   Mean   :0.04538  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00   Max.   :1.00000  
##                                                                     
##     ac_cgdir         ac_incid         ac_time          ac_stsnd      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.2383   Mean   :0.6002   Mean   :0.4383   Mean   :0.02558  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                      
##     ac_other          forceuse          inout           trhsloc      
##  Min.   :0.00000   Min.   :1.0      Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:2.0      1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :3.0      Median :0.0000   Median :0.0000  
##  Mean   :0.03439   Mean   :3.4      Mean   :0.2168   Mean   :0.3011  
##  3rd Qu.:0.00000   3rd Qu.:5.0      3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :6.0      Max.   :1.0000   Max.   :2.0000  
##                    NA's   :465609                                    
##      premname         addrnum                   stname      
##  STREET  :200106          :312070                  :306954  
##  SIDEWALK: 94800   60     :   947   BROADWAY       :  2906  
##          : 37665   300    :   907   8 AVENUE       :  2417  
##  LOBBY   : 16055   200    :   780   SUTTER AVENUE  :  2258  
##  MEZZ    : 10960   120    :   775   PARK AVENUE    :  2009  
##  (Other) :173299   315    :   769   AMSTERDAM AVENU:  1909  
##  NA's    :    26   (Other):216663   (Other)        :214458  
##                   stinter                  crossst          addrpct     
##  BROADWAY             : 11975   BROADWAY       :  5110   Min.   :  1.0  
##  8 AVENUE             :  6571   ROCKAWAY AVENUE:  4526   1st Qu.: 41.0  
##  LEXINGTON AVENUE     :  5925   PARK AVENUE    :  3931   Median : 70.0  
##  3 AVENUE             :  5886   NOSTRAND AVENUE:  3657   Mean   : 66.5  
##  7 AVENUE             :  4952   3 AVENUE       :  3387   3rd Qu.: 94.0  
##  SAINT NICHOLAS AVENUE:  4914   EAST 112 STREET:  3227   Max.   :123.0  
##  (Other)              :492688   (Other)        :509073   NA's   :5      
##      sector            beat             post            xcoord       
##  A      : 55505          :337499   Min.   : 1.0     Min.   : 913844  
##  E      : 51528   *      : 68491   1st Qu.:10.0     1st Qu.: 996734  
##  B      : 51467   9      : 19792   Median :22.0     Median :1005807  
##  H      : 48822   1      : 16183   Mean   :28.1     Mean   :1006808  
##  C      : 48820   2      : 15779   3rd Qu.:41.0     3rd Qu.:1016476  
##  G      : 46189   7      : 15598   Max.   :99.0     Max.   :1067249  
##  (Other):230580   (Other): 59569   NA's   :477757   NA's   :15585    
##      ycoord          typeofid        othpers          explnstp     
##  Min.   :121152   Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:183094   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :198317   Median :1.000   Median :0.0000   Median :1.0000  
##  Mean   :205079   Mean   :1.495   Mean   :0.1914   Mean   :0.9993  
##  3rd Qu.:234311   3rd Qu.:2.000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :271882   Max.   :4.000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :15585                                                     
##      repcmd          revcmd         offunif          offverb      
##  Min.   :  1.0   Min.   :  1.0   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 69.0   1st Qu.: 67.0   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :110.0   Median :107.0   Median :1.0000   Median :0.0000  
##  Mean   :240.6   Mean   :237.8   Mean   :0.7526   Mean   :0.1931  
##  3rd Qu.:165.0   3rd Qu.:165.0   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :985.0   Max.   :879.0   Max.   :1.0000   Max.   :1.0000  
##  NA's   :4                                                        
##     officrid          offshld         ac_rept          ac_inves     
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.01201   Mean   :0.244   Mean   :0.1257   Mean   :0.1397  
##  3rd Qu.:0.00000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##      radio        recstat        linecm      
##  Min.   :0.0000   1:269234   Min.   :0.0000  
##  1st Qu.:0.0000   A:263677   1st Qu.:1.0000  
##  Median :0.0000              Median :1.0000  
##  Mean   :0.2421              Mean   :0.8013  
##  3rd Qu.:0.0000              3rd Qu.:1.0000  
##  Max.   :1.0000              Max.   :1.0000  
##                              NA's   :2

Cleaning the data

Fix date and time

# datestop is stored as an MMDDYYYY integer; zero-pad to 8 digits, then parse.
dat[["datestop"]] <- as.Date(
  sprintf("%08d", dat[["datestop"]]),
  format = "%m%d%Y"
)
# timestop is zero-padded to 4 digits and only the two leading digits are kept.
dat[["timestop"]] <- as.integer(
  substr(sprintf("%04d", dat[["timestop"]]), 1, 2)
)

Clean continuous variables

Fix observation period

# Observation periods outside 1..120 are set to missing.
is.na(dat$perobs) <- which(dat$perobs < 1 | dat$perobs > 120)

Fix stop period

head(dat$perstop)
## [1] 5  5  5  5  10 5 
## 87 Levels: ** 0 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 ... 99
# Convert through character first: as.numeric() applied directly to a factor
# returns the internal level codes (1..number of levels), not the recorded
# values. Going via as.character() also works if the column is already text.
dat$perstop <- as.character(dat$perstop)
# "**" is a non-numeric placeholder; treat it as missing.
dat$perstop[dat$perstop == "**"] <- NA
dat$perstop <- as.numeric(dat$perstop)

The date of birth (dob) is poorly recorded, and age is available as a separate variable, so dob is dropped.

# Remove the dob column entirely.
dat[["dob"]] <- NULL

Clean age

hist(dat$age)

table(dat$age)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11 
##    53   712    53    37    20    95    17     6     4    13    41   117 
##    12    13    14    15    16    17    18    19    20    21    22    23 
##   616  2429  7682 17210 24413 28481 29548 29686 29649 28833 27618 22207 
##    24    25    26    27    28    29    30    31    32    33    34    35 
## 20051 19040 16546 15115 13669 13327 12989 11670 11809  9516  8889  8508 
##    36    37    38    39    40    41    42    43    44    45    46    47 
##  7048  6542  5927  5967  6298  6147  6512  5654  5561  5636  5254  5266 
##    48    49    50    51    52    53    54    55    56    57    58    59 
##  5108  5008  4843  4422  4395  3509  3144  2972  2596  2118  1645  1463 
##    60    61    62    63    64    65    66    67    68    69    70    71 
##  1363  1132  1121   780   665   509   407   346   291   221   214   162 
##    72    73    74    75    76    77    78    79    80    81    82    83 
##   143   122    70    55    58    50    39    27    17    18    18     8 
##    84    85    86    87    88    89    91    95    96    97    98    99 
##     9     3     2     2     3     2     3     2     2     1     1   362 
##   100   108   110   113   114   115   116   117   118   119   120   121 
##    77     1     2     1     1     3     2     3     6     3    15     2 
##   123   124   125   127   128   129   130   131   135   136   140   144 
##     2     1     6     1     1     2    14     2     8     1    17     2 
##   145   146   150   154   155   156   160   161   164   165   166   168 
##    15     2    32     1     5     4    42     2     2    36     1     1 
##   169   170   171   174   175   176   177   178   180   181   183   185 
##     4    33     3     2    16     2     2     5    44     4     1    15 
##   187   188   189   190   191   195   196   198   199   200   201   202 
##     1     2     7    16     1    13     5     5     1    21     1     1 
##   205   206   209   210   211   212   214   215   216   217   218   219 
##     5     1     1     3     2     1     3     5     2     1     2     1 
##   220   221   222   223   224   225   226   227   228   229   230   231 
##     2     5     4     4     3     4     4     4     1     2     3     3 
##   232   233   235   236   240   241   242   243   244   245   246   247 
##     4     3     5     3     2     2     1     1     1     2     1     1 
##   250   251   253   254   255   256   258   260   261   263   265   269 
##     3     1     1     1     4     1     1     1     1     2     5     1 
##   270   274   278   285   287   296   298   300   301   305   306   309 
##     1     4     5     2     2     6     2     1     1     5     1     2 
##   310   312   315   319   321   323   326   332   333   334   335   338 
##     1     3     1     1     1     2     1     1     2     1     5     2 
##   343   345   346   347   352   354   360   361   363   365   366   375 
##     2     1     1     3     1     1     1     1     1     1     1     1 
##   385   387   396   401   405   410   411   412   421   422   425   433 
##     4     2     1     1     1     2     2     1     2     1     1     1 
##   435   437   442   447   451   452   455   461   463   469   478   479 
##     1     1     1     1     1     2     1     1     1     1     1     1 
##   487   488   490   496   501   505   510   511   512   515   520   521 
##     1     2     1     1     1     1     3     2     1     1     3     1 
##   522   523   553   555   556   565   585   622   674   691   694   699 
##     2     1     2     1     1     1     1     1     1     1     1     1 
##   700   851   999 
##     1     1   210
# Keep only ages within [10, 90]; everything else becomes NA.
dat$age[!(dat$age >= 10 & dat$age <= 90)] <- NA
hist(dat$age, breaks = 40)

Clean height

table(dat$height)
## 
##    36    37    38    39    40    41    42    43    44    45    46    47 
##    68     8     5     5     5    13     3     7    13    11     7     5 
##    48    49    50    51    52    53    54    55    56    57    58    59 
##    38    29    32    29    38   123   129   134   205   257   274   507 
##    60    61    62    63    64    65    66    67    68    69    70    71 
##  3876  3193  6843  9054 17215 32624 52838 61087 72983 69105 64694 38571 
##    72    73    74    75    76    77    78    79    80    81    82    83 
## 46109 23823 16836  6488  2725  1230   553   242   160   116   284   197 
##    84    85    86    87    88    89    90    91    93    94    95 
##    25    16     3     2     2     9     3     7     3     2    48
barplot(table(dat$height))

# Heights below 40 or above 90 are set to missing.
is.na(dat$height) <- which(dat$height < 40 | dat$height > 90)
hist(dat$height, breaks = 40)

Clean weight

table(dat$weight)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11 
##     6   257    10     1     6    24    15    15     6    16    53    13 
##    12    13    14    15    16    17    18    19    20    21    22    23 
##     5     8    13    30    44    37    50    14    45     9    20    10 
##    24    25    26    27    28    29    30    31    32    33    34    35 
##     6    13     8     6     4     5     9     3     3     2     6     3 
##    36    37    38    39    40    41    42    43    44    45    46    47 
##     2     2     2     1    11     2     1     2     4     2     1     2 
##    48    49    50    52    53    54    55    56    57    58    59    60 
##     2     1    10     2     2     1    10     3     6     4     6    23 
##    61    62    63    65    67    68    70    75    80    83    84    85 
##     1     1     2     3     1     1    18    12    43     1     2    36 
##    86    87    88    89    90    91    92    93    94    95    96    97 
##     1     1     1     3   238     2     4     4     2   148     6    10 
##    98    99   100   101   102   103   104   105   106   107   108   109 
##    32    16  1877    10    30    14    21   647    27    24    47    28 
##   110   111   112   113   114   115   116   117   118   119   120   121 
##  4076    31    55    23    30  1949    71    43   101    43 11818    35 
##   122   123   124   125   126   127   128   129   130   131   132   133 
##    45    57    39  4221    70    74   122    82 18584    37   116    54 
##   134   135   136   137   138   139   140   141   142   143   144   145 
##    62  6728    94   103   211   107 34437    71   145   127    78 12013 
##   146   147   148   149   150   151   152   153   154   155   156   157 
##   121   198   340   181 59997    72   242   145   533 11098   393   233 
##   158   159   160   161   162   163   164   165   166   167   168   169 
##   482   249 69203    85   286   192   130 23055   129   223   538   245 
##   170   171   172   173   174   175   176   177   178   179   180   181 
## 54721    99   243   175   121 21041   216    83   376   159 59353   101 
##   182   183   184   185   186   187   188   189   190   191   192   193 
##   179   162    94 13700   152   160   239   270 26949    60   130    62 
##   194   195   196   197   198   199   200   201   202   203   204   205 
##    37  6776   123    84   287   205 30714    68    44    50    33  2403 
##   206   207   208   209   210   211   212   213   214   215   216   217 
##    39    51    75    58 10282    35    50    25    37  2628    39    35 
##   218   219   220   221   222   223   224   225   226   227   228   229 
##    56    28  9484    12    38    15    17  2047    15    16    22    15 
##   230   231   232   233   234   235   236   237   238   239   240   241 
##  4419     5    13    11    10   571     9     6    24     4  3284     2 
##   242   243   244   245   246   247   248   249   250   251   252   253 
##    11     6     5   306    12     8    11    12  4567     2     8     5 
##   254   255   256   257   258   259   260   261   262   263   264   265 
##     4   138     8     4     8     8  1313     2     7     2     4   211 
##   266   267   268   269   270   271   272   273   274   275   276   277 
##     2     6     8     2   632     1     4     3     3   311     3     3 
##   278   279   280   281   282   283   284   285   286   287   288   289 
##     4     2   958     3     5     4     1   124     4     4     3     8 
##   290   291   295   296   297   298   299   300   301   302   303   304 
##   246     2    50     3     1     2     5  1044     4     1     2     2 
##   305   307   308   310   311   312   314   315   316   318   319   320 
##    17     1     1   101     1     2     1    40     1     1     1   132 
##   321   325   329   330   332   335   336   337   339   340   345   349 
##     1    36     1    45     1     2     1     1     1    38     4     1 
##   350   353   355   360   365   368   370   371   374   375   380   385 
##   149     1     2    25     6     1    14     2     2     9    12     2 
##   390   395   399   400   405   410   415   417   418   423   425   429 
##     4     2     1    34     1     1     1     1     1     1     3     1 
##   430   436   440   442   445   450   459   460   465   470   475   480 
##     1     1     1     1     2     5     1     3     1     1     1     3 
##   485   499   500   510   511   514   515   518   519   525   530   540 
##     1     1     8     6     5     1     1     1     1     1     1     3 
##   552   600   612   614   700   750   810   870   916   918   999 
##     1     2     1     1     2     2     1     1     1     1   359
barplot(table(dat$weight))

# Weights below 50 or above 400 are set to missing.
dat$weight <- replace(dat$weight, dat$weight < 50 | dat$weight > 400, NA)
hist(dat$weight, breaks = 40)

Change nominal variables into factors

# Recode integer-coded nominal variables as labeled factors.
# Levels are listed explicitly so every label stays tied to its code: with
# `labels` alone, factor() pairs the labels with the sorted unique values that
# happen to occur, so a code missing from the data would shift or break them.
dat$city <- factor(dat$city, levels = 1:5,
  labels = c("Manhattan", "Brooklyn", "Bronx", "Queens", "Staten Island"))

dat$race <- factor(dat$race, levels = 1:6,
  labels = c("Black", "Black Hispanic", "White Hispanic", "White",
    "Asian/Pacific Islander", "Am. Indian/ Native Alaskan"))

# sex is coded 0/1, so the levels start at 0.
dat$sex <- factor(dat$sex, levels = 0:1, labels = c("female", "male"))

dat$build <- factor(dat$build, levels = 1:4,
  labels = c("heavy", "muscular", "medium", "thin"))

dat$forceuse <- factor(dat$forceuse, levels = 1:6,
  labels = c("defense of other", "defense of self", "overcome resistence",
    "other", "suspected flight", "suspected weapon"))

# inout (0/1) and trhsloc (0/1/2) are 0-based as well.
dat$inout <- factor(dat$inout, levels = 0:1, labels = c("outside", "inside"))

dat$trhsloc <- factor(dat$trhsloc, levels = 0:2,
  labels = c("neither", "housing authority", "transit authority"))

I copied the crimecodes from the variable description PDF into Excel and saved them in crimecodes.csv

# Lookup table without a header row: column 1 supplies the codes used as
# factor levels, column 2 the matching labels.
crimecodes <- read.csv("crimecodes.csv", header = FALSE)
dat$detailcm <- factor(
  dat$detailcm,
  levels = crimecodes[[1]],
  labels = crimecodes[[2]]
)

# Precinct and sector identifiers are treated as categories.
# (pct: use names instead?)
dat[c("pct", "addrpct", "sector")] <-
  lapply(dat[c("pct", "addrpct", "sector")], as.factor)

The data contain 4 distinct ID type codes (one more than expected), so the 4th is labeled "unknown".

# Label the ID type codes. Levels are given explicitly so the labels cannot
# shift if one of the codes does not occur in the data.
dat$typeofid <- factor(dat$typeofid, levels = 1:4,
  labels = c("photo id", "verbal id", "refused to provide id", "unknown"))
# Reporting/reviewing command identifiers are categories, not quantities.
dat$repcmd <- as.factor(dat$repcmd)
dat$revcmd <- as.factor(dat$revcmd)

Convert binary variables into logical

# 0/1 indicator columns to convert to logical.
# sumoffen is deliberately NOT listed: it holds offense text (e.g.
# "OPEN CONTAINER"), and as.logical() on it yields NA for every row.
binary <- c("frisked", "searched", "contrabn", "pistol", "riflshot",
  "asltweap", "knifcuti", "machgun", "othrweap", "arstmade", "sumissue")
for (b in binary) {
  dat[[b]] <- as.logical(dat[[b]])
}

cs = reason for stop

# Every column whose name contains "cs_" becomes logical.
for (b in grep("cs_", names(dat), value = TRUE)) {
  dat[[b]] <- as.logical(dat[[b]])
}

rf = reason for frisk

# Every column whose name contains "rf_" becomes logical.
for (b in grep("rf_", names(dat), value = TRUE)) {
  dat[[b]] <- as.logical(dat[[b]])
}

sb = basis of search

# Every column whose name contains "sb_" becomes logical.
for (b in grep("sb_", names(dat), value = TRUE)) {
  dat[[b]] <- as.logical(dat[[b]])
}

ac = additional circumstance

# Every column whose name contains "ac_" becomes logical.
for (b in grep("ac_", names(dat), value = TRUE)) {
  dat[[b]] <- as.logical(dat[[b]])
}

pf = force used

# Every column whose name contains "pf_" becomes logical.
for (b in grep("pf_", names(dat), value = TRUE)) {
  dat[[b]] <- as.logical(dat[[b]])
}

# Two further 0/1 indicators converted to logical in one step.
dat[c("othpers", "explnstp")] <-
  lapply(dat[c("othpers", "explnstp")], as.logical)

If the officer is in uniform, the officer does not need to identify himself/herself as an officer or show his/her shield, so these variables are set to NA for stops made in uniform.

# offunif is converted first because it is the mask used on the three
# identification columns below.
dat$offunif <- as.logical(dat$offunif)
# Convert each identification indicator to logical, then blank it (NA)
# wherever offunif is TRUE.
dat$offverb <- replace(as.logical(dat$offverb), dat$offunif, NA)
dat$officrid <- replace(as.logical(dat$officrid), dat$offunif, NA)
dat$offshld <- replace(as.logical(dat$offshld), dat$offunif, NA)

dat$radio <- as.logical(dat$radio)

Remove unused variables

# Drop all columns that are not used further, in a single assignment.
dat[c(
  "year", # we only have 2012
  "haircolr", "eyecolor", "ser_num", "othfeatr",
  "arstoffn", "crimsusp",
  "premname", "addrnum", "stname", "stinter", "crossst",
  "beat", "post",
  "recstat", "linecm"
)] <- NULL

Check summary of cleaned data

summary(dat)
##       pct            datestop             timestop    
##  75     : 24408   Min.   :2012-01-01   Min.   : 0.00  
##  73     : 22148   1st Qu.:2012-02-29   1st Qu.:10.00  
##  40     : 18276   Median :2012-05-05   Median :16.00  
##  44     : 15414   Mean   :2012-05-28   Mean   :13.89  
##  79     : 15294   3rd Qu.:2012-08-25   3rd Qu.:20.00  
##  103    : 12986   Max.   :2012-12-31   Max.   :23.00  
##  (Other):424385                                       
##             city            sex        
##  Manhattan    :110366   female: 38062  
##  Brooklyn     :188707   male  :487065  
##  Bronx        :102143   NA's  :  7784  
##  Queens       :110958                  
##  Staten Island: 20732                  
##  NA's         :     5                  
##                                        
##                          race             age            height    
##  Black                     :284229   Min.   :10.00   Min.   :40.0  
##  Black Hispanic            : 35772   1st Qu.:19.00   1st Qu.:67.0  
##  White Hispanic            :129368   Median :24.00   Median :69.0  
##  White                     : 50366   Mean   :28.13   Mean   :68.6  
##  Asian/Pacific Islander    : 17058   3rd Qu.:34.00   3rd Qu.:71.0  
##  Am. Indian/ Native Alaskan:  2257   Max.   :89.00   Max.   :90.0  
##  NA's                      : 13861   NA's   :2360    NA's   :146   
##      weight           build         frisked         searched      
##  Min.   : 50.0   heavy   : 43101   Mode :logical   Mode :logical  
##  1st Qu.:150.0   muscular:  2259   FALSE:235667    FALSE:488663   
##  Median :165.0   medium  :308601   TRUE :297244    TRUE :44248    
##  Mean   :168.9   thin    :173552   NA's :0         NA's :0        
##  3rd Qu.:180.0   NA's    :  5398                                  
##  Max.   :400.0                                                    
##  NA's   :1243                                                     
##   contrabn         pistol         riflshot        asltweap      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:523677    FALSE:532249    FALSE:532898    FALSE:532860   
##  TRUE :9234      TRUE :662       TRUE :13        TRUE :51       
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   knifcuti        machgun         othrweap        arstmade      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:528206    FALSE:532908    FALSE:531589    FALSE:500596   
##  TRUE :4705      TRUE :3         TRUE :1322      TRUE :32315    
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   sumissue       sumoffen                     detailcm     
##  Mode :logical   Mode:logical   CPW               :129722  
##  FALSE:505472    NA's:532911    ROBBERY           :116843  
##  TRUE :27439                    BURGLARY          : 60090  
##  NA's :0                        GRAND LARCENY AUTO: 44639  
##                                 CRIMINAL TRESPASS : 37967  
##                                 (Other)           :143648  
##                                 NA's              :     2  
##      perobs           perstop       pf_hands        pf_wall       
##  Min.   :  1.000   Min.   : 2.00   Mode :logical   Mode :logical  
##  1st Qu.:  1.000   1st Qu.:14.00   FALSE:462623    FALSE:520284   
##  Median :  1.000   Median :36.00   TRUE :70288     TRUE :12627    
##  Mean   :  2.436   Mean   :31.73   NA's :0         NA's :0        
##  3rd Qu.:  2.000   3rd Qu.:47.00                                  
##  Max.   :120.000   Max.   :87.00                                  
##  NA's   :3853      NA's   :243                                    
##   pf_grnd         pf_drwep        pf_ptwep        pf_baton      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:531089    FALSE:531611    FALSE:531678    FALSE:532854   
##  TRUE :1822      TRUE :1300      TRUE :1233      TRUE :57       
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   pf_hcuff        pf_pepsp        pf_other        cs_objcs      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:513665    FALSE:532854    FALSE:529421    FALSE:519463   
##  TRUE :19246     TRUE :57        TRUE :3490      TRUE :13448    
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   cs_descr        cs_casng        cs_lkout        cs_cloth      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:444997    FALSE:343055    FALSE:432273    FALSE:507914   
##  TRUE :87914     TRUE :189856    TRUE :100638    TRUE :24997    
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   cs_drgtr        cs_furtv        cs_vcrim        cs_bulge      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:493933    FALSE:258266    FALSE:474002    FALSE:493989   
##  TRUE :38978     TRUE :274645    TRUE :58909     TRUE :38922    
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   cs_other        rf_vcrim        rf_othsw        rf_attir      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:449598    FALSE:463955    FALSE:515019    FALSE:493450   
##  TRUE :83313     TRUE :68956     TRUE :17892     TRUE :39461    
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   rf_vcact        rf_rfcmp        rf_verbl        rf_knowl      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:489273    FALSE:490633    FALSE:529549    FALSE:522919   
##  TRUE :43638     TRUE :42278     TRUE :3362      TRUE :9992     
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   rf_furt         rf_bulg         sb_hdobj        sb_outln      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:323411    FALSE:495579    FALSE:509706    FALSE:530116   
##  TRUE :209500    TRUE :37332     TRUE :23205     TRUE :2795     
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   sb_admis        sb_other        ac_proxm        ac_evasv      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:531450    FALSE:513967    FALSE:412500    FALSE:431677   
##  TRUE :1461      TRUE :18944     TRUE :120411    TRUE :101234   
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   ac_assoc        ac_cgdir        ac_incid        ac_time       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:508725    FALSE:405926    FALSE:213049    FALSE:299354   
##  TRUE :24186     TRUE :126985    TRUE :319862    TRUE :233557   
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   ac_stsnd        ac_other                      forceuse     
##  Mode :logical   Mode :logical   defense of other   :  1016  
##  FALSE:519278    FALSE:514585    defense of self    : 30733  
##  TRUE :13633     TRUE :18326     overcome resistence:  2120  
##  NA's :0         NA's :0         other              : 14421  
##                                  suspected flight   : 13941  
##                                  suspected weapon   :  5071  
##                                  NA's               :465609  
##      inout                     trhsloc          addrpct      
##  outside:417377   neither          :413990   75     : 24408  
##  inside :115534   housing authority: 77373   73     : 22148  
##                   transit authority: 41548   40     : 18276  
##                                              44     : 15414  
##                                              79     : 15294  
##                                              (Other):437366  
##                                              NA's   :     5  
##      sector           xcoord            ycoord      
##  A      : 55505   Min.   : 913844   Min.   :121152  
##  E      : 51528   1st Qu.: 996734   1st Qu.:183094  
##  B      : 51467   Median :1005807   Median :198317  
##  H      : 48822   Mean   :1006808   Mean   :205079  
##  C      : 48820   3rd Qu.:1016476   3rd Qu.:234311  
##  G      : 46189   Max.   :1067249   Max.   :271882  
##  (Other):230580   NA's   :15585     NA's   :15585   
##                   typeofid       othpers         explnstp      
##  photo id             :289131   Mode :logical   Mode :logical  
##  verbal id            :229801   FALSE:430923    FALSE:376      
##  refused to provide id:  8149   TRUE :101988    TRUE :532535   
##  unknown              :  5830   NA's :0         NA's :0        
##                                                                
##                                                                
##                                                                
##      repcmd           revcmd        offunif         offverb       
##  165    : 35820   165    : 29202   Mode :logical   Mode :logical  
##  163    : 27662   163    : 23044   FALSE:131852    FALSE:28973    
##  802    : 13211   802    : 13197   TRUE :401059    TRUE :102879   
##  164    : 12322   120    : 11149   NA's :0         NA's :401059   
##  162    : 11178   162    : 10791                                  
##  (Other):432714   101    :  9134                                  
##  NA's   :     4   (Other):436394                                  
##   officrid        offshld         ac_rept         ac_inves      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:125452    FALSE:1843      FALSE:465902    FALSE:458488   
##  TRUE :6400      TRUE :130009    TRUE :67009     TRUE :74423    
##  NA's :401059    NA's :401059    NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##    radio        
##  Mode :logical  
##  FALSE:403875   
##  TRUE :129036   
##  NA's :0        
##                 
##                 
## 

Save cleaned data

# Write the cleaned data frame to disk under its own name.
save(list = "dat", file = "SFQ_clean.rda")

# To read it back in a later session: load("SFQ_clean.rda")

Look at the number of stops per borough

# Enlarge the bottom margin for the vertical axis labels (las = 3), draw the
# number of stops per city, then put the previous margins back.
oldpar <- par(mar = c(6, 4, 4, 2) + 0.1)
with(dat, barplot(table(city), ylab = "Number of Stops", las = 3))

par(oldpar)

Compare to population (from Wikipedia)

# Population per borough (from Wikipedia), keyed by borough name.
pop <- c(
  Manhattan = 1626159, Brooklyn = 2592149, Bronx = 1418733,
  Queens = 2296175, "Staten Island" = 472621
)

# Pair each stop count with its population by name instead of by position, so
# the ratio stays correct whatever order the levels of dat$city are in.
# Fail loudly if a level has no population figure.
stops <- table(dat$city)
stopifnot(all(names(stops) %in% names(pop)))

# Enlarge the bottom margin for the vertical labels and restore it afterwards.
oldpar <- par(mar = c(6, 4, 4, 2) + 0.1)
barplot(stops / pop[names(stops)] * 100, ylab = "Stops in % of Population",
  las = 3, ylim = c(0, 10))

par(oldpar)

# Count stops per race and abbreviate the level names (minimum length 8) so
# they fit as vertical axis labels.
tbl <- table(dat$race)
tbl <- setNames(tbl, abbreviate(names(tbl), minlength = 8))
barplot(tbl, ylab = "Number of Stops", las = 3)

Compare to NYC population (from Wikipedia)

# Population percentage for each group (figures from Wikipedia).
pop <- setNames(
  c(44.6, 25.1, 27.5, 11.8),
  c("White", "Black", "Hispanic", "Other")
)
# Print the total to check whether the percentages sum to 100.
sum(pop)
## [1] 109

The percentages do not add up to 100, so we rescale them to proportions that sum to 1.

# Rescale the group figures so that they sum to 1.
pop <- prop.table(pop)

Aggregate the stop data so that its race groups match the groups in the population data.

# Map each of the four population groups to the race levels it covers.
race_groups <- list(
  White = "White",
  Black = "Black",
  Hispanic = c("White Hispanic", "Black Hispanic"),
  Other = c("Asian/Pacific Islander", "Am. Indian/ Native Alaskan")
)

# Sum the stop counts within each group, then convert them to proportions.
race_counts <- table(dat$race)
tbl <- vapply(race_groups, function(lv) sum(race_counts[lv]), numeric(1))
tbl <- tbl / sum(tbl)

# Draw population and stop proportions side by side, using the same two gray
# shades for the bars and the legend.
shades <- gray.colors(2)
barplot(rbind(pop, tbl) * 100, beside = TRUE, ylab = "Proportion [%]",
  col = shades, ylim = c(0, 70), main = "Stops in NYC")
legend(x = "topright", legend = c("Population", "Stops"),
  fill = shades, bty = "n")

Association plots (between nominal/binary variables)

# Install vcd (with its dependencies) only when it is not already available,
# then attach it. requireNamespace() checks for the package directly instead
# of searching every cell of the installed.packages() matrix.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd", dependencies = TRUE)
}
library(vcd)
## Loading required package: grid
# Association plot of sex against race, with shading switched on and labels
# abbreviated to 6 characters.
assoc(~ sex + race, data = dat, shade = TRUE, abbreviate_labs = 6)

# Association plot of sex against the reason force was used; the labels are
# rotated and the variable names are hidden.
label_opts <- list(rot_labels = c(25, 90, 0, 90), varnames = FALSE)
assoc(~ sex + forceuse, data = dat, shade = TRUE, labeling_args = label_opts)