Change to the folder with the file (use Session tab or setwd) do not do data prep again…

prep <- FALSE

if(prep) {
  data <- read.csv("Dallas_Police_Public_Data_-_RMS_Incidents-With_GeoLocation.csv")
  dim(data)
  summary(data)

  #' # Start creating a new "clean" data frame
  data_clean <- data[, c("IncidentNum","ReportedDate", "Time1", "UCROffDesc", "PCClass", "City",
    "ZipCode", "Beat", "CompRace", "CompSex", "CompAge", "VictimCond", "Status", "CompState", "GeoLocation")]
  summary(data_clean)

  #' # Here are some examples:
  #'
  #' ## Fix dates/time
  head(data_clean$ReportedDate)

  # install.packages("lubridate")
  library(lubridate)
  d <- parse_date_time(as.character(data_clean$ReportedDate), "%m/%d/%Y %H:%M:%S", tz = "CST")
  # Note: use tz = ":US/Central" on Macs with OS X

  head(d)
  hour(d[1:5])
  day(d[1:5])
  month(d[1:5])

  # put new data back into the data.frame
  data_clean$ReportedDate <- d

  # example of adding time
  d[1]
  d[1] + hours(18)


  #' # Fix nominal variables
  summary(data_clean)
  data_clean$ZipCode <- as.factor(data_clean$ZipCode)
  data_clean$Beat <- as.factor(data_clean$Beat)
  summary(data_clean)

  #' # Fix geolocation
  gl <- as.character(data_clean$GeoLocation)
  start <- regexpr("\\(", gl)
  end <- regexpr("\\)", gl)
  gl <- substr(gl, start+1, end-1)
  #gl[gl==""] <- "NA, NA"
  gl <- strsplit(gl, ", ")
  lon <- sapply(gl, FUN = function(x) as.numeric(x[1]))
  lat <- sapply(gl, FUN = function(x) as.numeric(x[2]))
  data_clean$lon <- lon
  data_clean$lat <- lat
  data_clean$GeoLocation <- NULL # remove

  summary(data_clean)

  ### remove entries with the same incidence number and remove incidence number
  data_clean <- data_clean[!duplicated(data_clean$IncidentNum),]
  data_clean <- data_clean[,-1]

  #' # Save clean data
  save(data_clean, file = "data_clean.rda")
}

now we can always load the cleaned data with

load("data_clean.rda")

Look at rare ZIP codes (mistakes or outliers) using a count table

sort(table(data_clean$ZipCode))
## 
## 48232 72231 72543 74212 74243 75014 75021 75024 75026 75035 75039 75044 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 75050 75063 75082 75126 75221 75255 75261 75288 76234 77521 75074 75075 
##     1     1     1     1     1     1     1     1     1     1     2     2 
## 75141 75146 75032 75048 75119 75154 75180 75182 75213 75006 75043 75060 
##     2     2     3     3     3     3     3     3     3     4     4     4 
## 75137 75001 75087 75181 75081 75041 75062 75159 75149 75052 75150 75007 
##     4     5     5     5     6     7     9    11    12    13    13    14 
## 75134 75080 75040 75115 75089 75019 75042 75116 75051 75088 75246 75205 
##    15    16    17    18    19    20    23    39    59    65   340   367 
## 75234 75244 75251 75249 75252 75209 75233 75226 75230 75225 75248 75236 
##   412   559   572   660   820  1006  1097  1135  1187  1188  1195  1216 
## 75207 75202 75253 75254 75223 75210 75247 75218 75203 75214 75235 75201 
##  1319  1378  1427  1446  1500  1563  1595  1646  1944  1955  2018  2157 
## 75238 75219 75212 75232 75208 75240 75287 75224 75229 75237 75206 75241 
##  2243  2260  2335  2543  2579  2709  2744  2845  2949  3251  3270  3393 
## 75204 75215 75231 75227 75211 75220 75216 75228 75243 75217 
##  3615  3778  3992  4988  5357  5476  5634  5921  6322  7355

Fix empty fields in nominal data

summary(data_clean$City)
##                         ADDISON       ARLINGTON   BALCH SPRINGS 
##            1375               6               1               1 
##      CARROLLTON   COCKRELL HILL        COMMERCE         COPPELL 
##              13               1               1               5 
##               D             DAL           DALAS           DALLA 
##               1               2               2               1 
##          Dallas          DALLAS          DESOTO             DLS 
##            2551          114924               7               2 
##     DUNCANVILLE  FARMERS BRANCH         GARLAND   GRAND PRAIRIE 
##              14               6              15              27 
##   HIGHLAND PARK        HUTCHINS          IRVING       LANCASTER 
##               1               1               4               4 
##        MESQUITE           PLANO  PLEASANT GROVE      RICHARDSON 
##              10               5               1              15 
##        ROCKWALL         ROWLETT      SEAGOVILLE       SUNNYVALE 
##               4              27               3               2 
## UNIVERSITY PARK 
##               1
data_clean$City[data_clean$City == ""] <- NA
data_clean$City[data_clean$City == "D"] <- "DALLAS"
summary(data_clean$City)
##                         ADDISON       ARLINGTON   BALCH SPRINGS 
##               0               6               1               1 
##      CARROLLTON   COCKRELL HILL        COMMERCE         COPPELL 
##              13               1               1               5 
##               D             DAL           DALAS           DALLA 
##               0               2               2               1 
##          Dallas          DALLAS          DESOTO             DLS 
##            2551          114925               7               2 
##     DUNCANVILLE  FARMERS BRANCH         GARLAND   GRAND PRAIRIE 
##              14               6              15              27 
##   HIGHLAND PARK        HUTCHINS          IRVING       LANCASTER 
##               1               1               4               4 
##        MESQUITE           PLANO  PLEASANT GROVE      RICHARDSON 
##              10               5               1              15 
##        ROCKWALL         ROWLETT      SEAGOVILLE       SUNNYVALE 
##               4              27               3               2 
## UNIVERSITY PARK            NA's 
##               1            1375

You still need to fix the other Dallas values by setting them all to DALLAS

Remove “empty” level

data_clean$City <- factor(data_clean$City)
summary(data_clean$City)
##         ADDISON       ARLINGTON   BALCH SPRINGS      CARROLLTON 
##               6               1               1              13 
##   COCKRELL HILL        COMMERCE         COPPELL             DAL 
##               1               1               5               2 
##           DALAS           DALLA          Dallas          DALLAS 
##               2               1            2551          114925 
##          DESOTO             DLS     DUNCANVILLE  FARMERS BRANCH 
##               7               2              14               6 
##         GARLAND   GRAND PRAIRIE   HIGHLAND PARK        HUTCHINS 
##              15              27               1               1 
##          IRVING       LANCASTER        MESQUITE           PLANO 
##               4               4              10               5 
##  PLEASANT GROVE      RICHARDSON        ROCKWALL         ROWLETT 
##               1              15               4              27 
##      SEAGOVILLE       SUNNYVALE UNIVERSITY PARK            NA's 
##               3               2               1            1375

summarize by beat

head(data_clean$Beat)
## [1] 711 211 219 714 412 432
## 234 Levels: 111 112 113 114 115 116 121 122 123 124 125 131 132 133 ... 757
barplot(sort(table(data_clean$Beat), decreasing = T), main = "Entries per Beat")

summary(as.numeric(table(data_clean$Beat)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    21.0   393.0   485.0   508.0   594.5  1238.0

look at offense code per beat

agg <- aggregate(UCROffDesc ~ Beat, FUN = table,data = data_clean)
head(agg)
##   Beat UCROffDesc.ACCIDENTAL INJURY UCROffDesc.AGGRAVATED ASSAULT
## 1  111                            6                             7
## 2  112                            7                             0
## 3  113                            8                             6
## 4  114                            8                            10
## 5  115                           39                             8
## 6  116                           11                             7
##   UCROffDesc.AIRPLANE UCROffDesc.ANIMAL BITE UCROffDesc.ARSON
## 1                   0                      0                0
## 2                   0                      2                2
## 3                   0                      1                0
## 4                   0                      1                1
## 5                   0                      3                2
## 6                   0                      0                1
##   UCROffDesc.ASSAULT UCROffDesc.AUTO THEFT - UUMV UCROffDesc.BURGLARY
## 1                 14                           28                  57
## 2                 17                           37                  84
## 3                 20                           43                  85
## 4                 25                           52                  66
## 5                 41                           34                  56
## 6                 11                           16                  18
##   UCROffDesc.CHILD (OFFENSES AGAINST)
## 1                                   1
## 2                                   2
## 3                                   5
## 4                                   1
## 5                                   0
## 6                                   0
##   UCROffDesc.CRIMINAL MISCHIEF/VANDALISM UCROffDesc.DISORDERLY CONDUCT
## 1                                     48                             8
## 2                                     52                             3
## 3                                     55                             8
## 4                                     93                             9
## 5                                     54                             7
## 6                                     21                             4
##   UCROffDesc.DWI UCROffDesc.EMBEZZLEMENT UCROffDesc.FIREARMS ACCIDENT
## 1              2                       0                            0
## 2              0                       0                            0
## 3              4                       1                            1
## 4              4                       2                            0
## 5              3                      10                            0
## 6              2                       0                            3
##   UCROffDesc.FORGERY & COUNTERFEIT UCROffDesc.FOUND PROPERTY
## 1                                5                        47
## 2                                5                        36
## 3                                0                        40
## 4                                2                        53
## 5                                3                       176
## 6                                6                        19
##   UCROffDesc.FRAUD UCROffDesc.GAMBLING UCROffDesc.HOME ACCIDENT
## 1                2                   0                        1
## 2                6                   0                        9
## 3                1                   0                        5
## 4                1                   0                        8
## 5                7                   1                        4
## 6                2                   0                        1
##   UCROffDesc.INTOXICATION MANSLAUGHTER UCROffDesc.LIQUOR
## 1                                    0                 0
## 2                                    0                 0
## 3                                    0                 0
## 4                                    0                 0
## 5                                    0                 1
## 6                                    0                 0
##   UCROffDesc.LOST PROPERTY UCROffDesc.MOTOR VEHICLE ACCIDENT
## 1                        3                                37
## 2                        3                                31
## 3                        2                                53
## 4                        5                                36
## 5                       32                                49
## 6                        7                                32
##   UCROffDesc.MURDER UCROffDesc.NARCOTICS/DRUGS
## 1                 0                          3
## 2                 0                          0
## 3                 0                          3
## 4                 0                          1
## 5                 0                          3
## 6                 1                          1
##   UCROffDesc.OCCUPATIONAL ACCIDENT UCROffDesc.OTHER OFFENSES
## 1                                0                         5
## 2                                0                        10
## 3                                0                         9
## 4                                0                        23
## 5                                0                        13
## 6                                0                         4
##   UCROffDesc.PROSTITUTION UCROffDesc.ROBBERY UCROffDesc.SUDDEN DEATH
## 1                       0                 27                       1
## 2                       0                  5                      12
## 3                       0                 24                       8
## 4                       0                 19                       6
## 5                       0                 19                       1
## 6                       0                 14                       7
##   UCROffDesc.THEFT UCROffDesc.TRAFFIC FATALITY UCROffDesc.WEAPONS
## 1              137                           0                  1
## 2              187                           0                  0
## 3               88                           0                  0
## 4              185                           0                  0
## 5              143                           0                  0
## 6               62                           0                  0

this count data is ratio so we can divide. Look at updown.

for(i in  7:11) {
  oldpar <- par(mar=c(15,4,4,2)+.1)
  barplot(agg[i,-1]/colMeans(agg[,-1], na.rm = T), las=2,
    main = paste("Beat:", agg[i,1]), ylab = "Lift")
  abline(h=1, col="red")
  par(oldpar)
}

with ratio data we can also do correlation

co <- cor(agg[,-1])

library(seriation)
hmap(co, margins = c(15,15))

look at offense code per beat

agg <- aggregate(CompRace ~ Beat, FUN = table,data = data_clean)
head(agg)
##   Beat CompRace.V1 CompRace.A CompRace.B CompRace.C CompRace.I CompRace.L
## 1  111         176          6         58          0          2         93
## 2  112         119          3         25          2          0         68
## 3  113         163          3         31          7          0        190
## 4  114         162          5         64          4          0        144
## 5  115         322          1        130          2          1        130
## 6  116          93          1         35          4          2         35
##   CompRace.N CompRace.TEST CompRace.U CompRace.W
## 1          0             0          1        104
## 2          1             1          0        291
## 3          0             0          0         76
## 4          0             0          3        229
## 5          0             0          1        122
## 6          1             0          0         79
co <- cor(agg[,-1])

hmap(co, margins = c(15,15))

use corrplot instead

library(corrplot)
corrplot(co, method = "ellipse")

corrplot(co, method = "ellipse", order = "hclust")

look at PCClass code per beat

agg <- aggregate(PCClass ~ Beat, FUN = table,data = data_clean)
head(agg)
##   Beat PCClass.F* PCClass.F1 PCClass.F2 PCClass.F3 PCClass.FS PCClass.FX
## 1  111          1         24         30          3         95          0
## 2  112          1          2         65          6         83          0
## 3  113          2         15         56          3        104          0
## 4  114          3         12         69          6         89          0
## 5  115          2         13         39          7        108          0
## 6  116          0          7         18          6         43          0
##   PCClass.M* PCClass.MA PCClass.MB PCClass.MC PCClass.NC
## 1          0         89         99         35          0
## 2          0        162         90         31          0
## 3          0         82        103         39          0
## 4          0        142        138         63          1
## 5          1        110        102         71          0
## 6          0         36         65         25          0
co <- cor(agg[,-1])

hmap(co, margins = c(15,15))