#' Change to the folder that contains the data file (use the Session tab or setwd()); do not run the data preparation again.
# Flag controlling whether the (slow) data preparation below is re-run.
prep <- FALSE
if(prep) {
data <- read.csv("Dallas_Police_Public_Data_-_RMS_Incidents-With_GeoLocation.csv")
dim(data)
summary(data)
#' # Start creating a new "clean" data frame
# Keep only the columns used in the analysis below.
data_clean <- data[, c("IncidentNum","ReportedDate", "Time1", "UCROffDesc", "PCClass", "City",
"ZipCode", "Beat", "CompRace", "CompSex", "CompAge", "VictimCond", "Status", "CompState", "GeoLocation")]
summary(data_clean)
#' # Here are some examples:
#'
#' ## Fix dates/time
head(data_clean$ReportedDate)
# install.packages("lubridate")
library(lubridate)
# Use an IANA/Olson time zone name: "CST" is not a valid tz database entry on
# all platforms (the original code needed a different spelling on macOS).
# "America/Chicago" is portable and handles the CST/CDT transition correctly.
d <- parse_date_time(as.character(data_clean$ReportedDate), "%m/%d/%Y %H:%M:%S", tz = "America/Chicago")
head(d)
hour(d[1:5])
day(d[1:5])
month(d[1:5])
# put new data back into the data.frame
data_clean$ReportedDate <- d
# example of adding time
d[1]
d[1] + hours(18)
#' # Fix nominal variables
summary(data_clean)
# ZIP code and beat are nominal identifiers, not numbers.
data_clean$ZipCode <- as.factor(data_clean$ZipCode)
data_clean$Beat <- as.factor(data_clean$Beat)
summary(data_clean)
#' # Fix geolocation
# GeoLocation contains "(<number>, <number>)"; extract the part between the
# parentheses and split it on ", ".
gl <- as.character(data_clean$GeoLocation)
start <- regexpr("\\(", gl)
end <- regexpr("\\)", gl)
gl <- substr(gl, start+1, end-1)
#gl[gl==""] <- "NA, NA"
gl <- strsplit(gl, ", ")
# vapply (instead of sapply) guarantees a numeric vector even for empty input.
# NOTE(review): Socrata-style GeoLocation fields are usually "(lat, lon)";
# confirm against the raw data that the first number really is the longitude.
lon <- vapply(gl, FUN = function(x) as.numeric(x[1]), FUN.VALUE = numeric(1))
lat <- vapply(gl, FUN = function(x) as.numeric(x[2]), FUN.VALUE = numeric(1))
data_clean$lon <- lon
data_clean$lat <- lat
data_clean$GeoLocation <- NULL # remove
summary(data_clean)
### remove entries with the same incidence number and remove incidence number
data_clean <- data_clean[!duplicated(data_clean$IncidentNum),]
data_clean$IncidentNum <- NULL # drop by name; safer than positional [,-1]
#' # Save clean data
save(data_clean, file = "data_clean.rda")
}
#' Now we can always load the cleaned data with:
# Restores the saved object `data_clean` into the workspace.
load("data_clean.rda")
#' Look at rare ZIP codes (mistakes or outliers) using a count table:
# Count incidents per ZIP code, sorted ascending: ZIP codes with only a
# handful of entries are likely data-entry mistakes or outliers.
sort(table(data_clean$ZipCode))
##
## 48232 72231 72543 74212 74243 75014 75021 75024 75026 75035 75039 75044
## 1 1 1 1 1 1 1 1 1 1 1 1
## 75050 75063 75082 75126 75221 75255 75261 75288 76234 77521 75074 75075
## 1 1 1 1 1 1 1 1 1 1 2 2
## 75141 75146 75032 75048 75119 75154 75180 75182 75213 75006 75043 75060
## 2 2 3 3 3 3 3 3 3 4 4 4
## 75137 75001 75087 75181 75081 75041 75062 75159 75149 75052 75150 75007
## 4 5 5 5 6 7 9 11 12 13 13 14
## 75134 75080 75040 75115 75089 75019 75042 75116 75051 75088 75246 75205
## 15 16 17 18 19 20 23 39 59 65 340 367
## 75234 75244 75251 75249 75252 75209 75233 75226 75230 75225 75248 75236
## 412 559 572 660 820 1006 1097 1135 1187 1188 1195 1216
## 75207 75202 75253 75254 75223 75210 75247 75218 75203 75214 75235 75201
## 1319 1378 1427 1446 1500 1563 1595 1646 1944 1955 2018 2157
## 75238 75219 75212 75232 75208 75240 75287 75224 75229 75237 75206 75241
## 2243 2260 2335 2543 2579 2709 2744 2845 2949 3251 3270 3393
## 75204 75215 75231 75227 75211 75220 75216 75228 75243 75217
## 3615 3778 3992 4988 5357 5476 5634 5921 6322 7355
#' Fix empty fields in nominal data:
# City is messy free text: many misspellings of DALLAS and empty strings.
summary(data_clean$City)
## ADDISON ARLINGTON BALCH SPRINGS
## 1375 6 1 1
## CARROLLTON COCKRELL HILL COMMERCE COPPELL
## 13 1 1 5
## D DAL DALAS DALLA
## 1 2 2 1
## Dallas DALLAS DESOTO DLS
## 2551 114924 7 2
## DUNCANVILLE FARMERS BRANCH GARLAND GRAND PRAIRIE
## 14 6 15 27
## HIGHLAND PARK HUTCHINS IRVING LANCASTER
## 1 1 4 4
## MESQUITE PLANO PLEASANT GROVE RICHARDSON
## 10 5 1 15
## ROCKWALL ROWLETT SEAGOVILLE SUNNYVALE
## 4 27 3 2
## UNIVERSITY PARK
## 1
# Recode the empty string as a proper missing value (NA); the unused ""
# level remains until the factor levels are recomputed below.
data_clean$City[data_clean$City == ""] <- NA
# "D" maps onto the existing DALLAS level, so plain assignment works.
data_clean$City[data_clean$City == "D"] <- "DALLAS"
summary(data_clean$City)
## ADDISON ARLINGTON BALCH SPRINGS
## 0 6 1 1
## CARROLLTON COCKRELL HILL COMMERCE COPPELL
## 13 1 1 5
## D DAL DALAS DALLA
## 0 2 2 1
## Dallas DALLAS DESOTO DLS
## 2551 114925 7 2
## DUNCANVILLE FARMERS BRANCH GARLAND GRAND PRAIRIE
## 14 6 15 27
## HIGHLAND PARK HUTCHINS IRVING LANCASTER
## 1 1 4 4
## MESQUITE PLANO PLEASANT GROVE RICHARDSON
## 10 5 1 15
## ROCKWALL ROWLETT SEAGOVILLE SUNNYVALE
## 4 27 3 2
## UNIVERSITY PARK NA's
## 1 1375
#' You still need to fix the other DALLAS spellings (e.g., DAL, DALAS, DALLA, Dallas, DLS) by setting them all to DALLAS.
#' Remove the "empty" level:
# Calling factor() on a factor recomputes the levels, dropping the now-unused
# "" and "D" (droplevels() would do the same).
data_clean$City <- factor(data_clean$City)
summary(data_clean$City)
## ADDISON ARLINGTON BALCH SPRINGS CARROLLTON
## 6 1 1 13
## COCKRELL HILL COMMERCE COPPELL DAL
## 1 1 5 2
## DALAS DALLA Dallas DALLAS
## 2 1 2551 114925
## DESOTO DLS DUNCANVILLE FARMERS BRANCH
## 7 2 14 6
## GARLAND GRAND PRAIRIE HIGHLAND PARK HUTCHINS
## 15 27 1 1
## IRVING LANCASTER MESQUITE PLANO
## 4 4 10 5
## PLEASANT GROVE RICHARDSON ROCKWALL ROWLETT
## 1 15 4 27
## SEAGOVILLE SUNNYVALE UNIVERSITY PARK NA's
## 3 2 1 1375
#' Summarize by beat:
head(data_clean$Beat)
## [1] 711 211 219 714 412 432
## 234 Levels: 111 112 113 114 115 116 121 122 123 124 125 131 132 133 ... 757
# Entries per beat, largest first. Spell out TRUE instead of the
# re-assignable shorthand T.
barplot(sort(table(data_clean$Beat), decreasing = TRUE), main = "Entries per Beat")
# Distribution of the per-beat counts.
summary(as.numeric(table(data_clean$Beat)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.0 393.0 485.0 508.0 594.5 1238.0
#' Look at offense code per beat:
# One row per Beat; FUN = table expands into one count column per offense
# description (columns named UCROffDesc.<level>).
agg <- aggregate(UCROffDesc ~ Beat, FUN = table,data = data_clean)
head(agg)
## Beat UCROffDesc.ACCIDENTAL INJURY UCROffDesc.AGGRAVATED ASSAULT
## 1 111 6 7
## 2 112 7 0
## 3 113 8 6
## 4 114 8 10
## 5 115 39 8
## 6 116 11 7
## UCROffDesc.AIRPLANE UCROffDesc.ANIMAL BITE UCROffDesc.ARSON
## 1 0 0 0
## 2 0 2 2
## 3 0 1 0
## 4 0 1 1
## 5 0 3 2
## 6 0 0 1
## UCROffDesc.ASSAULT UCROffDesc.AUTO THEFT - UUMV UCROffDesc.BURGLARY
## 1 14 28 57
## 2 17 37 84
## 3 20 43 85
## 4 25 52 66
## 5 41 34 56
## 6 11 16 18
## UCROffDesc.CHILD (OFFENSES AGAINST)
## 1 1
## 2 2
## 3 5
## 4 1
## 5 0
## 6 0
## UCROffDesc.CRIMINAL MISCHIEF/VANDALISM UCROffDesc.DISORDERLY CONDUCT
## 1 48 8
## 2 52 3
## 3 55 8
## 4 93 9
## 5 54 7
## 6 21 4
## UCROffDesc.DWI UCROffDesc.EMBEZZLEMENT UCROffDesc.FIREARMS ACCIDENT
## 1 2 0 0
## 2 0 0 0
## 3 4 1 1
## 4 4 2 0
## 5 3 10 0
## 6 2 0 3
## UCROffDesc.FORGERY & COUNTERFEIT UCROffDesc.FOUND PROPERTY
## 1 5 47
## 2 5 36
## 3 0 40
## 4 2 53
## 5 3 176
## 6 6 19
## UCROffDesc.FRAUD UCROffDesc.GAMBLING UCROffDesc.HOME ACCIDENT
## 1 2 0 1
## 2 6 0 9
## 3 1 0 5
## 4 1 0 8
## 5 7 1 4
## 6 2 0 1
## UCROffDesc.INTOXICATION MANSLAUGHTER UCROffDesc.LIQUOR
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 0
## UCROffDesc.LOST PROPERTY UCROffDesc.MOTOR VEHICLE ACCIDENT
## 1 3 37
## 2 3 31
## 3 2 53
## 4 5 36
## 5 32 49
## 6 7 32
## UCROffDesc.MURDER UCROffDesc.NARCOTICS/DRUGS
## 1 0 3
## 2 0 0
## 3 0 3
## 4 0 1
## 5 0 3
## 6 1 1
## UCROffDesc.OCCUPATIONAL ACCIDENT UCROffDesc.OTHER OFFENSES
## 1 0 5
## 2 0 10
## 3 0 9
## 4 0 23
## 5 0 13
## 6 0 4
## UCROffDesc.PROSTITUTION UCROffDesc.ROBBERY UCROffDesc.SUDDEN DEATH
## 1 0 27 1
## 2 0 5 12
## 3 0 24 8
## 4 0 19 6
## 5 0 19 1
## 6 0 14 7
## UCROffDesc.THEFT UCROffDesc.TRAFFIC FATALITY UCROffDesc.WEAPONS
## 1 137 0 1
## 2 187 0 0
## 3 88 0 0
## 4 185 0 0
## 5 143 0 0
## 6 62 0 0
#' This count data is on a ratio scale, so we can divide. Look at the lift above/below the per-offense mean (red line at 1).
# Lift per offense type for rows 7-11 of agg: each beat's counts divided by
# the column means over all beats. Lift 1 (red line) means "average beat".
col_means <- colMeans(agg[,-1], na.rm = TRUE)  # loop-invariant: hoisted; TRUE not T
for(i in 7:11) {
  oldpar <- par(mar=c(15,4,4,2)+.1)  # wide bottom margin for rotated labels
  barplot(agg[i,-1]/col_means, las=2,
    main = paste("Beat:", agg[i,1]), ylab = "Lift")
  abline(h=1, col="red")
  par(oldpar)  # restore the previous plotting parameters
}
#' With ratio data we can also compute correlations:
# Correlation between the offense-type count columns across beats.
co <- cor(agg[,-1])
library(seriation)
# Seriated heat map of the correlation matrix; wide margins for long labels.
hmap(co, margins = c(15,15))
#' Look at complainant race per beat:
# Same idea for complainant race: per-beat counts, one column per race code
# (columns named CompRace.<level>).
agg <- aggregate(CompRace ~ Beat, FUN = table,data = data_clean)
head(agg)
## Beat CompRace.V1 CompRace.A CompRace.B CompRace.C CompRace.I CompRace.L
## 1 111 176 6 58 0 2 93
## 2 112 119 3 25 2 0 68
## 3 113 163 3 31 7 0 190
## 4 114 162 5 64 4 0 144
## 5 115 322 1 130 2 1 130
## 6 116 93 1 35 4 2 35
## CompRace.N CompRace.TEST CompRace.U CompRace.W
## 1 0 0 1 104
## 2 1 1 0 291
## 3 0 0 0 76
## 4 0 0 3 229
## 5 0 0 1 122
## 6 1 0 0 79
# Correlation between the race-count columns, shown as a seriated heat map.
co <- cor(agg[,-1])
hmap(co, margins = c(15,15))
#' Use corrplot instead:
library(corrplot)
# Ellipse glyphs encode the sign and strength of each correlation.
corrplot(co, method = "ellipse")
# Reorder variables with hierarchical clustering to group similar ones.
corrplot(co, method = "ellipse", order = "hclust")
#' Look at PCClass (penal code class) per beat:
# Per-beat counts of penal-code classes (columns named PCClass.<level>).
agg <- aggregate(PCClass ~ Beat, FUN = table,data = data_clean)
head(agg)
## Beat PCClass.F* PCClass.F1 PCClass.F2 PCClass.F3 PCClass.FS PCClass.FX
## 1 111 1 24 30 3 95 0
## 2 112 1 2 65 6 83 0
## 3 113 2 15 56 3 104 0
## 4 114 3 12 69 6 89 0
## 5 115 2 13 39 7 108 0
## 6 116 0 7 18 6 43 0
## PCClass.M* PCClass.MA PCClass.MB PCClass.MC PCClass.NC
## 1 0 89 99 35 0
## 2 0 162 90 31 0
## 3 0 82 103 39 0
## 4 0 142 138 63 1
## 5 1 110 102 71 0
## 6 0 36 65 25 0
# Correlation between the class-count columns, shown as a seriated heat map.
co <- cor(agg[,-1])
hmap(co, margins = c(15,15))