-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkilling_data_cleaning.R
371 lines (280 loc) · 12.3 KB
/
killing_data_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
# ------------------------------------- Loading Packages and Data ------------------------------------------
# Loading in libraries
library(janitor)
library(lubridate)
library(tidycensus)
library(tidyverse)
library(zoo)
# Locations of the two input CSV files, relative to the project directory
killing_data_link <- "data/killing_data/Mapping Police Violence.csv"
agency_locations_link <- "data/misconduct_data/data_agency-reference-list.csv"

# Resolving each location with here::here() and reading the CSV it points to
killing_data <- killing_data_link %>%
  here::here() %>%
  read_csv()
agency_locations <- agency_locations_link %>%
  here::here() %>%
  read_csv()
# ------------------------------------- Cleaning Data Process ----------------------------------------------
# Most recent date covered by the data; update this whenever the CSV is refreshed
newest_date <- "2023-10-20"

# Filtering for Louisiana killings and cleaning
la_killing <- killing_data %>%
  # Cleaning names
  clean_names() %>%
  # Filtering for Louisiana killings
  filter(state == "LA") %>%
  # Splitting the county at " Parish": the text before it becomes "parish",
  # anything after it goes to a throwaway "extra" column
  separate_wider_delim(
    county,
    delim = " Parish",
    names = c("parish", "extra"),
    too_few = "align_start"
  ) %>%
  # Removing "extra" column
  select(-extra) %>%
  mutate(
    # Temporary numeric copy of age, taken before missing ages are relabelled
    # as text below. The age bands must be compared as numbers: once "age" is
    # a character column, `age < 18` would compare strings, not ages.
    # Non-numeric values become NA here, so the coercion warning is silenced.
    age_years = suppressWarnings(as.numeric(age)),
    # Fixing demographic variables
    race = ifelse(is.na(race), "Unknown Race", race),
    race = ifelse(race == "Unknown race", "Unknown Race", race),
    gender = ifelse(is.na(gender), "Unknown Gender", gender),
    # From here on "age" is character whenever any age was missing
    age = ifelse(is.na(age), "Unknown Age", age),
    # Fixing and defining year and month variables
    date = mdy(date),
    year = year(date),
    # NOTE(review): despite its name this holds only the month number
    year_month = month(date),
    # Creating an age category; unknown ages stay NA
    age_category = case_when(
      age_years < 18 ~ "<18",
      age_years >= 18 & age_years < 35 ~ "18 - 34",
      age_years >= 35 & age_years < 55 ~ "35 - 54",
      age_years >= 55 ~ "55+",
      TRUE ~ NA_character_
    ),
    # Manual parish corrections
    parish = ifelse(parish == "Dallas", "Rapides", parish),
    parish = str_replace(parish, "Saint", "St."),
    parish = ifelse(parish == "Acadiana", "Acadia", parish),
    # %in% (not ==) so a missing city leaves the parish untouched instead of
    # turning it into NA.
    # NOTE(review): Monroe is usually associated with Ouachita Parish -
    # confirm this override against the source records.
    parish = ifelse(city %in% "Monroe", "St. Tammany", parish)
  ) %>%
  # Dropping the helper column used only for the age bands
  select(-age_years) %>%
  # Removing the "Omarr Jackson" record that is coded as White (presumably a
  # duplicate or miscoded entry - confirm). %in% keeps rows whose name is
  # missing, which `name == ...` would have silently dropped.
  filter(!(name %in% "Omarr Jackson" & race == "White"))
# Loading in census data
# The Census API key is read from the CENSUS_API_KEY environment variable
# (for example set in ~/.Renviron) so the credential is not stored in the
# script.
# NOTE(review): a key that was previously committed to version control should
# be treated as exposed and replaced.
census_key <- Sys.getenv("CENSUS_API_KEY")
if (!nzchar(census_key)) {
  stop(
    "Census API key not found: set the CENSUS_API_KEY environment variable.",
    call. = FALSE
  )
}
census_api_key(census_key)
# Variable catalogue for the 2020 "pl" summary file
vars_2020 <- load_variables(2020, "pl")
# Filter census data for race
race_vars <- vars_2020 %>%
  filter(concept == "RACE")
# Variable codes of the race table, requested from the API further down
v <- race_vars$name
# Parish-level population counts (total, white alone, any part black) built
# from the 2020 decennial "pl" file
census_data <- get_decennial(
  geography = "county",
  variables = v,
  year = "2020",
  sumfile = "pl",
  state = "Louisiana"
) %>%
  # Standardising column names
  clean_names() %>%
  # "name" becomes "county" and the variable code takes over the name "name",
  # so it can be matched against race_vars
  rename(
    county = name,
    name = variable
  ) %>%
  # Attaching the race variable metadata (including its label) to each code
  left_join(race_vars, by = "name") %>%
  # Keeping only what the reshape needs
  select(county, value, label) %>%
  # One row per county, one column per race label
  pivot_wider(
    names_from = "label",
    values_from = "value",
    values_fn = sum
  ) %>%
  # Turning the labels into usable column names
  clean_names() %>%
  # Adding up every column whose name mentions "black"
  mutate(any_part_black = rowSums(across(contains("black")))) %>%
  # Keeping and renaming the population columns used later
  select(
    parish = county,
    total,
    white_alone = total_population_of_one_race_white_alone,
    any_part_black
  ) %>%
  # Splitting at " Parish" so only the parish name remains in "parish"
  separate_wider_delim(
    parish,
    delim = " Parish",
    names = c("parish", "extra"),
    too_few = "align_start"
  ) %>%
  # Discarding the leftover text
  select(-extra)
# ------------------------------------- Data Analysis Process ----------------------------------------------
# Killings counted per parish, with the count turned into a display label
# ("1 Person Killed" / "N People Killed")
killings_per_parish <- la_killing %>%
  tabyl(parish) %>%
  mutate(n = paste0(n, ifelse(n == 1, " Person Killed", " People Killed")))

# One row per killing: parish, "name, age" label, and the circumstances text
description_data <- la_killing %>%
  select(parish, name, age, description = circumstances) %>%
  unite("name_age", name, age, sep = ", ")

# Frequency tables of the three demographic variables
demographic_race <- tabyl(la_killing, race)
demographic_gender <- tabyl(la_killing, gender)
demographic_age <- tabyl(la_killing, age_category)
# Share of people killed who were Black, as a formatted percentage string
percent_killed_black <- demographic_race %>%
  filter(race == "Black") %>%
  adorn_pct_formatting() %>%
  pull(percent)

# Share of the Louisiana population that is any part Black (a proportion)
percent_la_black <- with(census_data, sum(any_part_black) / sum(total))

# Share of people killed who were male, as a formatted percentage string
percent_killed_male <- demographic_gender %>%
  filter(gender == "Male") %>%
  adorn_pct_formatting() %>%
  pull(percent)

# Mean age of the people killed; ages that cannot be read as integers
# (such as "Unknown Age") become NA and are left out of the mean
average_age_killed <- la_killing$age %>%
  as.integer() %>%
  mean(na.rm = TRUE)
# Killing rates per 100,000 residents by parish, overall and by race
killings_rate_demographics <- la_killing %>%
  # Killings per parish with one column per race, plus a "Total" column
  tabyl(parish, race) %>%
  adorn_totals(where = "col") %>%
  # Attaching the parish population counts
  left_join(census_data, by = "parish") %>%
  # Killings per 100,000 residents of the matching population group
  mutate(
    total_kill_rate = 100000 * (Total / total),
    black_kill_rate = 100000 * (Black / any_part_black),
    white_kill_rate = 100000 * (White / white_alone)
  ) %>%
  # Black rate relative to the white rate
  mutate(ratio_bw = black_kill_rate / white_kill_rate) %>%
  # Keeping the rate columns only
  select(parish, total_kill_rate, black_kill_rate, white_kill_rate, ratio_bw)
# Statewide killings per 100,000 residents: everyone, Black, and white
killing_rate_total <- 1e5 * nrow(la_killing) / sum(census_data$total)
killing_rate_black <- 1e5 * sum(la_killing$race %in% "Black") /
  sum(census_data$any_part_black)
killing_rate_white <- 1e5 * sum(la_killing$race %in% "White") /
  sum(census_data$white_alone)

# Black rate relative to the white rate, printed for inspection
killing_ratio_bw <- killing_rate_black / killing_rate_white
killing_ratio_bw
# Parish rate table sorted from highest to lowest rate, once per rate column
parish_most_total_kill_rate <- arrange(
  killings_rate_demographics,
  desc(total_kill_rate)
)
parish_most_black_kill_rate <- arrange(
  killings_rate_demographics,
  desc(black_kill_rate)
)
parish_most_white_kill_rate <- arrange(
  killings_rate_demographics,
  desc(white_kill_rate)
)

# Crosstabs of parish against gender, age category, and race
gender_by_parish <- tabyl(la_killing, parish, gender)
age_by_parish <- tabyl(la_killing, parish, age_category)
race_by_parish <- tabyl(la_killing, parish, race)
# Number of months the data collection has occurred
date1 <- as.Date("2013-01-01")
date2 <- as.Date(newest_date) # Change for the newest update
# Whole months elapsed between the two dates; a trailing partial month is
# not counted here
num_months <- interval(date1, date2) %/% months(1)
num_months

# Months without a police killing in Louisiana
# Every calendar month touched by the collection window, including the final
# partial month. Killings are matched by calendar month ("%Y-%m"), so the
# comparison set has to be calendar months too; subtracting from the
# whole-month count above would come out one month short.
collection_months <- format(
  seq(floor_date(date1, "month"), floor_date(date2, "month"), by = "month"),
  "%Y-%m"
)
# Calendar months in which at least one killing was recorded
killing_months <- unique(format(as.Date(la_killing$date), "%Y-%m"))
# setdiff() also ignores missing dates and dates outside the window, which
# would otherwise be counted as months with a killing
months_no_killing <- length(setdiff(collection_months, killing_months))
# Number of calendar months in the window with at least one killing
length(intersect(collection_months, killing_months))
# Running (cumulative) killing counts by year for each demographic, transposed
# so that each row is the year or one category and each column is one year.
# NOTE(review): the column ranges below depend on which categories occur in
# the data and on their order; a category outside the range is left as yearly
# counts, and a missing endpoint column makes the call fail - confirm.

# By race
race_killing_per_year <- la_killing %>%
  tabyl(year, race) %>%
  mutate(across(.cols = Asian:White, .fns = cumsum)) %>%
  t()
# Only the race table has its column names removed
colnames(race_killing_per_year) <- NULL

# By gender
gender_killing_per_year <- la_killing %>%
  tabyl(year, gender) %>%
  mutate(across(.cols = Female:Male, .fns = cumsum)) %>%
  t()

# By age category
age_killing_per_year <- la_killing %>%
  tabyl(year, age_category) %>%
  mutate(across(.cols = `<18`:`55+`, .fns = cumsum)) %>%
  t()
# Mean age of the people killed in each year; ages that are not numbers
# (such as "Unknown Age") become NA and are excluded from the mean
ave_age_killed_per_year <- la_killing %>%
  mutate(age = as.numeric(age)) %>%
  group_by(year) %>%
  summarize(
    mean_age = mean(age, na.rm = TRUE)
  )

# Printing the overall mean age, then the yearly table
mean(as.numeric(la_killing$age), na.rm = TRUE)
ave_age_killed_per_year
# Armed status by race (source table for the bar chart)
arm_status_by_race <- la_killing %>%
  # Every value containing "Allegedly" is collapsed into "Allegedly Armed";
  # all other values are kept as they are
  mutate(
    allegedly_armed = ifelse(
      str_detect(allegedly_armed, "Allegedly"),
      "Allegedly Armed",
      allegedly_armed
    )
  ) %>%
  # Crosstab with a "Total" column across races
  tabyl(allegedly_armed, race) %>%
  adorn_totals(where = "col")
# Fleeing status by race (source table for the bar chart)
fleeing_status <- la_killing %>%
  # Keeping 2015 onwards (fleeing was not recorded before then)
  filter(year >= 2015) %>%
  # "Fleeing" only when wapo_flee is present and is not "Not Fleeing";
  # a missing value counts as "Not Fleeing"
  mutate(
    fleeing = if_else(
      !is.na(wapo_flee) & wapo_flee != "Not Fleeing",
      "Fleeing",
      "Not Fleeing"
    )
  ) %>%
  # Crosstab with a "Total" row across races
  tabyl(race, fleeing) %>%
  adorn_totals(where = "row")

# The same table with counts converted by adorn_percentages()
# (called with its default settings)
pct_fleeing_status <- adorn_percentages(fleeing_status)
# Violent versus non-violent encounters
violent_crime_distribution <- la_killing %>%
  # Keeping 2017 onwards (encounter types were not recorded before then)
  filter(year >= 2017) %>%
  # Two-way split: the two "Part 1 Violent Crime" encounter types count as
  # "Violent Crime", everything else (including missing) as "Non-Violent Crime"
  mutate(
    crime_status = if_else(
      encounter_type %in% c(
        "Part 1 Violent Crime",
        "Part 1 Violent Crime/Domestic Disturbance"
      ),
      "Violent Crime",
      "Non-Violent Crime"
    )
  ) %>%
  tabyl(crime_status)
# Printing the distribution
violent_crime_distribution

# Frequency table of the mental illness indicator
mental_health <- tabyl(la_killing, signs_of_mental_illness)
# Killings per police agency and race (source table for the department graphs)
killings_per_department <- la_killing %>%
  mutate(
    # Normalising ", " to "," and splitting, so each killing carries a vector
    # of the agencies listed for it
    agency_name = agency_responsible %>%
      str_replace_all(", ", ",") %>%
      as.character() %>%
      str_split(","),
    # Stripping a leading or trailing double quote and any parentheses
    agency_name = map(
      agency_name,
      function(agency) str_remove_all(agency, '^"|"$|[()]')
    )
  ) %>%
  # One row per killing and agency
  unnest(agency_name) %>%
  # Crosstab with a "Total" column across races
  tabyl(agency_name, race) %>%
  adorn_totals(where = "col")

# Printing the five agencies with the highest count in the "White" column
killings_per_department %>%
  arrange(desc(White)) %>%
  head(5)
# Which agencies from the reference list appear in the killing data
departments_represented <- killings_per_department %>%
  select(agency_name, Total) %>%
  # Names that do not already contain "Parish" get "Sheriff's Office" expanded
  # to "Parish Sheriff's Office"
  mutate(
    agency_name = ifelse(
      str_detect(agency_name, "Parish"),
      agency_name,
      gsub("Sheriff's Office", "Parish Sheriff's Office", agency_name)
    )
  ) %>%
  # Full merge with the agency list on "agency_name", keeping unmatched rows
  # from both sides
  merge(agency_locations, by = "agency_name", all = TRUE) %>%
  # Rows without a killing total came only from the agency list.
  # NOTE(review): this replaces any "agency_slug" column already present in
  # agency_locations - confirm that is intended.
  mutate(
    agency_slug = ifelse(
      is.na(Total),
      "Not Represented in the Killing Data",
      "Represented in the Killing Data"
    )
  )
# Table used for mapping: date, location, and responsible agency per killing
mapping_department_killings <- la_killing %>%
  # Combining latitude and longitude into one space-separated "latlong"
  # column while keeping the two original columns
  unite("latlong", latitude, longitude, sep = " ", remove = FALSE) %>%
  select(date, latlong, agency_responsible) %>%
  # Oldest killing first
  arrange(date)

# Printing the first 25 names as one comma-separated string
toString(la_killing$name[1:25])
# Named officers, one "officer name, date" label per officer and killing
officers_killing <- la_killing %>%
  mutate(
    # Removing parenthesised text together with the whitespace around it
    officers = gsub("\\s*\\([^\\)]*\\)\\s*", "", officer_names, perl = TRUE),
    # Splitting the remaining text on " and " or on commas
    officers = strsplit(officers, "\\s* and \\s*|,\\s*")
  ) %>%
  # One row per officer
  unnest(officers) %>%
  # Leaving out killings with no named officer
  drop_na(officers) %>%
  # Pairing each officer with the date the killing occurred
  mutate(officer_date = paste0(officers, ", ", date)) %>%
  select(officer_date)
# Printing the result
officers_killing
# Charge status distribution
charge_status <- la_killing %>%
  # Binary "officer_charged": "Criminal Charges" when the text mentions
  # "charged" (case-insensitive), otherwise "No Known Charges".
  # A missing value also counts as "No Known Charges"; without `missing =`
  # it would stay NA and add a third category to the table.
  mutate(
    officer_charged = if_else(
      str_detect(tolower(officer_charged), "charged"),
      "Criminal Charges",
      "No Known Charges",
      missing = "No Known Charges"
    )
  ) %>%
  tabyl(officer_charged)
# Distribution of official dispositions, grouped into a few categories
disposition_status <- la_killing %>%
  # Lower-casing first so the keyword checks are case-insensitive
  mutate(disposition_official = tolower(disposition_official)) %>%
  mutate(
    # The first matching rule wins, so the order of the checks matters
    disposition_fixed = case_when(
      str_detect(disposition_official, "charged") ~ "Charged",
      str_detect(disposition_official, "pending|under") ~ "Pending Investigation",
      str_detect(disposition_official, "justified") ~ "Justified",
      str_detect(disposition_official, "cleared") ~ "Cleared",
      str_detect(disposition_official, "family awarded") ~ "Family Awarded Money",
      str_detect(disposition_official, "unreported") ~ "Unreported",
      # Missing values and the literal "unknown" share one category
      disposition_official %in% c("unknown", NA) ~ "Unknown",
      # Anything else keeps its lower-cased text
      TRUE ~ disposition_official
    )
  ) %>%
  tabyl(disposition_fixed)