Brain Injury Awareness Month, observed each March, was established 3 decades ago to educate the public about the incidence of brain injury and the needs of persons with brain injuries and their families (1). Caused by a bump, blow, or jolt to the head, or penetrating head injury, a traumatic brain injury (TBI) can lead to short- or long-term changes affecting thinking, sensation, language, or emotion.
One of every 60 people in the U.S. lives with a TBI-related disability. Moderate and severe TBI can lead to a lifetime of physical, cognitive, emotional, and behavioral changes.
The data this week comes from the CDC and Veterans Brain Injury Center. Additional stats can be found at
This data and cleaning script are primarily from scraping tables out of a PDF. This is a good example of cleaning and organizing tables from PDFs using the pdftools package from rOpenSci. I have included the PDFs this data was scraped from, and there are lots of examples of potential graphs to recreate or improve upon. Try your hand at improving or otherwise learning how to use packages like stringr or tidyr to extract data from messy PDF tables.
# Get the Data
# Option 1: read each table straight from its CSV file
tbi_age <- readr::read_csv("")
tbi_year <- readr::read_csv("")
tbi_military <- readr::read_csv("")

# Option 2: load everything through the tidytuesdayR package.
# NOTE: the 2020 data requires the GitHub version of tidytuesdayR,
# installable with pak::pak("dslc-io/tidytuesdayR").

# tt_load() accepts an ISO-8601 date ...
tuesdata <- tidytuesdayR::tt_load("2020-03-24")
# ... or a year plus a week number
tuesdata <- tidytuesdayR::tt_load(2020, week = 13)

tbi_age <- tuesdata$tbi_age
variable | class | description |
age_group | character | Age group |
type | character | Type of measure |
injury_mechanism | character | Injury mechanism |
number_est | double | Estimated observed cases in 2014 |
rate_est | double | Rate/100,000 in 2014 |
variable | class | description |
injury_mechanism | character | Injury mechanism |
type | character | Type of measure |
year | character | Year |
rate_est | double | Rate/100,000 in each year |
number_est | integer | Estimated observed cases in each year |
variable | class | description |
service | character | Military branch |
component | character | Military component (active, guard, reserve) |
severity | character | Severity/type of TBI |
diagnosed | double | Number diagnosed |
year | integer | Year for observation |
# Hospital TBI data -------------------------------------------------------
# Year labels taken from the header row of the hospitalization table.
#
# Every run of digits in the header text is kept. The result is flattened to
# a plain character vector because it is used further down as column names in
# `separate(into = c("type", all_years))`.
# Splitting the text on newlines first is unnecessary: a run of digits can
# never span a newline, so extracting from the whole string finds the same
# matches without handing a list to str_extract_all().
all_years <- read_html("") %>%
  html_node(xpath = '//*[@id="acc-panel-1"]/div/div/table/thead/tr') %>%
  html_text() %>%
  str_extract_all("[0-9]+") %>%
  unlist()
# Page holding the hospitalization table
hospital_url <- ""

# Parse the page once so every row can be pulled from the same document.
# `hospital_url` is the variable defined above; a plain `url` is only assigned
# much later in the script (in the military section).
raw_html_hospital <- read_html(hospital_url)
# Pull the text of one body row out of the accordion-panel table.
#
# input_html: parsed HTML document (e.g. the result of read_html()).
# tab_num:    1-based position of the <tr> inside the table body.
#
# Returns the row's text as a single string. Callers use it with map2_chr()
# and then split it on newlines into one field per table cell.
get_table_row <- function(input_html, tab_num) {
  row_xpath <- glue::glue(
    '//*[@id="acc-panel-1"]/div/div/table/tbody/tr[{tab_num}]'
  )
  node <- html_node(input_html, xpath = row_xpath)

  html_text(node)
}
# Reshape the first `n_rows` body rows of a scraped table into long format.
#
# raw_html:        parsed HTML page containing the table.
# n_rows:          number of body rows to read (rows 1..n_rows).
# year_cols:       character vector of year labels, in table column order.
# strip_footnotes: when TRUE, remove the first footnote marker
#                  (††, ‡‡ or §§) from each row label.
#
# Returns a tibble with columns `type` (row label), `year` (double) and
# `value` (double, parsed from the cell text).
tidy_tbi_rows <- function(raw_html, n_rows, year_cols,
                          strip_footnotes = TRUE) {
  wide <- tibble(
    raw_data = map_chr(seq_len(n_rows), ~ get_table_row(raw_html, .x))
  ) %>%
    # Each row's text is newline-separated: label first, then one cell per year
    separate(raw_data, into = c("type", year_cols), sep = "\\\n")

  if (strip_footnotes) {
    wide <- mutate(wide, type = str_remove(type, "††|‡‡|§§"))
  }

  wide %>%
    pivot_longer(cols = -type, values_to = "value", names_to = "year") %>%
    mutate(value = parse_number(value),
           year = as.double(year))
}

# Only the first four rows are read here, and labels are kept as-is
tbi_hospital <- tidy_tbi_rows(raw_html_hospital, n_rows = 4,
                              year_cols = all_years,
                              strip_footnotes = FALSE)

# TBI Deaths --------------------------------------------------------------
url_deaths <- ""
raw_html_deaths <- read_html(url_deaths)
tbi_deaths <- tidy_tbi_rows(raw_html_deaths, n_rows = 7,
                            year_cols = all_years)

# ED Visits ---------------------------------------------------------------
url_ed <- ""
raw_html_ed <- read_html(url_ed)
tbi_ed_visits <- tidy_tbi_rows(raw_html_ed, n_rows = 7,
                               year_cols = all_years)

# Hospitalizations --------------------------------------------------------
url_hosp <- ""
raw_html_hosp <- read_html(url_hosp)
tbi_hosp <- tidy_tbi_rows(raw_html_hosp, n_rows = 7,
                          year_cols = all_years)
# PDF report --------------------------------------------------------------
# Location of the PDF report
pdf_url <- ""

# Extract the report's text; the result is indexed by page further down
raw_pdf_text <- pdf_url %>%
  pdftools::pdf_text()
# General table scraping function -----------------------------------------
# Scrape one age-group table from a page of the PDF report.
#
# table_number: index into `raw_pdf_text` of the page holding the table.
#
# Returns a tibble with one row per detected table row and unnamed columns
# (V1, V2, ...), type-converted. For every column after the first, only the
# text before the first underscore is kept, which drops the parenthesised
# part that followed each value in the PDF.
# Stops with "No table found!" when two or fewer table rows are detected.
clean_pdf_table <- function(table_number) {
  input_text <- chuck(raw_pdf_text, table_number)

  # Table rows end in ")" or "*" immediately before the line break
  raw_table_text <- str_split(input_text, "\\)\\\n|\\*\\\n", simplify = TRUE)

  # A chunk is a table row when its first four characters contain an
  # age-group label or the start of "Total"
  vec_length <- c("0-1", "0-4", "5-1", "15-", "25-", "35-", "45-", "55-", "65-",
                  "75+", "Tot") %>%
    paste(collapse = "|")
  vec_subset <- str_detect(str_sub(raw_table_text, 1, 4), vec_length)
  small_tab <- raw_table_text[vec_subset]

  if (length(small_tab) <= 2) stop("No table found!")

  # Glue each "(...)" onto the preceding value with "_", then turn the
  # remaining whitespace into "|" delimiters and drop thousands separators
  pre_tab <- small_tab %>%
    str_squish() %>%
    str_replace_all(" \\(", "_(") %>%
    str_replace_all("\\s", "|") %>%
    str_replace_all(",", "")

  text_con <- textConnection(pre_tab)
  # textConnection() returns an already-open connection that read.csv() will
  # not close for us
  on.exit(close(text_con), add = TRUE)
  output_df <- read.csv(text_con, header = FALSE, sep = "|",
                        stringsAsFactors = FALSE)

  # Strip footnote markers; this also coerces every column to character
  tibble_df <- output_df %>%
    as_tibble() %>%
    mutate_all(str_replace, "##|¶¶", "")

  # Keep only the part of each value that precedes the "_(...)" suffix
  tibble_fixed <- tibble_df %>%
    mutate_at(.vars = vars(2:last_col()),
              str_extract, "[^_]+")

  tibble_typed <- suppressMessages(type_convert(tibble_fixed))

  tibble_typed
}
# PDF Tab 1 ---------------------------------------------------------------
# This was a first attempt before moving to a function
# Split page 9 of the report into individual lines
raw_table_text <- str_split(raw_pdf_text[[9]], "\\\n", simplify = TRUE)

# Elements 9-19 are the table rows. Glue each "(...)" onto the preceding
# value with "_" and use "|" as the field delimiter.
pre_table <- raw_table_text %>%
  .[9:19] %>%
  str_squish() %>%
  str_replace_all(" \\(", "_(") %>%
  str_replace_all("\\s", "|")

text_con <- textConnection(pre_table)
data_table <- read.csv(text_con, sep = "|", header = FALSE,
                       stringsAsFactors = FALSE)
# textConnection() hands back an open connection; release it once read
close(text_con)

tab_1 <- data_table %>%
  set_names(nm = c("age_group", "ed_visit", "ed_visit_rate", "hospitilizations",
                   "hospitilization_rate", "death_no", "death_rate", "total_edhd",
                   "total_edhd_rate")) %>%
  as_tibble() %>%
  mutate_all(as.character) %>%
  # Rate columns carry a "_(...)" suffix; keep only the leading value
  mutate_at(vars(contains("rate")), str_extract, "[^_]+") %>%
  mutate_at(vars(ed_visit:total_edhd_rate), parse_number) %>%
  add_column(type = "ED Visits, Hospitalizations, and deaths", .after = "age_group")
# PDF Tab 2 ---------------------------------------------------------------
# Tables 2-4 share one layout: an age-group column followed by a
# number/rate pair for each injury mechanism. Defining the names once keeps
# the three tables aligned when they are row-bound later.
mechanism_col_names <- c(
  "age_group", "num_crash", "rate_crash", "num_falls",
  "rate_falls", "num_struck", "rate_struck", "num_inj_unspecified",
  "rate_inj_unspecified", "num_self_harm", "rate_self_harm",
  "num_assault", "rate_assault", "num_other", "rate_other"
)

tab_2 <- clean_pdf_table(10) %>%
  set_names(nm = mechanism_col_names) %>%
  add_column(type = "Emergency Department Visit", .after = "age_group")

# PDF Tab 3 ---------------------------------------------------------------
tab_3 <- clean_pdf_table(11) %>%
  set_names(nm = mechanism_col_names) %>%
  add_column(type = "Hospitalizations", .after = "age_group")

# PDF Tab 4 ---------------------------------------------------------------
tab_4 <- clean_pdf_table(12) %>%
  set_names(nm = mechanism_col_names) %>%
  add_column(type = "Deaths", .after = "age_group")
# PDF Combo Table ---------------------------------------------------------
# Stack the three mechanism tables into one wide table
combo_tab <- list(tab_2, tab_3, tab_4) %>%
  bind_rows()

# Long table of counts: discard the rate columns, stack the num_* columns,
# then split "num_<mechanism>" into its prefix and the mechanism key
num_df <- combo_tab %>%
  select(-contains("rate")) %>%
  pivot_longer(
    cols = contains("num"),
    values_to = "number_est",
    names_to = "injury_mechanism"
  ) %>%
  separate(
    injury_mechanism,
    sep = "_",
    into = c("count_type", "injury_mechanism"),
    extra = "merge"
  )

# Long table of rates, built the same way from the rate_* columns
rate_df <- combo_tab %>%
  select(-contains("num")) %>%
  pivot_longer(
    cols = contains("rate"),
    values_to = "rate_est",
    names_to = "injury_mechanism"
  ) %>%
  separate(
    injury_mechanism,
    sep = "_",
    into = c("count_type", "injury_mechanism"),
    extra = "merge"
  )
# Combine counts and rates into one long table and replace the short
# mechanism keys with their full descriptions.
long_combo_df <- left_join(num_df, rate_df,
                           by = c("age_group", "type", "injury_mechanism")) %>%
  # Drops count_type.x / count_type.y left over from the two inputs
  select(-contains("count_type")) %>%
  mutate(
    # factor() is used purely as a key -> label lookup ...
    injury_mechanism = factor(
      injury_mechanism,
      levels = c("crash", "falls", "struck", "inj_unspecified", "self_harm",
                 "assault", "other"),
      labels = c("Motor Vehicle Crashes", "Unintentional Falls",
                 "Unintentionally struck by or against an object",
                 "Other unintentional injury, mechanism unspecified",
                 "Intentional self-harm", "Assault",
                 "Other or no mechanism specified")
    ),
    # ... so convert back to plain character afterwards
    injury_mechanism = as.character(injury_mechanism)
  )
# Other PDF Tables --------------------------------------------------------
# Scrape the data table printed with one of the report's figures and return
# it in long format (one row per injury mechanism and year).
#
# page_number: index into `raw_pdf_text` of the page to scrape.
#
# NOTE(review): this definition appears to have lost several lines (the
# closing ")" of `titles`, the tails of two pipelines, and the closing "}").
# It does not parse as shown; restore it from the original script.
get_graph_table <- function(page_number){
# Row labels as printed in the PDF, in table order
titles <- c(
"Motor vehicle crashes", "Unintentional falls", "Unintentionally struck by or against an object",
"Other unintentional injury, mechanism unspecified","Intentional self-harm",
"Assault", "Other or no mechanism specified", "Total"
# NOTE(review): the closing ")" of `titles` is missing here
# Alternation pattern that matches any of the row labels
titles_collapse <- titles %>% paste0(collapse = "|")
# Strip footnote markers, split the page into lines, replace each row label
# with the placeholder "type", drop thousands separators and use "|" as the
# field delimiter
raw_table_text <- raw_pdf_text[[page_number]] %>%
str_remove_all("††\\\n|§§|‡‡\\\n|††|‡‡") %>%
str_split("\\\n", simplify = TRUE) %>%
str_squish() %>%
str_replace_all(titles_collapse, "type") %>%
str_remove_all(",") %>%
str_squish() %>%
str_replace_all("\\s", "|")
# Table rows are the lines whose first four characters contain "type"
vec_subset <- str_detect(str_sub(raw_table_text, 1, 4), "type")
small_tab <- raw_table_text[vec_subset]
if(length(small_tab) <= 2) stop("No table found!")
# NOTE(review): `text_con` is never closed in the visible code
text_con <- textConnection(small_tab)
output_df <- read.csv(text_con, header = FALSE, sep = "|", stringsAsFactors = FALSE)
# NOTE(review): the step(s) piped after `output_df` are missing
tibble_df <- output_df %>%
tibble_typed <- suppressMessages(type_convert(tibble_df))
# Selects the line(s) starting with "FIGURE".
# NOTE(review): the call piped after this is missing; the next line compares
# the result with TRUE, so it presumably yields a logical flag — confirm
type_measure <- raw_table_text[str_detect(str_sub(raw_table_text, 1, 6), "FIGURE")] %>%
# Name of the value column in the output: "rate_est" or "number_est"
measure_label <- if_else(type_measure == TRUE, "rate_est", "number_est")
tibble_typed %>%
set_names(nm = c("injury_mechanism", as.character(2006:2014))) %>%
# Restore the real row labels (in table order) and tag the measure by page
mutate(injury_mechanism = titles[1:nrow(tibble_typed)],
type = case_when(
page_number %in% c(15,16) ~ "Emergency Department Visit",
page_number %in% c(17,18) ~ "Hospitalizations",
page_number %in% c(19,20) ~ "Deaths",
)) %>%
# One row per mechanism/year; the value column is named by `measure_label`
pivot_longer(cols = c(-type, -injury_mechanism),
names_to = "year", values_to = measure_label)
# NOTE(review): the closing "}" of the function is missing here
# For each measure, scrape its two pages and join the resulting tables on
# their shared keys.
ed_time <- reduce(
  map(15:16, get_graph_table),
  left_join,
  by = c("injury_mechanism", "type", "year")
)

hosp_time <- reduce(
  map(17:18, get_graph_table),
  left_join,
  by = c("injury_mechanism", "type", "year")
)

death_time <- reduce(
  map(19:20, get_graph_table),
  left_join,
  by = c("injury_mechanism", "type", "year")
)

# Stack the three measures into one table
long_time_df <- list(ed_time, hosp_time, death_time) %>%
  bind_rows()
# Saving Tables -----------------------------------------------------------
# Write the two long tables into the week's folder
write_csv(
  long_combo_df,
  here::here("2020", "2020-03-24", "tbi_age.csv")
)
write_csv(
  long_time_df,
  here::here("2020", "2020-03-24", "tbi_year.csv")
)
# Military TBI ------------------------------------------------------------
url <- ""

# Download one year's DoD TBI report into the week's folder as
# "dod_tbi_<year>.pdf".
#
# year: report year, interpolated into both the source URL and the file name.
#
# Called for its side effect; returns download.file()'s status code.
download_pdf <- function(year) {
  # Local `url` shadows the global one for the duration of the call
  url <- glue::glue("{year}_jun-21-2018_v1.0_2018-07-26_0.pdf")
  download.file(
    url = url,
    destfile = here::here("2020", "2020-03-24",
                          glue::glue("dod_tbi_{year}.pdf")),
    # PDFs are binary; without "wb" the file is corrupted on Windows
    mode = "wb"
  )
}

# Fetch every year that is parsed below
2006:2014 %>%
  walk(download_pdf)
# Parse one page of a previously downloaded DoD TBI report into a tibble of
# diagnoses by service, component and severity.
#
# year:        report year; selects the file "dod_tbi_<year>.pdf" and is
#              stored in the output's `year` column.
# page_number: page of the PDF to parse; pages 2-5 are mapped to a service
#              label below.
#
# NOTE(review): this definition appears to have lost several pieces (the tail
# of the first pipeline, the closing ")" of case_when(), the condition of one
# if_else(), the body of filter(), and the closing "}"). It does not parse as
# shown; restore it from the original script.
get_dod_tbi <- function(year, page_number){
# Path of the PDF saved for this year
dod_file <- here::here("2020","2020-03-24", glue::glue("dod_tbi_{year}.pdf"))
raw_dod_text <- pdftools::pdf_text(dod_file)
# Drop thousands separators and split the page into lines
# NOTE(review): the step piped after str_split() is missing; the code below
# renames column 1, so it presumably built a one-column data frame — confirm
text_df <- raw_dod_text[[page_number]] %>%
str_remove_all(",") %>%
str_split("\\\n", simplify = F) %>%
# Service label implied by the page being parsed
service_lab <- case_when(
page_number == 2 ~ "Army",
page_number == 3 ~ "Navy",
page_number == 4 ~ "Air Force",
page_number == 5 ~ "Marines"
# NOTE(review): the closing ")" of case_when() is missing here
text_df %>%
rename("col1" = 1) %>%
as_tibble() %>%
# Split each line into up to six fields
separate(col1, sep = " ", into = c("service", "component", "severity", "diagnosed", "count", "count2")) %>%
mutate_all(str_squish) %>%
# Drop the first row
slice(-1) %>%
# Where a field is shorter than 2 characters, take the value from the
# neighbouring field instead; the numeric count ends up in `diagnosed`
mutate(count = if_else(str_length(count) < 2, count2, count),
severity = if_else(str_length(severity) < 2, diagnosed, severity),
diagnosed = as.double(count),
# Empty strings become NA so fill() can propagate labels below
service = if_else(str_length(service) == 0, NA_character_, service),
component = if_else(str_length(component) == 0, NA_character_, component),
year = year,
# NOTE(review): the condition of this if_else() is missing
service = if_else(, service_lab, service)) %>%
select(-count2, -count) %>%
# NOTE(review): the filter() predicate is truncated
filter(! %>%
# Propagate service/component labels down, then up, over the NA gaps
fill(service, .direction = "downup") %>%
fill(component, .direction = "downup")
# NOTE(review): the closing "}" of the function is missing here
# Smoke test on a single year/page before running the full grid
test_df <- get_dod_tbi(2006, 2)

# Parse every year x page combination (pages 2-5 are the four services) and
# stack the per-page tibbles into one table.
# NOTE: this reuses the name `all_years`, replacing the character vector of
# year labels defined in the hospital section.
all_years <- crossing(
  year = 2006:2014,
  page_number = 2:5
) %>%
  mutate(data = map2(.x = year, .y = page_number, .f = get_dod_tbi)) %>%
  select(data) %>%
  # `data` is a list-column of tibbles; expand it so `year`, `diagnosed`,
  # `severity`, `service` and `component` become regular columns
  unnest(cols = data)

# Quick visual check: diagnoses over time, one panel per service/component
all_years %>%
  ggplot(aes(x = year, y = diagnosed, color = severity)) +
  geom_point() +
  geom_line() +
  facet_grid(service ~ component)

all_years %>%
  write_csv(here::here("2020", "2020-03-24", "tbi_military.csv"))