Skip to content
This repository has been archived by the owner on Jul 24, 2024. It is now read-only.

Integrate build_US_setup into pipeline and... #271

Merged
merged 20 commits into from
May 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ jobs:
env:
CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }}
run: |
git lfs pull
Rscript local_install.R
cd test
/home/app/python_venv/bin/pytest run_tests.py
Expand Down
151 changes: 121 additions & 30 deletions R/scripts/build_US_setup.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,38 @@
##
# @file
# @brief Creates mobility and geodata for US
#
# @details
#
# ## Configuration Items
#
# ```yaml
# spatial_setup:
# base_path: <path to directory>
# modeled_states: <list of state postal codes> e.g. MD, CA, NY
# mobility: <path to file relative to base_path> optional; default is 'mobility.csv'
# geodata: <path to file relative to base_path> optional; default is 'geodata.csv'
# popnodes: <string> optional; default is 'population'
#
# importation:
# census_api_key: <string, optional> default is environment variable CENSUS_API_KEY. Environment variable is preferred so you don't accidentally commit your key.
# ```
#
# ## Input Data
#
# None
#
# ## Output Data
#
# * {spatial_setup::base_path}/{spatial_setup::mobility}
# * {spatial_setup::base_path}/{spatial_setup::geodata}
#

## @cond

library(dplyr)
library(tidyr)
library(tidycensus)

option_list = list(
optparse::make_option(c("-c", "--config"), action="store", default=Sys.getenv("CONFIG_PATH"), type='character', help="path to the config file"),
eclee25 marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -9,49 +42,96 @@ option_list = list(
opt = optparse::parse_args(optparse::OptionParser(option_list=option_list))

config <- covidcommon::load_config(opt$c)
if (is.na(config)) {
if (length(config) == 0) {
stop("no configuration found -- please set CONFIG_PATH environment variable or use the -c command flag")
}

outdir <- config$spatial_setup$base_path
filterUSPS <- config$spatial_setup$modeled_states

commute_data <- readr::read_csv(paste(opt$p,"sample_data","united-states-commutes","commute_data.csv",sep='/'))
census_data <- readr::read_csv(paste(opt$p,"sample_data","united-states-commutes","census_tracts_2010.csv", sep = '/'))

# Get census key
census_key = Sys.getenv("CENSUS_API_KEY")
if(length(config$importation$census_api_key) != 0)
{
census_key = config$importation$census_api_key
}
if(census_key == "")
{
stop("no census key found -- please set CENSUS_API_KEY environment variable or specify importation::census_api_key in config file")
}
tidycensus::census_api_key(key = census_key)

# CENSUS DATA
census_data <- tidycensus::get_acs(geography="county", state=filterUSPS,
variables="B01003_001", year=config$spatial_setup$census_year,
keep_geo_vars=TRUE, geometry=FALSE, show_call=TRUE)
census_data <- census_data %>%
filter(USPS %in% filterUSPS) %>%
select(USPS,GEOID,POP10) %>%
mutate(GEOID = substr(GEOID,1,5))
rename(population=estimate, geoid=GEOID) %>%
select(geoid, population) %>%
mutate(geoid = substr(geoid,1,5))

# Add USPS column
data(fips_codes)
fips_geoid_codes <- mutate(fips_codes, geoid=paste0(state_code,county_code)) %>%
group_by(geoid) %>%
summarize(USPS=unique(state))
census_data <- census_data %>% left_join(fips_geoid_codes, by="geoid")

# Make each territory one county.
# Puerto Rico is the only one in the 2018 ACS estimates right now. Aggregate it.
# Keeping the other territories in the aggregation just in case they're there in the future.
name_changer <- setNames(
unique(census_data$GEOID),
unique(census_data$GEOID)
unique(census_data$geoid),
unique(census_data$geoid)
)
name_changer[grepl("^60",name_changer)] <- "60000"
name_changer[grepl("^66",name_changer)] <- "66000"
name_changer[grepl("^69",name_changer)] <- "69000"
name_changer[grepl("^72",name_changer)] <- "72000"
name_changer[grepl("^78",name_changer)] <- "78000"
name_changer[grepl("^60",name_changer)] <- "60000" # Amerian Samoa
name_changer[grepl("^66",name_changer)] <- "66000" # Guam
name_changer[grepl("^69",name_changer)] <- "69000" # Northern Mariana Islands
name_changer[grepl("^72",name_changer)] <- "72000" # Puerto Rico
name_changer[grepl("^78",name_changer)] <- "78000" # Virgin Islands

census_data <- census_data %>%
mutate(GEOID = name_changer[GEOID]) %>%
group_by(GEOID) %>%
summarize(USPS = unique(USPS),POP10 = sum(POP10)) %>%
arrange(POP10)
mutate(geoid = name_changer[geoid]) %>%
group_by(geoid) %>%
summarize(USPS = unique(USPS), population = sum(population))

# Territory populations (except Puerto Rico) taken from from https://www.census.gov/prod/cen2010/cph-2-1.pdf
terr_census_data <- readr::read_csv(paste(opt$p,"sample_data","united-states-commutes","census_tracts_island_areas_2010.csv",sep='/'))

census_data <- terr_census_data %>%
filter(length(filterUSPS) == 0 | ((USPS %in% filterUSPS) & !(USPS %in% census_data)))%>%
rbind(census_data)

# Sort by population
census_data <- census_data %>%
arrange(population)

if (!is.null(config$spatial_setup$popnodes)) {
names(census_data)[names(census_data) == "population"] <- config$spatial_setup$popnodes
}


if (length(config$spatial_setup$geodata) > 0) {
geodata_file <- config$spatial_setup$geodata
} else {
geodata_file <- 'geodata.csv'
}
write.csv(file = file.path(outdir, geodata_file), census_data, row.names=FALSE)
print(paste("Wrote geodata file:", file.path(outdir, geodata_file)))

# COMMUTE DATA
commute_data <- readr::read_csv(paste(opt$p,"sample_data","united-states-commutes","commute_data.csv",sep='/'))
eclee25 marked this conversation as resolved.
Show resolved Hide resolved
commute_data <- commute_data %>%
mutate(OFIPS = substr(OFIPS,1,5), DFIPS = substr(DFIPS,1,5)) %>%
mutate(OFIPS = name_changer[OFIPS], DFIPS = name_changer[DFIPS]) %>%
filter(OFIPS %in% census_data$GEOID, DFIPS %in% census_data$GEOID) %>%
filter(OFIPS %in% census_data$geoid, DFIPS %in% census_data$geoid) %>%
group_by(OFIPS,DFIPS) %>%
summarize(FLOW = sum(FLOW)) %>%
filter(OFIPS != DFIPS)

padding_table <- tibble(
OFIPS = census_data$GEOID,
DFIPS = census_data$GEOID,
OFIPS = census_data$geoid,
DFIPS = census_data$geoid,
FLOW = 0
)

Expand All @@ -62,20 +142,31 @@ t_commute_table <- tibble(
)

rc <- padding_table %>% bind_rows(commute_data) %>% bind_rows(t_commute_table)

if(opt$w){
rc <- rc %>%pivot_wider(OFIPS,names_from=DFIPS,values_from=FLOW, values_fill=c("FLOW"=0),values_fn = list(FLOW=sum))
mobility_file <- 'mobility.txt'
} else if (length(config$spatial_setup$mobility) > 0) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the behavior here will be that the config filename (if it ends with .txt) overwrites the -w option in the config call

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this will be good for backward compatibility

mobility_file <- config$spatial_setup$mobility
} else {
mobility_file <- 'mobility.csv'
}

print(outdir)
if(opt$w){
if(!isTRUE(all(rc$OFIPS == census_data$GEOID))){
if(endsWith(mobility_file, '.txt')) {
rc <- rc %>% pivot_wider(OFIPS,names_from=DFIPS,values_from=FLOW, values_fill=c("FLOW"=0),values_fn = list(FLOW=sum))
if(!isTRUE(all(rc$OFIPS == census_data$geoid))){
print(rc$OFIPS)
print(census_data$geoid)
stop("There was a problem generating the mobility matrix")
}
write.table(file = file.path(outdir,'mobility.txt'), as.matrix(rc[,-1]), row.names=FALSE, col.names = FALSE, sep = " ")
} else {
write.table(file = file.path(outdir, mobility_file), as.matrix(rc[,-1]), row.names=FALSE, col.names = FALSE, sep = " ")
} else if(endsWith(mobility_file, '.csv')) {
names(rc) <- c("ori","dest","amount")
rc <- rc[rc$ori != rc$dest,]
write.csv(file = file.path(outdir,'mobility.csv'), rc, row.names=FALSE)
write.csv(file = file.path(outdir, mobility_file), rc, row.names=FALSE)
} else {
stop("Only .txt and .csv extensions supported for mobility matrix. Please check config's spatial_setup::mobility.")
}
names(census_data) <- c("geoid","USPS","pop2010")
write.csv(file = file.path(outdir,'geodata.csv'), census_data,row.names=FALSE)

print(paste("Wrote mobility file:", file.path(outdir, mobility_file)))

## @endcond
Loading