-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Adjust script for raw to warehouse workflow * Adjust queries to iterate through years * Remove unneeded packages * Remove purrr * Remove graphing file * Add newline for lintr * Add note about 2023 change in dbt docs * Add additive logic * Update dbt docs * Edit line length
- Loading branch information
1 parent
91e90e6
commit 2027c68
Showing
3 changed files
with
115 additions
and
29 deletions.
There are no files selected for viewing
44 changes: 44 additions & 0 deletions
44
aws-s3/scripts-ccao-data-raw-us-east-1/spatial-environment-major_road.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
library(aws.s3) | ||
library(dplyr) | ||
library(geoarrow) | ||
library(osmdata) | ||
library(sf) | ||
source("utils.R") | ||
|
||
# Pull major roads for Cook County from OpenStreetMap and store them in the
# raw S3 bucket as spatial parquet files

# Bucket name comes from the environment; outputs live under
# <bucket>/spatial/environment
AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
output_bucket <- file.path(
  AWS_S3_RAW_BUCKET,
  "spatial",
  "environment"
)

# Four-digit year of the run date, as a character string
current_year <- format(Sys.Date(), "%Y")
|
||
##### Major roads #####
# Query the OpenStreetMap API for major roads in Cook County, taking one
# snapshot per year from 2014 through the current year.

# `current_year` is a character string produced by strftime(); convert it
# explicitly instead of relying on `:` to coerce it
years <- 2014:as.integer(current_year)

# Iterate over the years
for (year in years) {
  remote_file <- file.path(
    output_bucket, "major_road",
    paste0("year=", year),
    paste0("major_road-", year, ".parquet")
  )

  # Only query years whose snapshot is not already in the raw bucket
  if (!aws.s3::object_exists(remote_file)) {
    # `datetime` requests the data as of January 1st of `year`
    osm_lines <- opq(
      bbox = "Cook County, IL",
      datetime = paste0(year, "-01-01T00:00:00Z"),
      timeout = 900
    ) %>%
      add_osm_feature(
        key = "highway",
        value = c("motorway", "trunk", "primary")
      ) %>%
      osmdata_sf() %>%
      .$osm_lines

    # A query that matches no line features leaves `osm_lines` NULL; fail with
    # a clear message rather than an opaque dispatch error in select()
    if (is.null(osm_lines)) {
      stop(
        "OpenStreetMap returned no major road lines for year ", year,
        call. = FALSE
      )
    }

    # Keep id and name, store geometry in EPSG:4326 and add an EPSG:3435 copy
    osm_roads <- osm_lines %>%
      select(osm_id, name) %>%
      st_transform(4326) %>%
      mutate(geometry_3435 = st_transform(geometry, 3435))

    geoarrow::write_geoparquet(osm_roads, remote_file)
  }
}
95 changes: 66 additions & 29 deletions
95
aws-s3/scripts-ccao-data-warehouse-us-east-1/spatial-environment-major_road.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,75 @@ | ||
library(arrow) | ||
library(aws.s3) | ||
library(dplyr) | ||
library(geoarrow) | ||
library(glue) | ||
library(noctua) | ||
library(osmdata) | ||
library(purrr) | ||
library(sf) | ||
source("utils.R") | ||
|
||
# This script queries OpenStreetMap for major roads in Cook County and | ||
# saves them as a spatial parquet | ||
# This script is designed to ingest spatial data on major roads for each year | ||
# from 2014 to the present, simplify it for efficiency, and store a | ||
# deduplicated, aggregated version of this data in a warehouse bucket. | ||
# | ||
# We take an additive approach so that distance-to-road values stay consistent | ||
# with earlier pin-level data. Roads that first appear in the 2015 data are | ||
# added to the existing 2014 roads, and that combined set becomes the 2015 | ||
# data. When the same osm_id appears in both 2014 and 2015, we keep the 2014 | ||
# record. | ||
|
||
# Instantiate S3 bucket names | ||
AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") | ||
AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") | ||
output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment") | ||
current_year <- strftime(Sys.Date(), "%Y") | ||
|
||
##### Major roads ##### | ||
# Query OpenStreetMap API for major roads in Cook | ||
remote_file <- file.path( | ||
output_bucket, "major_road", | ||
paste0("year=", current_year), | ||
paste0("major_road-", current_year, ".parquet") | ||
) | ||
|
||
if (!aws.s3::object_exists(remote_file)) { | ||
osm_roads <- opq(bbox = "Cook County, IL") %>% | ||
add_osm_feature( | ||
key = "highway", | ||
value = c("motorway", "trunk", "primary") | ||
) %>% | ||
osmdata_sf() %>% | ||
.$osm_lines %>% | ||
select(osm_id, name) %>% | ||
st_transform(4326) %>% | ||
mutate(geometry_3435 = st_transform(geometry, 3435)) | ||
|
||
geoarrow::write_geoparquet(osm_roads, remote_file) | ||
|
||
# Year range to process, plus the running (cumulative) road set carried from
# one year to the next
current_year <- as.integer(format(Sys.Date(), "%Y"))
years <- 2014:current_year
running_roads <- NULL

# Build the S3 path of the major road parquet file for one year in a bucket
major_road_path <- function(bucket, yr) {
  file.path(
    bucket, "spatial",
    "environment", "major_road",
    paste0("year=", yr),
    paste0("major_road-", yr, ".parquet")
  )
}

# Process each year in order so earlier records take precedence
for (year in years) {
  # Read the raw file for this year and simplify the 3435 linestrings
  yearly_roads <- major_road_path(AWS_S3_RAW_BUCKET, year) %>%
    read_geoparquet_sf() %>%
    mutate(geometry_3435 = st_simplify(geometry_3435, dTolerance = 10))

  if (is.null(running_roads)) {
    # First available year: written as-is and used to seed the running set
    roads_out <- yearly_roads
  } else {
    # Tag carried-over rows 0 and this year's rows 1 so that, after sorting,
    # the carried-over row is the one kept for any repeated osm_id
    tagged_new <- mutate(yearly_roads, temporal = 1)

    roads_out <- bind_rows(running_roads, tagged_new) %>%
      arrange(osm_id, temporal) %>%
      group_by(osm_id) %>%
      slice(1) %>%
      ungroup() %>%
      select(-temporal)
  }

  # Everything written this year counts as "earlier" data in the next pass
  running_roads <- mutate(roads_out, temporal = 0)

  # Write this year's cumulative road set to the warehouse bucket
  geoarrow::write_geoparquet(
    roads_out,
    major_road_path(AWS_S3_WAREHOUSE_BUCKET, year)
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters