From 2027c68014f703167870599d78265fc475d7b8e4 Mon Sep 17 00:00:00 2001
From: wagnerlmichael <93889413+wagnerlmichael@users.noreply.github.com>
Date: Thu, 16 Nov 2023 16:41:10 -0600
Subject: [PATCH] Simplify major roads feature (#230)

* Adjust script for raw to warehouse workflow
* Adjust queries to iterate through years
* Remove unneeded packages
* Remove purrr
* Remove graphing file
* Add newline for lintr
* Add note about 2023 change in dbt docs
* Add additive logic
* Update dbt docs
* Edit line length
---
Reviewer notes (ignored by `git am`; not part of the commit message):

- Line breaks and indentation of this patch were reconstructed from a
  whitespace-collapsed copy. Added/removed/context line counts agree with
  the hunk headers below; exact leading whitespace is a best-effort
  reconstruction, so verify against the target tree before applying.
- Raw script: for each year from 2014 to the current year it queries
  OpenStreetMap as of January 1st of that year (`datetime` argument) and
  writes one parquet per year, skipping any year whose S3 object already
  exists.
- Raw script: `current_year` is a character string there, so
  `2014:current_year` relies on `:` coercing it to numeric. The warehouse
  script uses `as.integer()` for the same value.
- Warehouse script: reads the raw parquet for every year from 2014 to the
  current year with no existence check, so the raw script has to have
  produced all of those files first. `read_geoparquet_sf()` is not defined
  in this patch -- presumably it comes from utils.R; confirm.
- Warehouse script: only `geometry_3435` is simplified (dTolerance = 10);
  the `geometry` column is written as read.
- Warehouse script: de-duplication keeps one row per `osm_id`, preferring
  rows carried over from earlier years (temporal = 0) over rows from the
  year being processed (temporal = 1).

 .../spatial-environment-major_road.R          | 44 +++++++++
 .../spatial-environment-major_road.R          | 95 +++++++++++++------
 dbt/models/spatial/docs.md                    |  5 +
 3 files changed, 115 insertions(+), 29 deletions(-)
 create mode 100644 aws-s3/scripts-ccao-data-raw-us-east-1/spatial-environment-major_road.R

diff --git a/aws-s3/scripts-ccao-data-raw-us-east-1/spatial-environment-major_road.R b/aws-s3/scripts-ccao-data-raw-us-east-1/spatial-environment-major_road.R
new file mode 100644
index 000000000..f39271ab4
--- /dev/null
+++ b/aws-s3/scripts-ccao-data-raw-us-east-1/spatial-environment-major_road.R
@@ -0,0 +1,44 @@
+library(aws.s3)
+library(dplyr)
+library(geoarrow)
+library(osmdata)
+library(sf)
+source("utils.R")
+
+# This script queries OpenStreetMap for major roads in Cook County and
+# saves them as a spatial parquet
+AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
+output_bucket <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment")
+current_year <- strftime(Sys.Date(), "%Y")
+
+##### Major roads #####
+# Query OpenStreetMap API for major roads in Cook
+# Create a sequence of years from 2014 to the current year
+years <- 2014:current_year
+
+# Iterate over the years
+for (year in years) {
+  remote_file <- file.path(
+    output_bucket, "major_road",
+    paste0("year=", year),
+    paste0("major_road-", year, ".parquet")
+  )
+
+  if (!aws.s3::object_exists(remote_file)) {
+    # Update the datetime in the opq function
+    osm_roads <- opq(bbox = "Cook County, IL",
+                     datetime = paste0(year, "-01-01T00:00:00Z"),
+                     timeout = 900) %>%
+      add_osm_feature(
+        key = "highway",
+        value = c("motorway", "trunk", "primary")
+      ) %>%
+      osmdata_sf() %>%
+      .$osm_lines %>%
+      select(osm_id, name) %>%
+      st_transform(4326) %>%
+      mutate(geometry_3435 = st_transform(geometry, 3435))
+
+    geoarrow::write_geoparquet(osm_roads, remote_file)
+  }
+}
diff --git a/aws-s3/scripts-ccao-data-warehouse-us-east-1/spatial-environment-major_road.R b/aws-s3/scripts-ccao-data-warehouse-us-east-1/spatial-environment-major_road.R
index f182ac947..bdaf4c1c0 100644
--- a/aws-s3/scripts-ccao-data-warehouse-us-east-1/spatial-environment-major_road.R
+++ b/aws-s3/scripts-ccao-data-warehouse-us-east-1/spatial-environment-major_road.R
@@ -1,38 +1,75 @@
+library(arrow)
 library(aws.s3)
 library(dplyr)
 library(geoarrow)
-library(glue)
-library(noctua)
 library(osmdata)
-library(purrr)
 library(sf)
 source("utils.R")
 
-# This script queries OpenStreetMap for major roads in Cook County and
-# saves them as a spatial parquet
+# This script is designed to ingest spatial data on major roads for each year
+# from 2014 to the present, simplify it for efficiency, and store a
+# deduplicated, aggregated version of this data in a warehouse bucket.
+#
+# We take an additive approach here to ensure distance to these roads is
+# consistent from earlier pin-level data. If there are new major roads in 2015
+# data, they will be added to existing 2014 major roads data, and that addition
+# will become our 2015 major roads data. If there are identical osm_id
+# observations between 2014 and 2015, we preserve the data from 2014.
+
+# Instantiate S3 bucket names
+AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
 AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET")
-output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment")
-current_year <- strftime(Sys.Date(), "%Y")
-
-##### Major roads #####
-# Query OpenStreetMap API for major roads in Cook
-remote_file <- file.path(
-  output_bucket, "major_road",
-  paste0("year=", current_year),
-  paste0("major_road-", current_year, ".parquet")
-)
-
-if (!aws.s3::object_exists(remote_file)) {
-  osm_roads <- opq(bbox = "Cook County, IL") %>%
-    add_osm_feature(
-      key = "highway",
-      value = c("motorway", "trunk", "primary")
-    ) %>%
-    osmdata_sf() %>%
-    .$osm_lines %>%
-    select(osm_id, name) %>%
-    st_transform(4326) %>%
-    mutate(geometry_3435 = st_transform(geometry, 3435))
-
-  geoarrow::write_geoparquet(osm_roads, remote_file)
+
+# Set up variables for iteration through years
+current_year <- as.integer(strftime(Sys.Date(), "%Y"))
+years <- 2014:current_year
+master_dataset <- NULL
+
+# Iterate over the years
+for (year in years) {
+  # Ingest path
+  ingest_file <- file.path(
+    AWS_S3_RAW_BUCKET, "spatial",
+    "environment", "major_road",
+    paste0("year=", year),
+    paste0("major_road-", year, ".parquet"))
+
+  # Simplify linestrings
+  current_data <- read_geoparquet_sf(ingest_file) %>%
+    mutate(geometry_3435 = st_simplify(geometry_3435, dTolerance = 10))
+
+  # Initiate master data set with first available year, add column for de-duping
+  if (is.null(master_dataset)) {
+    master_dataset <- current_data %>%
+      mutate(temporal = 0)
+
+    data_to_write <- current_data
+  } else {
+    # Create temporal column to preserve earliest data
+    combined_data <- bind_rows(master_dataset,
+                               current_data %>% mutate(temporal = 1))
+
+    # Arrange by osm_id and temporal, then deduplicate and preserve earlier data
+    data_to_write <- combined_data %>%
+      arrange(osm_id, temporal) %>%
+      group_by(osm_id) %>%
+      slice(1) %>%
+      ungroup() %>%
+      select(-temporal)
+
+    # Reset temporal tag for the next iteration
+    master_dataset <- data_to_write %>%
+      mutate(temporal = 0)
+  }
+
+  # Define the output file path for the data to write
+  output_file <- file.path(
+    AWS_S3_WAREHOUSE_BUCKET, "spatial",
+    "environment", "major_road",
+    paste0("year=", year),
+    paste0("major_road-", year, ".parquet")
+  )
+
+  geoarrow::write_geoparquet(data_to_write, output_file)
+
 }
diff --git a/dbt/models/spatial/docs.md b/dbt/models/spatial/docs.md
index 00dcffd9a..8a8b3b21f 100644
--- a/dbt/models/spatial/docs.md
+++ b/dbt/models/spatial/docs.md
@@ -215,6 +215,11 @@ Major roads sourced from OpenStreetMap (OSM).
 Major roads include any OSM ways tagged with
 `highway/motorway`, `highway/trunk`, or `highway/primary`
 
+This data covers major roads data from 2014 onwards and uses an additive
+approach for data integration. Starting from 2014, each year's data builds
+upon the previous year, with new major road additions being added to the
+existing dataset.
+
 **Geometry:** `MULTILINESTRING`
 
 {% enddocs %}