Skip to content

Commit

Permalink
Simplify major roads feature (#230)
Browse files Browse the repository at this point in the history
* Adjust script for raw to warehouse workflow

* Adjust queries to iterate through years

* Remove unneeded packages

* Remove purrr

* Remove graphing file

* Add newline for lintr

* Add note about 2023 change in dbt docs

* Add additive logic

* Update dbt docs

* Edit line length
  • Loading branch information
wagnerlmichael authored Nov 16, 2023
1 parent 91e90e6 commit 2027c68
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 29 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
library(aws.s3)
library(dplyr)
library(geoarrow)
library(osmdata)
library(sf)
source("utils.R")

# This script queries OpenStreetMap for major roads in Cook County and
# saves them as a spatial parquet, one file per year from 2014 onwards
AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
output_bucket <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment")

# strftime() returns a character string; convert it explicitly so the year
# sequence below is built from integers rather than via implicit coercion
current_year <- as.integer(strftime(Sys.Date(), "%Y"))

##### Major roads #####
# Query the OpenStreetMap API for major roads in Cook County, once per year
# from 2014 to the current year
years <- 2014:current_year

for (year in years) {
  remote_file <- file.path(
    output_bucket, "major_road",
    paste0("year=", year),
    paste0("major_road-", year, ".parquet")
  )

  # Only query years whose file is not already present in the raw bucket
  if (!aws.s3::object_exists(remote_file)) {
    # The datetime argument requests OSM data as of January 1st of the year
    osm_roads <- opq(
      bbox = "Cook County, IL",
      datetime = paste0(year, "-01-01T00:00:00Z"),
      timeout = 900
    ) %>%
      add_osm_feature(
        key = "highway",
        value = c("motorway", "trunk", "primary")
      ) %>%
      osmdata_sf() %>%
      .$osm_lines %>%
      select(osm_id, name) %>%
      st_transform(4326) %>%
      # Keep a second copy of the geometry projected to EPSG:3435
      mutate(geometry_3435 = st_transform(geometry, 3435))

    geoarrow::write_geoparquet(osm_roads, remote_file)
  }
}
Original file line number Diff line number Diff line change
@@ -1,38 +1,75 @@
library(arrow)
library(aws.s3)
library(dplyr)
library(geoarrow)
library(glue)
library(noctua)
library(osmdata)
library(purrr)
library(sf)
source("utils.R")

# This script queries OpenStreetMap for major roads in Cook County and
# saves them as a spatial parquet
# This script is designed to ingest spatial data on major roads for each year
# from 2014 to the present, simplify it for efficiency, and store a
# deduplicated, aggregated version of this data in a warehouse bucket.
#
# We take an additive approach here to ensure distance to these roads is
# consistent with earlier pin-level data. If there are new major roads in the
# 2015 data, they are added to the existing 2014 major roads data, and that
# combined set becomes our 2015 major roads data. If the same osm_id appears
# in both 2014 and 2015, we preserve the data from 2014.

# Instantiate S3 bucket names
# Both names are read from environment variables; Sys.getenv() returns "" for
# a variable that is not set, so they must be defined before running
AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET")
# Common prefix for spatial environment outputs in the warehouse bucket
output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment")
# Four-digit current year, as a character string
current_year <- strftime(Sys.Date(), "%Y")

##### Major roads #####
# Query OpenStreetMap API for major roads in Cook County
# Destination path of the current year's major roads file
remote_file <- file.path(
output_bucket, "major_road",
paste0("year=", current_year),
paste0("major_road-", current_year, ".parquet")
)

# Only run the query when the current year's file does not exist yet
# NOTE(review): no closing brace for this `if` is visible in this view --
# confirm whether this block is leftover from an earlier version of the script
if (!aws.s3::object_exists(remote_file)) {
# Fetch ways tagged highway = motorway/trunk/primary, keep the id and name of
# each line, store geometry in EPSG:4326 and add a copy in EPSG:3435
osm_roads <- opq(bbox = "Cook County, IL") %>%
add_osm_feature(
key = "highway",
value = c("motorway", "trunk", "primary")
) %>%
osmdata_sf() %>%
.$osm_lines %>%
select(osm_id, name) %>%
st_transform(4326) %>%
mutate(geometry_3435 = st_transform(geometry, 3435))

geoarrow::write_geoparquet(osm_roads, remote_file)

# Build the additive, simplified major roads data for every year from 2014 to
# the current year and write each year's result to the warehouse bucket
current_year <- as.integer(strftime(Sys.Date(), "%Y"))
years <- 2014:current_year

# Path of a given year's major roads parquet file inside a bucket
major_road_path <- function(bucket, year) {
  file.path(
    bucket, "spatial",
    "environment", "major_road",
    paste0("year=", year),
    paste0("major_road-", year, ".parquet")
  )
}

# Stack previously accumulated roads on top of newly read roads and keep one
# row per osm_id. Accumulated rows are tagged 0 and new rows 1, so sorting on
# the tag makes the accumulated (earlier) row win whenever an id is in both.
keep_earliest <- function(accumulated_roads, incoming_roads) {
  bind_rows(
    accumulated_roads %>% mutate(temporal = 0),
    incoming_roads %>% mutate(temporal = 1)
  ) %>%
    arrange(osm_id, temporal) %>%
    group_by(osm_id) %>%
    slice(1) %>%
    ungroup() %>%
    select(-temporal)
}

# Running set of every road written so far; NULL until the first year is read
master_dataset <- NULL

for (year in years) {
  # Read this year's raw file and simplify its EPSG:3435 linestrings
  simplified_roads <- major_road_path(AWS_S3_RAW_BUCKET, year) %>%
    read_geoparquet_sf() %>%
    mutate(geometry_3435 = st_simplify(geometry_3435, dTolerance = 10))

  # The first available year is written as-is; every later year is merged
  # into what has been accumulated so far
  if (is.null(master_dataset)) {
    data_to_write <- simplified_roads
  } else {
    data_to_write <- keep_earliest(master_dataset, simplified_roads)
  }

  # What was written for this year becomes the base for the next year
  master_dataset <- data_to_write

  geoarrow::write_geoparquet(
    data_to_write,
    major_road_path(AWS_S3_WAREHOUSE_BUCKET, year)
  )
}
5 changes: 5 additions & 0 deletions dbt/models/spatial/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ Major roads sourced from OpenStreetMap (OSM).
Major roads include any OSM ways tagged with
`highway/motorway`, `highway/trunk`, or `highway/primary`

This dataset covers major roads from 2014 onwards and is built additively.
Starting from 2014, each year's data builds upon the previous year's: newly
added major roads are appended to the existing dataset, and roads already
present keep their earlier record.

**Geometry:** `MULTILINESTRING`
{% enddocs %}

Expand Down

0 comments on commit 2027c68

Please sign in to comment.