Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify major roads feature #230

Merged
merged 11 commits into from
Nov 16, 2023
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
library(aws.s3)
library(dplyr)
library(geoarrow)
library(osmdata)
library(sf)
source("utils.R")

# This script queries OpenStreetMap for major roads in Cook County and
# saves one spatial parquet file per year to the raw S3 bucket
AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
output_bucket <- file.path(AWS_S3_RAW_BUCKET, "spatial", "environment")

# Parse the year as an integer so the sequence below does not depend on the
# implicit character-to-numeric coercion performed by `:`
current_year <- as.integer(strftime(Sys.Date(), "%Y"))

##### Major roads #####
# Query OpenStreetMap API for major roads in Cook, once per year from 2014
# to the current year
years <- 2014L:current_year

for (year in years) {
  remote_file <- file.path(
    output_bucket, "major_road",
    paste0("year=", year),
    paste0("major_road-", year, ".parquet")
  )

  # Skip years whose file has already been written
  if (!aws.s3::object_exists(remote_file)) {
    # The datetime argument is set to January 1st of the year being processed;
    # the timeout is raised to 900 for these queries
    osm_roads <- opq(
      bbox = "Cook County, IL",
      datetime = paste0(year, "-01-01T00:00:00Z"),
      timeout = 900
    ) %>%
      add_osm_feature(
        key = "highway",
        value = c("motorway", "trunk", "primary")
      ) %>%
      osmdata_sf() %>%
      .$osm_lines %>%
      select(osm_id, name) %>%
      st_transform(4326) %>%
      # Carry a second geometry column projected to EPSG:3435
      mutate(geometry_3435 = st_transform(geometry, 3435))

    geoarrow::write_geoparquet(osm_roads, remote_file)
  }
}
Original file line number Diff line number Diff line change
@@ -1,38 +1,75 @@
library(arrow)
library(aws.s3)
library(dplyr)
library(geoarrow)
library(glue)
library(noctua)
library(osmdata)
library(purrr)
library(sf)
source("utils.R")

# This script queries OpenStreetMap for major roads in Cook County and
# saves them as a spatial parquet
# This script is designed to ingest spatial data on major roads for each year
# from 2014 to the present, simplify it for efficiency, and store a
# deduplicated, aggregated version of this data in a warehouse bucket.
#
# We take an additive approach here to ensure that distances to these roads
# stay consistent with earlier pin-level data. If there are new major roads in
# the 2015 data, they are added to the existing 2014 major roads data, and that
# union becomes our 2015 major roads data. If the same osm_id appears in both
# 2014 and 2015, we preserve the data from 2014.

# Instantiate S3 bucket names
AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET")
AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET")
# Prefix under the warehouse bucket for spatial environment files
output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "spatial", "environment")
# Current calendar year formatted as a four-digit string
current_year <- strftime(Sys.Date(), "%Y")

##### Major roads #####
# Query OpenStreetMap API for major roads in Cook
# Path of the current year's major roads parquet file under output_bucket
remote_file <- file.path(
output_bucket, "major_road",
paste0("year=", current_year),
paste0("major_road-", current_year, ".parquet")
)

# Only query OpenStreetMap when the current year's file does not exist yet
# NOTE(review): the closing brace of this `if` is not visible in this
# excerpt -- confirm against the full file
if (!aws.s3::object_exists(remote_file)) {
# Keep only ways tagged highway = motorway, trunk, or primary, then retain
# the osm_id and name columns of the line features
osm_roads <- opq(bbox = "Cook County, IL") %>%
add_osm_feature(
key = "highway",
value = c("motorway", "trunk", "primary")
) %>%
osmdata_sf() %>%
.$osm_lines %>%
select(osm_id, name) %>%
st_transform(4326) %>%
mutate(geometry_3435 = st_transform(geometry, 3435))

# Write the result to the path computed above
geoarrow::write_geoparquet(osm_roads, remote_file)

# Year range to process: 2014 through the current calendar year
current_year <- as.integer(strftime(Sys.Date(), "%Y"))
years <- 2014:current_year

# Running, de-duplicated set of roads carried forward from earlier years;
# stays NULL until the first year has been read
master_dataset <- NULL

# Build the path of a year's major roads parquet file under the given bucket
major_road_path <- function(bucket, year) {
  file.path(
    bucket, "spatial",
    "environment", "major_road",
    paste0("year=", year),
    paste0("major_road-", year, ".parquet")
  )
}

for (year in years) {
  # Read this year's raw file and simplify the geometry_3435 linestrings
  year_roads <- read_geoparquet_sf(major_road_path(AWS_S3_RAW_BUCKET, year))
  year_roads <- mutate(
    year_roads,
    geometry_3435 = st_simplify(geometry_3435, dTolerance = 10)
  )

  if (!is.null(master_dataset)) {
    # Rows carried forward are tagged temporal = 0 and this year's rows
    # temporal = 1, so sorting by (osm_id, temporal) and keeping the first
    # row per osm_id preserves the earlier record
    stacked <- bind_rows(master_dataset, mutate(year_roads, temporal = 1))

    data_to_write <- stacked %>%
      arrange(osm_id, temporal) %>%
      group_by(osm_id) %>%
      slice(1) %>%
      ungroup() %>%
      select(-temporal)
  } else {
    # First available year: nothing to merge against yet
    data_to_write <- year_roads
  }

  # Whatever was produced for this year becomes the baseline for the next one
  master_dataset <- mutate(data_to_write, temporal = 0)

  geoarrow::write_geoparquet(
    data_to_write,
    major_road_path(AWS_S3_WAREHOUSE_BUCKET, year)
  )
}
5 changes: 5 additions & 0 deletions dbt/models/spatial/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ Major roads sourced from OpenStreetMap (OSM).
Major roads include any OSM ways tagged with
`highway/motorway`, `highway/trunk`, or `highway/primary`

This dataset covers major roads from 2014 onwards and is built using an
additive approach. Starting from 2014, each year's data builds upon the
previous year's: newly added major roads are appended to the existing
dataset, while roads already present keep their earlier record.

**Geometry:** `MULTILINESTRING`
{% enddocs %}

Expand Down