Skip to content

Commit

Permalink
Update ingest to mark excluded sales
Browse files Browse the repository at this point in the history
  • Loading branch information
dfsnow committed Mar 13, 2024
1 parent b54758c commit 8ab984c
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 3 deletions.
31 changes: 30 additions & 1 deletion pipeline/00-ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,22 @@ land_nbhd_rate_data <- dbGetQuery(
)
tictoc::toc()

# Pull the raw document numbers of all sales from the minimum sale year onward.
# They are used to identify some sales accidentally excluded from the original
# training runs. See https://github.com/ccao-data/data-architecture/pull/334
# for more info. doc_no_old is the document number as stored; doc_no_new is the
# same number with every 'D' removed (NULL if nothing is left)
tictoc::tic("Sales data pulled")
sales_data <- dbGetQuery(
  conn = AWS_ATHENA_CONN_NOCTUA,
  statement = glue("
SELECT DISTINCT
substr(saledt, 1, 4) AS year,
instruno AS doc_no_old,
NULLIF(REPLACE(instruno, 'D', ''), '') AS doc_no_new
FROM iasworld.sales
WHERE substr(saledt, 1, 4) >= '{params$input$min_sale_year}'
")
)
tictoc::toc()

# Disconnect from Athena and remove the connection object from the environment
dbDisconnect(AWS_ATHENA_CONN_NOCTUA)
rm(list = "AWS_ATHENA_CONN_NOCTUA")
Expand Down Expand Up @@ -262,11 +278,24 @@ assessment_data_w_hie <- assessment_data %>%
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Adding time features and cleaning")

# Flag training sales whose raw document number ends in "D" (sv_added_later).
# Per the ingest query, sales_data holds one row per (year, doc_no_old), where
# doc_no_new is the document number with any "D" removed. Each training sale is
# matched to it on (document number, year)
training_data_klg <- training_data_w_hie %>%
  left_join(
    sales_data %>%
      # Drop rows without a usable document number. dplyr joins match NA to NA
      # by default, so these rows would otherwise attach to any training sale
      # whose own document number is missing
      filter(!is.na(doc_no_new)) %>%
      # Deduplicate on the full join key. This still guarantees the join cannot
      # fan out, but no longer discards a document number that legitimately
      # appears in more than one year
      distinct(doc_no_new, year, .keep_all = TRUE),
    by = c("meta_sale_document_num" = "doc_no_new", "year")
  ) %>%
  mutate(
    # endsWith() yields NA for unmatched sales (missing doc_no_old); treat
    # those as not added later
    sv_added_later = replace_na(endsWith(doc_no_old, "D"), FALSE)
  ) %>%
  # The raw document number is only needed to derive the flag
  select(-doc_no_old)


## 5.1. Training Data ----------------------------------------------------------

# Clean up the training data. Goal is to get it into a publishable format.
# Final featurization, missingness, etc. is handled via Tidymodels recipes
training_data_clean <- training_data_w_hie %>%
training_data_clean <- training_data_klg %>%
# Recode factor variables using the definitions stored in ccao::vars_dict
# This will remove any categories not stored in the dictionary and convert
# them to NA (useful since there are a lot of misrecorded variables)
Expand Down
5 changes: 3 additions & 2 deletions pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ sales_data_two_most_recent <- sales_data %>%
distinct(
meta_pin, meta_year,
meta_sale_price, meta_sale_date, meta_sale_document_num,
sv_outlier_type
sv_outlier_type, sv_added_later
) %>%
# Include outliers, since these data are used for desk review and
# not for modeling
Expand All @@ -386,7 +386,8 @@ sales_data_two_most_recent <- sales_data %>%
meta_sale_date,
meta_sale_price,
meta_sale_document_num,
meta_sale_outlier_type
meta_sale_outlier_type,
sv_added_later
),
names_glue = "{mr}_{gsub('meta_sale_', '', .value)}"
) %>%
Expand Down

0 comments on commit 8ab984c

Please sign in to comment.