From 4b34b467426f997543b49dba5aef0cf359a6d0a5 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Thu, 14 Jan 2021 18:05:00 -0500 Subject: [PATCH 01/18] incremental changes to several files, esp. utils --- R/analyze.R | 5 +- R/functions.R | 1 - R/process.R | 19 ++- R/utils.R | 328 +++++++++++++++++++++++++++++-------------- config.R | 107 +++----------- reports/notebook.Rmd | 19 +-- run.R | 13 +- 7 files changed, 270 insertions(+), 222 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index 7a4f070..5bb191d 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -1,8 +1,7 @@ ############################################################ # This file handles the primary analysis using the tidied -# data as input. Should never read from `dir_data_raw`, -# only `dir_data_processed`. -# +# data as input. Should never read from `dir_data_raw()`, +# only `dir_data_processed()`. # ############################################################ diff --git a/R/functions.R b/R/functions.R index fb8ebd1..5705d88 100644 --- a/R/functions.R +++ b/R/functions.R @@ -1,5 +1,4 @@ ############################################################ # Project-specific functions. # -# ############################################################ diff --git a/R/process.R b/R/process.R index 3a3aee1..5a7864a 100644 --- a/R/process.R +++ b/R/process.R @@ -1,15 +1,16 @@ ############################################################ -# This file is used to read in raw data, tidy, clean it, -# and save a file to src *before* proceeding to analysis. -# If you're using `mutate()` for any actual analysis you're -# doing it wrong. -# -# Specify column types as required. +# This file is used to read in raw data, clean it, and save +# a file to `dir_data_processed()` *before* proceeding to +# analysis. If this file is run from run.R, all variables +# created by this step will be erased after the step is +# complete to keep a clean working environment. 
Tip: If your +# analysis is complicated enough that you need to break the +# processing out into multiple files, simply source them +# from this file by calling something like +# `source(dir_src('process_files', 'process_step_1.R'))` # ############################################################ -# begin_processing() - # sample.raw <- read_csv(sample.raw.file) %>% # rename( # cma = 'CMA', @@ -24,5 +25,3 @@ # arrange(cma, desc(date)) # # write_feather(sample.raw, here::here(dir_data_processed, 'sample.feather')) -# -# end_processing() diff --git a/R/utils.R b/R/utils.R index c64f0e0..fe9c9ab 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,71 +1,169 @@ -# Load required packages -load_requirements <- function(pkg){ - new.pkg <- pkg[!(pkg %in% installed.packages()[, 'Package'])] - if (length(new.pkg)) - install.packages(new.pkg, dependencies = TRUE) - sapply(pkg, require, character.only = TRUE) -} - -# Run the gauntlet of basic exploratory data analysis on your data -run_basic_eda <- function(data){ - glimpse(data) - df_status(data) - freq(data) - profiling_num(data) - plot_num(data) - describe(data) -} - -read_all_excel_sheets <- function( - filepath, - range = NULL, - col_types = NULL, - col_names = TRUE, - na = '', - trim_ws = TRUE, - skip = 0, - n_max = Inf, - guess_max = min(1000, n_max), - .name_repair = 'unique' +# PACKAGES THIS REQUIRES + +'here' +'librarian' +'tidyverse' +'openxlsx' +'feather' +'knitr' +'beepr' +'ggthemes' +'clipr' + +initialize_startr <- function( + scipen = 999, + timezone = 'America/Toronto', + should_render_notebook = FALSE, + should_process_data = TRUE, + should_timestamp_output_files = FALSE, + should_clean_processing_variables = TRUE, + should_beep = TRUE, + packages = c() ) { + + if (scipen) options(scipen = scipen) + if (timezone) Sys.setenv(TZ = timezone) + + assign('should_render_notebook', should_render_notebook, envir = .GlobalEnv) + assign('should_process_data', should_process_data, envir = .GlobalEnv) + 
assign('should_timestamp_output_files', should_timestamp_output_files, envir = .GlobalEnv) + assign('should_clean_processing_variables', should_clean_processing_variables, envir = .GlobalEnv) + assign('should_beep', should_beep, envir = .GlobalEnv) + + # DO LIBRARIAN STUFF HERE + # load_requirements(packages) + ggthemes::theme_set(theme_minimal()) + + knitr::opts_chunk$set( + eval = TRUE, + echo = FALSE, + message = FALSE, + cache = FALSE, + warning = FALSE, + error = FALSE, + comment = '#', + tidy = FALSE, + collapse = TRUE, + results = 'asis', + fig.width = 12, + dpi = 150, + root.dir = here::here() + ) + + if ('cansim' %in% packages) { + options(cansim.cache_path = dir_data_cache()) + } + + if ('cancensus' %in% packages) { + options( + # CANCENSUS_API should be set in your home directory's + # .Renviron file, and will get pulled down from there + cancensus.api_key = Sys.getenv(c('CANCENSUS_API')), + cancensus.cache_path = dir_data_cache(), + ) + } + +} + +dir_constructor <- function(path, ...) { + here::here(path, ...) +} + +dir_src <- function(...) { + dir_constructor('R', ...) +} + +dir_data_raw <- function(...) { + dir_constructor('data/raw', ...) +} + +dir_data_cache <- function(...) { + dir_constructor('data/cache', ...) +} + +dir_data_processed <- function(...) { + dir_constructor('data/processed', ...) +} + +dir_data_out <- function(...) { + dir_constructor('data/out', ...) +} + +dir_reports <- function(...) { + dir_constructor('reports', ...) +} + +dir_plots <- function(...) { + dir_constructor('plots', ...) 
+} + +run_config <- function() { + source(here::here('config.R')) + source(dir_src('functions.R')) +} + +run_process <- function() { + if (should_process_data) { + begin_processing(clean = clean_processing_variables) + source(dir_src('process.R')) + end_processing(should_beep = should_beep) + } +} + +run_analyze <- function() { + source(dir_src('analyze.R')) +} + +run_visualize <- function() { + source(dir_src('visualize.R')) +} + +run_render_notebook <- function(path = r_notebook.file, should_beep = should_beep) { + if (should_render_notebook) render_notebook(path) + if (should_beep) beep() +} + +# load_requirements <- function(pkg){ +# new.pkg <- pkg[!(pkg %in% installed.packages()[, 'Package'])] +# if (length(new.pkg)) +# install.packages(new.pkg, dependencies = TRUE) +# sapply(pkg, require, character.only = TRUE) +# } + +# Contributed by Andy Lin of News Nerdery Slack +combine_csvs <- function(dir, ...) { + list.files(dir, pattern = '*.csv', full.names = T) %>% + map_dfr(function(x) { + read_csv(path = filepath, ...) + }, .id = 'filename') +} + +read_all_excel_sheets <- function(filepath, ...) 
{ filepath %>% excel_sheets() %>% set_names() %>% - map_df(~ read_excel( - path = filepath, - skip = skip, - range = range, - na = na, - trim_ws = trim_ws, - guess_max = guess_max, - col_names = col_names, - col_types = col_types, - n_max = n_max, - sheet = .x, - .name_repair = .name_repair - ), .id = 'sheet') -} - -# geocoding function using OSM Nominatim API -# details: http://wiki.openstreetmap.org/wiki/Nominatim -# made by: D.Kisler -nominatim_osm <- function(address = NULL) { - if(suppressWarnings(is.null(address))) - return(data.frame()) - tryCatch( - d <- jsonlite::fromJSON( - gsub('\\@addr\\@', gsub('\\s+', '\\%20', address), - 'http://nominatim.openstreetmap.org/search/@addr@?format=json&addressdetails=0&limit=1') - ), error = function(c) return(data.frame()) - ) - if(length(d) == 0) return(data.frame()) - return(data.frame(lon = as.numeric(d$lon), lat = as.numeric(d$lat))) + map_df(function(x) { + read_excel(path = filepath, sheet = x, ...) + }, .id = 'sheet') +} + +combine_excels <- function(dir, all_sheets = FALSE, ...) { + read_excel_constructor <- function(x) { + if (all_sheets) { + read_all_excel_sheets(path = filepath, ...) + } else { + read_excel(path = filepath, ...) + } + } + + list.files(dir, pattern = '.xls[x]?', full.names = T) %>% + map_dfr(read_excel_constructor, .id = 'filename') } render_notebook <- function(notebook_file) { rmarkdown::render( notebook_file, - output_dir = dir_reports, + output_dir = dir_reports(), encoding = 'utf-8' ) } @@ -83,21 +181,59 @@ unaccent <- function(x) { iconv(x, to = 'ASCII//TRANSLIT') } -simplify_string <- function(x, alpha = TRUE, digits = FALSE) { - re <- '^\\s' +remove_non_utf8 <- function(x) { + iconv(x, to = 'UTF-8', sub = '') +} - if (alpha) re <- paste(re, 'a-zA-Z', sep = '') - if (digits) re <- paste(re, '0-9', sep = '') +`%not_in%` <- purrr::negate(`%in%`) - # TODO: add corporate stop words like INC, LTD, CORP? +not.na <- purrr::negate(is.na) - x %>% - unaccent(.) 
%>% - str_replace_all(., paste('[', re, ']', sep = ''), '') %>% - str_replace_all(., '[\\s]+', ' ') %>% - toupper(.) %>% - trimws(.) -} +simplify_string <- function( + x, + alpha = TRUE, + digits = FALSE, + unaccent = TRUE, + utf8_only = TRUE, + uppercase = TRUE, + trim = TRUE, + stopwords = NA + ) { + + x_temp <- x + + if (unaccent) { + x_temp <- unaccent(x_temp) + } + + if (utf8_only) { + x_temp <- remove_non_utf8(x_temp) + } + + if (uppercase) { + x_temp <- str_to_upper(x_temp) + } + + if (alpha | digits) { + re <- '^\\s' + if (alpha) re <- paste(re, 'a-zA-Z', sep = '') + if (digits) re <- paste(re, '0-9', sep = '') + x_temp <- str_replace_all(x_temp, paste('[', re, ']', sep = ''), '') + } + + if (!any(is.na(stopwords))) { + if (uppercase) stopwords <- str_to_upper(stopwords) + stopwords_regex <- paste0('\\b', paste(stopwords, collapse = '\\b|\\b'), '\\b') + x_temp <- str_replace_all(x_temp, stopwords_regex, '') + } + + if (trim) { + x_temp <- str_squish(x_temp) + } + + return(x_temp) + + } clean_columns <- function(x) { cols <- x %>% @@ -105,8 +241,8 @@ clean_columns <- function(x) { str_replace_all(., '[\\s]+', '_') %>% str_replace_all(., '[_]+', '_') %>% str_replace_all(., '[^_a-zA-Z]', '') %>% - tolower(.) %>% - trimws(.) + str_to_lower(.) %>% + str_squish(.) for (i in 1:length(cols)) { if (!as.logical(str_count(cols[i]))) { @@ -120,64 +256,52 @@ clean_columns <- function(x) { return(cols) } -convert_str_to_logical <- function(x, truthy = 'T|TRUE', falsy = 'F|FALSE') { +convert_str_to_logical <- function(x, truthy = 'T|TRUE|Y|YES', falsy = 'F|FALSE|N|NO') { x %>% - toupper(.) %>% - trimws(.) %>% + str_to_upper(.) %>% + str_squish(.) %>% str_replace_all(., truthy, 'TRUE') %>% str_replace_all(., falsy, 'FALSE') %>% as.logical(.) 
} -write_excel <- function(variable, timestamp = timestamp_output_files) { +write_excel <- function(variable, should_timestamp_output_files = should_timestamp_output_files) { filename <- deparse(substitute(variable)) - if (timestamp) { + if (should_timestamp_output_files) { now <- Sys.time() filename <- glue('{filename}_{format(now, "%Y%m%d%H%M%S")}') } - write.xlsx(variable, file = here::here(dir_data_out, glue('{filename}.xlsx'))) + write.xlsx(variable, file = dir_data_out(glue('{filename}.xlsx'))) } -begin_processing <- function() { - if (clean_processing_variables) { +begin_processing <- function(should_clean_processing_variables = should_clean_processing_variables) { + if (should_clean_processing_variables) { assign('curr_env', ls(.GlobalEnv), envir = .GlobalEnv) } } -end_processing <- function() { - if (clean_processing_variables) { +end_processing <- function(should_clean_processing_variables = should_clean_processing_variables, should_beep = should_beep) { + if (should_clean_processing_variables) { ls(.GlobalEnv) %>% setdiff(., curr_env) %>% as.character() %>% rm(list = ., envir = .GlobalEnv) } - beep() + if (should_beep) beep() } -write_plot <- function(variable, filename = NA, width = NA, height = NA, format = NA, units = NA, dpi = NA, limitsize = NA) { - default_format <- 'png' - default_units <- 'in' - default_dpi <- 300 - default_filename <- deparse(substitute(variable)) - default_limitsize <- TRUE +write_plot <- function(variable, filename = NA, format = 'png', ...) 
{ - if(!is.na(format)) default_format <- format - if(!is.na(units)) default_units <- units - if(!is.na(dpi)) default_dpi <- dpi - if(!is.na(filename)) default_filename <- filename - if(!is.na(limitsize)) default_limitsize <- limitsize + if (is.na(filename)) filename <- deparse(substitute(variable)) args <- list( plot = variable, - file = here::here(dir_plots, glue('{default_filename}.{default_format}')), - units = default_units, - dpi = default_dpi, - width = width, - height = height, - limitsize = default_limitsize + format = format, + file = dir_plots(glue('{filename}.{format}')), + ... ) - if (default_format == 'pdf') args[['useDingbats']] <- FALSE + if (format == 'pdf') args[['useDingbats']] <- FALSE do.call(ggsave, args) } diff --git a/config.R b/config.R index 817bc1f..4f82b21 100644 --- a/config.R +++ b/config.R @@ -1,93 +1,28 @@ ############################################################ -# This file sets the config for the project including -# specifying packages to load and global variables. +# This file configures the project by specifying filenames, +# loading packages and setting up some project-specific +# variables. # ############################################################ -options(scipen = 999) -Sys.setenv(TZ = 'America/Toronto') - -# Project-specific -config_author <- 'Firstname Lastname ' -config_title <- 'startr' - -# Directories to read from and write to -dir_data <- 'data' -dir_src <- 'R' -dir_data_raw <- 'data/raw' -dir_data_cache <- 'data/cache' -dir_data_processed <- 'data/processed' -dir_data_out <- 'data/out' -dir_reports <- 'reports' -dir_plots <- 'plots' - -# Files: You'll want to edit this to add your source data file names -sample.raw.file <- here::here(dir_data_raw, 'sample.csv') - -# Primary and supplemental notebooks. 
-# Set should_render_notebook to TRUE if using notebooks -r_notebook <- here::here(dir_reports, 'notebook.Rmd') - -# startr-specific configuration, consumed by helper functions -# Should a notebook be rendered in run.R? -should_render_notebook <- FALSE - -# Should the processing step be run in run.R? -should_process_data <- TRUE - -# Should files written with write_excel have a timestamp in the filename? -timestamp_output_files <- FALSE - -# Should the variables created during process.R be cleaned up after processing? -clean_processing_variables <- TRUE - -packages <- c( - # essentials - 'here', 'devtools', 'tidyverse', - # manipulation - 'lubridate', 'janitor', 'zoo', 'glue', 'clipr', - # modelling - 'tidymodels', - # Read/write files - 'readxl', 'openxlsx', 'feather', - # visualization - 'scales', 'ggthemes', 'gganimate', - # scraping - 'rvest', - # GIS - 'sf', - # RMarkdown - 'knitr', 'ezknitr', 'kableExtra', 'DT', - # other stuff - # 'cansim', 'cancensus', - 'beepr' +# This initializes your startr project +startr_config <- c( + author = 'Firstname Lastname ', + title = 'startr', + timezone = 'America/Toronto', + should_render_notebook = FALSE, + should_process_data = TRUE, + should_timestamp_output_files = FALSE, + packages = c( + 'janitor', 'zoo', + 'tidymodels', + 'scales', 'gganimate', + 'sf', + 'cansim', 'cancensus', + ) ) -source(here::here(dir_src, 'utils.R')) -source(here::here(dir_src, 'functions.R')) - -load_requirements(packages) +initialize_startr(startr_config) -options( - # CANCENSUS_API should be set in your home directory's .Renviron file, - # and will get pulled down from there - cancensus.api_key = Sys.getenv(c('CANCENSUS_API')), - cancensus.cache_path = here::here(dir_data_cache), - cansim.cache_path = here::here(dir_data_cache) -) - -knitr::opts_chunk$set( - eval = TRUE, - echo = FALSE, - message = FALSE, - cache = FALSE, - warning = FALSE, - error = FALSE, - comment = '#', - tidy = FALSE, - collapse = TRUE, - results = 'asis', - fig.width = 
12, - dpi = 150, - root.dir = here::here() -) +# Filenames: Refer to your source data filenames here +sample.raw.file <- dir_data_raw('sample.csv') diff --git a/reports/notebook.Rmd b/reports/notebook.Rmd index 200baaa..b5c18bf 100644 --- a/reports/notebook.Rmd +++ b/reports/notebook.Rmd @@ -1,7 +1,7 @@ --- -title: '`r config_title`' +title: '`r startr_config['config_title']`' date: '`r format(Sys.Date(), "%B %d, %Y")`' -author: '`r config_author`' +author: '`r startr_config[['config_author']`' output: html_notebook: code_folding: hide @@ -13,17 +13,10 @@ output: toc_float: yes --- -## Config -```{r config, echo=FALSE, message=FALSE, warning=FALSE, results='hide'} +## First heading -if (!require('devtools')) install.packages('devtools'); library('devtools') -if (!require('here')) install.packages('here'); library('here') - -source(here::here('config.R')) -if (should_process_data) source(here::here(dir_src, 'process.R')) -source(here::here(dir_src, 'analyze.R')) -source(here::here(dir_src, 'visualize.R')) - -options(warn = 1, width = 200) +Text goes here. 
+```{r} +# R code goes here ``` diff --git a/run.R b/run.R index 53b74be..cbf9678 100644 --- a/run.R +++ b/run.R @@ -1,8 +1,7 @@ -if (!require('here')) install.packages('here'); library('here') +if (!require('startr-utils')) install.packages('startr-utils'); library('startr-utils') -source(here::here('config.R')) -if (should_process_data) { source(here::here(dir_src, 'process.R')) } -source(here::here(dir_src, 'analyze.R')) -source(here::here(dir_src, 'visualize.R')) - -if (should_render_notebook) { render_notebook(r_notebook) } +run_config() +run_process() +run_analyze() +run_visualize() +run_render_notebook() From 2b31e817cf1722c78e6a505b0e3c5ea3bc80f21d Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Fri, 15 Jan 2021 19:00:26 -0500 Subject: [PATCH 02/18] cleaning up --- R/utils.R | 332 ------------------------------------------- R/visualize.R | 2 +- config.R | 11 +- reports/notebook.Rmd | 4 +- 4 files changed, 9 insertions(+), 340 deletions(-) delete mode 100644 R/utils.R diff --git a/R/utils.R b/R/utils.R deleted file mode 100644 index fe9c9ab..0000000 --- a/R/utils.R +++ /dev/null @@ -1,332 +0,0 @@ -# PACKAGES THIS REQUIRES - -'here' -'librarian' -'tidyverse' -'openxlsx' -'feather' -'knitr' -'beepr' -'ggthemes' -'clipr' - -initialize_startr <- function( - scipen = 999, - timezone = 'America/Toronto', - should_render_notebook = FALSE, - should_process_data = TRUE, - should_timestamp_output_files = FALSE, - should_clean_processing_variables = TRUE, - should_beep = TRUE, - packages = c() - ) { - - if (scipen) options(scipen = scipen) - if (timezone) Sys.setenv(TZ = timezone) - - assign('should_render_notebook', should_render_notebook, envir = .GlobalEnv) - assign('should_process_data', should_process_data, envir = .GlobalEnv) - assign('should_timestamp_output_files', should_timestamp_output_files, envir = .GlobalEnv) - assign('should_clean_processing_variables', should_clean_processing_variables, envir = .GlobalEnv) - assign('should_beep', should_beep, envir = 
.GlobalEnv) - - # DO LIBRARIAN STUFF HERE - # load_requirements(packages) - ggthemes::theme_set(theme_minimal()) - - knitr::opts_chunk$set( - eval = TRUE, - echo = FALSE, - message = FALSE, - cache = FALSE, - warning = FALSE, - error = FALSE, - comment = '#', - tidy = FALSE, - collapse = TRUE, - results = 'asis', - fig.width = 12, - dpi = 150, - root.dir = here::here() - ) - - if ('cansim' %in% packages) { - options(cansim.cache_path = dir_data_cache()) - } - - if ('cancensus' %in% packages) { - options( - # CANCENSUS_API should be set in your home directory's - # .Renviron file, and will get pulled down from there - cancensus.api_key = Sys.getenv(c('CANCENSUS_API')), - cancensus.cache_path = dir_data_cache(), - ) - } - -} - -dir_constructor <- function(path, ...) { - here::here(path, ...) -} - -dir_src <- function(...) { - dir_constructor('R', ...) -} - -dir_data_raw <- function(...) { - dir_constructor('data/raw', ...) -} - -dir_data_cache <- function(...) { - dir_constructor('data/cache', ...) -} - -dir_data_processed <- function(...) { - dir_constructor('data/processed', ...) -} - -dir_data_out <- function(...) { - dir_constructor('data/out', ...) -} - -dir_reports <- function(...) { - dir_constructor('reports', ...) -} - -dir_plots <- function(...) { - dir_constructor('plots', ...) 
-} - -run_config <- function() { - source(here::here('config.R')) - source(dir_src('functions.R')) -} - -run_process <- function() { - if (should_process_data) { - begin_processing(clean = clean_processing_variables) - source(dir_src('process.R')) - end_processing(should_beep = should_beep) - } -} - -run_analyze <- function() { - source(dir_src('analyze.R')) -} - -run_visualize <- function() { - source(dir_src('visualize.R')) -} - -run_render_notebook <- function(path = r_notebook.file, should_beep = should_beep) { - if (should_render_notebook) render_notebook(path) - if (should_beep) beep() -} - -# load_requirements <- function(pkg){ -# new.pkg <- pkg[!(pkg %in% installed.packages()[, 'Package'])] -# if (length(new.pkg)) -# install.packages(new.pkg, dependencies = TRUE) -# sapply(pkg, require, character.only = TRUE) -# } - -# Contributed by Andy Lin of News Nerdery Slack -combine_csvs <- function(dir, ...) { - list.files(dir, pattern = '*.csv', full.names = T) %>% - map_dfr(function(x) { - read_csv(path = filepath, ...) - }, .id = 'filename') -} - -read_all_excel_sheets <- function(filepath, ...) { - filepath %>% - excel_sheets() %>% - set_names() %>% - map_df(function(x) { - read_excel(path = filepath, sheet = x, ...) - }, .id = 'sheet') -} - -combine_excels <- function(dir, all_sheets = FALSE, ...) { - read_excel_constructor <- function(x) { - if (all_sheets) { - read_all_excel_sheets(path = filepath, ...) - } else { - read_excel(path = filepath, ...) 
- } - } - - list.files(dir, pattern = '.xls[x]?', full.names = T) %>% - map_dfr(read_excel_constructor, .id = 'filename') -} - -render_notebook <- function(notebook_file) { - rmarkdown::render( - notebook_file, - output_dir = dir_reports(), - encoding = 'utf-8' - ) -} - -index <- function(m) { - (m - first(m)) / first(m) -} - -mode <- function(x) { - ux <- unique(x) - ux[which.max(tabulate(match(x, ux)))] -} - -unaccent <- function(x) { - iconv(x, to = 'ASCII//TRANSLIT') -} - -remove_non_utf8 <- function(x) { - iconv(x, to = 'UTF-8', sub = '') -} - -`%not_in%` <- purrr::negate(`%in%`) - -not.na <- purrr::negate(is.na) - -simplify_string <- function( - x, - alpha = TRUE, - digits = FALSE, - unaccent = TRUE, - utf8_only = TRUE, - uppercase = TRUE, - trim = TRUE, - stopwords = NA - ) { - - x_temp <- x - - if (unaccent) { - x_temp <- unaccent(x_temp) - } - - if (utf8_only) { - x_temp <- remove_non_utf8(x_temp) - } - - if (uppercase) { - x_temp <- str_to_upper(x_temp) - } - - if (alpha | digits) { - re <- '^\\s' - if (alpha) re <- paste(re, 'a-zA-Z', sep = '') - if (digits) re <- paste(re, '0-9', sep = '') - x_temp <- str_replace_all(x_temp, paste('[', re, ']', sep = ''), '') - } - - if (!any(is.na(stopwords))) { - if (uppercase) stopwords <- str_to_upper(stopwords) - stopwords_regex <- paste0('\\b', paste(stopwords, collapse = '\\b|\\b'), '\\b') - x_temp <- str_replace_all(x_temp, stopwords_regex, '') - } - - if (trim) { - x_temp <- str_squish(x_temp) - } - - return(x_temp) - - } - -clean_columns <- function(x) { - cols <- x %>% - unaccent(.) %>% - str_replace_all(., '[\\s]+', '_') %>% - str_replace_all(., '[_]+', '_') %>% - str_replace_all(., '[^_a-zA-Z]', '') %>% - str_to_lower(.) %>% - str_squish(.) 
- - for (i in 1:length(cols)) { - if (!as.logical(str_count(cols[i]))) { - cols[i] <- glue('column_{i}') - } - if (any(cols[1:i - 1] == cols[i])) { - cols[i] <- glue('{cols[i]}_{i}') - } - } - - return(cols) -} - -convert_str_to_logical <- function(x, truthy = 'T|TRUE|Y|YES', falsy = 'F|FALSE|N|NO') { - x %>% - str_to_upper(.) %>% - str_squish(.) %>% - str_replace_all(., truthy, 'TRUE') %>% - str_replace_all(., falsy, 'FALSE') %>% - as.logical(.) -} - -write_excel <- function(variable, should_timestamp_output_files = should_timestamp_output_files) { - filename <- deparse(substitute(variable)) - if (should_timestamp_output_files) { - now <- Sys.time() - filename <- glue('{filename}_{format(now, "%Y%m%d%H%M%S")}') - } - write.xlsx(variable, file = dir_data_out(glue('{filename}.xlsx'))) -} - -begin_processing <- function(should_clean_processing_variables = should_clean_processing_variables) { - if (should_clean_processing_variables) { - assign('curr_env', ls(.GlobalEnv), envir = .GlobalEnv) - } -} - -end_processing <- function(should_clean_processing_variables = should_clean_processing_variables, should_beep = should_beep) { - if (should_clean_processing_variables) { - ls(.GlobalEnv) %>% - setdiff(., curr_env) %>% - as.character() %>% - rm(list = ., envir = .GlobalEnv) - } - if (should_beep) beep() -} - -write_plot <- function(variable, filename = NA, format = 'png', ...) { - - if (is.na(filename)) filename <- deparse(substitute(variable)) - - args <- list( - plot = variable, - format = format, - file = dir_plots(glue('{filename}.{format}')), - ... - ) - - if (format == 'pdf') args[['useDingbats']] <- FALSE - - do.call(ggsave, args) -} - -write_shp <- function(shp, path) { - if (file.exists(path)) { - file.remove(path) - } - - st_write(shp, path, update = TRUE) -} - -# FROM https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R - -reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) 
{ - new_x <- paste(x, within, sep = sep) - stats::reorder(new_x, by, FUN = fun) -} - -scale_x_reordered <- function(..., sep = "___") { - reg <- paste0(sep, ".+$") - ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...) -} - -scale_y_reordered <- function(..., sep = "___") { - reg <- paste0(sep, ".+$") - ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...) -} diff --git a/R/visualize.R b/R/visualize.R index 25978af..9aa4df0 100644 --- a/R/visualize.R +++ b/R/visualize.R @@ -21,6 +21,6 @@ # ) + # theme_classic() # -# plot(plot_house_price_change) +# plot_house_price_change # # write_plot(plot_house_price_change) diff --git a/config.R b/config.R index 4f82b21..7ffd35b 100644 --- a/config.R +++ b/config.R @@ -14,11 +14,12 @@ startr_config <- c( should_process_data = TRUE, should_timestamp_output_files = FALSE, packages = c( - 'janitor', 'zoo', - 'tidymodels', - 'scales', 'gganimate', - 'sf', - 'cansim', 'cancensus', + 'tidyverse' + # 'janitor', 'zoo', + # 'tidymodels', + # 'scales', 'gganimate', + # 'sf', + # 'cansim', 'cancensus', ) ) diff --git a/reports/notebook.Rmd b/reports/notebook.Rmd index b5c18bf..6ceff70 100644 --- a/reports/notebook.Rmd +++ b/reports/notebook.Rmd @@ -1,7 +1,7 @@ --- -title: '`r startr_config['config_title']`' +title: '`r startr_config[['config_title']]`' date: '`r format(Sys.Date(), "%B %d, %Y")`' -author: '`r startr_config[['config_author']`' +author: '`r startr_config[['config_author']]`' output: html_notebook: code_folding: hide From 2867876ab23caabd453d41be54333299c0326f0d Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Sat, 16 Jan 2021 01:42:42 -0500 Subject: [PATCH 03/18] cleaning up --- R/analyze.R | 2 +- R/process.R | 2 +- R/visualize.R | 6 +++--- config.R | 7 +++---- reports/notebook.Rmd | 4 ++-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index 5bb191d..b52f798 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -5,7 +5,7 @@ # 
############################################################ -# sample <- read_feather(here::here(dir_data_processed, 'sample.feather')) %>% +# sample <- read_feather(dir_data_processed('sample.feather')) %>% # group_by(cma) %>% # arrange(desc(date)) %>% # mutate(sale_avg_3mo = rollmean(sale_avg, k = 3, fill = NA)) %>% diff --git a/R/process.R b/R/process.R index 5a7864a..041f9b1 100644 --- a/R/process.R +++ b/R/process.R @@ -24,4 +24,4 @@ # ) %>% # arrange(cma, desc(date)) # -# write_feather(sample.raw, here::here(dir_data_processed, 'sample.feather')) +# write_feather(sample.raw, dir_data_processed('sample.feather')) diff --git a/R/visualize.R b/R/visualize.R index 9aa4df0..2be37f7 100644 --- a/R/visualize.R +++ b/R/visualize.R @@ -7,9 +7,9 @@ # ############################################################ -# plot_house_price_change <- ggplot(sample %>% -# filter(cma != 'C11'), -# aes(x = reorder(cma, yoy), y = yoy)) + +# plot_house_price_change <- sample %>% +# filter(cma != 'C11') %>% +# ggplot(aes(x = reorder(cma, yoy), y = yoy)) + # geom_bar(colour = 'white', stat = 'identity') + # scale_y_continuous(expand = c(0, 0), limits = c(0, 25)) + # coord_flip() + diff --git a/config.R b/config.R index 7ffd35b..e882381 100644 --- a/config.R +++ b/config.R @@ -6,7 +6,7 @@ ############################################################ # This initializes your startr project -startr_config <- c( +initialize_startr( author = 'Firstname Lastname ', title = 'startr', timezone = 'America/Toronto', @@ -14,7 +14,8 @@ startr_config <- c( should_process_data = TRUE, should_timestamp_output_files = FALSE, packages = c( - 'tidyverse' + 'tidyverse', 'glue', 'magrittr', 'lubridate', 'hms', + 'readxl', 'feather', 'rvest' # 'janitor', 'zoo', # 'tidymodels', # 'scales', 'gganimate', @@ -23,7 +24,5 @@ startr_config <- c( ) ) -initialize_startr(startr_config) - # Filenames: Refer to your source data filenames here sample.raw.file <- dir_data_raw('sample.csv') diff --git 
a/reports/notebook.Rmd b/reports/notebook.Rmd index 6ceff70..279b8d1 100644 --- a/reports/notebook.Rmd +++ b/reports/notebook.Rmd @@ -1,7 +1,7 @@ --- -title: '`r startr_config[['config_title']]`' +title: '`r getOption('startr.title')`' date: '`r format(Sys.Date(), "%B %d, %Y")`' -author: '`r startr_config[['config_author']]`' +author: '`r getOption('startr.author')`' output: html_notebook: code_folding: hide From 129132dcbf81d2c8f60e4f81864d7781ba3aec72 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Tue, 19 Jan 2021 19:06:28 -0500 Subject: [PATCH 04/18] cleaning up a bit --- R/analyze.R | 10 ++++------ R/functions.R | 5 ++--- R/process.R | 21 +++++++++------------ R/visualize.R | 12 ++++-------- config.R | 12 +++++------- run.R | 5 +++-- scrape/scrape.R | 15 +++++++++------ 7 files changed, 36 insertions(+), 44 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index b52f798..38f2ca3 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -1,9 +1,7 @@ -############################################################ -# This file handles the primary analysis using the tidied -# data as input. Should never read from `dir_data_raw()`, -# only `dir_data_processed()`. -# -############################################################ +# ======================================================================= +# This file handles the primary analysis using the tidied data as input. +# Should never read from `dir_data_raw()`, only `dir_data_processed()`. +# ======================================================================= # sample <- read_feather(dir_data_processed('sample.feather')) %>% # group_by(cma) %>% diff --git a/R/functions.R b/R/functions.R index 5705d88..ac1ca70 100644 --- a/R/functions.R +++ b/R/functions.R @@ -1,4 +1,3 @@ -############################################################ +# ======================================================================= # Project-specific functions. 
-# -############################################################ +# ======================================================================= diff --git a/R/process.R b/R/process.R index 041f9b1..8890595 100644 --- a/R/process.R +++ b/R/process.R @@ -1,15 +1,12 @@ -############################################################ -# This file is used to read in raw data, clean it, and save -# a file to `dir_data_processed()` *before* proceeding to -# analysis. If this file is run from run.R, all variables -# created by this step will be erased after the step is -# complete to keep a clean working environment. Tip: If your -# analysis is complicated enough that you need to break the -# processing out into multiple files, simply source them -# from this file by calling something like -# `source(dir_src('process_files', 'process_step_1.R'))` -# -############################################################ +# ======================================================================= +# This file is used to read in raw data, clean it, and save a file to +# `dir_data_processed()` before proceeding to analysis. If this file is +# run from run.R, all variables created by this step will be erased after +# the step is complete to keep a clean working environment. Tip: If your +# analysis is complicated enough that you need to break the processing +# out into multiple files, simply source them from this file by calling +# something like `source(dir_src('process_files', 'process_step_1.R'))` +# ======================================================================= # sample.raw <- read_csv(sample.raw.file) %>% # rename( diff --git a/R/visualize.R b/R/visualize.R index 2be37f7..6278082 100644 --- a/R/visualize.R +++ b/R/visualize.R @@ -1,11 +1,7 @@ -############################################################ -# Charts, maps, etc. from your data -# -# Use the `write_plot` function to write the plot directly -# to the `plots/` folder, using the variable name as -# the filename. 
-# -############################################################ +# ======================================================================= +# Graphics. Use the `write_plot` function to write the plot directly +# to the `plots/` folder, using the variable name as the filename. +# ======================================================================= # plot_house_price_change <- sample %>% # filter(cma != 'C11') %>% diff --git a/config.R b/config.R index e882381..54a3e48 100644 --- a/config.R +++ b/config.R @@ -1,9 +1,7 @@ -############################################################ -# This file configures the project by specifying filenames, -# loading packages and setting up some project-specific -# variables. -# -############################################################ +# ================================================================= +# This file configures the project by specifying filenames, loading +# packages and setting up some project-specific variables. +# ================================================================= # This initializes your startr project initialize_startr( @@ -25,4 +23,4 @@ initialize_startr( ) # Filenames: Refer to your source data filenames here -sample.raw.file <- dir_data_raw('sample.csv') +sample.raw.file <- dir_data_raw('your-filename-here.csv') diff --git a/run.R b/run.R index cbf9678..edb1fb7 100644 --- a/run.R +++ b/run.R @@ -1,7 +1,8 @@ -if (!require('startr-utils')) install.packages('startr-utils'); library('startr-utils') +# if (!require('upstartr')) install.packages('upstartr'); library('upstartr') +devtools::install_github('globeandmail/upstartr'); library('upstartr') run_config() run_process() run_analyze() run_visualize() -run_render_notebook() +run_notebook() diff --git a/scrape/scrape.R b/scrape/scrape.R index fedb322..ff98d1f 100644 --- a/scrape/scrape.R +++ b/scrape/scrape.R @@ -1,6 +1,9 @@ -############################################################ -# This file handles any scrapes that might be 
necessary, -# and doesn't get called by the main block. -# -# -############################################################ +# ======================================================================= +# Put any scraping code here. This file doesn't get called by `run.R`. +# ======================================================================= + +if (!require('upstartr')) install.packages('upstartr'); library('upstartr') + +run_config() + +# Scraping code goes here. From 31d41dfc23facf6dde4c0104eb10a066c23d74e5 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Tue, 19 Jan 2021 19:12:50 -0500 Subject: [PATCH 05/18] simplifying note --- R/process.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/R/process.R b/R/process.R index 8890595..5b1e0e8 100644 --- a/R/process.R +++ b/R/process.R @@ -1,11 +1,9 @@ # ======================================================================= -# This file is used to read in raw data, clean it, and save a file to -# `dir_data_processed()` before proceeding to analysis. If this file is -# run from run.R, all variables created by this step will be erased after -# the step is complete to keep a clean working environment. Tip: If your -# analysis is complicated enough that you need to break the processing -# out into multiple files, simply source them from this file by calling -# something like `source(dir_src('process_files', 'process_step_1.R'))` +# Read raw data, clean it and save it out to `dir_data_processed()` here +# before moving to analysis. If run from `run.R`, all variables generated +# in this file will be wiped after completion to keep the environment +# clean. 
If your processing is complex, you can break it out into several +# files like this: `source(dir_src('process_files', 'process_step_1.R'))` # ======================================================================= # sample.raw <- read_csv(sample.raw.file) %>% From 056fca43ecc967ceb330561c42b66e32659bc113 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Wed, 20 Jan 2021 13:09:25 -0500 Subject: [PATCH 06/18] adding stuff --- R/process.R | 4 ++-- config.R | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/process.R b/R/process.R index 5b1e0e8..ad0323e 100644 --- a/R/process.R +++ b/R/process.R @@ -2,8 +2,8 @@ # Read raw data, clean it and save it out to `dir_data_processed()` here # before moving to analysis. If run from `run.R`, all variables generated # in this file will be wiped after completion to keep the environment -# clean. If your processing is complex, you can break it out into several -# files like this: `source(dir_src('process_files', 'process_step_1.R'))` +# clean. 
If your process step is complex, you can break it into several +# files like so: `source(dir_src('process_files', 'process_step_1.R'))` # ======================================================================= # sample.raw <- read_csv(sample.raw.file) %>% diff --git a/config.R b/config.R index 54a3e48..786c9aa 100644 --- a/config.R +++ b/config.R @@ -13,7 +13,8 @@ initialize_startr( should_timestamp_output_files = FALSE, packages = c( 'tidyverse', 'glue', 'magrittr', 'lubridate', 'hms', - 'readxl', 'feather', 'rvest' + 'readxl', 'feather', 'rvest', + # 'globeandmail/tgamtheme', # 'janitor', 'zoo', # 'tidymodels', # 'scales', 'gganimate', From 046e07f6aae1ee8bc3dc299294b7c1750ea2d743 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Thu, 21 Jan 2021 02:23:52 -0500 Subject: [PATCH 07/18] updates readme --- README.md | 100 ++++++++++++++++++++++++++++++++++++++---------------- config.R | 2 +- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index a5ad83c..21acbd4 100644 --- a/README.md +++ b/README.md @@ -13,47 +13,48 @@ Broadly, `startr` does a few things: * **Improves communication**: Documents the analysis steps and questions to be answered for large, multi-disciplinary teams (say, developers, data journalists and traditional reporters) * **Simplifies the generation of charts and reports**: Generates easily updatable RMarkdown reports, Adobe Illustrator-ready graphics, and datasets during analysis - ## How do I use this? -This template works with R and RStudio, so you'll need both of those installed. Then, just clone down this project, or, better yet, use our scaffolding tool, [`startr-cli`](https://www.github.com/globeandmail/startr-cli). +This template works with R and RStudio, so you'll need both of those installed. 
To scaffold a new `startr` project, we recommend using our command-line tool, [`startr-cli`](https://www.github.com/globeandmail/startr-cli), which will rename some files, configure the project and initialize an empty Git repository. Alternatively, you can run: +```sh +git clone https://github.com/globeandmail/startr.git +``` Once the project's cloned, double-click on the `.Rproj` file to start a scoped RStudio instance. -You can then start adding your data and writing your analysis. At The Globe, we like to work in a code editor like Atom or Sublime Text, and use something like [`r-exec`](https://atom.io/packages/r-exec) to send code chunks to RStudio. - +You can then start copying in your data and writing your analysis. At The Globe, we like to work in a code editor like Atom or Sublime Text, and use something like [`r-exec`](https://atom.io/packages/r-exec) to send code chunks to RStudio. ## Example workflow using `startr` Here's how we use `startr` for our own analysis workflow right now. The heart of the project lies in these three files: -* **`process.R`**: Imports source data, tidies it, fixes errors, sets types, applies manipulations and saves out a Feather file ready for analysis (or, in other cases, a CSV, a shapefile, etc.). +* **`process.R`**: Import your source data, tidy it, fix any errors, set types, apply upfront manipulations and save out a file ready for analysis. We recommend saving out a `.feather` file, which will retain types and reads extremely quickly — but you can also use a CSV, shapefile, RDS file or something else if you'd prefer. -* **`analyze.R`**: Consumes the data files saved out by `process.R`, and is where all of the true "analysis" occurs, including grouping, summarizing, filtering, etc. All descriptive and relational statistical analysis. More complicated analysis can be split into additional `analyze_somestep.R` files as required. +* **`analyze.R`**: Here you'll consume the data files saved out by `process.R`. 
This is where all of the true "analysis" occurs, including grouping, summarizing, filtering, etc. If your analysis is complex enough, you may want to split it into additional `analyze_step_X.R` files as required. -* **`visualize.R`**: Generates plots. +* **`visualize.R`**: Draw and save out your graphics. There's also an optional (but recommended) RMarkdown file (**`notebook.Rmd`**) you can use to generate a report – especially useful for longer-term projects where you need to document the questions you're asking. #### Step 1: Set up your project -Packages are managed through the `packages` list in the `config.R` file. `devtools` and `here` are loaded by default. The `load_requirements()` function loads, and optionally installs, required packages. +The bulk of any `startr` project's code lives within the `R` directory, in files that are sourced and run in sequence by the `run.R` at the project's root. -The bulk of the analysis is based on a set of files within the `R` directory which are sourced and run in order by `run.R` at the project root. +Many of the core functions for this project are managed by a specialty package, [**upstartr**](https://www.github.com/globeandmail/upstartr). That package is installed and imported in `run.R`. -Before starting an analysis, you'll want to point to your data files in `config.R` and make sure it's loading all the packages you'll need. For instance, you might want to add the [`cancensus`](https://github.com/mountainMath/cancensus) package. To do that, just add `'cancensus'` to the `packages` vector. Package suggestions for GIS work, scraping, dataset summaries, etc. are included and commented out to avoid bloat. +Before starting an analysis, you'll want to point to your data files in `config.R` and make sure it's loading all the packages you'll need. For instance, you might want to add the [`cancensus`](https://github.com/mountainMath/cancensus) package. To do that, just add `'cancensus'` to the `packages` vector. 
Package suggestions for GIS work, scraping, dataset summaries, etc. are included and commented out to avoid bloat. The [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) function also takes several other optional parameters — for a full list, see the [function's](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) documentation. -Once that's done, you'll want to reference your raw data filenames. For instance, if you're adding pizza delivery data, you'd add this line to the "Files" block in `config.R`: +Once you've listed the packages you want to import, you'll want to reference your raw data filenames. For instance, if you're adding pizza delivery data, you'd add this line to the filenames block in `config.R`: ```R -pizza.raw.file <- here::here(dir_data_raw, 'Citywide Pizza Deliveries 1998-2016.xlsx') +pizza.raw.file <- dir_data_raw('Citywide Pizza Deliveries 1998-2016.xlsx') ``` Our naming convention is to append `.raw` to variables that reference raw data, and `.file` to variables that are just filename strings. #### Step 2: Import and process your data -In `process.R`, you'll consume the variables you created in `config.R`, clean them up, rename variables, deal with any errors, convert multiple data files to a common structure if necessary, and save out the result, plus some cleanup at the end so as to not pollute the environment. It might look something like this: +In `process.R`, you'll consume the variables you created in `config.R`, clean them up, rename variables, deal with any errors, convert multiple data files to a common structure if necessary, and save out the result. 
It might look something like this: ```R pizza.raw <- read_excel(pizza.raw.file, skip = 2) %>% @@ -76,19 +77,21 @@ pizza.raw <- read_excel(pizza.raw.file, skip = 2) %>% ) %>% filter(!is.na(date)) -write_feather(pizza.raw, here::here(dir_data_processed, 'pizza.feather')) +write_feather(pizza.raw, dir_data_processed('pizza.feather')) ``` -We prefer to write out the output as a `.feather` file, which is a binary format designed to read and write files extremely fast (at roughly 600 MB/s). Feather files can also be opened in other analysis frameworks (i.e. Jupyter Notebooks) and, most importantly, embed the column types so that you don't have to re-assert them later. If you'd rather save out files in a different format, you can just use a different function, like the Tidyverse's `write_csv`. +When called via the `run_process()` function in `run.R`, variables generated during processing will be removed once the step is completed to keep the working environment clean for analysis. + +We prefer to write out our outputs as a `.feather` file, which is a binary format designed to read and write files extremely fast (at roughly 600 MB/s). Feather files can also be opened in other analysis frameworks (i.e. Jupyter Notebooks) and, most importantly, have embedded column type data so that you don't have to re-assert them later. If you'd rather save out files in a different format, you can just use a different function, like the Tidyverse's `write_csv`. -Output files written to `dir_data_processed` (that is, `/data/processed`) aren't checked into Git by design — you should be able to reproduce the analysis-ready files from someone else's project by running `process.R`. +Output files are written to `/data/processed` using the `dir_data_processed()` function. By design, processed files aren't checked into Git — you should be able to reproduce the analysis-ready files from someone else's project by running `process.R`. 
#### Step 2: Do your analysis This part's as simple as consuming that file in `analyze.R` and running with it. It might look something like this: ```R -pizza <- read_feather(here::here(dir_data_processed, 'pizza.feather')) +pizza <- read_feather(dir_data_processed('pizza.feather')) delivery_person_counts <- pizza %>% group_by(person) %>% @@ -124,9 +127,9 @@ write_plot(plot_deliveries_monthly) ## Helper functions -This template comes with several pre-made helper functions that we've found useful in daily data journalism tasks. +`startr`'s companion package [`upstartr`](https://www.github.com/globeandmail/upstartr) comes with several functions to support `startr`, plus helpers we've found useful in daily data journalism tasks. A full list can be found on the [reference page here](https://globeandmail.github.io/upstartr/reference/index.html). Below is a partial list of some of its most handy functions: -- `read_all_excel_sheets`: Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. Takes all the same arguments as `readxl`'s `read_excel`. +- [`read_all_excel_sheets`](https://globeandmail.github.io/upstartr/reference/read_all_excel_sheets.html): Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. Takes all the same arguments as `readxl`'s `read_excel`. ```r pizza_deliveries <- read_all_excel_sheets( @@ -136,15 +139,14 @@ This template comes with several pre-made helper functions that we've found usef rename(pizza_shop = 'sheet') ``` -- `simplify_string`: By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. 
+- [`simplify_string`](https://globeandmail.github.io/upstartr/reference/simplify_string.html): By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. ```r pizza_deliveries %>% mutate(customer_simplified = simplify_string(customer_name)) - ``` -- `index`: Calculate percentage growth by indexing values to the first value: +- [`calc_index`](https://globeandmail.github.io/upstartr/reference/calc_index.html): Calculate percentage growth by indexing values to the first value: ```r pizza_deliveries %>% @@ -155,7 +157,7 @@ This template comes with several pre-made helper functions that we've found usef mutate(indexed_deliveries = index(total_deliveries)) ``` -- `mode`: Calculate the mode for a given field: +- [`calc_mode`](https://globeandmail.github.io/upstartr/reference/calc_mode.html): Calculate the mode for a given field: ```r @@ -164,21 +166,21 @@ This template comes with several pre-made helper functions that we've found usef summarise(most_common_size = mode(size)) ``` -- `clean_columns`: Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns. If there are many columns with the same name during cleanup, they'll be appended with an index number. +- [`clean_columns`](https://globeandmail.github.io/upstartr/reference/clean_columns.html): Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. 
This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns. If there are many columns with the same name during cleanup, they'll be appended with an index number. ```r pizza_deliveries %>% rename_all(clean_columns) ``` -- `convert_str_to_logical`: Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. +- [`convert_str_to_logical`](https://globeandmail.github.io/upstartr/reference/convert_str_to_logical.html): Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. ```r pizza_deliveries %>% mutate(was_delivered_logi = convert_str_to_logical(was_delivered)) ``` -- `write_excel`: Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. +- [`write_excel`](https://globeandmail.github.io/upstartr/reference/write_excel.html): Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `should_timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. ```r undelivered_pizzas <- pizza_deliveries %>% @@ -187,7 +189,7 @@ This template comes with several pre-made helper functions that we've found usef write_excel(undelivered_pizzas) ``` -- `write_plot`: Similar to `write_excel`, designed to quickly save out a plot directly to `/plots`. Takes all the same arguments as `ggsave`. 
+- [`write_plot`](https://globeandmail.github.io/upstartr/reference/write_plot.html): Similar to `write_excel`, designed to quickly save out a plot directly to `/plots`. Takes all the same arguments as `ggsave`. ```r plot_undelivered_pizzas <- undelivered_pizzas %>% @@ -199,7 +201,40 @@ This template comes with several pre-made helper functions that we've found usef write_plot(plot_undelivered_pizzas) ``` -- `begin_processing` and `end_processing`: functions that are run at the top and bottom of `process.R` that clean up the environment of temporary variables created during the data processing step. To disable this, set the `clean_processing_variables` flag in `config.R` to FALSE. +- [`combine_csvs`](https://globeandmail.github.io/upstartr/reference/combine_csvs.html): Read all CSVs in a given directory and concatenate them into a single file. + + ```r + pizzas <- combine_csvs(dir_data_raw()) + ``` + +- [`combine_excels`](https://globeandmail.github.io/upstartr/reference/combine_excels.html): Read all Excel spreadsheets in a given directory and concatenate them. + + ```r + pizzas_in_excel <- combine_excels(dir_data_raw()) + ``` + +- `unaccent`(https://globeandmail.github.io/upstartr/reference/unaccent.html): Remove accents from strings. + + ```r + unaccent('Montréal') + ``` + +- [`remove_non_utf8`](https://globeandmail.github.io/upstartr/reference/remove_non_utf8.html): Remove non-UTF-8 characters from strings. + + ```r + non_utf8 <- 'fa\xE7ile' + Encoding(non_utf8) <- 'latin1' + remove_non_utf8(non_utf8) + ``` + +- [`%not_in%`](https://globeandmail.github.io/upstartr/reference/grapes-not_in-grapes.html): The opposite of the `%in%` operator. + + ```r + c(1, 2, 3, 4, 5) %not_in% c(4, 5, 6, 7, 8) + ``` + +- [`not.na`](https://globeandmail.github.io/upstartr/reference/not.na.html): The opposite of the `is.na` function. +- [`not.null`](https://globeandmail.github.io/upstartr/reference/not.null.html): The opposite of the `is.null` function.
## Tips for using `startr` @@ -223,7 +258,6 @@ This template comes with several pre-made helper functions that we've found usef │   ├── process.R # Data processing including tidying, processing and manupulation. │   ├── analyze.R # The primary analysis steps. │   ├── visualize.R # Generate plots as png, pdf, etc. -│   ├── utils.R # Commonly-used functions. │   └── functions.R # Project-specific functions. ├── scrape/ │   └── scrape.R # Scraping scripts that save collected data to the `/data/raw/` directory. @@ -237,6 +271,14 @@ This template comes with several pre-made helper functions that we've found usef An `.nvmrc` is included at the project root for scraping with Node. A `venv` and `requirements.txt` file should be included within the scraper directory if Python is used for scraping. +## See also + +`startr` is part of a small ecosystem of R utilities. Those include: + +- [**upstartr**](https://www.github.com/globeandmail/upstartr), a library of functions that support `startr` and daily data journalism tasks +- [**tgamtheme**](https://www.github.com/globeandmail/tgamtheme), The Globe and Mail's graphics theme +- [**startr-cli**](https://www.github.com/globeandmail/startr-cli), a command-line tool that scaffolds new `startr` projects + ## Version 1.0.2 diff --git a/config.R b/config.R index 786c9aa..240e8e1 100644 --- a/config.R +++ b/config.R @@ -23,5 +23,5 @@ initialize_startr( ) ) -# Filenames: Refer to your source data filenames here +# Refer to your source data filenames here sample.raw.file <- dir_data_raw('your-filename-here.csv') From 2809d2d8dfcc71f317ca69aab8b3e21ddffc7cf7 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Thu, 21 Jan 2021 02:25:53 -0500 Subject: [PATCH 08/18] update readme --- README.md | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 21acbd4..921cc75 100644 --- a/README.md +++ b/README.md @@ -129,21 +129,25 @@ 
write_plot(plot_deliveries_monthly) `startr`'s companion package [`upstartr`](https://www.github.com/globeandmail/upstartr) comes with several functions to support `startr`, plus helpers we've found useful in daily data journalism tasks. A full list can be found on the [reference page here](https://globeandmail.github.io/upstartr/reference/index.html). Below is a partial list of some of its most handy functions: -- [`read_all_excel_sheets`](https://globeandmail.github.io/upstartr/reference/read_all_excel_sheets.html): Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. Takes all the same arguments as `readxl`'s `read_excel`. +- [`simplify_string`](https://globeandmail.github.io/upstartr/reference/simplify_string.html): By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. ```r - pizza_deliveries <- read_all_excel_sheets( - pizza_deliveries.file, - skip = 3, - ) %>% - rename(pizza_shop = 'sheet') + pizza_deliveries %>% + mutate(customer_simplified = simplify_string(customer_name)) ``` -- [`simplify_string`](https://globeandmail.github.io/upstartr/reference/simplify_string.html): By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. +- [`clean_columns`](https://globeandmail.github.io/upstartr/reference/clean_columns.html): Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. 
This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns. If there are many columns with the same name during cleanup, they'll be appended with an index number. ```r pizza_deliveries %>% - mutate(customer_simplified = simplify_string(customer_name)) + rename_all(clean_columns) + ``` + +- [`convert_str_to_logical`](https://globeandmail.github.io/upstartr/reference/convert_str_to_logical.html): Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. + + ```r + pizza_deliveries %>% + mutate(was_delivered_logi = convert_str_to_logical(was_delivered)) ``` - [`calc_index`](https://globeandmail.github.io/upstartr/reference/calc_index.html): Calculate percentage growth by indexing values to the first value: @@ -159,27 +163,12 @@ write_plot(plot_deliveries_monthly) - [`calc_mode`](https://globeandmail.github.io/upstartr/reference/calc_mode.html): Calculate the mode for a given field: - ```r pizza_deliveries %>% group_by(pizza_shop) %>% summarise(most_common_size = mode(size)) ``` -- [`clean_columns`](https://globeandmail.github.io/upstartr/reference/clean_columns.html): Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns. If there are many columns with the same name during cleanup, they'll be appended with an index number. - - ```r - pizza_deliveries %>% - rename_all(clean_columns) - ``` - -- [`convert_str_to_logical`](https://globeandmail.github.io/upstartr/reference/convert_str_to_logical.html): Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. 
- - ```r - pizza_deliveries %>% - mutate(was_delivered_logi = convert_str_to_logical(was_delivered)) - ``` - - [`write_excel`](https://globeandmail.github.io/upstartr/reference/write_excel.html): Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `should_timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. ```r @@ -201,6 +190,16 @@ write_plot(plot_deliveries_monthly) write_plot(plot_undelivered_pizzas) ``` +- [`read_all_excel_sheets`](https://globeandmail.github.io/upstartr/reference/read_all_excel_sheets.html): Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. Takes all the same arguments as `readxl`'s `read_excel`. + + ```r + pizza_deliveries <- read_all_excel_sheets( + pizza_deliveries.file, + skip = 3, + ) %>% + rename(pizza_shop = 'sheet') + ``` + - [`combine_csvs`](https://globeandmail.github.io/upstartr/reference/combine_csvs.html): Read all CSVs in a given directory and concatenate them into a single file. ```r @@ -213,7 +212,7 @@ write_plot(plot_deliveries_monthly) pizzas_in_excel <- combine_excels(dir_data_raw()) ``` -- `unaccent`(https://globeandmail.github.io/upstartr/reference/unaccent.html): Remove accents from strings. +- [`unaccent`](https://globeandmail.github.io/upstartr/reference/unaccent.html): Remove accents from strings. 
```r unaccent('Montréal') From bac815e259b59e3e551583fdc19d7944f41ec8ef Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Thu, 21 Jan 2021 02:30:50 -0500 Subject: [PATCH 09/18] small tweak to config --- config.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.R b/config.R index 240e8e1..c1889d3 100644 --- a/config.R +++ b/config.R @@ -5,8 +5,8 @@ # This initializes your startr project initialize_startr( - author = 'Firstname Lastname ', title = 'startr', + author = 'Firstname Lastname ', timezone = 'America/Toronto', should_render_notebook = FALSE, should_process_data = TRUE, From 5a06dbb171f7b1c8c1cf3ffed734960a0e325f40 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Thu, 21 Jan 2021 02:32:26 -0500 Subject: [PATCH 10/18] bumping version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 921cc75..31648eb 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ An `.nvmrc` is included at the project root for scraping with Node. A `venv` and ## Version -1.0.2 +1.1.0 ## License From 3c726b2590090c46f915d4d3283abc1410f1a464 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Thu, 21 Jan 2021 02:32:42 -0500 Subject: [PATCH 11/18] updating description --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 27172d7..d67bbeb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: startR Type: Package Title: A Template For Data Journalism Projects In R -Version: 1.0.1 +Version: 1.1.0 Author: Michael Pereira and Tom Cardoso Maintainer: Michael Pereira and Tom Cardoso Description: This project structures the data analysis process around an expected set of files and steps. 
From d96ea0a751496279adffd311803b94f91735b739 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Fri, 29 Jan 2021 17:16:05 -0500 Subject: [PATCH 12/18] updates README --- README.md | 115 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 31648eb..044cc5e 100644 --- a/README.md +++ b/README.md @@ -13,38 +13,67 @@ Broadly, `startr` does a few things: * **Improves communication**: Documents the analysis steps and questions to be answered for large, multi-disciplinary teams (say, developers, data journalists and traditional reporters) * **Simplifies the generation of charts and reports**: Generates easily updatable RMarkdown reports, Adobe Illustrator-ready graphics, and datasets during analysis -## How do I use this? - -This template works with R and RStudio, so you'll need both of those installed. To scaffold a new `startr` project, we recommend using our command-line tool, [`startr-cli`](https://www.github.com/globeandmail/startr-cli), which will rename some files, configure the project and initialize an empty Git repository. Alternatively, you can run: +## Table of contents +* [startr](#startr) +* [Table of contents](#table-of-contents) +* [Installation](#installation) +* [Philosophy on data analysis](#philosophy-on-data-analysis) +* [Workflow](#workflow) + 1. [Set up your project](#step-1-set-up-your-project) + 2. [Import and process data](#step-2-import-and-process-data) + 3. [Analyze](#step-3-analyze) + 4. [Visualize](#step-4-visualize) + 5. [Write a notebook](#step-5-write-a-notebook) +* [Helper functions](#helper-functions) +* [Tips](#tips) +* [Directory structure](#directory-structure) +* [See also](#see-also) +* [Version](#version) +* [License](#license) +* [Get in touch](#get-in-touch) + +## Installation + +This template works with R and RStudio, so you'll need both of those installed.
To scaffold a new `startr` project, we recommend using our command-line tool, [`startr-cli`](https://www.github.com/globeandmail/startr-cli), which will rename some files, configure the project and initialize an empty Git repository. + +Alternatively, you can run: ```sh git clone https://github.com/globeandmail/startr.git ``` -Once the project's cloned, double-click on the `.Rproj` file to start a scoped RStudio instance. +(But, if you do that, be sure to rename your `startr.Rproj` file to `.Rproj`.) + +Once a fresh project is ready, double-click on the `.Rproj` file to start a scoped RStudio instance. You can then start copying in your data and writing your analysis. At The Globe, we like to work in a code editor like Atom or Sublime Text, and use something like [`r-exec`](https://atom.io/packages/r-exec) to send code chunks to RStudio. -## Example workflow using `startr` +## Philosophy on data analysis + +TKTKTKTK + +## Workflow -Here's how we use `startr` for our own analysis workflow right now. The heart of the project lies in these three files: +The heart of the project lies in these three files: -* **`process.R`**: Import your source data, tidy it, fix any errors, set types, apply upfront manipulations and save out a file ready for analysis. We recommend saving out a `.feather` file, which will retain types and reads extremely quickly — but you can also use a CSV, shapefile, RDS file or something else if you'd prefer. +* **`process.R`**: Import your source data, tidy it, fix any errors, set types, apply upfront manipulations and save out a file ready for analysis. We recommend saving out a [`.feather`](https://github.com/wesm/feather) file, which will retain types and reads extremely quickly — but you can also use a CSV, shapefile, RDS file or something else if you'd prefer. * **`analyze.R`**: Here you'll consume the data files saved out by `process.R`. This is where all of the true "analysis" occurs, including grouping, summarizing, filtering, etc. 
If your analysis is complex enough, you may want to split it into additional `analyze_step_X.R` files as required. * **`visualize.R`**: Draw and save out your graphics. -There's also an optional (but recommended) RMarkdown file (**`notebook.Rmd`**) you can use to generate a report – especially useful for longer-term projects where you need to document the questions you're asking. +There's also an optional (but recommended) RMarkdown file (**`notebook.Rmd`**) you can use to generate an HTML codebook – especially useful for longer-term projects where you need to document the questions you're asking. #### Step 1: Set up your project The bulk of any `startr` project's code lives within the `R` directory, in files that are sourced and run in sequence by the `run.R` at the project's root. -Many of the core functions for this project are managed by a specialty package, [**upstartr**](https://www.github.com/globeandmail/upstartr). That package is installed and imported in `run.R`. +Many of the core functions for this project are managed by a specialty package, [**upstartr**](https://www.github.com/globeandmail/upstartr). That package is installed and imported in `run.R` automatically. -Before starting an analysis, you'll want to point to your data files in `config.R` and make sure it's loading all the packages you'll need. For instance, you might want to add the [`cancensus`](https://github.com/mountainMath/cancensus) package. To do that, just add `'cancensus'` to the `packages` vector. Package suggestions for GIS work, scraping, dataset summaries, etc. are included and commented out to avoid bloat. The [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) function also takes several other optional parameters — for a full list, see the [function's](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) documentation. +Before starting an analysis, you'll need to set up your `config.R` file. 
-Once you've listed the packages you want to import, you'll want to reference your raw data filenames. For instance, if you're adding pizza delivery data, you'd add this line to the filenames block in `config.R`: +That file uses the [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) function to prepare the environment for analysis. It will also load all the packages you'll need. For instance, you might want to add the [`cancensus`](https://github.com/mountainMath/cancensus) library. To do that, just add `'cancensus'` to the `packages` vector. Package suggestions for GIS work, scraping, dataset summaries, etc. are included in commented-out form to avoid bloat. The function also takes several other optional parameters — for a full list, see our [documentation](https://globeandmail.github.io/upstartr/reference/initialize_startr.html). + +Once you've listed the packages you want to import, you'll want to reference your raw data filenames so that you can read them in during `process.R`. For instance, if you're adding pizza delivery data, you'd add this line to the filenames block in `config.R`: ```R pizza.raw.file <- dir_data_raw('Citywide Pizza Deliveries 1998-2016.xlsx') @@ -52,9 +81,9 @@ pizza.raw.file <- dir_data_raw('Citywide Pizza Deliveries 1998-2016.xlsx') Our naming convention is to append `.raw` to variables that reference raw data, and `.file` to variables that are just filename strings. -#### Step 2: Import and process your data +#### Step 2: Import and process data -In `process.R`, you'll consume the variables you created in `config.R`, clean them up, rename variables, deal with any errors, convert multiple data files to a common structure if necessary, and save out the result. 
It might look something like this: +In `process.R`, you'll read in the data for the filename variables you assigned in `config.R`, do some clean-up, rename variables, deal with any errors, convert multiple files to a common data structure if necessary, and finally save out the result. It might look something like this: ```R pizza.raw <- read_excel(pizza.raw.file, skip = 2) %>% @@ -80,13 +109,13 @@ pizza.raw <- read_excel(pizza.raw.file, skip = 2) %>% write_feather(pizza.raw, dir_data_processed('pizza.feather')) ``` -When called via the `run_process()` function in `run.R`, variables generated during processing will be removed once the step is completed to keep the working environment clean for analysis. +When called via the [`run_process()`](https://globeandmail.github.io/upstartr/reference/run_process.html) function in `run.R`, variables generated during processing will be removed once the step is completed to keep the working environment clean for analysis. -We prefer to write out our outputs as a `.feather` file, which is a binary format designed to read and write files extremely fast (at roughly 600 MB/s). Feather files can also be opened in other analysis frameworks (i.e. Jupyter Notebooks) and, most importantly, have embedded column type data so that you don't have to re-assert them later. If you'd rather save out files in a different format, you can just use a different function, like the Tidyverse's `write_csv`. +We prefer to write out our processed files using the binary [`.feather`](https://github.com/wesm/feather) format, which is designed to read and write files extremely quickly (at roughly 600 MB/s). Feather files can also be opened in other analysis frameworks (e.g. Jupyter Notebooks) and, most importantly, embed column types into the data so that you don't have to re-declare a column as logicals, dates or characters later on.
If you'd rather save out files in a different format, you can just use a different function, like the tidyverse's [`write_csv()`](https://readr.tidyverse.org/reference/write_delim.html). -Output files are written to `/data/processed` using the `dir_data_processed()` function. By design, processed files aren't checked into Git — you should be able to reproduce the analysis-ready files from someone else's project by running `process.R`. +Output files are written to `/data/processed` using the [`dir_data_processed()`](https://globeandmail.github.io/upstartr/reference/dir-data_processed.html) function. By design, processed files aren't checked into Git — you should be able to reproduce the analysis-ready files from someone else's project by running `process.R`. -#### Step 2: Do your analysis +#### Step 3: Analyze This part's as simple as consuming that file in `analyze.R` and running with it. It might look something like this: @@ -106,51 +135,59 @@ deliveries_monthly <- pizza %>% ) ``` -#### Step 3: Visualize your analysis +#### Step 4: Visualize You can use `visualize.R` to consume the variables created in `analyze.R`. For instance: ```R -plot_delivery_persons <- ggplot(delivery_person_counts, aes(x = person, y = n)) + +plot_delivery_persons <- delivery_person_counts %>% + ggplot(aes(x = person, y = n)) + geom_col() + coord_flip() -plot(plot_delivery_persons) +plot_delivery_persons + +write_plot(plot_delivery_persons) -plot_deliveries_monthly <- ggplot(deliveries_monthly, aes(x = year_month, y = n)) + +plot_deliveries_monthly <- deliveries_monthly %>% + ggplot(aes(x = year_month, y = n)) + geom_col() -plot(plot_deliveries_monthly) +plot_deliveries_monthly write_plot(plot_deliveries_monthly) ``` +#### Step 5: Write a notebook + +TKTKTKTK + ## Helper functions `startr`'s companion package [`upstartr`](https://www.github.com/globeandmail/upstartr) comes with several functions to support `startr`, plus helpers we've found useful in daily data journalism tasks. 
A full list can be found on the [reference page here](https://globeandmail.github.io/upstartr/reference/index.html). Below is a partial list of some of its most handy functions: -- [`simplify_string`](https://globeandmail.github.io/upstartr/reference/simplify_string.html): By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. +- [`simplify_string()`](https://globeandmail.github.io/upstartr/reference/simplify_string.html): By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. ```r pizza_deliveries %>% mutate(customer_simplified = simplify_string(customer_name)) ``` -- [`clean_columns`](https://globeandmail.github.io/upstartr/reference/clean_columns.html): Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns. If there are many columns with the same name during cleanup, they'll be appended with an index number. +- [`clean_columns()`](https://globeandmail.github.io/upstartr/reference/clean_columns.html): Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns. 
If there are many columns with the same name during cleanup, they'll be appended with an index number. ```r pizza_deliveries %>% rename_all(clean_columns) ``` -- [`convert_str_to_logical`](https://globeandmail.github.io/upstartr/reference/convert_str_to_logical.html): Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. +- [`convert_str_to_logical()`](https://globeandmail.github.io/upstartr/reference/convert_str_to_logical.html): Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. ```r pizza_deliveries %>% mutate(was_delivered_logi = convert_str_to_logical(was_delivered)) ``` -- [`calc_index`](https://globeandmail.github.io/upstartr/reference/calc_index.html): Calculate percentage growth by indexing values to the first value: +- [`calc_index()`](https://globeandmail.github.io/upstartr/reference/calc_index.html): Calculate percentage growth by indexing values to the first value: ```r pizza_deliveries %>% @@ -161,7 +198,7 @@ write_plot(plot_deliveries_monthly) mutate(indexed_deliveries = index(total_deliveries)) ``` -- [`calc_mode`](https://globeandmail.github.io/upstartr/reference/calc_mode.html): Calculate the mode for a given field: +- [`calc_mode()`](https://globeandmail.github.io/upstartr/reference/calc_mode.html): Calculate the mode for a given field: ```r pizza_deliveries %>% @@ -169,7 +206,7 @@ write_plot(plot_deliveries_monthly) summarise(most_common_size = mode(size)) ``` -- [`write_excel`](https://globeandmail.github.io/upstartr/reference/write_excel.html): Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `should_timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. 
+- [`write_excel()`](https://globeandmail.github.io/upstartr/reference/write_excel.html): Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `should_timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. ```r undelivered_pizzas <- pizza_deliveries %>% @@ -178,7 +215,7 @@ write_plot(plot_deliveries_monthly) write_excel(undelivered_pizzas) ``` -- [`write_plot`](https://globeandmail.github.io/upstartr/reference/write_plot.html): Similar to `write_excel`, designed to quickly save out a plot directly to `/plots`. Takes all the same arguments as `ggsave`. +- [`write_plot()`](https://globeandmail.github.io/upstartr/reference/write_plot.html): Similar to [`write_excel()`](https://globeandmail.github.io/upstartr/reference/write_excel.html), designed to quickly save out a plot directly to `/plots`. Takes all the same arguments as [`ggsave()`](https://ggplot2.tidyverse.org/reference/ggsave.html). ```r plot_undelivered_pizzas <- undelivered_pizzas %>% @@ -190,7 +227,7 @@ write_plot(plot_deliveries_monthly) write_plot(plot_undelivered_pizzas) ``` -- [`read_all_excel_sheets`](https://globeandmail.github.io/upstartr/reference/read_all_excel_sheets.html): Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. Takes all the same arguments as `readxl`'s `read_excel`. +- [`read_all_excel_sheets()`](https://globeandmail.github.io/upstartr/reference/read_all_excel_sheets.html): Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. Takes all the same arguments as [`readxl`](https://readxl.tidyverse.org/)'s [`read_excel()`](https://readxl.tidyverse.org/reference/read_excel.html). 
```r pizza_deliveries <- read_all_excel_sheets( @@ -200,25 +237,25 @@ write_plot(plot_deliveries_monthly) rename(pizza_shop = 'sheet') ``` -- [`combine_csvs`](https://globeandmail.github.io/upstartr/reference/combine_csvs.html): Read all CSVs in a given directory and concatenate them into a single file. +- [`combine_csvs()`](https://globeandmail.github.io/upstartr/reference/combine_csvs.html): Read all CSVs in a given directory and concatenate them into a single file. Takes all the same arguments as [`read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) ```r pizzas <- combine_csvs(dir_data_raw()) ``` -- [`combine_excels`](https://globeandmail.github.io/upstartr/reference/combine_excels.html): Read all Excel spreadsheets in a given directory and concatenate them. +- [`combine_excels()`](https://globeandmail.github.io/upstartr/reference/combine_excels.html): Read all Excel spreadsheets in a given directory and concatenate them. ```r pizzas_in_excel <- combine_excels(dir_data_raw()) ``` -- [`unaccent`](https://globeandmail.github.io/upstartr/reference/unaccent.html): Remove accents from strings. +- [`unaccent()`](https://globeandmail.github.io/upstartr/reference/unaccent.html): Remove accents from strings. ```r unaccent('Montréal') ``` -- [`remove_non_utf8`](https://globeandmail.github.io/upstartr/reference/remove_non_utf8.html): Remove non-UTF-8 characters from strings. +- [`remove_non_utf8()`](https://globeandmail.github.io/upstartr/reference/remove_non_utf8.html): Remove non-UTF-8 characters from strings. ```r non_utf8 <- 'fa\xE7ile' @@ -226,19 +263,19 @@ write_plot(plot_deliveries_monthly) remove_non_utf8(non_utf8) ``` -- [`%not_in%`](https://globeandmail.github.io/upstartr/reference/grapes-not_in-grapes.html): The opposite of the `%in%` operator. +- [`%not_in%`](https://globeandmail.github.io/upstartr/reference/grapes-not_in-grapes.html): The opposite of the [`%in%`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/match.html) operator. 
```r c(1, 2, 3, 4, 5) %not_in% c(4, 5, 6, 7, 8) ``` -- [`not.na`](https://globeandmail.github.io/upstartr/reference/not.na.html): The opposite of the `is.na` function. -- [`not.null`](https://globeandmail.github.io/upstartr/reference/not.null.html): The opposite of the `is.null` funciton. +- [`not.na()`](https://globeandmail.github.io/upstartr/reference/not.na.html): The opposite of the [`is.na`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/NA) function. +- [`not.null()`](https://globeandmail.github.io/upstartr/reference/not.null.html): The opposite of the [`is.null`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/NULL) function. -## Tips for using `startr` +## Tips `startr` works best when you assume certain coding standards: -1. No variables should ever be overwritten or reassigned. Same goes for fields generated via `mutate()`. +1. No variables should ever be overwritten or reassigned. Same goes for fields generated via [`mutate()`](https://dplyr.tidyverse.org/reference/mutate.html). 2. If using RStudio (our preferred tool for work in R), restart and clear the environment often to make sure your code is reproducible. 3. Only ever run code sequentially to prevent order-of-execution accidents. In other words: don't jump around. For example, avoid running a block of code at line 22, then code at line 11, then some more code at line 37, since that may lead to unexpected results that another journalist won't be able to reproduce. 4. Treat raw data files (those in `data/raw`) as immutable and read-only. 
From 25c04f9f4e8d5d242204c519f12d34ea2c71f4b6 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Fri, 29 Jan 2021 17:17:51 -0500 Subject: [PATCH 13/18] fixing issue with TOC --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 044cc5e..616b0b6 100644 --- a/README.md +++ b/README.md @@ -19,18 +19,18 @@ Broadly, `startr` does a few things: * [Installation](#installation) * [Philosophy on data analysis](#philosophy-on-data-analysis) * [Workflow](#workflow) - 1. [Set up your project](step-1-set-up-your-analysis) - 2. [Import and process data](step-2-import-and-process-data) - 3. [Analyze](step-3-analyze) - 4. [Visualize](step-4-visualize) - 5. [Write a notebook](step-5-write-a-notebook) + 1. [Set up your project](#step-1-set-up-your-project) + 2. [Import and process data](#step-2-import-and-process-data) + 3. [Analyze](#step-3-analyze) + 4. [Visualize](#step-4-visualize) + 5. [Write a notebook](#step-5-write-a-notebook) * [Helper functions](#helper-functions) -* [Tips](tips) -* [Directory structure](directory-structure) -* [See also](see-also) -* [Version](version) -* [License](license) -* [Get in touch](get-in-touch) +* [Tips](#tips) +* [Directory structure](#directory-structure) +* [See also](#see-also) +* [Version](#version) +* [License](#license) +* [Get in touch](#get-in-touch) ## Installation From ee6f825e59b2f22859812226f9a71e4ba59f759d Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Fri, 5 Feb 2021 19:20:50 -0500 Subject: [PATCH 14/18] cleaning up description --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d67bbeb..365eea2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,8 +5,7 @@ Version: 1.1.0 Author: Michael Pereira and Tom Cardoso Maintainer: Michael Pereira and Tom Cardoso Description: This project structures the data analysis process around an expected set of files and steps. 
- This lowers the upfront effort of starting and maintaining a project and supports easier verification by providing reviewers with an expected and logically organized project. - Think of it like Ruby on Rails or React, but for R analysis. + This lowers the upfront effort of starting and maintaining a project and supports easier verification by providing reviewers with an expected and logically organized project. Think of it like Ruby on Rails or React, but for R analysis. License: MIT Encoding: UTF-8 LazyData: true From 7e547abf8925ea8ada473669ee235f74ae96ebc6 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Fri, 5 Feb 2021 19:21:29 -0500 Subject: [PATCH 15/18] pointing to upstartr --- config.R | 4 ++-- run.R | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/config.R b/config.R index c1889d3..c39ce85 100644 --- a/config.R +++ b/config.R @@ -13,8 +13,8 @@ initialize_startr( should_timestamp_output_files = FALSE, packages = c( 'tidyverse', 'glue', 'magrittr', 'lubridate', 'hms', - 'readxl', 'feather', 'rvest', - # 'globeandmail/tgamtheme', + 'readxl', 'feather', 'rvest' + # 'tgamtheme', # 'janitor', 'zoo', # 'tidymodels', # 'scales', 'gganimate', diff --git a/run.R b/run.R index edb1fb7..14b101f 100644 --- a/run.R +++ b/run.R @@ -1,5 +1,4 @@ -# if (!require('upstartr')) install.packages('upstartr'); library('upstartr') -devtools::install_github('globeandmail/upstartr'); library('upstartr') +if (!require('upstartr')) install.packages('upstartr'); library('upstartr') run_config() run_process() From 8bf4484fefb1783160279e7f3a5f72c5258c8837 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Thu, 11 Feb 2021 12:16:14 -0500 Subject: [PATCH 16/18] adds more readme stuff --- README.md | 71 +++++++++++++++++++++++++++++++------------------------ config.R | 2 +- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 616b0b6..1ba7fa2 100644 --- a/README.md +++ b/README.md @@ -34,14 +34,14 @@ Broadly, `startr` does a few 
things: ## Installation -This template works with R and RStudio, so you'll need both of those installed. To scaffold a new `startr` project, we recommend using our command-line tool, [`startr-cli`](https://www.github.com/globeandmail/startr-cli), which will rename some files, configure the project and initialize an empty Git repository. +This template works with R and RStudio, so you'll need both of those installed. To scaffold a new `startr` project, we recommend using our command-line tool, [`startr-cli`](https://github.com/globeandmail/startr-cli), which will copy down the folder structure, rename some files, configure the project and initialize an empty Git repository. Alternatively, you can run: ```sh git clone https://github.com/globeandmail/startr.git ``` -(But, if you do that, be sure to rename your `startr.Rproj` file to `.Rproj`.) +(But, if you do that, be sure to rename your `startr.Rproj` file to `.Rproj` and set up your settings in `config.R` manually.) Once a fresh project is ready, double-click on the `.Rproj` file to start a scoped RStudio instance. @@ -51,13 +51,27 @@ You can then start copying in your data and writing your analysis. At The Globe, TKTKTKTK +- **Your raw data is immutable**: +- **Your outputs are disposable**: +- **Never overwrite variables**: +- **Order matters**: We only ever run our R code sequentially +- **Wipe your environment often**: +- **Use the tidyverse**: For coding style, we rely on the [tidyverse style guide](https://style.tidyverse.org/). + +`startr` works best when you assume certain coding standards: +1. No variables should ever be overwritten or reassigned. Same goes for fields generated via [`mutate()`](https://dplyr.tidyverse.org/reference/mutate.html). +2. If using RStudio (our preferred tool for work in R), restart and clear the environment often to make sure your code is reproducible. +3. Only ever run code sequentially to prevent order-of-execution accidents. In other words: don't jump around. 
For example, avoid running a block of code at line 22, then code at line 11, then some more code at line 37, since that may lead to unexpected results that another journalist won't be able to reproduce. +4. Treat raw data files (those in `data/raw`) as immutable and read-only. +5. Conversely, treat all outputs (everything else, including data, plots and reports) as a disposable product. By default, this project's `.gitignore` file ignores them, so they're never checked into source management tools. + ## Workflow The heart of the project lies in these three files: -* **`process.R`**: Import your source data, tidy it, fix any errors, set types, apply upfront manipulations and save out a file ready for analysis. We recommend saving out a [`.feather`](https://github.com/wesm/feather) file, which will retain types and reads extremely quickly — but you can also use a CSV, shapefile, RDS file or something else if you'd prefer. +* **`process.R`**: Import your source data, tidy it, fix any errors, set types, apply upfront manipulations and save out a file ready for analysis. We recommend saving out a [`.feather`](https://github.com/wesm/feather) file, which will retain types and is designed to read extremely quickly — but you can also use a .CSV, shapefile, .RDS file or something else if you'd prefer. -* **`analyze.R`**: Here you'll consume the data files saved out by `process.R`. This is where all of the true "analysis" occurs, including grouping, summarizing, filtering, etc. If your analysis is complex enough, you may want to split it into additional `analyze_step_X.R` files as required. +* **`analyze.R`**: Here you'll consume the data files saved out by `process.R`. This is where all of the true "analysis" occurs, including grouping, summarizing, filtering, etc. If your analysis is complex enough, you may want to split it out into additional `analyze_step_X.R` files as required. * **`visualize.R`**: Draw and save out your graphics.
@@ -67,7 +81,7 @@ There's also an optional (but recommended) RMarkdown file (**`notebook.Rmd`**) y The bulk of any `startr` project's code lives within the `R` directory, in files that are sourced and run in sequence by the `run.R` at the project's root. -Many of the core functions for this project are managed by a specialty package, [**upstartr**](https://www.github.com/globeandmail/upstartr). That package is installed and imported in `run.R` automatically. +Many of the core functions for this project are managed by a specialty package, [**upstartr**](https://github.com/globeandmail/upstartr). That package is installed and imported in `run.R` automatically. Before starting an analysis, you'll need to set up your `config.R` file. @@ -164,7 +178,7 @@ TKTKTKTK ## Helper functions -`startr`'s companion package [`upstartr`](https://www.github.com/globeandmail/upstartr) comes with several functions to support `startr`, plus helpers we've found useful in daily data journalism tasks. A full list can be found on the [reference page here](https://globeandmail.github.io/upstartr/reference/index.html). Below is a partial list of some of its most handy functions: +`startr`'s companion package [`upstartr`](https://github.com/globeandmail/upstartr) comes with several functions to support `startr`, plus helpers we've found useful in daily data journalism tasks. A full list can be found on the [reference page here](https://globeandmail.github.io/upstartr/reference/index.html). Below is a partial list of some of its most handy functions: - [`simplify_string()`](https://globeandmail.github.io/upstartr/reference/simplify_string.html): By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. 
@@ -195,7 +209,7 @@ TKTKTKTK group_by(size, year) %>% summarise(total_deliveries = n()) %>% arrange(year) %>% - mutate(indexed_deliveries = index(total_deliveries)) + mutate(indexed_deliveries = calc_index(total_deliveries)) ``` - [`calc_mode()`](https://globeandmail.github.io/upstartr/reference/calc_mode.html): Calculate the mode for a given field: @@ -203,7 +217,7 @@ TKTKTKTK ```r pizza_deliveries %>% group_by(pizza_shop) %>% - summarise(most_common_size = mode(size)) + summarise(most_common_size = calc_mode(size)) ``` - [`write_excel()`](https://globeandmail.github.io/upstartr/reference/write_excel.html): Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `should_timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. @@ -233,8 +247,7 @@ TKTKTKTK pizza_deliveries <- read_all_excel_sheets( pizza_deliveries.file, skip = 3, - ) %>% - rename(pizza_shop = 'sheet') + ) ``` - [`combine_csvs()`](https://globeandmail.github.io/upstartr/reference/combine_csvs.html): Read all CSVs in a given directory and concatenate them into a single file. Takes all the same arguments as [`read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) @@ -274,46 +287,43 @@ TKTKTKTK ## Tips -`startr` works best when you assume certain coding standards: -1. No variables should ever be overwritten or reassigned. Same goes for fields generated via [`mutate()`](https://dplyr.tidyverse.org/reference/mutate.html). -2. If using RStudio (our preferred tool for work in R), restart and clear the environment often to make sure your code is reproducible. -3. Only ever run code sequentially to prevent order-of-execution accidents. In other words: don't jump around. 
For example, avoid running a block of code at line 22, then code at line 11, then some more code at line 37, since that may lead to unexpected results that another journalist won't be able to reproduce. -4. Treat raw data files (those in `data/raw`) as immutable and read-only. -5. Conversely, treat all outputs (everything else, including data, plots and reports) as a disposable product. By default, this project's `.gitignore` file ignores them, so they're never checked into source management tools. -6. For coding style, we rely on the [tidyverse style guide](https://style.tidyverse.org/). +- **You don't always need to process your data**: If your [processing step](#step-2-import-and-process-data) takes a while and you've already generated your processed files during a previous run, you can tell `startr` to skip this step by setting `should_process_data` to `FALSE` in `config.R`'s [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) function. Just be sure to set it back to `TRUE` if your processing code changes! +- **Consider timestamping your output files**: If you're using [`upstartr`](https://github.com/globeandmail/upstartr)'s [`write_excel()`](https://globeandmail.github.io/upstartr/reference/write_excel.html) helper, you can automatically timestamp your filenames by setting `should_timestamp_output_files` to `TRUE` in [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html). +- **Use the functions file**: Reduce repetition in your code by writing functions and putting them in the `functions.R` file, which gets `source()`'d when [`run_config()`](https://globeandmail.github.io/upstartr/reference/run_config.html) is run. +- **Help us make `startr` better**: Using this package? Find yourself wishing the structure were slightly different, or have an often-used function you're tired of copying and pasting between projects? Please [send us your feedback](#get-in-touch). 
## Directory structure -```bash +``` ├── data/ -│   ├── raw # The original data files. Treat this directory as read-only. -│   ├── cache # Cached files, mostly used when scraping or dealing with packages such as `cancensus` -│   ├── processed # Imported and tidied data used throughout the analysis. -│   └── out # Exports of data at key steps or as a final output. +│   ├── raw/ # The original data files. Treat this directory as read-only. +│   ├── cache/ # Cached files, mostly used when scraping or dealing with packages such as `cancensus` +│   ├── processed/ # Imported and tidied data used throughout the analysis. +│   └── out/ # Exports of data at key steps or as a final output. ├── R/ │   ├── process.R # Data processing including tidying, processing and manupulation. │   ├── analyze.R # The primary analysis steps. │   ├── visualize.R # Generate plots as png, pdf, etc. │   └── functions.R # Project-specific functions. -├── scrape/ -│   └── scrape.R # Scraping scripts that save collected data to the `/data/raw/` directory. ├── plots/ # Visualizations saved out plot files in standard formats. ├── reports/ # Generated reports and associated files. +├── scrape/ +│   └── scrape.R # Scraping scripts that save collected data to the `/data/raw/` directory. │   └── notebook.Rmd # Standard notebook to render reports. ├── config.R # Global project variables including packages, key project paths and data sources. ├── run.R # Wrapper file to run the analysis steps, either inline or sourced from component R files. └── startr.Rproj # Rproj file for RStudio ``` -An `.nvmrc` is included at the project root for scraping with Node. A `venv` and `requirements.txt` file should be included within the scraper directory if Python is used for scraping. +An `.nvmrc` is included at the project root for Node.js-based scraping. If you prefer to scrape with Python, be sure to add `venv` and `requirements.txt` files, or a `Gemfile` if working in Ruby. 
## See also `startr` is part of a small ecosystem of R utilities. Those include: -- [**upstartr**](https://www.github.com/globeandmail/upstartr), a library of functions that support `startr` and daily data journalism tasks -- [**tgamtheme**](https://www.github.com/globeandmail/tgamtheme), The Globe and Mail's graphics theme -- [**startr-cli**](https://www.github.com/globeandmail/startr-cli), a command-line tool that scaffolds new `startr` projects +- [**upstartr**](https://github.com/globeandmail/upstartr), a library of functions that support `startr` and daily data journalism tasks +- [**tgamtheme**](https://github.com/globeandmail/tgamtheme), The Globe and Mail's graphics theme +- [**startr-cli**](https://github.com/globeandmail/startr-cli), a command-line tool that scaffolds new `startr` projects ## Version @@ -327,6 +337,5 @@ startr © 2020 The Globe and Mail. It is free software, and may be redistributed If you've got any questions, feel free to send us an email, or give us a shout on Twitter: -[![Michael Pereira](https://avatars0.githubusercontent.com/u/212666?v=3&s=200)](https://github.com/monkeycycle)| [![Tom Cardoso](https://avatars0.githubusercontent.com/u/2408118?v=3&s=200)](https://github.com/tomcardoso) ----|--- -[Michael Pereira](mailto:mpereira@globeandmail.com)
[@__m_pereira](https://www.twitter.com/__m_pereira) | [Tom Cardoso](mailto:tcardoso@globeandmail.com)
[@tom_cardoso](https://www.twitter.com/tom_cardoso) +[![Tom Cardoso](https://avatars0.githubusercontent.com/u/2408118?v=3&s=65)](https://github.com/tomcardoso) +[Tom Cardoso](mailto:tcardoso@globeandmail.com)
[@tom_cardoso](https://www.twitter.com/tom_cardoso) diff --git a/config.R b/config.R index c39ce85..c944e95 100644 --- a/config.R +++ b/config.R @@ -6,7 +6,7 @@ # This initializes your startr project initialize_startr( title = 'startr', - author = 'Firstname Lastname ', + author = 'Firstname Lastname ', timezone = 'America/Toronto', should_render_notebook = FALSE, should_process_data = TRUE, From e92a02e7aad4a5696f8caebae674c7e2ca2aa9c2 Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Wed, 17 Feb 2021 14:07:48 -0500 Subject: [PATCH 17/18] updates readme --- README.md | 39 ++++++++++++++++++--------------------- config.R | 3 ++- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 1ba7fa2..c7f10a5 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,13 @@ This project structures the data analysis process, reducing the amount of time y Broadly, `startr` does a few things: -* **Standardizes your projects**: Eliminates the need to think about project structure so you can focus on the analysis -* **Breaks analysis into discrete steps**: Supports a flexible analysis workflow with clearly-defined steps which can be shared easily across a team -* **Bakes in flexibility**: Has a format that works for both large (multi-month) and small (single-day) projects -* **De-clutters your code**: Improves the painstaking data verification/fact-checking process by cutting down on spaghetti code -* **Improves communication**: Documents the analysis steps and questions to be answered for large, multi-disciplinary teams (say, developers, data journalists and traditional reporters) -* **Simplifies the generation of charts and reports**: Generates easily updatable RMarkdown reports, Adobe Illustrator-ready graphics, and datasets during analysis +* **Standardizes your projects**: Eliminates the need to think about project structure so you can focus on the analysis. 
+* **Breaks analysis into discrete steps**: Supports a flexible analysis workflow with clearly-defined steps which can be shared easily across a team. +* **Helps you catch mistakes**: With structure and workflow baked in, you can focus on writing analysis code, reducing the opportunities for mistakes. +* **Bakes in flexibility**: Has a format that works for both large (multi-month) and small (single-day) projects. +* **De-clutters your code**: Improves the painstaking data verification/fact-checking process by cutting down on spaghetti code. +* **Improves communication**: Documents the analysis steps and questions to be answered for large, multi-disciplinary teams (say, developers, data journalists and traditional reporters). +* **Simplifies the generation of charts and reports**: Generates easily updatable RMarkdown reports, Adobe Illustrator-ready graphics, and datasets during analysis. ## Table of contents * [startr](#startr) @@ -49,22 +50,17 @@ You can then start copying in your data and writing your analysis. At The Globe, ## Philosophy on data analysis -TKTKTKTK +This analysis framework is designed to be flexible, reproducible and easy to jump into for a new user. `startr` works best when you assume certain coding standards: -- **Your raw data is immutable**: -- **Your outputs are disposable**: -- **Never overwrite variables**: -- **Order matters**: We only ever run our R code sequentially -- **Wipe your environment often**: +- **Your raw data is immutable**: Treat the files in `data/raw` as read-only. This means you only ever alter them programmatically, and never edit or overwrite files in that folder. If you need to manually rewrite certain columns in a raw data file, do so by creating a new spreadsheet with the new values, then join it to the original data file during the [processing step](#step-2-import-and-process-data). 
+- **Your outputs are disposable**: Treat all project outputs (everything in `data/processed`, `data/out/`, `data/cache` and `plots/`) as disposable products. By default, this project's `.gitignore` file ignores those files, so they're never checked into source management tools. Unless absolutely necessary, do not alter `.gitignore` to check in those files — the analysis pipeline should be able to reproduce them all from your raw data files. +- **Shorter is not always better**: Your code should, as much as possible, be self-documenting. Keep it clean and as simple as possible. If an analysis chain is becoming particularly long or complex, break it out into smaller chunks, or consider writing a function to abstract out the complexity in your code. +- **Only optimize your code for performance when necessary**: It's easy to fall into a premature optimization rabbit hole, especially on larger or more complex projects. In most cases, there's no need to optimize your code for performance — only do this if your analysis process is taking several minutes or longer. +- **Never overwrite variables**: No variables should ever be overwritten or reassigned. Same goes for fields generated via [`mutate()`](https://dplyr.tidyverse.org/reference/mutate.html). +- **Order matters**: We only ever run our R code sequentially, which prevents reproducibility issues resulting from users running code chunks in different orders. For instance, do not run a block of code at line 22, then code at line 11, then some more code at line 37, since that may lead to unexpected results that another journalist won't be able to reproduce. +- **Wipe your environment often**: If using RStudio (our preferred tool for work in R), restart and clear the environment often to make sure your code is reproducible. - **Use the tidyverse**: For coding style, we rely on the [tidyverse style guide](https://style.tidyverse.org/). -`startr` works best when you assume certain coding standards: -1. 
No variables should ever be overwritten or reassigned. Same goes for fields generated via [`mutate()`](https://dplyr.tidyverse.org/reference/mutate.html). -2. If using RStudio (our preferred tool for work in R), restart and clear the environment often to make sure your code is reproducible. -3. Only ever run code sequentially to prevent order-of-execution accidents. In other words: don't jump around. For example, avoid running a block of code at line 22, then code at line 11, then some more code at line 37, since that may lead to unexpected results that another journalist won't be able to reproduce. -4. Treat raw data files (those in `data/raw`) as immutable and read-only. -5. Conversely, treat all outputs (everything else, including data, plots and reports) as a disposable product. By default, this project's `.gitignore` file ignores them, so they're never checked into source management tools. - ## Workflow The heart of the project lies in these three files: @@ -337,5 +333,6 @@ startr © 2020 The Globe and Mail. It is free software, and may be redistributed If you've got any questions, feel free to send us an email, or give us a shout on Twitter: -[![Tom Cardoso](https://avatars0.githubusercontent.com/u/2408118?v=3&s=65)](https://github.com/tomcardoso) -[Tom Cardoso](mailto:tcardoso@globeandmail.com)
[@tom_cardoso](https://www.twitter.com/tom_cardoso) +[![Tom Cardoso](https://avatars0.githubusercontent.com/u/2408118?v=3&s=65)](https://github.com/tomcardoso) | [![Michael Pereira](https://avatars0.githubusercontent.com/u/212666?v=3&s=65)](https://github.com/monkeycycle) +---|--- +[Tom Cardoso](mailto:tcardoso@globeandmail.com)
[@tom_cardoso](https://www.twitter.com/tom_cardoso) | [Michael Pereira](mailto:hello@monkeycycle.org)
[@__m_pereira](https://www.twitter.com/__m_pereira) diff --git a/config.R b/config.R index c944e95..0dc602f 100644 --- a/config.R +++ b/config.R @@ -13,7 +13,8 @@ initialize_startr( should_timestamp_output_files = FALSE, packages = c( 'tidyverse', 'glue', 'magrittr', 'lubridate', 'hms', - 'readxl', 'feather', 'rvest' + 'readxl', 'feather' + # 'rvest' # 'tgamtheme', # 'janitor', 'zoo', # 'tidymodels', From 6a4732ee1a87e4eed304c96719c156622888528e Mon Sep 17 00:00:00 2001 From: Tom Cardoso Date: Wed, 17 Feb 2021 14:19:32 -0500 Subject: [PATCH 18/18] updates readme and config --- README.md | 4 ++++ config.R | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c7f10a5..a7d525a 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,10 @@ Broadly, `startr` does a few things: This template works with R and RStudio, so you'll need both of those installed. To scaffold a new `startr` project, we recommend using our command-line tool, [`startr-cli`](https://github.com/globeandmail/startr-cli), which will copy down the folder structure, rename some files, configure the project and initialize an empty Git repository. +Using [`startr-cli`](https://github.com/globeandmail/startr-cli), you can scaffold a new project by simply running `create-startr` in your terminal and following the prompts: + +![startr-cli interface GIF](http://i.imgur.com/4qtiJar.gif) + Alternatively, you can run: ```sh git clone https://github.com/globeandmail/startr.git diff --git a/config.R b/config.R index 0dc602f..98d99e5 100644 --- a/config.R +++ b/config.R @@ -24,5 +24,8 @@ initialize_startr( ) ) -# Refer to your source data filenames here -sample.raw.file <- dir_data_raw('your-filename-here.csv') +# Refer to your source data filenames here. These can be either references +# to files in your `data/raw` folder, or paths to files hosted on the web. 
+# For example: +# sample.raw.file <- dir_data_raw('your-filename-here.csv') +# sample.raw.path <- 'https://github.com/tidyverse/dplyr/raw/master/data-raw/starwars.csv'