diff --git a/DESCRIPTION b/DESCRIPTION index 27172d7..365eea2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,11 @@ Package: startR Type: Package Title: A Template For Data Journalism Projects In R -Version: 1.0.1 +Version: 1.1.0 Author: Michael Pereira and Tom Cardoso Maintainer: Michael Pereira and Tom Cardoso Description: This project structures the data analysis process around an expected set of files and steps. - This lowers the upfront effort of starting and maintaining a project and supports easier verification by providing reviewers with an expected and logically organized project. - Think of it like Ruby on Rails or React, but for R analysis. + This lowers the upfront effort of starting and maintaining a project and supports easier verification by providing reviewers with an expected and logically organized project. Think of it like Ruby on Rails or React, but for R analysis. License: MIT Encoding: UTF-8 LazyData: true diff --git a/R/analyze.R b/R/analyze.R index 7a4f070..38f2ca3 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -1,12 +1,9 @@ -############################################################ -# This file handles the primary analysis using the tidied -# data as input. Should never read from `dir_data_raw`, -# only `dir_data_processed`. -# -# -############################################################ +# ======================================================================= +# This file handles the primary analysis using the tidied data as input. +# Should never read from `dir_data_raw()`, only `dir_data_processed()`. 
+# ======================================================================= -# sample <- read_feather(here::here(dir_data_processed, 'sample.feather')) %>% +# sample <- read_feather(dir_data_processed('sample.feather')) %>% # group_by(cma) %>% # arrange(desc(date)) %>% # mutate(sale_avg_3mo = rollmean(sale_avg, k = 3, fill = NA)) %>% diff --git a/R/functions.R b/R/functions.R index fb8ebd1..ac1ca70 100644 --- a/R/functions.R +++ b/R/functions.R @@ -1,5 +1,3 @@ -############################################################ +# ======================================================================= # Project-specific functions. -# -# -############################################################ +# ======================================================================= diff --git a/R/process.R b/R/process.R index 3a3aee1..ad0323e 100644 --- a/R/process.R +++ b/R/process.R @@ -1,14 +1,10 @@ -############################################################ -# This file is used to read in raw data, tidy, clean it, -# and save a file to src *before* proceeding to analysis. -# If you're using `mutate()` for any actual analysis you're -# doing it wrong. -# -# Specify column types as required. -# -############################################################ - -# begin_processing() +# ======================================================================= +# Read raw data, clean it and save it out to `dir_data_processed()` here +# before moving to analysis. If run from `run.R`, all variables generated +# in this file will be wiped after completion to keep the environment +# clean. 
If your process step is complex, you can break it into several +# files like so: `source(dir_src('process_files', 'process_step_1.R'))` +# ======================================================================= # sample.raw <- read_csv(sample.raw.file) %>% # rename( @@ -23,6 +19,4 @@ # ) %>% # arrange(cma, desc(date)) # -# write_feather(sample.raw, here::here(dir_data_processed, 'sample.feather')) -# -# end_processing() +# write_feather(sample.raw, dir_data_processed('sample.feather')) diff --git a/R/utils.R b/R/utils.R deleted file mode 100644 index c64f0e0..0000000 --- a/R/utils.R +++ /dev/null @@ -1,208 +0,0 @@ -# Load required packages -load_requirements <- function(pkg){ - new.pkg <- pkg[!(pkg %in% installed.packages()[, 'Package'])] - if (length(new.pkg)) - install.packages(new.pkg, dependencies = TRUE) - sapply(pkg, require, character.only = TRUE) -} - -# Run the gauntlet of basic exploratory data analysis on your data -run_basic_eda <- function(data){ - glimpse(data) - df_status(data) - freq(data) - profiling_num(data) - plot_num(data) - describe(data) -} - -read_all_excel_sheets <- function( - filepath, - range = NULL, - col_types = NULL, - col_names = TRUE, - na = '', - trim_ws = TRUE, - skip = 0, - n_max = Inf, - guess_max = min(1000, n_max), - .name_repair = 'unique' - ) { - filepath %>% - excel_sheets() %>% - set_names() %>% - map_df(~ read_excel( - path = filepath, - skip = skip, - range = range, - na = na, - trim_ws = trim_ws, - guess_max = guess_max, - col_names = col_names, - col_types = col_types, - n_max = n_max, - sheet = .x, - .name_repair = .name_repair - ), .id = 'sheet') -} - -# geocoding function using OSM Nominatim API -# details: http://wiki.openstreetmap.org/wiki/Nominatim -# made by: D.Kisler -nominatim_osm <- function(address = NULL) { - if(suppressWarnings(is.null(address))) - return(data.frame()) - tryCatch( - d <- jsonlite::fromJSON( - gsub('\\@addr\\@', gsub('\\s+', '\\%20', address), - 
'http://nominatim.openstreetmap.org/search/@addr@?format=json&addressdetails=0&limit=1') - ), error = function(c) return(data.frame()) - ) - if(length(d) == 0) return(data.frame()) - return(data.frame(lon = as.numeric(d$lon), lat = as.numeric(d$lat))) -} - -render_notebook <- function(notebook_file) { - rmarkdown::render( - notebook_file, - output_dir = dir_reports, - encoding = 'utf-8' - ) -} - -index <- function(m) { - (m - first(m)) / first(m) -} - -mode <- function(x) { - ux <- unique(x) - ux[which.max(tabulate(match(x, ux)))] -} - -unaccent <- function(x) { - iconv(x, to = 'ASCII//TRANSLIT') -} - -simplify_string <- function(x, alpha = TRUE, digits = FALSE) { - re <- '^\\s' - - if (alpha) re <- paste(re, 'a-zA-Z', sep = '') - if (digits) re <- paste(re, '0-9', sep = '') - - # TODO: add corporate stop words like INC, LTD, CORP? - - x %>% - unaccent(.) %>% - str_replace_all(., paste('[', re, ']', sep = ''), '') %>% - str_replace_all(., '[\\s]+', ' ') %>% - toupper(.) %>% - trimws(.) -} - -clean_columns <- function(x) { - cols <- x %>% - unaccent(.) %>% - str_replace_all(., '[\\s]+', '_') %>% - str_replace_all(., '[_]+', '_') %>% - str_replace_all(., '[^_a-zA-Z]', '') %>% - tolower(.) %>% - trimws(.) - - for (i in 1:length(cols)) { - if (!as.logical(str_count(cols[i]))) { - cols[i] <- glue('column_{i}') - } - if (any(cols[1:i - 1] == cols[i])) { - cols[i] <- glue('{cols[i]}_{i}') - } - } - - return(cols) -} - -convert_str_to_logical <- function(x, truthy = 'T|TRUE', falsy = 'F|FALSE') { - x %>% - toupper(.) %>% - trimws(.) %>% - str_replace_all(., truthy, 'TRUE') %>% - str_replace_all(., falsy, 'FALSE') %>% - as.logical(.) 
-} - -write_excel <- function(variable, timestamp = timestamp_output_files) { - filename <- deparse(substitute(variable)) - if (timestamp) { - now <- Sys.time() - filename <- glue('{filename}_{format(now, "%Y%m%d%H%M%S")}') - } - write.xlsx(variable, file = here::here(dir_data_out, glue('{filename}.xlsx'))) -} - -begin_processing <- function() { - if (clean_processing_variables) { - assign('curr_env', ls(.GlobalEnv), envir = .GlobalEnv) - } -} - -end_processing <- function() { - if (clean_processing_variables) { - ls(.GlobalEnv) %>% - setdiff(., curr_env) %>% - as.character() %>% - rm(list = ., envir = .GlobalEnv) - } - beep() -} - -write_plot <- function(variable, filename = NA, width = NA, height = NA, format = NA, units = NA, dpi = NA, limitsize = NA) { - default_format <- 'png' - default_units <- 'in' - default_dpi <- 300 - default_filename <- deparse(substitute(variable)) - default_limitsize <- TRUE - - if(!is.na(format)) default_format <- format - if(!is.na(units)) default_units <- units - if(!is.na(dpi)) default_dpi <- dpi - if(!is.na(filename)) default_filename <- filename - if(!is.na(limitsize)) default_limitsize <- limitsize - - args <- list( - plot = variable, - file = here::here(dir_plots, glue('{default_filename}.{default_format}')), - units = default_units, - dpi = default_dpi, - width = width, - height = height, - limitsize = default_limitsize - ) - - if (default_format == 'pdf') args[['useDingbats']] <- FALSE - - do.call(ggsave, args) -} - -write_shp <- function(shp, path) { - if (file.exists(path)) { - file.remove(path) - } - - st_write(shp, path, update = TRUE) -} - -# FROM https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R - -reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) { - new_x <- paste(x, within, sep = sep) - stats::reorder(new_x, by, FUN = fun) -} - -scale_x_reordered <- function(..., sep = "___") { - reg <- paste0(sep, ".+$") - ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...) 
-} - -scale_y_reordered <- function(..., sep = "___") { - reg <- paste0(sep, ".+$") - ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...) -} diff --git a/R/visualize.R b/R/visualize.R index 25978af..6278082 100644 --- a/R/visualize.R +++ b/R/visualize.R @@ -1,15 +1,11 @@ -############################################################ -# Charts, maps, etc. from your data -# -# Use the `write_plot` function to write the plot directly -# to the `plots/` folder, using the variable name as -# the filename. -# -############################################################ +# ======================================================================= +# Graphics. Use the `write_plot` function to write the plot directly +# to the `plots/` folder, using the variable name as the filename. +# ======================================================================= -# plot_house_price_change <- ggplot(sample %>% -# filter(cma != 'C11'), -# aes(x = reorder(cma, yoy), y = yoy)) + +# plot_house_price_change <- sample %>% +# filter(cma != 'C11') %>% +# ggplot(aes(x = reorder(cma, yoy), y = yoy)) + # geom_bar(colour = 'white', stat = 'identity') + # scale_y_continuous(expand = c(0, 0), limits = c(0, 25)) + # coord_flip() + @@ -21,6 +17,6 @@ # ) + # theme_classic() # -# plot(plot_house_price_change) +# plot_house_price_change # # write_plot(plot_house_price_change) diff --git a/README.md b/README.md index a5ad83c..a7d525a 100644 --- a/README.md +++ b/README.md @@ -6,54 +6,98 @@ This project structures the data analysis process, reducing the amount of time y Broadly, `startr` does a few things: -* **Standardizes your projects**: Eliminates the need to think about project structure so you can focus on the analysis -* **Breaks analysis into discrete steps**: Supports a flexible analysis workflow with clearly-defined steps which can be shared easily across a team -* **Bakes in flexibility**: Has a format that works for both large (multi-month) and small (single-day) projects 
-* **De-clutters your code**: Improves the painstaking data verification/fact-checking process by cutting down on spaghetti code -* **Improves communication**: Documents the analysis steps and questions to be answered for large, multi-disciplinary teams (say, developers, data journalists and traditional reporters) -* **Simplifies the generation of charts and reports**: Generates easily updatable RMarkdown reports, Adobe Illustrator-ready graphics, and datasets during analysis +* **Standardizes your projects**: Eliminates the need to think about project structure so you can focus on the analysis. +* **Breaks analysis into discrete steps**: Supports a flexible analysis workflow with clearly-defined steps which can be shared easily across a team. +* **Helps you catch mistakes**: With structure and workflow baked in, you can focus on writing analysis code, reducing the opportunities for mistakes. +* **Bakes in flexibility**: Has a format that works for both large (multi-month) and small (single-day) projects. +* **De-clutters your code**: Improves the painstaking data verification/fact-checking process by cutting down on spaghetti code. +* **Improves communication**: Documents the analysis steps and questions to be answered for large, multi-disciplinary teams (say, developers, data journalists and traditional reporters). +* **Simplifies the generation of charts and reports**: Generates easily updatable RMarkdown reports, Adobe Illustrator-ready graphics, and datasets during analysis. + +## Table of contents +* [startr](#startr) +* [Table of contents](#table-of-contents) +* [Installation](#installation) +* [Philosophy on data analysis](#philosophy-on-data-analysis) +* [Workflow](#workflow) + 1. [Set up your project](#step-1-set-up-your-project) + 2. [Import and process data](#step-2-import-and-process-data) + 3. [Analyze](#step-3-analyze) + 4. [Visualize](#step-4-visualize) + 5. 
[Write a notebook](#step-5-write-a-notebook) +* [Helper functions](#helper-functions) +* [Tips](#tips) +* [Directory structure](#directory-structure) +* [See also](#see-also) +* [Version](#version) +* [License](#license) +* [Get in touch](#get-in-touch) + +## Installation + +This template works with R and RStudio, so you'll need both of those installed. To scaffold a new `startr` project, we recommend using our command-line tool, [`startr-cli`](https://github.com/globeandmail/startr-cli), which will copy down the folder structure, rename some files, configure the project and initialize an empty Git repository. + +Using [`startr-cli`](https://github.com/globeandmail/startr-cli), you can scaffold a new project by simply running `create-startr` in your terminal and following the prompts: + +![startr-cli interface GIF](http://i.imgur.com/4qtiJar.gif) + +Alternatively, you can run: +```sh +git clone https://github.com/globeandmail/startr.git +``` +(But, if you do that, be sure to rename your `startr.Rproj` file to `<your-project-name>.Rproj` and set up your settings in `config.R` manually.) -## How do I use this? +Once a fresh project is ready, double-click on the `.Rproj` file to start a scoped RStudio instance. -This template works with R and RStudio, so you'll need both of those installed. Then, just clone down this project, or, better yet, use our scaffolding tool, [`startr-cli`](https://www.github.com/globeandmail/startr-cli). +You can then start copying in your data and writing your analysis. At The Globe, we like to work in a code editor like Atom or Sublime Text, and use something like [`r-exec`](https://atom.io/packages/r-exec) to send code chunks to RStudio. -Once the project's cloned, double-click on the `.Rproj` file to start a scoped RStudio instance. +## Philosophy on data analysis -You can then start adding your data and writing your analysis.
At The Globe, we like to work in a code editor like Atom or Sublime Text, and use something like [`r-exec`](https://atom.io/packages/r-exec) to send code chunks to RStudio. +This analysis framework is designed to be flexible, reproducible and easy to jump into for a new user. `startr` works best when you assume certain coding standards: +- **Your raw data is immutable**: Treat the files in `data/raw` as read-only. This means you only ever alter them programmatically, and never edit or overwrite files in that folder. If you need to manually rewrite certain columns in a raw data file, do so by creating a new spreadsheet with the new values, then join it to the original data file during the [processing step](#step-2-import-and-process-data). +- **Your outputs are disposable**: Treat all project outputs (everything in `data/processed`, `data/out/`, `data/cache` and `plots/`) as disposable products. By default, this project's `.gitignore` file ignores those files, so they're never checked into source management tools. Unless absolutely necessary, do not alter `.gitignore` to check in those files — the analysis pipeline should be able to reproduce them all from your raw data files. +- **Shorter is not always better**: Your code should, as much as possible, be self-documenting. Keep it clean and as simple as possible. If an analysis chain is becoming particularly long or complex, break it out into smaller chunks, or consider writing a function to abstract out the complexity in your code. +- **Only optimize your code for performance when necessary**: It's easy to fall into a premature optimization rabbit hole, especially on larger or more complex projects. In most cases, there's no need to optimize your code for performance — only do this if your analysis process is taking several minutes or longer. +- **Never overwrite variables**: No variables should ever be overwritten or reassigned. 
Same goes for fields generated via [`mutate()`](https://dplyr.tidyverse.org/reference/mutate.html). +- **Order matters**: We only ever run our R code sequentially, which prevents reproducibility issues resulting from users running code chunks in different orders. For instance, do not run a block of code at line 22, then code at line 11, then some more code at line 37, since that may lead to unexpected results that another journalist won't be able to reproduce. +- **Wipe your environment often**: If using RStudio (our preferred tool for work in R), restart and clear the environment often to make sure your code is reproducible. +- **Use the tidyverse**: For coding style, we rely on the [tidyverse style guide](https://style.tidyverse.org/). -## Example workflow using `startr` +## Workflow -Here's how we use `startr` for our own analysis workflow right now. The heart of the project lies in these three files: +The heart of the project lies in these three files: -* **`process.R`**: Imports source data, tidies it, fixes errors, sets types, applies manipulations and saves out a Feather file ready for analysis (or, in other cases, a CSV, a shapefile, etc.). +* **`process.R`**: Import your source data, tidy it, fix any errors, set types, apply upfront manipulations and save out a file ready for analysis. We recommend saving out a [`.feather`](https://github.com/wesm/feather) file, which will retain types and is designed to read extremely quickly — but you can also use a .CSV, shapefile, .RDS file or something else if you'd prefer. -* **`analyze.R`**: Consumes the data files saved out by `process.R`, and is where all of the true "analysis" occurs, including grouping, summarizing, filtering, etc. All descriptive and relational statistical analysis. More complicated analysis can be split into additional `analyze_somestep.R` files as required. +* **`analyze.R`**: Here you'll consume the data files saved out by `process.R`.
This is where all of the true "analysis" occurs, including grouping, summarizing, filtering, etc. If your analysis is complex enough, you may want to split it out into additional `analyze_step_X.R` files as required. -* **`visualize.R`**: Generates plots. +* **`visualize.R`**: Draw and save out your graphics. -There's also an optional (but recommended) RMarkdown file (**`notebook.Rmd`**) you can use to generate a report – especially useful for longer-term projects where you need to document the questions you're asking. +There's also an optional (but recommended) RMarkdown file (**`notebook.Rmd`**) you can use to generate an HTML codebook – especially useful for longer-term projects where you need to document the questions you're asking. #### Step 1: Set up your project -Packages are managed through the `packages` list in the `config.R` file. `devtools` and `here` are loaded by default. The `load_requirements()` function loads, and optionally installs, required packages. +The bulk of any `startr` project's code lives within the `R` directory, in files that are sourced and run in sequence by the `run.R` at the project's root. + +Many of the core functions for this project are managed by a specialty package, [**upstartr**](https://github.com/globeandmail/upstartr). That package is installed and imported in `run.R` automatically. -The bulk of the analysis is based on a set of files within the `R` directory which are sourced and run in order by `run.R` at the project root. +Before starting an analysis, you'll need to set up your `config.R` file. -Before starting an analysis, you'll want to point to your data files in `config.R` and make sure it's loading all the packages you'll need. For instance, you might want to add the [`cancensus`](https://github.com/mountainMath/cancensus) package. To do that, just add `'cancensus'` to the `packages` vector. Package suggestions for GIS work, scraping, dataset summaries, etc. are included and commented out to avoid bloat. 
+That file uses the [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) function to prepare the environment for analysis. It will also load all the packages you'll need. For instance, you might want to add the [`cancensus`](https://github.com/mountainMath/cancensus) library. To do that, just add `'cancensus'` to the `packages` vector. Package suggestions for GIS work, scraping, dataset summaries, etc. are included in commented-out form to avoid bloat. The function also takes several other optional parameters — for a full list, see our [documentation](https://globeandmail.github.io/upstartr/reference/initialize_startr.html). -Once that's done, you'll want to reference your raw data filenames. For instance, if you're adding pizza delivery data, you'd add this line to the "Files" block in `config.R`: +Once you've listed the packages you want to import, you'll want to reference your raw data filenames so that you can read them in during `process.R`. For instance, if you're adding pizza delivery data, you'd add this line to the filenames block in `config.R`: ```R -pizza.raw.file <- here::here(dir_data_raw, 'Citywide Pizza Deliveries 1998-2016.xlsx') +pizza.raw.file <- dir_data_raw('Citywide Pizza Deliveries 1998-2016.xlsx') ``` Our naming convention is to append `.raw` to variables that reference raw data, and `.file` to variables that are just filename strings. -#### Step 2: Import and process your data +#### Step 2: Import and process data -In `process.R`, you'll consume the variables you created in `config.R`, clean them up, rename variables, deal with any errors, convert multiple data files to a common structure if necessary, and save out the result, plus some cleanup at the end so as to not pollute the environment. 
It might look something like this: +In `process.R`, you'll read in the data for the filename variables you assigned in `config.R`, do some clean-up, rename variables, deal with any errors, convert multiple files to a common data structure if necessary, and finally save out the result. It might look something like this: ```R pizza.raw <- read_excel(pizza.raw.file, skip = 2) %>% @@ -76,19 +120,21 @@ pizza.raw <- read_excel(pizza.raw.file, skip = 2) %>% ) %>% filter(!is.na(date)) -write_feather(pizza.raw, here::here(dir_data_processed, 'pizza.feather')) +write_feather(pizza.raw, dir_data_processed('pizza.feather')) ``` -We prefer to write out the output as a `.feather` file, which is a binary format designed to read and write files extremely fast (at roughly 600 MB/s). Feather files can also be opened in other analysis frameworks (i.e. Jupyter Notebooks) and, most importantly, embed the column types so that you don't have to re-assert them later. If you'd rather save out files in a different format, you can just use a different function, like the Tidyverse's `write_csv`. +When called via the [`run_process()`](https://globeandmail.github.io/upstartr/reference/run_process.html) function in `run.R`, variables generated during processing will be removed once the step is completed to keep the working environment clean for analysis. -Output files written to `dir_data_processed` (that is, `/data/processed`) aren't checked into Git by design — you should be able to reproduce the analysis-ready files from someone else's project by running `process.R`. +We prefer to write out our processed files using the binary [`.feather`](https://github.com/wesm/feather) format, which is designed to read and write files extremely quickly (at roughly 600 MB/s). Feather files can also be opened in other analysis frameworks (e.g. Jupyter Notebooks) and, most importantly, embed column types into the data so that you don't have to re-declare a column as logicals, dates or characters later on.
If you'd rather save out files in a different format, you can just use a different function, like the tidyverse's [`write_csv()`](https://readr.tidyverse.org/reference/write_delim.html). -#### Step 2: Do your analysis +Output files are written to `/data/processed` using the [`dir_data_processed()`](https://globeandmail.github.io/upstartr/reference/dir-data_processed.html) function. By design, processed files aren't checked into Git — you should be able to reproduce the analysis-ready files from someone else's project by running `process.R`. + +#### Step 3: Analyze This part's as simple as consuming that file in `analyze.R` and running with it. It might look something like this: ```R -pizza <- read_feather(here::here(dir_data_processed, 'pizza.feather')) +pizza <- read_feather(dir_data_processed('pizza.feather')) delivery_person_counts <- pizza %>% group_by(person) %>% @@ -103,82 +149,78 @@ deliveries_monthly <- pizza %>% ) ``` -#### Step 3: Visualize your analysis +#### Step 4: Visualize You can use `visualize.R` to consume the variables created in `analyze.R`. For instance: ```R -plot_delivery_persons <- ggplot(delivery_person_counts, aes(x = person, y = n)) + +plot_delivery_persons <- delivery_person_counts %>% + ggplot(aes(x = person, y = n)) + geom_col() + coord_flip() -plot(plot_delivery_persons) +plot_delivery_persons + +write_plot(plot_delivery_persons) -plot_deliveries_monthly <- ggplot(deliveries_monthly, aes(x = year_month, y = n)) + +plot_deliveries_monthly <- deliveries_monthly %>% + ggplot(aes(x = year_month, y = n)) + geom_col() -plot(plot_deliveries_monthly) +plot_deliveries_monthly write_plot(plot_deliveries_monthly) ``` -## Helper functions +#### Step 5: Write a notebook -This template comes with several pre-made helper functions that we've found useful in daily data journalism tasks. +TKTKTKTK -- `read_all_excel_sheets`: Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. 
Takes all the same arguments as `readxl`'s `read_excel`. +## Helper functions - ```r - pizza_deliveries <- read_all_excel_sheets( - pizza_deliveries.file, - skip = 3, - ) %>% - rename(pizza_shop = 'sheet') - ``` +`startr`'s companion package [`upstartr`](https://github.com/globeandmail/upstartr) comes with several functions to support `startr`, plus helpers we've found useful in daily data journalism tasks. A full list can be found on the [reference page here](https://globeandmail.github.io/upstartr/reference/index.html). Below is a partial list of some of its most handy functions: -- `simplify_string`: By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/mutli-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. +- [`simplify_string()`](https://globeandmail.github.io/upstartr/reference/simplify_string.html): By default, takes strings and simplifies them by force-uppercasing, replacing accents with non-accented characters, removing every non-alphanumeric character, and simplifying double/multi-spaces into single spaces. Very useful when dealing with messy human-entry data with people's names, corporations, etc. ```r pizza_deliveries %>% mutate(customer_simplified = simplify_string(customer_name)) - ``` -- `index`: Calculate percentage growth by indexing values to the first value: +- [`clean_columns()`](https://globeandmail.github.io/upstartr/reference/clean_columns.html): Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns.
If there are many columns with the same name during cleanup, they'll be appended with an index number. ```r pizza_deliveries %>% - mutate(year = year(date)) %>% - group_by(size, year) %>% - summarise(total_deliveries = n()) %>% - arrange(year) %>% - mutate(indexed_deliveries = index(total_deliveries)) + rename_all(clean_columns) ``` -- `mode`: Calculate the mode for a given field: - +- [`convert_str_to_logical()`](https://globeandmail.github.io/upstartr/reference/convert_str_to_logical.html): Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. ```r pizza_deliveries %>% - group_by(pizza_shop) %>% - summarise(most_common_size = mode(size)) + mutate(was_delivered_logi = convert_str_to_logical(was_delivered)) ``` -- `clean_columns`: Renaming columns to something that doesn't have to be referenced with backticks (`` `Column Name!` ``) or square brackets (`.[['Column Name!']]`) gets tedious. This function speeds up the process by forcing everything to lowercase and using underscores – the tidyverse's preferred naming convention for columns. If there are many columns with the same name during cleanup, they'll be appended with an index number. +- [`calc_index()`](https://globeandmail.github.io/upstartr/reference/calc_index.html): Calculate percentage growth by indexing values to the first value: ```r pizza_deliveries %>% - rename_all(clean_columns) + mutate(year = year(date)) %>% + group_by(size, year) %>% + summarise(total_deliveries = n()) %>% + arrange(year) %>% + mutate(indexed_deliveries = calc_index(total_deliveries)) ``` -- `convert_str_to_logical`: Does the work of cleaning up your True, TRUE, true, T, False, FALSE, false, F, etc. strings to logicals. 
+- [`calc_mode()`](https://globeandmail.github.io/upstartr/reference/calc_mode.html): Calculate the mode for a given field: ```r pizza_deliveries %>% - mutate(was_delivered_logi = convert_str_to_logical(was_delivered)) + group_by(pizza_shop) %>% + summarise(most_common_size = calc_mode(size)) ``` -- `write_excel`: Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. +- [`write_excel()`](https://globeandmail.github.io/upstartr/reference/write_excel.html): Writes out an Excel file to `data/out` using the variable name as the file name. Useful for quickly generating summary tables for sharing with others. By design, doesn't take any arguments to keep things as simple as possible. If `should_timestamp_output_files` is set to TRUE in `config.R`, will append a timestamp to the filename in the format `%Y%m%d%H%M%S`. ```r undelivered_pizzas <- pizza_deliveries %>% @@ -187,7 +229,7 @@ This template comes with several pre-made helper functions that we've found usef write_excel(undelivered_pizzas) ``` -- `write_plot`: Similar to `write_excel`, designed to quickly save out a plot directly to `/plots`. Takes all the same arguments as `ggsave`. +- [`write_plot()`](https://globeandmail.github.io/upstartr/reference/write_plot.html): Similar to [`write_excel()`](https://globeandmail.github.io/upstartr/reference/write_excel.html), designed to quickly save out a plot directly to `/plots`. Takes all the same arguments as [`ggsave()`](https://ggplot2.tidyverse.org/reference/ggsave.html). 
```r plot_undelivered_pizzas <- undelivered_pizzas %>% @@ -199,47 +241,93 @@ This template comes with several pre-made helper functions that we've found usef write_plot(plot_undelivered_pizzas) ``` -- `begin_processing` and `end_processing`: functions that are run at the top and bottom of `process.R` that clean up the environment of temporary variables created during the data processing step. To disable this, set the `clean_processing_variables` flag in `config.R` to FALSE. +- [`read_all_excel_sheets()`](https://globeandmail.github.io/upstartr/reference/read_all_excel_sheets.html): Combines all Excel sheets in a given file into a single dataframe, adding an extra column called `sheet` for the sheet name. Takes all the same arguments as [`readxl`](https://readxl.tidyverse.org/)'s [`read_excel()`](https://readxl.tidyverse.org/reference/read_excel.html). + + ```r + pizza_deliveries <- read_all_excel_sheets( + pizza_deliveries.file, + skip = 3, + ) + ``` + +- [`combine_csvs()`](https://globeandmail.github.io/upstartr/reference/combine_csvs.html): Read all CSVs in a given directory and concatenate them into a single file. Takes all the same arguments as [`read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) + + ```r + pizzas <- combine_csvs(dir_data_raw()) + ``` + +- [`combine_excels()`](https://globeandmail.github.io/upstartr/reference/combine_excels.html): Read all Excel spreadsheets in a given directory and concatenate them. + + ```r + pizzas_in_excel <- combine_excels(dir_data_raw()) + ``` + +- [`unaccent()`](https://globeandmail.github.io/upstartr/reference/unaccent.html): Remove accents from strings. + + ```r + unaccent('Montréal') + ``` -## Tips for using `startr` +- [`remove_non_utf8()`](https://globeandmail.github.io/upstartr/reference/remove_non_utf8.html): Remove non-UTF-8 characters from strings. -`startr` works best when you assume certain coding standards: -1. No variables should ever be overwritten or reassigned. 
Same goes for fields generated via `mutate()`. -2. If using RStudio (our preferred tool for work in R), restart and clear the environment often to make sure your code is reproducible. -3. Only ever run code sequentially to prevent order-of-execution accidents. In other words: don't jump around. For example, avoid running a block of code at line 22, then code at line 11, then some more code at line 37, since that may lead to unexpected results that another journalist won't be able to reproduce. -4. Treat raw data files (those in `data/raw`) as immutable and read-only. -5. Conversely, treat all outputs (everything else, including data, plots and reports) as a disposable product. By default, this project's `.gitignore` file ignores them, so they're never checked into source management tools. -6. For coding style, we rely on the [tidyverse style guide](https://style.tidyverse.org/). + ```r + non_utf8 <- 'fa\xE7ile' + Encoding(non_utf8) <- 'latin1' + remove_non_utf8(non_utf8) + ``` + +- [`%not_in%`](https://globeandmail.github.io/upstartr/reference/grapes-not_in-grapes.html): The opposite of the [`%in%`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/match.html) operator. + + ```r + c(1, 2, 3, 4, 5) %not_in% c(4, 5, 6, 7, 8) + ``` + +- [`not.na()`](https://globeandmail.github.io/upstartr/reference/not.na.html): The opposite of the [`is.na`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/NA) function. +- [`not.null()`](https://globeandmail.github.io/upstartr/reference/not.null.html): The opposite of the [`is.null`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/NULL) function. 
+ +## Tips + +- **You don't always need to process your data**: If your [processing step](#step-2-import-and-process-data) takes a while and you've already generated your processed files during a previous run, you can tell `startr` to skip this step by setting `should_process_data` to `FALSE` in `config.R`'s [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html) function. Just be sure to set it back to `TRUE` if your processing code changes! +- **Consider timestamping your output files**: If you're using [`upstartr`](https://github.com/globeandmail/upstartr)'s [`write_excel()`](https://globeandmail.github.io/upstartr/reference/write_excel.html) helper, you can automatically timestamp your filenames by setting `should_timestamp_output_files` to `TRUE` in [`initialize_startr()`](https://globeandmail.github.io/upstartr/reference/initialize_startr.html). +- **Use the functions file**: Reduce repetition in your code by writing functions and putting them in the `functions.R` file, which gets `source()`'d when [`run_config()`](https://globeandmail.github.io/upstartr/reference/run_config.html) is run. +- **Help us make `startr` better**: Using this package? Find yourself wishing the structure were slightly different, or have an often-used function you're tired of copying and pasting between projects? Please [send us your feedback](#get-in-touch). ## Directory structure -```bash +``` ├── data/ -│   ├── raw # The original data files. Treat this directory as read-only. -│   ├── cache # Cached files, mostly used when scraping or dealing with packages such as `cancensus` -│   ├── processed # Imported and tidied data used throughout the analysis. -│   └── out # Exports of data at key steps or as a final output. +│   ├── raw/ # The original data files. Treat this directory as read-only. 
+│   ├── cache/ # Cached files, mostly used when scraping or dealing with packages such as `cancensus` +│   ├── processed/ # Imported and tidied data used throughout the analysis. +│   └── out/ # Exports of data at key steps or as a final output. ├── R/ │   ├── process.R # Data processing including tidying, processing and manipulation. │   ├── analyze.R # The primary analysis steps. │   ├── visualize.R # Generate plots as png, pdf, etc. -│   ├── utils.R # Commonly-used functions. │   └── functions.R # Project-specific functions. -├── scrape/ -│   └── scrape.R # Scraping scripts that save collected data to the `/data/raw/` directory. ├── plots/ # Visualizations saved out plot files in standard formats. ├── reports/ # Generated reports and associated files. +├── scrape/ +│   └── scrape.R # Scraping scripts that save collected data to the `/data/raw/` directory. │   └── notebook.Rmd # Standard notebook to render reports. ├── config.R # Global project variables including packages, key project paths and data sources. ├── run.R # Wrapper file to run the analysis steps, either inline or sourced from component R files. └── startr.Rproj # Rproj file for RStudio ``` -An `.nvmrc` is included at the project root for scraping with Node. A `venv` and `requirements.txt` file should be included within the scraper directory if Python is used for scraping. +An `.nvmrc` is included at the project root for Node.js-based scraping. If you prefer to scrape with Python, be sure to add `venv` and `requirements.txt` files, or a `Gemfile` if working in Ruby. + +## See also + +`startr` is part of a small ecosystem of R utilities. 
Those include: + +- [**upstartr**](https://github.com/globeandmail/upstartr), a library of functions that support `startr` and daily data journalism tasks +- [**tgamtheme**](https://github.com/globeandmail/tgamtheme), The Globe and Mail's graphics theme +- [**startr-cli**](https://github.com/globeandmail/startr-cli), a command-line tool that scaffolds new `startr` projects ## Version -1.0.2 +1.1.0 ## License @@ -249,6 +337,6 @@ startr © 2020 The Globe and Mail. It is free software, and may be redistributed If you've got any questions, feel free to send us an email, or give us a shout on Twitter: -[![Michael Pereira](https://avatars0.githubusercontent.com/u/212666?v=3&s=200)](https://github.com/monkeycycle)| [![Tom Cardoso](https://avatars0.githubusercontent.com/u/2408118?v=3&s=200)](https://github.com/tomcardoso) +[![Tom Cardoso](https://avatars0.githubusercontent.com/u/2408118?v=3&s=65)](https://github.com/tomcardoso) | [![Michael Pereira](https://avatars0.githubusercontent.com/u/212666?v=3&s=65)](https://github.com/monkeycycle) ---|--- -[Michael Pereira](mailto:mpereira@globeandmail.com)
[@__m_pereira](https://www.twitter.com/__m_pereira) | [Tom Cardoso](mailto:tcardoso@globeandmail.com)
[@tom_cardoso](https://www.twitter.com/tom_cardoso) +[Tom Cardoso](mailto:tcardoso@globeandmail.com)
[@tom_cardoso](https://www.twitter.com/tom_cardoso) | [Michael Pereira](mailto:hello@monkeycycle.org)
[@__m_pereira](https://www.twitter.com/__m_pereira) diff --git a/config.R b/config.R index 817bc1f..98d99e5 100644 --- a/config.R +++ b/config.R @@ -1,93 +1,31 @@ -############################################################ -# This file sets the config for the project including -# specifying packages to load and global variables. -# -############################################################ - -options(scipen = 999) -Sys.setenv(TZ = 'America/Toronto') - -# Project-specific -config_author <- 'Firstname Lastname ' -config_title <- 'startr' - -# Directories to read from and write to -dir_data <- 'data' -dir_src <- 'R' -dir_data_raw <- 'data/raw' -dir_data_cache <- 'data/cache' -dir_data_processed <- 'data/processed' -dir_data_out <- 'data/out' -dir_reports <- 'reports' -dir_plots <- 'plots' - -# Files: You'll want to edit this to add your source data file names -sample.raw.file <- here::here(dir_data_raw, 'sample.csv') - -# Primary and supplemental notebooks. -# Set should_render_notebook to TRUE if using notebooks -r_notebook <- here::here(dir_reports, 'notebook.Rmd') - -# startr-specific configuration, consumed by helper functions -# Should a notebook be rendered in run.R? -should_render_notebook <- FALSE - -# Should the processing step be run in run.R? -should_process_data <- TRUE - -# Should files written with write_excel have a timestamp in the filename? -timestamp_output_files <- FALSE - -# Should the variables created during process.R be cleaned up after processing? 
-clean_processing_variables <- TRUE - -packages <- c( - # essentials - 'here', 'devtools', 'tidyverse', - # manipulation - 'lubridate', 'janitor', 'zoo', 'glue', 'clipr', - # modelling - 'tidymodels', - # Read/write files - 'readxl', 'openxlsx', 'feather', - # visualization - 'scales', 'ggthemes', 'gganimate', - # scraping - 'rvest', - # GIS - 'sf', - # RMarkdown - 'knitr', 'ezknitr', 'kableExtra', 'DT', - # other stuff - # 'cansim', 'cancensus', - 'beepr' +# ================================================================= +# This file configures the project by specifying filenames, loading +# packages and setting up some project-specific variables. +# ================================================================= + +# This initializes your startr project +initialize_startr( + title = 'startr', + author = 'Firstname Lastname ', + timezone = 'America/Toronto', + should_render_notebook = FALSE, + should_process_data = TRUE, + should_timestamp_output_files = FALSE, + packages = c( + 'tidyverse', 'glue', 'magrittr', 'lubridate', 'hms', + 'readxl', 'feather' + # 'rvest' + # 'tgamtheme', + # 'janitor', 'zoo', + # 'tidymodels', + # 'scales', 'gganimate', + # 'sf', + # 'cansim', 'cancensus', + ) ) -source(here::here(dir_src, 'utils.R')) -source(here::here(dir_src, 'functions.R')) - -load_requirements(packages) - -options( - # CANCENSUS_API should be set in your home directory's .Renviron file, - # and will get pulled down from there - cancensus.api_key = Sys.getenv(c('CANCENSUS_API')), - cancensus.cache_path = here::here(dir_data_cache), - cansim.cache_path = here::here(dir_data_cache) -) - -knitr::opts_chunk$set( - eval = TRUE, - echo = FALSE, - message = FALSE, - cache = FALSE, - warning = FALSE, - error = FALSE, - comment = '#', - tidy = FALSE, - collapse = TRUE, - results = 'asis', - fig.width = 12, - dpi = 150, - root.dir = here::here() -) +# Refer to your source data filenames here. 
These can be either references +# to files in your `data/raw` folder, or paths to files hosted on the web. +# For example: +# sample.raw.file <- dir_data_raw('your-filename-here.csv') +# sample.raw.path <- 'https://github.com/tidyverse/dplyr/raw/master/data-raw/starwars.csv' diff --git a/reports/notebook.Rmd b/reports/notebook.Rmd index 200baaa..279b8d1 100644 --- a/reports/notebook.Rmd +++ b/reports/notebook.Rmd @@ -1,7 +1,7 @@ --- -title: '`r config_title`' +title: '`r getOption("startr.title")`' date: '`r format(Sys.Date(), "%B %d, %Y")`' -author: '`r config_author`' +author: '`r getOption("startr.author")`' output: html_notebook: code_folding: hide @@ -13,17 +13,10 @@ output: toc_float: yes --- -## Config -```{r config, echo=FALSE, message=FALSE, warning=FALSE, results='hide'} +## First heading -if (!require('devtools')) install.packages('devtools'); library('devtools') -if (!require('here')) install.packages('here'); library('here') - -source(here::here('config.R')) -if (should_process_data) source(here::here(dir_src, 'process.R')) -source(here::here(dir_src, 'analyze.R')) -source(here::here(dir_src, 'visualize.R')) - -options(warn = 1, width = 200) +Text goes here. 
+```{r} +# R code goes here ``` diff --git a/run.R b/run.R index 53b74be..14b101f 100644 --- a/run.R +++ b/run.R @@ -1,8 +1,7 @@ -if (!require('here')) install.packages('here'); library('here') +if (!require('upstartr')) install.packages('upstartr'); library('upstartr') -source(here::here('config.R')) -if (should_process_data) { source(here::here(dir_src, 'process.R')) } -source(here::here(dir_src, 'analyze.R')) -source(here::here(dir_src, 'visualize.R')) - -if (should_render_notebook) { render_notebook(r_notebook) } +run_config() +run_process() +run_analyze() +run_visualize() +run_notebook() diff --git a/scrape/scrape.R b/scrape/scrape.R index fedb322..ff98d1f 100644 --- a/scrape/scrape.R +++ b/scrape/scrape.R @@ -1,6 +1,9 @@ -############################################################ -# This file handles any scrapes that might be necessary, -# and doesn't get called by the main block. -# -# -############################################################ +# ======================================================================= +# Put any scraping code here. This file doesn't get called by `run.R`. +# ======================================================================= + +if (!require('upstartr')) install.packages('upstartr'); library('upstartr') + +run_config() + +# Scraping code goes here.