% _main.tex

% Options for packages loaded elsewhere
% These are queued before \documentclass so that they apply whenever the
% named package is eventually loaded, whether directly in this preamble or
% indirectly by another package.
% NOTE(review): the Maroon/Blue link colors used in \hypersetup presumably
% come from the xcolor name sets requested here -- confirm before trimming.
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames*,x11names*}{xcolor}
%
% Standard book class with no class options.
\documentclass[
]{book}
% Latin Modern text fonts plus the AMS math symbol/environment packages.
\usepackage{lmodern}
\usepackage{amssymb,amsmath}
% Engine detection: \ifxetex and \ifluatex are provided by these packages.
\usepackage{ifxetex,ifluatex}
% The \ifnum below builds a number from a literal 0 followed by a 1 for each
% engine test that succeeds, so it equals 0 only when neither XeTeX nor
% LuaTeX is running (i.e. under pdfTeX).
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
  \usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
  \usepackage{unicode-math}
  \defaultfontfeatures{Scale=MatchLowercase}
  \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Both packages below are optional: each is loaded only when its .sty file
% is found, and the empty final argument means nothing happens otherwise.
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
  \usepackage[]{microtype}
  \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph layout, chosen in three steps:
%   1. KOMA class (\KOMAClassName defined): use its parskip=half option.
%   2. Otherwise, load the parskip package when it is installed.
%   3. Otherwise, fall back to zero indent and a stretchable 6pt paragraph skip.
% \makeatletter is required because \@ifundefined contains an @.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
  \IfFileExists{parskip.sty}{%
    \usepackage{parskip}
  }{% else
    \setlength{\parindent}{0pt}
    \setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
  \KOMAoptions{parskip=half}}
\makeatother
% Colors, URL handling, and PDF metadata / link styling.
\usepackage{xcolor}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
% Prefer bookmark when installed; otherwise load hyperref directly.
% NOTE(review): \hypersetup is used unconditionally below, so bookmark
% presumably loads hyperref itself -- confirm.
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
% PDF title/author metadata and colored (rather than boxed) links.
\hypersetup{
  pdftitle={Computational Thinking for Social Scientists},
  pdfauthor={Jae Yeon Kim},
  colorlinks=true,
  linkcolor=Maroon,
  filecolor=Maroon,
  citecolor=Blue,
  urlcolor=Blue,
  pdfcreator={LaTeX via pandoc}}
\urlstyle{same} % disable monospaced font for URLs
% Support for the Shaded/Highlighting code listings used in the body.
% NOTE(review): xcolor is already loaded above, so this load of color looks
% redundant -- confirm before removing.
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
% \VERB and the Highlighting environment keep backslash and braces active
% (commandchars), so the *Tok macros can be used inside verbatim text.
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
% Shaded wraps its contents in a snugshade box filled with shadecolor
% (RGB 248,248,248).
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% Syntax-highlighting token macros, used inside the Highlighting environment.
% Each takes the token text as #1 and either passes it through unchanged or
% wraps it in an rgb \textcolor, optionally with bold and/or italic.
% To restyle a token class, edit its macro here; the body never hard-codes
% colors itself.
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
% Multi-page tables (longtable) with booktabs rules.
\usepackage{longtable,booktabs}
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
% Patch \longtable so that, when \if@noskipsec is set, an empty \mbox is
% inserted before the \par. The two trailing empty groups are the unused
% success/failure hooks of \patchcmd.
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
% footnotehyper is used when installed, with footnote as the fallback;
% \makesavenoteenv is expected from whichever of the two gets loaded.
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
% \maxwidth / \maxheight expand to the image's natural width / height,
% capped at \linewidth / \textheight respectively.
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
% (\fps@figure is the internal macro holding the figure float's default
% placement specifier, hence the \makeatletter guard.)
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist is called at the top of compact lists in the body; it zeroes
% the space between items and between paragraphs within the list.
% \providecommand leaves any existing definition untouched.
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
% Number sectioning commands down to depth 5.
\setcounter{secnumdepth}{5}
% Theorem support. amsthm is loaded before the override below so that its
% own definition of \thm@space@setup is the one being replaced.
\usepackage{amsthm}
\makeatletter
% Override amsthm's internal spacing hook: use the same stretchable 8pt skip
% above and below theorem-like environments.
\def\thm@space@setup{%
  \thm@preskip=8pt plus 2pt minus 4pt
  \thm@postskip=\thm@preskip
}
\makeatother
% Table and figure helper packages.
% booktabs, longtable and xcolor are already loaded earlier in this preamble,
% so the repeated option-less \usepackage lines for them (which LaTeX treats
% as no-ops) have been dropped.
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
% Author-year citations, formatted with the apalike style.
\usepackage[]{natbib}
\bibliographystyle{apalike}

\title{Computational Thinking for Social Scientists}
\author{\href{https://jaeyk.github.io/}{Jae Yeon Kim}}
\date{2020-10-28}

\begin{document}
\maketitle

{
\hypersetup{linkcolor=}
\setcounter{tocdepth}{1}
\tableofcontents
}
\hypertarget{hello-world}{%
\chapter{Hello World}\label{hello-world}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{print}\NormalTok{(}\StringTok{"Hello, World!"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World!"
\end{verbatim}

\begin{quote}
Make simple things simple, and complex things possible. - \href{https://www.quora.com/What-is-the-story-behind-Alan-Kay-s-adage-Simple-things-should-be-simple-complex-things-should-be-possible}{Alan Kay}
\end{quote}

This is the website for \emph{Computational Thinking for Social Scientists}. This book intends to help social scientists think computationally and develop proficiency with the computational tools and techniques necessary to conduct research in computational social science. Mastering these tools and techniques not only enables social scientists to collect, wrangle, analyze, and interpret data with less pain and more fun, but also lets them work on research projects that would previously have seemed impossible.

The book is not intended to be a comprehensive guide to computational social science or to any particular programming language, computational tool, or technique. For a general introduction to computational social science, I recommend \href{http://www.princeton.edu/~mjs3/}{Matthew Salganik}'s \href{https://www.bitbybitbook.com/}{\emph{Bit By Bit} (2017)}.

The book is currently divided into two main subjects (fundamentals and applications) and seven main sessions.

\hypertarget{part-i-fundamentals}{%
\subsection{Part I Fundamentals}\label{part-i-fundamentals}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  \protect\hyperlink{motivation}{Why computational thinking}
\item
  \protect\hyperlink{git_bash}{Best practices in data and code management using Git and Bash}
\item
  \protect\hyperlink{tidy_data}{How to wrangle, model, and visualize data easier and faster}
\item
  \protect\hyperlink{functional_programming}{How to use functions to automate repeated things and develop data products (e.g., packages and apps)}
\end{enumerate}

\hypertarget{part-ii-applications}{%
\subsection{Part II Applications}\label{part-ii-applications}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{4}
\item
  \protect\hyperlink{semi_structured_data}{How to collect and parse semi-structured data at scale (e.g., using APIs and webscraping)}
\item
  \protect\hyperlink{machine_learning}{How to analyze high-dimensional data (e.g., text) using machine learning}
\item
  \protect\hyperlink{big_data}{How to access, query, and manage big data using SQL and Spark}
\end{enumerate}

The book teaches how to do all of these mostly in \href{https://www.r-project.org/about.html}{\textbf{R}}, and sometimes in \href{https://www.gnu.org/software/bash/}{\textbf{bash}} and \href{https://www.python.org/about/}{\textbf{Python}}.

\begin{itemize}
\item
  Why R? R is free, easy to learn (thanks to \href{https://www.tidyverse.org/}{\texttt{tidyverse}} and \href{https://rstudio.com/}{RStudio}), fast (thanks to \href{https://cran.r-project.org/web/packages/Rcpp/index.html}{\texttt{Rcpp}}), runs everywhere, \textbf{open} (16,000+ packages; counting only ones \href{https://cran.r-project.org/web/packages/}{available at CRAN}), and has a growing massive and inclusive community (\href{https://twitter.com/search?q=\%23rstats\&src=typed_query}{\texttt{\#rstats}}).
\item
  Why R + Python + bash?

  \begin{quote}
  \begin{quote}
  ``For R and Python, Python is first and foremost a programming language. And that has a lot of good features, but it tends to mean, that if you are going to do data science in Python, you have to first learn how to program in Python. Whereas I think you are going to get up and running faster with R, than with Python because there's just a bunch more stuff built in and you don't have to learn as many programming concepts. You can focus on being a great political scientist or whatever you do and learning enough R that you don't have to become an expert programmer as well to get stuff done.'' - Hadley Wickham
  \end{quote}
  \end{quote}

  \begin{itemize}
  \tightlist
  \item
    However, this feature of the R community also raises a challenge.
  \end{itemize}

  \begin{quote}
  \begin{quote}
  Compared to other programming languages, the R community tends to be more focused on results instead of processes. Knowledge of software engineering best practices is patchy: for instance, not enough R programmers use source code control or automated testing. Inconsistency is rife across contributed packages, even within base R. You are confronted with over 20 years of evolution every time you use R. R is not a particularly fast programming language, and poorly written R code can be terribly slow. R is also a profligate user of memory. - Hadley Wickham
  \end{quote}
  \end{quote}

  \begin{itemize}
  \tightlist
  \item
    RStudio, especially the tidyverse team, has made heroic efforts to amend the problems listed above. Readers will learn about these recent advances in the R ecosystem and how to complement R with Python and Bash.
  \end{itemize}
\end{itemize}

\hypertarget{special-thanks}{%
\section{Special thanks}\label{special-thanks}}

This book is collected as much as it is authored. It is a remix version of \href{https://github.com/rochelleterman/PS239T}{PS239T}, a graduate-level computational methods course at UC Berkeley, originally developed by \href{http://rochelleterman.com/}{Rochelle Terman} then revised by \href{http://rachelbernhard.com/}{Rachel Bernhard}. I have taught PS239T as lead instructor in Spring 2019 and TA in Spring 2018 and will co-teach it in Spring 2020. Other teaching materials draw from the workshops I have created for \href{https://dlab.berkeley.edu/}{D-Lab} and \href{https://data.berkeley.edu/research/discovery-program-home}{Data Science Discovery Program} at UC Berkeley. I also have cited all the other references whenever I am aware of related books, articles, slides, blog posts, or YouTube video clips.

\hypertarget{suggestions-questions-or-comments}{%
\section{Suggestions, questions, or comments}\label{suggestions-questions-or-comments}}

Please feel free to \href{https://github.com/jaeyk/PS239T/issues}{create issues} via the GitHub repository associated with this book if you find typos, errors, missing citations, etc.

\hypertarget{license}{%
\section{License}\label{license}}

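% The license badge (https://i.creativecommons.org/l/by/4.0/88x31.png) is a
% remote URL, which \includegraphics cannot load; download it and include the
% local copy here to restore the image.
This work is licensed under a \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International License} (CC BY 4.0).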

\hypertarget{motivation}{%
\chapter{Computational thinking}\label{motivation}}

\hypertarget{why-computational-thinking}{%
\section{Why computational thinking}\label{why-computational-thinking}}

\begin{itemize}
\item
  If social scientists want to know how to work smart and not just hard, they need to take full advantage of the power of modern programming languages, and that power is \textbf{automation}.
\item
  Let's think about the following two cases.

  \begin{itemize}
  \item
    Case 1: Suppose a social scientist needs to collect data on civic organizations in the United States from websites, Internal Revenue Service reports, and social media posts. As the number of these organizations is large, the researcher could not collect a large volume of data from diverse sources, so they would hire undergraduates and distribute tasks among them. This is a typical data collection plan in social science research, and it is labor-intensive. Automation is not part of the game plan. Yet, it is critical for so many reasons. Because the process is costly, no one is likely to either replicate or update the data collection effort. Put differently, without making the process efficient, it is difficult for it to be reproducible and scalable.
  \item
    Case 2: An alternative is to write computer programs that collect such data automatically, parse them, and store them in interconnected databases. Additionally, someone may need to maintain and validate the quality of the data infrastructure. Nevertheless, this approach lowers the cost of the data collection process, thereby substantially increasing the \textbf{reproducibility} and \textbf{scalability} of the process. Furthermore, the researcher can document their code and publicly share it using their GitHub repository or even gather some of the functions they used and distribute them as open-source libraries.
  \end{itemize}
\item
  Programming is as valuable a skill as writing in social science research. The extent to which a researcher can automate the research process can determine its efficiency, reproducibility, and scalability.
\end{itemize}

\begin{quote}
Every modern statistical and data analysis problem needs code to solve it. You shouldn't learn just the basics of programming, spend some time gaining mastery. Improving your programming skills pays off because code is a \textbf{force multiplier}: once you've solved a problem once, code allows you to solve it much faster in the future. As your programming skill increases, the generality of your solutions improves: you solve not just the precise problem you encountered, but a wider class of related problems (in this way programming skill is very much like mathematical skill). Finally, sharing your code with others allows them to benefit from your experience. - \href{https://imstat.org/2014/12/16/hadley-wickham-impact-the-world-by-being-useful/}{Hadley Wickham}
\end{quote}

\begin{itemize}
\tightlist
\item
  How can we automate our research process? How can we talk to and teach a machine?
\end{itemize}

\begin{figure}
\centering
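% \includegraphics cannot load a remote URL; link to the image instead.
% Download the file and include the local copy to restore the figure.
\url{https://bam.files.bbci.co.uk/bam/live/content/znmb87h/large}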
\caption{From BBC Bitesize}
\end{figure}

\begin{itemize}
\item
  This book teaches you how to do that in R in incremental steps.

  \begin{itemize}
  \tightlist
  \item
    From graphic user interface to command-line interface
  \item
    From short programs to long programs
  \item
    The ultimate goal is to solve complex problems at scale using computation
  \end{itemize}
\end{itemize}

\begin{quote}
``{[}W{]}e wanted users to be able to begin in an interactive environment, where they did not consciously think of themselves as programming. Then as their needs became clearer and their sophistication increased, they should be able to slide gradually into programming, when the language and system aspects would become more important.'' - \emph{Stages in the Evolution of S} by John Chambers (S is the progenitor of R)
\end{quote}

\hypertarget{computational-way-of-thinking-about-data}{%
\section{Computational way of thinking about data}\label{computational-way-of-thinking-about-data}}

\hypertarget{structure}{%
\subsection{Structure}\label{structure}}

\begin{itemize}
\tightlist
\item
  Structured data (Excel spreadsheets, CSVs)

  \begin{itemize}
  \tightlist
  \item
    Tidy data
  \end{itemize}
\item
  Semi-structured data

  \begin{itemize}
  \tightlist
  \item
    HTML/CSS: Websites
  \item
    JSON/XML: APIs
  \end{itemize}
\end{itemize}

\hypertarget{dimension}{%
\subsection{Dimension}\label{dimension}}

\begin{itemize}
\item
  n = the number of observations
\item
  p = the number of variables
\item
  Low-dimensional data (n \textgreater{} p)

  \begin{itemize}
  \tightlist
  \item
    Survey, experimental, and administrative data
  \end{itemize}
\item
  High-dimensional data (n \textless{} p)

  \begin{itemize}
  \tightlist
  \item
    Text, speech, image, video, etc.
  \end{itemize}
\end{itemize}

\hypertarget{size}{%
\subsection{Size}\label{size}}

\begin{itemize}
\tightlist
\item
  Data fit in your laptop's memory
\item
  Data don't fit in your laptop's memory (=big data)
\end{itemize}

\hypertarget{computational-way-of-thinking-about-research-process}{%
\section{Computational way of thinking about research process}\label{computational-way-of-thinking-about-research-process}}

Computational tools and techniques make \ldots{}

\begin{itemize}
\tightlist
\item
  Doing traditional research easier, faster, scalable, and more reproducible

  \begin{itemize}
  \tightlist
  \item
    Data wrangling
  \item
    Modeling
  \item
    Visualization
  \end{itemize}
\item
  Documentation and collaboration easier, faster and scalable

  \begin{itemize}
  \tightlist
  \item
    Dynamic reporting (markdown)
  \item
    Version control system (Git and GitHub)
  \end{itemize}
\item
  Collecting and analyzing large and complex data possible

  \begin{itemize}
  \tightlist
  \item
    Digital data collection (API and web scraping)

    \begin{itemize}
    \tightlist
    \item
      Building a data infrastructure (SQL)
    \end{itemize}
  \item
    Machine learning
  \end{itemize}
\end{itemize}

\hypertarget{git_bash}{%
\chapter{Managing data and code}\label{git_bash}}

\hypertarget{getting-started-in-r}{%
\section{Getting started in R}\label{getting-started-in-r}}

\hypertarget{rstudio}{%
\subsection{RStudio}\label{rstudio}}

There are two main ways of interacting with R: using the console or by using script files (plain text files that contain your code).

If R is ready to accept commands, the R console shows a \texttt{\textgreater{}} prompt. If it receives a command (by typing, copy-pasting or sent from the script editor using \texttt{Ctrl-Enter}; \texttt{Command-Enter} will also work on Macs), R will try to execute it, and when ready, show the results and come back with a new \texttt{\textgreater{}}-prompt to wait for new commands. This is the equivalent of the \texttt{\$} in your terminal.

\hypertarget{basic-syntax}{%
\subsection{Basic Syntax}\label{basic-syntax}}

\textbf{Comments}

Use \texttt{\#} signs to comment. Comment liberally in your R scripts. Anything to the right of a \texttt{\#} is ignored by R. For those of you familiar with other languages, there is no doc string, or equivalent to \texttt{"""} in R.

\textbf{Assignment operator}

\texttt{\textless{}-} is the assignment operator. It assigns values on the right to objects on the left. So, after executing \texttt{x\ \textless{}-\ 3}, the value of \texttt{x} is \texttt{3}. The arrow can be read as 3 \textbf{goes into} \texttt{x}. You can also use \texttt{=} for assignments.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{USweird \textless{}{-}}\StringTok{ "Why use lb for pound!"} \CommentTok{\# Use this}

\StringTok{"Why use lb for pound!"}\NormalTok{ =}\StringTok{ }\NormalTok{USweird}
\end{Highlighting}
\end{Shaded}

Nonetheless, \emph{can} does not mean you \emph{should}. It is good practice to use \texttt{\textless{}-} for assignments. \texttt{=} should only be used to specify the values of arguments of functions. This is what Google and Hadley Wickham recommend as well. If they don't convince you enough, here's \href{https://csgillespie.wordpress.com/2010/11/16/assignment-operators-in-r-vs/}{a real example}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{mean}\NormalTok{(}\DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{) }\CommentTok{\# Does it save x?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 5.5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in rm(x): object 'x' not found
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{mean}\NormalTok{(x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{) }\CommentTok{\# Does it save x?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 5.5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\textbf{Printing}

In R, the contents of an object can be printed either by simply executing the object name or by calling the \texttt{print()} function.

\textbf{Help}

\begin{itemize}
\tightlist
\item
  \texttt{?} + object opens a help page for that specific object
\item
  \texttt{??} + object searches help pages containing the name of the object
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{?mean}
\NormalTok{??mean}
\KeywordTok{help}\NormalTok{(mean)}

\CommentTok{\# The above three will do same. }

\KeywordTok{example}\NormalTok{(ls) }\CommentTok{\# provides example for how to use ls }

\KeywordTok{help.search}\NormalTok{(}\StringTok{"visualization"}\NormalTok{) }\CommentTok{\# search functions and packages that have "visualization" in their descriptions}
\end{Highlighting}
\end{Shaded}

\hypertarget{environment}{%
\section{Environment}\label{environment}}

Environment = a collection of name--value pairs (each object name is bound to a value)

\hypertarget{objects}{%
\subsection{Objects}\label{objects}}

\begin{itemize}
\tightlist
\item
  List objects in your current environment
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a numeric object }
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{,}\DecValTok{2}\NormalTok{,}\DecValTok{3}\NormalTok{,}\DecValTok{4}\NormalTok{,}\DecValTok{5}\NormalTok{)}

\CommentTok{\# List the object }
\KeywordTok{ls}\NormalTok{()}

\CommentTok{\# Remove the object }
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Remove objects from your current environment
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create an object }
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{5}

\CommentTok{\# Remove the object }
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Remove all objects from your current environment
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create an object }
\NormalTok{a \textless{}{-}}\StringTok{ }\DecValTok{7}

\NormalTok{b \textless{}{-}}\StringTok{ }\DecValTok{3}

\CommentTok{\# Remove the object }
\KeywordTok{rm}\NormalTok{(}\DataTypeTok{list =} \KeywordTok{ls}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Force memory release
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Garbage collect; for more information, type ?gc() }

\KeywordTok{gc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\hypertarget{packages}{%
\subsection{Packages}\label{packages}}

\texttt{install.packages(package-name)} will download a package from one of the CRAN mirrors assuming that a binary is available for your operating system.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# From CRAN}
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{) }

\CommentTok{\# Load package }
\KeywordTok{library}\NormalTok{(dplyr)}

\CommentTok{\# From GitHub }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install\_github}\NormalTok{(}\StringTok{"jaeyk/tidytweetjson"}\NormalTok{) }\CommentTok{\# my own package }

\CommentTok{\# Unload package }
\CommentTok{\# detach("package:stats", unload=TRUE)}
\end{Highlighting}
\end{Shaded}

\textbf{Tips}

If you have multiple packages to install, consider using the \texttt{pacman} package. The following is an example. First, you install \texttt{pacman}. Then, you load several packages with the \texttt{p\_load()} function.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}

\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  ggplot2,}
\NormalTok{  dplyr, }
\NormalTok{  broom}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

If you don't want to use \texttt{pacman}, the other option is to create a list (we're going to learn what a list is soon).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pkgs \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"ggplot2"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{, }\StringTok{"broom"}\NormalTok{)}

\KeywordTok{install.packages}\NormalTok{(pkgs)}
\end{Highlighting}
\end{Shaded}

Still, we have to write two lines. The simpler, the better, right? Here's another approach that can simplify the code further.

Note that \texttt{lapply()} applies (there's a family of apply functions) a function to a list. In this case, library to pkgs. apply is an advanced concept, which is related to anonymous functions. We will learn about it later when we study functions.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{inst \textless{}{-}}\StringTok{ }\KeywordTok{lapply}\NormalTok{(pkgs, library, }
               \DataTypeTok{character.only =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{project-oriented-research}{%
\section{Project-oriented research}\label{project-oriented-research}}

\hypertarget{computational-reproducibility}{%
\subsection{Computational reproducibility}\label{computational-reproducibility}}

\begin{itemize}
\item
  Replication = code + data
\item
  Computational reproducibility = code + data + environment + distribution
\item
  Reproducibility checklist by \href{http://www.biostat.jhsph.edu/~rpeng/}{Roger Peng}

  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \item
    Start with science (avoid vague questions and concepts)
  \item
    Don't do things by hand (not only about automation but also documentation)
  \item
    Don't point and click (same problem)
  \item
    Teach a computer (automation also solves documentation to some extent)
  \item
    Use some version control
  \item
    Don't save output (instead keep the input and code)
  \item
    Set your seed
  \item
    Think about the entire pipeline
  \end{enumerate}
\end{itemize}

\hypertarget{setup}{%
\subsubsection{Setup}\label{setup}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  tidyverse, }\CommentTok{\# tidyverse}
\NormalTok{  here }\CommentTok{\# computational reproducibility}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{motivation-1}{%
\subsubsection{Motivation}\label{motivation-1}}

Why do you need to make your research project computationally reproducible?

For your self-interest and public benefits.

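% \includegraphics cannot load a remote URL; link to the screenshot instead.
% Download the file and include the local copy to restore the image.
\url{https://github.com/dlab-berkeley/efficient-reproducible-project-management-in-R/blob/master/misc/screenshot.png?raw=true}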

\hypertarget{how-to-organize-files-in-a-project}{%
\subsubsection{How to organize files in a project}\label{how-to-organize-files-in-a-project}}

You won't be able to reproduce your project unless it is efficiently organized.

Step 1. \href{https://environments.rstudio.com/}{\textbf{Environment}} is part of your project. If someone can't reproduce your environment, they won't be able to run your code.

\begin{itemize}
\tightlist
\item
  Launch RStudio. Choose Tools \textgreater{} Global Options. Leave \texttt{Restore\ .RData\ into\ workspace\ at\ startup} unchecked, and set the save-workspace option to \texttt{NEVER}.
\end{itemize}

Step 2. For each project, create a project directory named after the project.

name\_of\_the\_project

\begin{itemize}
\tightlist
\item
  data:

  \begin{itemize}
  \tightlist
  \item
    raw
  \item
    processed (all processed, cleaned, and tidied)
  \end{itemize}
\item
  figures
\item
  packrat (optional)
\item
  reports (PDF, HTML, TEX, etc.)
\item
  results (model outcomes, etc.)
\item
  scripts (i.e., functions)
\item
  .gitignore (for Git)
\item
  name\_of\_project.Rproj (for R)
\item
  README.md (for Git)
\end{itemize}

\begin{figure}
\centering
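% \includegraphics cannot load a remote URL; link to the image instead.
% Download the file and include the local copy to restore the figure.
\url{https://datacarpentry.org/R-ecology-lesson/img/working-directory-structure.png}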
\caption{Working directory structure example}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Don\textquotesingle{}t name it a project. Use a name that\textquotesingle{}s more informative. For instance, us\_election not my\_project.}

\KeywordTok{dir.create}\NormalTok{(}\StringTok{"../us\_election"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Step 3. Launch RStudio. Choose File \textgreater{} New project \textgreater{} Browse existing directories \textgreater{} Create project. This allows each project to have its own workspace.

Step 4. Organize files by putting them in separate subdirectories and naming them in a sensible way.

\begin{itemize}
\item
  Treat raw data as read only (raw data should be RAW!) and put in the \texttt{data} subdirectory.

  \begin{itemize}
  \tightlist
  \item
    Note that version control does not replace backups. You still need to back up your raw data.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"data"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Separate read-only data from processed data and put in the \texttt{processed\_data} subdirectory.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"processed\_data"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Put your code in the \texttt{src} subdirectory.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"src"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Put generated outputs (e.g., tables, figures) in the \texttt{outputs} subdirectory and treat them as disposable.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"outputs"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Put your custom functions in the \texttt{functions} subdirectory. You can gather some of these functions and distribute them as an open-source library.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"functions"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\textbf{Challenge}

Set a project structure for a project named ``starwars''.

\hypertarget{how-to-organize-code-in-a-r-markdown-file}{%
\subsubsection{How to organize code in an R Markdown file}\label{how-to-organize-code-in-a-r-markdown-file}}

\begin{itemize}
\item
  In addition to environment, \textbf{workflow} is an important component of project efficiency and reproducibility.
\item
  What is R markdown? An R package, developed by \href{https://yihui.org/en/}{Yihui Xie}, that provides an authoring framework for data science. Xie is also a developer of many widely popular R packages such as \texttt{knitr}, \href{https://github.com/yihui/xaringan}{\texttt{xaringan}} (cool kids use xaringan not \href{https://en.wikipedia.org/wiki/Beamer_(LaTeX)}{Beamer} these days), \texttt{blogdown} (used to create \href{https://jaeyk.github.io/}{my personal website}), and \texttt{bookdown} (used to create this book) among many others.

  \begin{itemize}
  \tightlist
  \item
    Many applications: \href{https://rstudio.github.io/distill/basics.html}{reports}, \href{https://bookdown.org/yihui/rmarkdown/xaringan.html}{presentations}, \href{https://rmarkdown.rstudio.com/flexdashboard/}{dashboards}, \href{https://bookdown.org/yihui/rmarkdown/websites.html}{websites}
  \item
    Check out \href{https://ysc-rmarkdown.netlify.app/}{Communicating with R markdown workshop} by \href{https://alison.rbind.io/}{Alison Hill} (RStudio)

    \begin{itemize}
    \tightlist
    \item
      Alison Hill is a co-author of \href{https://bookdown.org/yihui/blogdown/}{\texttt{blogdown:\ Creating\ Websites\ with\ R\ Markdown.}}
    \end{itemize}
  \item
    Key strengths: dynamic reporting + reproducible science + easy deployment
  \end{itemize}
\end{itemize}

\begin{itemize}
\tightlist
\item
  R Markdown basic syntax
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Header 1}
\CommentTok{\#\# Header 2}
\CommentTok{\#\#\# Header 3}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Use these section headers to indicate workflow.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Import packages and data}
\CommentTok{\# Tidy data}
\CommentTok{\# Wrangle data}
\CommentTok{\# Model data}
\CommentTok{\# Visualize data}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Press \texttt{ctrl\ +\ shift\ +\ o}. You can see a document outline based on these headers. This is a nice feature for finding the code you need to focus on.
\item
  If your project's scale is large, then divide these sections into files, number them, and save them in the \texttt{code} subdirectory.

  \begin{itemize}
  \tightlist
  \item
    01\_wrangling.Rmd
  \item
    02\_modeling.Rmd
    \ldots{}
  \end{itemize}
\end{itemize}

\hypertarget{making-a-project-computationally-reproducible}{%
\subsubsection{Making a project computationally reproducible}\label{making-a-project-computationally-reproducible}}

\begin{itemize}
\item
  \texttt{setwd()}: set a working directory.
\item
  Note that using \texttt{setwd()} is not a reproducible way to set up your project. For instance, no one except me will be able to run the following code.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Set a working directory }
\KeywordTok{setwd}\NormalTok{(}\StringTok{"/home/jae/starwars"}\NormalTok{)}

\CommentTok{\# Do something }
\KeywordTok{ggplot}\NormalTok{(mtcars, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ mpg, }\DataTypeTok{y =}\NormalTok{ wt)) }\OperatorTok{+}
\StringTok{   }\KeywordTok{geom\_point}\NormalTok{()}

\CommentTok{\# Export the object. }
\CommentTok{\# dot means the working directory set by setwd()}
\KeywordTok{ggsave}\NormalTok{(}\StringTok{"./outputs/example.png"}\NormalTok{) }\CommentTok{\# This is called relative path }
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Instead, learn how to use \texttt{here()}.

  \begin{itemize}
  \item
    Key idea: separate workflow (e.g., workspace information) from products (code and data). For more information, read Jenny Bryan's wonderful piece on \href{https://www.tidyverse.org/blog/2017/12/workflow-vs-script/}{project-oriented workflow}.
  \item
    Example
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New: Reproducible }

\KeywordTok{ggplot}\NormalTok{(mtcars, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ mpg, }\DataTypeTok{y =}\NormalTok{ wt)) }\OperatorTok{+}
\StringTok{   }\KeywordTok{geom\_point}\NormalTok{()}

\KeywordTok{ggsave}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"project"}\NormalTok{, }\StringTok{"outputs"}\NormalTok{, }\StringTok{"example.png"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  How \texttt{here} works
\end{itemize}

The \texttt{here()} function shows the top-level project directory.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{here}\OperatorTok{::}\KeywordTok{here}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Build a path including subdirectories
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"project"}\NormalTok{, }\StringTok{"outputs"}\NormalTok{)}
           \CommentTok{\#depth 1   \#depth 2}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  How \texttt{here} defines the top-level project directory (the following list comes from \href{https://github.com/jennybc/here_here}{the here package vignette}):

  \begin{itemize}
  \item
    Is a file named .here present?
  \item
    Is this an RStudio Project? (\textbf{Note that we already set up an RStudio Project!} So, if you use RStudio's project feature, then you are ready to use \texttt{here}.)
  \item
    Is this an R package? Does it have a DESCRIPTION file?
  \item
    Is this a remake project? Does it have a file named \texttt{remake.yml}?
  \item
    Is this a projectile project? Does it have a file named \texttt{.projectile}?
  \item
    Is this a checkout from a version control system? Does it have a directory named \texttt{.git} or \texttt{.svn}? Currently, only Git and Subversion are supported.
  \item
    If there's no match then use \texttt{set\_here()} to create an empty \texttt{.here} file.
  \end{itemize}
\end{itemize}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Can you define computational reproducibility?
\item
  Can you explain why sharing code and data is not enough for computational reproducibility?
\end{enumerate}

\hypertarget{version-control-git-and-bash}{%
\subsection{Version control (Git and Bash)}\label{version-control-git-and-bash}}

\includegraphics{https://github.com/dlab-berkeley/BashGit/raw/master/octobash.png}

\hypertarget{what-is-bash}{%
\subsubsection{What Is Bash?}\label{what-is-bash}}

The following materials on UNIX and Shell are adapted from \href{https://bids.github.io/2015-06-04-berkeley/shell/00-intro.html}{the Software Carpentry}.

\hypertarget{unix}{%
\paragraph{Unix}\label{unix}}

UNIX is an operating system that was first developed by AT\&T employees at Bell Labs (1969--1971). Bell Labs canceled the project (MULTICS), but the employees continued it on a smaller scale. The new project was named UNICS (Uniplexed Information and Computation System) and then renamed UNIX. Due to \href{https://en.wikipedia.org/wiki/Breakup_of_the_Bell_System}{the anti-trust issue}, AT\&T gave away UNIX in 1975. Berkeley is one of the main places where UNIX was developed. \href{https://en.wikipedia.org/wiki/Berkeley_Software_Distribution}{The Berkeley Software Distribution}, one of the branches of UNIX, came out in 1977.

From Mac OS X to Linux, many current operating systems are versions of UNIX.

For more information on the history of UNIX, see \href{https://docs.google.com/presentation/d/1kKt9V6rom55hU6SJ2_3nGluobjtScptlnJV9YFe6Jz4/pub?start=false\&loop=false\&delayms=3000\&slide=id.g163c5ae2ce_0_17}{this link}.

\begin{figure}
\centering
\includegraphics{https://upload.wikimedia.org/wikipedia/commons/thumb/7/77/Unix_history-simple.svg/1200px-Unix_history-simple.svg.png}
\caption{Unix history}
\end{figure}

\hypertarget{kernel}{%
\paragraph{Kernel}\label{kernel}}

The kernel of UNIX is the hub of the operating system: it allocates time and memory to programs and handles the \href{http://users.ox.ac.uk/~martinw/unix/chap3.html}{filestore} (e.g., files and directories) and communications in response to system calls.

\hypertarget{shell}{%
\paragraph{Shell}\label{shell}}

The shell is an interactive program that provides an interface between the user and the kernel. The shell interprets commands entered by the user or supplied by a shell script, and passes them to the kernel for execution.

As an illustration of the way that the shell and the kernel work together, suppose a user types \texttt{rm\ myfile} (which has the effect of removing the file \emph{myfile}). The shell searches the filestore for the file containing the program \texttt{rm}, and then requests the kernel, through system calls, to execute the program \texttt{rm} on \emph{myfile}. When the process \texttt{rm\ myfile} has finished running, the shell then returns the UNIX prompt \% to the user, indicating that it is waiting for further commands.

We'll talk more about shells in a little bit.

\hypertarget{human-computer-interfaces}{%
\paragraph{Human-Computer interfaces}\label{human-computer-interfaces}}

At a high level, computers do four things:

\begin{itemize}
\item
  run programs
\item
  store data
\item
  communicate with each other
\item
  interact with us

  They can do the last of these in many different ways, including direct brain-computer links and speech interfaces. Since these are still in their infancy, most of us use windows, icons, mice, and pointers. These technologies didn't become widespread until the 1980s, but their roots go back to \href{https://en.wikipedia.org/wiki/Douglas_Engelbart}{Doug Engelbart}, who received his Ph.D.~in electrical engineering from the University of California, Berkeley in 1955 and was hired as an assistant professor in the same department for a year. He then left academia, joined the tech industry, and became one of the founding fathers of the HCI field (mouse, hypertext, GUI, etc.).
\end{itemize}

Going back even further, the only way to interact with early computers was to rewire them. But in between, from the 1950s to the 1980s, most people used line printers. These devices only allowed input and output of the letters, numbers, and punctuation found on a standard keyboard, so programming languages and interfaces had to be designed around that constraint.

\hypertarget{the-command-line}{%
\paragraph{The Command Line}\label{the-command-line}}

This kind of interface is called a \textbf{command-line interface}, or CLI,
to distinguish it from the \textbf{graphical user interface}, or GUI, that most people now use.

The heart of a CLI is a \textbf{read-evaluate-print loop}, or REPL: when the user types a command and then presses the enter (or return) key, the computer reads it, executes it, and prints its output. The user then types another command,
and so on until the user logs off.

As William Shotts, the author of \emph{\href{http://linuxcommand.org/tlcl.php}{The Linux Command Line}}, put it:

\begin{quote}
graphical user interfaces make easy tasks easy, while command line interfaces make difficult tasks possible.
\end{quote}

\hypertarget{the-shell}{%
\paragraph{The Shell}\label{the-shell}}

This description makes it sound as though the user sends commands directly to the computer, and the computer sends output directly to the user. In fact,
there is usually a program in between called a \textbf{command shell}.

What the user types goes into the shell; it figures out what commands to run and orders the computer to execute them.

Note, the reason why the shell is called \emph{the shell}: it encloses the operating system in order to hide some of its complexity and make it simpler to interact with.

A shell is a program like any other. What's special about it is that its job is to run other programs rather than to do calculations itself. The commands are themselves programs: when they terminate, the shell gives the user another prompt (\$ on our systems).

\hypertarget{bash}{%
\paragraph{Bash}\label{bash}}

The most popular Unix shell is \textbf{Bash}, the Bourne Again Shell (so-called because it's derived from a shell written by Stephen Bourne --- this is what passes for wit among programmers). Bash is the default shell on most modern implementations of \textbf{Unix}, and in most packages that provide Unix-like tools for Windows.

\hypertarget{why-shell}{%
\paragraph{Why Shell?}\label{why-shell}}

Using Bash or any other shell sometimes feels more like programming than like using a mouse. Commands are terse (often only a couple of characters long), their names are frequently cryptic, and their output is lines of text rather than something visual like a graph.

On the other hand, the shell allows us to combine existing tools in powerful ways with only a few keystrokes and to set up pipelines to handle large volumes of data automatically.

In addition, the command line is often the easiest way to interact with remote machines. As clusters and cloud computing become more popular for scientific data crunching, being able to drive them is becoming a necessary skill.

\hypertarget{our-first-command}{%
\paragraph{Our first command}\label{our-first-command}}

The part of the operating system responsible for managing files and directories is called the \textbf{file system}. It organizes our data into files, which hold information, and directories (also called ``folders''), which hold files or other directories.

Several commands are frequently used to create, inspect, rename, and delete files and directories. To start exploring them, let's open a shell window:

\begin{verbatim}
$
\end{verbatim}

The dollar sign is a \textbf{prompt}, which shows us that the shell is waiting for input; your shell may show something more elaborate.

Type the command \texttt{whoami}, then press the Enter key (sometimes marked Return) to send the command to the shell.

The command's output is the ID of the current user, i.e., it shows us who the shell thinks we are:

\begin{verbatim}
$ whoami

oski
\end{verbatim}

More specifically, when we type \texttt{whoami} the shell:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  finds a program called \texttt{whoami},
\item
  runs that program,
\item
  displays that program's output, then
\item
  displays a new prompt to tell us that it's ready for more commands.
\end{enumerate}

\hypertarget{communicating-to-other-systems}{%
\paragraph{Communicating to other systems}\label{communicating-to-other-systems}}

In the next unit, we'll be focusing on the structure of our own operating systems. But our operating systems rarely work in isolation; often, we are relying on the Internet to communicate with others! You can visualize this sort of communication within your own shell by asking your computer to \texttt{ping} (based on the old term for submarine sonar) an IP address provided by Google (8.8.8.8); in effect, this will test whether your Internet (thanks Airbears2) is working.

\begin{verbatim}
$ ping 8.8.8.8
\end{verbatim}

Note: Windows users may have to try a slightly different alternative:

\begin{verbatim}
$ ping -t 8.8.8.8
\end{verbatim}

Your computer will begin continuously pinging this IP address and reporting back the ``latency,'' or how long it took for the ping data packet to go to that IP address and back. If your Internet isn't working, it will instead report an error saying ``No route to host.''

Ping runs continuously, so when we want it to stop, we have to manually tell the kernel to stop executing the ping command. We do this simply by typing \texttt{ctrl+c}.

(Thanks \href{http://www.paulthissen.org/}{Paul Thissen} for the suggestion!)

\hypertarget{file-system-organization}{%
\paragraph{File system organization}\label{file-system-organization}}

Next, let's find out where we are by running a command called \texttt{pwd} (\textbf{print working directory}).

At any moment, our \textbf{current working directory} is our current default directory, i.e., the directory that the computer assumes we want to run commands in unless we explicitly specify something else.

Here, the computer's response is \texttt{/home/oski}, which is the \textbf{home directory}:

\begin{verbatim}
$ pwd

/home/oski
\end{verbatim}

\begin{quote}
\hypertarget{home-directory}{%
\subsubsection{Home Directory}\label{home-directory}}

The home directory path will look different on different operating systems. On Linux it will look like \texttt{/home/oski}, and on Windows it will be similar to \texttt{C:\textbackslash{}Documents\ and\ Settings\textbackslash{}oski}. Note that it may look slightly different for different versions of Windows.
\end{quote}

\begin{quote}
\hypertarget{alphabet-soup}{%
\subsubsection{Alphabet Soup}\label{alphabet-soup}}

If the command to find out who we are is \texttt{whoami}, the command to find out where we are ought to be called \texttt{whereami}, so why is it \texttt{pwd} instead? The usual answer is that in the early 1970s, when Unix was
first being developed, every keystroke counted: the devices of the day were slow, and backspacing on a teletype was so painful that cutting the number of keystrokes in order to cut the number of typing mistakes was actually a win for usability. The reality is that commands were added to Unix one by one, without any master plan, by people who were immersed in its jargon. The result is as inconsistent as the roolz uv Inglish speling, but we're stuck with it now.

The good news: because these basic commands were so integral to the development of early Unix, they have stuck around, and appear (in some form) in almost all programming languages.
\end{quote}

To understand what a ``home directory'' is, let's have a look at how the file system as a whole is organized. At the top is the \textbf{root directory} that holds everything else.

We refer to it using a slash character \texttt{/} on its own; this is the leading slash in \texttt{/home/oski}.

Inside that directory are several other directories: \texttt{bin} (which is where some built-in programs are stored), \texttt{data} (holding miscellaneous data files), \texttt{etc} (where local configuration files are stored), \texttt{tmp} (for temporary files that don't need to be stored long-term), and so on.

\begin{quote}
If you're working on a Mac, the file structure will look similar, but not
identical. The following image shows a file system graph for the typical Mac.
\end{quote}

\begin{figure}
\centering
\includegraphics{https://swcarpentry.github.io/shell-novice/fig/home-directories.svg}
\caption{File Directory}
\end{figure}

We know that our current working directory \texttt{/home/oski} is stored inside \texttt{/home} because \texttt{/home} is the first part of its name. Similarly, we know that \texttt{/home} is stored inside the root directory \texttt{/} because its name begins with \texttt{/}.

\begin{quote}
\hypertarget{path}{%
\subsubsection{Path}\label{path}}

Notice that there are two meanings for the \texttt{/} character.
When it appears at the front of a file or directory name, it refers to the root directory. When it appears \emph{inside} a name, it's just a separator.
\end{quote}

\hypertarget{listing}{%
\paragraph{Listing}\label{listing}}

Let's see what's in your home directory by running \texttt{ls} (\textbf{list files and directories}):

\begin{verbatim}
$ ls

Applications        Dropbox         Pictures
Creative Cloud Files    Google Drive        Public
Desktop         Library         Untitled.ipynb
Documents       Movies          anaconda
Downloads       Music           file.txt
\end{verbatim}

\texttt{ls} prints the names of the files and directories in the current directory in alphabetical order, arranged neatly into columns.

We can make its output more comprehensible by using the \textbf{flag} \texttt{-F}, which tells \texttt{ls} to add a trailing \texttt{/} to the names of directories:

\begin{verbatim}
$ ls -F

Applications        Dropbox         Pictures
Creative Cloud Files    Google Drive        Public
Desktop         Library         Untitled.ipynb
Documents       Movies          anaconda
Downloads       Music           file.txt
\end{verbatim}

And note that there is a space between \texttt{ls} and \texttt{-F}: without it, the shell thinks we're trying to run a command called \texttt{ls-F}, which doesn't exist.

\begin{quote}
\hypertarget{whats-in-a-name}{%
\subsubsection{What's In A Name?}\label{whats-in-a-name}}

You may have noticed that all of our files' names are ``something dot something''. This is just a convention: we can call a file \texttt{file} or almost anything else we want. However, most people use two-part names most of the time to help them (and their programs) tell different kinds of files apart. The second part of such a name is called the \textbf{filename extension}, and indicates what type of data the file holds:
\texttt{.txt} signals a plain text file, \texttt{.pdf} indicates a PDF document, \texttt{.cfg} is a configuration file full of parameters for some program or other, and so on.

This is just a convention, albeit an important one. Files contain bytes: it's up to us and our programs to interpret those bytes according to the rules for PDF documents, images, and so on.

Naming a PNG image of a whale as \texttt{whale.mp3} doesn't somehow magically turn it into a recording of whalesong, though it \emph{might} cause the operating system to try to open it with a music player when someone double-clicks it.
\end{quote}

Now let's take a look at what's in your \texttt{PS239T} directory by running \texttt{ls\ -F\ PS239T}, i.e., the command \texttt{ls} with the \textbf{arguments} \texttt{-F} and \texttt{PS239T}. The second argument --- the one \emph{without} a leading dash --- tells \texttt{ls} that we want a listing of the files in something other than our current working directory:

\begin{verbatim}
$ ls -F PS239T

01_Introduction/            10_python-basics/
02_Unix-Bash/               11_FINAL PROJECTS/
03_r-basics/                12_text-analysis-python/
04_r-data-analysis/         13_text-analysis-r/
05_r-visualization/         14_machine-learning/
06_APIs/                15_machine-learning-applications/
07_html-css-javascript/         A_Syllabus.md
08_webscraping/             B_Install.md
09_qualtrics-mturk/         README.md
\end{verbatim}

The output shows us that there are three files and fifteen sub-directories. Organizing things hierarchically in this way helps us keep track of our work: it's possible to put hundreds of files in our home directory, just as it's possible to pile hundreds of printed papers on our desk, but it's a self-defeating strategy.

Notice, by the way, that we spelled the directory name \texttt{PS239T}. It doesn't have a trailing slash: that's added to directory names by \texttt{ls} when we use the \texttt{-F} flag to help us tell things apart. And it doesn't begin with a slash because it's a \textbf{relative path}, i.e., it tells \texttt{ls} how to find something from where we are, rather than from the root of the file system.

\begin{quote}
\hypertarget{parameters-vs.-arguments}{%
\subsubsection{Parameters vs.~Arguments}\label{parameters-vs.-arguments}}

According to \href{https://en.wikipedia.org/wiki/Parameter_(computer_programming)\#Parameters_and_arguments}{Wikipedia},
the terms \textbf{argument} and \textbf{parameter} mean slightly different things.
In practice, however, most people use them interchangeably or inconsistently,
so we will too.
\end{quote}

If we run \texttt{ls\ -F\ /Desktop} (\emph{with} a leading slash) we get a different answer, because \texttt{/Desktop} is an \textbf{absolute path}:

\begin{verbatim}
$ ls -F /Desktop

ls: /Desktop: No such file or directory
\end{verbatim}

The leading \texttt{/} tells the computer to follow the path from the root of the file system, so it always refers to exactly one directory, no matter where we are when we run the command.

What if we want to change our current working directory? Before we do this, \texttt{pwd} shows us that we're in \texttt{/home/oski}, and \texttt{ls} without any arguments shows us that directory's contents:

\begin{verbatim}
$ pwd

/home/oski (/Users/rachel)

$ ls

Applications        Dropbox         Pictures
Creative Cloud Files    Google Drive        Public
Desktop         Library         Untitled.ipynb
Documents       Movies          anaconda
Downloads       Music           file.txt
\end{verbatim}

Use relative paths (e.g., \texttt{../PS239T/references.md}) whenever possible so that your code does not depend on how your system is configured.

\hypertarget{moving-around}{%
\paragraph{Moving around}\label{moving-around}}

We can use \texttt{cd} (\textbf{change directory}) followed by a directory name to change our working directory.

\begin{verbatim}
$ cd Desktop
\end{verbatim}

\texttt{cd} doesn't print anything, but if we run \texttt{pwd} after it, we can see that we are now in \texttt{/home/oski/Desktop}.

If we run \texttt{ls} without arguments now, it lists the contents of \texttt{/home/oski/Desktop}, because that's where we now are:

\begin{verbatim}
$ pwd

/home/oski/Desktop
\end{verbatim}

We now know how to go down the directory tree: how do we go up? We could use an absolute path:

\begin{verbatim}
$ cd /home/oski/
\end{verbatim}

but it's almost always simpler to use \texttt{cd\ ..} to go up one level:

\begin{verbatim}
$ pwd

/home/oski/Desktop

$ cd ..
\end{verbatim}

\texttt{..} is a special directory name meaning ``the directory containing this one'',
or more succinctly, the \textbf{parent} of the current directory. Sure enough, if we run \texttt{pwd} after running \texttt{cd\ ..}, we're back in \texttt{/home/oski/}:

\begin{verbatim}
$ pwd

/home/oski/
\end{verbatim}

The special directory \texttt{..} doesn't usually show up when we run \texttt{ls}. If we want to display it, we can give \texttt{ls} the \texttt{-a} flag:

\begin{verbatim}
$ ls -a

.       .localized  Shared
..      Guest       rachel
\end{verbatim}

\texttt{-a} stands for ``show all''; it forces \texttt{ls} to show us file and directory names that begin with \texttt{.}, such as \texttt{..}.

\begin{quote}
\hypertarget{hidden-files-for-your-own-protection}{%
\subsubsection{Hidden Files: For Your Own Protection}\label{hidden-files-for-your-own-protection}}

As you can see, a bunch of other items just appeared when we enter \texttt{ls\ -a}.
These files and directories begin with \texttt{.} followed by a name. These are
usually files and directories that hold important programmatic information,
not usually edited by the casual computer user. They are kept hidden so that
users don't accidentally delete or edit them without knowing what they're
doing.
\end{quote}

As you can see, it also displays another special directory that's just called \texttt{.}, which means ``the current working directory''. It may seem redundant to have a name for it, but we'll see some uses for it soon.

\begin{quote}
\hypertarget{phone-home}{%
\subsubsection{Phone Home}\label{phone-home}}

If you ever want to get to the home directory immediately, you can use the
shortcut \texttt{\textasciitilde{}}. For example, type \texttt{cd\ \textasciitilde{}} and you'll get back home in a jiffy.
\texttt{\textasciitilde{}} will also stand in for your home directory in paths, so for instance
\texttt{\textasciitilde{}/Desktop} is the same as \texttt{/home/oski/Desktop}. This only works if it is
the first character in the path: \texttt{here/there/\textasciitilde{}/elsewhere} is not
\texttt{/home/oski/elsewhere}.
\end{quote}

\hypertarget{tab-completion}{%
\paragraph{Tab completion}\label{tab-completion}}

If you are in your home directory, you can see what files you have on your \texttt{Desktop} using the command:

\begin{verbatim}
$ ls ~/Desktop
\end{verbatim}

This is a lot to type, but you can let the shell do most of the work. If you type:

\begin{verbatim}
$ ls ~/Des
\end{verbatim}

and then press tab, the shell automatically completes the directory name for you:

\begin{verbatim}
$ ls ~/Desktop
\end{verbatim}

Pressing tab again does nothing, since there are multiple possibilities. Pressing tab twice brings up a list of all the files and directories, and so on.

This is called \textbf{tab completion}, and we will see it in many other tools as we go on.

\begin{quote}
\hypertarget{quick-file-paths}{%
\subsubsection{Quick File Paths}\label{quick-file-paths}}

If you quickly need the path of a file or directory, you can also copy the
file/directory in the GUI and paste. The full path of the file or directory
will appear.
\end{quote}

\hypertarget{writing-your-first-shell-script}{%
\paragraph{Writing your first shell script}\label{writing-your-first-shell-script}}

Write a shell script that creates a directory called \texttt{pdfs} under the \texttt{Downloads} directory, then finds PDF files in \texttt{Downloads} and copies those files to \texttt{pdfs}. This shell script creates a backup.

\begin{Shaded}
\begin{Highlighting}[]

\CommentTok{\#!/bin/sh}

\FunctionTok{mkdir}\NormalTok{ /home/jae/Downloads/pdfs }

\BuiltInTok{cd}\NormalTok{ /home/jae/Downloads}

\FunctionTok{cp}\NormalTok{ *.pdf pdfs/ }

\BuiltInTok{echo} \StringTok{"Copied pdfs"}
\end{Highlighting}
\end{Shaded}

\hypertarget{git-and-github}{%
\subsubsection{Git and GitHub}\label{git-and-github}}

\hypertarget{version-control-system}{%
\paragraph{Version control system}\label{version-control-system}}

\begin{figure}
\centering
\includegraphics{https://i2.wp.com/cdn-images-1.medium.com/max/399/1*7HHA_UkjUK7wp7qP4CYu1g.png?zoom=1.75\&w=456\&ssl=1}
\caption{Why you should do version control}
\end{figure}

According to \href{https://guides.github.com}{GitHub Guides}, a version control system ``tracks the history of changes as people and teams collaborate on projects together''. Specifically, it helps to track the following information:

\begin{itemize}
\tightlist
\item
  Which changes were made?
\item
  Who made the changes?
\item
  When were the changes made?
\item
  Why were changes needed?
\end{itemize}

Git is an example of a \href{https://en.wikipedia.org/wiki/Distributed_version_control}{distributed version control system}, common in open source and commercial software development. This is not surprising given that Git \href{https://lkml.org/lkml/2005/4/6/121}{was originally created} to deal with Linux kernel development.

\begin{itemize}
\tightlist
\item
  If you're curious about how the Internet works, learn one of the key ideas of the Internet: \href{https://en.wikipedia.org/wiki/End-to-end_principle}{end-to-end principle}. This also explains why \href{https://en.wikipedia.org/wiki/Net_neutrality}{net neutrality} matters.
\end{itemize}

The following images, from \href{https://git-scm.com}{Pro Git}, show how a centralized VCS (e.g., CVS, Subversion, and Perforce) and a decentralized VCS (e.g., Git, Mercurial, Bazaar, or Darcs) work differently.

\begin{figure}
\centering
\includegraphics{https://git-scm.com/book/en/v2/images/centralized.png}
\caption{Centralized version control system}
\end{figure}

Figure 2. Centralized VCS.

\begin{figure}
\centering
\includegraphics{https://git-scm.com/book/en/v2/images/distributed.png}
\caption{Decentralized version control system}
\end{figure}

Figure 3. Decentralized VCS.

For more information on the varieties of version control systems, please read \href{https://pdfs.semanticscholar.org/4490/4c70bc91e1bed4fe02b9e2282f031b7c90ea.pdf}{Petr Baudis's review} on that subject.

\begin{figure}
\centering
\includegraphics{https://plain-text.co/figures/git-basic.png}
\caption{Figure 2.1. A schematic git workflow from Healy's ``The Plain Person's Guide to Plain Text Social Science''}
\end{figure}

\hypertarget{setup-1}{%
\paragraph{Setup}\label{setup-1}}

We'll start with telling Git who you are.

\begin{verbatim}
$ git config --global user.name "Firstname Lastname"
$ git config --global user.email username@company.extension
\end{verbatim}

\hypertarget{making-a-repository}{%
\paragraph{Making a repository}\label{making-a-repository}}

Create a new directory and move to it.

\begin{verbatim}
$ mkdir code_exercise 
$ cd code_exercise 
\end{verbatim}

\begin{verbatim}
$ git init 
\end{verbatim}

Alternatively, you can create a Git repository via Github and then clone it on your local machine.

\begin{verbatim}
$ git clone /path/to/repository
\end{verbatim}

If you're unfamiliar with basic Git commands, then please refer to \href{http://rogerdudler.github.io/git-guide/files/git_cheat_sheet.pdf}{this Git cheat sheet}.

\hypertarget{commit-changes}{%
\paragraph{Commit changes}\label{commit-changes}}

These features show how Git works as a version control system.

If you edited files or added new ones, then you need to update your repository. In Git terms, this action is called committing changes.

\begin{verbatim}
$ git add . # update every change. In Git terms, you're staging. 
$ git add file_name # or stage a specific file.
$ git commit -m "your comment" # your comment for the commit. 
$ git push origin master # push the commits to the remote. Origin is the default name Git gives to the remote server.
\end{verbatim}

Another image from \href{https://git-scm.com/about/staging-area}{Pro Git} well illustrates this process.

\begin{figure}
\centering
\includegraphics{https://git-scm.com/images/about/index1@2x.png}
\caption{Git Workflow}
\end{figure}

\hypertarget{other-useful-commands-for-tracking-history}{%
\paragraph{Other useful commands for tracking history}\label{other-useful-commands-for-tracking-history}}

\begin{verbatim}
$ git diff # to see what changed (e.g., inside a file)
$ git log # to track who committed what
$ git checkout the commit hash (e.g., a5e556) file name (fruit_list.txt) # to recover old files 
$ git revert 1q84 # revert to the previous commit 
\end{verbatim}

\hypertarget{doing-other-than-adding}{%
\paragraph{Doing other than adding}\label{doing-other-than-adding}}

\begin{verbatim}
$ git rm file_name # remove 
$ git mv old_file_name new_file_name # rename a file 
\end{verbatim}

\hypertarget{push-and-pull-or-fetch}{%
\paragraph{Push and pull (or fetch)}\label{push-and-pull-or-fetch}}

These features show how Git works as a collaboration tool.

If you have not already done so, let's clone the PS239T repository on your local machine.

\begin{verbatim}
$ git clone https://github.com/jaeyk/PS239T # clone 
\end{verbatim}

Then, let's learn more about the repository.

\begin{verbatim}
$ git remote -v 
\end{verbatim}

Previously, we learned how to send the data saved on your local machine to the remote (the GitHub server). You can do that by editing or creating files, committing, and then typing \textbf{git push}.

Instead, if you want to update your local data with the remote data, then you can type \textbf{git pull origin} (something like pwd in bash). When you do that, Git retrieves the data and merges it into your local data. Alternatively, you can use fetch, which only retrieves the data from a remote without merging it.

\begin{verbatim}
$ git fetch origin
\end{verbatim}

\hypertarget{branching}{%
\paragraph{Branching}\label{branching}}

It's an advanced feature of Git's version control system that allows developers to ``diverge from the main line of development and continue to do work without messing with that main line'' according to \href{https://git-scm.com/book/en/v1/Git-Branching}{Scott Chacon and Ben Straub}.

If you start working on a new feature, then create a new branch.

\begin{verbatim}
$ git branch new_features
$ git checkout new_features
\end{verbatim}

You can see the newly created branch by typing \textbf{git branch}.

In short, branching makes Git \href{https://git-scm.com/book/en/v2/Getting-Started-Git-Basics}{work like} a mini file system.

\hypertarget{collaborations}{%
\paragraph{Collaborations}\label{collaborations}}

Two options.

\begin{itemize}
\tightlist
\item
  Sharing a repository (suitable for a private project).
\item
  Fork and pull (suitable for an open source project).

  \begin{itemize}
  \tightlist
  \item
    The one who maintains the repository becomes the maintainer.
  \item
    The others can \href{https://help.github.com/articles/about-forks/}{fork}, make changes, and even \href{https://help.github.com/articles/about-pull-requests/}{pull} them back.
  \end{itemize}
\end{itemize}

\hypertarget{other-stuff}{%
\paragraph{Other stuff}\label{other-stuff}}

\begin{verbatim}
$ git status # show the status of changes 
$ git branch # show the branch being worked on locally
$ git merge # merge branches 
$ git reset --hard # restore the pristine version
$ git commit -a -m "additional backup" # to save the state again
\end{verbatim}

\hypertarget{deployment-github-pages}{%
\paragraph{Deployment: GitHub Pages}\label{deployment-github-pages}}

\hypertarget{tracking-progress-github-issues}{%
\paragraph{Tracking progress: GitHub Issues}\label{tracking-progress-github-issues}}

\hypertarget{project-management-github-dashboards}{%
\paragraph{Project management: GitHub Dashboards}\label{project-management-github-dashboards}}

\hypertarget{writing-code-how-to-code-like-a-professional}{%
\section{Writing code: How to code like a professional}\label{writing-code-how-to-code-like-a-professional}}

\hypertarget{write-readable-code}{%
\subsection{Write readable code}\label{write-readable-code}}

\begin{itemize}
\tightlist
\item
  What is code style?
\end{itemize}

\begin{quote}
Every major open-source project has its own style guide: a set of conventions (sometimes arbitrary) about how to write code for that project. It is much easier to understand a large codebase when all the code in it is in a consistent style. - \href{https://google.github.io/styleguide/}{Google Style Guides}
\end{quote}

\begin{itemize}
\item
  How to avoid smelly code?

  \begin{itemize}
  \tightlist
  \item
    Check out \href{https://github.com/jennybc/code-smells-and-feels\#readme}{the code-smells Git repository} by Jenny Bryan.
  \end{itemize}
\end{itemize}

\begin{itemize}
\item
  Naming matters

  \begin{itemize}
  \tightlist
  \item
    When naming files, remember the following three rules:

    \begin{itemize}
    \tightlist
    \item
      Machine readable (avoid spaces, punctuation, periods, and any other special characters except \_ and -)
    \item
      Human readable (should be meaningful. No text1, image1, etc.,)
    \item
      Ordering (e.g., 01, 02, 03, \ldots{} )
    \end{itemize}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\NormalTok{fit\_models.R}

\CommentTok{\# Bad}
\NormalTok{fit models.R}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  When naming objects:

  \begin{itemize}
  \tightlist
  \item
    Don't use special characters.
  \item
    Don't capitalize.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good }
\NormalTok{day\_one}
    
\CommentTok{\# Bad }
\NormalTok{DayOne}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  When naming functions:

  \begin{itemize}
  \tightlist
  \item
    Don't use special characters.
  \item
    Don't capitalize.
  \item
    Use \texttt{verbs} instead of \texttt{nouns}. (Functions do something!)
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good }
\NormalTok{run\_rdd }

\CommentTok{\# Bad }
\NormalTok{rdd}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Spacing
\end{itemize}

Some people do spacing by pressing the Tab key and others do it by pressing the Space key multiple times (and this is a serious subject).

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\NormalTok{x[, }\DecValTok{1}\NormalTok{] }

\KeywordTok{mean}\NormalTok{(x, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{) }

\CommentTok{\# Bad}

\NormalTok{x[,}\DecValTok{1}\NormalTok{]}

\KeywordTok{mean}\NormalTok{ (x, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Indenting
\end{itemize}

Indent at least 4 spaces. Note that some people, including none other than \href{https://simplystatistics.org/2018/07/27/why-i-indent-my-code-8-spaces/}{Roger Peng}, indent 8 spaces. The below example shows how you can change the default indentation setting using RStudio configuration.

\begin{figure}
\centering
\includegraphics{https://pbs.twimg.com/media/CuHHs7yXgAAFWeh?format=jpg\&name=360x360}
\caption{Roger Peng's tweet}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\ControlFlowTok{if}\NormalTok{ (y }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{}
  \KeywordTok{message}\NormalTok{(}\StringTok{"y is negative"}\NormalTok{)}
\NormalTok{\}}

\CommentTok{\# Bad}
\ControlFlowTok{if}\NormalTok{ (y }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{}
\KeywordTok{message}\NormalTok{(}\StringTok{"Y is negative"}\NormalTok{)\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Long lines
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\KeywordTok{do\_something\_very\_complicated}\NormalTok{(}
  \DataTypeTok{something =} \StringTok{"that"}\NormalTok{,}
  \DataTypeTok{requires =}\NormalTok{ many,}
  \DataTypeTok{arguments =} \StringTok{"some of which may be long"}
\NormalTok{)}

\CommentTok{\# Bad}
\KeywordTok{do\_something\_very\_complicated}\NormalTok{(}\StringTok{"that"}\NormalTok{, }\DataTypeTok{requires =}\NormalTok{ many, }\DataTypeTok{arguments =}
                              \StringTok{"some of which may be long"}
\NormalTok{                              )}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Comments

  \begin{itemize}
  \tightlist
  \item
    Use comments to explain your decisions.
  \item
    But, show your code; Do not try to explain your code by comments.
  \item
    Also, try to comment out rather than delete the code that you experiment with.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Average sleep hours of Jae}
\NormalTok{jae }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# By week}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(week) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Mean sleep hours }
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{week\_sleep =} \KeywordTok{mean}\NormalTok{(sleep, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Pipes (chaining commands)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\NormalTok{iris }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(Species) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize\_if}\NormalTok{(is.numeric, mean) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ungroup}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(measure, value, }\OperatorTok{{-}}\NormalTok{Species) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(value)}

\CommentTok{\# Bad}
\NormalTok{iris }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{group\_by}\NormalTok{(Species) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{summarize\_all}\NormalTok{(mean) }\OperatorTok{\%\textgreater{}\%}
\NormalTok{ungroup }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{gather}\NormalTok{(measure, value, }\OperatorTok{{-}}\NormalTok{Species) }\OperatorTok{\%\textgreater{}\%}
\KeywordTok{arrange}\NormalTok{(value)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Additional tips
\item
  Use \texttt{lintr} to check whether your code complies with a recommended style guideline (e.g., \texttt{tidyverse}) and \texttt{styler} package to format your code according to the style guideline.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://camo.githubusercontent.com/6cb80270269165a8d3046d2da03cbf2b8f19ee2f/687474703a2f2f692e696d6775722e636f6d2f61635632374e562e676966}
\caption{how lintr works}
\end{figure}

\hypertarget{write-reusable-code}{%
\subsection{Write reusable code}\label{write-reusable-code}}

\begin{itemize}
\tightlist
\item
  Pasting
\end{itemize}

\begin{quote}
Copy-and-paste programming, sometimes referred to as just pasting, is the production of highly repetitive computer programming code, as produced by copy and paste operations. It is primarily a pejorative term; those who use the term are often implying a lack of programming competence. It may also be the result of technology limitations (e.g., an insufficiently expressive development environment) as subroutines or libraries would normally be used instead. However, there are occasions when copy-and-paste programming is considered acceptable or necessary, such as for boilerplate, loop unrolling (when not supported automatically by the compiler), or certain programming idioms, and it is supported by some source code editors in the form of snippets. - \href{https://en.wikipedia.org/wiki/Copy-and-paste_programming}{Wikipedia}
\end{quote}

\begin{itemize}
\item
  It's okay to copy and paste on the first attempt to solve a problem. But if you copy and paste three times (a.k.a. the \href{https://en.wikipedia.org/wiki/Rule_of_three_(computer_programming)}{Rule of Three} in programming), something's wrong. You're working too hard. You need to be lazy. What do I mean and how can you do that?
\item
  The following exercise was inspired by \href{http://adv-r.had.co.nz/Functional-programming.html}{Wickham's example}.
\item
  Let's imagine \texttt{df} is a survey dataset.

  \begin{itemize}
  \item
    \texttt{a,\ b,\ c,\ d} = Survey questions
  \item
    \texttt{-99}: non-responses
  \item
    Your goal: replace \texttt{-99} with \texttt{NA}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Data}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility }

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}\StringTok{"a"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{),}
             \StringTok{"b"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{),}
             \StringTok{"c"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{),}
             \StringTok{"d"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Copy and paste }
\NormalTok{df}\OperatorTok{$}\NormalTok{a[df}\OperatorTok{$}\NormalTok{a }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{b[df}\OperatorTok{$}\NormalTok{b }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{c[df}\OperatorTok{$}\NormalTok{c }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{d[df}\OperatorTok{$}\NormalTok{d }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Using a function

  \begin{itemize}
  \tightlist
  \item
    function: input + computation + output
  \item
    If you write a function, you gain efficiency because you don't need to copy and paste the computation part.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a custom function}
\NormalTok{fix\_missing \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x) \{ }\CommentTok{\# INPUT}
\NormalTok{  x[x }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA} \CommentTok{\# COMPUTATION}
\NormalTok{  x }\CommentTok{\# OUTPUT }
\NormalTok{\}}

\CommentTok{\# Apply the function to each column (vector)}
\CommentTok{\# This iterated part can and should be automated.}
\NormalTok{df}\OperatorTok{$}\NormalTok{a \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{a)}
\NormalTok{df}\OperatorTok{$}\NormalTok{b \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{b)}
\NormalTok{df}\OperatorTok{$}\NormalTok{c \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{c)}
\NormalTok{df}\OperatorTok{$}\NormalTok{d \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{d)}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Automation

  \begin{itemize}
  \tightlist
  \item
    Many options for automation in R: \texttt{for\ loop}, \texttt{apply} family, etc.
  \item
    Here's a tidy solution that comes from the \texttt{purrr} package.
  \item
    The power and joy of one-liner.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map\_df}\NormalTok{(df, fix\_missing) }\CommentTok{\# What is this magic? We will unpack the blackbox (\textasciigrave{}map\_df()\textasciigrave{}) later.}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Takeaways
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Your code becomes more reusable, when it's easier to \textbf{change, debug, and scale up}. Don't repeat yourself and embrace the power of lazy programming.
\end{enumerate}

\begin{quote}
Lazy, because only lazy programmers will want to write the kind of tools that might replace them in the end. Lazy, because only a lazy programmer will avoid writing monotonous, repetitive code---thus avoiding redundancy, the enemy of software maintenance and flexible refactoring. Mostly, the tools and processes that come out of this endeavor fired by laziness will speed up the production. - \href{http://blogoscoped.com/archive/2005-08-24-n14.html}{Philipp Lenssen}
\end{quote}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Only when your code becomes \textbf{reusable} will you become \textbf{efficient} in your data work. Otherwise, you need to start from scratch or copy and paste when you work on a new project.
\end{enumerate}

\begin{quote}
Code reuse aims to save time and resources and reduce redundancy by taking advantage of assets that have already been created in some form within the software product development process. The key idea in reuse is that parts of a computer program written at one time can be or should be used in the construction of other programs written at a later time. - Wikipedia
\end{quote}

\hypertarget{test-your-code-systematically}{%
\subsection{Test your code systematically}\label{test-your-code-systematically}}

\hypertarget{asking-questions-minimal-reproducible-example}{%
\section{Asking questions: Minimal reproducible example}\label{asking-questions-minimal-reproducible-example}}

\hypertarget{how-to-create-a-minimal-reproducible-example}{%
\subsection{How to create a minimal reproducible example}\label{how-to-create-a-minimal-reproducible-example}}

\begin{itemize}
\item
  Chances are you're going to use StackOverFlow a lot to solve a pressing problem you face. However, others can't understand or be interested in your problem unless you provide an example that they can understand with minimal effort. Such an example is called a minimal reproducible example.
\item
  Read \href{https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example}{this StackOverFlow post} to understand the concept and best practices.
\item
  Simply put, an MRE consists of the following items:

  \begin{itemize}
  \tightlist
  \item
    A minimal dataset
  \item
    The minimal runnable code
  \item
    The necessary information on package, R version, system (use \texttt{sessionInfo()})
  \item
    A seed for reproducibility (\texttt{set.seed()}), if you used a random process.
  \end{itemize}
\end{itemize}

\hypertarget{references}{%
\section{References}\label{references}}

\begin{itemize}
\item
  Project-oriented research

  \begin{itemize}
  \item
    Computational reproducibility

    \begin{itemize}
    \item
      \href{https://github.com/swcarpentry/good-enough-practices-in-scientific-computing/blob/gh-pages/good-enough-practices-for-scientific-computing.pdf}{``Good Enough Practices in Scientific Computing''} by PLOS
    \item
      \href{https://swcarpentry.github.io/r-novice-gapminder/02-project-intro/}{Project Management with RStudio} by Software Carpentry
    \item
      \href{https://kbroman.org/steps2rr/}{Initial steps toward reproducible research} by Karl Broman
    \end{itemize}
  \item
    Version control

    \begin{itemize}
    \item
      \href{https://swcarpentry.github.io/git-novice/}{Version Control with Git} by Software Carpentry
    \item
      \href{http://plain-text.co/}{The Plain Person's Guide to Plain Text Social Science} by Kieran Healy
    \end{itemize}
  \end{itemize}
\item
  Writing code

  \begin{itemize}
  \tightlist
  \item
    Style guides

    \begin{itemize}
    \tightlist
    \item
      R

      \begin{itemize}
      \tightlist
      \item
        \href{https://google.github.io/styleguide/Rguide.xml}{Google's R style guide}
      \item
        \href{http://r-pkgs.had.co.nz/r.html}{R code style guide} by Hadley Wickham
      \item
        \href{http://style.tidyverse.org/}{The tidyverse style guide} by Hadley Wickham
      \end{itemize}
    \item
      Python

      \begin{itemize}
      \tightlist
      \item
        \href{https://github.com/google/styleguide/blob/gh-pages/pyguide.md}{Google Python Style Guide}
      \item
        \href{https://docs.python-guide.org/writing/style/\#zen-of-python}{Code Style} by the Hitchhiker's Guide to Python
      \end{itemize}
    \end{itemize}
  \end{itemize}
\end{itemize}

\hypertarget{tidy_data}{%
\chapter{Tidy data and its friends}\label{tidy_data}}

\hypertarget{setup-2}{%
\section{Setup}\label{setup-2}}

\begin{itemize}
\tightlist
\item
  Check your \texttt{dplyr} package is up-to-date by typing \texttt{packageVersion("dplyr")}. If the current installed version is less than 1.0, then update by typing \texttt{update.packages("dplyr")}. You may need to restart R to make it work.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ifelse}\NormalTok{(}\KeywordTok{packageVersion}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{) }\OperatorTok{\textgreater{}=}\StringTok{ }\DecValTok{1}\NormalTok{,}
  \StringTok{"The installed version of dplyr package is greater than or equal to 1.0.0"}\NormalTok{, }\KeywordTok{update.packages}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "The installed version of dplyr package is greater than or equal to 1.0.0"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  tidyverse, }\CommentTok{\# for the tidyverse framework}
\NormalTok{  here, }\CommentTok{\# for computational reproducibility}
\NormalTok{  gapminder, }\CommentTok{\# toy data}
\NormalTok{  nycflights13, }\CommentTok{\# for exercise}
\NormalTok{  ggthemes, }\CommentTok{\# additional themes}
\NormalTok{  ggrepel, }\CommentTok{\# repelling overlapping text labels}
\NormalTok{  patchwork, }\CommentTok{\# arranging ggplots}
\NormalTok{  broom, }\CommentTok{\# tidying model outputs}
\NormalTok{  waldo }\CommentTok{\# side{-}by{-}side code comparison}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

The rest of the chapter follows the basic structure in \href{https://rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf}{the Data Wrangling Cheat Sheet} created by RStudio.

\hypertarget{r-data-structures}{%
\section{R Data structures}\label{r-data-structures}}

To make the best use of the R language, you'll need a strong understanding of the basic data types and data structures and how to operate on those. R is an \textbf{object-oriented} language, so the importance of this cannot be overstated.

It is \textbf{critical} to understand because these are the objects you will manipulate on a day-to-day basis in R, and they are not always as easy to work with as they sound at the outset. Dealing with object conversions is one of the most common sources of frustration for beginners.

\begin{quote}
To understand computations in R, two slogans are helpful:

\begin{itemize}
\tightlist
\item
  Everything that exists is an object.
\item
  Everything that happens is a function call.
\end{itemize}
\end{quote}

\begin{quote}
--- \textbf{John Chambers}, the creator of S (the mother of R)
\end{quote}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  \protect\hyperlink{main-classes}{Main Classes} introduces you to R's one-dimensional or atomic classes and data structures. R has five basic atomic classes: logical, integer, numeric, complex, character. Social scientists don't use complex class. (Also, remember that we rarely use trigonometry.)
\item
  \protect\hyperlink{attributes}{Attributes} takes a small detour to discuss attributes, R's flexible metadata specification. Here you'll learn about factors, an important data structure created by setting attributes of an atomic vector. R has many data structures: vector, list, matrix, data frame, factors, tables.
\end{enumerate}

\hypertarget{d-data-vectors}{%
\section{1D data: Vectors}\label{d-data-vectors}}

\hypertarget{atomic-classes}{%
\subsection{Atomic classes}\label{atomic-classes}}

\texttt{R}'s main atomic classes are:

\begin{itemize}
\tightlist
\item
  character (or a ``string'' in Python and Stata)
\item
  numeric (integer or float)
\item
  integer (just integer)
\item
  logical (booleans)
\end{itemize}

\begin{longtable}[]{@{}ll@{}}
\toprule
Example & Type\tabularnewline
\midrule
\endhead
``a'', ``swc'' & character\tabularnewline
2, 15.5 & numeric\tabularnewline
2L (must add an \texttt{L} at the end to denote an integer) & integer\tabularnewline
\texttt{TRUE}, \texttt{FALSE} & logical\tabularnewline
\bottomrule
\end{longtable}

Like Python, R is dynamically typed. There are a few differences in terminology, however, that are pertinent.

\begin{itemize}
\tightlist
\item
  First, ``types'' in Python are referred to as ``classes'' in R.
\end{itemize}

What is a class?

\begin{figure}
\centering
\includegraphics{https://ds055uzetaobb.cloudfront.net/brioche/uploads/pJZt3mh3Ht-prettycars.png?width=2400}
\caption{from \url{https://brilliant.org/}}
\end{figure}

\begin{itemize}
\tightlist
\item
  Second, R has some different names for the types string, integer, and float --- specifically \textbf{character}, \textbf{integer} (not different), and \textbf{numeric}. Because there is no ``float'' class in R, users tend to default to the ``numeric'' class when they want to work with numerical data.
\end{itemize}

The function for recovering object classes is \texttt{class()}. Use the \texttt{L} suffix to qualify any number with the intent of making it an explicit integer. See more from the \href{https://cran.r-project.org/doc/manuals/R-lang.html}{R language definition}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(}\DecValTok{3}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(3L)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "integer"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(}\StringTok{"Three"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "character"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(F)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "logical"
\end{verbatim}

\hypertarget{data-structures}{%
\subsection{Data structures}\label{data-structures}}

R's base data structures can be organized by their dimensionality (1d, 2d, or nd) and whether they're homogeneous (all contents must be of the same type) or heterogeneous (the contents can be of different types). This gives rise to the five data types most often used in data analysis:

\begin{longtable}[]{@{}lll@{}}
\toprule
& Homogeneous & Heterogeneous\tabularnewline
\midrule
\endhead
1d & Atomic vector & List\tabularnewline
2d & Matrix & Data frame\tabularnewline
nd & Array &\tabularnewline
\bottomrule
\end{longtable}

Each data structure has its own specifications and behavior. For our purposes, an important thing to remember is that R is always \textbf{faster} (more efficient) working with homogeneous (\textbf{vectorized}) data.

\hypertarget{vector-properties}{%
\subsubsection{Vector properties}\label{vector-properties}}

Vectors have three common properties:

\begin{itemize}
\tightlist
\item
  Class, \texttt{class()}, or what type of object it is (same as \texttt{type()} in Python).
\item
  Length, \texttt{length()}, how many elements it contains (same as \texttt{len()} in Python).
\item
  Attributes, \texttt{attributes()}, additional arbitrary metadata.
\end{itemize}

They differ in the types of their elements: all elements of an atomic vector must be the same type, whereas the elements of a list can have different types.

\hypertarget{creating-different-types-of-atomic-vectors}{%
\subsubsection{Creating different types of atomic vectors}\label{creating-different-types-of-atomic-vectors}}

Remember, there are four common types of vectors:

\begin{itemize}
\tightlist
\item
  \texttt{logical}
\item
  \texttt{integer}
\item
  \texttt{numeric} (same as \texttt{double})
\item
  \texttt{character}
\end{itemize}

You can create an empty vector with \texttt{vector()} (By default the mode is \texttt{logical}. You can be more explicit as shown in the examples below.) It is more common to use direct constructors such as \texttt{character()}, \texttt{numeric()}, etc.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{vector}\NormalTok{()}

\CommentTok{\# with a length and type}
\KeywordTok{vector}\NormalTok{(}\StringTok{"character"}\NormalTok{, }\DataTypeTok{length =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "" "" "" "" "" "" "" "" "" ""
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\# character vector of length 5}
\KeywordTok{character}\NormalTok{(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "" "" "" "" ""
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{numeric}\NormalTok{(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0 0 0 0 0
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{logical}\NormalTok{(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE FALSE FALSE FALSE FALSE
\end{verbatim}

Atomic vectors are usually created with \texttt{c()}, which is short for concatenate:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{length}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\texttt{x} is a numeric vector. These are the most common kind. You can also have logical vectors.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\OtherTok{TRUE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{)}

\NormalTok{y}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1]  TRUE  TRUE FALSE FALSE
\end{verbatim}

Finally you can have character vectors:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{kim\_family \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Jae"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{)}

\KeywordTok{is.integer}\NormalTok{(kim\_family) }\CommentTok{\# integer?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.character}\NormalTok{(kim\_family) }\CommentTok{\# character?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.atomic}\NormalTok{(kim\_family) }\CommentTok{\# atomic?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{typeof}\NormalTok{(kim\_family) }\CommentTok{\# what\textquotesingle{}s the type?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "character"
\end{verbatim}

\textbf{Short exercise: Create and examine your vector}

Create a character vector called \texttt{fruit} that contains 4 of your favorite fruits. Then evaluate its structure using the commands below.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# First create your fruit vector}
\CommentTok{\# YOUR CODE HERE}
\NormalTok{fruit \textless{}{-}}

\StringTok{  }\CommentTok{\# Examine your vector}
\StringTok{  }\KeywordTok{length}\NormalTok{(fruit)}
\KeywordTok{class}\NormalTok{(fruit)}
\KeywordTok{str}\NormalTok{(fruit)}
\end{Highlighting}
\end{Shaded}

\textbf{Add elements}

You can add elements to the end of a vector by passing the original vector into the \texttt{c} function, like so:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{z \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Beyonce"}\NormalTok{, }\StringTok{"Kelly"}\NormalTok{, }\StringTok{"Michelle"}\NormalTok{, }\StringTok{"LeToya"}\NormalTok{)}

\NormalTok{z \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(z, }\StringTok{"Farrah"}\NormalTok{)}

\NormalTok{z}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Beyonce"  "Kelly"    "Michelle" "LeToya"   "Farrah"
\end{verbatim}

More examples of vectors:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{0.5}\NormalTok{, }\FloatTok{0.7}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{, }\StringTok{"d"}\NormalTok{, }\StringTok{"e"}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{9}\OperatorTok{:}\DecValTok{100}
\end{Highlighting}
\end{Shaded}

You can also create vectors as a sequence of numbers:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{series \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\NormalTok{series}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1  2  3  4  5  6  7  8  9 10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{seq}\NormalTok{(}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1  2  3  4  5  6  7  8  9 10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{seq}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DataTypeTok{by =} \FloatTok{0.1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1.0  1.1  1.2  1.3  1.4  1.5  1.6  1.7  1.8  1.9  2.0  2.1  2.2  2.3  2.4
## [16]  2.5  2.6  2.7  2.8  2.9  3.0  3.1  3.2  3.3  3.4  3.5  3.6  3.7  3.8  3.9
## [31]  4.0  4.1  4.2  4.3  4.4  4.5  4.6  4.7  4.8  4.9  5.0  5.1  5.2  5.3  5.4
## [46]  5.5  5.6  5.7  5.8  5.9  6.0  6.1  6.2  6.3  6.4  6.5  6.6  6.7  6.8  6.9
## [61]  7.0  7.1  7.2  7.3  7.4  7.5  7.6  7.7  7.8  7.9  8.0  8.1  8.2  8.3  8.4
## [76]  8.5  8.6  8.7  8.8  8.9  9.0  9.1  9.2  9.3  9.4  9.5  9.6  9.7  9.8  9.9
## [91] 10.0
\end{verbatim}

Atomic vectors are always flat, even if you nest \texttt{c()}'s:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# the same as}
\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4
\end{verbatim}

\textbf{Types and Tests}

Given a vector, you can determine its class with \texttt{class}, or check if it's a specific type with an ``is'' function: \texttt{is.character()}, \texttt{is.numeric()}, \texttt{is.integer()}, \texttt{is.logical()}, or, more generally, \texttt{is.atomic()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{char\_var \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"harry"}\NormalTok{, }\StringTok{"sally"}\NormalTok{)}

\KeywordTok{class}\NormalTok{(char\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "character"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.character}\NormalTok{(char\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.atomic}\NormalTok{(char\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{num\_var \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\FloatTok{2.5}\NormalTok{, }\FloatTok{4.5}\NormalTok{)}

\KeywordTok{class}\NormalTok{(num\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.numeric}\NormalTok{(num\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.atomic}\NormalTok{(num\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

NB: \texttt{is.vector()} does not test if an object is a vector. Instead it returns \texttt{TRUE} only if the object is a vector with no attributes apart from names. Use \texttt{is.atomic(x)\ \textbar{}\textbar{}\ is.list(x)} to test if an object is actually a vector.

\textbf{Coercion}

All elements of an atomic vector must be the same type, so when you attempt to combine different types they will be \textbf{coerced} to the most flexible type. Types from least to most flexible are: logical, integer, double, and character.

For example, combining a character and an integer yields a character:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  chr [1:2] "a" "1"
\end{verbatim}

\textbf{Guess what the following do without running them first}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{c}\NormalTok{(}\FloatTok{1.7}\NormalTok{, }\StringTok{"a"}\NormalTok{)}

\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\DecValTok{2}\NormalTok{)}

\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Notice that when a logical vector is coerced to an integer or double, \texttt{TRUE} becomes 1 and \texttt{FALSE} becomes 0. This is very useful in conjunction with \texttt{sum()} and \texttt{mean()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{FALSE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{, }\OtherTok{TRUE}\NormalTok{)}

\KeywordTok{as.numeric}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0 0 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Total number of TRUEs}
\KeywordTok{sum}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Proportion that are TRUE}
\KeywordTok{mean}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0.3333333
\end{verbatim}

Coercion often happens automatically. This is called implicit coercion. Most mathematical functions (\texttt{+}, \texttt{log}, \texttt{abs}, etc.) will coerce to a numeric or integer, and most logical operations (\texttt{\&}, \texttt{\textbar{}}, \texttt{any}, etc) will coerce to a logical. You will usually get a warning message if the coercion might lose information.

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{1} \OperatorTok{\textless{}}\StringTok{ "2"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\StringTok{"1"} \OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

You can also coerce vectors explicitly with \texttt{as.character()}, \texttt{as.numeric()}, \texttt{as.integer()}, or \texttt{as.logical()}. Example:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{0}\OperatorTok{:}\DecValTok{6}

\KeywordTok{as.numeric}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0 1 2 3 4 5 6
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{as.logical}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{as.character}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "0" "1" "2" "3" "4" "5" "6"
\end{verbatim}

Sometimes coercions, especially nonsensical ones, won't work.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}

\KeywordTok{as.numeric}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: NAs introduced by coercion
\end{verbatim}

\begin{verbatim}
## [1] NA NA NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{as.logical}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] NA NA NA
\end{verbatim}

\textbf{Short Exercise}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 1. Create a vector of a sequence of numbers between 1 to 10.}

\CommentTok{\# 2. Coerce that vector into a character vector}

\CommentTok{\# 3. Add the element "11" to the end of the vector}

\CommentTok{\# 4. Coerce it back to a numeric vector.}
\end{Highlighting}
\end{Shaded}

\hypertarget{lists}{%
\subsubsection{Lists}\label{lists}}

Lists are also vectors, but they differ from atomic vectors because their elements can be of any type. In short, they are generic vectors. You construct lists by using \texttt{list()} instead of \texttt{c()}, as shown in the example below.

Lists are sometimes called recursive vectors, because a list can contain other lists. This makes them fundamentally different from atomic vectors.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\StringTok{"a"}\NormalTok{, }\OtherTok{TRUE}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{4}\NormalTok{, }\DecValTok{5}\NormalTok{, }\DecValTok{6}\NormalTok{))}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] 1
## 
## [[2]]
## [1] "a"
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] 4 5 6
\end{verbatim}

You can coerce other objects using \texttt{as.list()}. You can test for a list with \texttt{is.list()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{as.list}\NormalTok{(x)}

\KeywordTok{is.list}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{length}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 10
\end{verbatim}

\texttt{c()} will combine several lists into one. If given a combination of atomic vectors and lists, \texttt{c()} (con\textbf{c}atenate) will coerce the vectors to lists before combining them. Compare the results of \texttt{list()} and \texttt{c()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{))}

\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{))}

\KeywordTok{str}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## List of 2
##  $ :List of 2
##   ..$ : num 1
##   ..$ : num 2
##  $ : num [1:2] 3 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## List of 4
##  $ : num 1
##  $ : num 2
##  $ : num 3
##  $ : num 4
\end{verbatim}

You can turn a list into an atomic vector with \texttt{unlist()}. If the elements of a list have different types, \texttt{unlist()} uses the same coercion rules as \texttt{c()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{))}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [[1]][[1]]
## [1] 1
## 
## [[1]][[2]]
## [1] 2
## 
## 
## [[2]]
## [1] 3 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{unlist}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4
\end{verbatim}

Lists are used to build up many of the more complicated data structures in R. For example, both data frames and linear model objects (as produced by \texttt{lm()}) are lists:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.list}\NormalTok{(mtcars)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mod \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(mpg }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{wt, }\DataTypeTok{data =}\NormalTok{ mtcars)}

\KeywordTok{is.list}\NormalTok{(mod)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

For this reason, lists are extremely useful inside functions. You can ``staple'' together lots of different kinds of results into a single object that a function can return.

A list does not print to the console like a vector. Instead, each element of the list starts on a new line.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x.vec \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{x.list \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{x.vec}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x.list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3
\end{verbatim}

For lists, elements are \textbf{indexed by double brackets}. Single brackets will still return a(nother) list. (We'll talk more about subsetting and indexing in the fourth lesson.)

\textbf{Exercises}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  What are the four basic types of atomic vector? How does a list differ from an
  atomic vector?
\item
  Why is \texttt{1\ ==\ "1"} true? Why is \texttt{-1\ \textless{}\ FALSE} true? Why is \texttt{"one"\ \textless{}\ 2} false?
\item
  Create three vectors and then combine them into a list.
\item
  If \texttt{x} is a list, what is the class of \texttt{x{[}1{]}}? How about \texttt{x{[}{[}1{]}{]}}?
\end{enumerate}

\hypertarget{attributes}{%
\subsection{Attributes}\label{attributes}}

Attributes provide additional information about the data to you, the user, and to R. We've already seen the following three attributes in action:

\begin{itemize}
\item
  Names (\texttt{names(x)}), a character vector giving each element a name.
\item
  Dimensions (\texttt{dim(x)}), used to turn vectors into matrices.
\item
  Class (\texttt{class(x)}), used to implement the S3 object system.
\end{itemize}

\textbf{Additional tips}

In an object-oriented system, a \href{https://www.google.com/search?q=what+is+class+programming\&oq=what+is+class+programming\&aqs=chrome.0.0l6.3543j0j4\&sourceid=chrome\&ie=UTF-8}{class} (an extensible program-code template) defines a type of object: what its properties are, how it behaves, and how it relates to other types of objects. Therefore, technically, an object is an \href{https://en.wikipedia.org/wiki/Instance_(computer_science)}{instance} (or occurrence) of a class. A method is a function associated with a particular type of object.

\hypertarget{names}{%
\subsubsection{Names}\label{names}}

You can name a vector when you create it:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DataTypeTok{a =} \DecValTok{1}\NormalTok{, }\DataTypeTok{b =} \DecValTok{2}\NormalTok{, }\DataTypeTok{c =} \DecValTok{3}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

You can also add names by modifying an existing vector:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}

\KeywordTok{names}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## NULL
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{names}\NormalTok{(x) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"e"}\NormalTok{, }\StringTok{"f"}\NormalTok{, }\StringTok{"g"}\NormalTok{)}

\KeywordTok{names}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "e" "f" "g"
\end{verbatim}

Names don't have to be unique. However, character subsetting, described in the next lesson, is the most important reason to use names and it is most useful when the names are unique. (For Python users: when names are unique, a named vector behaves somewhat like a Python dictionary, with the names acting as keys.)

Not all elements of a vector need to have a name. If some names are missing, \texttt{names()} will return an empty string for those elements. If all names are missing, \texttt{names()} will return \texttt{NULL}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DataTypeTok{a =} \DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}

\KeywordTok{names}\NormalTok{(y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "a" ""  ""
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{z \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}

\KeywordTok{names}\NormalTok{(z)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## NULL
\end{verbatim}

You can create a new vector without names using \texttt{unname(x)}, or remove names in place with \texttt{names(x)\ \textless{}-\ NULL}.

\hypertarget{factors}{%
\subsubsection{Factors}\label{factors}}

Factors are special vectors that represent categorical data. Factors can be ordered (ordinal variable) or unordered (nominal or categorical variable) and are important for modeling functions such as \texttt{lm()} and \texttt{glm()} and also in plot methods.

\textbf{Quiz}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  If you want to enter dummy variables (Democrats = 1, Non-democrats = 0) in your regression model, should you use a numeric or a factor variable?
\end{enumerate}

Factors can only contain pre-defined values. Set allowed values using the \texttt{levels()} attribute. Note that a factor's levels will always be character values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{))}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] a b b a
## Levels: a b
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "factor"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{levels}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "a" "b"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# You can\textquotesingle{}t use values that are not in the levels}
\NormalTok{x[}\DecValTok{2}\NormalTok{] \textless{}{-}}\StringTok{ "c"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in `[<-.factor`(`*tmp*`, 2, value = "c"): invalid factor level, NA
## generated
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# NB: you can\textquotesingle{}t combine factors}
\KeywordTok{c}\NormalTok{(}\KeywordTok{factor}\NormalTok{(}\StringTok{"a"}\NormalTok{), }\KeywordTok{factor}\NormalTok{(}\StringTok{"b"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rep}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{5}\NormalTok{, }\KeywordTok{rep}\NormalTok{(}\DecValTok{6}\NormalTok{, }\DecValTok{5}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 5 5 5 5 5 5
\end{verbatim}

Factors are pretty much integers that have labels on them. Underneath, they're really numbers (1, 2, 3\ldots).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{))}

\KeywordTok{str}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  Factor w/ 2 levels "a","b": 1 2 2 1
\end{verbatim}

They are better than using simple integer labels because factors are what are called self-describing. For example, \texttt{democrat} and \texttt{republican} are more descriptive than \texttt{1}s and \texttt{2}s.

Factors are useful when you know the possible values a variable may take, even if you don't see all values in a given dataset. Using a factor instead of a character vector makes it obvious when some groups contain no observations:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{party\_char \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"democrat"}\NormalTok{, }\StringTok{"democrat"}\NormalTok{, }\StringTok{"democrat"}\NormalTok{)}

\NormalTok{party\_char}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "democrat" "democrat" "democrat"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{party\_factor \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(party\_char, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"democrat"}\NormalTok{, }\StringTok{"republican"}\NormalTok{))}

\NormalTok{party\_factor}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] democrat democrat democrat
## Levels: democrat republican
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{table}\NormalTok{(party\_char) }\CommentTok{\# shows only democrats}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## party_char
## democrat 
##        3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{table}\NormalTok{(party\_factor) }\CommentTok{\# shows republicans too}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## party_factor
##   democrat republican 
##          3          0
\end{verbatim}

Sometimes factors can be left unordered. Example: \texttt{democrat}, \texttt{republican}.

Other times you might want factors to be ordered (or ranked). Example: \texttt{low}, \texttt{medium}, \texttt{high}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"low"}\NormalTok{, }\StringTok{"medium"}\NormalTok{, }\StringTok{"high"}\NormalTok{))}

\KeywordTok{str}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  Factor w/ 3 levels "high","low","medium": 2 3 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.ordered}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{ordered}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"low"}\NormalTok{, }\StringTok{"medium"}\NormalTok{, }\StringTok{"high"}\NormalTok{), }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"high"}\NormalTok{, }\StringTok{"medium"}\NormalTok{, }\StringTok{"low"}\NormalTok{))}

\KeywordTok{is.ordered}\NormalTok{(y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

While factors look (and often behave) like character vectors, they are actually integers. Be careful when treating them like strings. Some string methods (like \texttt{gsub()} and \texttt{grepl()}) will coerce factors to strings, while others (like \texttt{nchar()}) will throw an error, and still others (like \texttt{c()}) will use the underlying integer values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{)}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "a" "b" "b" "a"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.factor}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{as.factor}\NormalTok{(x)}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] a b b a
## Levels: a b
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{c}\NormalTok{(x, }\StringTok{"c"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "1" "2" "2" "1" "c"
\end{verbatim}

For this reason, it's usually best to explicitly convert factors to character vectors if you need string-like behavior. In early versions of R, there was a memory advantage to using factors instead of character vectors, but this is no longer the case.

Unfortunately, most data loading functions in R automatically convert character vectors to factors. This is suboptimal, because there's no way for those functions to know the set of all possible levels or their optimal order. If this becomes a problem, use the argument \texttt{stringsAsFactors\ =\ FALSE} to suppress this behavior, and then manually convert character vectors to factors using your knowledge of the data.

\textbf{More attributes}

All R objects can have arbitrary additional attributes, used to store metadata about the object. Attributes can be thought of as a named list (with unique names). Attributes can be accessed individually with \texttt{attr()} or all at once (as a list) with \texttt{attributes()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\KeywordTok{attr}\NormalTok{(y, }\StringTok{"my\_attribute"}\NormalTok{) \textless{}{-}}\StringTok{ "This is a vector"}

\KeywordTok{attr}\NormalTok{(y, }\StringTok{"my\_attribute"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "This is a vector"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# str returns a new object with modified information}
\KeywordTok{str}\NormalTok{(}\KeywordTok{attributes}\NormalTok{(y))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## List of 1
##  $ my_attribute: chr "This is a vector"
\end{verbatim}

\textbf{Exercises}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  What happens to a factor when you modify its levels?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{f1 \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(letters)}

\KeywordTok{levels}\NormalTok{(f1) \textless{}{-}}\StringTok{ }\KeywordTok{rev}\NormalTok{(}\KeywordTok{levels}\NormalTok{(f1))}

\NormalTok{f1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] z y x w v u t s r q p o n m l k j i h g f e d c b a
## Levels: z y x w v u t s r q p o n m l k j i h g f e d c b a
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  What does this code do? How do \texttt{f2} and \texttt{f3} differ from \texttt{f1}?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{f2 \textless{}{-}}\StringTok{ }\KeywordTok{rev}\NormalTok{(}\KeywordTok{factor}\NormalTok{(letters))}

\NormalTok{f3 \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(letters, }\DataTypeTok{levels =} \KeywordTok{rev}\NormalTok{(letters))}
\end{Highlighting}
\end{Shaded}

\hypertarget{d-data-matrices-and-dataframes}{%
\section{2D data: matrices and dataframes}\label{d-data-matrices-and-dataframes}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Matrices: data structures for storing 2d data that is all the same class.
\item
  Dataframes: the most important data structure for storing data in R, because a dataframe can store different kinds of (2d) data.
\end{enumerate}

\hypertarget{matrices}{%
\subsection{Matrices}\label{matrices}}

Matrices are created when we combine multiple vectors that all have the same class (e.g., numeric). This creates a dataset with rows and columns. By definition, if you want to combine multiple classes of vectors, you want a dataframe. You can coerce a matrix to become a dataframe, and vice-versa, but as with all vector coercions, the results can be unpredictable, so be sure you know how each variable (column) will convert.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\DataTypeTok{nrow =} \DecValTok{2}\NormalTok{, }\DataTypeTok{ncol =} \DecValTok{2}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2]
## [1,]   NA   NA
## [2,]   NA   NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dim}\NormalTok{(m)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2 2
\end{verbatim}

Matrices are filled column-wise.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{6}\NormalTok{, }\DataTypeTok{nrow =} \DecValTok{2}\NormalTok{, }\DataTypeTok{ncol =} \DecValTok{3}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
\end{verbatim}

Other ways to construct a matrix:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\KeywordTok{dim}\NormalTok{(m) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\DecValTok{5}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dim}\NormalTok{(m) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{5}\NormalTok{, }\DecValTok{2}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
\end{verbatim}

You can transpose a matrix (or dataframe) with \texttt{t()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\KeywordTok{dim}\NormalTok{(m) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\DecValTok{5}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{t}\NormalTok{(m)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
## [3,]    5    6
## [4,]    7    8
## [5,]    9   10
\end{verbatim}

Another way is to bind columns or rows using \texttt{cbind()} and \texttt{rbind()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}

\NormalTok{y \textless{}{-}}\StringTok{ }\DecValTok{10}\OperatorTok{:}\DecValTok{12}

\KeywordTok{cbind}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      x  y
## [1,] 1 10
## [2,] 2 11
## [3,] 3 12
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# or}

\KeywordTok{rbind}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   [,1] [,2] [,3]
## x    1    2    3
## y   10   11   12
\end{verbatim}

You can also use the \texttt{byrow} argument to specify how the matrix is filled. From R's own documentation:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mdat \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{11}\NormalTok{, }\DecValTok{12}\NormalTok{, }\DecValTok{13}\NormalTok{),}
  \DataTypeTok{nrow =} \DecValTok{2}\NormalTok{,}
  \DataTypeTok{ncol =} \DecValTok{3}\NormalTok{,}
  \DataTypeTok{byrow =} \OtherTok{TRUE}\NormalTok{,}
  \DataTypeTok{dimnames =} \KeywordTok{list}\NormalTok{(}
    \KeywordTok{c}\NormalTok{(}\StringTok{"row1"}\NormalTok{, }\StringTok{"row2"}\NormalTok{),}
    \KeywordTok{c}\NormalTok{(}\StringTok{"C.1"}\NormalTok{, }\StringTok{"C.2"}\NormalTok{, }\StringTok{"C.3"}\NormalTok{)}
\NormalTok{  )}
\NormalTok{)}
\NormalTok{mdat}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      C.1 C.2 C.3
## row1   1   2   3
## row2  11  12  13
\end{verbatim}

Notice that we gave \texttt{names} to the dimensions in \texttt{mdat}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dimnames}\NormalTok{(mdat)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "row1" "row2"
## 
## [[2]]
## [1] "C.1" "C.2" "C.3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rownames}\NormalTok{(mdat)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "row1" "row2"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{colnames}\NormalTok{(mdat)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "C.1" "C.2" "C.3"
\end{verbatim}

\hypertarget{dataframes}{%
\subsection{Dataframes}\label{dataframes}}

A data frame is a very important data type in R. It's pretty much the \textbf{de facto} data structure for most tabular data and what we use for statistics.

\hypertarget{creation}{%
\subsubsection{Creation}\label{creation}}

You create a data frame using \texttt{data.frame()}, which takes named vectors as input:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   vec1 vec2
## 1    1    a
## 2    2    b
## 3    3    c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    3 obs. of  2 variables:
##  $ vec1: int  1 2 3
##  $ vec2: chr  "a" "b" "c"
\end{verbatim}

Beware: before R 4.0.0, \texttt{data.frame()} turned strings into factors by default. In older versions of R, remember to use \texttt{stringsAsFactors\ =\ FALSE} to suppress this behavior as needed:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{,}
  \DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{),}
  \DataTypeTok{stringsAsFactors =} \OtherTok{FALSE}
\NormalTok{)}
\KeywordTok{str}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y: chr  "a" "b" "c"
\end{verbatim}

In reality, we rarely type up our datasets ourselves, and certainly not in R. The most common way to make a data.frame is by reading in a file using \texttt{read.csv}, \texttt{read.dta} (if you're using a Stata file; this function comes from the \texttt{foreign} package), or some other kind of data file input.

\hypertarget{structure-and-attributes}{%
\subsubsection{Structure and Attributes}\label{structure-and-attributes}}

Under the hood, a data frame is a list of equal-length vectors. This makes it a 2-dimensional structure, so it shares properties of both the matrix and the list.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}

\KeywordTok{str}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    3 obs. of  2 variables:
##  $ vec1: int  1 2 3
##  $ vec2: chr  "a" "b" "c"
\end{verbatim}

This means that a dataframe has \texttt{names()}, \texttt{colnames()}, and \texttt{rownames()}, although \texttt{names()} and \texttt{colnames()} are the same thing.

\textbf{Summary}

\begin{itemize}
\tightlist
\item
  Set column names: \texttt{names()} in data frame, \texttt{colnames()} in matrix
\item
  Set row names: \texttt{row.names()} in data frame, \texttt{rownames()} in matrix
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}

\CommentTok{\# these two are equivalent}
\KeywordTok{names}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "vec1" "vec2"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{colnames}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "vec1" "vec2"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# change the colnames}
\KeywordTok{colnames}\NormalTok{(df) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Number"}\NormalTok{, }\StringTok{"Character"}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   Number Character
## 1      1         a
## 2      2         b
## 3      3         c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{names}\NormalTok{(df) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Number"}\NormalTok{, }\StringTok{"Character"}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   Number Character
## 1      1         a
## 2      2         b
## 3      3         c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# change the rownames}
\KeywordTok{rownames}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "1" "2" "3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rownames}\NormalTok{(df) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"donut"}\NormalTok{, }\StringTok{"pickle"}\NormalTok{, }\StringTok{"pretzel"}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##         Number Character
## donut        1         a
## pickle       2         b
## pretzel      3         c
\end{verbatim}

The \texttt{length()} of a dataframe is the length of the underlying list and so is the same as \texttt{ncol()}; \texttt{nrow()} gives the number of rows.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}

\CommentTok{\# these two are equivalent {-} number of columns}
\KeywordTok{length}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ncol}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# get number of rows}
\KeywordTok{nrow}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# get number of both columns and rows}
\KeywordTok{dim}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3 2
\end{verbatim}

\hypertarget{testing-and-coercion}{%
\subsubsection{Testing and coercion}\label{testing-and-coercion}}

To check if an object is a dataframe, use \texttt{class()} or test explicitly with \texttt{is.data.frame()}:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "data.frame"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.data.frame}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

You can coerce an object to a dataframe with \texttt{as.data.frame()}:

\begin{itemize}
\item
  A vector will create a one-column dataframe.
\item
  A list will create one column for each element; it's an error if they're
  not all the same length.
\item
  A matrix will create a data frame with the same number of columns and rows as the matrix.
\end{itemize}

\hypertarget{combining-dataframes}{%
\subsubsection{Combining dataframes}\label{combining-dataframes}}

You can combine dataframes using \texttt{cbind()} and \texttt{rbind()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{,}
  \DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{),}
  \DataTypeTok{stringsAsFactors =} \OtherTok{FALSE}
\NormalTok{)}

\KeywordTok{cbind}\NormalTok{(df, }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{z =} \DecValTok{3}\OperatorTok{:}\DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 1 1 a 3
## 2 2 b 2
## 3 3 c 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rbind}\NormalTok{(df, }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{x =} \DecValTok{10}\NormalTok{, }\DataTypeTok{y =} \StringTok{"z"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    x y
## 1  1 a
## 2  2 b
## 3  3 c
## 4 10 z
\end{verbatim}

When combining column-wise, the number of rows must match, but row names are ignored. When combining row-wise, both the number and names of columns must match. (If you want to combine rows that don't have the same columns, there are other functions / packages in R that can help.)

It's a common mistake to try and create a dataframe by \texttt{cbind()}ing vectors together. This doesn't work because \texttt{cbind()} will create a matrix unless one of the arguments is already a dataframe. Instead use \texttt{data.frame()} directly:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{bad \textless{}{-}}\StringTok{ }\NormalTok{(}\KeywordTok{cbind}\NormalTok{(}\DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{)))}
\NormalTok{bad}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      x   y  
## [1,] "1" "a"
## [2,] "2" "b"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(bad)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  chr [1:2, 1:2] "1" "2" "a" "b"
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "x" "y"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{good \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{),}
  \DataTypeTok{stringsAsFactors =} \OtherTok{FALSE}
\NormalTok{)}
\NormalTok{good}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y
## 1 1 a
## 2 2 b
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(good)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    2 obs. of  2 variables:
##  $ x: int  1 2
##  $ y: chr  "a" "b"
\end{verbatim}

The conversion rules for \texttt{cbind()} are complicated and best avoided by ensuring all inputs are of the same type.

\hypertarget{list-columns-tbd}{%
\subsubsection{List columns (TBD)}\label{list-columns-tbd}}

\textbf{Other objects}

Missing values are specified with \texttt{NA}, which is a logical vector of length 1. \texttt{NA} will always be coerced to the correct type if used inside \texttt{c()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{NA}\NormalTok{, }\DecValTok{1}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] NA  1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{typeof}\NormalTok{(}\OtherTok{NA}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "logical"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{typeof}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "double"
\end{verbatim}

\texttt{Inf} is infinity. You can have either positive or negative infinity.

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{1} \OperatorTok{/}\StringTok{ }\DecValTok{0}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] Inf
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{1} \OperatorTok{/}\StringTok{ }\OtherTok{Inf}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0
\end{verbatim}

\texttt{NaN} means Not a number. It's an undefined value.

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{0} \OperatorTok{/}\StringTok{ }\DecValTok{0}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] NaN
\end{verbatim}

\hypertarget{subset}{%
\section{Subset}\label{subset}}

When working with data, you'll need to subset objects early and often. Luckily, R's subsetting operators are powerful and fast. Mastery of subsetting allows you to succinctly express complex operations in a way that few other languages can match. Subsetting is hard to learn because you need to master a number of interrelated concepts:

\begin{itemize}
\item
  The three subsetting operators, \texttt{{[}}, \texttt{{[}{[}}, and \texttt{\$}.
\item
  Important differences in behavior for different objects (e.g., vectors, lists, factors, matrices, and data frames).
\item
  The use of subsetting in conjunction with assignment.
\end{itemize}

This unit helps you master subsetting by starting with the simplest type of subsetting: subsetting an atomic vector with \texttt{{[}}. It then gradually extends your knowledge, first to more complicated data types (like dataframes and lists), and then to the other subsetting operators, \texttt{{[}{[}} and \texttt{\$}. You'll then learn how subsetting and assignment can be combined to modify parts of an object, and, finally, you'll see a large number of useful applications.

\hypertarget{atomic-vectors}{%
\subsection{Atomic vectors}\label{atomic-vectors}}

Let's explore the different types of subsetting with a simple vector, \texttt{x}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Note that the number after the decimal point gives the original position in the vector.

\textbf{NB:} In R, positions start at 1, unlike Python, which starts at 0. Fun!

There are five things that you can use to subset a vector:

\hypertarget{positive-integers}{%
\subsubsection{Positive integers}\label{positive-integers}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{1}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3.3 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# \textasciigrave{}order(x)\textasciigrave{} gives the positions of smallest to largest values.}
\KeywordTok{order}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 3 2 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{order}\NormalTok{(x)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 3.3 4.2 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{4}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 3.3 4.2 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Duplicated indices yield duplicated values}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 2.1
\end{verbatim}

\hypertarget{negative-integers}{%
\subsubsection{Negative integers}\label{negative-integers}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\NormalTok{x[}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{1}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 5.4
\end{verbatim}

You can't mix positive and negative integers in a single subset:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Error in x[c(-1, 2)]: only 0's may be mixed with negative subscripts
\end{verbatim}

\hypertarget{logical-vectors}{%
\subsubsection{Logical vectors}\label{logical-vectors}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\OtherTok{TRUE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2
\end{verbatim}

This is probably the most useful type of subsetting because you write the expression that creates the logical vector.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\CommentTok{\# this returns a logical vector}
\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE  TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# use a conditional statement to create an implicit logical vector}
\NormalTok{x[x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3 5.4
\end{verbatim}

You can combine conditional statements with \texttt{\&} (and), \texttt{\textbar{}} (or), and \texttt{!} (not)

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\CommentTok{\# combining two conditional statements with \&}
\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3} \OperatorTok{\&}\StringTok{ }\NormalTok{x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{5}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3} \OperatorTok{\&}\StringTok{ }\NormalTok{x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{5}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# combining two conditional statements with |}
\NormalTok{x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{3} \OperatorTok{|}\StringTok{ }\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1]  TRUE FALSE FALSE  TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{3} \OperatorTok{|}\StringTok{ }\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# combining conditional statements with !}
\OperatorTok{!}\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1]  TRUE  TRUE  TRUE FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\OperatorTok{!}\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3
\end{verbatim}

Another way to generate implicit logical vectors is to use the \texttt{\%in\%} operator, which works like the \texttt{in} keyword in Python.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# generate implicit logical vectors through the \%in\% operator}
\NormalTok{x }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{3.3}\NormalTok{, }\FloatTok{4.2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[x }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{3.3}\NormalTok{, }\FloatTok{4.2}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3
\end{verbatim}

\hypertarget{character-vectors}{%
\subsubsection{Character vectors}\label{character-vectors}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\CommentTok{\# apply names}
\KeywordTok{names}\NormalTok{(x) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{, }\StringTok{"d"}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   a   b   c   d 
## 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# subset using names}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\StringTok{"d"}\NormalTok{, }\StringTok{"c"}\NormalTok{, }\StringTok{"a"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   d   c   a 
## 5.4 3.3 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Like integer indices, you can repeat indices}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"a"}\NormalTok{, }\StringTok{"a"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   a   a   a 
## 2.1 2.1 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Careful! names are always matched exactly}
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DataTypeTok{abc =} \DecValTok{1}\NormalTok{, }\DataTypeTok{def =} \DecValTok{2}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## abc def 
##   1   2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"d"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## <NA> <NA> 
##   NA   NA
\end{verbatim}

\hypertarget{more-on-string-operations}{%
\subparagraph{More on string operations}\label{more-on-string-operations}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{firstName \textless{}{-}}\StringTok{ "Jae Yeon"}
\NormalTok{lastName \textless{}{-}}\StringTok{ "Kim"}
\end{Highlighting}
\end{Shaded}

Unlike in Python, R does not have a reserved operator for string concatenation such as \texttt{+}. Furthermore, using the usual concatenation function \texttt{c()} on two or more character strings will not create a single character string, but rather a \textbf{vector} of character strings.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(firstName, lastName)}

\KeywordTok{print}\NormalTok{(fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon" "Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{length}\NormalTok{(fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2
\end{verbatim}

In order to combine two or more character strings into one larger character string, we use the \texttt{paste()} function. This function takes character strings or vectors and collapses their values into a single character string, with each value separated by a character string selected by the user.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName)}

\KeywordTok{print}\NormalTok{(fullName)}

\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName, }\DataTypeTok{sep =} \StringTok{"+"}\NormalTok{)}

\KeywordTok{print}\NormalTok{(fullName)}

\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName, }\DataTypeTok{sep =} \StringTok{"\_\_\_"}\NormalTok{)}
\KeywordTok{print}\NormalTok{(fullName)}
\end{Highlighting}
\end{Shaded}

As with Python, R can also extract substrings based on the index position of its characters. There are, however, three critical differences. First, \textbf{index positions in R start at 1}. This is in contrast to Python, where indexing starts at 0.

Second, \textbf{object subsets using index positions in R contain all the elements in the specified range}. If some object called \texttt{data} contains five elements, \texttt{data{[}2:4{]}} will return the elements at the second, third, and fourth positions. By contrast, the same subset in Python would return the objects at the third and fourth positions (or second and third positions, depending upon whether your index starts at 0 or 1).

Third, \textbf{R does not allow indexing of character strings}. Instead, you must use the \texttt{substr()} function. Note that this function must receive both the \texttt{start} and \texttt{stop} arguments. So if you want to get all the characters between some index and the end of the string, you must make use of the \texttt{nchar()} function, which will tell you the length of a character string.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName)}

\CommentTok{\# this won\textquotesingle{}t work like in Python}
\NormalTok{fullName[}\DecValTok{1}\NormalTok{] }\CommentTok{\# R sees the string as a unitary object {-} it can\textquotesingle{}t be indexed this way}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName[}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon Kim" NA             NA             NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# So use this instead}
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{1}\NormalTok{, }\DataTypeTok{stop =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Ja"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{5}\NormalTok{, }\DataTypeTok{stop =} \DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Y"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{1}\NormalTok{, }\DataTypeTok{stop =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon K"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{11}\NormalTok{, }\DataTypeTok{stop =} \KeywordTok{nchar}\NormalTok{(fullName))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "im"
\end{verbatim}

Like Python, R has a number of string methods, though these exist as individual rather than ``mix-and-match'' functions. For example:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{toupper}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "JAE YEON KIM"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tolower}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "jae yeon kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{strsplit}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{split =} \StringTok{" "}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "Jae"  "Yeon" "Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{strsplit}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{split =} \StringTok{"n"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "Jae Yeo" " Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{gsub}\NormalTok{(}\DataTypeTok{pattern =} \StringTok{"Kim"}\NormalTok{, }\DataTypeTok{replacement =} \StringTok{"Choi"}\NormalTok{, }\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon Choi"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{gsub}\NormalTok{(}\DataTypeTok{pattern =} \StringTok{"Jae Yeon"}\NormalTok{, }\DataTypeTok{replacement =} \StringTok{"Danny"}\NormalTok{, }\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Danny Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Note the importance of cases! This doesn\textquotesingle{}t throw an error, so you won\textquotesingle{}t realize your function didn\textquotesingle{}t work unless you double{-}check several entries}

\KeywordTok{gsub}\NormalTok{(}\DataTypeTok{pattern =} \StringTok{" "}\NormalTok{, }\DataTypeTok{replacement =} \StringTok{""}\NormalTok{, }\DataTypeTok{x =}\NormalTok{ fullName) }\CommentTok{\# The same function is used for replacements and stripping}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "JaeYeonKim"
\end{verbatim}

\hypertarget{lists-1}{%
\subsection{Lists}\label{lists-1}}

Subsetting a list works in the same way as subsetting an atomic vector. Using \texttt{{[}} will always return a list; \texttt{{[}{[}} and \texttt{\$}, as described below, let you pull out the components of the list.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\StringTok{"a"}\NormalTok{ =}\StringTok{ }\DecValTok{1}\NormalTok{, }\StringTok{"b"}\NormalTok{ =}\StringTok{ }\DecValTok{2}\NormalTok{)}
\NormalTok{l}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1] 1
## 
## $b
## [1] 2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1] 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l[[}\DecValTok{1}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l[}\StringTok{"a"}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1] 1
\end{verbatim}

\hypertarget{matrices-1}{%
\subsection{Matrices}\label{matrices-1}}

The most common way of subsetting matrices (2d) is a simple generalization of 1d subsetting: you supply a 1d index for each dimension, separated by a comma. Blank subsetting is now useful because it lets you keep all rows or all columns.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{9}\NormalTok{, }\DataTypeTok{nrow =} \DecValTok{3}\NormalTok{)}
\KeywordTok{colnames}\NormalTok{(a) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"A"}\NormalTok{, }\StringTok{"B"}\NormalTok{, }\StringTok{"C"}\NormalTok{)}
\NormalTok{a}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A B C
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# rows come first, then columns}
\NormalTok{a[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), ]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A B C
## [1,] 1 4 7
## [2,] 2 5 8
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a[}\KeywordTok{c}\NormalTok{(T, F, T), }\KeywordTok{c}\NormalTok{(}\StringTok{"B"}\NormalTok{, }\StringTok{"A"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      B A
## [1,] 4 1
## [2,] 6 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a[}\DecValTok{0}\NormalTok{, }\DecValTok{{-}2}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A C
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\DecValTok{{-}2}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A C
## [1,] 1 7
## [2,] 2 8
\end{verbatim}

\hypertarget{data-frames}{%
\subsection{Data frames}\label{data-frames}}

Data from data frames can be addressed like matrices (with row and column indicators separated by a comma).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{x =} \DecValTok{4}\OperatorTok{:}\DecValTok{6}\NormalTok{, }\DataTypeTok{y =} \DecValTok{3}\OperatorTok{:}\DecValTok{1}\NormalTok{, }\DataTypeTok{z =}\NormalTok{ letters[}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{])}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 1 4 3 a
## 2 5 2 b
## 3 6 1 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# return only the rows where x == 6}
\NormalTok{df[df}\OperatorTok{$}\NormalTok{x }\OperatorTok{==}\StringTok{ }\DecValTok{6}\NormalTok{, ]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 3 6 1 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# return the first and third row}
\NormalTok{df[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{3}\NormalTok{), ]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 1 4 3 a
## 3 6 1 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# return the first and third row, and the first and second column}
\NormalTok{df[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{3}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y
## 1 4 3
## 3 6 1
\end{verbatim}

Data frames possess the characteristics of both lists and matrices: if you subset with a single vector, they behave like lists, and return only the columns.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# There are two ways to select columns from a data frame}
\CommentTok{\# Like a list:}
\NormalTok{df[}\KeywordTok{c}\NormalTok{(}\StringTok{"x"}\NormalTok{, }\StringTok{"z"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x z
## 1 4 a
## 2 5 b
## 3 6 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Like a matrix}
\NormalTok{df[, }\KeywordTok{c}\NormalTok{(}\StringTok{"x"}\NormalTok{, }\StringTok{"z"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x z
## 1 4 a
## 2 5 b
## 3 6 c
\end{verbatim}

But there's an important difference when you select a single column: matrix subsetting simplifies by default, list subsetting does not.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(df[}\StringTok{"x"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x
## 1 4
## 2 5
## 3 6
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{((df[}\StringTok{"x"}\NormalTok{]))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "data.frame"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(df[, }\StringTok{"x"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4 5 6
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{((df[, }\StringTok{"x"}\NormalTok{]))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "integer"
\end{verbatim}

See the bottom section on \protect\hyperlink{simplify-preserve}{Simplifying and Preserving} to learn more.

\hypertarget{subsetting-operators}{%
\subsection{Subsetting operators}\label{subsetting-operators}}

There are two other subsetting operators: \texttt{{[}{[}} and \texttt{\$}.

\begin{itemize}
\tightlist
\item
  \texttt{{[}{[}} is similar to \texttt{{[}}, except it can only return a single value and it allows you to pull pieces out of a list.
\item
  \texttt{\$} is a useful shorthand for \texttt{{[}{[}} combined with character subsetting.
\end{itemize}

\hypertarget{section}{%
\paragraph{\texorpdfstring{\texttt{{[}{[}}}{{[}{[}}}\label{section}}

You need \texttt{{[}{[}} when working with lists. This is because when \texttt{{[}} is applied to a list it always returns a list: it never gives you the contents of the list. To get the contents, you need \texttt{{[}{[}}:

\begin{quote}
``If list \texttt{x} is a train carrying objects, then \texttt{x{[}{[}5{]}{]}} is
the object in car 5; \texttt{x{[}4:6{]}} is a train of cars 4-6.''

--- \citet{RLangTip}
\end{quote}

Because data frames are lists of columns, you can use \texttt{{[}{[}} to extract a column from data frames:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# these two are equivalent}
\NormalTok{mtcars[[}\DecValTok{1}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars[, }\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# which differs from this:}
\NormalTok{mtcars[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg
## Mazda RX4           21.0
## Mazda RX4 Wag       21.0
## Datsun 710          22.8
## Hornet 4 Drive      21.4
## Hornet Sportabout   18.7
## Valiant             18.1
## Duster 360          14.3
## Merc 240D           24.4
## Merc 230            22.8
## Merc 280            19.2
## Merc 280C           17.8
## Merc 450SE          16.4
## Merc 450SL          17.3
## Merc 450SLC         15.2
## Cadillac Fleetwood  10.4
## Lincoln Continental 10.4
## Chrysler Imperial   14.7
## Fiat 128            32.4
## Honda Civic         30.4
## Toyota Corolla      33.9
## Toyota Corona       21.5
## Dodge Challenger    15.5
## AMC Javelin         15.2
## Camaro Z28          13.3
## Pontiac Firebird    19.2
## Fiat X1-9           27.3
## Porsche 914-2       26.0
## Lotus Europa        30.4
## Ford Pantera L      15.8
## Ferrari Dino        19.7
## Maserati Bora       15.0
## Volvo 142E          21.4
\end{verbatim}

\hypertarget{section-1}{%
\paragraph{\texorpdfstring{\texttt{\$}}{\$}}\label{section-1}}

\texttt{\$} is a shorthand operator, where \texttt{x\$y} is equivalent to \texttt{x{[}{[}"y",\ exact\ =\ FALSE{]}{]}}. It's often used to access variables in a data frame:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# these two are equivalent}
\NormalTok{mtcars[[}\StringTok{"cyl"}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars}\OperatorTok{$}\NormalTok{cyl}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
\end{verbatim}

One common mistake with \texttt{\$} is to try and use it when you have the name of a column stored in a variable:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{var \textless{}{-}}\StringTok{ "cyl"}
\CommentTok{\# Doesn\textquotesingle{}t work {-} mtcars$var translated to mtcars[["var"]]}
\NormalTok{mtcars}\OperatorTok{$}\NormalTok{var}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## NULL
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Instead use [[}
\NormalTok{mtcars[[var]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
\end{verbatim}

\hypertarget{subassignment}{%
\subsection{Subassignment}\label{subassignment}}

All subsetting operators can be combined with assignment to modify selected values of the input vector.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{5}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4 5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)] \textless{}{-}}\StringTok{ }\DecValTok{2}\OperatorTok{:}\DecValTok{3}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2 3 3 4 5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The length of the LHS needs to match the RHS!}
\NormalTok{x[}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] \textless{}{-}}\StringTok{ }\DecValTok{4}\OperatorTok{:}\DecValTok{1}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2 4 3 2 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\DecValTok{1}\NormalTok{] \textless{}{-}}\StringTok{ }\DecValTok{4}\OperatorTok{:}\DecValTok{1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in x[1] <- 4:1: number of items to replace is not a multiple of
## replacement length
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This is mostly useful when conditionally modifying vectors}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{a =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{, }\OtherTok{NA}\NormalTok{))}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    a
## 1  1
## 2 10
## 3 NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{$}\NormalTok{a[df}\OperatorTok{$}\NormalTok{a }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{5}\NormalTok{] \textless{}{-}}\StringTok{ }\DecValTok{0}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    a
## 1  0
## 2 10
## 3 NA
\end{verbatim}

\hypertarget{tidyverse}{%
\section{Tidyverse}\label{tidyverse}}

\begin{itemize}
\item
  I adapted the following content from Wickham's \href{https://r4ds.had.co.nz/tidy-data.html}{R for Data Science}, his \href{http://www.jstatsoft.org/v59/i10/paper}{earlier paper} published in the Journal of Statistical Software, \href{https://csgillespie.github.io/efficientR/}{Efficient R Programming} by Gillespie and Lovelace, and \href{https://bookdown.org/rdpeng/rprogdatascience/}{R Programming for Data Science} by Roger P. Peng.
\item
  \href{https://design.tidyverse.org/unifying-principles.html}{Tidyverse design guide}

  \begin{itemize}
  \item
    Human centered
  \item
    Consistent
  \item
    Composable (modularized)
  \item
    Inclusive
  \item
    Influenced by the \href{https://homepage.cs.uri.edu/~thenry/resources/unix_art/ch01s06.html}{Basics of the Unix Philosophy}, \href{https://www.python.org/dev/peps/pep-0020/}{The Zen of Python}, and the \href{https://refs.devinmcgloin.com/smalltalk/Design-Principles-Behind-Smalltalk.pdf}{Design Principles Behind Smalltalk}
  \end{itemize}
\end{itemize}

\hypertarget{tidy-data}{%
\section{Tidy data}\label{tidy-data}}

\begin{quote}
``Tidy data sets are easy to manipulate, model and visualize, and have a specific structure: each variable is a column, each observation is a row, and each type of observational unit is a table.'' - Hadley Wickham
\end{quote}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Variables -\textgreater{} \textbf{Columns}
\item
  Observations -\textgreater{} \textbf{Rows}
\item
  Values -\textgreater{} \textbf{Cells}
\end{enumerate}

\begin{figure}
\centering
\includegraphics{https://garrettgman.github.io/images/tidy-1.png}
\caption{Tidy Data Example (Source: R for Data Science)}
\end{figure}

If dataframes are tidy, it's easy to transform, visualize, model, and program them using tidyverse packages (a whole workflow).

\begin{figure}
\centering
\includegraphics{https://miro.medium.com/max/960/0*mlPyX0NE0WQwEzpS.png}
\caption{Tidyverse: an opinionated collection of R packages}
\end{figure}

\begin{itemize}
\tightlist
\item
  Nevertheless, don't be \textbf{religious}.
\end{itemize}

\begin{quote}
In summary, tidy data is a useful conceptual idea and is often the right way to go for general, small data sets, but may not be appropriate for all problems. - Jeff Leek
\end{quote}

For instance, in many data science applications, linear algebra-based computations are essential (e.g., \href{https://www.math.upenn.edu/~kazdan/312S13/JJ/PCA-JJ.pdf}{Principal Component Analysis}). These computations are optimized to work on matrices, not tidy data frames (for more information, read \href{https://simplystatistics.org/2016/02/17/non-tidy-data/}{Jeff Leek's blog post}).

This is what tidy data looks like.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{library}\NormalTok{(tidyverse)}

\NormalTok{table1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\begin{itemize}
\tightlist
\item
  The big picture

  \begin{itemize}
  \tightlist
  \item
    Tidying data with \textbf{tidyr}
  \item
    Processing data with \textbf{dplyr}
  \end{itemize}
\end{itemize}

These two packages don't do anything new, but they simplify the most common tasks in data manipulation. Plus, they are fast, consistent, and more readable.

Practically, this approach is good because you're going to have consistency in the format of data across all the projects you're working on. Also, tidy data works well with key packages (e.g., dplyr, ggplot2) in R.

Computationally, this approach is useful for vectorized programming because ``different variables from the same observation are always paired''. Vectorized means that a function applied to a vector treats each element individually (i.e., the operations work in parallel).

\hypertarget{tidyr}{%
\section{tidyr}\label{tidyr}}

\hypertarget{reshaping}{%
\subsection{Reshaping}\label{reshaping}}

\textbf{Signs of messy datasets}

\begin{itemize}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \tightlist
  \item
    Column headers are values, not variable names.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{1}
  \tightlist
  \item
    Multiple variables are stored in one column.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{2}
  \tightlist
  \item
    Variables are stored in both rows and columns.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{3}
  \tightlist
  \item
    Multiple types of observational units are stored in the same table.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{4}
  \tightlist
  \item
    A single observational unit is stored in multiple tables.
  \end{enumerate}
\end{itemize}

Let's take a look at the cases of untidy data.

\begin{figure}
\centering
\includegraphics{https://garrettgman.github.io/images/tidy-5.png}
\caption{Messy Data Case 1 (Source: R for Data Science)}
\end{figure}

\begin{itemize}
\item
  Make It Longer

  \begin{longtable}[]{@{}lll@{}}
  \toprule
  Col1 & Col2 & Col3\tabularnewline
  \midrule
  \endhead
  & &\tabularnewline
  & &\tabularnewline
  & &\tabularnewline
  \bottomrule
  \end{longtable}
\end{itemize}

\textbf{Challenge}: Why is this data not tidy?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table4a}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
\end{verbatim}

\begin{itemize}
\item
  Let's pivot (rotate by 90 degrees).
\item
  \href{https://tidyr.tidyverse.org/reference/pivot_longer.html}{\texttt{pivot\_longer()}} increases the number of rows (longer) and decreases the number of columns. The inverse function is \texttt{pivot\_wider()}. These functions improve the usability of \texttt{gather()} and \texttt{spread()}.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://www.storybench.org/wp-content/uploads/2019/08/pivot-longer-image.png}
\caption{What pivot\_longer() does (Source: \url{https://www.storybench.org})}
\end{figure}

\begin{itemize}
\tightlist
\item
  The pipe operator \texttt{\%\textgreater{}\%} originally comes from the \texttt{magrittr} package. The idea behind the pipe operator is \href{https://www.datacamp.com/community/tutorials/pipe-r-tutorial}{similar to} what we learned about chaining functions in high school. Given \(g\colon A \to B\) and \(f\colon B \to C\), applying \(g\) and then \(f\) can be expressed as \(f(g(x))\). Basically, the pipe operator chains operations.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table4a}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way, less intuitive}
\NormalTok{table4a }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(}
    \DataTypeTok{key =} \StringTok{"year"}\NormalTok{, }\CommentTok{\# Current column names}
    \DataTypeTok{value =} \StringTok{"cases"}\NormalTok{, }\CommentTok{\# The values matched to cases}
    \KeywordTok{c}\NormalTok{(}\StringTok{"1999"}\NormalTok{, }\StringTok{"2000"}\NormalTok{)}
\NormalTok{  ) }\CommentTok{\# Selected columns}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way, more intuitive}
\NormalTok{table4a }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(}
    \DataTypeTok{cols =} \KeywordTok{c}\NormalTok{(}\StringTok{"1999"}\NormalTok{, }\StringTok{"2000"}\NormalTok{), }\CommentTok{\# Selected columns}
    \DataTypeTok{names\_to =} \StringTok{"year"}\NormalTok{, }\CommentTok{\# Shorter columns (the columns going to be in one column called year)}
    \DataTypeTok{values\_to =} \StringTok{"cases"}
\NormalTok{  ) }\CommentTok{\# Longer rows (the values are going to be in a separate column called named cases)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Afghanistan 2000    2666
## 3 Brazil      1999   37737
## 4 Brazil      2000   80488
## 5 China       1999  212258
## 6 China       2000  213766
\end{verbatim}

\begin{itemize}
\item
  There's another problem, did you catch it?
\item
  The data type of the \texttt{year} variable should be \texttt{numeric}, not \texttt{character}. By default, \texttt{pivot\_longer()} transforms uninformative columns to character.
\item
  You can fix this problem by using the \texttt{names\_transform} argument.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table4a }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(}
    \DataTypeTok{cols =} \KeywordTok{c}\NormalTok{(}\StringTok{"1999"}\NormalTok{, }\StringTok{"2000"}\NormalTok{), }\CommentTok{\# Put two columns together}
    \DataTypeTok{names\_to =} \StringTok{"year"}\NormalTok{, }\CommentTok{\# Shorter columns (the columns going to be in one column called year)}
    \DataTypeTok{values\_to =} \StringTok{"cases"}\NormalTok{, }\CommentTok{\# Longer rows (the values are going to be in a separate column called named cases)}
    \DataTypeTok{names\_transform =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{year =}\NormalTok{ readr}\OperatorTok{::}\NormalTok{parse\_number)}
\NormalTok{  ) }\CommentTok{\# Transform the variable}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country      year  cases
##   <chr>       <dbl>  <int>
## 1 Afghanistan  1999    745
## 2 Afghanistan  2000   2666
## 3 Brazil       1999  37737
## 4 Brazil       2000  80488
## 5 China        1999 212258
## 6 China        2000 213766
\end{verbatim}

\textbf{Additional tips}

\texttt{parse\_number()} also keeps only numeric information in a variable.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{parse\_number}\NormalTok{(}\StringTok{"reply1994"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1994
\end{verbatim}

A flat file (e.g., CSV) is a rectangular-shaped combination of strings. \href{https://cran.r-project.org/web/packages/readr/vignettes/readr.html}{Parsing} determines the type of each column and turns it into a vector of a more specific type. Tidyverse has \texttt{parse\_} functions (from the \texttt{readr} package) that are flexible and fast (e.g., \texttt{parse\_integer()}, \texttt{parse\_double()}, \texttt{parse\_logical()}, \texttt{parse\_datetime()}, \texttt{parse\_date()}, \texttt{parse\_time()}, \texttt{parse\_factor()}, etc.).

\begin{itemize}
\tightlist
\item
  Let's do another practice.
\end{itemize}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Why is this data not tidy? (This exercise comes from the \href{https://tidyr.tidyverse.org/articles/pivot.html}{\texttt{pivot} function vignette}.) Too long or too wide?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{billboard}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 317 x 79
##    artist track date.entered   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
##    <chr>  <chr> <date>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2 Pac  Baby~ 2000-02-26      87    82    72    77    87    94    99    NA
##  2 2Ge+h~ The ~ 2000-09-02      91    87    92    NA    NA    NA    NA    NA
##  3 3 Doo~ Kryp~ 2000-04-08      81    70    68    67    66    57    54    53
##  4 3 Doo~ Loser 2000-10-21      76    76    72    69    67    65    55    59
##  5 504 B~ Wobb~ 2000-04-15      57    34    25    17    17    31    36    49
##  6 98^0   Give~ 2000-08-19      51    39    34    26    26    19     2     2
##  7 A*Tee~ Danc~ 2000-07-08      97    97    96    95   100    NA    NA    NA
##  8 Aaliy~ I Do~ 2000-01-29      84    62    51    41    38    35    35    38
##  9 Aaliy~ Try ~ 2000-03-18      59    53    38    28    21    18    16    14
## 10 Adams~ Open~ 2000-08-26      76    76    74    69    68    67    61    58
## # ... with 307 more rows, and 68 more variables: wk9 <dbl>, wk10 <dbl>,
## #   wk11 <dbl>, wk12 <dbl>, wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>,
## #   wk17 <dbl>, wk18 <dbl>, wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>,
## #   wk23 <dbl>, wk24 <dbl>, wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>,
## #   wk29 <dbl>, wk30 <dbl>, wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>,
## #   wk35 <dbl>, wk36 <dbl>, wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>,
## #   wk41 <dbl>, wk42 <dbl>, wk43 <dbl>, wk44 <dbl>, wk45 <dbl>, wk46 <dbl>,
## #   wk47 <dbl>, wk48 <dbl>, wk49 <dbl>, wk50 <dbl>, wk51 <dbl>, wk52 <dbl>,
## #   wk53 <dbl>, wk54 <dbl>, wk55 <dbl>, wk56 <dbl>, wk57 <dbl>, wk58 <dbl>,
## #   wk59 <dbl>, wk60 <dbl>, wk61 <dbl>, wk62 <dbl>, wk63 <dbl>, wk64 <dbl>,
## #   wk65 <dbl>, wk66 <lgl>, wk67 <lgl>, wk68 <lgl>, wk69 <lgl>, wk70 <lgl>,
## #   wk71 <lgl>, wk72 <lgl>, wk73 <lgl>, wk74 <lgl>, wk75 <lgl>, wk76 <lgl>
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  How can you fix it? Which pivot?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}
\NormalTok{billboard }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(}
    \DataTypeTok{key =} \StringTok{"week"}\NormalTok{,}
    \DataTypeTok{value =} \StringTok{"rank"}\NormalTok{,}
    \KeywordTok{starts\_with}\NormalTok{(}\StringTok{"wk"}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# starts\_with() selects columns by name prefix}
\StringTok{  }\KeywordTok{drop\_na}\NormalTok{() }\CommentTok{\# Drop NAs}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5,307 x 5
##    artist         track                   date.entered week   rank
##    <chr>          <chr>                   <date>       <chr> <dbl>
##  1 2 Pac          Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2Ge+her        The Hardest Part Of ... 2000-09-02   wk1      91
##  3 3 Doors Down   Kryptonite              2000-04-08   wk1      81
##  4 3 Doors Down   Loser                   2000-10-21   wk1      76
##  5 504 Boyz       Wobble Wobble           2000-04-15   wk1      57
##  6 98^0           Give Me Just One Nig... 2000-08-19   wk1      51
##  7 A*Teens        Dancing Queen           2000-07-08   wk1      97
##  8 Aaliyah        I Don't Wanna           2000-01-29   wk1      84
##  9 Aaliyah        Try Again               2000-03-18   wk1      59
## 10 Adams, Yolanda Open My Heart           2000-08-26   wk1      76
## # ... with 5,297 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note that \texttt{pivot\_longer()} is more versatile than \texttt{gather()}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}
\NormalTok{billboard }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(}
    \DataTypeTok{cols =} \KeywordTok{starts\_with}\NormalTok{(}\StringTok{"wk"}\NormalTok{), }\CommentTok{\# Select columns by name prefix}
    \DataTypeTok{names\_to =} \StringTok{"week"}\NormalTok{,}
    \DataTypeTok{values\_to =} \StringTok{"rank"}\NormalTok{,}
    \DataTypeTok{values\_drop\_na =} \OtherTok{TRUE} \CommentTok{\# Drop NAs}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5,307 x 5
##    artist  track                   date.entered week   rank
##    <chr>   <chr>                   <date>       <chr> <dbl>
##  1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk2      82
##  3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk3      72
##  4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk4      77
##  5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk5      87
##  6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk6      94
##  7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk7      99
##  8 2Ge+her The Hardest Part Of ... 2000-09-02   wk1      91
##  9 2Ge+her The Hardest Part Of ... 2000-09-02   wk2      87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02   wk3      92
## # ... with 5,297 more rows
\end{verbatim}

\begin{itemize}
\item
  Make It Wider
\item
  Why is this data not tidy?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
\end{verbatim}

\begin{itemize}
\item
  Each observation is spread across two rows.
\item
  How can you fix it?: \texttt{pivot\_wider()}.
\end{itemize}

\textbf{Two differences between \texttt{pivot\_longer()} and \texttt{pivot\_wider()}}

\begin{itemize}
\item
  In \texttt{pivot\_longer()}, the arguments are named \texttt{names\_to} and \texttt{values\_to} (\emph{to}).
\item
  In \texttt{pivot\_wider()}, this pattern is opposite. The arguments are named \texttt{names\_from} and \texttt{values\_from} (\emph{from}).
\item
  The number of required arguments for \texttt{pivot\_longer()} is 3 (cols, names\_to, values\_to).
\item
  The number of required arguments for \texttt{pivot\_wider()} is 2 (names\_from, values\_from).
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://www.storybench.org/wp-content/uploads/2019/08/pivot-wider-image.png}
\caption{What pivot\_wider() does (Source: \url{https://www.storybench.org})}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}
\NormalTok{table2 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{spread}\NormalTok{(}
    \DataTypeTok{key =}\NormalTok{ type,}
    \DataTypeTok{value =}\NormalTok{ count}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}
\NormalTok{table2 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_wider}\NormalTok{(}
    \DataTypeTok{names\_from =}\NormalTok{ type, }\CommentTok{\# first}
    \DataTypeTok{values\_from =}\NormalTok{ count }\CommentTok{\# second}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

Sometimes, a consultee comes to me and asks: ``I didn't have missing values in my original dataframe. Then R said that I have missing values after I did some data transformations. What happened?''

Here's an answer.

R defines missing values in two ways.

\begin{itemize}
\item
  \emph{Implicit missing values}: simply not present in the data.
\item
  \emph{Explicit missing values}: flagged with NA
\end{itemize}

\textbf{Challenge}

The example comes from \href{https://r4ds.had.co.nz/tidy-data.html}{\emph{R for Data Science}}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{stocks \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{year =} \KeywordTok{c}\NormalTok{(}\DecValTok{2019}\NormalTok{, }\DecValTok{2019}\NormalTok{, }\DecValTok{2019}\NormalTok{, }\DecValTok{2020}\NormalTok{, }\DecValTok{2020}\NormalTok{, }\DecValTok{2020}\NormalTok{),}
  \DataTypeTok{qtr =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{),}
  \DataTypeTok{return =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{)}

\NormalTok{stocks}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##    year   qtr return
##   <dbl> <dbl>  <dbl>
## 1  2019     1      1
## 2  2019     2      2
## 3  2019     3      3
## 4  2020     2     NA
## 5  2020     3      2
## 6  2020     4      3
\end{verbatim}

\begin{itemize}
\item
  Where is the explicit missing value?
\item
  Does \texttt{stocks} have implicit missing values?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# implicit missing values become explicit}
\NormalTok{stocks }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_wider}\NormalTok{(}
    \DataTypeTok{names\_from =}\NormalTok{ year,}
    \DataTypeTok{values\_from =}\NormalTok{ return}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 4 x 3
##     qtr `2019` `2020`
##   <dbl>  <dbl>  <dbl>
## 1     1      1     NA
## 2     2      2     NA
## 3     3      3      2
## 4     4     NA      3
\end{verbatim}

\textbf{Challenge}

\begin{itemize}
\item
  This exercise comes from \href{https://tidyr.tidyverse.org/articles/pivot.html}{\texttt{pivot} function vignette}.
\item
  Could you make \texttt{station} a series of dummy variables using \texttt{pivot\_wider()}?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fish\_encounters}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 114 x 3
##    fish  station  seen
##    <fct> <fct>   <int>
##  1 4842  Release     1
##  2 4842  I80_1       1
##  3 4842  Lisbon      1
##  4 4842  Rstr        1
##  5 4842  Base_TD     1
##  6 4842  BCE         1
##  7 4842  BCW         1
##  8 4842  BCE2        1
##  9 4842  BCW2        1
## 10 4842  MAE         1
## # ... with 104 more rows
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Which pivot should you use?
\item
  Are there explicit missing values?
\item
  How could you turn these NAs into 0s? Check the \texttt{values\_fill} argument in the \texttt{pivot\_wider()} function.
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Separate
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://garrettgman.github.io/images/tidy-6.png}
\caption{Messy Data Case 2 (Source: R for Data Science)}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Toy example}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{c}\NormalTok{(}\OtherTok{NA}\NormalTok{, }\StringTok{"Dad.apple"}\NormalTok{, }\StringTok{"Mom.orange"}\NormalTok{, }\StringTok{"Daughter.banana"}\NormalTok{))}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                 x
## 1            <NA>
## 2       Dad.apple
## 3      Mom.orange
## 4 Daughter.banana
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Separate}
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(x, }\DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\StringTok{"Name"}\NormalTok{, }\StringTok{"Preferred\_fruit"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##       Name Preferred_fruit
## 1     <NA>            <NA>
## 2      Dad           apple
## 3      Mom          orange
## 4 Daughter          banana
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Don\textquotesingle{}t need the first variable}

\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(x, }\DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\OtherTok{NA}\NormalTok{, }\StringTok{"Preferred\_fruit"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   Preferred_fruit
## 1            <NA>
## 2           apple
## 3          orange
## 4          banana
\end{verbatim}

\textbf{Practice}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table3}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note the \texttt{sep} argument. You can specify how to separate joined values.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table3 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(rate,}
    \DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\StringTok{"cases"}\NormalTok{, }\StringTok{"population"}\NormalTok{),}
    \DataTypeTok{sep =} \StringTok{"/"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note the \texttt{convert} argument. You can specify whether to automatically convert the new values or not.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table3 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(rate,}
    \DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\StringTok{"cases"}\NormalTok{, }\StringTok{"population"}\NormalTok{),}
    \DataTypeTok{sep =} \StringTok{"/"}\NormalTok{,}
    \DataTypeTok{convert =} \OtherTok{TRUE}
\NormalTok{  ) }\CommentTok{\# cases and population become integers}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Unite
\end{itemize}

\texttt{pivot\_longer()} \textless-\textgreater{} \texttt{pivot\_wider()}

\texttt{separate()} \textless-\textgreater{} \texttt{unite()}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a toy example}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{name =} \KeywordTok{c}\NormalTok{(}\StringTok{"Jae"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{, }\OtherTok{NA}\NormalTok{),}
  \DataTypeTok{birthmonth =} \KeywordTok{c}\NormalTok{(}\StringTok{"April"}\NormalTok{, }\StringTok{"April"}\NormalTok{, }\StringTok{"June"}\NormalTok{, }\OtherTok{NA}\NormalTok{)}
\NormalTok{)}

\CommentTok{\# Include missing values}
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{unite}\NormalTok{(}
  \StringTok{"contact"}\NormalTok{,}
  \KeywordTok{c}\NormalTok{(}\StringTok{"name"}\NormalTok{, }\StringTok{"birthmonth"}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##     contact
## 1 Jae_April
## 2 Sun_April
## 3 Jane_June
## 4     NA_NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Do not include missing values}
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{unite}\NormalTok{(}\StringTok{"contact"}\NormalTok{,}
  \KeywordTok{c}\NormalTok{(}\StringTok{"name"}\NormalTok{, }\StringTok{"birthmonth"}\NormalTok{),}
  \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##     contact
## 1 Jae_April
## 2 Sun_April
## 3 Jane_June
## 4
\end{verbatim}

\hypertarget{filling-tbd}{%
\subsection{Filling (TBD)}\label{filling-tbd}}

\hypertarget{dplyr}{%
\section{dplyr}\label{dplyr}}

dplyr is better than the base R approaches to data processing:

\begin{itemize}
\tightlist
\item
  fast to run (due to the C++ backend) and intuitive to type
\item
  works well with tidy data and databases
\end{itemize}

\hypertarget{rearranging}{%
\subsection{Rearranging}\label{rearranging}}

\begin{itemize}
\item
  Arrange
\item
  Order rows
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{arrange}\NormalTok{(mtcars, mpg) }\CommentTok{\# Low to High (default)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{arrange}\NormalTok{(mtcars, }\KeywordTok{desc}\NormalTok{(mpg)) }\CommentTok{\# High to Low}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
\end{verbatim}

\begin{itemize}
\item
  Rename
\item
  Rename columns
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\DecValTok{2011}\NormalTok{, }\DecValTok{2012}\NormalTok{, }\DecValTok{2013}\NormalTok{))}

\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{rename}\NormalTok{(}
    \DataTypeTok{Year =} \CommentTok{\# NEW name}
\NormalTok{    y}
\NormalTok{  ) }\CommentTok{\# OLD name}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 1
##    Year
##   <dbl>
## 1  2011
## 2  2012
## 3  2013
\end{verbatim}

\hypertarget{subset-observations-rows}{%
\subsection{Subset observations (rows)}\label{subset-observations-rows}}

\begin{itemize}
\item
  Choose row by logical condition
\item
  Single condition
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(gender }\OperatorTok{==}\StringTok{ "feminine"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(height))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 17 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Taun~    213  NA   none       grey       black             NA fema~ femin~
##  2 Adi ~    184  50   none       dark       blue              NA fema~ femin~
##  3 Ayla~    178  55   none       blue       hazel             48 fema~ femin~
##  4 Shaa~    178  57   none       red, blue~ black             NA fema~ femin~
##  5 Lumi~    170  56.2 black      yellow     blue              58 fema~ femin~
##  6 Zam ~    168  55   blonde     fair, gre~ yellow            NA fema~ femin~
##  7 Joca~    167  NA   white      fair       blue              NA fema~ femin~
##  8 Barr~    166  50   black      yellow     blue              40 fema~ femin~
##  9 Beru~    165  75   brown      light      blue              47 fema~ femin~
## 10 Dormé    165  NA   brown      light      brown             NA fema~ femin~
## 11 Padm~    165  45   brown      light      brown             46 fema~ femin~
## 12 Shmi~    163  NA   black      fair       brown             72 fema~ femin~
## 13 Cordé    157  NA   brown      light      brown             NA fema~ femin~
## 14 Leia~    150  49   brown      light      brown             19 fema~ femin~
## 15 Mon ~    150  NA   auburn     fair       blue              48 fema~ femin~
## 16 R4-P~     96  NA   none       silver, r~ red, blue         NA none  femin~
## 17 Rey       NA  NA   brown      light      hazel             NA fema~ femin~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

The following filtering example was inspired by \href{https://suzan.rbind.io/2018/02/dplyr-tutorial-3/}{Suzan Baert's dplyr blog post}.

\begin{itemize}
\tightlist
\item
  Multiple conditions (numeric)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# First example}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(height }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{180}\NormalTok{, height }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{160}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nrow}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 24
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Same as above}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(height }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{180} \OperatorTok{\&}\StringTok{ }\NormalTok{height }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{160}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nrow}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 24
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Not same as above}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(height }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{180} \OperatorTok{|}\StringTok{ }\NormalTok{height }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{160}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nrow}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 81
\end{verbatim}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{(\arabic{enumi})}
\tightlist
\item
  Use \texttt{filter(between())} to find characters whose heights are between 160 and 180 and (2) count the number of these observations.
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Minimal reproducible example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{heights =} \KeywordTok{c}\NormalTok{(}\DecValTok{160}\OperatorTok{:}\DecValTok{180}\NormalTok{),}
  \DataTypeTok{char =} \KeywordTok{rep}\NormalTok{(}\StringTok{"none"}\NormalTok{, }\KeywordTok{length}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\DecValTok{160}\OperatorTok{:}\DecValTok{180}\NormalTok{)))}
\NormalTok{)}

\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(}\KeywordTok{between}\NormalTok{(heights, }\DecValTok{161}\NormalTok{, }\DecValTok{179}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 19 x 2
##    heights char 
##      <int> <chr>
##  1     161 none 
##  2     162 none 
##  3     163 none 
##  4     164 none 
##  5     165 none 
##  6     166 none 
##  7     167 none 
##  8     168 none 
##  9     169 none 
## 10     170 none 
## 11     171 none 
## 12     172 none 
## 13     173 none 
## 14     174 none 
## 15     175 none 
## 16     176 none 
## 17     177 none 
## 18     178 none 
## 19     179 none
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Multiple conditions (character)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Filter names that include ars; \textasciigrave{}grepl\textasciigrave{} is a base R function}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(}\KeywordTok{grepl}\NormalTok{(}\StringTok{"ars"}\NormalTok{, }\KeywordTok{tolower}\NormalTok{(name)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 4 x 14
##   name  height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Owen~    178   120 brown, gr~ light      blue              52 male  mascu~
## 2 Beru~    165    75 brown      light      blue              47 fema~ femin~
## 3 Quar~    183    NA black      dark       brown             62 <NA>  <NA>  
## 4 Clie~    183    NA brown      fair       blue              82 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Or, if you prefer the tidyverse way (str\_detect is from stringr)}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(}\KeywordTok{str\_detect}\NormalTok{(}\KeywordTok{tolower}\NormalTok{(name), }\StringTok{"ars"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 4 x 14
##   name  height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Owen~    178   120 brown, gr~ light      blue              52 male  mascu~
## 2 Beru~    165    75 brown      light      blue              47 fema~ femin~
## 3 Quar~    183    NA black      dark       brown             62 <NA>  <NA>  
## 4 Clie~    183    NA brown      fair       blue              82 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Filter brown and black hair\_color}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(hair\_color }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"black"}\NormalTok{, }\StringTok{"brown"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 31 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Leia~    150  49   brown      light      brown           19   fema~ femin~
##  2 Beru~    165  75   brown      light      blue            47   fema~ femin~
##  3 Bigg~    183  84   black      light      brown           24   male  mascu~
##  4 Chew~    228 112   brown      unknown    blue           200   male  mascu~
##  5 Han ~    180  80   brown      fair       brown           29   male  mascu~
##  6 Wedg~    170  77   brown      fair       hazel           21   male  mascu~
##  7 Jek ~    180 110   brown      fair       blue            NA   male  mascu~
##  8 Boba~    183  78.2 black      fair       brown           31.5 male  mascu~
##  9 Land~    177  79   black      dark       brown           31   male  mascu~
## 10 Arve~     NA  NA   brown      fair       brown           NA   male  mascu~
## # ... with 21 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>
\end{verbatim}

\textbf{Challenge}

Use \texttt{str\_detect()} to find characters whose names include ``Han''.

\begin{itemize}
\tightlist
\item
  Choose row by position (row index)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(height)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{6}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 14
##   name  height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Yara~    264    NA none       white      yellow            NA male  mascu~
## 2 Tarf~    234   136 brown      brown      blue              NA male  mascu~
## 3 Lama~    229    88 none       grey       black             NA male  mascu~
## 4 Chew~    228   112 brown      unknown    blue             200 male  mascu~
## 5 Roos~    224    82 none       grey       orange            NA male  mascu~
## 6 Grie~    216   159 none       brown, wh~ green, y~         NA male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Sample by fraction
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# For reproducibility}
\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)}

\CommentTok{\# Old way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{sample\_frac}\NormalTok{(}\FloatTok{0.10}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  ) }\CommentTok{\# Without replacement}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 9 x 14
##   name  height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Arve~     NA    NA brown      fair       brown           NA   male  mascu~
## 2 Sly ~    178    48 none       pale       white           NA   <NA>  <NA>  
## 3 IG-88    200   140 none       metal      red             15   none  mascu~
## 4 Bigg~    183    84 black      light      brown           24   male  mascu~
## 5 Leia~    150    49 brown      light      brown           19   fema~ femin~
## 6 Watto    137    NA black      blue, grey yellow          NA   male  mascu~
## 7 Jabb~    175  1358 <NA>       green-tan~ orange         600   herm~ mascu~
## 8 Dart~    202   136 none       white      yellow          41.9 male  mascu~
## 9 Taun~    213    NA none       grey       black           NA   fema~ femin~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice\_sample}\NormalTok{(}
    \DataTypeTok{prop =} \FloatTok{0.10}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 8 x 14
##   name  height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Raym~    188  79   brown      light      brown           NA   male  mascu~
## 2 Tarf~    234 136   brown      brown      blue            NA   male  mascu~
## 3 Han ~    180  80   brown      fair       brown           29   male  mascu~
## 4 Mas ~    196  NA   none       blue       blue            NA   male  mascu~
## 5 Barr~    166  50   black      yellow     blue            40   fema~ femin~
## 6 Dart~    202 136   none       white      yellow          41.9 male  mascu~
## 7 Finn      NA  NA   black      dark       dark            NA   male  mascu~
## 8 Boba~    183  78.2 black      fair       brown           31.5 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Sample by number
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{sample\_n}\NormalTok{(}\DecValTok{20}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  ) }\CommentTok{\# Without replacement}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 20 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Quar~    183    NA black      dark       brown             62 <NA>  <NA>  
##  2 Poe ~     NA    NA brown      light      brown             NA male  mascu~
##  3 Mas ~    196    NA none       blue       blue              NA male  mascu~
##  4 Zam ~    168    55 blonde     fair, gre~ yellow            NA fema~ femin~
##  5 Leia~    150    49 brown      light      brown             19 fema~ femin~
##  6 Jang~    183    79 black      tan        brown             66 male  mascu~
##  7 Ben ~    163    65 none       grey, gre~ orange            NA male  mascu~
##  8 Padm~    165    45 brown      light      brown             46 fema~ femin~
##  9 Mace~    188    84 none       dark       brown             72 male  mascu~
## 10 R2-D2     96    32 <NA>       white, bl~ red               33 none  mascu~
## 11 Shmi~    163    NA black      fair       brown             72 fema~ femin~
## 12 Ratt~     79    15 none       grey, blue unknown           NA male  mascu~
## 13 Nute~    191    90 none       mottled g~ red               NA male  mascu~
## 14 Dart~    175    80 none       red        yellow            54 male  mascu~
## 15 Bib ~    180    NA none       pale       pink              NA male  mascu~
## 16 C-3PO    167    75 <NA>       gold       yellow           112 none  mascu~
## 17 Yara~    264    NA none       white      yellow            NA male  mascu~
## 18 Ki-A~    198    82 white      pale       yellow            92 male  mascu~
## 19 BB8       NA    NA none       none       black             NA none  mascu~
## 20 Eeth~    171    NA black      brown      brown             NA male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice\_sample}\NormalTok{(}
    \DataTypeTok{n =} \DecValTok{20}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  ) }\CommentTok{\# Without replacement}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 20 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Owen~    178   120 brown, gr~ light      blue              52 male  mascu~
##  2 Ki-A~    198    82 white      pale       yellow            92 male  mascu~
##  3 Capt~     NA    NA unknown    unknown    unknown           NA <NA>  <NA>  
##  4 Greg~    185    85 black      dark       brown             NA male  mascu~
##  5 R5-D4     97    32 <NA>       white, red red               NA none  mascu~
##  6 Ackb~    180    83 none       brown mot~ orange            41 male  mascu~
##  7 Wedg~    170    77 brown      fair       hazel             21 male  mascu~
##  8 Dormé    165    NA brown      light      brown             NA fema~ femin~
##  9 Rey       NA    NA brown      light      hazel             NA fema~ femin~
## 10 IG-88    200   140 none       metal      red               15 none  mascu~
## 11 Roos~    224    82 none       grey       orange            NA male  mascu~
## 12 Shmi~    163    NA black      fair       brown             72 fema~ femin~
## 13 R2-D2     96    32 <NA>       white, bl~ red               33 none  mascu~
## 14 Poe ~     NA    NA brown      light      brown             NA male  mascu~
## 15 Obi-~    182    77 auburn, w~ fair       blue-gray         57 male  mascu~
## 16 Plo ~    188    80 none       orange     black             22 male  mascu~
## 17 Tarf~    234   136 brown      brown      blue              NA male  mascu~
## 18 Lobot    175    79 none       light      blue              37 male  mascu~
## 19 San ~    191    NA none       grey       gold              NA male  mascu~
## 20 Kit ~    196    87 none       green      black             NA male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Top 10 rows ordered by height
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{, height)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 10 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Dart~    202   136 none       white      yellow          41.9 male  mascu~
##  2 Chew~    228   112 brown      unknown    blue           200   male  mascu~
##  3 Roos~    224    82 none       grey       orange          NA   male  mascu~
##  4 Rugo~    206    NA none       green      orange          NA   male  mascu~
##  5 Yara~    264    NA none       white      yellow          NA   male  mascu~
##  6 Lama~    229    88 none       grey       black           NA   male  mascu~
##  7 Taun~    213    NA none       grey       black           NA   fema~ femin~
##  8 Grie~    216   159 none       brown, wh~ green, y~       NA   male  mascu~
##  9 Tarf~    234   136 brown      brown      blue            NA   male  mascu~
## 10 Tion~    206    80 none       grey       black           NA   male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice\_max}\NormalTok{(height, }\DataTypeTok{n =} \DecValTok{10}\NormalTok{) }\CommentTok{\# Variable first, Argument second}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 10 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Yara~    264    NA none       white      yellow          NA   male  mascu~
##  2 Tarf~    234   136 brown      brown      blue            NA   male  mascu~
##  3 Lama~    229    88 none       grey       black           NA   male  mascu~
##  4 Chew~    228   112 brown      unknown    blue           200   male  mascu~
##  5 Roos~    224    82 none       grey       orange          NA   male  mascu~
##  6 Grie~    216   159 none       brown, wh~ green, y~       NA   male  mascu~
##  7 Taun~    213    NA none       grey       black           NA   fema~ femin~
##  8 Rugo~    206    NA none       green      orange          NA   male  mascu~
##  9 Tion~    206    80 none       grey       black           NA   male  mascu~
## 10 Dart~    202   136 none       white      yellow          41.9 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\hypertarget{subset-variables-columns}{%
\subsection{Subset variables (columns)}\label{subset-variables-columns}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{names}\NormalTok{(msleep)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select only numeric columns
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Only numeric}
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{where}\NormalTok{(is.numeric))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 6
##    sleep_total sleep_rem sleep_cycle awake  brainwt  bodywt
##          <dbl>     <dbl>       <dbl> <dbl>    <dbl>   <dbl>
##  1        12.1      NA        NA      11.9 NA        50    
##  2        17         1.8      NA       7    0.0155    0.48 
##  3        14.4       2.4      NA       9.6 NA         1.35 
##  4        14.9       2.3       0.133   9.1  0.00029   0.019
##  5         4         0.7       0.667  20    0.423   600    
##  6        14.4       2.2       0.767   9.6 NA         3.85 
##  7         8.7       1.4       0.383  15.3 NA        20.5  
##  8         7        NA        NA      17   NA         0.045
##  9        10.1       2.9       0.333  13.9  0.07     14    
## 10         3        NA        NA      21    0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\textbf{Challenge}

Use \texttt{select(where())} to find only non-numeric columns.

\begin{itemize}
\tightlist
\item
  Select the columns that include ``sleep'' in their names
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"sleep"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 3
##    sleep_total sleep_rem sleep_cycle
##          <dbl>     <dbl>       <dbl>
##  1        12.1      NA        NA    
##  2        17         1.8      NA    
##  3        14.4       2.4      NA    
##  4        14.9       2.3       0.133
##  5         4         0.7       0.667
##  6        14.4       2.2       0.767
##  7         8.7       1.4       0.383
##  8         7        NA        NA    
##  9        10.1       2.9       0.333
## 10         3        NA        NA    
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\item
  Select the columns that include either ``sleep'' or ``wt'' in their names
\item
  Base R way
\end{itemize}

\texttt{grepl()} is one of base R's pattern-matching functions.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep[}\KeywordTok{grepl}\NormalTok{(}\StringTok{"sleep|wt"}\NormalTok{, }\KeywordTok{names}\NormalTok{(msleep))]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 5
##    sleep_total sleep_rem sleep_cycle  brainwt  bodywt
##          <dbl>     <dbl>       <dbl>    <dbl>   <dbl>
##  1        12.1      NA        NA     NA        50    
##  2        17         1.8      NA      0.0155    0.48 
##  3        14.4       2.4      NA     NA         1.35 
##  4        14.9       2.3       0.133  0.00029   0.019
##  5         4         0.7       0.667  0.423   600    
##  6        14.4       2.2       0.767 NA         3.85 
##  7         8.7       1.4       0.383 NA        20.5  
##  8         7        NA        NA     NA         0.045
##  9        10.1       2.9       0.333  0.07     14    
## 10         3        NA        NA      0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\textbf{Challenge}

Use \texttt{select(matches())} to find columns whose names include either ``sleep'' or ``wt''.

\begin{itemize}
\tightlist
\item
  Select the columns that start with ``b''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{starts\_with}\NormalTok{(}\StringTok{"b"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##     brainwt  bodywt
##       <dbl>   <dbl>
##  1 NA        50    
##  2  0.0155    0.48 
##  3 NA         1.35 
##  4  0.00029   0.019
##  5  0.423   600    
##  6 NA         3.85 
##  7 NA        20.5  
##  8 NA         0.045
##  9  0.07     14    
## 10  0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select the columns that end with ``wt''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{ends\_with}\NormalTok{(}\StringTok{"wt"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##     brainwt  bodywt
##       <dbl>   <dbl>
##  1 NA        50    
##  2  0.0155    0.48 
##  3 NA         1.35 
##  4  0.00029   0.019
##  5  0.423   600    
##  6 NA         3.85 
##  7 NA        20.5  
##  8 NA         0.045
##  9  0.07     14    
## 10  0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select the columns using both beginning and end string patterns
\end{itemize}

The key idea is that you can use Boolean operators (\texttt{!}, \texttt{\&}, \texttt{\textbar{}}) to combine different string pattern matching statements.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{starts\_with}\NormalTok{(}\StringTok{"b"}\NormalTok{) }\OperatorTok{\&}\StringTok{ }\KeywordTok{ends\_with}\NormalTok{(}\StringTok{"wt"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##     brainwt  bodywt
##       <dbl>   <dbl>
##  1 NA        50    
##  2  0.0155    0.48 
##  3 NA         1.35 
##  4  0.00029   0.019
##  5  0.423   600    
##  6 NA         3.85 
##  7 NA        20.5  
##  8 NA         0.045
##  9  0.07     14    
## 10  0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select order and move it before everything
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# By specifying a column}
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(order, }\KeywordTok{everything}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 11
##    order name  genus vore  conservation sleep_total sleep_rem sleep_cycle awake
##    <chr> <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Carn~ Chee~ Acin~ carni lc                  12.1      NA        NA      11.9
##  2 Prim~ Owl ~ Aotus omni  <NA>                17         1.8      NA       7  
##  3 Rode~ Moun~ Aplo~ herbi nt                  14.4       2.4      NA       9.6
##  4 Sori~ Grea~ Blar~ omni  lc                  14.9       2.3       0.133   9.1
##  5 Arti~ Cow   Bos   herbi domesticated         4         0.7       0.667  20  
##  6 Pilo~ Thre~ Brad~ herbi <NA>                14.4       2.2       0.767   9.6
##  7 Carn~ Nort~ Call~ carni vu                   8.7       1.4       0.383  15.3
##  8 Rode~ Vesp~ Calo~ <NA>  <NA>                 7        NA        NA      17  
##  9 Carn~ Dog   Canis carni domesticated        10.1       2.9       0.333  13.9
## 10 Arti~ Roe ~ Capr~ herbi lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select variables from a character vector.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{any\_of}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"name"}\NormalTok{, }\StringTok{"order"}\NormalTok{))) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{colnames}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "name"  "order"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select the variables named in the character + number pattern
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep}\OperatorTok{$}\NormalTok{week8 \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{msleep}\OperatorTok{$}\NormalTok{week12 \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{msleep}\OperatorTok{$}\NormalTok{week\_extra \textless{}{-}}\StringTok{ }\DecValTok{0}

\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{num\_range}\NormalTok{(}\StringTok{"week"}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{12}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##    week8 week12
##    <lgl> <lgl> 
##  1 NA    NA    
##  2 NA    NA    
##  3 NA    NA    
##  4 NA    NA    
##  5 NA    NA    
##  6 NA    NA    
##  7 NA    NA    
##  8 NA    NA    
##  9 NA    NA    
## 10 NA    NA    
## # ... with 73 more rows
\end{verbatim}

\hypertarget{create-variables-tbd}{%
\subsection{Create variables (TBD)}\label{create-variables-tbd}}

\hypertarget{rename-variables-tbd}{%
\subsection{Rename variables (TBD)}\label{rename-variables-tbd}}

\hypertarget{clean-names-tbd}{%
\subsubsection{Clean names (TBD)}\label{clean-names-tbd}}

\hypertarget{recode-values-tbd}{%
\subsection{Recode values (TBD)}\label{recode-values-tbd}}

\hypertarget{counting}{%
\subsection{Counting}\label{counting}}

\begin{itemize}
\tightlist
\item
  How many observations (country-year rows) are there in each continent?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Let's arrange the result.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Just add a new argument \textasciigrave{}sort = TRUE\textasciigrave{}}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(continent, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Asia        396
## 3 Europe      360
## 4 Americas    300
## 5 Oceania      24
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Same as above; How nice!}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(n))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Asia        396
## 3 Europe      360
## 4 Americas    300
## 5 Oceania      24
\end{verbatim}

\textbf{Challenge}

Count the number of observations per \texttt{continent} as well as \texttt{year} and arrange them in descending order.

Let's take a deeper look at how things work under the hood.

\begin{itemize}
\item
  \texttt{tally()} works similarly to \texttt{nrow()}: it calculates the total number of cases in a data frame
\item
  \texttt{count()} = \texttt{group\_by()} + \texttt{tally()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tally}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 1
##       n
##   <int>
## 1  1704
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{add\_tally()} = \texttt{mutate(n\ =\ n())}
\end{itemize}

\textbf{Challenge}

What does \texttt{n} in the example below represent?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_tally}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1,704 x 3
##    continent country         n
##    <fct>     <fct>       <int>
##  1 Asia      Afghanistan  1704
##  2 Asia      Afghanistan  1704
##  3 Asia      Afghanistan  1704
##  4 Asia      Afghanistan  1704
##  5 Asia      Afghanistan  1704
##  6 Asia      Afghanistan  1704
##  7 Asia      Afghanistan  1704
##  8 Asia      Afghanistan  1704
##  9 Asia      Afghanistan  1704
## 10 Asia      Afghanistan  1704
## # ... with 1,694 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{add\_count}
\end{itemize}

Add count as a column

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add count as a column}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_count}\NormalTok{(year)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1,704 x 7
## # Groups:   continent [5]
##    country     continent  year lifeExp      pop gdpPercap     n
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl> <int>
##  1 Afghanistan Asia       1952    28.8  8425333      779.    33
##  2 Afghanistan Asia       1957    30.3  9240934      821.    33
##  3 Afghanistan Asia       1962    32.0 10267083      853.    33
##  4 Afghanistan Asia       1967    34.0 11537966      836.    33
##  5 Afghanistan Asia       1972    36.1 13079460      740.    33
##  6 Afghanistan Asia       1977    38.4 14880372      786.    33
##  7 Afghanistan Asia       1982    39.9 12881816      978.    33
##  8 Afghanistan Asia       1987    40.8 13867957      852.    33
##  9 Afghanistan Asia       1992    41.7 16317921      649.    33
## 10 Afghanistan Asia       1997    41.8 22227415      635.    33
## # ... with 1,694 more rows
\end{verbatim}

\textbf{Challenge}

Do cases 1 and 2 in the code chunks below produce the same output? If so, why?

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Case 1}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 60 x 3
## # Groups:   continent, year [60]
##    continent  year     n
##    <fct>     <int> <int>
##  1 Africa     1952    52
##  2 Africa     1957    52
##  3 Africa     1962    52
##  4 Africa     1967    52
##  5 Africa     1972    52
##  6 Africa     1977    52
##  7 Africa     1982    52
##  8 Africa     1987    52
##  9 Africa     1992    52
## 10 Africa     1997    52
## # ... with 50 more rows
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Case 2}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(year)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 60 x 3
## # Groups:   continent [5]
##    continent  year     n
##    <fct>     <int> <int>
##  1 Africa     1952    52
##  2 Africa     1957    52
##  3 Africa     1962    52
##  4 Africa     1967    52
##  5 Africa     1972    52
##  6 Africa     1977    52
##  7 Africa     1982    52
##  8 Africa     1987    52
##  9 Africa     1992    52
## 10 Africa     1997    52
## # ... with 50 more rows
\end{verbatim}

\texttt{count()} is a simple function, but it is still helpful for learning a very important concept underlying complex data wrangling: the split-apply-combine strategy. For more information, read Wickham's (2011) article \href{http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.182.5667\&rep=rep1\&type=pdf}{``The Split-Apply-Combine Strategy for Data Analysis''}, published in the \emph{Journal of Statistical Software} (especially pages 7--8). \href{https://github.com/hadley/plyr}{\texttt{plyr}} was the (now retired) package that demonstrated this idea, which has since evolved in two directions: \href{https://dplyr.tidyverse.org/}{dplyr} (for data frames) and \href{https://purrr.tidyverse.org/}{purrr} (for lists).

\hypertarget{summarizing}{%
\subsection{Summarizing}\label{summarizing}}

\hypertarget{basic}{%
\subsubsection{Basic}\label{basic}}

\begin{itemize}
\tightlist
\item
  Create a summary
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{n =} \KeywordTok{n}\NormalTok{(),}
    \DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{sd\_gdp =} \KeywordTok{sd}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 4
##   continent     n mean_gdp sd_gdp
##   <fct>     <int>    <dbl>  <dbl>
## 1 Africa      624    2194.  2828.
## 2 Americas    300    7136.  6397.
## 3 Asia        396    7902. 14045.
## 4 Europe      360   14469.  9355.
## 5 Oceania      24   18622.  6359.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tablea \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{n =} \KeywordTok{n}\NormalTok{(),}
    \DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{sd\_gdp =} \KeywordTok{sd}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Produce publishable tables
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# For HTML and LaTeX}
\NormalTok{tablea }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\NormalTok{kableExtra}\OperatorTok{::}\KeywordTok{kable}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{l|r|r|r}
\hline
continent & n & mean\_gdp & sd\_gdp\\
\hline
Africa & 624 & 2193.755 & 2827.930\\
\hline
Americas & 300 & 7136.110 & 6396.764\\
\hline
Asia & 396 & 7902.150 & 14045.373\\
\hline
Europe & 360 & 14469.476 & 9355.213\\
\hline
Oceania & 24 & 18621.609 & 6358.983\\
\hline
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# For HTML and MS Office suite}
\NormalTok{tablea }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\NormalTok{flextable}\OperatorTok{::}\KeywordTok{flextable}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics[width=3.00in,height=1.50in,keepaspectratio]{03_tidy_data_files/figure-latex/unnamed-chunk-128-1.png}

\hypertarget{scoped-summaries}{%
\subsubsection{Scoped summaries}\label{scoped-summaries}}

\begin{itemize}
\item
  Old way
\item
  \texttt{summarise\_all()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a wide{-}shaped data example}
\NormalTok{wide\_gapminder \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_wider}\NormalTok{(}
    \DataTypeTok{names\_from =}\NormalTok{ country,}
    \DataTypeTok{values\_from =}\NormalTok{ gdpPercap}
\NormalTok{  )}

\CommentTok{\# Apply summarise\_all}
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_all}\NormalTok{(mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Her~ Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>            <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.            3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>, `Slovak
## #   Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{summarise\_if()}: using a logical condition
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_if}\NormalTok{(is.double, mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 31
##   lifeExp Albania Austria Belgium `Bosnia and Her~ Bulgaria Croatia
##     <dbl>   <dbl>   <dbl>   <dbl>            <dbl>    <dbl>   <dbl>
## 1    71.9   3255.  20412.  19901.            3485.    6384.   9332.
## # ... with 24 more variables: `Czech Republic` <dbl>, Denmark <dbl>,
## #   Finland <dbl>, France <dbl>, Germany <dbl>, Greece <dbl>, Hungary <dbl>,
## #   Iceland <dbl>, Ireland <dbl>, Italy <dbl>, Montenegro <dbl>,
## #   Netherlands <dbl>, Norway <dbl>, Poland <dbl>, Portugal <dbl>,
## #   Romania <dbl>, Serbia <dbl>, `Slovak Republic` <dbl>, Slovenia <dbl>,
## #   Spain <dbl>, Sweden <dbl>, Switzerland <dbl>, Turkey <dbl>, `United
## #   Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\item
  \texttt{summarise\_at()}
\item
  \texttt{vars()\ =\ select()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_at}\NormalTok{(}\KeywordTok{vars}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{)),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Her~ Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>            <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.            3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>, `Slovak
## #   Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_at}\NormalTok{(}\KeywordTok{vars}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"life"}\NormalTok{)),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 1
##   lifeExp
##     <dbl>
## 1    71.9
\end{verbatim}

\begin{itemize}
\item
  New way
\item
  \texttt{summarise()} + \texttt{across()}
\item
  If you find using \texttt{summarise\_all()}, \texttt{summarise\_if()} and \texttt{summarise\_at()} confusing, here's a solution: use \texttt{summarise()} with \texttt{across()}.
\item
  \texttt{summarise\_all()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(Albania}\OperatorTok{:}\StringTok{\textasciigrave{}}\DataTypeTok{United Kingdom}\StringTok{\textasciigrave{}}\NormalTok{, mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Her~ Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>            <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.            3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>, `Slovak
## #   Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{), mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Her~ Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>            <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.            3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>, `Slovak
## #   Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{summarise\_if()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(is.double, mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Predicate functions must be wrapped in `where()`.
## 
##   # Bad
##   data %>% select(is.double)
## 
##   # Good
##   data %>% select(where(is.double))
## 
## i Please update your code.
## This message is displayed once per session.
\end{verbatim}

\begin{verbatim}
## # A tibble: 1 x 31
##   lifeExp Albania Austria Belgium `Bosnia and Her~ Bulgaria Croatia
##     <dbl>   <dbl>   <dbl>   <dbl>            <dbl>    <dbl>   <dbl>
## 1    71.9   3255.  20412.  19901.            3485.    6384.   9332.
## # ... with 24 more variables: `Czech Republic` <dbl>, Denmark <dbl>,
## #   Finland <dbl>, France <dbl>, Germany <dbl>, Greece <dbl>, Hungary <dbl>,
## #   Iceland <dbl>, Ireland <dbl>, Italy <dbl>, Montenegro <dbl>,
## #   Netherlands <dbl>, Norway <dbl>, Poland <dbl>, Portugal <dbl>,
## #   Romania <dbl>, Serbia <dbl>, `Slovak Republic` <dbl>, Slovenia <dbl>,
## #   Spain <dbl>, Sweden <dbl>, Switzerland <dbl>, Turkey <dbl>, `United
## #   Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{summarise\_at()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  ))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Her~ Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>            <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.            3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>, `Slovak
## #   Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"life"}\NormalTok{),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  ))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 1
##   lifeExp
##     <dbl>
## 1    71.9
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"A"}\NormalTok{, }\DataTypeTok{ignore.case =} \OtherTok{FALSE}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 360 x 2
##    Albania Austria
##      <dbl>   <dbl>
##  1   1601.      NA
##  2   1942.      NA
##  3   2313.      NA
##  4   2760.      NA
##  5   3313.      NA
##  6   3533.      NA
##  7   3631.      NA
##  8   3739.      NA
##  9   2497.      NA
## 10   3193.      NA
## # ... with 350 more rows
\end{verbatim}

Note that this workshop does not cover creating and manipulating variables using \texttt{mutate()} because many techniques you learned from playing with \texttt{summarise()} can be directly applied to \texttt{mutate()}.

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Summarize the average GDP of countries whose names start with the letter ``A''.
\item
  Turn the summary dataframe into a publishable table using either \texttt{kableExtra} or \texttt{flextable} package.
\end{enumerate}

\hypertarget{tabulation-tbd}{%
\subsubsection{Tabulation (TBD)}\label{tabulation-tbd}}

\hypertarget{grouping}{%
\subsection{Grouping}\label{grouping}}

\hypertarget{grouped-summaries}{%
\subsubsection{Grouped summaries}\label{grouped-summaries}}

\begin{itemize}
\item
  Calculate the mean of \texttt{gdpPercap}.
\item
  Some functions are designed to work together. For instance, the \texttt{group\_by()}
  function defines the strata that you're going to use for summary statistics. Then, use \texttt{summarise()} or \texttt{summarize()} to produce summary statistics.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent mean_gdp
##   <fct>        <dbl>
## 1 Africa       2194.
## 2 Americas     7136.
## 3 Asia         7902.
## 4 Europe      14469.
## 5 Oceania     18622.
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Calculate multiple summary statistics.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{count =} \KeywordTok{n}\NormalTok{()}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent mean_gdp count
##   <fct>        <dbl> <int>
## 1 Africa       2194.   624
## 2 Americas     7136.   300
## 3 Asia         7902.   396
## 4 Europe      14469.   360
## 5 Oceania     18622.    24
\end{verbatim}

\textbf{Optional}

\begin{itemize}
\tightlist
\item
  Other summary statistics
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Measures of location and spread: \texttt{median(x)} (location), \texttt{sd(x)}, \texttt{IQR(x)}, \texttt{mad(x)} (the median absolute deviation)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The Interquartile Range = The Difference Between 75th and 25th Percentiles}

\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{IQR\_gdp =} \KeywordTok{IQR}\NormalTok{(gdpPercap))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent IQR_gdp
##   <fct>       <dbl>
## 1 Africa      1616.
## 2 Americas    4402.
## 3 Asia        7492.
## 4 Europe     13248.
## 5 Oceania     8072.
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Measures of rank: \texttt{min(x)}, \texttt{quantile(x,\ 0.25)}, \texttt{max(x)}
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{min\_gdp =} \KeywordTok{min}\NormalTok{(gdpPercap),}
    \DataTypeTok{max\_gdp =} \KeywordTok{max}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent min_gdp max_gdp
##   <fct>       <dbl>   <dbl>
## 1 Africa       241.  21951.
## 2 Americas    1202.  42952.
## 3 Asia         331  113523.
## 4 Europe       974.  49357.
## 5 Oceania    10040.  34435.
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Measures of position: \texttt{first(x)}, \texttt{last(x)}, \texttt{nth(x,\ 2)}
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{first\_gdp =} \KeywordTok{first}\NormalTok{(gdpPercap),}
    \DataTypeTok{last\_gdp =} \KeywordTok{last}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent first_gdp last_gdp
##   <fct>         <dbl>    <dbl>
## 1 Africa        2449.     470.
## 2 Americas      5911.   11416.
## 3 Asia           779.    2281.
## 4 Europe        1601.   33203.
## 5 Oceania      10040.   25185.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(gdpPercap) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Adding arrange}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{first\_gdp =} \KeywordTok{first}\NormalTok{(gdpPercap),}
    \DataTypeTok{last\_gdp =} \KeywordTok{last}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent first_gdp last_gdp
##   <fct>         <dbl>    <dbl>
## 1 Africa         241.   21951.
## 2 Americas      1202.   42952.
## 3 Asia           331   113523.
## 4 Europe         974.   49357.
## 5 Oceania      10040.   34435.
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Measures of counts: \texttt{n()} (all rows), \texttt{sum(!is.na(x))} (only non-missing rows), \texttt{n\_distinct(x)} (only distinct values)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{ns =} \KeywordTok{n}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent    ns
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{4}
\tightlist
\item
  Counts and proportions of logical values: \texttt{sum(condition\ about\ x)} (the number of TRUEs in x), \texttt{mean(condition\ about\ x)} (the proportion of TRUEs in x)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{rich\_countries =} \KeywordTok{mean}\NormalTok{(gdpPercap }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{20000}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent rich_countries
##   <fct>              <dbl>
## 1 Africa           0.00481
## 2 Americas         0.05   
## 3 Asia             0.111  
## 4 Europe           0.261  
## 5 Oceania          0.333
\end{verbatim}

\textbf{Additional tips}

Also, check out window functions such as \texttt{cumsum()} and \texttt{lag()}. Window functions are a variant of aggregate functions that take a vector as input and return a vector of the same length as output.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{)}

\CommentTok{\# Typical aggregate function}
\KeywordTok{sum}\NormalTok{(vec) }\CommentTok{\# The output length is one}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 55
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Window function}
\KeywordTok{cumsum}\NormalTok{(vec) }\CommentTok{\# The output length is ten}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1  3  6 10 15 21 28 36 45 55
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Let\textquotesingle{}s compare them side{-}by{-}side}
\KeywordTok{compare}\NormalTok{(}
  \KeywordTok{sum}\NormalTok{(vec),}
  \KeywordTok{cumsum}\NormalTok{(vec)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `old`:                         55
## `new`: 1 3 6 10 15 21 28 36 45 55
\end{verbatim}

\hypertarget{joining}{%
\subsection{Joining}\label{joining}}

Relational data = multiple tables of data

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/245292d1ea724f6c3fd8a92063dcd7bfb9758d02/5751b/diagrams/relational-nycflights.png}
\caption{Relational data example}
\end{figure}

\textbf{Key ideas}

\begin{itemize}
\tightlist
\item
  A \textbf{primary key} ``uniquely identifies an observation in its own table''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Example}
\NormalTok{planes}\OperatorTok{$}\NormalTok{tailnum }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{head}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "N10156" "N102UW" "N103US" "N104UW" "N10575" "N105UW"
\end{verbatim}

Verify primary key

\texttt{tailnum} should be unique.

\textbf{Challenge}

What do you expect the outcome to be?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{planes }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(tailnum) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(n }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 0 x 2
## # ... with 2 variables: tailnum <chr>, n <int>
\end{verbatim}

\textbf{Optional}

If a dataframe doesn't have a primary key, you can add one, called a \textbf{surrogate} key.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Toy example}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\DecValTok{4}\OperatorTok{:}\DecValTok{6}\NormalTok{)}
\NormalTok{)}

\CommentTok{\# Add a row\_index column}
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{rowid\_to\_column}\NormalTok{(}\StringTok{"ID"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  A \textbf{foreign} key ``uniquely identifies an observation in another table.''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights}\OperatorTok{$}\NormalTok{tailnum }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{head}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "N14228" "N24211" "N619AA" "N804JB" "N668DN" "N39463"
\end{verbatim}

For joining, don't be distracted by other details and focus on KEYS!

\hypertarget{mutating-joins}{%
\subsubsection{Mutating joins}\label{mutating-joins}}

\begin{quote}
Add new variables to one data frame from matching observations in another.
\end{quote}

Using a simple toy example is great because it is easy to see how things work in such a narrow context.

\begin{itemize}
\tightlist
\item
  Toy example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Table 1}
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{key =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{),}
  \DataTypeTok{val\_x =} \KeywordTok{c}\NormalTok{(}\StringTok{"x1"}\NormalTok{, }\StringTok{"x2"}\NormalTok{, }\StringTok{"x3"}\NormalTok{, }\StringTok{"x4"}\NormalTok{)}
\NormalTok{)}

\CommentTok{\# Table 2}
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{key =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{5}\NormalTok{),}
  \DataTypeTok{val\_y =} \KeywordTok{c}\NormalTok{(}\StringTok{"y1"}\NormalTok{, }\StringTok{"y2"}\NormalTok{, }\StringTok{"y3"}\NormalTok{, }\StringTok{"y4"}\NormalTok{, }\StringTok{"y5"}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Inner Join
\end{itemize}

\texttt{inner\_join()} keeps the matched values in both tables. If the left table is a subset of the right table, then the result of \texttt{left\_join()} is the same as that of \texttt{inner\_join()}.

\textbf{Challenge}

What are going to be the shared keys?

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{inner\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 4 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4
\end{verbatim}

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/aeab386461820b029b7e7606ccff1286f623bae1/ef0d4/diagrams/join-venn.png}
\caption{Mutating joins}
\end{figure}

\begin{itemize}
\tightlist
\item
  Left Join
\end{itemize}

\texttt{left\_join()}, \texttt{right\_join()} and \texttt{full\_join()} are outer join functions. Unlike \texttt{inner\_join()}, outer join functions keep observations that appear in at least one of the tables.

\texttt{left\_join()} keeps all observations in the left table and only the matched observations in the right table.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{left\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 4 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Right Join
\end{itemize}

\texttt{right\_join()} does the opposite.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{right\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4   
## 5     5 <NA>  y5
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Full Join
\end{itemize}

\texttt{full\_join()} keeps all the observations from both tables. Where an observation has no match, NAs are filled in for the columns that come from the other table.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{full\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4   
## 5     5 <NA>  y5
\end{verbatim}

\hypertarget{filtering-joins}{%
\subsubsection{Filtering joins}\label{filtering-joins}}

\begin{quote}
Filter observations from one data frame based on whether or not they match an observation in the other table.
\end{quote}

\begin{itemize}
\tightlist
\item
  Semi Join
\end{itemize}

In SQL, this type of query is also called a subquery.

\begin{itemize}
\tightlist
\item
  Filtering without joining
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create the list of the top 10 destinations}
\NormalTok{top\_dest \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(dest, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Selecting by n
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Filter}
\NormalTok{filtered \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(dest }\OperatorTok{\%in\%}\StringTok{ }\NormalTok{top\_dest}\OperatorTok{$}\NormalTok{dest)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Using semi join: only keep (INCLUDE) the rows that were matched between the two tables
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{joined \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{semi\_join}\NormalTok{(top\_dest)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "dest"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{head}\NormalTok{(filtered }\OperatorTok{==}\StringTok{ }\NormalTok{joined)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      year month  day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## [1,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [2,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [3,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [4,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [5,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [6,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
##      arr_delay carrier flight tailnum origin dest air_time distance hour minute
## [1,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [2,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [3,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [4,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [5,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [6,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
##      time_hour
## [1,]      TRUE
## [2,]      TRUE
## [3,]      TRUE
## [4,]      TRUE
## [5,]      TRUE
## [6,]      TRUE
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Anti Join
\end{itemize}

\texttt{anti\_join()} does the opposite: it excludes the rows that were matched between the two tables. This is a great technique for filtering out stop words when you do computational text analysis.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{anti\_join}\NormalTok{(planes, }\DataTypeTok{by =} \StringTok{"tailnum"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(tailnum, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 722 x 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # ... with 712 more rows
\end{verbatim}

\hypertarget{broom}{%
\section{broom}\label{broom}}

\hypertarget{nesting}{%
\subsection{Nesting}\label{nesting}}

\hypertarget{nest}{%
\subsubsection{nest}\label{nest}}

The following example comes from \href{https://r4ds.had.co.nz/many-models.html}{R for Data Science} by Garrett Grolemund and Hadley Wickham.

\begin{itemize}
\tightlist
\item
  How can you run multiple models simultaneously? Using a nested data frame.
\end{itemize}

\begin{itemize}
\item
  \textbf{Grouped data: each row = an observation}
\item
  \textbf{Nested data: each row = a group}
\end{itemize}

\textbf{Challenge}

In the following example, why did we use \texttt{country} and \texttt{continent} as nesting variables?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nested \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nest}\NormalTok{()}

\KeywordTok{head}\NormalTok{(nested)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
## # Groups:   country, continent [6]
##   country     continent data             
##   <fct>       <fct>     <list>           
## 1 Afghanistan Asia      <tibble [12 x 4]>
## 2 Albania     Europe    <tibble [12 x 4]>
## 3 Algeria     Africa    <tibble [12 x 4]>
## 4 Angola      Africa    <tibble [12 x 4]>
## 5 Argentina   Americas  <tibble [12 x 4]>
## 6 Australia   Oceania   <tibble [12 x 4]>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nested}\OperatorTok{$}\NormalTok{data[[}\DecValTok{1}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 12 x 4
##     year lifeExp      pop gdpPercap
##    <int>   <dbl>    <int>     <dbl>
##  1  1952    28.8  8425333      779.
##  2  1957    30.3  9240934      821.
##  3  1962    32.0 10267083      853.
##  4  1967    34.0 11537966      836.
##  5  1972    36.1 13079460      740.
##  6  1977    38.4 14880372      786.
##  7  1982    39.9 12881816      978.
##  8  1987    40.8 13867957      852.
##  9  1992    41.7 16317921      649.
## 10  1997    41.8 22227415      635.
## 11  2002    42.1 25268405      727.
## 12  2007    43.8 31889923      975.
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Custom function
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{lm\_model \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(df) \{}
  \KeywordTok{lm}\NormalTok{(lifeExp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{year, }\DataTypeTok{data =}\NormalTok{ df)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Apply function to the nested data
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Apply lm\_model to the nested data}

\NormalTok{nested \textless{}{-}}\StringTok{ }\NormalTok{nested }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{models =} \KeywordTok{map}\NormalTok{(data, lm\_model)) }\CommentTok{\# Add the list object as a new column}

\KeywordTok{head}\NormalTok{(nested)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
## # Groups:   country, continent [6]
##   country     continent data              models
##   <fct>       <fct>     <list>            <list>
## 1 Afghanistan Asia      <tibble [12 x 4]> <lm>  
## 2 Albania     Europe    <tibble [12 x 4]> <lm>  
## 3 Algeria     Africa    <tibble [12 x 4]> <lm>  
## 4 Angola      Africa    <tibble [12 x 4]> <lm>  
## 5 Argentina   Americas  <tibble [12 x 4]> <lm>  
## 6 Australia   Oceania   <tibble [12 x 4]> <lm>
\end{verbatim}

S3 is part of R's object-oriented systems. If you need more information, check out \href{http://adv-r.had.co.nz/S3.html}{this section} in Hadley Wickham's Advanced R.

\hypertarget{unnest}{%
\subsubsection{unnest}\label{unnest}}

The \texttt{glance()} function from the \texttt{broom} package inspects the quality of a statistical model.

\textbf{Additional tips}

\begin{itemize}
\tightlist
\item
  \texttt{broom::glance(model)}: for evaluating model quality and/or complexity
\item
  \texttt{broom::tidy(model)}: for extracting each coefficient in the model (the estimates + its variability)
\item
  \texttt{broom::augment(model,\ data)}: for getting extra values (residuals, and influence statistics)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{glanced \textless{}{-}}\StringTok{ }\NormalTok{nested }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{glance =} \KeywordTok{map}\NormalTok{(models, broom}\OperatorTok{::}\NormalTok{glance))}

\NormalTok{glanced}\OperatorTok{$}\NormalTok{glance[[}\DecValTok{1}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.948         0.942  1.22      181. 9.84e-8     1  -18.3  42.7  44.1
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
\end{verbatim}

\texttt{unnest()} unpacks the list objects stored in the \texttt{glance} column.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{glanced }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest}\NormalTok{(glance) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(BIC) }\CommentTok{\# Low to High; Lower BIC indicates a better model fit}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 142 x 16
## # Groups:   country, continent [142]
##    country continent data  models r.squared adj.r.squared sigma statistic
##    <fct>   <fct>     <lis> <list>     <dbl>         <dbl> <dbl>     <dbl>
##  1 Sweden  Europe    <tib~ <lm>       0.995         0.995 0.212     2203.
##  2 Switze~ Europe    <tib~ <lm>       0.997         0.997 0.215     3823.
##  3 France  Europe    <tib~ <lm>       0.998         0.997 0.220     4200.
##  4 Canada  Americas  <tib~ <lm>       0.996         0.996 0.249     2757.
##  5 Argent~ Americas  <tib~ <lm>       0.996         0.995 0.292     2246.
##  6 Belgium Europe    <tib~ <lm>       0.995         0.994 0.293     1822.
##  7 Brazil  Americas  <tib~ <lm>       0.998         0.998 0.326     5111.
##  8 Equato~ Africa    <tib~ <lm>       0.997         0.997 0.329     3184.
##  9 Nether~ Europe    <tib~ <lm>       0.982         0.980 0.348      552.
## 10 Finland Europe    <tib~ <lm>       0.994         0.993 0.354     1613.
## # ... with 132 more rows, and 8 more variables: p.value <dbl>, df <dbl>,
## #   logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>,
## #   nobs <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{glanced }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest}\NormalTok{(glance) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(continent, BIC)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_jitter}\NormalTok{(}\DataTypeTok{width =} \FloatTok{0.5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-159-1.pdf}

\hypertarget{mapping}{%
\subsection{Mapping}\label{mapping}}

We have had a small taste of how the \texttt{map()} function works. Let's dig deeper, as this family of functions is really useful. For more information, see Rebecca Barter's wonderful tutorial on the \texttt{purrr} package. In her words, this is ``the tidyverse's answer to apply functions for iteration''. The \texttt{map()} function can take a vector (of any type), a list, or a data frame as input.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{multiply \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x) \{}
\NormalTok{  x }\OperatorTok{*}\StringTok{ }\NormalTok{x}
\NormalTok{\}}

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}
  \DataTypeTok{first\_obs =} \KeywordTok{rnorm}\NormalTok{(}\DecValTok{7}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DataTypeTok{sd =} \DecValTok{1}\NormalTok{),}
  \DataTypeTok{second\_obs =} \KeywordTok{rnorm}\NormalTok{(}\DecValTok{7}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DataTypeTok{sd =} \DecValTok{2}\NormalTok{)}
\NormalTok{) }\CommentTok{\# normal distribution}
\end{Highlighting}
\end{Shaded}

\textbf{Challenge}

Try \texttt{map\_df(.x\ =\ df,\ .f\ =\ multiply)} and tell me what's the difference between the output you got and what you saw earlier.

If you want to know more about the power and joy of functional programming in R (e.g., \texttt{purrr::map()}), then please take the \href{https://github.com/dlab-berkeley/R-functional-programming}{``How to Automate Repeated Things in R''} workshop.

\hypertarget{ggplot2}{%
\section{ggplot2}\label{ggplot2}}

\begin{itemize}
\item
  The following material is adapted from Kieran Healy's wonderful book (2019) on \href{https://socviz.co/}{data visualization} and Hadley Wickham's equally wonderful book on \href{https://link.springer.com/content/pdf/10.1007\%2F978-0-387-98141-3.pdf}{ggplot2}. For more theoretical discussions, I recommend reading \href{https://link.springer.com/book/10.1007\%2F0-387-28695-0}{The Grammar of Graphics} by Leland Wilkinson.
\item
  Why should we care about data visualization? More precisely, why should we learn the grammar of statistical graphics?
\item
  Sometimes, pictures are better tools than words in 1) exploring, 2) understanding, and 3) explaining data.
\end{itemize}

\hypertarget{motivation-2}{%
\subsection{Motivation}\label{motivation-2}}

\href{https://en.wikipedia.org/wiki/Frank_Anscombe}{Anscombe}'s quartet comprises four datasets, which are very alike in terms of their descriptive statistics but quite different when presented graphically.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Set theme}
\KeywordTok{theme\_set}\NormalTok{(}\KeywordTok{theme\_minimal}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Data}
\NormalTok{anscombe}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    x1 x2 x3 x4    y1   y2    y3    y4
## 1  10 10 10  8  8.04 9.14  7.46  6.58
## 2   8  8  8  8  6.95 8.14  6.77  5.76
## 3  13 13 13  8  7.58 8.74 12.74  7.71
## 4   9  9  9  8  8.81 8.77  7.11  8.84
## 5  11 11 11  8  8.33 9.26  7.81  8.47
## 6  14 14 14  8  9.96 8.10  8.84  7.04
## 7   6  6  6  8  7.24 6.13  6.08  5.25
## 8   4  4  4 19  4.26 3.10  5.39 12.50
## 9  12 12 12  8 10.84 9.13  8.15  5.56
## 10  7  7  7  8  4.82 7.26  6.42  7.91
## 11  5  5  5  8  5.68 4.74  5.73  6.89
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Correlation}
\KeywordTok{cor}\NormalTok{(anscombe)[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{5}\OperatorTok{:}\DecValTok{8}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##            y1         y2         y3         y4
## x1  0.8164205  0.8162365  0.8162867 -0.3140467
## x2  0.8164205  0.8162365  0.8162867 -0.3140467
## x3  0.8164205  0.8162365  0.8162867 -0.3140467
## x4 -0.5290927 -0.7184365 -0.3446610  0.8165214
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# gather and select}
\NormalTok{anscombe\_processed \textless{}{-}}\StringTok{ }\NormalTok{anscombe }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(x\_name, x\_value, x1}\OperatorTok{:}\NormalTok{x4) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(y\_name, y\_value, y1}\OperatorTok{:}\NormalTok{y4)}

\CommentTok{\# plot}
\NormalTok{anscombe\_processed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ x\_value, }\DataTypeTok{y =}\NormalTok{ y\_value)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =}\NormalTok{ lm, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(x\_name }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{y\_name) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"X values"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Y values"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Anscombe\textquotesingle{}s quartet"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-164-1.pdf}

\hypertarget{the-grammar-of-graphics}{%
\subsection{The grammar of graphics}\label{the-grammar-of-graphics}}

\begin{itemize}
\item
  the grammar of graphics

  \begin{itemize}
  \tightlist
  \item
    data
  \item
    aesthetic attributes (color, shape, size)
  \item
    geometric objects (points, lines, bars)
  \item
    stats (summary stats)
  \item
    scales (map values in the data space)
  \item
    coord (data coordinates)
  \item
    facet (facetting specifications)
  \end{itemize}
\end{itemize}

Don't worry about the new terms. We're going to learn them by actually plotting.

\begin{itemize}
\item
  Workflow:

  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \tightlist
  \item
    Tidy data
  \item
    Mapping
  \item
    Geom
  \item
    Co-ordinates and scales
  \item
    Labels and guides
  \item
    Themes
  \item
    Save files
  \end{enumerate}
\end{itemize}

\hypertarget{mapping-and-geom}{%
\subsection{mapping and geom}\label{mapping-and-geom}}

\begin{itemize}
\item
  \texttt{aes} (aesthetic mappings or aesthetics) tells which variables (x, y) in your data should be represented by which visual elements (color, shape, size) in the plot.
\item
  \texttt{geom\_} tells the type of plot you are going to use
\end{itemize}

\hypertarget{basic-aes-x-y}{%
\subsection{basic aes (x , y)}\label{basic-aes-x-y}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp)}
\NormalTok{) }\CommentTok{\# ggplot or R in general takes positional arguments too. So, you don\textquotesingle{}t need to name data, mapping each time you use ggplot2.}

\NormalTok{p}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-165-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-165-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_smooth}\NormalTok{() }\CommentTok{\# geom\_smooth has calculated a smoothed line;}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-165-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# the shaded area is the standard error for the line}
\end{Highlighting}
\end{Shaded}

\hypertarget{univariate-distribution}{%
\subsection{Univariate distribution}\label{univariate-distribution}}

\begin{itemize}
\tightlist
\item
  \texttt{geom\_histogram()}: For the probability distribution of a continuous variable. Bins divide the entire range of values into a series of intervals (see \href{https://en.wikipedia.org/wiki/Histogram}{the Wiki entry}).
\item
  \texttt{geom\_density()}: Also for the probability distribution of a continuous variable. It calculates a \href{https://en.wikipedia.org/wiki/Kernel_density_estimation}{kernel density estimate} of the underlying distribution.
\end{itemize}

\hypertarget{histogram}{%
\subsubsection{Histogram}\label{histogram}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{data}\NormalTok{(midwest) }\CommentTok{\# load midwest dataset}

\NormalTok{midwest}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 437 x 28
##      PID county state  area poptotal popdensity popwhite popblack popamerindian
##    <int> <chr>  <chr> <dbl>    <int>      <dbl>    <int>    <int>         <int>
##  1   561 ADAMS  IL    0.052    66090      1271.    63917     1702            98
##  2   562 ALEXA~ IL    0.014    10626       759      7054     3496            19
##  3   563 BOND   IL    0.022    14991       681.    14477      429            35
##  4   564 BOONE  IL    0.017    30806      1812.    29344      127            46
##  5   565 BROWN  IL    0.018     5836       324.     5264      547            14
##  6   566 BUREAU IL    0.05     35688       714.    35157       50            65
##  7   567 CALHO~ IL    0.017     5322       313.     5298        1             8
##  8   568 CARRO~ IL    0.027    16805       622.    16519      111            30
##  9   569 CASS   IL    0.024    13437       560.    13384       16             8
## 10   570 CHAMP~ IL    0.058   173025      2983.   146506    16559           331
## # ... with 427 more rows, and 19 more variables: popasian <int>,
## #   popother <int>, percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
## #   percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
## #   percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
## #   percpovertyknown <dbl>, percbelowpoverty <dbl>, percchildbelowpovert <dbl>,
## #   percadultpoverty <dbl>, percelderlypoverty <dbl>, inmetro <int>,
## #   category <chr>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\CommentTok{\# not working.}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_histogram}\NormalTok{() }\CommentTok{\# stat\_bin argument picks up 30 bins (or "bucket") by default.}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-168-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_histogram}\NormalTok{(}\DataTypeTok{bins =} \DecValTok{10}\NormalTok{) }\CommentTok{\# only 10 bins.}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-168-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =} \KeywordTok{subset}\NormalTok{(midwest, state }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"OH"}\NormalTok{, }\StringTok{"IN"}\NormalTok{)),}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ percollege, }\DataTypeTok{fill =}\NormalTok{ state)}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_histogram}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.7}\NormalTok{, }\DataTypeTok{bins =} \DecValTok{20}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-168-3.pdf}

\hypertarget{density}{%
\subsubsection{Density}\label{density}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area, }\DataTypeTok{fill =}\NormalTok{ state, }\DataTypeTok{color =}\NormalTok{ state)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_density}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-169-1.pdf}

\hypertarget{advanced-aes-size-color}{%
\subsection{Advanced aes (size, color)}\label{advanced-aes-size-color}}

\begin{itemize}
\item
  There's also a fill argument (mostly used in \texttt{geom\_bar()}). The color \texttt{aes} affects the appearance of lines and points, while fill is for the filled areas of bars, polygons, and, in some cases, the interior of a smoother's standard error ribbon.
\item
  The property size/color/fill represents\ldots{}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{size =}\NormalTok{ pop}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-170-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{size =}\NormalTok{ pop,}
    \DataTypeTok{color =}\NormalTok{ continent}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-171-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# try red instead of "red"}
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{size =}\NormalTok{ pop,}
    \DataTypeTok{color =} \StringTok{"red"}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-172-1.pdf}

Aesthetics also can be mapped per Geom.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-173-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# alpha controls transparency}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{color =} \StringTok{"red"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{size =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-173-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# alpha controls transparency}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{color =} \StringTok{"red"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{size =} \DecValTok{2}\NormalTok{, }\DataTypeTok{method =} \StringTok{"lm"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-173-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{color =}\NormalTok{ continent}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-174-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{color =}\NormalTok{ continent,}
    \DataTypeTok{fill =}\NormalTok{ continent}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-174-2.pdf}

\hypertarget{co-ordinates-and-scales}{%
\subsection{Co-ordinates and scales}\label{co-ordinates-and-scales}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\CommentTok{\# coord\_type}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-175-1.pdf}

The data is heavily bunched up against the left side.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\CommentTok{\# without scaling}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-176-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\CommentTok{\# scales the axis of a plot to a log 10 basis}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-176-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"lm"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-176-3.pdf}

\hypertarget{labels-and-guides}{%
\subsection{Labels and guides}\label{labels-and-guides}}

The \texttt{scales} package has some useful premade formatting functions. You can either load \texttt{scales} or just grab the function you need from the library using \texttt{scales::}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-177-1.pdf}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{5}
\tightlist
\item
  Themes
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_economist}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-178-1.pdf}

\hypertarget{ggsave}{%
\subsection{ggsave}\label{ggsave}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{figure\_example \textless{}{-}}\StringTok{ }\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"gam"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_economist}\NormalTok{()}

\KeywordTok{ggsave}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"outputs"}\NormalTok{, }\StringTok{"figure\_example.png"}\NormalTok{), figure\_example)}
\end{Highlighting}
\end{Shaded}

\hypertarget{many-plots}{%
\subsection{Many plots}\label{many-plots}}

Basic ideas:

\begin{itemize}
\tightlist
\item
  Grouping: tell \texttt{ggplot2} about the structure of your data
\item
  Facetting: break up your data into pieces for a plot
\end{itemize}

\hypertarget{grouping-1}{%
\subsubsection{Grouping}\label{grouping-1}}

\begin{itemize}
\tightlist
\item
  Can you guess what's wrong?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(gapminder, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ gdpPercap))}

\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-180-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-180-2.pdf}

\texttt{geom\_line} joins up all the lines for each particular year in the order they appear in the dataset. \texttt{ggplot2} does not know the yearly observations in your data are grouped by country.

Note that you need grouping when the grouping information you want to tell \texttt{ggplot2} about is not built into the variables being mapped (like continent).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,694 more rows
\end{verbatim}

\hypertarget{facetting}{%
\subsubsection{Facetting}\label{facetting}}

Facetting creates small multiples: the same plot repeated for each subset of your data.

\begin{itemize}
\item
  \texttt{facet\_wrap}: based on a single categorical variable like \texttt{facet\_wrap(\textasciitilde{}single\_categorical\_variable)}. Your panels will be laid out in order and then wrapped into a grid.
\item
  \texttt{facet\_grid}: when you want to cross-classify some data by two categorical variables like \texttt{facet\_grid(one\_cat\_variable\ \textasciitilde{}\ two\_cat\_variable)}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(gapminder, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ gdpPercap))}

\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country)) }\CommentTok{\# group by, \# The outlier is Kuwait.}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-182-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country)) }\OperatorTok{+}\StringTok{ }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent) }\CommentTok{\# facetting}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-182-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country), }\DataTypeTok{color =} \StringTok{"gray70"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{size =} \FloatTok{1.1}\NormalTok{, }\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_y\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent, }\DataTypeTok{ncol =} \DecValTok{5}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# for single categorical variable; for multiple categorical variables use facet\_grid()}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"GDP per capita"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"GDP per capita on Five continents"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{axis.text.x =} \KeywordTok{element\_text}\NormalTok{(}\DataTypeTok{angle =} \DecValTok{90}\NormalTok{, }\DataTypeTok{hjust =} \DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-182-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country), }\DataTypeTok{color =} \StringTok{"gray70"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{size =} \FloatTok{1.1}\NormalTok{, }\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_y\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent) }\OperatorTok{+}\StringTok{ }\CommentTok{\# for single categorical variable; for multiple categorical variables use facet\_grid()}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"GDP per capita"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"GDP per capita on Five continents"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{axis.text.x =} \KeywordTok{element\_text}\NormalTok{(}\DataTypeTok{angle =} \DecValTok{90}\NormalTok{, }\DataTypeTok{hjust =} \DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-183-1.pdf}

\hypertarget{transforming}{%
\subsection{Transforming}\label{transforming}}

\begin{itemize}
\tightlist
\item
  Transforming: perform some calculations on or summarize your data before producing the plot
\end{itemize}

\hypertarget{use-pipes-to-summarize-data}{%
\subsubsection{Use pipes to summarize data}\label{use-pipes-to-summarize-data}}

Also, we experiment with bar charts here. By default, \texttt{geom\_bar} \href{https://www.rdocumentation.org/packages/ggplot2/versions/1.0.1/topics/geom_bar}{uses} \texttt{stat\ =\ "count"}, which makes the height of each bar equal to the number of cases in each group. If you have a y column, then you should use the \texttt{stat\ =\ "identity"} argument. Alternatively, you can use \texttt{geom\_col()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder\_formatted \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =}\NormalTok{ gapminder\_formatted, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean, }\DataTypeTok{color =}\NormalTok{ continent)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy on Five continents"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-184-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean, }\DataTypeTok{color =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-184-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# geom point}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{country)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-185-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# geom bar}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_bar}\NormalTok{(}\DataTypeTok{stat =} \StringTok{"identity"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{country)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-185-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# no facet}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean, }\DataTypeTok{fill =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_bar}\NormalTok{(}\DataTypeTok{stat =} \StringTok{"identity"}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# even if you not stack, still the plot looks messy or you can use geom\_col()}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-185-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ country, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_boxplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Country"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-186-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# reorder countries by life expectancy (ascending)}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{reorder}\NormalTok{(country, lifeExp\_mean), }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_boxplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Country"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-187-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# reorder}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{reorder}\NormalTok{(country, }\OperatorTok{{-}}\NormalTok{lifeExp\_mean), }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_boxplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Country"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-187-2.pdf}

\hypertarget{plotting-text}{%
\subsubsection{Plotting text}\label{plotting-text}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Asia"} \OperatorTok{|}\StringTok{ }\NormalTok{continent }\OperatorTok{==}\StringTok{ "Americas"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdp\_mean, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_text}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{label =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-188-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# with label}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Asia"} \OperatorTok{|}\StringTok{ }\NormalTok{continent }\OperatorTok{==}\StringTok{ "Americas"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdp\_mean, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_label}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{label =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-189-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# no overlaps}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Asia"} \OperatorTok{|}\StringTok{ }\NormalTok{continent }\OperatorTok{==}\StringTok{ "Americas"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdp\_mean, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_text\_repel}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{label =}\NormalTok{ country)) }\OperatorTok{+}\StringTok{ }\CommentTok{\# there\textquotesingle{}s also geom\_label\_repel}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-190-1.pdf}

\hypertarget{ploting-models}{%
\subsection{Plotting models}\label{ploting-models}}

In plotting models, we extensively use David Robinson's \href{https://cran.r-project.org/web/packages/broom/vignettes/broom.html}{broom package} in R. The idea is to transform model outputs (i.e., predictions and estimations) into tidy objects so that we can easily combine, separate, and visualize these elements.

\hypertarget{plotting-several-fits-at-the-same-time}{%
\subsubsection{Plotting several fits at the same time}\label{plotting-several-fits-at-the-same-time}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model\_colors \textless{}{-}}\StringTok{ }\NormalTok{RColorBrewer}\OperatorTok{::}\KeywordTok{brewer.pal}\NormalTok{(}\DecValTok{3}\NormalTok{, }\StringTok{"Set1"}\NormalTok{) }\CommentTok{\# select three qualitatively different colors from a larger palette.}

\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{log}\NormalTok{(gdpPercap), }\DataTypeTok{y =}\NormalTok{ lifeExp)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"lm"}\NormalTok{, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{color =} \StringTok{"OLS"}\NormalTok{, }\DataTypeTok{fill =} \StringTok{"OLS"}\NormalTok{)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}
    \DataTypeTok{method =} \StringTok{"lm"}\NormalTok{, }\DataTypeTok{formula =}\NormalTok{ y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{splines}\OperatorTok{::}\KeywordTok{bs}\NormalTok{(x, }\DataTypeTok{df =} \DecValTok{3}\NormalTok{),}
    \KeywordTok{aes}\NormalTok{(}\DataTypeTok{color =} \StringTok{"Cubic Spline"}\NormalTok{, }\DataTypeTok{fill =} \StringTok{"Cubic Spline"}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{color =} \StringTok{"LOESS"}\NormalTok{, }\DataTypeTok{fill =} \StringTok{"LOESS"}\NormalTok{)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{legend.position =} \StringTok{"top"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_manual}\NormalTok{(}\DataTypeTok{name =} \StringTok{"Models"}\NormalTok{, }\DataTypeTok{values =}\NormalTok{ model\_colors) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_manual}\NormalTok{(}\DataTypeTok{name =} \StringTok{"Models"}\NormalTok{, }\DataTypeTok{values =}\NormalTok{ model\_colors)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-191-1.pdf}

\hypertarget{extracting-model-outcomes}{%
\subsubsection{Extracting model outcomes}\label{extracting-model-outcomes}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# regression model}
\NormalTok{out \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(}
  \DataTypeTok{formula =}\NormalTok{ lifeExp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gdpPercap }\OperatorTok{+}\StringTok{ }\NormalTok{pop }\OperatorTok{+}\StringTok{ }\NormalTok{continent,}
  \DataTypeTok{data =}\NormalTok{ gapminder}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\texttt{tidy()} is a method in the \texttt{broom} package. It ``constructs a dataframe that summarizes the model's statistical findings''. As the description states, \texttt{tidy()} is a function that can be used for various models. For instance, \texttt{tidy()} can extract the following information from a regression model.

\begin{itemize}
\tightlist
\item
  \texttt{term}: a term being estimated
\item
  \texttt{p.value}
\item
  \texttt{statistic}: a test statistic used to compute p-value
\item
  \texttt{estimate}
\item
  \texttt{conf.low}: the low end of a confidence interval
\item
  \texttt{conf.high}: the high end of a confidence interval
\item
  \texttt{df}: degrees of freedom
\end{itemize}

\textbf{Challenge}

Try \texttt{glance(out)}. What did you get from this command? If you're curious, you can try \texttt{?glance}.

The following plots show your degree of confidence in the estimates.

\hypertarget{coeffficients}{%
\paragraph{Coefficients}\label{coeffficients}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# estimates}
\NormalTok{out\_comp \textless{}{-}}\StringTok{ }\KeywordTok{tidy}\NormalTok{(out)}

\NormalTok{p \textless{}{-}}\StringTok{ }\NormalTok{out\_comp }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ term, }\DataTypeTok{y =}\NormalTok{ estimate))}

\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-193-1.pdf}

\hypertarget{confidence-intervals}{%
\paragraph{Confidence intervals}\label{confidence-intervals}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# plus confidence intervals}
\NormalTok{out\_conf \textless{}{-}}\StringTok{ }\KeywordTok{tidy}\NormalTok{(out, }\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{)}

\CommentTok{\# plotting coefficients using ggplot2 (pointrange)}
\NormalTok{out\_conf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{reorder}\NormalTok{(term, estimate), }\DataTypeTok{y =}\NormalTok{ estimate, }\DataTypeTok{ymin =}\NormalTok{ conf.low, }\DataTypeTok{ymax =}\NormalTok{ conf.high)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_pointrange}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{""}\NormalTok{, }\DataTypeTok{y =} \StringTok{"OLS Estimate"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-194-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# another way to do it (errorbar)}
\NormalTok{out\_conf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ estimate, }\DataTypeTok{y =} \KeywordTok{reorder}\NormalTok{(term, estimate))) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_errorbarh}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{xmin =}\NormalTok{ conf.low, }\DataTypeTok{xmax =}\NormalTok{ conf.high)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{""}\NormalTok{, }\DataTypeTok{x =} \StringTok{"OLS Estimate"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-194-2.pdf}

You can calculate marginal effects using the \texttt{margins} package. For the sake of time, I'm not covering that here.

\hypertarget{functional_programming}{%
\chapter{Automating repeated things}\label{functional_programming}}

\begin{quote}
Anything that can be automated should be automated. Do as little as possible by hand. Do as much as possible with functions.

--- Hadley Wickham
\end{quote}

\hypertarget{flow-control}{%
\section{Flow control}\label{flow-control}}

\begin{itemize}
\item
  Control structures = putting logic in code to control flow (e.g., \texttt{if}, \texttt{else}, \texttt{for}, \texttt{while}, \texttt{repeat}, \texttt{break}, \texttt{next})
\item
  Almost all the conditional operators used in Python also work in R. The basic loop setup is also very similar, with some small syntax adjustments.
\item
  \texttt{if()} is a function whose arguments must be specified inside parentheses.
\item
  \texttt{else}, however, is a reserved operator that takes no arguments. Note that there is no \texttt{elif} option --- one simply writes \texttt{else\ if()}.
\item
  Whereas operations to be executed after conditional evaluations in Python come after a \texttt{:}, R operations must only be enclosed in curly brackets: \texttt{\{\}}. Furthermore, there is no requirement for indentation.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{5}

\ControlFlowTok{if}\NormalTok{(x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{)\{ }\CommentTok{\# Condition }
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{) }\CommentTok{\# Do something }
\NormalTok{\} }

\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{{-}5}

\ControlFlowTok{if}\NormalTok{(x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{)\{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "x is negative"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{5}

\ControlFlowTok{if}\NormalTok{(x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{)\{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{)}
\NormalTok{\} }\ControlFlowTok{else}\NormalTok{\{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is positive"}\NormalTok{)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "x is positive"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{0}

\ControlFlowTok{if}\NormalTok{(x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{)\{ }\CommentTok{\# Condition }
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{) }\CommentTok{\# Do something }
\NormalTok{\} }\ControlFlowTok{else} \ControlFlowTok{if}\NormalTok{(x }\OperatorTok{==}\StringTok{ }\DecValTok{0}\NormalTok{)\{ }
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is zero"}\NormalTok{) }\CommentTok{\# Do something else }
\NormalTok{\} }\ControlFlowTok{else}\NormalTok{ \{ }\KeywordTok{print}\NormalTok{(}\StringTok{"x is positive"}\NormalTok{) }\CommentTok{\# Do something else }
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "x is zero"
\end{verbatim}

R also does some class coercion that makes Boolean evaluations harder to break than in Python. But be careful --- R has a set of special coercions used for fast logical evaluation and subsetting. Specifically, \texttt{TRUE} is considered equal to \texttt{1}, while \texttt{FALSE} is equal to \texttt{0}. The Boolean logicals can also be specified as a full word in all caps, or simply as \texttt{T} or \texttt{F}.

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{1} \OperatorTok{\textless{}}\StringTok{ }\DecValTok{2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\StringTok{"1"} \OperatorTok{\textless{}}\StringTok{ }\DecValTok{2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\StringTok{"a"} \OperatorTok{\textless{}}\StringTok{ }\DecValTok{2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{TRUE} \OperatorTok{\textless{}}\StringTok{ }\DecValTok{2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{TRUE} \OperatorTok{==}\StringTok{ "TRUE"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{T }\OperatorTok{==}\StringTok{ "TRUE"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{TRUE} \OperatorTok{==}\StringTok{ "T"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{TRUE} \OperatorTok{==}\StringTok{ "FALSE"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{TRUE} \OperatorTok{==}\StringTok{ }\DecValTok{0}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{TRUE} \OperatorTok{==}\StringTok{ }\DecValTok{1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{FALSE} \OperatorTok{==}\StringTok{ }\DecValTok{0}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OtherTok{FALSE} \OperatorTok{\textless{}=}\StringTok{ }\DecValTok{1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\hypertarget{functions}{%
\subsection{Functions}\label{functions}}

While functions are defined in Python using the \texttt{def} reserved operator, R sees functions as just another type of named object. Thus, they require explicit assignment to an object. This is done using the function \texttt{function()}, which creates a function taking the arguments specified in parentheses.

function = input + computation (begin -\textgreater{} end) + output

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{simple.function \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
  \KeywordTok{print}\NormalTok{(x }\OperatorTok{+}\StringTok{ }\DecValTok{1}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{less.simple.function \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x, y)\{}
  \KeywordTok{print}\NormalTok{(x }\OperatorTok{{-}}\StringTok{ }\NormalTok{y }\OperatorTok{+}\StringTok{ }\DecValTok{1}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{less.simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] -7
\end{verbatim}

With respect to returning function output, most of the same rules apply as with Python. Be sure to remember that \texttt{return()} will only process a single object, so multiple items must usually be returned as a list. Note that the order of the statements inside a function matters, too: nothing placed after \texttt{return()} is executed.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dumbfun \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
  \KeywordTok{return}\NormalTok{(x)}
  \KeywordTok{print}\NormalTok{(}\StringTok{"This will never print :("}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \StringTok{"something"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "something"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dumbfun \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"Why did I print?"}\NormalTok{)}
  \KeywordTok{return}\NormalTok{(x)}
\NormalTok{\}}

\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \StringTok{"something"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Why did I print?"
\end{verbatim}

\begin{verbatim}
## [1] "something"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dumbfun \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x,y)\{}
\NormalTok{  thing1 \textless{}{-}}\StringTok{ }\NormalTok{x}
\NormalTok{  thing2 \textless{}{-}}\StringTok{ }\NormalTok{y}
  \KeywordTok{return}\NormalTok{(}\KeywordTok{list}\NormalTok{(thing1, thing2))}
\NormalTok{\}}

\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \StringTok{"some text"}\NormalTok{, }\DataTypeTok{y =} \StringTok{"some data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "some text"
## 
## [[2]]
## [1] "some data"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{c}\NormalTok{(}\DecValTok{5}\NormalTok{,}\DecValTok{10}\NormalTok{,}\DecValTok{15}\NormalTok{), }\DataTypeTok{y =} \StringTok{"some data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1]  5 10 15
## 
## [[2]]
## [1] "some data"
\end{verbatim}

R functions also allow you to set default argument values:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{less.simple.function \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x, }\DataTypeTok{y =} \DecValTok{0}\NormalTok{)\{}
  \KeywordTok{print}\NormalTok{(x }\OperatorTok{{-}}\StringTok{ }\NormalTok{y }\OperatorTok{+}\StringTok{ }\DecValTok{1}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{less.simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{less.simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] -7
\end{verbatim}

With respect to specifying arguments, one can either use argument \textbf{position} specifications (i.e., the order) or argument \textbf{name} specifications. The latter is strongly preferred, as it is very easy to accidentally specify incorrect argument values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{send \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(message, recipient, }\DataTypeTok{cc=}\OtherTok{NULL}\NormalTok{, }\DataTypeTok{bcc=}\OtherTok{NULL}\NormalTok{)\{}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(message, recipient, }\DataTypeTok{sep =} \StringTok{", "}\NormalTok{))}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"CC:"}\NormalTok{, cc, }\DataTypeTok{sep =} \StringTok{" "}\NormalTok{))}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"BCC:"}\NormalTok{, bcc, }\DataTypeTok{sep =} \StringTok{" "}\NormalTok{))}
\NormalTok{\}}

\KeywordTok{send}\NormalTok{(}\DataTypeTok{message =} \StringTok{"Hello"}\NormalTok{, }\DataTypeTok{recipient =} \StringTok{"World"}\NormalTok{, }\DataTypeTok{cc =} \StringTok{"Sun"}\NormalTok{, }\DataTypeTok{bcc =} \StringTok{"Jane"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World"
## [1] "CC: Sun"
## [1] "BCC: Jane"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{send}\NormalTok{(}\StringTok{"Hello"}\NormalTok{, }\StringTok{"World"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World"
## [1] "CC: Sun"
## [1] "BCC: Jane"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{send}\NormalTok{(}\StringTok{"Hello"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{, }\StringTok{"World"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, Sun"
## [1] "CC: Jane"
## [1] "BCC: World"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{send}\NormalTok{(}\DataTypeTok{message =} \StringTok{"Hello"}\NormalTok{, }\DataTypeTok{cc =} \StringTok{"Sun"}\NormalTok{, }\DataTypeTok{bcc =} \KeywordTok{c}\NormalTok{(}\StringTok{"Jane"}\NormalTok{, }\StringTok{"Rochelle"}\NormalTok{), }\DataTypeTok{recipient =} \StringTok{"World"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World"
## [1] "CC: Sun"
## [1] "BCC: Jane"     "BCC: Rochelle"
\end{verbatim}

Also, note that functions don't have what CS people call side effects. Functions only define local variables; they don't change objects stored in the global environment. (Consider the difference between \texttt{\textless{}-} and \texttt{=} for assignments.) That's why you can use functions for reusable tasks: they do not interfere with other important things in your system.

See \href{https://darrenjw.wordpress.com/2011/11/23/lexical-scope-and-function-closures-in-r/}{the following example} from Wilkinson.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a =}\StringTok{ }\DecValTok{1} 
\NormalTok{b =}\StringTok{ }\DecValTok{2}

\NormalTok{f \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)}
\NormalTok{\{}
\NormalTok{  a}\OperatorTok{*}\NormalTok{x }\OperatorTok{+}\StringTok{ }\NormalTok{b}
\NormalTok{\}}

\KeywordTok{f}\NormalTok{(}\DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{g \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)}
\NormalTok{\{}
\NormalTok{  a =}\StringTok{ }\DecValTok{2}
\NormalTok{  b =}\StringTok{ }\DecValTok{1}
  \KeywordTok{f}\NormalTok{(x)}
\NormalTok{\}}

\KeywordTok{g}\NormalTok{(}\DecValTok{2}\NormalTok{) }\CommentTok{\# a equals still 1 }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4
\end{verbatim}

\hypertarget{for-loop}{%
\subsection{for loop}\label{for-loop}}

Loops in R also work basically the same way as in Python, with just a few adjustments. First, recall that index positions in R start at 1. Second, \texttt{while()} and \texttt{for()} are functions rather than reserved operators, meaning they must take arguments in parentheses. Third, just like \texttt{else}, the \texttt{in} operator \emph{is} reserved and takes no arguments in parentheses. Fourth, the conditional execution must appear between curly brackets. Finally, indentation is meaningless, but each new operation must appear on a new line.

\begin{itemize}
\tightlist
\item
  \texttt{while()}: when we have no idea how many times the loop needs to be executed.
\item
  \texttt{for()}: when we know how many times the loop needs to be executed. This is likely to be the loop you are going to use most frequently.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fruits \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"apples"}\NormalTok{, }\StringTok{"oranges"}\NormalTok{, }\StringTok{"pears"}\NormalTok{, }\StringTok{"bananas"}\NormalTok{)}

\CommentTok{\# a while loop}
\NormalTok{i \textless{}{-}}\StringTok{ }\DecValTok{1}
\ControlFlowTok{while}\NormalTok{(i }\OperatorTok{\textless{}=}\StringTok{ }\KeywordTok{length}\NormalTok{(fruits))\{}
  \KeywordTok{print}\NormalTok{(fruits[i])}
\NormalTok{  i \textless{}{-}}\StringTok{ }\NormalTok{i }\OperatorTok{+}\StringTok{ }\DecValTok{1}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "apples"
## [1] "oranges"
## [1] "pears"
## [1] "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# a for loop}
\ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(fruits))\{}
  \KeywordTok{print}\NormalTok{(fruits[i])}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "apples"
## [1] "oranges"
## [1] "pears"
## [1] "bananas"
\end{verbatim}

\hypertarget{apply-family}{%
\subsection{apply family}\label{apply-family}}

While and for loops in R can be very slow. For this reason, R has a number of built-in iteration methods to speed up execution times. In many cases, packages will have ``behind-the-scenes'' ways to avoid for loops, but what if you need to write your own function?

A common method of getting around for loops is the \textbf{apply} family of functions. These take a data structure and a function, and apply the function to every element of the object.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fruit \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"apple"}\NormalTok{, }\StringTok{"orange"}\NormalTok{, }\StringTok{"pear"}\NormalTok{, }\StringTok{"banana"}\NormalTok{)}

\CommentTok{\# make function that takes in only one element}
\NormalTok{make.plural \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
\NormalTok{   plural \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(x, }\StringTok{\textquotesingle{}s\textquotesingle{}}\NormalTok{, }\DataTypeTok{sep =} \StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{) }\CommentTok{\# sep is for collapse, so collapse \textquotesingle{}\textquotesingle{}}
   \KeywordTok{return}\NormalTok{(plural)}
\NormalTok{\}}

\KeywordTok{make.plural}\NormalTok{(}\StringTok{\textquotesingle{}apple\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "apples"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{apply()}: loops over the margins (1 = row, 2 = column) of an array
\item
  \texttt{lapply()}: loops over a list and returns a list
\item
  \texttt{sapply()}: loops over a list and simplifies the result to a (possibly named) vector or matrix when possible
\item
  \texttt{tapply()}: loops over subsets of a vector
\item
  \texttt{mapply()}: multivariate version of \texttt{sapply()}. Use this if you have a function that takes in 2 or more arguments.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# apply that function to every element}
\KeywordTok{lapply}\NormalTok{(fruit, make.plural) }\CommentTok{\# returns a list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "apples"
## 
## [[2]]
## [1] "oranges"
## 
## [[3]]
## [1] "pears"
## 
## [[4]]
## [1] "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{sapply}\NormalTok{(fruit, make.plural) }\CommentTok{\# returns a named vector}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##     apple    orange      pear    banana 
##  "apples" "oranges"   "pears" "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{library}\NormalTok{(purrr) }\CommentTok{\# load package}
\KeywordTok{map}\NormalTok{(fruit, make.plural) }\CommentTok{\# type consistent}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "apples"
## 
## [[2]]
## [1] "oranges"
## 
## [[3]]
## [1] "pears"
## 
## [[4]]
## [1] "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Why sapply is bad }

\KeywordTok{sapply}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{100}\NormalTok{, paste) }\CommentTok{\# return character }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
##  [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
##  [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
##  [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
##  [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
##  [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
##  [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
##  [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
##  [97] "97"  "98"  "99"  "100"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{sapply}\NormalTok{(}\KeywordTok{integer}\NormalTok{(), paste) }\CommentTok{\# return list!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## list()
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{library}\NormalTok{(purrr)}
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{100}\NormalTok{, paste) }\CommentTok{\# return list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "1"
## 
## [[2]]
## [1] "2"
## 
## [[3]]
## [1] "3"
## 
## [[4]]
## [1] "4"
## 
## [[5]]
## [1] "5"
## 
## [[6]]
## [1] "6"
## 
## [[7]]
## [1] "7"
## 
## [[8]]
## [1] "8"
## 
## [[9]]
## [1] "9"
## 
## [[10]]
## [1] "10"
## 
## [[11]]
## [1] "11"
## 
## [[12]]
## [1] "12"
## 
## [[13]]
## [1] "13"
## 
## [[14]]
## [1] "14"
## 
## [[15]]
## [1] "15"
## 
## [[16]]
## [1] "16"
## 
## [[17]]
## [1] "17"
## 
## [[18]]
## [1] "18"
## 
## [[19]]
## [1] "19"
## 
## [[20]]
## [1] "20"
## 
## [[21]]
## [1] "21"
## 
## [[22]]
## [1] "22"
## 
## [[23]]
## [1] "23"
## 
## [[24]]
## [1] "24"
## 
## [[25]]
## [1] "25"
## 
## [[26]]
## [1] "26"
## 
## [[27]]
## [1] "27"
## 
## [[28]]
## [1] "28"
## 
## [[29]]
## [1] "29"
## 
## [[30]]
## [1] "30"
## 
## [[31]]
## [1] "31"
## 
## [[32]]
## [1] "32"
## 
## [[33]]
## [1] "33"
## 
## [[34]]
## [1] "34"
## 
## [[35]]
## [1] "35"
## 
## [[36]]
## [1] "36"
## 
## [[37]]
## [1] "37"
## 
## [[38]]
## [1] "38"
## 
## [[39]]
## [1] "39"
## 
## [[40]]
## [1] "40"
## 
## [[41]]
## [1] "41"
## 
## [[42]]
## [1] "42"
## 
## [[43]]
## [1] "43"
## 
## [[44]]
## [1] "44"
## 
## [[45]]
## [1] "45"
## 
## [[46]]
## [1] "46"
## 
## [[47]]
## [1] "47"
## 
## [[48]]
## [1] "48"
## 
## [[49]]
## [1] "49"
## 
## [[50]]
## [1] "50"
## 
## [[51]]
## [1] "51"
## 
## [[52]]
## [1] "52"
## 
## [[53]]
## [1] "53"
## 
## [[54]]
## [1] "54"
## 
## [[55]]
## [1] "55"
## 
## [[56]]
## [1] "56"
## 
## [[57]]
## [1] "57"
## 
## [[58]]
## [1] "58"
## 
## [[59]]
## [1] "59"
## 
## [[60]]
## [1] "60"
## 
## [[61]]
## [1] "61"
## 
## [[62]]
## [1] "62"
## 
## [[63]]
## [1] "63"
## 
## [[64]]
## [1] "64"
## 
## [[65]]
## [1] "65"
## 
## [[66]]
## [1] "66"
## 
## [[67]]
## [1] "67"
## 
## [[68]]
## [1] "68"
## 
## [[69]]
## [1] "69"
## 
## [[70]]
## [1] "70"
## 
## [[71]]
## [1] "71"
## 
## [[72]]
## [1] "72"
## 
## [[73]]
## [1] "73"
## 
## [[74]]
## [1] "74"
## 
## [[75]]
## [1] "75"
## 
## [[76]]
## [1] "76"
## 
## [[77]]
## [1] "77"
## 
## [[78]]
## [1] "78"
## 
## [[79]]
## [1] "79"
## 
## [[80]]
## [1] "80"
## 
## [[81]]
## [1] "81"
## 
## [[82]]
## [1] "82"
## 
## [[83]]
## [1] "83"
## 
## [[84]]
## [1] "84"
## 
## [[85]]
## [1] "85"
## 
## [[86]]
## [1] "86"
## 
## [[87]]
## [1] "87"
## 
## [[88]]
## [1] "88"
## 
## [[89]]
## [1] "89"
## 
## [[90]]
## [1] "90"
## 
## [[91]]
## [1] "91"
## 
## [[92]]
## [1] "92"
## 
## [[93]]
## [1] "93"
## 
## [[94]]
## [1] "94"
## 
## [[95]]
## [1] "95"
## 
## [[96]]
## [1] "96"
## 
## [[97]]
## [1] "97"
## 
## [[98]]
## [1] "98"
## 
## [[99]]
## [1] "99"
## 
## [[100]]
## [1] "100"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(}\KeywordTok{integer}\NormalTok{(), paste) }\CommentTok{\# return list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## list()
\end{verbatim}

\hypertarget{purrr}{%
\section{purrr}\label{purrr}}

\begin{itemize}
\tightlist
\item
  Setup
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install packages}
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) \{}
  \KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  tidyverse, }\CommentTok{\# tidyverse pkgs including purrr}
\NormalTok{  tictoc, }\CommentTok{\# performance test}
\NormalTok{  broom, }\CommentTok{\# tidy modeling}
\NormalTok{  glue, }\CommentTok{\# paste string and objects}
\NormalTok{  furrr, }\CommentTok{\# parallel processing}
\NormalTok{  rvest, }\CommentTok{\# web scraping}
\NormalTok{  devtools, }\CommentTok{\# dev tools }
\NormalTok{  usethis, }\CommentTok{\# workflow     }
\NormalTok{  roxygen2, }\CommentTok{\# documentation }
            
\NormalTok{  testthat) }\CommentTok{\# testing }
\end{Highlighting}
\end{Shaded}

\hypertarget{why-map}{%
\subsection{Why map?}\label{why-map}}

\hypertarget{objectives}{%
\subsubsection{Objectives}\label{objectives}}

\begin{itemize}
\tightlist
\item
  How to use \texttt{purrr} to automate workflow in a cleaner, faster, and more extendable way
\end{itemize}

\hypertarget{copy-and-paste-programming}{%
\subsubsection{Copy-and-paste programming}\label{copy-and-paste-programming}}

\begin{quote}
Copy-and-paste programming, sometimes referred to as just pasting, is the production of highly repetitive computer programming code, as produced by copy and paste operations. It is primarily a pejorative term; those who use the term are often implying a lack of programming competence. It may also be the result of technology limitations (e.g., an insufficiently expressive development environment) as subroutines or libraries would normally be used instead. However, there are occasions when copy-and-paste programming is considered acceptable or necessary, such as for boilerplate, loop unrolling (when not supported automatically by the compiler), or certain programming idioms, and it is supported by some source code editors in the form of snippets. - Wikipedia
\end{quote}

\begin{itemize}
\item
  The following exercise was inspired by \href{http://adv-r.had.co.nz/Functional-programming.html}{Wickham's example}.
\item
  Let's imagine \texttt{df} is a survey dataset.

  \begin{itemize}
  \item
    \texttt{a,\ b,\ c,\ d} = Survey questions
  \item
    \texttt{-99}: non-responses
  \item
    Your goal: replace \texttt{-99} with \texttt{NA}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Data}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility}

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \StringTok{"a"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \StringTok{"b"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \StringTok{"c"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \StringTok{"d"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Copy and paste}
\NormalTok{df}\OperatorTok{$}\NormalTok{a[df}\OperatorTok{$}\NormalTok{a }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{b[df}\OperatorTok{$}\NormalTok{b }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{c[df}\OperatorTok{$}\NormalTok{c }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{d[df}\OperatorTok{$}\NormalTok{d }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge}. Explain why this solution is not very efficient. (Hint: If \texttt{df\$a{[}df\$a\ ==\ -99{]}\ \textless{}-\ NA} has an error, how are you going to fix it? A solution is not scalable if it's not automatable.)
\end{itemize}

\hypertarget{using-a-function}{%
\subsubsection{Using a function}\label{using-a-function}}

\begin{itemize}
\item
  Let's recall what a function is in R: \texttt{input\ +\ computation\ +\ output}
\item
  If you write a function, you gain efficiency because you don't need to copy and paste the computation part.
\end{itemize}

\begin{verbatim}
function(input){

  computation

  return(output)
}
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Function}

\NormalTok{fix\_missing \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x) \{}
\NormalTok{  x[x }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{  x}
\NormalTok{\}}

\CommentTok{\# Apply function to each column (vector)}

\NormalTok{df}\OperatorTok{$}\NormalTok{a \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{a)}
\NormalTok{df}\OperatorTok{$}\NormalTok{b \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{b)}
\NormalTok{df}\OperatorTok{$}\NormalTok{c \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{c)}
\NormalTok{df}\OperatorTok{$}\NormalTok{d \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{d)}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\begin{itemize}
\item
  \textbf{Challenge} Why is using a function more efficient than 100\% copying and pasting? Can you think of a way to automate the process?
\item
  Many options for automation in R: \texttt{for\ loop}, \texttt{apply} family, etc.
\item
  Here's a tidy solution that comes from the \texttt{purrr} package.
\item
  The power and joy of a one-liner.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map\_df}\NormalTok{(df, fix\_missing)}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\texttt{map()} is a \href{https://en.wikipedia.org/wiki/Map_(higher-order_function)}{higher-order function} that applies a given function to each element of a list/vector.

\begin{figure}
\centering
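% \includegraphics cannot load a remote URL; link to the diagram instead (swap in a local copy of the image to restore the picture).
\url{https://d33wubrfki0l68.cloudfront.net/f0494d020aa517ae7b1011cea4c4a9f21702df8b/2577b/diagrams/functionals/map.png}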
\caption{This is how map() works. It's easier to understand with a picture.}
\end{figure}

\begin{itemize}
\tightlist
\item
  Input: Takes a vector/list.
\item
  Computation: Calls the function once for each element of the vector.
\item
  Output: Returns a list or whatever data format you prefer (e.g., the \texttt{\_df} helper returns a data frame).
\end{itemize}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge} If you run the code below, what's going to be the data type of the output?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(df, fix\_missing)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1]  3  3  1  1 NA
## 
## $b
## [1]  3  2 NA NA  1
## 
## $c
## [1] 3 3 1 2 1
## 
## $d
## [1] 1 1 2 1 3
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Why \texttt{map()} is a good alternative to \texttt{for\ loop}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Built{-}in data}
\KeywordTok{data}\NormalTok{(}\StringTok{"airquality"}\NormalTok{)}

\KeywordTok{tic}\NormalTok{()}

\CommentTok{\# Placeholder}
\NormalTok{out1 \textless{}{-}}\StringTok{ }\KeywordTok{vector}\NormalTok{(}\StringTok{"double"}\NormalTok{, }\KeywordTok{ncol}\NormalTok{(airquality))}
\CommentTok{\# Sequence variable}
\ControlFlowTok{for}\NormalTok{ (i }\ControlFlowTok{in} \KeywordTok{seq\_along}\NormalTok{(airquality)) \{ }\CommentTok{\#}

  \CommentTok{\# Assign a computation result to each element}
\NormalTok{  out1[[i]] \textless{}{-}}\StringTok{ }\KeywordTok{mean}\NormalTok{(airquality[[i]], }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{\}}
\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 0.006 sec elapsed
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tic}\NormalTok{()}
\NormalTok{out1 \textless{}{-}}\StringTok{ }\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{map\_dbl}\NormalTok{(mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 0.002 sec elapsed
\end{verbatim}

\begin{itemize}
\item
  In short, \texttt{map()} is more readable, faster, and easily extendable with other data science tasks (e.g., wrangling, modeling, and visualization) using \texttt{\%\textgreater{}\%}.
\item
  Final point: Why not base R \texttt{apply} family?
\item
  Short answer: \texttt{purrr::map()} is simpler to write. For instance,
\end{itemize}

\texttt{map\_dbl(x,\ mean,\ na.rm\ =\ TRUE)} = \texttt{vapply(x,\ mean,\ na.rm\ =\ TRUE,\ FUN.VALUE\ =\ double(1))}

\hypertarget{application-many-models}{%
\subsubsection{Application (many models)}\label{application-many-models}}

\begin{itemize}
\tightlist
\item
  One popular application of \texttt{map()} is to run regression models (or whatever model you want to run) on list-columns. No more copying and pasting for running many regression models on subgroups!
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Have you ever tried this?}
\NormalTok{lm\_A \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_A"}\NormalTok{))}
\NormalTok{lm\_B \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_B"}\NormalTok{))}
\NormalTok{lm\_C \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_C"}\NormalTok{))}
\NormalTok{lm\_D \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_D"}\NormalTok{))}
\NormalTok{lm\_E \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_E"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  For more information on this technique, read the Many Models subchapter of the \href{https://r4ds.had.co.nz/many-models.html\#creating-list-columns}{R for Data Science}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Function}
\NormalTok{lm\_model \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(df) \{}
  \KeywordTok{lm}\NormalTok{(Temp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{Ozone, }\DataTypeTok{data =}\NormalTok{ df)}
\NormalTok{\}}

\CommentTok{\# Map}
\NormalTok{models \textless{}{-}}\StringTok{ }\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(Month) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nest}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Create list{-}columns}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{ols =} \KeywordTok{map}\NormalTok{(data, lm\_model)) }\CommentTok{\# Map}
\NormalTok{models}\OperatorTok{$}\NormalTok{ols[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## 
## Call:
## lm(formula = Temp ~ Ozone, data = df)
## 
## Coefficients:
## (Intercept)        Ozone  
##     62.8842       0.1629
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add tidying}
\NormalTok{tidy\_lm\_model \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{compose}\NormalTok{( }\CommentTok{\# compose multiple functions}
\NormalTok{  broom}\OperatorTok{::}\NormalTok{tidy, }\CommentTok{\# convert lm objects into tidy tibbles}
\NormalTok{  lm\_model}
\NormalTok{)}

\NormalTok{tidied\_models \textless{}{-}}\StringTok{ }\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(Month) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nest}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Create list{-}columns}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{ols =} \KeywordTok{map}\NormalTok{(data, tidy\_lm\_model))}

\NormalTok{tidied\_models}\OperatorTok{$}\NormalTok{ols[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   62.9      1.61       39.2  2.88e-23
## 2 Ozone          0.163    0.0500      3.26 3.31e- 3
\end{verbatim}

\hypertarget{automote-2-or-2-tasks}{%
\section{Automate 2 or 2+ tasks}\label{automote-2-or-2-tasks}}

\hypertarget{objectives-1}{%
\subsection{Objectives}\label{objectives-1}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{map2()} and \texttt{pmap()} to avoid writing nested loops.
\end{itemize}

\hypertarget{problem}{%
\subsection{Problem}\label{problem}}

\begin{itemize}
\tightlist
\item
  Problem: How can you create something like below?
\end{itemize}

{[}1{]} ``University = Berkeley \textbar{} Department = waterbenders''

{[}1{]} ``University = Berkeley \textbar{} Department = earthbenders''

{[}1{]} ``University = Berkeley \textbar{} Department = firebenders''

{[}1{]} ``University = Berkeley \textbar{} Department = airbenders''

{[}1{]} ``University = Stanford \textbar{} Department = waterbenders''

{[}1{]} ``University = Stanford \textbar{} Department = earthbenders''

{[}1{]} ``University = Stanford \textbar{} Department = firebenders''

{[}1{]} ``University = Stanford \textbar{} Department = airbenders''

\begin{itemize}
\tightlist
\item
  The most manual way: You can copy and paste eight times.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{paste}\NormalTok{(}\StringTok{"University = Berkeley | Department = CS"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University = Berkeley | Department = CS"
\end{verbatim}

\hypertarget{for-loop-1}{%
\subsection{For loop}\label{for-loop-1}}

\begin{itemize}
\item
  A slightly more efficient way: using a for loop.
\item
  Think about which part of the statement is constant and which part varies ( = parameters).
\item
  Do we need a placeholder? No. We don't need a placeholder because we don't store the results of the iterations.
\item
  \textbf{Challenge}: How many parameters do you need to solve the problem below?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Outer loop}

\ControlFlowTok{for}\NormalTok{ (univ }\ControlFlowTok{in} \KeywordTok{c}\NormalTok{(}\StringTok{"Berkeley"}\NormalTok{, }\StringTok{"Stanford"}\NormalTok{)) \{}

  \CommentTok{\# Inner loop}

  \ControlFlowTok{for}\NormalTok{ (dept }\ControlFlowTok{in} \KeywordTok{c}\NormalTok{(}\StringTok{"waterbenders"}\NormalTok{, }\StringTok{"earthbenders"}\NormalTok{, }\StringTok{"firebenders"}\NormalTok{, }\StringTok{"airbenders"}\NormalTok{)) \{}
    \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"University = "}\NormalTok{, univ, }\StringTok{"|"}\NormalTok{, }\StringTok{"Department = "}\NormalTok{, dept))}
\NormalTok{  \}}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Berkeley | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Berkeley | Department =  airbenders"
## [1] "University =  Stanford | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Stanford | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  This is not bad, but \ldots{} \texttt{n} arguments -\textgreater{} \texttt{n-nested\ for\ loops}. As the scale of your problem grows, your code gets really complicated.
\end{itemize}

\begin{quote}
To become significantly more reliable, code must become more transparent. In particular, nested conditions and loops must be viewed with great suspicion. Complicated control flows confuse programmers. Messy code often hides bugs. --- \href{https://en.wikipedia.org/wiki/Bjarne_Stroustrup}{Bjarne Stroustrup}
\end{quote}

\hypertarget{map2-pmap}{%
\subsection{map2 \& pmap}\label{map2-pmap}}

\begin{itemize}
\item
  Step 1: Define inputs and a function.
\item
  \textbf{Challenge} Why are we using \texttt{rep()} to create input vectors? For instance, for \texttt{univ\_list} why not just use \texttt{c("Berkeley",\ "Stanford")}?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Inputs (remember the length of these inputs should be identical)}

\NormalTok{univ\_list \textless{}{-}}\StringTok{ }\KeywordTok{rep}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"Berkeley"}\NormalTok{, }\StringTok{"Stanford"}\NormalTok{), }\DecValTok{4}\NormalTok{)}
\NormalTok{dept\_list \textless{}{-}}\StringTok{ }\KeywordTok{rep}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"waterbenders"}\NormalTok{, }\StringTok{"earthbenders"}\NormalTok{, }\StringTok{"firebenders"}\NormalTok{, }\StringTok{"airbenders"}\NormalTok{), }\DecValTok{2}\NormalTok{)}

\CommentTok{\# Function}

\NormalTok{print\_lists \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(univ, dept) \{}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}
    \StringTok{"University = "}\NormalTok{, univ, }\StringTok{"|"}\NormalTok{,}
    \StringTok{"Department = "}\NormalTok{, dept}
\NormalTok{  ))}
\NormalTok{\}}

\CommentTok{\# Test}

\KeywordTok{print\_lists}\NormalTok{(univ\_list[}\DecValTok{1}\NormalTok{], dept\_list[}\DecValTok{1}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Step 2: Using \texttt{map2()} or \texttt{pmap()}
\end{itemize}

\includegraphics{https://dcl-prog.stanford.edu/images/map2.png}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 2 arguments}
\NormalTok{map2\_output \textless{}{-}}\StringTok{ }\KeywordTok{map2}\NormalTok{(univ\_list, dept\_list, print\_lists)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
\end{verbatim}

\includegraphics{https://d33wubrfki0l68.cloudfront.net/e426c5755e2e65bdcc073d387775db79791f32fd/92902/diagrams/functionals/pmap.png}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 3+ arguments}
\NormalTok{pmap\_output \textless{}{-}}\StringTok{ }\KeywordTok{pmap}\NormalTok{(}\KeywordTok{list}\NormalTok{(univ\_list, dept\_list), print\_lists)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge} Have you noticed that we used a slightly different input for \texttt{pmap()} compared to \texttt{map()} or \texttt{map2()}? What is the difference?
\end{itemize}

\hypertarget{automate-plotting}{%
\section{Automate plotting}\label{automate-plotting}}

\hypertarget{objective}{%
\subsection{Objective}\label{objective}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{map()} and \texttt{glue()} to automate creating multiple plots
\end{itemize}

\hypertarget{problem-1}{%
\subsection{Problem}\label{problem-1}}

\begin{itemize}
\tightlist
\item
  Making the following data visualization process more efficient.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{data}\NormalTok{(}\StringTok{"airquality"}\NormalTok{)}

\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Ozone, }\DataTypeTok{y =}\NormalTok{ Solar.R)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \StringTok{"Relationship between Ozone and Solar.R"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Solar.R"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 42 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-27-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Ozone, }\DataTypeTok{y =}\NormalTok{ Wind)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \StringTok{"Relationship between Ozone and Wind"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Wind"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-27-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Ozone, }\DataTypeTok{y =}\NormalTok{ Temp)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \StringTok{"Relationship between Ozone and Temp"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Temp"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-27-3.pdf}

\hypertarget{solution}{%
\subsection{Solution}\label{solution}}

\begin{itemize}
\item
  Learn how \texttt{glue()} works.
\item
  \texttt{glue()} combines strings and objects, and it is simpler and faster than \texttt{paste()} or \texttt{sprintf()}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{names \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Jae"}\NormalTok{, }\StringTok{"Aniket"}\NormalTok{, }\StringTok{"Avery"}\NormalTok{)}

\NormalTok{fields \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Political Science"}\NormalTok{, }\StringTok{"Law"}\NormalTok{, }\StringTok{"Public Health"}\NormalTok{)}

\KeywordTok{glue}\NormalTok{(}\StringTok{"\{names\} studies \{fields\}."}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Jae studies Political Science.
## Aniket studies Law.
## Avery studies Public Health.
\end{verbatim}

\begin{itemize}
\item
  So, our next step is to combine \texttt{glue()} and \texttt{map()}.
\item
  Let's first think about writing a function that includes \texttt{glue()}.
\item
  \textbf{Challenge}
  How can you create the character vector of column names?
\item
  \textbf{Challenge}
  How can you make \texttt{ggplot()} take strings as x and y variable names? (Hint: Type \texttt{?aes\_string()})
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes\_string}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{names}\NormalTok{(airquality)[}\DecValTok{1}\NormalTok{], }\DataTypeTok{y =} \KeywordTok{names}\NormalTok{(airquality)[}\DecValTok{2}\NormalTok{])) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \KeywordTok{glue}\NormalTok{(}\StringTok{"Relationship between Ozone and \{names(airquality)[2]\}"}\NormalTok{),}
    \DataTypeTok{y =} \KeywordTok{glue}\NormalTok{(}\StringTok{"\{names(airquality)[2]\}"}\NormalTok{)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 42 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-29-1.pdf}

\begin{itemize}
\item
  The next step is to write an automatic plotting function.

  \begin{itemize}
  \tightlist
  \item
    Note that in the function, the argument \texttt{i} (abstract) replaces \texttt{2} (specific): this is abstraction.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{create\_point\_plot \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(i) \{}
\NormalTok{  airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes\_string}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{names}\NormalTok{(airquality)[}\DecValTok{1}\NormalTok{], }\DataTypeTok{y =} \KeywordTok{names}\NormalTok{(airquality)[i])) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}
      \DataTypeTok{title =} \KeywordTok{glue}\NormalTok{(}\StringTok{"Relationship between Ozone and \{names(airquality)[i]\}"}\NormalTok{),}
      \DataTypeTok{y =} \KeywordTok{glue}\NormalTok{(}\StringTok{"\{names(airquality)[i]\}"}\NormalTok{)}
\NormalTok{    )}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  The final step is to put the function in \texttt{map()}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(}\DecValTok{2}\OperatorTok{:}\KeywordTok{ncol}\NormalTok{(airquality), create\_point\_plot)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 42 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-31-1.pdf}

\begin{verbatim}
## 
## [[2]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-31-2.pdf}

\begin{verbatim}
## 
## [[3]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-31-3.pdf}

\begin{verbatim}
## 
## [[4]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-31-4.pdf}

\begin{verbatim}
## 
## [[5]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-31-5.pdf}

\hypertarget{automate-joining}{%
\section{Automate joining}\label{automate-joining}}

\hypertarget{objective-1}{%
\subsection{Objective}\label{objective-1}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{reduce()} to automate joining multiple dataframes
\end{itemize}

\hypertarget{problem-2}{%
\subsection{Problem}\label{problem-2}}

\begin{itemize}
\item
  How can you make joining multiple dataframes more efficient?
\item
  Note that we will use \texttt{dplyr::left\_join()\ =\ merge(x,\ y,\ all.x\ =\ TRUE)}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1 \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{z =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}

\NormalTok{df2 \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{z =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}

\NormalTok{df3 \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{z =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{copy-and-paste}{%
\subsection{Copy and paste}\label{copy-and-paste}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{first\_join \textless{}{-}}\StringTok{ }\KeywordTok{left\_join}\NormalTok{(df1, df2)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = c("x", "y", "z")
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{second\_join \textless{}{-}}\StringTok{ }\KeywordTok{left\_join}\NormalTok{(first\_join, df3)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = c("x", "y", "z")
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{second\_join}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##       x     y     z
##   <int> <int> <int>
## 1     8     5     8
## 2     4     8     3
## 3     4     4     4
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge}
  Why is the above solution not efficient?
\end{itemize}

\hypertarget{reduce}{%
\subsection{reduce}\label{reduce}}

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/9c239e1227c69b7a2c9c2df234c21f3e1c74dd57/eec0e/diagrams/functionals/reduce.png}
\caption{How reduce() works.}
\end{figure}

\begin{verbatim}
- Input: Takes a vector of length n

- Computation: Calls a function with a pair of values at a time

- Output: Returns a vector of length 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{reduced \textless{}{-}}\StringTok{ }\KeywordTok{reduce}\NormalTok{(}\KeywordTok{list}\NormalTok{(df1, df2, df3), left\_join)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = c("x", "y", "z")
## Joining, by = c("x", "y", "z")
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{reduced}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##       x     y     z
##   <int> <int> <int>
## 1     8     5     8
## 2     4     8     3
## 3     4     4     4
\end{verbatim}

\hypertarget{make-automation-slower-or-faster}{%
\section{Make automation slower or faster}\label{make-automation-slower-or-faster}}

\hypertarget{objectives-2}{%
\subsection{Objectives}\label{objectives-2}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{slowly()} and \texttt{future\_} to make the automation process either slower or faster
\end{itemize}

\hypertarget{how-to-make-automation-slower}{%
\subsection{How to make automation slower}\label{how-to-make-automation-slower}}

\begin{itemize}
\tightlist
\item
  Suppose you are scraping 50 pages from a website and you don't want to overload the server. How can you do that?
\end{itemize}

\hypertarget{for-loop-2}{%
\subsection{For loop}\label{for-loop-2}}

\hypertarget{map}{%
\subsection{Map}\label{map}}

\begin{itemize}
\item
  \texttt{walk()} works the same as \texttt{map()} but doesn't store its output.
\item
  If you're web scraping, one problem with this approach is it's too fast by human standards.
\item
  If you want to make the function run slowly \ldots{}
\end{itemize}

\begin{quote}
slowly() takes a function and modifies it to wait a given amount of time between each call. --- \texttt{purrr} package vignette

If a function is a verb, then a helper function is an adverb (modifying the behavior of the verb).
\end{quote}

\hypertarget{how-to-make-automation-faster}{%
\subsection{How to make automation faster}\label{how-to-make-automation-faster}}

In a different situation, you want to make your function run faster. This is a common situation when you collect and analyze data at a large scale. You can solve this problem using parallel processing. For more on parallel processing in R, read \href{https://yxue-me.com/post/2019-05-12-a-glossary-of-parallel-computing-packages-in-r-2019/}{this review}.

\begin{itemize}
\item
  Parallel processing setup

  \begin{itemize}
  \item
    Step 1: Determine the number of max workers (\texttt{availableCores()})
  \item
    Step 2: Determine the parallel processing mode (\texttt{plan()})
  \end{itemize}
\end{itemize}

\hypertarget{make-error-handling-easier}{%
\section{Make error handling easier}\label{make-error-handling-easier}}

\hypertarget{learning-objective}{%
\subsection{Learning objective}\label{learning-objective}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{safely()} and \texttt{possibly()} to make error handling easier
\end{itemize}

\subsection{Problem}

\begin{itemize}
\item
  \textbf{Challenge}
\item
  Explain why we can't run \texttt{map(url\_lists,\ read\_html)}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{url\_lists \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}
  \StringTok{"https://en.wikipedia.org/wiki/University\_of\_California,\_Berkeley"}\NormalTok{,}
  \StringTok{"https://en.wikipedia.org/wiki/Stanford\_University"}\NormalTok{,}
  \StringTok{"https://en.wikipedia.org/wiki/Carnegie\_Mellon\_University"}\NormalTok{,}
  \StringTok{"https://DLAB"}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_lists, read\_html)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  This is a very simple problem so it's easy to tell where the problem is. How can you make your error more informative?
\end{itemize}

\hypertarget{solution-1}{%
\subsection{Solution}\label{solution-1}}

\hypertarget{try-catch}{%
\subsubsection{Try-catch}\label{try-catch}}

\begin{itemize}
\item
  There are three kinds of messages you may run into when something goes wrong in your code, each raised by one of the following functions.

  \begin{itemize}
  \tightlist
  \item
    \texttt{stop()}: errors; Functions must stop.
  \item
    \texttt{warning()}: warnings; Functions may still work. Nonetheless, something is possibly messed up.
  \item
    \texttt{message()}: messages; Some actions happened.
  \end{itemize}
\item
  The basic logic of \texttt{tryCatch()}, R's basic error handling function, works as follows.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tryCatch}\NormalTok{(}
\NormalTok{  \{}
    \KeywordTok{map}\NormalTok{(url\_lists, read\_html)}
\NormalTok{  \},}
  \DataTypeTok{warning =} \ControlFlowTok{function}\NormalTok{(w) \{}
    \StringTok{"Warning"}
\NormalTok{  \},}
  \DataTypeTok{error =} \ControlFlowTok{function}\NormalTok{(e) \{}
    \StringTok{"Error"}
\NormalTok{  \},}
  \DataTypeTok{finally =}\NormalTok{ \{}
    \StringTok{"Message"}
\NormalTok{  \}}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Error"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Here's the \texttt{purrr} version of the \texttt{try-catch} mechanism (evaluates code and assigns exception handlers).
\end{itemize}

\hypertarget{safely}{%
\subsubsection{safely}\label{safely}}

\textbf{Outputs}

\begin{itemize}
\tightlist
\item
  result: result or \texttt{NULL}
\item
  error: \texttt{NULL} or \texttt{error}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_lists, }\KeywordTok{safely}\NormalTok{(read\_html))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [[1]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[1]]$error
## NULL
## 
## 
## [[2]]
## [[2]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[2]]$error
## NULL
## 
## 
## [[3]]
## [[3]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[3]]$error
## NULL
## 
## 
## [[4]]
## [[4]]$result
## NULL
## 
## [[4]]$error
## <simpleError in open.connection(x, "rb"): Could not resolve host: DLAB>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  The easier way to solve this problem is just avoiding the error.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_lists, }\KeywordTok{safely}\NormalTok{(read\_html)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{map}\NormalTok{(}\StringTok{"result"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\CommentTok{\# = map(function(x) x[["result"]]) = map(\textasciitilde{}.x[["result"]])}
\StringTok{  }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{compact}\NormalTok{() }\CommentTok{\# Remove empty elements}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[2]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[3]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
\end{verbatim}

\hypertarget{possibly}{%
\subsubsection{possibly}\label{possibly}}

What if the best way to solve the problem is not ignoring the error \ldots{}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# If error occurred, "The URL is broken." will be stored in that element(s).}
\NormalTok{out \textless{}{-}}\StringTok{ }\KeywordTok{map}\NormalTok{(}
\NormalTok{  url\_lists,}
  \KeywordTok{possibly}\NormalTok{(read\_html,}
    \DataTypeTok{otherwise =} \StringTok{"The URL is broken."}
\NormalTok{  )}
\NormalTok{)}

\CommentTok{\# Let\textquotesingle{}s find the broken URL.}
\NormalTok{url\_lists[out[}\KeywordTok{seq}\NormalTok{(out)] }\OperatorTok{==}\StringTok{ "The URL is broken."}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "https://DLAB"
\end{verbatim}

\hypertarget{developing-your-own-data-products}{%
\section{Developing your own data products}\label{developing-your-own-data-products}}

\begin{quote}
A data product is the production output from a statistical analysis. - \href{https://sites.google.com/view/bcaffo/home}{Brian Caffo}
\end{quote}

\hypertarget{developing-r-packages}{%
\subsection{Developing R packages}\label{developing-r-packages}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Reuse your code
\item
  Automate your workflow
\item
  Help others (be part of an open source development community)
\end{enumerate}

\hypertarget{workflow}{%
\subsubsection{Workflow}\label{workflow}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Write code in \texttt{R/}
\item
  Document code in \texttt{man/} (automated by the \texttt{roxygen2} package)
\end{enumerate}

\begin{itemize}
\tightlist
\item
  \texttt{devtools::document()}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Check dependencies in \texttt{NAMESPACE}
\end{enumerate}

\begin{itemize}
\tightlist
\item
  \texttt{devtools::document()} updates the documentation and regenerates \texttt{NAMESPACE}
\item
  \texttt{devtools::check()} to see whether your package is ready to be submitted to CRAN
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Build a package (for more information, read \href{http://r-pkgs.had.co.nz/package.html}{this section} in Hadley's R package development book)
\end{enumerate}

\begin{itemize}
\tightlist
\item
  \texttt{devtools::build()}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{4}
\tightlist
\item
  (Optional) Test (\texttt{devtools::test()}), teach in \texttt{vignettes/}, and add data in \texttt{data/}
\item
  Distribute the package either via CRAN or GitHub
\end{enumerate}

See the package structure diagram: \url{http://r-pkgs.had.co.nz/diagrams/package-files.png}

\hypertarget{required-components}{%
\subsubsection{Required Components}\label{required-components}}

The 4 required components are necessary to build and distribute a minimally viable R package. The other steps are optional.

\begin{itemize}
\tightlist
\item
  Package

  \begin{itemize}
  \tightlist
  \item
    \texttt{R/}: R functions
  \item
    \texttt{man/}: function documentation
  \item
    DESCRIPTION: provides metadata about the package (e.g., author)
  \item
    LICENSE

    \begin{itemize}
    \tightlist
    \item
      GNU, MIT, etc.
    \end{itemize}
  \item
    NAMESPACE: package dependencies (to make your package self-contained)
  \item
    README (optional)
  \end{itemize}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Setup (\textbf{DESCRIPTION})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This function creates DESCRIPTION file }
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{create\_package}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"mypkg"}\NormalTok{))}

\CommentTok{\# Initialize git repo }
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_git}\NormalTok{()}

\CommentTok{\# License the package }
\CommentTok{\# You can use the MIT license by typing usethis::use\_mit\_license("author name"). The function produces MIT license related files (LICENSE, LICENSE.md).}
\KeywordTok{use\_mit\_license}\NormalTok{(}\StringTok{"Jae Yeon Kim"}\NormalTok{)}

\CommentTok{\# Add README (optional)}
\CommentTok{\# Makes the package more user{-}friendly }
\KeywordTok{use\_readme\_md}\NormalTok{()}

\CommentTok{\# Add news (optional) }
\CommentTok{\# Helps track changes }
\KeywordTok{use\_news\_md}\NormalTok{() }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Write code (\textbf{R})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_r}\NormalTok{(}\StringTok{"rbind\_mutate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\textquotesingle{} Add two numbers}
\CommentTok{\#\textquotesingle{}}
\CommentTok{\#\textquotesingle{} @param x A number}
\CommentTok{\#\textquotesingle{} @param y A number}
\CommentTok{\#\textquotesingle{} @return The sum of x and y }
\CommentTok{\#\textquotesingle{} @export}

\NormalTok{add \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x, y)\{}
  
\NormalTok{  x }\OperatorTok{+}\StringTok{ }\NormalTok{y}
  
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Document (\textbf{man})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Document }
\CommentTok{\# The function creates documentation related files (NAMESPACE, function\_name.Rd)}
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{document}\NormalTok{()}

\CommentTok{\# Load all }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{load\_all}\NormalTok{()}

\CommentTok{\# Check }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{check}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Organize (\textbf{NAMESPACE})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_package}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{optional-components}{%
\subsubsection{Optional Components}\label{optional-components}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Test (\textbf{test})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_testthat}\NormalTok{()}

\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_test}\NormalTok{(}\StringTok{"rbind\_mutate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Add data (\textbf{data})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ "Jae"}
\NormalTok{y \textless{}{-}}\StringTok{ "Sun"}
\NormalTok{z \textless{}{-}}\StringTok{ "Jane"}

\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_data}\NormalTok{(x, y, z, }\DataTypeTok{overwrite =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Teach (\textbf{vignettes})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_vignette}\NormalTok{(}\StringTok{"rbind\_mutate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{title}\OperatorTok{:}\StringTok{ "Vignette title"}
\NormalTok{author}\OperatorTok{:}\StringTok{ "Vignette author"}
\NormalTok{date}\OperatorTok{:}\StringTok{ "2020{-}10{-}28"}
\NormalTok{output}\OperatorTok{:}\StringTok{ }\NormalTok{rmarkdown}\OperatorTok{::}\NormalTok{html\_vignette}
\NormalTok{vignette}\OperatorTok{:}\StringTok{ }\NormalTok{blah blah}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  You can build a package website using \texttt{pkgdown}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# install.packages("pkgdown")}
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_pkgdown}\NormalTok{()}
\NormalTok{pkgdown}\OperatorTok{::}\KeywordTok{build\_site}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  A package site includes information on METADATA, Function references, Articles, News, etc.
\end{itemize}

\hypertarget{building-an-r-package}{%
\subsubsection{Building an R package}\label{building-an-r-package}}

\begin{itemize}
\tightlist
\item
  CMD (in the terminal)
\end{itemize}

You can run R commands in the terminal using R CMD.

\begin{itemize}
\tightlist
\item
  devtools
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Build }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{build}\NormalTok{()}

\CommentTok{\# Install }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\hypertarget{distributing-an-r-package}{%
\subsubsection{Distributing an R package}\label{distributing-an-r-package}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Version update }
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_version}\NormalTok{()}

\CommentTok{\# Spell check}
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_spell\_check}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \href{https://cran.r-project.org/}{CRAN (The Comprehensive R Archive Network)}
\end{enumerate}

\begin{itemize}
\tightlist
\item
  R package submission should comply with \href{https://cran.r-project.org/web/packages/policies.html}{the CRAN Repository Policy}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  GitHub
\end{enumerate}

\begin{itemize}
\item
  Push everything to the Git repository (you can do it using the command-line interface or RStudio).
\item
  Don't forget that your repository should be \texttt{public}.
\item
  I highly recommend connecting GitHub with SSH. For more information, visit \href{https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh}{this link}.
\end{itemize}

\hypertarget{developing-shiny-apps}{%
\subsection{Developing Shiny apps}\label{developing-shiny-apps}}

\href{https://shiny.rstudio.com/}{Shiny} is a ``framework for creating web applications using R code''. You can create a dashboard or an interactive map without knowing anything about HTML, CSS, or JavaScript. Developing a shiny app helps people with little technical expertise to learn from your data in an intuitive and interactive way.

\hypertarget{workflow-1}{%
\subsubsection{Workflow}\label{workflow-1}}

The workflow follows what Hadley Wickham recommended in his book on mastering shiny.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Install libraries
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"shiny"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Create app directory and file
\end{enumerate}

Add an \texttt{app.R} file.

The key objective here is defining your UI (how the app looks; front-end = INPUT) (defined in object \texttt{ui}) and server (how the app works; back-end = OUTPUT) (defined in object \texttt{server}).

If you're creating a complex app, you can achieve the same goal with two files: \texttt{ui.R} and \texttt{server.R}.

\hypertarget{app.r}{%
\subsubsection{app.R}\label{app.r}}

\begin{itemize}
\tightlist
\item
  Front-end
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Load packages }
\CommentTok{\# Do not use install.packages(), pacman::p\_load(), or library() if you intend to deploy the app using shinyapps.io }

\KeywordTok{require}\NormalTok{(}\StringTok{"wordcloud2"}\NormalTok{)}
\KeywordTok{require}\NormalTok{(}\StringTok{"shiny"}\NormalTok{)}
\KeywordTok{require}\NormalTok{(}\StringTok{"shinydashboard"}\NormalTok{)}
\KeywordTok{require}\NormalTok{(}\StringTok{"colourpicker"}\NormalTok{)}

\CommentTok{\# Load data }

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{url}\NormalTok{(}\StringTok{"https://github.com/jaeyk/covid19antiasian/raw/master/processed\_data/hash\_counts.csv"}\NormalTok{))[,}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}

\CommentTok{\# Defines the user interface; how the app looks}

\NormalTok{ui \textless{}{-}}\StringTok{ }\KeywordTok{fluidPage}\NormalTok{(}
  
    \CommentTok{\# Application title }
    \KeywordTok{titlePanel}\NormalTok{(}\StringTok{"Word Cloud on the Hashtags of the Tweets related to COVID{-}19 \& Asian|Chinese|Wuhan"}\NormalTok{),}
  
    \KeywordTok{h4}\NormalTok{(tags}\OperatorTok{$}\KeywordTok{a}\NormalTok{(}\DataTypeTok{href =} \StringTok{"https://jaeyk.github.io/"}\NormalTok{, }\StringTok{"Developer: Jae Yeon Kim"}\NormalTok{)),}
            
    \KeywordTok{sidebarLayout}\NormalTok{(}
      
      \CommentTok{\# Sidebar with sliders }
      \KeywordTok{sidebarPanel}\NormalTok{(}
        \KeywordTok{sliderInput}\NormalTok{(}\StringTok{"size"}\NormalTok{, }
                    \StringTok{"Font size:"}\NormalTok{,}
                    \DataTypeTok{min =} \DecValTok{1}\NormalTok{, }\DataTypeTok{max =} \DecValTok{10}\NormalTok{,}
                    \DataTypeTok{value =} \DecValTok{2}\NormalTok{)}
\NormalTok{      ),}
    
    \KeywordTok{mainPanel}\NormalTok{(}
          
          \KeywordTok{wordcloud2Output}\NormalTok{(}\StringTok{"cloud"}\NormalTok{),}
        
\NormalTok{        )}
    
\NormalTok{    )}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Back-end
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{server \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(input, output, session) \{}
  
\NormalTok{  output}\OperatorTok{$}\NormalTok{cloud \textless{}{-}}\StringTok{ }\KeywordTok{renderWordcloud2}\NormalTok{(\{ }
    
    \KeywordTok{wordcloud2}\NormalTok{(df, }
               \DataTypeTok{size =}\NormalTok{ input}\OperatorTok{$}\NormalTok{size, }
               \DataTypeTok{color =} \StringTok{"random{-}dark"}\NormalTok{) }
    
\NormalTok{    \})}

\NormalTok{  \}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Build a shiny app
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{shinyApp}\NormalTok{(}\DataTypeTok{ui =}\NormalTok{ ui, }\DataTypeTok{server =}\NormalTok{ server)}
\end{Highlighting}
\end{Shaded}

\hypertarget{deployment}{%
\subsubsection{Deployment}\label{deployment}}

\begin{itemize}
\tightlist
\item
  Deploy to \href{https://www.shinyapps.io/?_ga=2.5503866.871102833.1602978469-100003412.1602392815}{the shinyapps.io cloud}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install packages }
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"rsconnect"}\NormalTok{)}
\KeywordTok{library}\NormalTok{(rsconnect)}

\CommentTok{\# Setup }
\NormalTok{rsconnect}\OperatorTok{::}\KeywordTok{setAccountInfo}\NormalTok{(}\DataTypeTok{name =} \StringTok{"\textless{}Account name\textgreater{}"}\NormalTok{, }
                          \DataTypeTok{token =} \StringTok{"\textless{}Token\textgreater{}"}\NormalTok{,}
                          \DataTypeTok{secret =} \StringTok{"\textless{}Secret\textgreater{}"}\NormalTok{)}

\NormalTok{rsconnect}\OperatorTok{::}\KeywordTok{deployApp}\NormalTok{(}\DataTypeTok{appName =} \StringTok{"\textless{}App name\textgreater{}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{references-1}{%
\subsubsection{References}\label{references-1}}

\href{https://mastering-shiny.org/}{Mastering Shiny} by Hadley Wickham. For newbies.

\href{https://bookdown.org/yihui/rmarkdown/shiny-documents.html}{Shiny Documents} by Yihui Xie

\href{https://engineering-shiny.org/}{Engineering Production-Grade Shiny Apps} by Colin Fay, Sébastien Rochette, Vincent Guyader, Cervan Girard. For experienced developers.

\href{https://stat545.com/shiny-tutorial.html}{Building Shiny Apps} by Dean Attali.

\hypertarget{other-useful-data-products}{%
\subsection{Other useful data products}\label{other-useful-data-products}}

\begin{itemize}
\tightlist
\item
  Automating data reports using rmarkdown (called \href{https://rmarkdown.rstudio.com/developer_parameterized_reports.html}{parameterized reports})
\item
  Automating R presentation using \href{http://slidify.org/index.html}{slidify}
\item
  Creating interactive web maps using \href{https://rstudio.github.io/leaflet/}{leaflet}
\end{itemize}

\hypertarget{semi_structured_data}{%
\chapter{Semi-structured data}\label{semi_structured_data}}

\hypertarget{objectives-3}{%
\section{Objectives}\label{objectives-3}}

\begin{itemize}
\tightlist
\item
  Automating the process of turning semi-structured data (input) into structured data (output)
\end{itemize}

\hypertarget{what-is-semi-structured-data}{%
\section{What is semi-structured data?}\label{what-is-semi-structured-data}}

\begin{quote}
Semi-structured data is a form of structured data that does not obey the tabular structure of data models associated with relational databases or other forms of data tables, but nonetheless contains tags or other markers to separate semantic elements and enforce hierarchies of records and fields within the data. Therefore, it is also known as self-describing structure. - \href{https://en.wikipedia.org/wiki/Semi-structured_data\#:~:text=Semi\%2Dstructured\%20data\%20is\%20a,and\%20fields\%20within\%20the\%20data.}{Wikipedia}
\end{quote}

\begin{itemize}
\item
  Examples: \texttt{HTML\ (Hypertext\ Markup\ Language)} files (e.g., websites) and \texttt{JSON\ (JavaScript\ Object\ Notation)} files (e.g., tweets)
\item
  Why should we care about semi-structured data?

  \begin{itemize}
  \tightlist
  \item
    Because this is what the data frontier looks like: \# of unstructured data \textgreater{} \# of semi-structured data \textgreater{} \# of structured data
  \item
    There are easy and fast ways to turn semi-structured data into structured data (ideally in a tidy format) using R, Python, and command-line tools. See my own examples (\href{https://github.com/jaeyk/tidyethnicnews}{tidyethnicnews} and \href{https://github.com/jaeyk/tidytweetjson}{tidytweetjson}).
  \end{itemize}
\end{itemize}

\hypertarget{workflow-2}{%
\section{Workflow}\label{workflow-2}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Import/connect to a semi-structured file using \texttt{rvest}, \texttt{jsonlite}, \texttt{xml2}, \texttt{pdftools}, \texttt{tidyjson}, etc.
\item
  Define target elements in the single file and extract them
\end{enumerate}

\begin{itemize}
\item
  \href{https://readr.tidyverse.org/}{\texttt{readr}} package provides \texttt{parse\_} functions that are useful for vector parsing.
\item
  \href{https://stringr.tidyverse.org/}{\texttt{stringr}} package for string manipulations (e.g., using regular expressions in a tidy way). Quite useful for parsing PDF files (see \href{https://themockup.blog/posts/2020-04-03-beer-and-pdftools-a-vignette/}{this example}).
\item
  \href{https://github.com/tidyverse/rvest}{\texttt{rvest}} package for parsing HTML (R equivalent to \texttt{beautiful\ soup} in Python)
\item
  \href{https://github.com/sailthru/tidyjson}{\texttt{tidyjson}} package for parsing JSON data
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\item
  Create a list of files (in this case URLs) to parse
\item
  Write a parsing function
\item
  Automate parsing process
\end{enumerate}

\hypertarget{htmlcss-web-scraping}{%
\section{HTML/CSS: web scraping}\label{htmlcss-web-scraping}}

\hypertarget{xmljson-social-media-scraping}{%
\section{XML/JSON: social media scraping}\label{xmljson-social-media-scraping}}

\hypertarget{api}{%
\subsection{API}\label{api}}

\hypertarget{objectives-4}{%
\subsubsection{Objectives}\label{objectives-4}}

\begin{itemize}
\tightlist
\item
  Learning what kind of social media data are accessible through application programming interfaces (APIs)
\end{itemize}

\textbf{Review question}

In the previous session, we learned the difference between semi-structured data and structured data. Can anyone tell us the difference between them?

\hypertarget{the-big-picture-for-digital-data-collection}{%
\subsubsection{The big picture for digital data collection}\label{the-big-picture-for-digital-data-collection}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Input: semi-structured data
\item
  Output: structured data
\item
  Process:

  \begin{itemize}
  \item
    Getting \textbf{target data} from a remote server

    \begin{itemize}
    \tightlist
    \item
      The target data is usually huge (\textgreater10GB) by the traditional social science standard.
    \end{itemize}
  \item
    Parsing the target data on your laptop or in a database

    \begin{itemize}
    \tightlist
    \item
      Laptop (sample-parse): Downsample the large target data and parse it on your laptop. This is just one option to \href{https://rviews.rstudio.com/2019/07/17/3-big-data-strategies-for-r/}{deal with big data in R}. It's a simple strategy as it doesn't require storing target data in your own database.
    \end{itemize}
  \item
    Database (push-parse): Push the large target data to a database, then explore, select, and filter it. If you are interested in this option, check out my \href{https://github.com/dlab-berkeley/sql-for-r-users}{SQL for R Users} workshop.
  \end{itemize}
\end{enumerate}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-07-01-3-big-data-paradigms-for-r_files/sample_model.png}
\caption{Sample-Parse. From RStudio.}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-07-01-3-big-data-paradigms-for-r_files/push_data.png}
\caption{Push-Parse. From RStudio.}
\end{figure}

\begin{itemize}
\item
  But what exactly is this target data?

  \begin{itemize}
  \item
    When you scrape websites, you mostly deal with HTML (defines a structure of a website), CSS (its style), and JavaScript (its dynamic interactions).
  \item
    When you access social media data through API, you deal with either XML or JSON (major formats for storing and transporting data; they are light and flexible).
  \item
    XML and JSON have tree-like (nested; a root and branches) structures and keys and values (or elements and attributes).
  \item
    If HTML, CSS, and JavaScript are storefronts, then XML and JSON are warehouses.
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://upload.wikimedia.org/wikipedia/commons/9/97/Automatisches_Kleinteilelager.jpg}
\caption{By Andreas Praefcke (Own work), via Wikimedia Commons}
\end{figure}

\hypertarget{opportunities-and-challenges-for-parsing-social-media-data}{%
\subsubsection{Opportunities and challenges for parsing social media data}\label{opportunities-and-challenges-for-parsing-social-media-data}}

This explanation draws on Pablo Barberá's \href{http://pablobarbera.com/social-media-workshop/social-media-slides.pdf}{LSE social media workshop slides}.

\textbf{Basic information}

\begin{itemize}
\item
  What is an API?: An interface (you can think of it as something akin to a restaurant menu. API parameters are menu items.)

  \begin{itemize}
  \item
    \href{https://en.wikipedia.org/wiki/Representational_state_transfer}{REST} (Representational state transfer) API: static information (e.g., user profiles, list of followers and friends)

    \begin{itemize}
    \tightlist
    \item
      R packages: \href{https://github.com/pablobarbera/twitter_ideology/tree/master/pkg/tweetscores}{tweetscores}, \href{https://cran.r-project.org/web/packages/twitteR/twitteR.pdf}{twitteR}, \href{https://github.com/ropensci/rtweet}{rtweet}
    \end{itemize}
  \item
    \href{https://blog.axway.com/amplify/api-management/streaming-apis\#:~:text=Streaming\%20APIs\%20are\%20used\%20to,a\%20subset\%20of\%20Streaming\%20APIS.}{Streaming API}: dynamic information (e.g., new tweets)

    \begin{itemize}
    \tightlist
    \item
      This streaming data is filtered by (1) keywords, (2) location, and (3) sample (1\% of the total tweets)
    \item
      R packages: \href{https://github.com/pablobarbera/streamR}{streamR}
    \end{itemize}
  \end{itemize}
\end{itemize}

\textbf{Status}

\begin{itemize}
\tightlist
\item
  Twitter API is still widely accessible (\href{https://developer.twitter.com/en/docs/twitter-api/early-access}{v2} recently released; new fields available such as \href{https://developer.twitter.com/en/docs/twitter-api/conversation-id}{conversation threads}).
\end{itemize}

\begin{quote}
Twitter data is unique from data shared by most other social platforms because it reflects information that users \emph{choose} to share publicly. Our API platform provides broad access to public Twitter data that users have chosen to share with the world. - Twitter Help Center
\end{quote}

\begin{itemize}
\item
  What does this policy mean? If Twitter users don't share the locations of their tweets (e.g., GPS), you can't collect them.
\item
  Facebook API access has become much more constrained since the 2016 U.S. election, with \href{https://socialscience.one/blog/unprecedented-facebook-urls-dataset-now-available-research-through-social-science-one}{the exception of Social Science One}.
\item
  \href{https://developers.google.com/youtube/v3}{YouTube API} access is somewhat limited (but you need to check, as I'm not up to date on this).
\end{itemize}

\textbf{Upside}

\begin{itemize}
\tightlist
\item
  Legal and well-documented.
\end{itemize}

Web scraping (Wild Wild West) \textless\textgreater{} API (Big Gated Garden)

\begin{itemize}
\item
  You have legal but limited access to (growing) big data that can be divided into text, image, and video and transformed into cross-sectional (geocodes), longitudinal (timestamps), and event historical data (hashtags). For more information, see Zachary C. Steinert-Threlkeld's \href{https://github.com/ZacharyST/APSA2020_EventDataFromSocialMedia}{2020 APSA Short Course Generating Event Data From Social Media}.
\item
  Social media data are also well-organized, managed, and curated data. It's easy to navigate because XML and JSON have keys and values. If you find the keys, you will find the observations you are looking for.
\end{itemize}

\textbf{Downside}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Rate-limited.
\item
  If you want access to more, and more varied, data than what is available, you need to pay for premium access.
\end{enumerate}

\hypertarget{next-steps}{%
\subsubsection{Next steps}\label{next-steps}}

\begin{itemize}
\item
  If you want to know how to sign up for a new Twitter developer account and access the Twitter API, then see Steinert-Threlkeld's \href{https://github.com/ZacharyST/APSA2020_EventDataFromSocialMedia/blob/master/Presentation/02_AccessTwitter.pdf}{APSA workshop slides}.
\item
  If you want to know how to use the \texttt{tweetscores} package, then see Pablo Barberá's R markdown file for \href{http://pablobarbera.com/social-media-workshop/code/02-twitter-REST-data-collection.html}{scraping data from Twitter's REST API}.
\end{itemize}

\hypertarget{hydrating}{%
\subsection{Hydrating}\label{hydrating}}

\hypertarget{objectives-5}{%
\subsubsection{Objectives}\label{objectives-5}}

\begin{itemize}
\tightlist
\item
  Learning how hydrating works
\item
  Learning how to use \href{https://github.com/DocNow/twarc}{Twarc} to communicate with Twitter's API
\end{itemize}

\textbf{Review question}

What are the two main types of Twitter's API?

\hypertarget{hydrating-an-alternative-way-to-collect-historical-twitter-data}{%
\subsubsection{Hydrating: An Alternative Way to Collect Historical Twitter Data}\label{hydrating-an-alternative-way-to-collect-historical-twitter-data}}

\begin{itemize}
\item
  You can collect Twitter data using Twitter's API or you can hydrate Tweet IDs collected by other researchers. This is a good resource to collect historical Twitter data.
\item
  \href{http://www.panacealab.org/covid19/}{Covid-19 Twitter chatter dataset for scientific use} by Panacealab
\item
  \href{https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5ZVMOR}{Women's March Dataset} by Littman and Park
\item
  Harvard Dataverse has a number of dehydrated Tweet IDs that could be of interest to social scientists.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://github.com/jaeyk/digital_data_collection_workshop/raw/master/misc/dehydrated_tweets.png}
\caption{Dehydrated Tweet IDs}
\end{figure}

\hypertarget{twarc-one-solution-to-almost-all-twitters-api-problems}{%
\subsubsection{Twarc: one solution to (almost) all Twitter's API problems}\label{twarc-one-solution-to-almost-all-twitters-api-problems}}

\begin{itemize}
\item
  Why Twarc?

  \begin{itemize}
  \item
    A command-line tool and Python library that works for almost every Twitter API-related problem.
  \item
    It's really well-documented, tested, and maintained.

    \begin{itemize}
    \tightlist
    \item
      \href{https://scholarslab.github.io/learn-twarc/06-twarc-command-basics}{Twarc documentation} covers basic commands.
    \item
      \href{https://twarc-cloud.readthedocs.io/_/downloads/en/stable/pdf/}{Twarc-cloud documentation} explains how to collect data from Twitter's API using Twarc running in \href{https://aws.amazon.com/}{Amazon Web Services} (AWS).
    \end{itemize}
  \item
    Twarc was developed as part of the \href{https://www.docnow.io/}{Documenting the Now} project which was funded by the Mellon Foundation.
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://vignette.wikia.nocookie.net/lotr/images/8/8b/DOiAi2WUEAE3A1Y.0.jpg/revision/latest/scale-to-width-down/699?cb=20200305221819}
\caption{One ring that rules them all.}
\end{figure}

\begin{itemize}
\item
  There's no reason to be afraid of using a command-line tool and Python library, even though you primarily use R. It's easy to embed \href{https://bookdown.org/yihui/rmarkdown/language-engines.html\#python}{Python code} and \href{https://bookdown.org/yihui/rmarkdown/language-engines.html\#shell-scripts}{shell scripts} in R Markdown.
\item
  Even if you don't know how to write Python code or shell scripts, it's really useful to know how to integrate them in your R workflow.
\item
  I assume that you have already installed \href{https://www.python.org/downloads/}{Python 3}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{pip3}\NormalTok{ install twarc}
\end{Highlighting}
\end{Shaded}

\hypertarget{applications}{%
\paragraph{Applications}\label{applications}}

The following examples are created by \href{http://digitalcollecting.lib.virginia.edu/toolkit/docs/social-media/twarc-commands/}{the University of Virginia library}.

\hypertarget{search}{%
\subparagraph{Search}\label{search}}

\begin{itemize}
\item
  Download pre-existing tweets (7-day window) matching certain conditions
\item
  On the command line, \texttt{\textgreater{}} redirects the output of a command into a file (creating the file if needed)
\item
  I recommend running the following commands in the terminal because it's more stable than doing so in R Markdown.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://github.com/jaeyk/digital_data_collection_workshop/raw/master/misc/terminal.png}
\caption{You can type commands in the Terminal in R Studio.}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Key word }
\ExtensionTok{twarc}\NormalTok{ search blacklivesmatter }\OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Hashtag }
\ExtensionTok{twarc}\NormalTok{ search }\StringTok{\textquotesingle{}\#blacklivesmatter\textquotesingle{}} \OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets\_hash.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Hashtag + Language }
\ExtensionTok{twarc}\NormalTok{ search }\StringTok{\textquotesingle{}\#blacklivesmatter\textquotesingle{}}\NormalTok{ {-}{-}lang en }\OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets\_hash.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  It is really important to \textbf{save these tweets into a \texttt{jsonl} format;} \texttt{jsonl} extension refers to JSON \textbf{Lines} files. This structure is useful for splitting JSON data into smaller chunks, if it is too large.
\end{itemize}

\hypertarget{filter}{%
\subparagraph{Filter}\label{filter}}

\begin{itemize}
\tightlist
\item
  Download tweets meeting certain conditions as they happen.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Key word}
\ExtensionTok{twarc}\NormalTok{ filter blacklivesmatter }\OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets.jsonl}
\end{Highlighting}
\end{Shaded}

\hypertarget{sample}{%
\subparagraph{Sample}\label{sample}}

\begin{itemize}
\tightlist
\item
  Use Twitter's random sample of recent tweets.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{twarc}\NormalTok{ sample }\OperatorTok{\textgreater{}}\NormalTok{ tweets.jsonl }
\end{Highlighting}
\end{Shaded}

\hypertarget{hydrate}{%
\subparagraph{Hydrate}\label{hydrate}}

\begin{itemize}
\tightlist
\item
  Tweet IDs -\textgreater{} Tweets
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{twarc}\NormalTok{ hydrate tweet\_ids.txt }\OperatorTok{\textgreater{}}\NormalTok{ tweets.jsonl }
\end{Highlighting}
\end{Shaded}

\hypertarget{dehydrate}{%
\subparagraph{Dehydrate}\label{dehydrate}}

\begin{itemize}
\tightlist
\item
  Hydrate \textless\textgreater{} Dehydrate
\item
  Tweets -\textgreater{} Tweet IDs
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{twarc}\NormalTok{ dehydrate tweets.jsonl }\OperatorTok{\textgreater{}}\NormalTok{ tweet\_ids.txt}
\end{Highlighting}
\end{Shaded}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Collect tweets containing some keywords of your choice using \texttt{twarc\ search} and save them as \texttt{tweets.jsonl}.
\item
  Using the \texttt{less} command in the terminal, inspect \texttt{twarc.log}.
\item
  Using the \texttt{less} command in the terminal, inspect \texttt{tweets.jsonl}.
\end{enumerate}

\hypertarget{parsing-json}{%
\subsection{Parsing JSON}\label{parsing-json}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install packages }
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(tidyverse, }\CommentTok{\# tidyverse pkgs including purrr}
\NormalTok{               furrr, }\CommentTok{\# parallel processing }
\NormalTok{               tictoc, }\CommentTok{\# performance test  }
\NormalTok{               tcltk, }\CommentTok{\# GUI for choosing a dir path }
\NormalTok{               tidyjson) }\CommentTok{\# tidying JSON files }

\CommentTok{\#\# Install the current development version from GitHub}
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install\_github}\NormalTok{(}\StringTok{"jaeyk/tidytweetjson"}\NormalTok{,}
                         \DataTypeTok{dependencies =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Skipping install of 'tidytweetjson' from a github remote, the SHA1 (b598dcc1) has not changed since last install.
##   Use `force = TRUE` to force installation
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{library}\NormalTok{(tidytweetjson)}
\end{Highlighting}
\end{Shaded}

\hypertarget{objectives-6}{%
\subsubsection{Objectives}\label{objectives-6}}

\begin{itemize}
\tightlist
\item
  Learning chunk and pull strategy
\item
  Learning how \texttt{tidyjson} works
\item
  Learning how to apply \texttt{tidyjson} to tweets
\end{itemize}

\hypertarget{chunk-and-pull}{%
\subsubsection{Chunk and Pull}\label{chunk-and-pull}}

\hypertarget{problem-3}{%
\paragraph{Problem}\label{problem-3}}

\begin{itemize}
\tightlist
\item
  What if the size of the Twitter data you downloaded is too big (e.g., \textgreater10GB) to do complex wrangling in R?
\end{itemize}

\hypertarget{solution-2}{%
\paragraph{Solution}\label{solution-2}}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-07-01-3-big-data-paradigms-for-r_files/chunk_pull.png}
\caption{Chunk and Pull. From RStudio.}
\end{figure}

Step 1: Split the large JSON file into small chunks.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#Divide the JSON file by 100 lines (tweets)}

\CommentTok{\# Linux and Windows (in Bash)}
\NormalTok{$ }\FunctionTok{split}\NormalTok{ {-}100 search.jsonl}

\CommentTok{\# macOS}
\NormalTok{$ }\ExtensionTok{gsplit}\NormalTok{ {-}100 search.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  After that, you will see that several files have appeared in the directory. Each of these files should have 100 tweets or fewer. All of these file names \textbf{should start with ``x'', as in ``xaa''.}
\end{itemize}

Step 2: Apply the parsing function to each chunk and pull all of these chunks together.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# You need to choose a Tweet JSON file}
\NormalTok{filepath \textless{}{-}}\StringTok{ }\KeywordTok{file.choose}\NormalTok{()}

\CommentTok{\# Assign the parsed result to the \textasciigrave{}df\textasciigrave{} object}
\CommentTok{\# 11.28 sec elapsed to parse 17,928 tweets }
\KeywordTok{tic}\NormalTok{()}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{jsonl\_to\_df}\NormalTok{(filepath)}
\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Setup }
\NormalTok{n\_cores \textless{}{-}}\StringTok{ }\KeywordTok{availableCores}\NormalTok{() }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}

\NormalTok{n\_cores }\CommentTok{\# This number depends on your computer spec.}

\KeywordTok{plan}\NormalTok{(multiprocess, }\CommentTok{\# multicore, if supported, otherwise multisession}
     \DataTypeTok{workers =}\NormalTok{ n\_cores) }\CommentTok{\# the maximum number of workers}

\CommentTok{\# You need to designate a directory path where you saved the list of JSON files.}

\CommentTok{\# 9.385 sec elapsed to parse 17,928 tweets }

\NormalTok{dirpath \textless{}{-}}\StringTok{ }\NormalTok{tcltk}\OperatorTok{::}\KeywordTok{tk\_choose.dir}\NormalTok{()}

\KeywordTok{tic}\NormalTok{()}
\NormalTok{df\_all \textless{}{-}}\StringTok{ }\NormalTok{tidytweetjson}\OperatorTok{::}\KeywordTok{jsonl\_to\_df\_all}\NormalTok{(dirpath)}
\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\hypertarget{tidyjson}{%
\paragraph{tidyjson}\label{tidyjson}}

The \href{https://cran.r-project.org/web/packages/tidyjson/vignettes/introduction-to-tidyjson.html}{\texttt{tidyjson}} package helps apply the tidyverse framework to JSON data.

\begin{itemize}
\tightlist
\item
  toy example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# JSON collection; nested structure + keys and values }
\NormalTok{worldbank[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "{\"_id\":{\"$oid\":\"52b213b38594d8a2be17c780\"},\"boardapprovaldate\":\"2013-11-12T00:00:00Z\",\"closingdate\":\"2018-07-07T00:00:00Z\",\"countryshortname\":\"Ethiopia\",\"majorsector_percent\":[{\"Name\":\"Education\",\"Percent\":46},{\"Name\":\"Education\",\"Percent\":26},{\"Name\":\"Public Administration, Law, and Justice\",\"Percent\":16},{\"Name\":\"Education\",\"Percent\":12}],\"project_name\":\"Ethiopia General Education Quality Improvement Project II\",\"regionname\":\"Africa\",\"totalamt\":130000000}"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Check out keys (objects)}
\NormalTok{worldbank }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{as.tbl\_json}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather\_object}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(document.id }\OperatorTok{==}\StringTok{ }\DecValTok{1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tbl_json: 8 x 3 tibble with a "JSON" attribute
##   ..JSON                  document.id name               
##   <chr>                         <int> <chr>              
## 1 "{\"$oid\":\"52b213..."           1 _id                
## 2 "\"2013-11-12T00:..."             1 boardapprovaldate  
## 3 "\"2018-07-07T00:..."             1 closingdate        
## 4 "\"Ethiopia\""                    1 countryshortname   
## 5 "[{\"Name\":\"Educa..."           1 majorsector_percent
## 6 "\"Ethiopia Gener..."             1 project_name       
## 7 "\"Africa\""                      1 regionname         
## 8 "130000000"                       1 totalamt
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Get the values associated with the keys }
\NormalTok{worldbank }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{as.tbl\_json}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Turn JSON into tbl\_json object }
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"project\_name"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Enter the objects }
\StringTok{  }\KeywordTok{append\_values\_string}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Append the values }
\StringTok{  }\KeywordTok{as\_tibble}\NormalTok{() }\CommentTok{\# To reduce the size of the file }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 500 x 2
##    document.id string                                                           
##          <int> <chr>                                                            
##  1           1 Ethiopia General Education Quality Improvement Project II        
##  2           2 TN: DTF Social Protection Reforms Support                        
##  3           3 Tuvalu Aviation Investment Project - Additional Financing        
##  4           4 Gov't and Civil Society Organization Partnership                 
##  5           5 Second Private Sector Competitiveness and Economic Diversificati~
##  6           6 Additional Financing for Cash Transfers for Orphans and Vulnerab~
##  7           7 National Highways Interconnectivity Improvement Project          
##  8           8 China Renewable Energy Scale-Up Program Phase II                 
##  9           9 Rajasthan Road Sector Modernization Project                      
## 10          10 MA Accountability and Transparency DPL                           
## # ... with 490 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  The following example draws on my \href{https://github.com/jaeyk/tidytweetjson}{tidytweetjson} R package. The package applies \texttt{tidyjson} to Tweets.
\end{itemize}

\hypertarget{individual-file}{%
\subparagraph{Individual file}\label{individual-file}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{jsonl\_to\_df \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(file\_path)\{}

\CommentTok{\# Save file name }

\NormalTok{file\_name \textless{}{-}}\StringTok{ }\KeywordTok{strsplit}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ file\_path, }
                     \DataTypeTok{split =} \StringTok{"[/]"}\NormalTok{) }

\NormalTok{file\_name \textless{}{-}}\StringTok{ }\NormalTok{file\_name[[}\DecValTok{1}\NormalTok{]][}\KeywordTok{length}\NormalTok{(file\_name[[}\DecValTok{1}\NormalTok{]])]}

\CommentTok{\# Import a Tweet JSON file}

\NormalTok{listed \textless{}{-}}\StringTok{ }\KeywordTok{read\_json}\NormalTok{(file\_path, }\DataTypeTok{format =} \KeywordTok{c}\NormalTok{(}\StringTok{"jsonl"}\NormalTok{))}

\CommentTok{\# IDs of the tweets with country codes}

\NormalTok{ccodes \textless{}{-}}\StringTok{ }\NormalTok{listed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"place"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"country\_code"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{append\_values\_string}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{as\_tibble}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{rename}\NormalTok{(}\StringTok{"country\_code"}\NormalTok{ =}\StringTok{ "string"}\NormalTok{)}

\CommentTok{\# IDs of the tweets with location}

\NormalTok{locations \textless{}{-}}\StringTok{ }\NormalTok{listed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"user"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"location"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{append\_values\_string}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{as\_tibble}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{location =} \StringTok{"string"}\NormalTok{)}

\CommentTok{\# Extract other key elements from the JSON file}

\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{listed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{spread\_values}\NormalTok{(}
    \DataTypeTok{id =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"id"}\NormalTok{),}
    \DataTypeTok{created\_at =} \KeywordTok{jstring}\NormalTok{(}\StringTok{"created\_at"}\NormalTok{),}
    \DataTypeTok{full\_text =} \KeywordTok{jstring}\NormalTok{(}\StringTok{"full\_text"}\NormalTok{),}
    \DataTypeTok{retweet\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"retweet\_count"}\NormalTok{),}
    \DataTypeTok{favorite\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"favorite\_count"}\NormalTok{),}
    \DataTypeTok{user.followers\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"user.followers\_count"}\NormalTok{),}
    \DataTypeTok{user.friends\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"user.friends\_count"}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{      }\NormalTok{as\_tibble}

\KeywordTok{message}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"Parsing"}\NormalTok{, file\_name, }\StringTok{"done."}\NormalTok{))}

\CommentTok{\# Full join}
\NormalTok{outcome \textless{}{-}}\StringTok{ }\KeywordTok{full\_join}\NormalTok{(ccodes, df) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{full\_join}\NormalTok{(locations)}

\CommentTok{\# Or you can write this way: outcome \textless{}{-} reduce(list(df, ccodes, locations), full\_join)}

\CommentTok{\# Select}
\NormalTok{outcome }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{select}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\StringTok{"document.id"}\NormalTok{))\}}
\end{Highlighting}
\end{Shaded}

\hypertarget{many-files}{%
\subparagraph{Many files}\label{many-files}}

\begin{itemize}
\tightlist
\item
  Set up parallel processing.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{n\_cores \textless{}{-}}\StringTok{ }\KeywordTok{availableCores}\NormalTok{() }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}

\NormalTok{n\_cores }\CommentTok{\# This number depends on your computer spec.}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## system 
##      7
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{plan}\NormalTok{(multiprocess, }\CommentTok{\# multicore, if supported, otherwise multisession}
     \DataTypeTok{workers =}\NormalTok{ n\_cores) }\CommentTok{\# the maximum number of workers}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: [ONE-TIME WARNING] Forked processing ('multicore') is disabled
## in future (>= 1.13.0) when running R from RStudio, because it is
## considered unstable. Because of this, plan("multicore") will fall
## back to plan("sequential"), and plan("multiprocess") will fall back to
## plan("multisession") - not plan("multicore") as in the past. For more details,
## how to control forked processing or not, and how to silence this warning in
## future R sessions, see ?future::supportsMulticore
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Parsing in parallel.
\end{itemize}

\textbf{Review}

There are at least three ways you can use function + \texttt{purrr::map()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{squared \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
\NormalTok{  x}\OperatorTok{*}\DecValTok{2} 
\NormalTok{\}}

\CommentTok{\# Named function }
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{, squared)}

\CommentTok{\# Anonymous function }
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{, }\ControlFlowTok{function}\NormalTok{(x)\{ x }\OperatorTok{*}\DecValTok{2}\NormalTok{ \})}

\CommentTok{\# Using formula; \textasciitilde{} = formula, .x = input }
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{,}\OperatorTok{\textasciitilde{}}\NormalTok{.x}\OperatorTok{*}\DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a list of file paths }
\NormalTok{filename \textless{}{-}}\StringTok{ }\KeywordTok{list.files}\NormalTok{(dir\_path,}
          \DataTypeTok{pattern =} \StringTok{\textquotesingle{}\^{}x\textquotesingle{}}\NormalTok{,}
          \DataTypeTok{full.names =} \OtherTok{TRUE}\NormalTok{)}

\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{filename }\OperatorTok{\%\textgreater{}\%}

\CommentTok{\# Apply jsonl\_to\_df function to items on the list}
\KeywordTok{future\_map}\NormalTok{(}\OperatorTok{\textasciitilde{}}\KeywordTok{jsonl\_to\_df}\NormalTok{(.)) }\OperatorTok{\%\textgreater{}\%}

\CommentTok{\# Full join the list of dataframes}
\KeywordTok{reduce}\NormalTok{(full\_join,}
       \DataTypeTok{by =} \KeywordTok{c}\NormalTok{(}\StringTok{"id"}\NormalTok{,}
              \StringTok{"location"}\NormalTok{,}
              \StringTok{"country\_code"}\NormalTok{,}
              \StringTok{"created\_at"}\NormalTok{,}
              \StringTok{"full\_text"}\NormalTok{,}
              \StringTok{"retweet\_count"}\NormalTok{,}
              \StringTok{"favorite\_count"}\NormalTok{,}
              \StringTok{"user.followers\_count"}\NormalTok{,}
              \StringTok{"user.friends\_count"}\NormalTok{))}

\CommentTok{\# Output}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\hypertarget{machine_learning}{%
\chapter{High-dimensional data}\label{machine_learning}}

\hypertarget{overview}{%
\section{Overview}\label{overview}}

\begin{itemize}
\item
  The rise of high-dimensional data. The new data frontiers in social sciences---text (\href{https://web.stanford.edu/~gentzkow/research/text-as-data.pdf}{Gentzkow et al.~2019}; \href{https://www.jstor.org/stable/pdf/24572662.pdf?casa_token=SQdSI4R_VdwAAAAA:4QiVLhCXqr9f0qNMM9U75EL5JbDxxnXxUxyIfDf0U8ZzQx9szc0xVqaU6DXG4nHyZiNkvcwGlgD6H0Lxj3y0ULHwgkf1MZt8-9TPVtkEH9I4AHgbTg}{Grimmer and Stewart 2013}) and image (\href{https://arxiv.org/pdf/1810.01544}{Joo and Steinert-Threlkeld 2018})---are all high-dimensional data.

  \begin{itemize}
  \item
    1000 common English words for 30-word tweets: \(1000^{30}\) similar to N of atoms in the universe (\href{https://web.stanford.edu/~gentzkow/research/text-as-data.pdf}{Gentzkow et al.~2019})
  \item
    Belloni, Alexandre, Victor Chernozhukov, and Christian Hansen. \href{https://pubs.aeaweb.org/doi/pdfplus/10.1257/jep.28.2.29}{``High-dimensional methods and inference on structural and treatment effects.''} \emph{Journal of Economic Perspectives 28}, no. 2 (2014): 29-50.
  \end{itemize}
\item
  The rise of a new approach: statistics + computer science = machine learning
\item
  Statistical inference

  \begin{itemize}
  \item
    \(y\) \textless- some probability models (e.g., linear regression, logistic regression) \textless- \(x\)
  \item
    \(y\) = \(X\beta\) + \(\epsilon\)
  \item
    The goal is to estimate \(\beta\)
  \end{itemize}
\item
  Machine learning

  \begin{itemize}
  \item
    \(y\) \textless- unknown \textless- \(x\)
  \item
    \(y\) \textless-\textgreater{} decision trees, neural nets \textless-\textgreater{} \(x\)
  \item
    For the main idea behind prediction modeling, see Breiman, Leo (Berkeley stat faculty who passed away in 2005). \href{https://projecteuclid.org/euclid.ss/1009213726}{``Statistical modeling: The two cultures (with comments and a rejoinder by the author).''} \emph{Statistical science} 16, no. 3 (2001): 199-231.
  \item
    ``The problem is to find an algorithm \(f(x)\) such that for future \(x\) in a test set, \(f(x)\) will be a good predictor of \(y\).''
  \item
    ``There are \textbf{two cultures} in the use of statistical modeling to reach conclusions from data. One assumes that the data are generated by a \textbf{given} \textbf{stochastic data model}. The other uses \textbf{algorithmic models} and treats the data mechanism as \textbf{unknown}.''
  \end{itemize}
\end{itemize}

\begin{quote}
Algorithmic modeling, both in theory and practice, has developed rapidly in fields outside statistics. It can be used both on large complex data sets and as a more accurate and informative alternative to data modeling on smaller data sets. - Leo Breiman
\end{quote}

\begin{itemize}
\item
  How does ML differ from econometrics?
\item
  A review by Athey, Susan, and Guido W. Imbens. \href{https://www.annualreviews.org/doi/full/10.1146/annurev-economics-080217-053433}{``Machine learning methods that economists should know about.''} \emph{Annual Review of Economics} 11 (2019): 685-725.
\item
  Stat:

  \begin{itemize}
  \item
    Specifying a target (i.e., an estimand)
  \item
    Fitting a model to data using an objective function (e.g., the sum of squared errors)
  \item
    Reporting point estimates (effect size) and standard errors (uncertainty)
  \item
    Validation by yes-no using goodness-of-fit tests and residual examination
  \end{itemize}
\item
  ML:

  \begin{itemize}
  \item
    Developing algorithms (estimating \emph{f(x)})
  \item
    Prediction power not structural/causal parameters
  \item
    Basically, high-dimensional data statistics (N \textless{} P)
  \item
    The major problem is to avoid \href{https://en.wikipedia.org/wiki/Curse_of_dimensionality}{``the curse of dimensionality''} (\href{https://towardsdatascience.com/the-curse-of-dimensionality-50dc6e49aa1e}{too many features - \textgreater{} overfitting})
  \item
    Validation: out-of-sample comparisons (cross-validation) not in-sample goodness-of-fit measures
  \item
    So, it's curve-fitting, but the primary focus is unseen data (test data), not seen data (training data)
  \end{itemize}
\item
  A quick review of ML lingo for those trained in econometrics

  \begin{itemize}
  \item
    Sample to estimate parameters = Training sample
  \item
    Estimating the model = Being trained
  \item
    Regressors, covariates, or predictors = Features
  \item
    Regression parameters = weights
  \item
    Prediction problems = Supervised (some \(y\) are known) + Unsupervised (\(y\) unknown)
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7w9.jpg}
\caption{How to teach machines. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}. Many images in this chapter come from vas3k blog.}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7vz.jpg}
\caption{The main types of machine learning. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7vx.jpg}
\caption{The map of the machine learning universe. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7w1.jpg}
\caption{Classical machine learning. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}}
\end{figure}

\hypertarget{dataset}{%
\section{Dataset}\label{dataset}}

\begin{itemize}
\item
  \href{https://archive.ics.uci.edu/ml/datasets/heart+Disease}{Heart disease data from UCI}
\item
  One of the popular datasets used in machine learning competitions
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Load packages }

\CommentTok{\#\# CRAN packages }
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(here,}
\NormalTok{               tidyverse, }
\NormalTok{               tidymodels,}
\NormalTok{               doParallel, }\CommentTok{\# parallel processing }
\NormalTok{               patchwork, }\CommentTok{\# arranging ggplots}
\NormalTok{               ck37r, }
\NormalTok{               SuperLearner, }
\NormalTok{               vip, }
\NormalTok{               tidymodels)}

\CommentTok{\#\# Jae\textquotesingle{}s custom functions }
\KeywordTok{source}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"functions"}\NormalTok{, }\StringTok{"ml\_utils.r"}\NormalTok{))}

\CommentTok{\# Import the dataset }

\NormalTok{data\_original \textless{}{-}}\StringTok{ }\KeywordTok{read\_csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"heart.csv"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 
## -- Column specification ---------------------------
## cols(
##   age = col_double(),
##   sex = col_double(),
##   cp = col_double(),
##   trestbps = col_double(),
##   chol = col_double(),
##   fbs = col_double(),
##   restecg = col_double(),
##   thalach = col_double(),
##   exang = col_double(),
##   oldpeak = col_double(),
##   slope = col_double(),
##   ca = col_double(),
##   thal = col_double(),
##   target = col_double()
## )
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glimpse}\NormalTok{(data\_original)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Rows: 303
## Columns: 14
## $ age      <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58...
## $ sex      <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0...
## $ cp       <dbl> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3...
## $ trestbps <dbl> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130...
## $ chol     <dbl> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275...
## $ fbs      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0...
## $ restecg  <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1...
## $ thalach  <dbl> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139...
## $ exang    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ oldpeak  <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2...
## $ slope    <dbl> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2...
## $ ca       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2...
## $ thal     <dbl> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ target   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a copy }
\NormalTok{data \textless{}{-}}\StringTok{ }\NormalTok{data\_original}

\KeywordTok{theme\_set}\NormalTok{(}\KeywordTok{theme\_minimal}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  For more information on the Iowa housing data, read \href{http://jse.amstat.org/v19n3/decock.pdf}{De Cock (2011)}. This is one of the famous datasets used in many prediction modeling competitions.
\end{itemize}

\hypertarget{workflow-3}{%
\section{Workflow}\label{workflow-3}}

\begin{itemize}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \tightlist
  \item
    Preprocessing
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{1}
  \tightlist
  \item
    Model building
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{2}
  \tightlist
  \item
    Model fitting
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{3}
  \tightlist
  \item
    Model evaluation
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{4}
  \tightlist
  \item
    Model tuning
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{5}
  \tightlist
  \item
    Prediction
  \end{enumerate}
\end{itemize}

\hypertarget{tidymodels}{%
\section{tidymodels}\label{tidymodels}}

\begin{itemize}
\item
  Like \texttt{tidyverse}, \texttt{tidymodels} is a collection of packages.

  \begin{itemize}
  \item
    \href{https://rsample.tidymodels.org/}{\texttt{rsample}}: for data splitting
  \item
    \href{https://recipes.tidymodels.org/index.html}{\texttt{recipes}}: for pre-processing
  \item
    \href{https://www.tidyverse.org/blog/2018/11/parsnip-0-0-1/}{\texttt{parsnip}}: for model building

    \begin{itemize}
    \tightlist
    \item
      \href{https://github.com/tidymodels/tune}{\texttt{tune}}: hyperparameter tuning
    \end{itemize}
  \item
    \href{https://github.com/tidymodels/yardstick}{\texttt{yardstick}}: for model evaluations
  \item
    \href{https://github.com/tidymodels/workflows}{\texttt{workflows}}: for building a pipeline that bundles together pre-processing, modeling, and post-processing requests
  \end{itemize}
\item
  Why take a tidyverse approach to machine learning?
\item
  Benefits

  \begin{itemize}
  \item
    Readable code
  \item
    Reusable data structures
  \item
    Extendable code
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-06-14-a-gentle-intro-to-tidymodels_files/figure-html/ds.png}
\caption{Tidymodels. From RStudio.}
\end{figure}

\begin{quote}
tidymodels are an \textbf{integrated, modular, extensible} set of packages that implement a framework that facilitates creating predicative stochastic models. - Joseph Rickert@RStudio
\end{quote}

\begin{itemize}
\item
  Currently, 238 models are \href{https://topepo.github.io/caret/available-models.html}{available}
\item
  The following materials are based on \href{https://github.com/dlab-berkeley/Machine-Learning-with-tidymodels}{the machine learning with tidymodels workshop} I developed for D-Lab. \href{https://github.com/dlab-berkeley/Machine-Learning-in-R}{The original workshop} was designed by \href{https://ck37.com/}{Chris Kennedy} and \href{https://dlab.berkeley.edu/people/evan-muzzall}{Evan Muzzall}.
\end{itemize}

\hypertarget{pre-processing}{%
\section{Pre-processing}\label{pre-processing}}

\begin{itemize}
\item
  \href{https://recipes.tidymodels.org/index.html}{\texttt{recipes}}: for pre-processing
\item
  \href{https://github.com/tidymodels/textrecipes}{\texttt{textrecipes}} for text pre-processing
\item
  Step 1: \texttt{recipe()} defines target and predictor variables (ingredients).
\item
  Step 2: \texttt{step\_*()} defines preprocessing steps to be taken (recipe).

  The list of the preprocessing steps draws on the vignette of the \href{https://www.tidymodels.org/find/parsnip/}{\texttt{parsnip}} package.

  \begin{itemize}
  \item
    dummy: Also called one-hot encoding
  \item
    zero variance: Removing columns (or features) with a single unique value
  \item
    impute: Imputing missing values
  \item
    decorrelate: Mitigating correlated predictors (e.g., principal component analysis)
  \item
    normalize: Centering and/or scaling predictors (e.g., log scaling). Scaling matters because many algorithms (e.g., lasso) are scale-variant (except tree-based algorithms). Recall that normalization (sensitive to outliers) = \(\frac{X - X_{min}}{X_{max} - X_{min}}\) and standardization (not sensitive to outliers) = \(\frac{X - \mu}{\sigma}\)
  \item
    transform: Making predictors symmetric
  \end{itemize}
\item
  Step 3: \texttt{prep()} prepares a dataset to base each step on.
\item
  Step 4: \texttt{bake()} applies the pre-processing steps to your datasets.
\end{itemize}

In this course, we focus on two preprocessing tasks.

\begin{itemize}
\tightlist
\item
  One-hot encoding (creating dummy/indicator variables)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Turn selected numeric variables into factor variables }
\NormalTok{data \textless{}{-}}\StringTok{ }\NormalTok{data }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{mutate}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"sex"}\NormalTok{, }\StringTok{"ca"}\NormalTok{, }\StringTok{"cp"}\NormalTok{, }\StringTok{"slope"}\NormalTok{, }\StringTok{"thal"}\NormalTok{), as.factor)) }

\KeywordTok{glimpse}\NormalTok{(data) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Rows: 303
## Columns: 14
## $ age      <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58...
## $ sex      <fct> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0...
## $ cp       <fct> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3...
## $ trestbps <dbl> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130...
## $ chol     <dbl> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275...
## $ fbs      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0...
## $ restecg  <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1...
## $ thalach  <dbl> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139...
## $ exang    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ oldpeak  <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2...
## $ slope    <fct> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2...
## $ ca       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2...
## $ thal     <fct> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ target   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Imputation
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Check missing values }

\KeywordTok{map\_df}\NormalTok{(data, }\OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{is.na}\NormalTok{(.) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{sum}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <int> <int> <int>    <int> <int> <int>   <int>   <int> <int>   <int> <int>
## 1     0     0     0        0     0     0       0       0     0       0     0
## # ... with 3 more variables: ca <int>, thal <int>, target <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add missing values }

\NormalTok{data}\OperatorTok{$}\NormalTok{oldpeak[}\KeywordTok{sample}\NormalTok{(}\KeywordTok{seq}\NormalTok{(data), }\DataTypeTok{size =} \DecValTok{10}\NormalTok{)] \textless{}{-}}\StringTok{ }\OtherTok{NA}

\CommentTok{\# Check missing values }

\CommentTok{\# Check the number of missing values }
\NormalTok{data }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{map\_df}\NormalTok{(}\OperatorTok{\textasciitilde{}}\KeywordTok{is.na}\NormalTok{(.) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{sum}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <int> <int> <int>    <int> <int> <int>   <int>   <int> <int>   <int> <int>
## 1     0     0     0        0     0     0       0       0     0      10     0
## # ... with 3 more variables: ca <int>, thal <int>, target <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Check the rate of missing values}
\NormalTok{data }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{map\_df}\NormalTok{(}\OperatorTok{\textasciitilde{}}\KeywordTok{is.na}\NormalTok{(.) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{mean}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
## 1     0     0     0        0     0     0       0       0     0  0.0330     0
## # ... with 3 more variables: ca <dbl>, thal <dbl>, target <dbl>
\end{verbatim}

\hypertarget{regression-setup}{%
\subsection{Regression setup}\label{regression-setup}}

\hypertarget{outcome-variable}{%
\subsubsection{Outcome variable}\label{outcome-variable}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Continuous variable }
\NormalTok{data}\OperatorTok{$}\NormalTok{age }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{class}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\hypertarget{data-splitting-using-random-sampling}{%
\subsubsection{Data splitting using random sampling}\label{data-splitting-using-random-sampling}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# for reproducibility }
\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }

\CommentTok{\# split }
\NormalTok{split\_reg \textless{}{-}}\StringTok{ }\KeywordTok{initial\_split}\NormalTok{(data, }\DataTypeTok{prop =} \FloatTok{0.7}\NormalTok{)}

\CommentTok{\# training set }
\NormalTok{raw\_train\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{training}\NormalTok{(split\_reg)}

\CommentTok{\# test set }
\NormalTok{raw\_test\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{testing}\NormalTok{(split\_reg)}
\end{Highlighting}
\end{Shaded}

\hypertarget{recipe}{%
\subsubsection{recipe}\label{recipe}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Regression recipe }
\NormalTok{rec\_reg \textless{}{-}}\StringTok{ }\NormalTok{raw\_train\_x\_reg }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Define the outcome variable }
\StringTok{  }\KeywordTok{recipe}\NormalTok{(age }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Median impute oldpeak column }
\StringTok{  }\KeywordTok{step\_medianimpute}\NormalTok{(oldpeak) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Expand "sex", "ca", "cp", "slope", and "thal" features out into dummy variables (indicators). }
\StringTok{  }\KeywordTok{step\_dummy}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"sex"}\NormalTok{, }\StringTok{"ca"}\NormalTok{, }\StringTok{"cp"}\NormalTok{, }\StringTok{"slope"}\NormalTok{, }\StringTok{"thal"}\NormalTok{))}

\CommentTok{\# Prepare a dataset to base each step on}
\NormalTok{prep\_reg \textless{}{-}}\StringTok{ }\NormalTok{rec\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{prep}\NormalTok{(}\DataTypeTok{retain =} \OtherTok{TRUE}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# x features }
\NormalTok{train\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_reg, }\KeywordTok{all\_predictors}\NormalTok{())}

\NormalTok{test\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(}\DataTypeTok{object =}\NormalTok{ prep\_reg, }
                   \DataTypeTok{new\_data =}\NormalTok{ raw\_test\_x\_reg, }\KeywordTok{all\_predictors}\NormalTok{())}

\CommentTok{\# y variables }
\NormalTok{train\_y\_reg \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_reg, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{age }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.numeric}\NormalTok{()}
\NormalTok{test\_y\_reg \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(prep\_reg, raw\_test\_x\_reg, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{age }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.numeric}\NormalTok{()}

\CommentTok{\# Checks}
\KeywordTok{names}\NormalTok{(train\_x\_reg) }\CommentTok{\# Make sure there\textquotesingle{}s no age variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "trestbps" "chol"     "fbs"      "restecg"  "thalach"  "exang"   
##  [7] "oldpeak"  "target"   "sex_X1"   "ca_X1"    "ca_X2"    "ca_X3"   
## [13] "ca_X4"    "cp_X1"    "cp_X2"    "cp_X3"    "slope_X1" "slope_X2"
## [19] "thal_X1"  "thal_X2"  "thal_X3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(train\_y\_reg) }\CommentTok{\# Make sure this is a continuous variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note that other imputation methods are also available.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{grep}\NormalTok{(}\StringTok{"impute"}\NormalTok{, }\KeywordTok{ls}\NormalTok{(}\StringTok{"package:recipes"}\NormalTok{), }\DataTypeTok{value =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "step_bagimpute"          "step_impute_linear"     
##  [3] "step_knnimpute"          "step_lowerimpute"       
##  [5] "step_meanimpute"         "step_medianimpute"      
##  [7] "step_modeimpute"         "step_rollimpute"        
##  [9] "tunable.step_bagimpute"  "tunable.step_knnimpute" 
## [11] "tunable.step_meanimpute" "tunable.step_rollimpute"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  You can also create your own \texttt{step\_} functions. For more information, see \href{https://www.tidymodels.org/learn/develop/recipes/}{tidymodels.org}.
\end{itemize}

\hypertarget{classification-setup}{%
\subsection{Classification setup}\label{classification-setup}}

\hypertarget{outcome-variable-1}{%
\subsubsection{Outcome variable}\label{outcome-variable-1}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{class}\NormalTok{() }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data}\OperatorTok{$}\NormalTok{target \textless{}{-}}\StringTok{ }\KeywordTok{as.factor}\NormalTok{(data}\OperatorTok{$}\NormalTok{target)}

\NormalTok{data}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{class}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "factor"
\end{verbatim}

\hypertarget{data-splitting-using-stratified-random-sampling}{%
\subsubsection{Data splitting using stratified random sampling}\label{data-splitting-using-stratified-random-sampling}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# split }
\NormalTok{split\_class \textless{}{-}}\StringTok{ }\KeywordTok{initial\_split}\NormalTok{(data }\OperatorTok{\%\textgreater{}\%}
\StringTok{                             }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{target =} \KeywordTok{as.factor}\NormalTok{(target)), }
                             \DataTypeTok{prop =} \FloatTok{0.7}\NormalTok{, }
                             \DataTypeTok{strata =}\NormalTok{ target)}

\CommentTok{\# training set }
\NormalTok{raw\_train\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{training}\NormalTok{(split\_class)}

\CommentTok{\# testing set }
\NormalTok{raw\_test\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{testing}\NormalTok{(split\_class)}
\end{Highlighting}
\end{Shaded}

\hypertarget{recipe-1}{%
\subsubsection{recipe}\label{recipe-1}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Classification recipe }
\NormalTok{rec\_class \textless{}{-}}\StringTok{ }\NormalTok{raw\_train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\CommentTok{\# Define the outcome variable }
\StringTok{  }\KeywordTok{recipe}\NormalTok{(target }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Median impute oldpeak column }
\StringTok{  }\KeywordTok{step\_medianimpute}\NormalTok{(oldpeak) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Expand "sex", "ca", "cp", "slope", and "thal" features out into dummy variables (indicators).}
\StringTok{  }\KeywordTok{step\_normalize}\NormalTok{(age) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{step\_dummy}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"sex"}\NormalTok{, }\StringTok{"ca"}\NormalTok{, }\StringTok{"cp"}\NormalTok{, }\StringTok{"slope"}\NormalTok{, }\StringTok{"thal"}\NormalTok{)) }

\CommentTok{\# Prepare a dataset to base each step on}
\NormalTok{prep\_class \textless{}{-}}\StringTok{ }\NormalTok{rec\_class }\OperatorTok{\%\textgreater{}\%}\KeywordTok{prep}\NormalTok{(}\DataTypeTok{retain =} \OtherTok{TRUE}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# x features }
\NormalTok{train\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_class, }\KeywordTok{all\_predictors}\NormalTok{()) }
\NormalTok{test\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(prep\_class, raw\_test\_x\_class, }\KeywordTok{all\_predictors}\NormalTok{())}

\CommentTok{\# y variables }
\NormalTok{train\_y\_class \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_class, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.factor}\NormalTok{()}
\NormalTok{test\_y\_class \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(prep\_class, raw\_test\_x\_class, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.factor}\NormalTok{()}

\CommentTok{\# Checks }
\KeywordTok{names}\NormalTok{(train\_x\_class) }\CommentTok{\# Make sure there\textquotesingle{}s no target variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "age"      "trestbps" "chol"     "fbs"      "restecg"  "thalach" 
##  [7] "exang"    "oldpeak"  "sex_X1"   "ca_X1"    "ca_X2"    "ca_X3"   
## [13] "ca_X4"    "cp_X1"    "cp_X2"    "cp_X3"    "slope_X1" "slope_X2"
## [19] "thal_X1"  "thal_X2"  "thal_X3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(train\_y\_class) }\CommentTok{\# Make sure this is a factor variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "factor"
\end{verbatim}

\hypertarget{supervised-learning}{%
\section{Supervised learning}\label{supervised-learning}}

\(x \rightarrow f \rightarrow y\) (defined)

\hypertarget{ols-and-lasso}{%
\subsection{OLS and Lasso}\label{ols-and-lasso}}

\hypertarget{parsnip}{%
\subsubsection{parsnip}\label{parsnip}}

\begin{itemize}
\tightlist
\item
  Build models (\texttt{parsnip})
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# OLS spec }
\NormalTok{ols\_spec \textless{}{-}}\StringTok{ }\KeywordTok{linear\_reg}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Specify a model }
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"lm"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Specify an engine: lm, glmnet, stan, keras, spark }
\StringTok{  }\KeywordTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{) }\CommentTok{\# Declare a mode: regression or classification }

\CommentTok{\# Lasso spec }
\NormalTok{lasso\_spec \textless{}{-}}\StringTok{ }\KeywordTok{linear\_reg}\NormalTok{(}\DataTypeTok{penalty =} \FloatTok{0.1}\NormalTok{, }\CommentTok{\# tuning hyperparameter }
                         \DataTypeTok{mixture =} \DecValTok{1}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# 1 = lasso, 0 = ridge }
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{) }

\CommentTok{\# If you don\textquotesingle{}t understand parsnip arguments }
\NormalTok{lasso\_spec }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{translate}\NormalTok{() }\CommentTok{\# See the documentation}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 0.1
##   mixture = 1
## 
## Computational engine: glmnet 
## 
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
##     alpha = 1, family = "gaussian")
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Fit models
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ols\_fit \textless{}{-}}\StringTok{ }\NormalTok{ols\_spec }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit\_xy}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ train\_x\_reg, }\DataTypeTok{y=}\NormalTok{ train\_y\_reg) }
  \CommentTok{\# fit(train\_y\_reg \textasciitilde{} ., train\_x\_reg) \# When your data are not preprocessed }

\NormalTok{lasso\_fit \textless{}{-}}\StringTok{ }\NormalTok{lasso\_spec }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit\_xy}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ train\_x\_reg, }\DataTypeTok{y=}\NormalTok{ train\_y\_reg) }
\end{Highlighting}
\end{Shaded}

\hypertarget{yardstick}{%
\subsubsection{yardstick}\label{yardstick}}

\begin{itemize}
\tightlist
\item
  Visualize model fits
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map2}\NormalTok{(}\KeywordTok{list}\NormalTok{(ols\_fit, lasso\_fit), }\KeywordTok{c}\NormalTok{(}\StringTok{"OLS"}\NormalTok{, }\StringTok{"Lasso"}\NormalTok{), visualize\_fit) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
\end{verbatim}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-16-1.pdf}

\begin{verbatim}
## 
## [[2]]
\end{verbatim}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-16-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define performance metrics }
\NormalTok{metrics \textless{}{-}}\StringTok{ }\NormalTok{yardstick}\OperatorTok{::}\KeywordTok{metric\_set}\NormalTok{(rmse, mae, rsq)}

\CommentTok{\# Evaluate many models }
\NormalTok{evals \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map}\NormalTok{(}\KeywordTok{list}\NormalTok{(ols\_fit, lasso\_fit), evaluate\_reg) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{reduce}\NormalTok{(bind\_rows) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{type =} \KeywordTok{rep}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"OLS"}\NormalTok{, }\StringTok{"Lasso"}\NormalTok{), }\DataTypeTok{each =} \DecValTok{3}\NormalTok{))}

\CommentTok{\# Visualize the test results }
\NormalTok{evals }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(type, .estimate), }\DataTypeTok{y =}\NormalTok{ .estimate)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Model"}\NormalTok{,}
         \DataTypeTok{y =} \StringTok{"Estimate"}\NormalTok{) }\OperatorTok{+}
\StringTok{    }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\KeywordTok{glue}\NormalTok{(}\StringTok{"\{toupper(.metric)\}"}\NormalTok{), }\DataTypeTok{scales =} \StringTok{"free\_y"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-17-1.pdf}

\begin{itemize}
\tightlist
\item
  For more information, read \href{https://www.tmwr.org/}{Tidy Modeling with R} by Max Kuhn and Julia Silge.
\end{itemize}

\hypertarget{tune}{%
\subsubsection{tune}\label{tune}}

\textbf{Hyper}parameters are parameters which control the learning process.

\hypertarget{tune-ingredients}{%
\paragraph{tune ingredients}\label{tune-ingredients}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# tune() = placeholder }

\NormalTok{tune\_spec \textless{}{-}}\StringTok{ }\KeywordTok{linear\_reg}\NormalTok{(}\DataTypeTok{penalty =} \KeywordTok{tune}\NormalTok{(), }\CommentTok{\# tuning hyperparameter }
                        \DataTypeTok{mixture =} \DecValTok{1}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# 1 = lasso, 0 = ridge }
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{) }

\NormalTok{tune\_spec}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# penalty() searches 50 possible combinations }

\NormalTok{lambda\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_regular}\NormalTok{(}\KeywordTok{penalty}\NormalTok{(), }\DataTypeTok{levels =} \DecValTok{50}\NormalTok{)}

\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility }

\NormalTok{rec\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{age =}\NormalTok{ train\_y\_reg)))}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Workflow }
\NormalTok{rec\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_model}\NormalTok{(tune\_spec) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_formula}\NormalTok{(age}\OperatorTok{\textasciitilde{}}\NormalTok{.)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Tuning results }
\NormalTok{rec\_res \textless{}{-}}\StringTok{ }\NormalTok{rec\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ rec\_folds, }
    \DataTypeTok{grid =}\NormalTok{ lambda\_grid}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize}{%
\paragraph{Visualize}\label{visualize}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Visualize}

\NormalTok{rec\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(penalty, mean, }\DataTypeTok{col =}\NormalTok{ .metric)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_errorbar}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{ymin =}\NormalTok{ mean }\OperatorTok{{-}}\StringTok{ }\NormalTok{std\_err,}
    \DataTypeTok{ymax =}\NormalTok{ mean }\OperatorTok{+}\StringTok{ }\NormalTok{std\_err}
\NormalTok{  ),}
  \DataTypeTok{alpha =} \FloatTok{0.3}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_line}\NormalTok{(}\DataTypeTok{size =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"log(lambda)"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\KeywordTok{glue}\NormalTok{(}\StringTok{"\{toupper(.metric)\}"}\NormalTok{), }
             \DataTypeTok{scales =} \StringTok{"free"}\NormalTok{,}
             \DataTypeTok{nrow =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{legend.position =} \StringTok{"none"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-21-1.pdf}

\hypertarget{select}{%
\paragraph{Select}\label{select}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{top\_rmse \textless{}{-}}\StringTok{ }\KeywordTok{show\_best}\NormalTok{(rec\_res, }\DataTypeTok{metric =} \StringTok{"rmse"}\NormalTok{)}

\NormalTok{best\_rmse \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(rec\_res, }\DataTypeTok{metric =} \StringTok{"rmse"}\NormalTok{)}

\NormalTok{best\_rmse }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 2
##   penalty .config
##     <dbl> <chr>  
## 1   0.153 Model46
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{\textquotesingle{}The RMSE of the initial model is }
\StringTok{     \{evals \%\textgreater{}\%}
\StringTok{  filter(type == "Lasso", .metric == "rmse") \%\textgreater{}\%}
\StringTok{  select(.estimate) \%\textgreater{}\%}
\StringTok{  round(2)\}\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## The RMSE of the initial model is 
##    7.87
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{\textquotesingle{}The RMSE of the tuned model is \{rec\_res \%\textgreater{}\%}
\StringTok{  collect\_metrics() \%\textgreater{}\%}
\StringTok{  filter(.metric == "rmse") \%\textgreater{}\%}
\StringTok{  arrange(mean) \%\textgreater{}\%}
\StringTok{  dplyr::slice(1) \%\textgreater{}\%}
\StringTok{  select(mean) \%\textgreater{}\%}
\StringTok{  round(2)\}\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## The RMSE of the tuned model is 7.71
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Finalize your workflow and visualize \href{https://koalaverse.github.io/vip/articles/vip.html}{variable importance}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{finalize\_lasso \textless{}{-}}\StringTok{ }\NormalTok{rec\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_rmse)}

\NormalTok{finalize\_lasso }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{age =}\NormalTok{ train\_y\_reg))) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-23-1.pdf}

\hypertarget{test-fit}{%
\paragraph{Test fit}\label{test-fit}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_lasso }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{age =}\NormalTok{ test\_y\_reg)))}

\KeywordTok{evaluate\_reg}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       7.11 
## 2 mae     standard       5.86 
## 3 rsq     standard       0.410
\end{verbatim}

\hypertarget{decision-tree}{%
\subsection{Decision tree}\label{decision-tree}}

\hypertarget{parsnip-1}{%
\subsubsection{parsnip}\label{parsnip-1}}

\begin{itemize}
\tightlist
\item
  Build a model
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# workflow }
\NormalTok{tree\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_formula}\NormalTok{(target}\OperatorTok{\textasciitilde{}}\NormalTok{.)}

\CommentTok{\# spec }
\NormalTok{tree\_spec \textless{}{-}}\StringTok{ }\KeywordTok{decision\_tree}\NormalTok{(}
  
           \CommentTok{\# Mode }
           \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}
           
           \CommentTok{\# Tuning hyperparameters}
           \DataTypeTok{cost\_complexity =} \OtherTok{NULL}\NormalTok{, }
           \DataTypeTok{tree\_depth =} \OtherTok{NULL}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"rpart"}\NormalTok{) }\CommentTok{\# rpart, c5.0, spark}

\NormalTok{tree\_wf \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_model}\NormalTok{(tree\_spec)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Fit a model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\hypertarget{yardstick-1}{%
\subsubsection{yardstick}\label{yardstick-1}}

\begin{itemize}
\tightlist
\item
  Let's formally test prediction performance.
\end{itemize}

\textbf{Metrics}

\begin{itemize}
\item
  \texttt{accuracy}: The proportion of the data predicted correctly
\item
  \texttt{precision}: Positive predictive value
\item
  \texttt{recall} (sensitivity): True positive rate (e.g., the share of truly healthy people who are predicted to be healthy, when ``healthy'' is the positive class)
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Precisionrecall.svg/525px-Precisionrecall.svg.png}
\caption{From Wikipedia}
\end{figure}

\begin{itemize}
\tightlist
\item
  To learn more about other metrics, check out the yardstick package \href{https://yardstick.tidymodels.org/reference/index.html}{references}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define performance metrics }

\NormalTok{metrics \textless{}{-}}\StringTok{ }\NormalTok{yardstick}\OperatorTok{::}\KeywordTok{metric\_set}\NormalTok{(accuracy, precision, recall)}

\CommentTok{\# Visualize}

\NormalTok{tree\_fit\_viz\_metr \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_eval}\NormalTok{(tree\_fit)}

\NormalTok{tree\_fit\_viz\_metr}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-27-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit\_viz\_mat \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_conf}\NormalTok{(tree\_fit)}

\NormalTok{tree\_fit\_viz\_mat}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-27-2.pdf}

\hypertarget{tune-1}{%
\subsubsection{tune}\label{tune-1}}

\hypertarget{tune-ingredients-1}{%
\paragraph{tune ingredients}\label{tune-ingredients-1}}

Decision trees tend to overfit. Broadly speaking, there are two things we need to consider to reduce this problem: how to split and when to stop a tree.

\begin{itemize}
\item
  \textbf{complexity parameter}: a high CP means a simple decision tree with few splits.
\item
  \textbf{tree\_depth}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tune\_spec \textless{}{-}}\StringTok{ }
\StringTok{  }\KeywordTok{decision\_tree}\NormalTok{(}
    \DataTypeTok{cost\_complexity =} \KeywordTok{tune}\NormalTok{(), }\CommentTok{\# how to split }
    \DataTypeTok{tree\_depth =} \KeywordTok{tune}\NormalTok{(), }\CommentTok{\# when to stop }
    \DataTypeTok{mode =} \StringTok{"classification"}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"rpart"}\NormalTok{)}

\NormalTok{tree\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_regular}\NormalTok{(}\KeywordTok{cost\_complexity}\NormalTok{(),}
                          \KeywordTok{tree\_depth}\NormalTok{(),}
                          \DataTypeTok{levels =} \DecValTok{5}\NormalTok{) }\CommentTok{\# 2 hyperparameters {-}\textgreater{} 5*5 = 25 combinations }

\NormalTok{tree\_grid }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(tree\_depth)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   tree_depth     n
##        <int> <int>
## 1          1     5
## 2          4     5
## 3          8     5
## 4         11     5
## 5         15     5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility }

\NormalTok{tree\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)),}
                       \DataTypeTok{strata =}\NormalTok{ target)}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow-1}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow-1}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Update workflow }
\NormalTok{tree\_wf \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{update\_model}\NormalTok{(tune\_spec)}

\CommentTok{\# Determine the number of cores}
\NormalTok{no\_cores \textless{}{-}}\StringTok{ }\KeywordTok{detectCores}\NormalTok{() }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}

\CommentTok{\# Initiate}
\NormalTok{cl \textless{}{-}}\StringTok{ }\KeywordTok{makeCluster}\NormalTok{(no\_cores)}

\KeywordTok{registerDoParallel}\NormalTok{(cl)}

\CommentTok{\# Tuning results }
\NormalTok{tree\_res \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ tree\_folds, }
    \DataTypeTok{grid =}\NormalTok{ tree\_grid,}
    \DataTypeTok{metrics =}\NormalTok{ metrics}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize-1}{%
\paragraph{Visualize}\label{visualize-1}}

\begin{itemize}
\tightlist
\item
  The following plot draws on the \href{https://www.tidymodels.org/start/tuning/}{vignette} of the tidymodels package.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{tree\_depth =} \KeywordTok{factor}\NormalTok{(tree\_depth)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(cost\_complexity, mean, }\DataTypeTok{col =}\NormalTok{ .metric)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{size =} \DecValTok{3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Subplots }
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{tree\_depth, }
             \DataTypeTok{scales =} \StringTok{"free"}\NormalTok{, }
             \DataTypeTok{nrow =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Log scale x }
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\KeywordTok{label\_number}\NormalTok{()) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Discrete color scale }
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{(}\DataTypeTok{option =} \StringTok{"plasma"}\NormalTok{, }\DataTypeTok{begin =} \FloatTok{.9}\NormalTok{, }\DataTypeTok{end =} \DecValTok{0}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Cost complexity"}\NormalTok{,}
       \DataTypeTok{col =} \StringTok{"Tree depth"}\NormalTok{,}
       \DataTypeTok{y =} \OtherTok{NULL}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-30-1.pdf}

\hypertarget{select-1}{%
\paragraph{Select}\label{select-1}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Optimal hyperparameter}
\NormalTok{best\_tree \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(tree\_res, }\StringTok{"recall"}\NormalTok{)}

\CommentTok{\# Add the hyperparameter to the workflow }
\NormalTok{finalize\_tree \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_tree)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit\_tuned \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}

\CommentTok{\# Metrics }
\NormalTok{(tree\_fit\_viz\_metr }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_eval}\NormalTok{(tree\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-32-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Confusion matrix }
\NormalTok{(tree\_fit\_viz\_mat }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_conf}\NormalTok{(tree\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-32-2.pdf}

\begin{itemize}
\tightlist
\item
  Visualize variable importance
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit\_tuned }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-33-1.pdf}

\hypertarget{test-fit-1}{%
\paragraph{Test fit}\label{test-fit-1}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ test\_y\_class)))}

\KeywordTok{evaluate\_class}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.733
## 2 precision binary         0.689
## 3 recall    binary         0.756
\end{verbatim}

In the next subsection, we will learn about variants of ensemble models that improve on a single decision tree by combining multiple models.

\hypertarget{bagging-random-forest}{%
\subsection{Bagging (Random forest)}\label{bagging-random-forest}}

Key idea applied across all ensemble models (bagging, boosting, and stacking):
single learner -\textgreater{} N learners (N \textgreater{} 1)

Many learners could perform better than a single learner as this approach reduces the \textbf{variance} of a single estimate and provides more stability.

Here we focus on the difference between bagging and boosting. In short, boosting may reduce bias while increasing variance. Bagging may reduce variance but has nothing to do with bias. For more information, please check out \href{https://quantdare.com/what-is-the-difference-between-bagging-and-boosting/}{What is the difference between Bagging and Boosting?} by aporras.

\textbf{bagging}

\begin{itemize}
\item
  Data: Training data will be randomly sampled with replacement (bootstrapping samples + drawing random \textbf{subsets} of features for training individual trees)
\item
  Learning: Building models in parallel (independently)
\item
  Prediction: Simple average of the estimated responses (majority vote system)
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://sebastianraschka.com/images/faq/bagging-boosting-rf/bagging.png}
\caption{From Sebastian Raschka's blog}
\end{figure}

\textbf{boosting}

\begin{itemize}
\item
  Data: Weighted training data will be randomly sampled
\item
  Learning: Building models sequentially (mispredicted cases receive more weight)
\item
  Prediction: Weighted average of the estimated responses
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://sebastianraschka.com/images/faq/bagging-boosting-rf/boosting.png}
\caption{From Sebastian Raschka's blog}
\end{figure}

\hypertarget{parsnip-2}{%
\subsubsection{parsnip}\label{parsnip-2}}

\begin{itemize}
\tightlist
\item
  Build a model
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# workflow }
\NormalTok{rand\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_formula}\NormalTok{(target}\OperatorTok{\textasciitilde{}}\NormalTok{.)}

\CommentTok{\# spec }
\NormalTok{rand\_spec \textless{}{-}}\StringTok{ }\KeywordTok{rand\_forest}\NormalTok{(}
  
           \CommentTok{\# Mode }
           \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}
           
           \CommentTok{\# Tuning hyperparameters}
           \DataTypeTok{mtry =} \OtherTok{NULL}\NormalTok{, }\CommentTok{\# The number of predictors to available for splitting at each node  }
           \DataTypeTok{min\_n =} \OtherTok{NULL}\NormalTok{, }\CommentTok{\# The minimum number of data points needed to keep splitting nodes}
           \DataTypeTok{trees =} \DecValTok{500}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# The number of trees}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }
             \CommentTok{\# We want the importance of predictors to be assessed.}
             \DataTypeTok{seed =} \DecValTok{1234}\NormalTok{, }
             \DataTypeTok{importance =} \StringTok{"permutation"}\NormalTok{) }

\NormalTok{rand\_wf \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_model}\NormalTok{(rand\_spec)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Fit a model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\hypertarget{yardstick-2}{%
\subsubsection{yardstick}\label{yardstick-2}}

\begin{itemize}
\tightlist
\item
  Let's formally test prediction performance.
\end{itemize}

\textbf{Metrics}

\begin{itemize}
\item
  \texttt{accuracy}: The proportion of the data predicted correctly
\item
  \texttt{precision}: Positive predictive value
\item
  \texttt{recall} (sensitivity): True positive rate (e.g., the share of actually sick people who are correctly identified as sick)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define performance metrics }
\NormalTok{metrics \textless{}{-}}\StringTok{ }\NormalTok{yardstick}\OperatorTok{::}\KeywordTok{metric\_set}\NormalTok{(accuracy, precision, recall)}

\NormalTok{rand\_fit\_viz\_metr \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_eval}\NormalTok{(rand\_fit)}

\NormalTok{rand\_fit\_viz\_metr}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-37-1.pdf}

\begin{itemize}
\tightlist
\item
  Visualize the confusion matrix.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit\_viz\_mat \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_conf}\NormalTok{(rand\_fit)}

\NormalTok{rand\_fit\_viz\_mat}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-38-1.pdf}

\hypertarget{tune-2}{%
\subsubsection{tune}\label{tune-2}}

\hypertarget{tune-ingredients-2}{%
\paragraph{tune ingredients}\label{tune-ingredients-2}}

We focus on the following two hyperparameters:

\begin{itemize}
\item
  \texttt{mtry}: The number of predictors available for splitting at each node.
\item
  \texttt{min\_n}: The minimum number of data points needed to keep splitting nodes.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tune\_spec \textless{}{-}}\StringTok{ }
\StringTok{  }\KeywordTok{rand\_forest}\NormalTok{(}
           \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}
           
           \CommentTok{\# Tuning hyperparameters}
           \DataTypeTok{mtry =} \KeywordTok{tune}\NormalTok{(), }
           \DataTypeTok{min\_n =} \KeywordTok{tune}\NormalTok{()) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{,}
             \DataTypeTok{seed =} \DecValTok{1234}\NormalTok{, }
             \DataTypeTok{importance =} \StringTok{"permutation"}\NormalTok{)}

\NormalTok{rand\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_regular}\NormalTok{(}\KeywordTok{mtry}\NormalTok{(}\DataTypeTok{range =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{)),}
                          \KeywordTok{min\_n}\NormalTok{(}\DataTypeTok{range =} \KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\DecValTok{10}\NormalTok{)),}
                          \DataTypeTok{levels =} \DecValTok{5}\NormalTok{)}

\NormalTok{rand\_grid }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(min\_n)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   min_n     n
##   <int> <int>
## 1     2     5
## 2     4     5
## 3     6     5
## 4     8     5
## 5    10     5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility }

\NormalTok{rand\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)),}
                       \DataTypeTok{strata =}\NormalTok{ target)}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow-2}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow-2}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Update workflow }
\NormalTok{rand\_wf \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{update\_model}\NormalTok{(tune\_spec)}

\CommentTok{\# Tuning results }
\NormalTok{rand\_res \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ rand\_folds, }
    \DataTypeTok{grid =}\NormalTok{ rand\_grid,}
    \DataTypeTok{metrics =}\NormalTok{ metrics}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize-2}{%
\paragraph{Visualize}\label{visualize-2}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{min\_n =} \KeywordTok{factor}\NormalTok{(min\_n)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(mtry, mean, }\DataTypeTok{color =}\NormalTok{ min\_n)) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Line + Point plot }
\StringTok{  }\KeywordTok{geom\_line}\NormalTok{(}\DataTypeTok{size =} \FloatTok{1.5}\NormalTok{, }\DataTypeTok{alpha =} \FloatTok{0.6}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{size =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Subplots }
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.metric, }
             \DataTypeTok{scales =} \StringTok{"free"}\NormalTok{, }
             \DataTypeTok{nrow =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Log scale x }
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\KeywordTok{label\_number}\NormalTok{()) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Discrete color scale }
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{(}\DataTypeTok{option =} \StringTok{"plasma"}\NormalTok{, }\DataTypeTok{begin =} \FloatTok{.9}\NormalTok{, }\DataTypeTok{end =} \DecValTok{0}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"The number of predictors to be sampled"}\NormalTok{,}
       \DataTypeTok{col =} \StringTok{"The minimum number of data points needed for splitting"}\NormalTok{,}
       \DataTypeTok{y =} \OtherTok{NULL}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{legend.position=}\StringTok{"bottom"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-42-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Optimal hyperparameter}
\NormalTok{best\_tree \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(rand\_res, }\StringTok{"accuracy"}\NormalTok{)}

\NormalTok{best\_tree}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 3
##    mtry min_n .config
##   <int> <int> <chr>  
## 1     1    10 Model21
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add the hyperparameter to the workflow }
\NormalTok{finalize\_tree \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_tree)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit\_tuned \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}

\CommentTok{\# Metrics }
\NormalTok{(rand\_fit\_viz\_metr }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_eval}\NormalTok{(rand\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-44-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Confusion matrix }
\NormalTok{(rand\_fit\_viz\_mat }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_conf}\NormalTok{(rand\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-44-2.pdf}

\begin{itemize}
\tightlist
\item
  Visualize variable importance
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit\_tuned }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-45-1.pdf}

\hypertarget{test-fit-2}{%
\paragraph{Test fit}\label{test-fit-2}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ test\_y\_class)))}

\KeywordTok{evaluate\_class}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.922
## 2 precision binary         0.972
## 3 recall    binary         0.854
\end{verbatim}

\hypertarget{boosting-xgboost}{%
\subsection{Boosting (XGBoost)}\label{boosting-xgboost}}

\hypertarget{parsnip-3}{%
\subsubsection{parsnip}\label{parsnip-3}}

\begin{itemize}
\tightlist
\item
  Build a model
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# workflow }
\NormalTok{xg\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_formula}\NormalTok{(target}\OperatorTok{\textasciitilde{}}\NormalTok{.)}

\CommentTok{\# spec }
\NormalTok{xg\_spec \textless{}{-}}\StringTok{ }\KeywordTok{boost\_tree}\NormalTok{(}
  
           \CommentTok{\# Mode }
           \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}
           
           \CommentTok{\# Tuning hyperparameters}
           
           \CommentTok{\# The number of trees to fit, aka boosting iterations}
           \DataTypeTok{trees =} \KeywordTok{c}\NormalTok{(}\DecValTok{100}\NormalTok{, }\DecValTok{300}\NormalTok{, }\DecValTok{500}\NormalTok{, }\DecValTok{700}\NormalTok{, }\DecValTok{900}\NormalTok{),}
           \CommentTok{\# The depth of the decision tree (how many levels of splits).}
             \DataTypeTok{tree\_depth =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{6}\NormalTok{), }
           \CommentTok{\# Learning rate: lower means the ensemble will adapt more slowly.}
           \DataTypeTok{learn\_rate =} \KeywordTok{c}\NormalTok{(}\FloatTok{0.0001}\NormalTok{, }\FloatTok{0.01}\NormalTok{, }\FloatTok{0.2}\NormalTok{),}
           \CommentTok{\# Stop splitting a tree if we only have this many obs in a tree node.}
             \DataTypeTok{min\_n =}\NormalTok{ 10L}
\NormalTok{          ) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"xgboost"}\NormalTok{) }

\NormalTok{xg\_wf \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_model}\NormalTok{(xg\_spec)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Fit a model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in begin_iteration:end_iteration: numerical expression has 5 elements:
## only the first used
\end{verbatim}

\hypertarget{yardstick-3}{%
\subsubsection{yardstick}\label{yardstick-3}}

\begin{itemize}
\tightlist
\item
  Let's formally test prediction performance.
\end{itemize}

\textbf{Metrics}

\begin{itemize}
\item
  \texttt{accuracy}: The proportion of the data predicted correctly
\item
  \texttt{precision}: Positive predictive value
\item
  \texttt{recall} (sensitivity): True positive rate (e.g., the share of actually sick people who are correctly identified as sick)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{metrics \textless{}{-}}\StringTok{ }\KeywordTok{metric\_set}\NormalTok{(yardstick}\OperatorTok{::}\NormalTok{accuracy, }
\NormalTok{                      yardstick}\OperatorTok{::}\NormalTok{precision, }
\NormalTok{                      yardstick}\OperatorTok{::}\NormalTok{recall)}

\KeywordTok{evaluate\_class}\NormalTok{(xg\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.733
## 2 precision binary         0.730
## 3 recall    binary         0.659
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_viz\_metr \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_eval}\NormalTok{(xg\_fit)}

\NormalTok{xg\_fit\_viz\_metr}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-50-1.pdf}

\begin{itemize}
\tightlist
\item
  Visualize the confusion matrix.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_viz\_mat \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_conf}\NormalTok{(xg\_fit)}

\NormalTok{xg\_fit\_viz\_mat}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-51-1.pdf}

\hypertarget{tune-3}{%
\subsubsection{tune}\label{tune-3}}

\hypertarget{tune-ingredients-3}{%
\paragraph{tune ingredients}\label{tune-ingredients-3}}

\begin{itemize}
\tightlist
\item
  We focus on the following hyperparameters: \texttt{trees}, \texttt{tree\_depth}, \texttt{learn\_rate}, \texttt{min\_n}, \texttt{mtry}, \texttt{loss\_reduction}, and \texttt{sample\_size}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tune\_spec \textless{}{-}}\StringTok{ }
\StringTok{  }\NormalTok{xg\_spec \textless{}{-}}\StringTok{ }\KeywordTok{boost\_tree}\NormalTok{(}
  
           \CommentTok{\# Mode }
           \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}
           
           \CommentTok{\# Tuning hyperparameters}
           
           \CommentTok{\# The number of trees to fit, aka boosting iterations}
           \DataTypeTok{trees =} \KeywordTok{tune}\NormalTok{(),}
           \CommentTok{\# The depth of the decision tree (how many levels of splits).}
             \DataTypeTok{tree\_depth =} \KeywordTok{tune}\NormalTok{(), }
           \CommentTok{\# Learning rate: lower means the ensemble will adapt more slowly.}
           \DataTypeTok{learn\_rate =} \KeywordTok{tune}\NormalTok{(),}
           \CommentTok{\# Stop splitting a tree if we only have this many obs in a tree node.}
             \DataTypeTok{min\_n =} \KeywordTok{tune}\NormalTok{(),}
           \DataTypeTok{loss\_reduction =} \KeywordTok{tune}\NormalTok{(),}
           \CommentTok{\# The number of randomly selected hyperparameters }
           \DataTypeTok{mtry =} \KeywordTok{tune}\NormalTok{(), }
           \CommentTok{\# The size of the data set used for modeling within an iteration}
           \DataTypeTok{sample\_size =} \KeywordTok{tune}\NormalTok{()}
\NormalTok{          ) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"xgboost"}\NormalTok{) }

\CommentTok{\# Space{-}filling hyperparameter grids }
\NormalTok{xg\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_latin\_hypercube}\NormalTok{(}
  \KeywordTok{trees}\NormalTok{(),}
  \KeywordTok{tree\_depth}\NormalTok{(),}
  \KeywordTok{learn\_rate}\NormalTok{(),}
  \KeywordTok{min\_n}\NormalTok{(),}
  \KeywordTok{loss\_reduction}\NormalTok{(), }
  \DataTypeTok{sample\_size =} \KeywordTok{sample\_prop}\NormalTok{(),}
  \KeywordTok{finalize}\NormalTok{(}\KeywordTok{mtry}\NormalTok{(), train\_x\_class),}
  \DataTypeTok{size =} \DecValTok{30}
\NormalTok{  )}

\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility }

\NormalTok{xg\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)),}
                     \DataTypeTok{strata =}\NormalTok{ target)}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow-3}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow-3}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Update workflow }
\NormalTok{xg\_wf \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{update\_model}\NormalTok{(tune\_spec)}

\CommentTok{\# Tuning results }
\NormalTok{xg\_res \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ xg\_folds, }
    \DataTypeTok{grid =}\NormalTok{ xg\_grid,}
    \DataTypeTok{control =} \KeywordTok{control\_grid}\NormalTok{(}\DataTypeTok{save\_pred =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize-3}{%
\paragraph{Visualize}\label{visualize-3}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{filter}\NormalTok{(.metric }\OperatorTok{==}\StringTok{ "roc\_auc"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(mtry}\OperatorTok{:}\NormalTok{sample\_size,}
               \DataTypeTok{values\_to =} \StringTok{"value"}\NormalTok{,}
               \DataTypeTok{names\_to =} \StringTok{"parameter"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ value, }\DataTypeTok{y =}\NormalTok{ mean, }\DataTypeTok{color =}\NormalTok{ parameter)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.8}\NormalTok{, }\DataTypeTok{show.legend =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{    }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{parameter, }\DataTypeTok{scales =} \StringTok{"free\_x"}\NormalTok{) }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{"AUC"}\NormalTok{,}
         \DataTypeTok{x =} \OtherTok{NULL}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-54-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Optimal hyperparameter}
\NormalTok{best\_xg \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(xg\_res, }\StringTok{"roc\_auc"}\NormalTok{)}

\NormalTok{best\_xg }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 8
##    mtry trees min_n tree_depth  learn_rate loss_reduction sample_size .config
##   <int> <int> <int>      <int>       <dbl>          <dbl>       <dbl> <chr>  
## 1     6    98     4         13 0.000000211  0.00000000336       0.422 Model26
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add the hyperparameter to the workflow }
\NormalTok{finalize\_xg \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_xg)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_tuned \textless{}{-}}\StringTok{ }\NormalTok{finalize\_xg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}

\CommentTok{\# Metrics }
\NormalTok{(xg\_fit\_viz\_metr }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_eval}\NormalTok{(xg\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-56-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Confusion matrix }
\NormalTok{(xg\_fit\_viz\_mat }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_conf}\NormalTok{(xg\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-56-2.pdf}

\begin{itemize}
\tightlist
\item
  Visualize variable importance
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_tuned }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
\end{verbatim}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-57-1.pdf}

\hypertarget{test-fit-3}{%
\paragraph{Test fit}\label{test-fit-3}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_xg }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ test\_y\_class)))}

\KeywordTok{evaluate\_class}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.833
## 2 precision binary         0.861
## 3 recall    binary         0.756
\end{verbatim}

\hypertarget{stacking-superlearner}{%
\subsection{Stacking (SuperLearner)}\label{stacking-superlearner}}

This stacking part of the book heavily relies on \href{https://github.com/dlab-berkeley/Machine-Learning-in-R/blob/master/07-ensembles.Rmd}{Chris Kennedy's notebook}.

\hypertarget{overview-1}{%
\subsubsection{Overview}\label{overview-1}}

\hypertarget{stacking}{%
\paragraph{Stacking}\label{stacking}}

Wolpert, D.H., 1992. \href{http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.1533}{Stacked generalization}. \emph{Neural networks}, 5(2), pp.241-259.

Breiman, L., 1996. \href{https://statistics.berkeley.edu/sites/default/files/tech-reports/367.pdf}{Stacked regressions}. \emph{Machine learning}, 24(1), pp.49-64.

\hypertarget{superlearner}{%
\paragraph{SuperLearner}\label{superlearner}}

The \href{https://cran.r-project.org/web/packages/SuperLearner/index.html}{``SuperLearner'' R package} is a method that simplifies ensemble learning by allowing you to simultaneously evaluate the cross-validated performance of multiple algorithms and/or a single algorithm with differently tuned hyperparameters. This is a generally advisable approach to machine learning instead of fitting single algorithms.

Let's see how the four classification algorithms you learned in this workshop (1-lasso, 2-decision tree, 3-random forest, and 4-gradient boosted trees) compare to each other and also to 5-binary logistic regression (\texttt{glm}) and to the 6-mean of Y as a benchmark algorithm, in terms of their cross-validated error!

A ``wrapper'' is a short function that adapts an algorithm for the SuperLearner package. Check out the different algorithm wrappers offered by SuperLearner:

\hypertarget{choose-algorithms}{%
\subsubsection{Choose algorithms}\label{choose-algorithms}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Review available models }
\NormalTok{SuperLearner}\OperatorTok{::}\KeywordTok{listWrappers}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## All prediction algorithm wrappers in SuperLearner:
\end{verbatim}

\begin{verbatim}
##  [1] "SL.bartMachine"      "SL.bayesglm"         "SL.biglasso"        
##  [4] "SL.caret"            "SL.caret.rpart"      "SL.cforest"         
##  [7] "SL.earth"            "SL.extraTrees"       "SL.gam"             
## [10] "SL.gbm"              "SL.glm"              "SL.glm.interaction" 
## [13] "SL.glmnet"           "SL.ipredbagg"        "SL.kernelKnn"       
## [16] "SL.knn"              "SL.ksvm"             "SL.lda"             
## [19] "SL.leekasso"         "SL.lm"               "SL.loess"           
## [22] "SL.logreg"           "SL.mean"             "SL.nnet"            
## [25] "SL.nnls"             "SL.polymars"         "SL.qda"             
## [28] "SL.randomForest"     "SL.ranger"           "SL.ridge"           
## [31] "SL.rpart"            "SL.rpartPrune"       "SL.speedglm"        
## [34] "SL.speedlm"          "SL.step"             "SL.step.forward"    
## [37] "SL.step.interaction" "SL.stepAIC"          "SL.svm"             
## [40] "SL.template"         "SL.xgboost"
\end{verbatim}

\begin{verbatim}
## 
## All screening algorithm wrappers in SuperLearner:
\end{verbatim}

\begin{verbatim}
## [1] "All"
## [1] "screen.corP"           "screen.corRank"        "screen.glmnet"        
## [4] "screen.randomForest"   "screen.SIS"            "screen.template"      
## [7] "screen.ttest"          "write.screen.template"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Compile the algorithm wrappers to be used.}
\NormalTok{sl\_lib \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"SL.mean"}\NormalTok{, }\CommentTok{\# Marginal mean of the outcome () }
            \StringTok{"SL.glmnet"}\NormalTok{, }\CommentTok{\# GLM with lasso/elasticnet regularization }
            \StringTok{"SL.rpart"}\NormalTok{, }\CommentTok{\# Decision tree }
            \StringTok{"SL.ranger"}\NormalTok{, }\CommentTok{\# Random forest  }
            \StringTok{"SL.xgboost"}\NormalTok{) }\CommentTok{\# XGBoost }
\end{Highlighting}
\end{Shaded}

\hypertarget{fit-model}{%
\subsubsection{Fit model}\label{fit-model}}

Fit the ensemble!

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This is a seed that is compatible with multicore parallel processing.}
\CommentTok{\# See ?set.seed for more information.}
\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1}\NormalTok{, }\StringTok{"L\textquotesingle{}Ecuyer{-}CMRG"}\NormalTok{) }

\CommentTok{\# This will take a few minutes to execute {-} take a look at the .html file to see the output!}
\NormalTok{cv\_sl \textless{}{-}}\StringTok{  }\NormalTok{SuperLearner}\OperatorTok{::}\KeywordTok{CV.SuperLearner}\NormalTok{(}
  \DataTypeTok{Y =} \KeywordTok{as.numeric}\NormalTok{(}\KeywordTok{as.character}\NormalTok{(train\_y\_class)),}
  \DataTypeTok{X =}\NormalTok{ train\_x\_class,}
  \DataTypeTok{family =} \KeywordTok{binomial}\NormalTok{(),}
  \CommentTok{\# For a real analysis we would use V = 10.}
  \DataTypeTok{cvControl =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{V =}\NormalTok{ 5L, }\DataTypeTok{stratifyCV =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{SL.library =}\NormalTok{ sl\_lib,}
  \DataTypeTok{verbose =} \OtherTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{risk}{%
\subsubsection{Risk}\label{risk}}

Risk is the average loss, and loss is how far off the prediction was for an individual observation. The lower the risk, the fewer errors the model makes in its prediction. SuperLearner's default loss metric is squared error \((y_{actual} - y_{predicted})^2\), so the risk is the mean-squared error (just like in ordinary least \emph{squares} regression). View the summary, plot results, and compute the Area Under the ROC Curve (AUC)!

\hypertarget{summary}{%
\paragraph{Summary}\label{summary}}

\begin{itemize}
\tightlist
\item
  \texttt{Discrete\ SL} chooses the best single learner (in this case, \texttt{SL.glmnet} or \texttt{lasso}).
\item
  \texttt{SuperLearner} takes a weighted average of the \textbf{models} using the coefficients (importance of each individual learner in the overall ensemble). Coefficient 0 means that learner is not used at all.
\item
  \texttt{SL.mean\_All} (the weighted mean of \(Y\)) is a benchmark algorithm (ignoring features).
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{summary}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 
## Call:  
## SuperLearner::CV.SuperLearner(Y = as.numeric(as.character(train_y_class)),  
##     X = train_x_class, family = binomial(), SL.library = sl_lib, verbose = FALSE,  
##     cvControl = list(V = 5L, stratifyCV = TRUE)) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  5 
## 
##       Algorithm     Ave        se      Min     Max
##   Super Learner 0.12831 0.0147872 0.065689 0.17518
##     Discrete SL 0.12712 0.0148904 0.062977 0.17531
##     SL.mean_All 0.24802 0.0030531 0.247747 0.24893
##   SL.glmnet_All 0.12712 0.0148904 0.062977 0.17531
##    SL.rpart_All 0.19077 0.0197215 0.137814 0.22434
##   SL.ranger_All 0.14243 0.0131858 0.098400 0.17475
##  SL.xgboost_All 0.15704 0.0169223 0.121860 0.17506
\end{verbatim}

\hypertarget{plot}{%
\paragraph{Plot}\label{plot}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Plot the cross{-}validated risk estimate with 95\% CIs.}

\KeywordTok{plot}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/cvsl_review-1.pdf}

\hypertarget{compute-auc-for-all-estimators}{%
\subsubsection{Compute AUC for all estimators}\label{compute-auc-for-all-estimators}}

\textbf{ROC}

ROC: a receiver operating characteristic (ROC) curve plots the relationship between the true positive rate (Y-axis) and the false positive rate (X-axis).

\begin{figure}
\centering
\url{https://developers.google.com/machine-learning/crash-course/images/AUC.svg}
\caption{Area Under the ROC Curve}
\end{figure}

\textbf{AUC}

AUC: Area Under the ROC Curve

1 = perfect

0.5 = no better than chance

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{auc\_table}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      auc         se  ci_lower  ci_upper      p-value
## SL.mean_All    0.5000000 0.06879264 0.3651689 0.6348311 3.138901e-09
## SL.rpart_All   0.7852455 0.04203594 0.7028566 0.8676344 3.253351e-03
## SL.xgboost_All 0.8469098 0.02790665 0.7922138 0.9016058 2.943716e-02
## SL.ranger_All  0.8809411 0.02338011 0.8351169 0.9267652 2.120545e-01
## SuperLearner   0.8966714 0.02131186 0.8549010 0.9384419 4.448119e-01
## SL.glmnet_All  0.8996291 0.02102314 0.8584245 0.9408337 5.000000e-01
## DiscreteSL     0.8996291 0.02102314 0.8584245 0.9408337 5.000000e-01
\end{verbatim}

\hypertarget{plot-the-roc-curve-for-the-best-estimator-discretsl}{%
\paragraph{Plot the ROC curve for the best estimator (DiscreteSL)}\label{plot-the-roc-curve-for-the-best-estimator-discretsl}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{plot\_roc}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-64-1.pdf}

\hypertarget{review-weight-distribution-for-the-superlearner}{%
\paragraph{Review weight distribution for the SuperLearner}\label{review-weight-distribution-for-the-superlearner}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{print}\NormalTok{(}\KeywordTok{cvsl\_weights}\NormalTok{(cv\_sl), }\DataTypeTok{row.names =} \OtherTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  # Learner    Mean      SD     Min     Max
##  1  glmnet 0.91558 0.07913 0.82193 1.00000
##  2  ranger 0.08147 0.07881 0.00000 0.17807
##  3    mean 0.00162 0.00362 0.00000 0.00810
##  4 xgboost 0.00106 0.00154 0.00000 0.00339
##  5   rpart 0.00026 0.00059 0.00000 0.00131
\end{verbatim}

A general stacking approach is available in the tidymodels framework through the \href{https://github.com/tidymodels/stacks}{\texttt{stacks}} package (still in the developmental stage).

However, SuperLearner is currently not available in the tidymodels framework. If you'd like to, you can easily build and add a parsnip model. If you are interested in knowing more about it, please take a look at \href{https://www.tidymodels.org/learn/develop/models/}{this vignette} of the tidymodels.

\hypertarget{applications-1}{%
\subsection{Applications}\label{applications-1}}

\hypertarget{bandit-algorithm-optimizing-an-experiment}{%
\subsubsection{Bandit algorithm (optimizing an experiment)}\label{bandit-algorithm-optimizing-an-experiment}}

\hypertarget{causal-forest-estimating-heterogeneous-treatment-effect}{%
\subsubsection{Causal forest (estimating heterogeneous treatment effect)}\label{causal-forest-estimating-heterogeneous-treatment-effect}}

\hypertarget{unsupervised-learning}{%
\section{Unsupervised learning}\label{unsupervised-learning}}

\(x \rightarrow f \rightarrow y\) (not defined)

\hypertarget{dimension-reduction}{%
\subsection{Dimension reduction}\label{dimension-reduction}}

\begin{figure}
\centering
\url{https://i.stack.imgur.com/Q7HIP.gif}
\caption{Projecting 2D-data to a line (PCA). From vas3k.com}
\end{figure}

\hypertarget{correlation-analysis}{%
\subsubsection{Correlation analysis}\label{correlation-analysis}}

\begin{itemize}
\item
  Notice some problems?

  \begin{itemize}
  \item
    NAs
  \item
    Scaling issues
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data\_original }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{corrr}\OperatorTok{::}\KeywordTok{correlate}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
\end{verbatim}

\begin{verbatim}
## # A tibble: 14 x 15
##    rowname     age     sex      cp trestbps     chol      fbs restecg  thalach
##    <chr>     <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
##  1 age     NA      -0.0984 -0.0687   0.279   0.214    0.121   -0.116  -0.399  
##  2 sex     -0.0984 NA      -0.0494  -0.0568 -0.198    0.0450  -0.0582 -0.0440 
##  3 cp      -0.0687 -0.0494 NA        0.0476 -0.0769   0.0944   0.0444  0.296  
##  4 trestb~  0.279  -0.0568  0.0476  NA       0.123    0.178   -0.114  -0.0467 
##  5 chol     0.214  -0.198  -0.0769   0.123  NA        0.0133  -0.151  -0.00994
##  6 fbs      0.121   0.0450  0.0944   0.178   0.0133  NA       -0.0842 -0.00857
##  7 restecg -0.116  -0.0582  0.0444  -0.114  -0.151   -0.0842  NA       0.0441 
##  8 thalach -0.399  -0.0440  0.296   -0.0467 -0.00994 -0.00857  0.0441 NA      
##  9 exang    0.0968  0.142  -0.394    0.0676  0.0670   0.0257  -0.0707 -0.379  
## 10 oldpeak  0.210   0.0961 -0.149    0.193   0.0540   0.00575 -0.0588 -0.344  
## 11 slope   -0.169  -0.0307  0.120   -0.121  -0.00404 -0.0599   0.0930  0.387  
## 12 ca       0.276   0.118  -0.181    0.101   0.0705   0.138   -0.0720 -0.213  
## 13 thal     0.0680  0.210  -0.162    0.0622  0.0988  -0.0320  -0.0120 -0.0964 
## 14 target  -0.225  -0.281   0.434   -0.145  -0.0852  -0.0280   0.137   0.422  
## # ... with 6 more variables: exang <dbl>, oldpeak <dbl>, slope <dbl>, ca <dbl>,
## #   thal <dbl>, target <dbl>
\end{verbatim}

\hypertarget{preprocessing}{%
\subsubsection{Preprocessing}\label{preprocessing}}

\texttt{recipe} is essential for preprocessing multiple features at once.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_recipe \textless{}{-}}\StringTok{ }\KeywordTok{recipe}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{., }\DataTypeTok{data =}\NormalTok{ data\_original) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Imputing NAs using mean }
\StringTok{  }\KeywordTok{step\_meanimpute}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{()) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Normalize some numeric variables }
\StringTok{  }\KeywordTok{step\_normalize}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"age"}\NormalTok{, }\StringTok{"trestbps"}\NormalTok{, }\StringTok{"chol"}\NormalTok{, }\StringTok{"thalach"}\NormalTok{, }\StringTok{"oldpeak"}\NormalTok{)) }
\end{Highlighting}
\end{Shaded}

\hypertarget{pca-analysis}{%
\subsubsection{PCA analysis}\label{pca-analysis}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_res \textless{}{-}}\StringTok{ }\NormalTok{pca\_recipe }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{step\_pca}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{(), }
           \DataTypeTok{id =} \StringTok{"pca"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# id argument identifies each PCA step }
\StringTok{  }\KeywordTok{prep}\NormalTok{()}

\NormalTok{pca\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{id =} \StringTok{"pca"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 196 x 4
##    terms        value component id   
##    <chr>        <dbl> <chr>     <chr>
##  1 age      -0.00101  PC1       pca  
##  2 sex       0.216    PC1       pca  
##  3 cp        0.321    PC1       pca  
##  4 trestbps  0.00118  PC1       pca  
##  5 chol     -0.000292 PC1       pca  
##  6 fbs       0.0468   PC1       pca  
##  7 restecg   0.166    PC1       pca  
##  8 thalach   0.0137   PC1       pca  
##  9 exang     0.0962   PC1       pca  
## 10 oldpeak  -0.00863  PC1       pca  
## # ... with 186 more rows
\end{verbatim}

\hypertarget{screeplot}{%
\paragraph{Screeplot}\label{screeplot}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_recipe }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{step\_pca}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{(), }
           \DataTypeTok{id =} \StringTok{"pca"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# id argument identifies each PCA step }
\StringTok{  }\KeywordTok{prep}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{id =} \StringTok{"pca"}\NormalTok{, }\DataTypeTok{type =} \StringTok{"variance"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(terms }\OperatorTok{==}\StringTok{ "percent variance"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ component, }\DataTypeTok{y =}\NormalTok{ value)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"PCAs of heart disease"}\NormalTok{,}
         \DataTypeTok{y =} \StringTok{"\% of variance"}\NormalTok{,}
         \DataTypeTok{title =} \StringTok{"Scree plot"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-69-1.pdf}

\hypertarget{view-factor-loadings}{%
\paragraph{View factor loadings}\label{view-factor-loadings}}

Loadings are the covariances between the features and the principal components (=eigenvectors).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_recipe }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{step\_pca}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{(), }
           \DataTypeTok{id =} \StringTok{"pca"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# id argument identifies each PCA step }
\StringTok{  }\KeywordTok{prep}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{id =} \StringTok{"pca"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(component }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"PC1"}\NormalTok{, }\StringTok{"PC2"}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(terms, value), }\DataTypeTok{y =}\NormalTok{ value, }
             \DataTypeTok{fill =}\NormalTok{ component)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_col}\NormalTok{(}\DataTypeTok{position =} \StringTok{"dodge"}\NormalTok{) }\OperatorTok{+}
\StringTok{    }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Terms"}\NormalTok{,}
         \DataTypeTok{y =} \StringTok{"Contribtutions"}\NormalTok{,}
         \DataTypeTok{fill =} \StringTok{"PCAs"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-70-1.pdf}

You can use these low-dimensional data to solve prediction problems. Compressing feature space via dimension reduction techniques is called feature extraction. PCA is one way of doing this.

\hypertarget{topic-modeling}{%
\subsection{Topic modeling}\label{topic-modeling}}

\hypertarget{setup-3}{%
\subsubsection{Setup}\label{setup-3}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(tidytext, }\CommentTok{\# tidy text analysis}
\NormalTok{               glue, }\CommentTok{\# paste string and objects                }
\NormalTok{               stm, }\CommentTok{\# structural topic modeling}
\NormalTok{               gutenbergr) }\CommentTok{\# toy datasets }
\end{Highlighting}
\end{Shaded}

\hypertarget{dataset-1}{%
\subsubsection{Dataset}\label{dataset-1}}

The data munging process draws on \href{https://juliasilge.com/blog/sherlock-holmes-stm/}{Julia Silge's blog post}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock\_raw \textless{}{-}}\StringTok{ }\KeywordTok{gutenberg\_download}\NormalTok{(}\DecValTok{1661}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
\end{verbatim}

\begin{verbatim}
## Using mirror http://aleph.gutenberg.org
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glimpse}\NormalTok{(sherlock\_raw)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Rows: 12,648
## Columns: 2
## $ gutenberg_id <int> 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, 1661, ...
## $ text         <chr> "THE ADVENTURES OF SHERLOCK HOLMES", "", "by", "", "SI...
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock \textless{}{-}}\StringTok{ }\NormalTok{sherlock\_raw }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Mutate story using a conditional statement }
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{story =} \KeywordTok{ifelse}\NormalTok{(}\KeywordTok{str\_starts}\NormalTok{(text, }\StringTok{"ADVENTURE"}\NormalTok{), }
\NormalTok{                                   text, }\OtherTok{NA}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Fill in missing values with the previous value  }
\StringTok{  }\NormalTok{tidyr}\OperatorTok{::}\KeywordTok{fill}\NormalTok{(story, }\DataTypeTok{.direction =} \StringTok{"down"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Filter }
\StringTok{  }\KeywordTok{filter}\NormalTok{(story }\OperatorTok{!=}\StringTok{ "THE ADVENTURES OF SHERLOCK HOLMES"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Factor }
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{story =} \KeywordTok{factor}\NormalTok{(story, }\DataTypeTok{levels =} \KeywordTok{unique}\NormalTok{(story)))}

\NormalTok{sherlock \textless{}{-}}\StringTok{ }\NormalTok{sherlock[,}\DecValTok{2}\OperatorTok{:}\DecValTok{3}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\hypertarget{key-ideas}{%
\subsubsection{Key ideas}\label{key-ideas}}

\begin{itemize}
\item
  Topics as \textbf{distributions} of words
\item
  Documents as \textbf{distributions} of topics
\item
  What distributions?

  \begin{itemize}
  \item
    Probability
  \item
    Multinomial (e.g., Latent Dirichlet Allocation)
  \end{itemize}
\item
  Words lie on a lower dimensional space (dimension reduction)
\item
  Co-occurrence of words (clustering)
\item
  Bag of words (feature engineering)

  \begin{itemize}
  \tightlist
  \item
    Upside: easy and fast (and works quite well)
  \item
    Downside: ignores grammatical structures and rich interactions among words (Alternative: word embeddings. Please check out \href{http://text2vec.org/}{text2vec})
  \end{itemize}
\end{itemize}

\hypertarget{exploratory-data-analysis}{%
\subsubsection{Exploratory data analysis}\label{exploratory-data-analysis}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock\_n \textless{}{-}}\StringTok{ }\NormalTok{sherlock }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest\_tokens}\NormalTok{(}\DataTypeTok{output =}\NormalTok{ word,}
                \DataTypeTok{input =}\NormalTok{ text) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(story, word, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{)}

\NormalTok{sherlock\_total\_n \textless{}{-}}\StringTok{ }\NormalTok{sherlock\_n }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(story) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{total =} \KeywordTok{sum}\NormalTok{(n))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` ungrouping output (override with `.groups` argument)
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock\_words \textless{}{-}}\StringTok{ }\NormalTok{sherlock\_n }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{left\_join}\NormalTok{(sherlock\_total\_n)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "story"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock\_words }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{freq =}\NormalTok{ n}\OperatorTok{/}\NormalTok{total) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(story) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(word, freq), }
             \DataTypeTok{y =}\NormalTok{ freq, }
             \DataTypeTok{fill =}\NormalTok{ story)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{story, }
             \DataTypeTok{ncol =} \DecValTok{2}\NormalTok{, }
             \DataTypeTok{scales =} \StringTok{"free\_y"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{""}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Selecting by freq
\end{verbatim}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-73-1.pdf}

\hypertarget{stm}{%
\subsubsection{STM}\label{stm}}

\hypertarget{turn-text-into-document-term-matrix}{%
\paragraph{Turn text into document-term matrix}\label{turn-text-into-document-term-matrix}}

The \texttt{stm} package has its own preprocessing function.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dtm \textless{}{-}}\StringTok{ }\KeywordTok{textProcessor}\NormalTok{(}\DataTypeTok{documents =}\NormalTok{ sherlock}\OperatorTok{$}\NormalTok{text,}
                     \DataTypeTok{metadata =}\NormalTok{ sherlock, }
                     \DataTypeTok{removestopwords =} \OtherTok{TRUE}\NormalTok{,}
                     \DataTypeTok{verbose =} \OtherTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{tuning-k}{%
\paragraph{Tuning K}\label{tuning-k}}

\begin{itemize}
\tightlist
\item
  K is the number of topics.
\item
  Let's try K = 5, 10, 15.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_res \textless{}{-}}\StringTok{ }\KeywordTok{searchK}\NormalTok{(dtm}\OperatorTok{$}\NormalTok{documents, dtm}\OperatorTok{$}\NormalTok{vocab, }
                   \DataTypeTok{K =} \KeywordTok{c}\NormalTok{(}\DecValTok{5}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{15}\NormalTok{),}
                   \DataTypeTok{prevalence =}\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{story, }
                   \DataTypeTok{data =}\NormalTok{ dtm}\OperatorTok{$}\NormalTok{meta)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      .....
##   Recovering initialization...
##      ........................................................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -7.581) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -7.482, relative change = 1.312e-02) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -7.408, relative change = 9.916e-03) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -7.383, relative change = 3.336e-03) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -7.372, relative change = 1.424e-03) 
## Topic 1: holm, now, come, look, yes 
##  Topic 2: upon, littl, man, hand, door 
##  Topic 3: know, think, came, back, day 
##  Topic 4: said, will, can, face, matter 
##  Topic 5: one, see, shall, time, must 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -7.367, relative change = 6.889e-04) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -7.365, relative change = 3.221e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -7.364, relative change = 1.281e-04) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -7.364, relative change = 1.012e-05) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Model Converged 
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      ..........
##   Recovering initialization...
##      ........................................................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -7.666) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -7.481, relative change = 2.408e-02) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -7.387, relative change = 1.265e-02) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -7.361, relative change = 3.497e-03) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -7.351, relative change = 1.396e-03) 
## Topic 1: upon, littl, paper, even, came 
##  Topic 2: holm, back, two, busi, sat 
##  Topic 3: one, case, word, remark, point 
##  Topic 4: come, said, room, miss, say 
##  Topic 5: said, man, eye, yes, took 
##  Topic 6: may, just, away, fact, mind 
##  Topic 7: see, one, time, face, look 
##  Topic 8: know, now, can, hand, must 
##  Topic 9: will, sherlock, two, might, famili 
##  Topic 10: tabl, heard, die, might, record 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -7.346, relative change = 7.034e-04) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -7.342, relative change = 5.221e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -7.338, relative change = 5.161e-04) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -7.336, relative change = 2.460e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Model Converged 
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      ...............
##   Recovering initialization...
##      ........................................................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (3 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -7.738) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -7.461, relative change = 3.577e-02) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -7.367, relative change = 1.264e-02) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -7.343, relative change = 3.252e-03) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -7.333, relative change = 1.367e-03) 
## Topic 1: matter, like, made, much, street 
##  Topic 2: look, door, face, room, saw 
##  Topic 3: sir, someth, wife, mean, instant 
##  Topic 4: said, holm, ask, well, miss 
##  Topic 5: morn, littl, remark, quit, interest 
##  Topic 6: back, chair, close, get, step 
##  Topic 7: time, read, put, seen, part 
##  Topic 8: two, now, case, cri, yet 
##  Topic 9: upon, one, sherlock, famili, knew 
##  Topic 10: may, howev, tell, long, clear 
##  Topic 11: will, think, shall, good, came 
##  Topic 12: see, littl, hand, yes, way 
##  Topic 13: holm, answer, turn, return, mrs 
##  Topic 14: man, reason, certain, strang, crime 
##  Topic 15: might, twist, hand, never, come 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -7.328, relative change = 7.011e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -7.324, relative change = 4.535e-04) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -7.322, relative change = 3.650e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -7.320, relative change = 2.220e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 10 (approx. per word bound = -7.318, relative change = 2.408e-04) 
## Topic 1: matter, much, like, even, away 
##  Topic 2: look, room, door, face, saw 
##  Topic 3: sir, went, someth, wife, dark 
##  Topic 4: said, holm, well, ask, heard 
##  Topic 5: quit, morn, remark, left, give 
##  Topic 6: back, get, chair, step, close 
##  Topic 7: time, put, seen, paper, three 
##  Topic 8: two, case, cri, seem, yet 
##  Topic 9: upon, one, sherlock, knew, famili 
##  Topic 10: may, howev, tell, long, clear 
##  Topic 11: will, think, come, shall, can 
##  Topic 12: see, littl, hand, yes, way 
##  Topic 13: turn, holm, answer, return, observ 
##  Topic 14: man, reason, certain, strang, lord 
##  Topic 15: might, thing, follow, told, help 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 11 (approx. per word bound = -7.317, relative change = 1.808e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 12 (approx. per word bound = -7.316, relative change = 1.221e-04) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 13 (approx. per word bound = -7.315, relative change = 8.460e-05) 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Completing Iteration 14 (approx. per word bound = -7.315, relative change = 4.530e-05) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 15 (approx. per word bound = -7.315, relative change = 2.133e-05) 
## Topic 1: matter, much, like, even, made 
##  Topic 2: look, room, door, face, eye 
##  Topic 3: sir, went, someth, wife, dark 
##  Topic 4: said, holm, well, ask, know 
##  Topic 5: quit, remark, morn, left, found 
##  Topic 6: back, get, chair, step, close 
##  Topic 7: time, year, paper, put, seen 
##  Topic 8: two, case, seem, cri, yet 
##  Topic 9: upon, one, sherlock, knew, famili 
##  Topic 10: may, howev, tell, long, clear 
##  Topic 11: will, come, think, now, can 
##  Topic 12: littl, see, hand, yes, way 
##  Topic 13: turn, answer, return, holm, observ 
##  Topic 14: man, reason, certain, strang, lord 
##  Topic 15: might, make, thing, word, follow 
## ....................................................................................................
## Completed E-Step (2 seconds). 
## Completed M-Step. 
## Model Converged
\end{verbatim}

\hypertarget{evaludating-models}{%
\paragraph{Evaluating models}\label{evaludating-models}}

There are several metrics to assess the performance of topic models: the held-out likelihood, residuals, semantic coherence, and exclusivity. In this course, we examine the relationship between semantic coherence and exclusivity to understand the trade-off involved in selecting K.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_res}\OperatorTok{$}\NormalTok{results }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest}\NormalTok{(K, exclus, semcoh) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(K, exclus, semcoh) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{K =} \KeywordTok{as.factor}\NormalTok{(K)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ exclus, }\DataTypeTok{y =}\NormalTok{ semcoh)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_text}\NormalTok{(}\DataTypeTok{label =} \KeywordTok{glue}\NormalTok{(}\StringTok{"K = \{test\_res$results$K\}"}\NormalTok{),}
              \DataTypeTok{size =} \DecValTok{5}\NormalTok{,}
              \DataTypeTok{color =} \StringTok{"red"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: unnest() has a new interface. See ?unnest for details.
## Try `df %>% unnest(c(K, exclus, semcoh))`, with `mutate()` if needed
\end{verbatim}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-76-1.pdf}

\hypertarget{finalize}{%
\paragraph{Finalize}\label{finalize}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{final\_stm \textless{}{-}}\StringTok{ }\KeywordTok{stm}\NormalTok{(dtm}\OperatorTok{$}\NormalTok{documents, }
\NormalTok{                 dtm}\OperatorTok{$}\NormalTok{vocab, }
                 \DataTypeTok{K =} \DecValTok{10}\NormalTok{, }\DataTypeTok{prevalence =}\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{story,}
                 \DataTypeTok{max.em.its =} \DecValTok{75}\NormalTok{, }
                 \DataTypeTok{data =}\NormalTok{ dtm}\OperatorTok{$}\NormalTok{meta, }
                 \DataTypeTok{init.type=}\StringTok{"Spectral"}\NormalTok{,}
                 \DataTypeTok{seed =} \DecValTok{1234567}\NormalTok{,}
                 \DataTypeTok{verbose =} \OtherTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{explore-the-results}{%
\paragraph{Explore the results}\label{explore-the-results}}

\begin{itemize}
\tightlist
\item
  Using the \texttt{stm} package.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# plot}
\KeywordTok{plot}\NormalTok{(final\_stm)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-78-1.pdf}

\begin{itemize}
\tightlist
\item
  Using ggplot2
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# tidy  }
\NormalTok{tidy\_stm \textless{}{-}}\StringTok{ }\KeywordTok{tidy}\NormalTok{(final\_stm)}

\CommentTok{\# top terms}
\NormalTok{tidy\_stm }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{group\_by}\NormalTok{(topic) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{, beta) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{ungroup}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\KeywordTok{fct\_reorder}\NormalTok{(term, beta), beta, }\DataTypeTok{fill =} \KeywordTok{as.factor}\NormalTok{(topic))) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_col}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.8}\NormalTok{, }\DataTypeTok{show.legend =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{    }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{topic, }\DataTypeTok{scales =} \StringTok{"free\_y"}\NormalTok{) }\OperatorTok{+}
\StringTok{    }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{scale\_y\_continuous}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{percent) }\OperatorTok{+}
\StringTok{    }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-79-1.pdf}

\hypertarget{bias-and-fairness-in-machine-learning}{%
\section{Bias and fairness in machine learning}\label{bias-and-fairness-in-machine-learning}}

This section introduces the issues surrounding fairness and bias in machine learning applications, with a focus on ProPublica's analysis of the COMPAS algorithm. I revised \href{https://github.com/propublica/compas-analysis/blob/master/Compas\%20Analysis.ipynb}{ProPublica's original R and Python code} to improve its readability.

\begin{figure}
\centering
\includegraphics{https://wp.technologyreview.com/wp-content/uploads/2019/10/mit-alg-yb-02-7.gif?fit=1444,962}
\caption{A gif of defendants being put into an algorithm by SELMAN DESIGN}
\end{figure}

\textbf{Outline}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Bias in the data
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Risk of Recidivism Data
\item
  Risk of Violent Recidivism Data
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Bias in the algorithm
\end{enumerate}

\textbf{References}

For more information on ProPublica's Machine Bias project, we encourage you to check out the following references.

\begin{itemize}
\item
  \href{https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing/}{Argument} by Julia Angwin, Jeff Larson, Surya Mattu and Lauren Kirchner
\item
  \href{https://www.washingtonpost.com/news/monkey-cage/wp/2016/10/17/can-an-algorithm-be-racist-our-analysis-is-more-cautious-than-propublicas/}{Counterargument} by Sam Corbett-Davies, Emma Pierson, Avi Feller and Sharad Goel
\item
  \href{https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm/}{Methodology}
\end{itemize}

\hypertarget{bias-in-the-data-risk-of-recidivism-analysis}{%
\subsection{Bias in the Data (Risk of Recidivism Analysis)}\label{bias-in-the-data-risk-of-recidivism-analysis}}

\hypertarget{setup-4}{%
\subsubsection{Setup}\label{setup-4}}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{ tidyverse, }\CommentTok{\# tidyverse packages }
\NormalTok{ conflicted, }\CommentTok{\# an alternative conflict resolution strategy }
\NormalTok{ ggthemes, }\CommentTok{\# other themes for ggplot2 }
\NormalTok{ patchwork, }\CommentTok{\# arranging ggplots}
\NormalTok{ scales, }\CommentTok{\# rescaling }
\NormalTok{ survival, }\CommentTok{\# survival analysis}
\NormalTok{ broom, }\CommentTok{\# for modeling}
\NormalTok{ here, }\CommentTok{\# reproducibility }
\NormalTok{ glue }\CommentTok{\# pasting strings and objects }
\NormalTok{)}

\CommentTok{\# To avoid conflicts }
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"filter"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Will prefer dplyr::filter over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"select"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Will prefer dplyr::select over any other package
\end{verbatim}

\hypertarget{load-data}{%
\subsubsection{Load data}\label{load-data}}

We select fields for severity of charge, number of priors, demographics, age, sex, COMPAS scores, and whether each person was accused of a crime within two years.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{two\_years \textless{}{-}}\StringTok{ }\KeywordTok{read\_csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"compas{-}scores{-}two{-}years.csv"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Duplicated column names deduplicated: 'decile_score' =>
## 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{"N of observations (rows): \{nrow(two\_years)\}}
\StringTok{      N of variables (columns): \{ncol(two\_years)\}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## N of observations (rows): 7214
## N of variables (columns): 53
\end{verbatim}

\hypertarget{wrangling}{%
\subsubsection{Wrangling}\label{wrangling}}

\begin{itemize}
\tightlist
\item
  Not all of the observations are useable for the first round of analysis.
\item
  There are a number of reasons to remove rows because of missing data:

  \begin{itemize}
  \tightlist
  \item
    If the charge date of a defendant's COMPAS-scored crime was not within 30 days from when the person was arrested, we assume that, because of data quality reasons, we do not have the right offense.
  \item
    We coded the recidivist flag -- is\_recid -- to be $-1$ if we could not find a COMPAS case at all.
  \item
    In a similar vein, ordinary traffic offenses -- those with a c\_charge\_degree of `O' -- are removed because they will not result in jail time (there are only two of them).
  \item
    We filtered the underlying data from Broward County to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.
  \end{itemize}
\item
  Create a function
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wrangle\_data \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(data)\{}

\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{data }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }
\StringTok{    }\CommentTok{\# Select variables }
\StringTok{    }\KeywordTok{select}\NormalTok{(age, c\_charge\_degree, race, age\_cat, score\_text, sex, priors\_count, days\_b\_screening\_arrest, decile\_score, is\_recid, two\_year\_recid, }
\NormalTok{         c\_jail\_in, c\_jail\_out) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\CommentTok{\# Filter rows }
\StringTok{    }\KeywordTok{filter}\NormalTok{(days\_b\_screening\_arrest }\OperatorTok{\textless{}=}\StringTok{ }\DecValTok{30}\NormalTok{,}
\NormalTok{           days\_b\_screening\_arrest }\OperatorTok{\textgreater{}=}\StringTok{ }\DecValTok{{-}30}\NormalTok{, }
\NormalTok{           is\_recid }\OperatorTok{!=}\StringTok{ }\DecValTok{{-}1}\NormalTok{,}
\NormalTok{           c\_charge\_degree }\OperatorTok{!=}\StringTok{ "O"}\NormalTok{,}
\NormalTok{           score\_text }\OperatorTok{!=}\StringTok{ \textquotesingle{}N/A\textquotesingle{}}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\CommentTok{\# Mutate variables }
\StringTok{    }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{length\_of\_stay =} \KeywordTok{as.numeric}\NormalTok{(}\KeywordTok{as.Date}\NormalTok{(c\_jail\_out) }\OperatorTok{{-}}\StringTok{ }\KeywordTok{as.Date}\NormalTok{(c\_jail\_in)),}
           \DataTypeTok{c\_charge\_degree =} \KeywordTok{factor}\NormalTok{(c\_charge\_degree),}
           \DataTypeTok{age\_cat =} \KeywordTok{factor}\NormalTok{(age\_cat),}
           \DataTypeTok{race =} \KeywordTok{factor}\NormalTok{(race, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Caucasian"}\NormalTok{,}\StringTok{"African{-}American"}\NormalTok{,}\StringTok{"Hispanic"}\NormalTok{,}\StringTok{"Other"}\NormalTok{,}\StringTok{"Asian"}\NormalTok{,}\StringTok{"Native American"}\NormalTok{)),}
           \DataTypeTok{sex =} \KeywordTok{factor}\NormalTok{(sex, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Male"}\NormalTok{,}\StringTok{"Female"}\NormalTok{)),}
           \DataTypeTok{score\_text =} \KeywordTok{factor}\NormalTok{(score\_text, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Low"}\NormalTok{, }\StringTok{"Medium"}\NormalTok{, }\StringTok{"High"}\NormalTok{)),}
           \DataTypeTok{score =}\NormalTok{ score\_text,}
\CommentTok{\# I added this new variable to test whether measuring the DV as a binary or continuous var makes a difference }
           \DataTypeTok{score\_num =} \KeywordTok{as.numeric}\NormalTok{(score\_text)) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\CommentTok{\# Rename variables }
\StringTok{    }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{crime =}\NormalTok{ c\_charge\_degree,}
           \DataTypeTok{gender =}\NormalTok{ sex)}
        
\KeywordTok{return}\NormalTok{(df)\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Apply the function to the data
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{wrangle\_data}\NormalTok{(two\_years)}

\KeywordTok{names}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "age"                     "crime"                  
##  [3] "race"                    "age_cat"                
##  [5] "score_text"              "gender"                 
##  [7] "priors_count"            "days_b_screening_arrest"
##  [9] "decile_score"            "is_recid"               
## [11] "two_year_recid"          "c_jail_in"              
## [13] "c_jail_out"              "length_of_stay"         
## [15] "score"                   "score_num"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Check whether the function works as expected}
\KeywordTok{head}\NormalTok{(df, }\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 16
##     age crime race  age_cat score_text gender priors_count days_b_screenin~
##   <dbl> <fct> <fct> <fct>   <fct>      <fct>         <dbl>            <dbl>
## 1    69 F     Other Greate~ Low        Male              0               -1
## 2    34 F     Afri~ 25 - 45 Low        Male              0               -1
## 3    24 F     Afri~ Less t~ Low        Male              4               -1
## 4    44 M     Other 25 - 45 Low        Male              0                0
## 5    41 F     Cauc~ 25 - 45 Medium     Male             14               -1
## # ... with 8 more variables: decile_score <dbl>, is_recid <dbl>,
## #   two_year_recid <dbl>, c_jail_in <dttm>, c_jail_out <dttm>,
## #   length_of_stay <dbl>, score <fct>, score_num <dbl>
\end{verbatim}

\hypertarget{descriptive-analysis}{%
\subsubsection{Descriptive analysis}\label{descriptive-analysis}}

\begin{itemize}
\tightlist
\item
  Higher COMPAS scores are slightly correlated with a longer length of stay.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{cor}\NormalTok{(df}\OperatorTok{$}\NormalTok{length\_of\_stay, df}\OperatorTok{$}\NormalTok{decile\_score)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0.2073297
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(score) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ score, }\DataTypeTok{y =}\NormalTok{ n)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Score"}\NormalTok{,}
         \DataTypeTok{y =} \StringTok{"Count"}\NormalTok{,}
         \DataTypeTok{title =} \StringTok{"Score distribution"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-84-1.pdf}

Judges are often presented with two sets of scores from the COMPAS system -- one that classifies people into High, Medium, and Low risk, and a corresponding decile score. For white defendants, there is a clear downward trend in the number of defendants as the decile score increases.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\KeywordTok{ordered}\NormalTok{(decile\_score))) }\OperatorTok{+}\StringTok{ }
\StringTok{          }\KeywordTok{geom\_bar}\NormalTok{() }\OperatorTok{+}
\StringTok{          }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{race, }\DataTypeTok{nrow =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{          }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Decile Score"}\NormalTok{,}
               \DataTypeTok{y =} \StringTok{"Count"}\NormalTok{,}
               \DataTypeTok{Title =} \StringTok{"Defendant\textquotesingle{}s Decile Score"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-85-1.pdf}

\hypertarget{modeling}{%
\subsubsection{Modeling}\label{modeling}}

After filtering out bad rows, our first question is whether there is a significant difference in COMPAS scores between races. To answer it, we need to change some variables into factors and run a logistic regression comparing low scores to high scores.

\begin{itemize}
\tightlist
\item
  Model building
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model\_data \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(data)\{}

\CommentTok{\# Logistic regression model}
\NormalTok{lr\_model \textless{}{-}}\StringTok{ }\KeywordTok{glm}\NormalTok{(score }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gender }\OperatorTok{+}\StringTok{ }\NormalTok{age\_cat }\OperatorTok{+}\StringTok{ }\NormalTok{race }\OperatorTok{+}\StringTok{ }\NormalTok{priors\_count }\OperatorTok{+}\StringTok{ }\NormalTok{crime }\OperatorTok{+}\StringTok{ }\NormalTok{two\_year\_recid, }
             \DataTypeTok{family =} \StringTok{"binomial"}\NormalTok{, }\DataTypeTok{data =}\NormalTok{ data)}

\CommentTok{\# OLS, DV = score\_num}
\NormalTok{ols\_model1 \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(score\_num }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gender }\OperatorTok{+}\StringTok{ }\NormalTok{age\_cat }\OperatorTok{+}\StringTok{ }\NormalTok{race }\OperatorTok{+}\StringTok{ }\NormalTok{priors\_count }\OperatorTok{+}\StringTok{ }\NormalTok{crime }\OperatorTok{+}\StringTok{ }\NormalTok{two\_year\_recid, }\DataTypeTok{data =}\NormalTok{ data)}

\CommentTok{\# OLS, DV = decile\_score }
\NormalTok{ols\_model2 \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(decile\_score }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gender }\OperatorTok{+}\StringTok{ }\NormalTok{age\_cat }\OperatorTok{+}\StringTok{ }\NormalTok{race }\OperatorTok{+}\StringTok{ }\NormalTok{priors\_count }\OperatorTok{+}\StringTok{ }\NormalTok{crime }\OperatorTok{+}\StringTok{ }\NormalTok{two\_year\_recid, }\DataTypeTok{data =}\NormalTok{ data)}

\CommentTok{\# Extract model outcomes with confidence intervals }
\NormalTok{lr\_est \textless{}{-}}\StringTok{ }\NormalTok{lr\_model }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }

\NormalTok{ols\_est1 \textless{}{-}}\StringTok{ }\NormalTok{ols\_model1 }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }

\NormalTok{ols\_est2 \textless{}{-}}\StringTok{ }\NormalTok{ols\_model2 }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }

\CommentTok{\# AIC scores }
\NormalTok{lr\_AIC \textless{}{-}}\StringTok{ }\KeywordTok{AIC}\NormalTok{(lr\_model)}
\NormalTok{ols\_AIC1 \textless{}{-}}\StringTok{ }\KeywordTok{AIC}\NormalTok{(ols\_model1)}
\NormalTok{ols\_AIC2 \textless{}{-}}\StringTok{ }\KeywordTok{AIC}\NormalTok{(ols\_model2)}
    
\KeywordTok{list}\NormalTok{(lr\_est, ols\_est1, ols\_est2, }
\NormalTok{     lr\_AIC, ols\_AIC1, ols\_AIC2)}

\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Model comparisons
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{"AIC score of logistic regression: \{model\_data(df)[4]\} }
\StringTok{      AIC score of OLS regression (with categorical DV):  \{model\_data(df)[5]\}}
\StringTok{      AIC score of OLS regression (with continuous DV): \{model\_data(df)[6]\}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## AIC score of logistic regression: 6192.40169473357 
## AIC score of OLS regression (with categorical DV):  11772.1148541111
## AIC score of OLS regression (with continuous DV): 26779.9512226999
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Logistic regression model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{lr\_model \textless{}{-}}\StringTok{ }\KeywordTok{model\_data}\NormalTok{(df)[}\DecValTok{1}\NormalTok{] }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{data.frame}\NormalTok{()}

\NormalTok{lr\_model }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(term }\OperatorTok{!=}\StringTok{ "(Intercept)"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{term =} \KeywordTok{gsub}\NormalTok{(}\StringTok{"race|age\_cat|gender|M"}\NormalTok{,}\StringTok{""}\NormalTok{, term)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(term, estimate), }\DataTypeTok{y =}\NormalTok{ estimate, }\DataTypeTok{ymax =}\NormalTok{ conf.high, }\DataTypeTok{ymin =}\NormalTok{ conf.low)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_pointrange}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{"Estimate"}\NormalTok{, }\DataTypeTok{x =} \StringTok{""}\NormalTok{,}
      \DataTypeTok{title =} \StringTok{"Logistic regression"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_hline}\NormalTok{(}\DataTypeTok{yintercept =} \DecValTok{0}\NormalTok{, }\DataTypeTok{linetype =} \StringTok{"dashed"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-88-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{interpret\_estimate \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(model)\{}
    
    \CommentTok{\# Control }
\NormalTok{    intercept \textless{}{-}}\StringTok{ }\NormalTok{model}\OperatorTok{$}\NormalTok{estimate[model}\OperatorTok{$}\NormalTok{term }\OperatorTok{==}\StringTok{ "(Intercept)"}\NormalTok{]}
\NormalTok{    control \textless{}{-}}\StringTok{ }\KeywordTok{exp}\NormalTok{(intercept) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\DecValTok{1} \OperatorTok{+}\StringTok{ }\KeywordTok{exp}\NormalTok{(intercept))}
    
    \CommentTok{\# Likelihood }
\NormalTok{    model \textless{}{-}}\StringTok{ }\NormalTok{model }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{filter}\NormalTok{(term }\OperatorTok{!=}\StringTok{ "(Intercept)"}\NormalTok{)}
    
\NormalTok{    model}\OperatorTok{$}\NormalTok{likelihood \textless{}{-}}\StringTok{ }\NormalTok{(}\KeywordTok{exp}\NormalTok{(model}\OperatorTok{$}\NormalTok{estimate) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\DecValTok{1} \OperatorTok{{-}}\StringTok{ }\NormalTok{control }\OperatorTok{+}\StringTok{ }\NormalTok{(control }\OperatorTok{*}\StringTok{ }\KeywordTok{exp}\NormalTok{(model}\OperatorTok{$}\NormalTok{estimate))))}
    
    \KeywordTok{return}\NormalTok{(model)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{interpret\_estimate}\NormalTok{(lr\_model) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{term =} \KeywordTok{gsub}\NormalTok{(}\StringTok{"race|age\_cat|gender"}\NormalTok{,}\StringTok{""}\NormalTok{, term)) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(term, likelihood), }\DataTypeTok{y =}\NormalTok{ likelihood)) }\OperatorTok{+}
\StringTok{        }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{size =} \DecValTok{3}\NormalTok{) }\OperatorTok{+}
\StringTok{        }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{        }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{"Likelihood"}\NormalTok{, }\DataTypeTok{x =} \StringTok{""}\NormalTok{,}
            \DataTypeTok{title =}\StringTok{"Logistic regression"}\NormalTok{) }\OperatorTok{+}
\StringTok{        }\KeywordTok{scale\_y\_continuous}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\KeywordTok{percent\_format}\NormalTok{(}\DataTypeTok{accuracy =} \DecValTok{1}\NormalTok{)) }\OperatorTok{+}
\StringTok{        }\KeywordTok{geom\_hline}\NormalTok{(}\DataTypeTok{yintercept =} \DecValTok{1}\NormalTok{, }\DataTypeTok{linetype =} \StringTok{"dashed"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-90-1.pdf}

\hypertarget{bias-in-the-data-risk-of-violent-recidivism-analysis}{%
\subsection{Bias in the Data (Risk of Violent Recidivism Analysis)}\label{bias-in-the-data-risk-of-violent-recidivism-analysis}}

\hypertarget{setup-5}{%
\subsubsection{Setup}\label{setup-5}}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{ tidyverse, }\CommentTok{\# tidyverse packages }
\NormalTok{ conflicted, }\CommentTok{\# an alternative conflict resolution strategy }
\NormalTok{ ggthemes, }\CommentTok{\# other themes for ggplot2 }
\NormalTok{ patchwork, }\CommentTok{\# arranging ggplots}
\NormalTok{ scales, }\CommentTok{\# rescaling }
\NormalTok{ survival, }\CommentTok{\# survival analysis}
\NormalTok{ broom, }\CommentTok{\# for modeling}
\NormalTok{ here, }\CommentTok{\# reproducibility }
\NormalTok{ glue }\CommentTok{\# pasting strings and objects }
\NormalTok{)}

\CommentTok{\# To avoid conflicts }
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"filter"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Removing existing preference
\end{verbatim}

\begin{verbatim}
## [conflicted] Will prefer dplyr::filter over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"select"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Removing existing preference
\end{verbatim}

\begin{verbatim}
## [conflicted] Will prefer dplyr::select over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Set themes }
\KeywordTok{theme\_set}\NormalTok{(ggthemes}\OperatorTok{::}\KeywordTok{theme\_fivethirtyeight}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\hypertarget{load-data-1}{%
\subsubsection{Load data}\label{load-data-1}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{two\_years\_violent \textless{}{-}}\StringTok{ }\KeywordTok{read\_csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{ ,}\StringTok{"compas{-}scores{-}two{-}years{-}violent.csv"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Duplicated column names deduplicated: 'decile_score' =>
## 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49], 'two_year_recid'
## => 'two_year_recid_1' [54]
\end{verbatim}

\begin{verbatim}
## 
## -- Column specification ---------------------------
## cols(
##   .default = col_double(),
##   name = col_character(),
##   first = col_character(),
##   last = col_character(),
##   compas_screening_date = col_date(format = ""),
##   sex = col_character(),
##   dob = col_date(format = ""),
##   age_cat = col_character(),
##   race = col_character(),
##   c_jail_in = col_datetime(format = ""),
##   c_jail_out = col_datetime(format = ""),
##   c_case_number = col_character(),
##   c_offense_date = col_date(format = ""),
##   c_arrest_date = col_date(format = ""),
##   c_charge_degree = col_character(),
##   c_charge_desc = col_character(),
##   r_case_number = col_character(),
##   r_charge_degree = col_character(),
##   r_offense_date = col_date(format = ""),
##   r_charge_desc = col_character(),
##   r_jail_in = col_date(format = "")
##   # ... with 14 more columns
## )
## i Use `spec()` for the full column specifications.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{"N of observations (rows): \{nrow(two\_years\_violent)\}}
\StringTok{      N of variables (columns): \{ncol(two\_years\_violent)\}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## N of observations (rows): 4743
## N of variables (columns): 54
\end{verbatim}

\hypertarget{wrangling-1}{%
\subsubsection{Wrangling}\label{wrangling-1}}

\begin{itemize}
\tightlist
\item
  Create a function
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wrangle\_data \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(data)\{}

\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{data }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }
\StringTok{    }\CommentTok{\# Select variables }
\StringTok{    }\KeywordTok{select}\NormalTok{(age, c\_charge\_degree, race, age\_cat, v\_score\_text, sex, priors\_count, }
\NormalTok{         days\_b\_screening\_arrest, v\_decile\_score, is\_recid, two\_year\_recid) }\OperatorTok{\%\textgreater{}\%}\StringTok{            }
\StringTok{    }\CommentTok{\# Filter rows }
\StringTok{    }\KeywordTok{filter}\NormalTok{(days\_b\_screening\_arrest }\OperatorTok{\textless{}=}\StringTok{ }\DecValTok{30}\NormalTok{,}
\NormalTok{           days\_b\_screening\_arrest }\OperatorTok{\textgreater{}=}\StringTok{ }\DecValTok{{-}30}\NormalTok{, }
\NormalTok{           is\_recid }\OperatorTok{!=}\StringTok{ }\DecValTok{{-}1}\NormalTok{,}
\NormalTok{           c\_charge\_degree }\OperatorTok{!=}\StringTok{ "O"}\NormalTok{,}
\NormalTok{           v\_score\_text }\OperatorTok{!=}\StringTok{ \textquotesingle{}N/A\textquotesingle{}}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\CommentTok{\# Mutate variables }
\StringTok{    }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{c\_charge\_degree =} \KeywordTok{factor}\NormalTok{(c\_charge\_degree),}
           \DataTypeTok{age\_cat =} \KeywordTok{factor}\NormalTok{(age\_cat),}
           \DataTypeTok{race =} \KeywordTok{factor}\NormalTok{(race, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Caucasian"}\NormalTok{,}\StringTok{"African{-}American"}\NormalTok{,}\StringTok{"Hispanic"}\NormalTok{,}\StringTok{"Other"}\NormalTok{,}\StringTok{"Asian"}\NormalTok{,}\StringTok{"Native American"}\NormalTok{)),}
           \DataTypeTok{sex =} \KeywordTok{factor}\NormalTok{(sex, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Male"}\NormalTok{,}\StringTok{"Female"}\NormalTok{)),}
           \DataTypeTok{v\_score\_text =} \KeywordTok{factor}\NormalTok{(v\_score\_text, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Low"}\NormalTok{, }\StringTok{"Medium"}\NormalTok{, }\StringTok{"High"}\NormalTok{)),}
\CommentTok{\# I added this new variable to test whether measuring the DV as a binary or continuous var makes a difference }
           \DataTypeTok{score\_num =} \KeywordTok{as.numeric}\NormalTok{(v\_score\_text)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\CommentTok{\# Rename variables }
\StringTok{    }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{crime =}\NormalTok{ c\_charge\_degree,}
           \DataTypeTok{gender =}\NormalTok{ sex,}
           \DataTypeTok{score =}\NormalTok{ v\_score\_text)}
        
\KeywordTok{return}\NormalTok{(df)\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Apply the function to the data
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{wrangle\_data}\NormalTok{(two\_years\_violent)}

\KeywordTok{names}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "age"                     "crime"                  
##  [3] "race"                    "age_cat"                
##  [5] "score"                   "gender"                 
##  [7] "priors_count"            "days_b_screening_arrest"
##  [9] "v_decile_score"          "is_recid"               
## [11] "two_year_recid"          "score_num"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{head}\NormalTok{(df, }\DecValTok{5}\NormalTok{) }\CommentTok{\# Check whether the function works as expected }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 12
##     age crime race  age_cat score gender priors_count days_b_screenin~
##   <dbl> <fct> <fct> <fct>   <fct> <fct>         <dbl>            <dbl>
## 1    69 F     Other Greate~ Low   Male              0               -1
## 2    34 F     Afri~ 25 - 45 Low   Male              0               -1
## 3    44 M     Other 25 - 45 Low   Male              0                0
## 4    43 F     Other 25 - 45 Low   Male              3               -1
## 5    39 M     Cauc~ 25 - 45 Low   Female            0               -1
## # ... with 4 more variables: v_decile_score <dbl>, is_recid <dbl>,
## #   two_year_recid <dbl>, score_num <dbl>
\end{verbatim}

\hypertarget{descriptive-analysis-1}{%
\subsubsection{Descriptive analysis}\label{descriptive-analysis-1}}

\begin{itemize}
\tightlist
\item
  Score distribution
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(score) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ score, }\DataTypeTok{y =}\NormalTok{ n)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Score"}\NormalTok{,}
         \DataTypeTok{y =} \StringTok{"Count"}\NormalTok{,}
         \DataTypeTok{title =} \StringTok{"Score distribution"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-95-1.pdf}

\begin{itemize}
\tightlist
\item
  Score distribution by race
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\KeywordTok{ordered}\NormalTok{(v\_decile\_score))) }\OperatorTok{+}\StringTok{ }
\StringTok{          }\KeywordTok{geom\_bar}\NormalTok{() }\OperatorTok{+}
\StringTok{          }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{race, }\DataTypeTok{nrow =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{          }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Decile Score"}\NormalTok{,}
               \DataTypeTok{y =} \StringTok{"Count"}\NormalTok{,}
               \DataTypeTok{Title =} \StringTok{"Defendant\textquotesingle{}s Decile Score"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-96-1.pdf}

\hypertarget{modeling-1}{%
\subsubsection{Modeling}\label{modeling-1}}

After filtering out bad rows, our first question is whether there is a significant difference in COMPAS scores between races. To answer it, we need to convert some variables into factors and run a logistic regression comparing low scores to high scores.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model\_data \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(data)\{}

\CommentTok{\# Logistic regression model}
\NormalTok{lr\_model \textless{}{-}}\StringTok{ }\KeywordTok{glm}\NormalTok{(score }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gender }\OperatorTok{+}\StringTok{ }\NormalTok{age\_cat }\OperatorTok{+}\StringTok{ }\NormalTok{race }\OperatorTok{+}\StringTok{ }\NormalTok{priors\_count }\OperatorTok{+}\StringTok{ }\NormalTok{crime }\OperatorTok{+}\StringTok{ }\NormalTok{two\_year\_recid, }
             \DataTypeTok{family =} \StringTok{"binomial"}\NormalTok{, }\DataTypeTok{data =}\NormalTok{ data)}

\CommentTok{\# OLS}
\NormalTok{ols\_model1 \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(score\_num }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gender }\OperatorTok{+}\StringTok{ }\NormalTok{age\_cat }\OperatorTok{+}\StringTok{ }\NormalTok{race }\OperatorTok{+}\StringTok{ }\NormalTok{priors\_count }\OperatorTok{+}\StringTok{ }\NormalTok{crime }\OperatorTok{+}\StringTok{ }\NormalTok{two\_year\_recid, }
             \DataTypeTok{data =}\NormalTok{ data)}

\NormalTok{ols\_model2 \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(v\_decile\_score }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gender }\OperatorTok{+}\StringTok{ }\NormalTok{age\_cat }\OperatorTok{+}\StringTok{ }\NormalTok{race }\OperatorTok{+}\StringTok{ }\NormalTok{priors\_count }\OperatorTok{+}\StringTok{ }\NormalTok{crime }\OperatorTok{+}\StringTok{ }\NormalTok{two\_year\_recid, }
             \DataTypeTok{data =}\NormalTok{ data)}

\CommentTok{\# Extract model outcomes with confidence intervals }
\NormalTok{lr\_est \textless{}{-}}\StringTok{ }\NormalTok{lr\_model }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }

\NormalTok{ols\_est1 \textless{}{-}}\StringTok{ }\NormalTok{ols\_model1 }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }

\NormalTok{ols\_est2 \textless{}{-}}\StringTok{ }\NormalTok{ols\_model2 }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }

\CommentTok{\# AIC scores }
\NormalTok{lr\_AIC \textless{}{-}}\StringTok{ }\KeywordTok{AIC}\NormalTok{(lr\_model)}
\NormalTok{ols\_AIC1 \textless{}{-}}\StringTok{ }\KeywordTok{AIC}\NormalTok{(ols\_model1)}
\NormalTok{ols\_AIC2 \textless{}{-}}\StringTok{ }\KeywordTok{AIC}\NormalTok{(ols\_model2)}
    
\KeywordTok{list}\NormalTok{(lr\_est, ols\_est1, ols\_est2, lr\_AIC, ols\_AIC1, ols\_AIC2)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Model comparisons
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{"AIC score of logistic regression: \{model\_data(df)[4]\} }
\StringTok{      AIC score of OLS regression (with categorical DV):  \{model\_data(df)[5]\}}
\StringTok{      AIC score of OLS regression (with continuous DV): \{model\_data(df)[6]\}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## AIC score of logistic regression: 3022.77943765996 
## AIC score of OLS regression (with categorical DV):  5414.49127581608
## AIC score of OLS regression (with continuous DV): 15458.3861723106
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Logistic regression model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{lr\_model \textless{}{-}}\StringTok{ }\KeywordTok{model\_data}\NormalTok{(df)[}\DecValTok{1}\NormalTok{] }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{data.frame}\NormalTok{()}

\NormalTok{lr\_model }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(term }\OperatorTok{!=}\StringTok{ "(Intercept)"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{term =} \KeywordTok{gsub}\NormalTok{(}\StringTok{"race|age\_cat|gender"}\NormalTok{,}\StringTok{""}\NormalTok{, term)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(term, estimate), }\DataTypeTok{y =}\NormalTok{ estimate, }\DataTypeTok{ymax =}\NormalTok{ conf.high, }\DataTypeTok{ymin =}\NormalTok{ conf.low)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_pointrange}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{"Estimate"}\NormalTok{, }\DataTypeTok{x =} \StringTok{""}\NormalTok{,}
      \DataTypeTok{title =} \StringTok{"Logistic regression"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_hline}\NormalTok{(}\DataTypeTok{yintercept =} \DecValTok{0}\NormalTok{, }\DataTypeTok{linetype =} \StringTok{"dashed"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-99-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{interpret\_estimate}\NormalTok{(lr\_model) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{term =} \KeywordTok{gsub}\NormalTok{(}\StringTok{"race|age\_cat|gender"}\NormalTok{,}\StringTok{""}\NormalTok{, term)) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(term, likelihood), }\DataTypeTok{y =}\NormalTok{ likelihood)) }\OperatorTok{+}
\StringTok{        }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{size =} \DecValTok{3}\NormalTok{) }\OperatorTok{+}
\StringTok{        }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{        }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{"Likelihood"}\NormalTok{, }\DataTypeTok{x =} \StringTok{""}\NormalTok{,}
            \DataTypeTok{title =}\StringTok{"Logistic regression"}\NormalTok{) }\OperatorTok{+}
\StringTok{        }\KeywordTok{scale\_y\_continuous}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\KeywordTok{percent\_format}\NormalTok{(}\DataTypeTok{accuracy =} \DecValTok{1}\NormalTok{)) }\OperatorTok{+}
\StringTok{        }\KeywordTok{geom\_hline}\NormalTok{(}\DataTypeTok{yintercept =} \DecValTok{1}\NormalTok{, }\DataTypeTok{linetype =} \StringTok{"dashed"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-100-1.pdf}

\hypertarget{bias-in-the-algorithm}{%
\subsection{Bias in the algorithm}\label{bias-in-the-algorithm}}

\begin{itemize}
\item
  To test whether COMPAS scores do an accurate job of deciding whether an offender is low, medium, or high risk, we ran a Cox proportional hazards model. Northpointe, the company that created COMPAS and markets it to law enforcement, also ran a Cox model in \href{https://journals.sagepub.com/doi/abs/10.1177/0093854808326545}{their validation study}.
\item
  We used the counting model and removed people when they were incarcerated. Due to errors in the underlying jail data, we need to filter out 32 rows whose end date is not later than their start date. Considering that there are 13,334 total rows in the data, such a small number of errors will not affect the results.
\end{itemize}

\hypertarget{setup-6}{%
\subsubsection{Setup}\label{setup-6}}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{ tidyverse, }\CommentTok{\# tidyverse packages }
\NormalTok{ conflicted, }\CommentTok{\# an alternative conflict resolution strategy }
\NormalTok{ ggthemes, }\CommentTok{\# other themes for ggplot2 }
\NormalTok{ patchwork, }\CommentTok{\# arranging ggplots}
\NormalTok{ scales, }\CommentTok{\# rescaling }
\NormalTok{ survival, }\CommentTok{\# survival analysis}
\NormalTok{ broom, }\CommentTok{\# for modeling}
\NormalTok{ here, }\CommentTok{\# reproducibility }
\NormalTok{ glue, }\CommentTok{\# pasting strings and objects }
\NormalTok{ reticulate }\CommentTok{\# source python codes}
\NormalTok{)}

\CommentTok{\# To avoid conflicts }
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"filter"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Removing existing preference
\end{verbatim}

\begin{verbatim}
## [conflicted] Will prefer dplyr::filter over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"select"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Removing existing preference
\end{verbatim}

\begin{verbatim}
## [conflicted] Will prefer dplyr::select over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Set themes }
\KeywordTok{theme\_set}\NormalTok{(ggthemes}\OperatorTok{::}\KeywordTok{theme\_fivethirtyeight}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\hypertarget{load-data-2}{%
\subsubsection{Load data}\label{load-data-2}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{cox\_data \textless{}{-}}\StringTok{ }\KeywordTok{read\_csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{ ,}\StringTok{"cox{-}parsed.csv"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Duplicated column names deduplicated: 'decile_score' =>
## 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
\end{verbatim}

\begin{verbatim}
## 
## -- Column specification ---------------------------
## cols(
##   .default = col_character(),
##   id = col_double(),
##   compas_screening_date = col_date(format = ""),
##   dob = col_date(format = ""),
##   age = col_double(),
##   juv_fel_count = col_double(),
##   decile_score = col_double(),
##   juv_misd_count = col_double(),
##   juv_other_count = col_double(),
##   priors_count = col_double(),
##   days_b_screening_arrest = col_double(),
##   c_jail_in = col_datetime(format = ""),
##   c_jail_out = col_datetime(format = ""),
##   c_offense_date = col_date(format = ""),
##   c_arrest_date = col_date(format = ""),
##   c_days_from_compas = col_double(),
##   is_recid = col_double(),
##   r_days_from_arrest = col_double(),
##   r_offense_date = col_date(format = ""),
##   r_jail_in = col_date(format = ""),
##   r_jail_out = col_date(format = "")
##   # ... with 13 more columns
## )
## i Use `spec()` for the full column specifications.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{"N of observations (rows): \{nrow(cox\_data)\}}
\StringTok{      N of variables (columns): \{ncol(cox\_data)\}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## N of observations (rows): 13419
## N of variables (columns): 52
\end{verbatim}

\hypertarget{wrangling-2}{%
\subsubsection{Wrangling}\label{wrangling-2}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{cox\_data }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\KeywordTok{filter}\NormalTok{(score\_text }\OperatorTok{!=}\StringTok{ "N/A"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{filter}\NormalTok{(end }\OperatorTok{\textgreater{}}\StringTok{ }\NormalTok{start) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{c\_charge\_degree =} \KeywordTok{factor}\NormalTok{(c\_charge\_degree),}
           \DataTypeTok{age\_cat =} \KeywordTok{factor}\NormalTok{(age\_cat),}
           \DataTypeTok{race =} \KeywordTok{factor}\NormalTok{(race, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Caucasian"}\NormalTok{,}\StringTok{"African{-}American"}\NormalTok{,}\StringTok{"Hispanic"}\NormalTok{,}\StringTok{"Other"}\NormalTok{,}\StringTok{"Asian"}\NormalTok{,}\StringTok{"Native American"}\NormalTok{)),}
           \DataTypeTok{sex =} \KeywordTok{factor}\NormalTok{(sex, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Male"}\NormalTok{,}\StringTok{"Female"}\NormalTok{)),}
           \DataTypeTok{score\_factor =} \KeywordTok{factor}\NormalTok{(score\_text, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"Low"}\NormalTok{, }\StringTok{"Medium"}\NormalTok{, }\StringTok{"High"}\NormalTok{)))}

\NormalTok{grp \textless{}{-}}\StringTok{ }\NormalTok{df[}\OperatorTok{!}\KeywordTok{duplicated}\NormalTok{(df}\OperatorTok{$}\NormalTok{id),]}
\end{Highlighting}
\end{Shaded}

\hypertarget{descriptive-analysis-2}{%
\subsubsection{Descriptive analysis}\label{descriptive-analysis-2}}

\begin{itemize}
\tightlist
\item
  Score distribution
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{grp }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{    }\KeywordTok{group\_by}\NormalTok{(score\_factor) }\OperatorTok{\%\textgreater{}\%}
\StringTok{      }\KeywordTok{count}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{      }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ score\_factor, }\DataTypeTok{y =}\NormalTok{ n)) }\OperatorTok{+}
\StringTok{        }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{        }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Score"}\NormalTok{,}
             \DataTypeTok{y =} \StringTok{"Count"}\NormalTok{,}
             \DataTypeTok{title =} \StringTok{"Score distribution"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-104-1.pdf}

\begin{itemize}
\tightlist
\item
  Score distribution by race
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\KeywordTok{ordered}\NormalTok{(score\_factor))) }\OperatorTok{+}\StringTok{ }
\StringTok{          }\KeywordTok{geom\_bar}\NormalTok{() }\OperatorTok{+}
\StringTok{          }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{race, }\DataTypeTok{nrow =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{          }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Decile Score"}\NormalTok{,}
               \DataTypeTok{y =} \StringTok{"Count"}\NormalTok{,}
               \DataTypeTok{Title =} \StringTok{"Defendant\textquotesingle{}s Decile Score"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-105-1.pdf}

\hypertarget{modeling-2}{%
\subsubsection{Modeling}\label{modeling-2}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{f2 \textless{}{-}}\StringTok{ }\KeywordTok{Surv}\NormalTok{(start, end, event, }\DataTypeTok{type=}\StringTok{"counting"}\NormalTok{) }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{race }\OperatorTok{+}\StringTok{ }\NormalTok{score\_factor }\OperatorTok{+}\StringTok{ }\NormalTok{race }\OperatorTok{*}\StringTok{ }\NormalTok{score\_factor}

\NormalTok{model \textless{}{-}}\StringTok{ }\KeywordTok{coxph}\NormalTok{(f2, }\DataTypeTok{data =}\NormalTok{ df)}

\NormalTok{model }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{broom}\OperatorTok{::}\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{term =} \KeywordTok{gsub}\NormalTok{(}\StringTok{"race|score\_factor"}\NormalTok{,}\StringTok{""}\NormalTok{, term)) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{filter}\NormalTok{(term }\OperatorTok{!=}\StringTok{ "\textless{}chr\textgreater{}"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(term, estimate), }\DataTypeTok{y =}\NormalTok{ estimate, }\DataTypeTok{ymax =}\NormalTok{ conf.high, }\DataTypeTok{ymin =}\NormalTok{ conf.low)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_pointrange}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{"Estimate"}\NormalTok{, }\DataTypeTok{x =} \StringTok{""}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-106-1.pdf}

The interaction term shows a disparity similar to that in the logistic regression above.

High-risk white defendants are 3.61 times as likely to recidivate as low-risk white defendants, while high-risk Black defendants are 2.99 times as likely as low-risk Black defendants.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{visualize\_surv \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(input)\{}
  
\NormalTok{f \textless{}{-}}\StringTok{ }\KeywordTok{Surv}\NormalTok{(start, end, event, }\DataTypeTok{type=}\StringTok{"counting"}\NormalTok{) }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{score\_factor}

\NormalTok{fit \textless{}{-}}\StringTok{ }\KeywordTok{survfit}\NormalTok{(f, }\DataTypeTok{data =}\NormalTok{ input)}

\NormalTok{fit }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{strata =} \KeywordTok{gsub}\NormalTok{(}\StringTok{"score\_factor="}\NormalTok{,}\StringTok{""}\NormalTok{, strata)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{strata =} \KeywordTok{factor}\NormalTok{(strata, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"High"}\NormalTok{,}\StringTok{"Medium"}\NormalTok{,}\StringTok{"Low"}\NormalTok{))) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ time, }\DataTypeTok{y =}\NormalTok{ estimate, }\DataTypeTok{ymax =}\NormalTok{ conf.high, }\DataTypeTok{ymin =}\NormalTok{ conf.low, }\DataTypeTok{group =}\NormalTok{ strata, }\DataTypeTok{col =}\NormalTok{ strata)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_pointrange}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.1}\NormalTok{) }\OperatorTok{+}
\StringTok{    }\KeywordTok{guides}\NormalTok{(}\DataTypeTok{colour =} \KeywordTok{guide\_legend}\NormalTok{(}\DataTypeTok{override.aes =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{alpha =} \DecValTok{1}\NormalTok{))) }\OperatorTok{+}
\StringTok{    }\KeywordTok{ylim}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{)) }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"Time"}\NormalTok{, }\DataTypeTok{y =} \StringTok{"Estimated survival rate"}\NormalTok{, }\DataTypeTok{col =} \StringTok{"Strata"}\NormalTok{)\}}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{visualize\_surv}\NormalTok{(df) }\OperatorTok{+}\StringTok{ }\KeywordTok{ggtitle}\NormalTok{(}\StringTok{"Overall"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-108-1.pdf}

Black defendants do recidivate at higher rates according to race-specific Kaplan--Meier plots.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{filter}\NormalTok{(race }\OperatorTok{==}\StringTok{ "Caucasian"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{visualize\_surv}\NormalTok{() }\OperatorTok{+}\StringTok{ }\KeywordTok{ggtitle}\NormalTok{(}\StringTok{"Caucasian"}\NormalTok{)) }\OperatorTok{/}
\NormalTok{(df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{filter}\NormalTok{(race }\OperatorTok{==}\StringTok{ "African{-}American"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{visualize\_surv}\NormalTok{() }\OperatorTok{+}\StringTok{ }\KeywordTok{ggtitle}\NormalTok{(}\StringTok{"African{-}American"}\NormalTok{)) }
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-109-1.pdf}

In terms of underlying recidivism rates, we can look at gender-specific Kaplan--Meier estimates. There is a striking difference between women and men.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{filter}\NormalTok{(sex }\OperatorTok{==}\StringTok{ "Female"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{visualize\_surv}\NormalTok{() }\OperatorTok{+}\StringTok{ }\KeywordTok{ggtitle}\NormalTok{(}\StringTok{"Female"}\NormalTok{)) }\OperatorTok{/}

\NormalTok{(df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{filter}\NormalTok{(sex }\OperatorTok{==}\StringTok{ "Male"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{visualize\_surv}\NormalTok{() }\OperatorTok{+}\StringTok{ }\KeywordTok{ggtitle}\NormalTok{(}\StringTok{"Male"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-110-1.pdf}

As these plots show, the COMPAS score treats a high-risk woman the same as a medium-risk man.

\hypertarget{risk-of-recidivism-accuracy}{%
\subsubsection{Risk of Recidivism accuracy}\label{risk-of-recidivism-accuracy}}

The above analysis shows that the COMPAS algorithm does overpredict African-American defendants' future recidivism, but we haven't yet explored the direction of the bias. We can discover fine differences in overprediction and underprediction by comparing COMPAS scores across racial lines.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a new environment }
\KeywordTok{conda\_create}\NormalTok{(}\StringTok{"r{-}reticulate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "/home/jae/.local/share/r-miniconda/envs/r-reticulate/bin/python"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# install libs }
\KeywordTok{conda\_install}\NormalTok{(}\StringTok{"r{-}reticulate"}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\StringTok{"pandas"}\NormalTok{))}

\CommentTok{\# indicate that we want to use a specific condaenv}
\KeywordTok{use\_condaenv}\NormalTok{(}\StringTok{"r{-}reticulate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]

\ImportTok{from}\NormalTok{ truth\_tables }\ImportTok{import}\NormalTok{ PeekyReader, Person, table, is\_race, count, vtable, hightable, vhightable}
\ImportTok{from}\NormalTok{ csv }\ImportTok{import}\NormalTok{ DictReader}

\NormalTok{people }\OperatorTok{=}\NormalTok{ []}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]

\ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(}\StringTok{"./data/cox{-}parsed.csv"}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{    reader }\OperatorTok{=}\NormalTok{ PeekyReader(DictReader(f))}
    \ControlFlowTok{try}\NormalTok{:}
        \ControlFlowTok{while} \VariableTok{True}\NormalTok{:}
\NormalTok{            p }\OperatorTok{=}\NormalTok{ Person(reader)}
            \ControlFlowTok{if}\NormalTok{ p.valid:}
\NormalTok{                people.append(p)}
    \ControlFlowTok{except} \PreprocessorTok{StopIteration}\NormalTok{:}
        \ControlFlowTok{pass}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{pop }\OperatorTok{=} \BuiltInTok{list}\NormalTok{(}\BuiltInTok{filter}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ i: ((i.recidivist }\OperatorTok{==} \VariableTok{True} \KeywordTok{and}\NormalTok{ i.lifetime }\OperatorTok{\textless{}=} \DecValTok{730}\NormalTok{) }\KeywordTok{or}
\NormalTok{                              i.lifetime }\OperatorTok{\textgreater{}} \DecValTok{730}\NormalTok{), }\BuiltInTok{list}\NormalTok{(}\BuiltInTok{filter}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ x: x.score\_valid, people))))}

\NormalTok{recid }\OperatorTok{=} \BuiltInTok{list}\NormalTok{(}\BuiltInTok{filter}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ i: i.recidivist }\OperatorTok{==} \VariableTok{True} \KeywordTok{and}\NormalTok{ i.lifetime }\OperatorTok{\textless{}=} \DecValTok{730}\NormalTok{, pop))}

\NormalTok{rset }\OperatorTok{=} \BuiltInTok{set}\NormalTok{(recid)}

\NormalTok{surv }\OperatorTok{=}\NormalTok{ [i }\ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in}\NormalTok{ pop }\ControlFlowTok{if}\NormalTok{ i }\KeywordTok{not} \KeywordTok{in}\NormalTok{ rset]}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Define a function for a table.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd }

\KeywordTok{def}\NormalTok{ create\_table(x, y):}

\NormalTok{  t }\OperatorTok{=}\NormalTok{ table(}\BuiltInTok{list}\NormalTok{(x), }\BuiltInTok{list}\NormalTok{(y))}
  
\NormalTok{  df }\OperatorTok{=}\NormalTok{ pd.DataFrame(t.items(), }
\NormalTok{             columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}Metrics\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Scores\textquotesingle{}}\NormalTok{])}
             
  \ControlFlowTok{return}\NormalTok{(df)}
             
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  All defendants
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{create\_table(}\BuiltInTok{list}\NormalTok{(recid), }\BuiltInTok{list}\NormalTok{(surv)).to\_csv(}\StringTok{"data/table\_recid.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"table\_recid.csv"}\NormalTok{))[,}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Metrics, }\DataTypeTok{y =}\NormalTok{ Scores)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Recidivism"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-117-1.pdf}

The false positive rate is higher for African Americans at 44.85\% and lower for whites at 23.45\%.

\begin{Shaded}
\begin{Highlighting}[]

\KeywordTok{def}\NormalTok{ create\_comp\_tables(recid\_data, surv\_data):}
  
    \CommentTok{\# filtering variables }
\NormalTok{    is\_afam }\OperatorTok{=}\NormalTok{ is\_race(}\StringTok{"African{-}American"}\NormalTok{)}
\NormalTok{    is\_white }\OperatorTok{=}\NormalTok{ is\_race(}\StringTok{"Caucasian"}\NormalTok{)}
  
    \CommentTok{\# dfs }
\NormalTok{    df1 }\OperatorTok{=}\NormalTok{ create\_table(}\BuiltInTok{filter}\NormalTok{(is\_afam, recid\_data),}
                       \BuiltInTok{filter}\NormalTok{(is\_afam, surv\_data))}
  
\NormalTok{    df2 }\OperatorTok{=}\NormalTok{ create\_table(}\BuiltInTok{filter}\NormalTok{(is\_white, recid\_data), }
                       \BuiltInTok{filter}\NormalTok{(is\_white, surv\_data))}
  
    \CommentTok{\# concat }
\NormalTok{    dfs }\OperatorTok{=}\NormalTok{ pd.concat([df1, df2])}
    
\NormalTok{    dfs[}\StringTok{\textquotesingle{}Group\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}African Americans\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}African Americans\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Whites\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Whites\textquotesingle{}}\NormalTok{]}
    
    \ControlFlowTok{return}\NormalTok{(dfs)}
    
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{create\_comp\_tables(recid, surv).to\_csv(}\StringTok{"data/comp\_tables\_recid.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"comp\_tables\_recid.csv"}\NormalTok{))[,}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Metrics, }\DataTypeTok{y =}\NormalTok{ Scores, }\DataTypeTok{fill =}\NormalTok{ Group)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\DataTypeTok{position =} \StringTok{"dodge"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Recidivism"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-120-1.pdf}

\hypertarget{risk-of-violent-recidivism-accuracy}{%
\subsubsection{Risk of Violent Recidivism accuracy}\label{risk-of-violent-recidivism-accuracy}}

COMPAS also offers a score that aims to measure a person's risk of violent recidivism, which has a similar overall accuracy to the Recidivism score.

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{vpeople }\OperatorTok{=}\NormalTok{ []}

\ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(}\StringTok{"./data/cox{-}violent{-}parsed.csv"}\NormalTok{) }\ImportTok{as}\NormalTok{ f:}
\NormalTok{    reader }\OperatorTok{=}\NormalTok{ PeekyReader(DictReader(f))}
    \ControlFlowTok{try}\NormalTok{:}
        \ControlFlowTok{while} \VariableTok{True}\NormalTok{:}
\NormalTok{            p }\OperatorTok{=}\NormalTok{ Person(reader)}
            \ControlFlowTok{if}\NormalTok{ p.valid:}
\NormalTok{                vpeople.append(p)}
    \ControlFlowTok{except} \PreprocessorTok{StopIteration}\NormalTok{:}
        \ControlFlowTok{pass}

\NormalTok{vpop }\OperatorTok{=} \BuiltInTok{list}\NormalTok{(}\BuiltInTok{filter}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ i: ((i.violent\_recidivist }\OperatorTok{==} \VariableTok{True} \KeywordTok{and}\NormalTok{ i.lifetime }\OperatorTok{\textless{}=} \DecValTok{730}\NormalTok{) }\KeywordTok{or}
\NormalTok{                              i.lifetime }\OperatorTok{\textgreater{}} \DecValTok{730}\NormalTok{), }\BuiltInTok{list}\NormalTok{(}\BuiltInTok{filter}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ x: x.vscore\_valid, vpeople))))}

\NormalTok{vrecid }\OperatorTok{=} \BuiltInTok{list}\NormalTok{(}\BuiltInTok{filter}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ i: i.violent\_recidivist }\OperatorTok{==} \VariableTok{True} \KeywordTok{and}\NormalTok{ i.lifetime }\OperatorTok{\textless{}=} \DecValTok{730}\NormalTok{, vpeople))}

\NormalTok{vrset }\OperatorTok{=} \BuiltInTok{set}\NormalTok{(vrecid)}

\NormalTok{vsurv }\OperatorTok{=}\NormalTok{ [i }\ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in}\NormalTok{ vpop }\ControlFlowTok{if}\NormalTok{ i }\KeywordTok{not} \KeywordTok{in}\NormalTok{ vrset]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{create\_table(vrecid, vsurv).to\_csv(}\StringTok{"data/table\_vrecid.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"table\_vrecid.csv"}\NormalTok{))[,}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Metrics, }\DataTypeTok{y =}\NormalTok{ Scores)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Violent recidivism"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-123-1.pdf}

Even more so for Black defendants.

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{create\_comp\_tables(vrecid, vsurv).to\_csv(}\StringTok{"data/comp\_tables\_vrecid.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"comp\_tables\_vrecid.csv"}\NormalTok{))[,}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Metrics, }\DataTypeTok{y =}\NormalTok{ Scores, }\DataTypeTok{fill =}\NormalTok{ Group)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\DataTypeTok{position =} \StringTok{"dodge"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Violent recidivism"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{06_high_dimensional_data_files/figure-latex/unnamed-chunk-125-1.pdf}

\hypertarget{references-2}{%
\section{References}\label{references-2}}

\hypertarget{books}{%
\subsection{Books}\label{books}}

\begin{itemize}
\item
  \emph{An Introduction to Statistical Learning - with Applications in R (2013)} by Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani. Springer: New York. \href{https://www.amazon.com/Introduction-Statistical-Learning-Applications-Statistics/dp/1461471370}{Amazon} or \href{http://www-bcf.usc.edu/~gareth/ISL/}{free PDF}.
\item
  \emph{Hands-On Machine Learning with R (2020)} by Bradley Boehmke \& Brandon Greenwell. \href{https://www.routledge.com/Hands-On-Machine-Learning-with-R/Boehmke-Greenwell/p/book/9781138495685}{CRC Press} or \href{https://www.amazon.com/gp/product/1138495689?pf_rd_p=ab873d20-a0ca-439b-ac45-cd78f07a84d8\&pf_rd_r=JBRX0ZJ1WFSR9T3JPTQE}{Amazon}
\item
  \emph{Applied Predictive Modeling (2013)} by Max Kuhn and Kjell Johnson. Springer: New York. \href{https://www.amazon.com/Applied-Predictive-Modeling-Max-Kuhn/dp/1461468485?SubscriptionId=0ENGV10E9K9QDNSJ5C82\&tag=apm0a-20\&linkCode=xm2\&camp=2025\&creative=165953\&creativeASIN=1461468485}{Amazon}
\item
  \emph{Feature Engineering and Selection: A Practical Approach for Predictive Models (2019)} by Kjell Johnson and Max Kuhn. Taylor \& Francis. \href{http://www.feat.engineering/}{Amazon} or \href{http://www.feat.engineering/}{free HTML}.
\item
  \emph{\href{https://www.tmwr.org/}{Tidy Modeling with R} (2020)} by Max Kuhn and Julia Silge (work-in-progress)
\end{itemize}

\hypertarget{lecture-slides}{%
\subsection{Lecture slides}\label{lecture-slides}}

\begin{itemize}
\item
  \href{https://www.nber.org/econometrics_minicourse_2015/nber_slides11.pdf}{An introduction to supervised and unsupervised learning (2015)} by Susan Athey and Guido Imbens
\item
  \href{https://education.rstudio.com/blog/2020/02/conf20-intro-ml/}{Introduction to Machine Learning with the Tidyverse} by Alison Hill
\end{itemize}

\hypertarget{blog-posts}{%
\subsection{Blog posts}\label{blog-posts}}

\begin{itemize}
\tightlist
\item
  \href{http://www.rebeccabarter.com/blog/2019-06-06_pre_processing/}{``Using the recipes package for easy pre-processing''} by Rebecca Barter
\end{itemize}

\hypertarget{big_data}{%
\chapter{Big data}\label{big_data}}

\hypertarget{overview-2}{%
\section{Overview}\label{overview-2}}

\begin{itemize}
\tightlist
\item
  Big data problem: data is too big to fit into memory (=local environment).
\item
  R reads data into random-access memory (RAM) at once and this object lives in memory entirely. So, if object.size \textgreater{} memory.size, the process will crash R.
\item
  Therefore, the key to dealing with big data in R is reducing the size of the data you want to bring into it.
\end{itemize}

\textbf{Techniques to deal with big data}

\begin{itemize}
\tightlist
\item
  Medium-sized file (1--2 GB)

  \begin{itemize}
  \tightlist
  \item
    Try to reduce the size of the file using slicing and dicing
  \item
    Tools:

    \begin{itemize}
    \tightlist
    \item
      R: \texttt{data.table::fread(file\ path,\ select\ =\ c("column\ 1",\ "column\ 2"))}. This command imports data faster than \texttt{read.csv()} does.
    \item
      Command line: \href{https://csvkit.readthedocs.io/en/latest/}{\texttt{csvkit}} - a suite of command-line tools for converting to and working with CSV
    \end{itemize}
  \end{itemize}
\item
  Large file (\textgreater{} 2-10 GB)

  \begin{itemize}
  \tightlist
  \item
    Put the data into a database and \textbf{ACCESS} it
  \item
    Explore the data and pull the objects of interest
  \end{itemize}
\end{itemize}

\textbf{Databases}

\begin{itemize}
\tightlist
\item
  Types of databases

  \begin{itemize}
  \tightlist
  \item
    Relational database = a \textbf{collection} of \textbf{tables} (fixed columns and rows): SQL is a staple tool to define, \textbf{query} (focus of the workshop today), control, and manipulate this type of database
  \item
    Non-relational database = a collection of documents (MongoDB), key-values (Redis and DynamoDB), wide-column stores (Cassandra and HBase), or graphs (Neo4j and JanusGraph). Note that this type of database does not preclude SQL. NoSQL stands for \href{https://www.mongodb.com/nosql-explained}{``not only SQL.''}
  \end{itemize}
\end{itemize}

\textbf{Relational database example}

\begin{figure}
\centering
\includegraphics{https://sp.mysqltutorial.org/wp-content/uploads/2009/12/MySQL-Sample-Database-Schema.png}
\caption{Relational Database. Source: MySQL Tutorial}
\end{figure}

\hypertarget{sql}{%
\section{SQL}\label{sql}}

\begin{itemize}
\item
  Structured Query Language. Originally called SEQUEL and developed by IBM Corporation in the 1970s.
\item
  Remains the standard language for a relational database management system.
\item
  It's a DECLARATIVE language (\href{https://www.sqlite.org/queryplanner.html}{what to do \textgreater{} how to do})

  \begin{itemize}
  \tightlist
  \item
    Database management systems figure out the optimal way to execute a query (query optimization)
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT} \KeywordTok{COLUMN} \KeywordTok{FROM} \KeywordTok{TABLE} 
\end{Highlighting}
\end{Shaded}

\hypertarget{learning-objectives}{%
\subsection{Learning objectives}\label{learning-objectives}}

\begin{itemize}
\item
  Embracing a new mindset: shifting from ownership (opening CSVs stored in your laptop) to access (accessing data stored in a database)
\item
  Learning how to use R and SQL to access and query a database
\end{itemize}

\hypertarget{sql-and-r}{%
\subsection{SQL and R}\label{sql-and-r}}

\begin{itemize}
\tightlist
\item
  SQL and R
\end{itemize}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[b]{0.14\columnwidth}\raggedright
SQL\strut
\end{minipage} & \begin{minipage}[b]{0.80\columnwidth}\raggedright
R\strut
\end{minipage}\tabularnewline
\midrule
\endhead
\begin{minipage}[t]{0.14\columnwidth}\raggedright
SELECT\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
select() for columns, mutate() for expressions, summarise() for aggregates\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
FROM\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
which data frame\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
WHERE\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
filter()\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
GROUP BY\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
group\_by()\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
HAVING\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
filter() \textbf{after group\_by()}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
ORDER BY\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
arrange()\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
LIMIT\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
head()\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\textbf{Challenge 1}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Can you tell me the difference in the order in which the following \texttt{R} and \texttt{SQL} code were written to manipulate data? For instance, in R, what command comes first? In contrast, in SQL, what command comes first?
\end{enumerate}

\begin{itemize}
\tightlist
\item
  R example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{data }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Data }
\StringTok{  }\KeywordTok{select}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Column}
\StringTok{  }\KeywordTok{filter}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Row }
\StringTok{  }\KeywordTok{group\_by}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Group by }
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{n =} \KeywordTok{n}\NormalTok{()) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# n() is one of the aggregate functions in r; it\textquotesingle{}s count() used inside summarise() function }
\StringTok{  }\KeywordTok{filter}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Row }
\StringTok{  }\KeywordTok{order\_by}\NormalTok{() }\CommentTok{\# Arrange }
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  SQL example (in a SQL chunk, use \texttt{-\/-} instead of \texttt{\#} to comment)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\KeywordTok{SELECT} \KeywordTok{column}\NormalTok{, aggregation (}\FunctionTok{count}\NormalTok{())\textasciigrave{} }\CommentTok{{-}{-} Column}

\KeywordTok{FROM} \KeywordTok{data}\NormalTok{ \# }\KeywordTok{Data} 

\KeywordTok{WHERE}\NormalTok{ condition }\CommentTok{{-}{-} Filter rows }

\KeywordTok{GROUP} \KeywordTok{BY} \KeywordTok{column} \CommentTok{{-}{-} Group by}

\KeywordTok{HAVING}\NormalTok{ condition }\CommentTok{{-}{-} Filter rows after group by  }

\KeywordTok{ORDER} \KeywordTok{BY} \KeywordTok{column} \CommentTok{{-}{-} Arrange }
\end{Highlighting}
\end{Shaded}

\begin{figure}
\centering
\includegraphics{https://wizardzines.com/zines/sql/samples/from.png}
\caption{SQL Zine by \href{https://jvns.ca/}{Julia Evans}}
\end{figure}

\hypertarget{setup-7}{%
\subsection{Setup}\label{setup-7}}

Let's get to work.

\hypertarget{packages-1}{%
\subsection{Packages}\label{packages-1}}

\begin{itemize}
\tightlist
\item
  \texttt{pacman::p\_load()} reduces steps for installing and loading several packages simultaneously.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# pacman }
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The rest of pkgs }
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
 
\NormalTok{ tidyverse, }\CommentTok{\# tidyverse packages }
 
\NormalTok{ DBI, }\CommentTok{\# using SQL queries}
 
\NormalTok{ RSQLite, }\CommentTok{\# SQLite}
 
\NormalTok{ dbplyr, }\CommentTok{\# use database with dplyr }
 
\NormalTok{ glue, }\CommentTok{\# glue to automate workflow }
 
\NormalTok{ nycflights13 }\CommentTok{\# toy data }
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{nyc-flights-data}{%
\subsection{NYC flights data}\label{nyc-flights-data}}

\begin{itemize}
\tightlist
\item
  \href{https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236}{The flight on-time performance data} from the Bureau of Transportation Statistics of the U.S. government. The data goes back to 1987 and its size is more than 20 gigabytes. For practice, we only use a small subset of the original data (flight data departing NYC in 2013) provided by RStudio.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/245292d1ea724f6c3fd8a92063dcd7bfb9758d02/5751b/diagrams/relational-nycflights.png}
\caption{From RStudio.}
\end{figure}

\hypertarget{workflow-4}{%
\subsection{Workflow}\label{workflow-4}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Create/connect to a database
\end{enumerate}

\begin{itemize}
\item
  Note that the server can also be your laptop (called \href{https://en.wikipedia.org/wiki/Localhost\#:~:text=In\%20computer\%20networking\%2C\%20localhost\%20is,via\%20the\%20loopback\%20network\%20interface.}{localhost}).
\item
  Short answer: To do so, you need interfaces between R and a database. We use \href{https://github.com/r-dbi/RSQLite}{\texttt{RSQLite}} in this tutorial because it's easy to set up.
\item
  Long answer: The \texttt{DBI} package in R provides a client-side interface that allows \texttt{dplyr} to work with databases. DBI is automatically installed when you install \texttt{dbplyr}. However, you need to install a specific backend engine (a tool for communication between R and a database management system) for the database (e.g., \texttt{RMariaDB}, \texttt{RPostgres}, \texttt{RSQLite}). In this workshop, we use SQLite because it is the easiest to get started with. Personally, I love PostgreSQL because it's open-source and powerful enough to do \href{https://www.postgresql.org/docs/current/functions.html}{many amazing things} (e.g., text mining, geospatial analysis). If you want to build not only a data warehouse but also an analytical platform, consider using Spark (Hadoop).
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Copy a table to the database
\end{enumerate}

\begin{itemize}
\item
  Option 1: You can create a table and insert rows manually. In order to do that, you also need to define the data schema (the structure of the database).
\item
  Table

  \begin{itemize}
  \tightlist
  \item
    Collection of rows
  \item
    Collection of columns (fields or attributes)
  \item
    Each col has a type:

    \begin{itemize}
    \tightlist
    \item
      String: \texttt{VARCHAR(20)}
    \item
      Integer: \texttt{INTEGER}
    \item
      Floating-point: \texttt{FLOAT}, \texttt{DOUBLE}
    \item
      Date/time: \texttt{DATE}, \texttt{TIME}, \texttt{DATETIME}
    \end{itemize}
  \item
    \textbf{Schema}: the structure of the database

    \begin{itemize}
    \tightlist
    \item
      The table name
    \item
      The names and types of its columns
    \item
      Various optional additional information

      \begin{itemize}
      \tightlist
      \item
        \href{https://www.w3schools.com/sql/sql_constraints.asp}{Constraints}

        \begin{itemize}
        \tightlist
        \item
          Syntax: \texttt{column\ datatype\ constraint}
        \item
          Examples: \texttt{NOT\ NULL}, \texttt{UNIQUE}, \texttt{INDEX}
        \end{itemize}
      \end{itemize}
    \end{itemize}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\CommentTok{{-}{-} Create table }

\KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ students (}
    \KeywordTok{id} \DataTypeTok{INT}\NormalTok{ AUTO\_INCREMENT,}
\NormalTok{    name }\DataTypeTok{VARCHAR}\NormalTok{(}\DecValTok{30}\NormalTok{),}
\NormalTok{    birth }\DataTypeTok{DATE}\NormalTok{,}
\NormalTok{    gpa }\DataTypeTok{FLOAT}\NormalTok{,}
\NormalTok{    grad }\DataTypeTok{INT}\NormalTok{,}
    \KeywordTok{PRIMARY} \KeywordTok{KEY}\NormalTok{(}\KeywordTok{id}\NormalTok{));}

\CommentTok{{-}{-} Insert one additional row }

\KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ students(name, birth, gpa, grad)}
      \KeywordTok{VALUES}\NormalTok{ (}\StringTok{\textquotesingle{}Adam\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}2000{-}08{-}04\textquotesingle{}}\NormalTok{, }\FloatTok{4.0}\NormalTok{, }\DecValTok{2020}\NormalTok{);}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Option 2: Copy a file (object) to a table in a database using \texttt{copy\_to()}. We take this option as it's fast and we would like to focus on querying in this workshop.
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Query the table
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Main focus
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\item
  Pull the results of interest (\textbf{data}) using \texttt{collect()}
\item
  Disconnect the database
\end{enumerate}

\hypertarget{create-a-database}{%
\subsubsection{Create a database}\label{create-a-database}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define a backend engine }

\NormalTok{drv \textless{}{-}}\StringTok{ }\NormalTok{RSQLite}\OperatorTok{::}\KeywordTok{SQLite}\NormalTok{()}

\CommentTok{\# Create an empty in{-}memory database }
\NormalTok{con \textless{}{-}}\StringTok{ }\NormalTok{DBI}\OperatorTok{::}\KeywordTok{dbConnect}\NormalTok{(drv, }
                      \DataTypeTok{dbname =} \StringTok{":memory:"}\NormalTok{)}

\CommentTok{\# Connect to an existing database }
\CommentTok{\#con \textless{}{-} DBI::dbConnect(RMariaDB::MariaDB(), }
 \CommentTok{\# host = "database.rstudio.com",}
 \CommentTok{\# user = "hadley",}
 \CommentTok{\# password = rstudioapi::askForPassword("Database password")}
\CommentTok{\#)}

\KeywordTok{dbListTables}\NormalTok{(con)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## character(0)
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# character(0) = NULL}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Note that \texttt{con} is empty at this stage.
\end{itemize}

\hypertarget{copy-an-object-as-a-table-to-the-database-push}{%
\subsubsection{Copy an object as a table to the database (push)}\label{copy-an-object-as-a-table-to-the-database-push}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Copy objects to the data }
\CommentTok{\# copy\_to() comes from dplyr}
\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con, }
        \DataTypeTok{df =}\NormalTok{ flights)}

\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con, }
        \DataTypeTok{df =}\NormalTok{ airports)}

\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con,}
        \DataTypeTok{df =}\NormalTok{ planes)}

\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con, }
        \DataTypeTok{df =}\NormalTok{ weather)}

\CommentTok{\# If you need, you can also specify which columns to index:}

\CommentTok{\# copy\_to(dest = con, }
\CommentTok{\#          df = flights, }
\CommentTok{\#          name = "flights",}
\CommentTok{\#          indexes = list(c("year", "tailnum", "dest")))}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Show the tables in the database }

\KeywordTok{dbListTables}\NormalTok{(con)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "airports"     "flights"      "planes"       "sqlite_stat1" "sqlite_stat4"
## [6] "weather"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Show the columns/attributes/fields of a table }

\KeywordTok{dbListFields}\NormalTok{(con, }\StringTok{"flights"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbListFields}\NormalTok{(con, }\StringTok{"weather"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "origin"     "year"       "month"      "day"        "hour"      
##  [6] "temp"       "dewp"       "humid"      "wind_dir"   "wind_speed"
## [11] "wind_gust"  "precip"     "pressure"   "visib"      "time_hour"
\end{verbatim}

\hypertarget{quick-demonstrations}{%
\subsubsection{Quick demonstrations:}\label{quick-demonstrations}}

\begin{itemize}
\item
  SELECT desired columns
\item
  FROM tables
\item
  Select all columns (*) from \texttt{flights} table and show the \texttt{first\ ten\ rows}
\item
  Note that you can combine SQL and R commands thanks to \texttt{dbplyr}.
\item
  Option 1
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{DBI}\OperatorTok{::}\KeywordTok{dbGetQuery}\NormalTok{(con, }
                \StringTok{"SELECT * FROM flights;"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# SQL}
\StringTok{  }\KeywordTok{head}\NormalTok{(}\DecValTok{10}\NormalTok{) }\CommentTok{\# dplyr }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1  2013     1   1      517            515         2      830            819
## 2  2013     1   1      533            529         4      850            830
## 3  2013     1   1      542            540         2      923            850
## 4  2013     1   1      544            545        -1     1004           1022
## 5  2013     1   1      554            600        -6      812            837
## 6  2013     1   1      554            558        -4      740            728
## 7  2013     1   1      555            600        -5      913            854
## 8  2013     1   1      557            600        -3      709            723
## 9  2013     1   1      557            600        -3      838            846
## 10 2013     1   1      558            600        -2      753            745
##    arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1         11      UA   1545  N14228    EWR  IAH      227     1400    5     15
## 2         20      UA   1714  N24211    LGA  IAH      227     1416    5     29
## 3         33      AA   1141  N619AA    JFK  MIA      160     1089    5     40
## 4        -18      B6    725  N804JB    JFK  BQN      183     1576    5     45
## 5        -25      DL    461  N668DN    LGA  ATL      116      762    6      0
## 6         12      UA   1696  N39463    EWR  ORD      150      719    5     58
## 7         19      B6    507  N516JB    EWR  FLL      158     1065    6      0
## 8        -14      EV   5708  N829AS    LGA  IAD       53      229    6      0
## 9         -8      B6     79  N593JB    JFK  MCO      140      944    6      0
## 10         8      AA    301  N3ALAA    LGA  ORD      138      733    6      0
##     time_hour
## 1  1357034400
## 2  1357034400
## 3  1357034400
## 4  1357034400
## 5  1357038000
## 6  1357034400
## 7  1357038000
## 8  1357038000
## 9  1357038000
## 10 1357038000
\end{verbatim}

\begin{itemize}
\item
  Option 2 (works faster)
\item
  Option 3 (automating workflow)

  \begin{itemize}
  \tightlist
  \item
    When local variables are updated, the SQL query is also automatically updated. This approach is called \href{https://www.php.net/manual/en/pdo.prepared-statements.php}{parameterized query} (or prepared statement).
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\# PREPARATION \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#}

\CommentTok{\# Local variables }
\NormalTok{tbl \textless{}{-}}\StringTok{ "flights"}
\NormalTok{var \textless{}{-}}\StringTok{ "dep\_delay"}
\NormalTok{num \textless{}{-}}\StringTok{ }\DecValTok{10}

\CommentTok{\# Glue SQL query string }
\CommentTok{\# Note that to indicate a numeric value, you don\textquotesingle{}t need \textasciigrave{}\textasciigrave{}}

\NormalTok{sql\_query \textless{}{-}}\StringTok{ }\KeywordTok{glue\_sql}\NormalTok{(}\StringTok{"}
\StringTok{  SELECT \{\textasciigrave{}var\textasciigrave{}\}}
\StringTok{  FROM \{\textasciigrave{}tbl\textasciigrave{}\}}
\StringTok{  LIMIT \{num\} }
\StringTok{  "}\NormalTok{, }\DataTypeTok{.con =}\NormalTok{ con)}

\CommentTok{\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\# EXECUTION \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#}

\CommentTok{\# Run the query }
\KeywordTok{dbGetQuery}\NormalTok{(con, sql\_query)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    dep_delay
## 1          2
## 2          4
## 3          2
## 4         -1
## 5         -6
## 6         -4
## 7         -5
## 8         -3
## 9         -3
## 10        -2
\end{verbatim}

\textbf{Challenge 2}
Can you rewrite the above code using \texttt{LIMIT} instead of \texttt{head(10)}?

\begin{itemize}
\item
  You may notice that using only SQL code makes querying faster.
\item
  Select \texttt{dep\_delay} and \texttt{arr\_delay} from flights table, show the first ten rows, then turn the result into a tibble.
\end{itemize}

\textbf{Challenge 3}
Could you remind me how to see the list of attributes of a table? Let's say you want to see the attributes of \texttt{flights} table. How can you do it?

\begin{itemize}
\tightlist
\item
  Collect the selected columns and filtered rows
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{dbGetQuery}\NormalTok{(con, }
  \StringTok{"SELECT dep\_delay, arr\_delay FROM flights;"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{head}\NormalTok{(}\DecValTok{10}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Counting rows

  \begin{itemize}
  \tightlist
  \item
    Count all (*)
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbGetQuery}\NormalTok{(con, }
          \StringTok{"SELECT COUNT(*) }
\StringTok{           FROM flights;"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   COUNT(*)
## 1   336776
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbGetQuery}\NormalTok{(con, }
           \StringTok{"SELECT COUNT(dep\_delay)}
\StringTok{           FROM flights;"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   COUNT(dep_delay)
## 1           328521
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Count distinct values
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbGetQuery}\NormalTok{(con, }
           \StringTok{"SELECT COUNT(DISTINCT dep\_delay)}
\StringTok{           FROM flights;"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   COUNT(DISTINCT dep_delay)
## 1                       527
\end{verbatim}

\hypertarget{tidy-way-dplyr---sql}{%
\subsubsection{Tidy-way: dplyr -\textgreater{} SQL}\label{tidy-way-dplyr---sql}}

Thanks to the \texttt{dbplyr} package you can use the \texttt{dplyr} syntax to query SQL.

\begin{itemize}
\tightlist
\item
  Note that the pipe (\texttt{\%\textgreater{}\%}) works.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# tbl select tables}
\NormalTok{flights \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"flights"}\NormalTok{)}
\NormalTok{airports \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"airports"}\NormalTok{)}
\NormalTok{planes \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"planes"}\NormalTok{)}
\NormalTok{weather \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"weather"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  \texttt{select} = \texttt{SELECT}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"delay"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # Source:   lazy query [?? x 2]
## # Database: sqlite 3.33.0 [:memory:]
##    dep_delay arr_delay
##        <dbl>     <dbl>
##  1         2        11
##  2         4        20
##  3         2        33
##  4        -1       -18
##  5        -6       -25
##  6        -4        12
##  7        -5        19
##  8        -3       -14
##  9        -3        -8
## 10        -2         8
## # ... with more rows
\end{verbatim}

\textbf{Challenge 4}
Your turn: write the same code in SQL. Don't forget to add \texttt{connection} argument to your SQL code chunk.

\begin{itemize}
\tightlist
\item
  \texttt{mutate} = \texttt{SELECT} \texttt{AS}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(distance, air\_time) }\OperatorTok{\%\textgreater{}\%}\StringTok{  }
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{speed =}\NormalTok{ distance }\OperatorTok{/}\StringTok{ }\NormalTok{(air\_time }\OperatorTok{/}\StringTok{ }\DecValTok{60}\NormalTok{)) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # Source:   lazy query [?? x 3]
## # Database: sqlite 3.33.0 [:memory:]
##    distance air_time speed
##       <dbl>    <dbl> <dbl>
##  1     1400      227  370.
##  2     1416      227  374.
##  3     1089      160  408.
##  4     1576      183  517.
##  5      762      116  394.
##  6      719      150  288.
##  7     1065      158  404.
##  8      229       53  259.
##  9      944      140  405.
## 10      733      138  319.
## # ... with more rows
\end{verbatim}

\textbf{Challenge 5}
Your turn: write the same code in SQL.
Hint: \texttt{mutate(new\_var\ =\ var1\ *\ var2)} (R) = \texttt{SELECT\ var1\ *\ var2\ AS\ new\_var} (SQL)

\begin{itemize}
\tightlist
\item
  \texttt{filter} = \texttt{WHERE}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{filter}\NormalTok{(month }\OperatorTok{==}\StringTok{ }\DecValTok{1}\NormalTok{, day }\OperatorTok{==}\StringTok{ }\DecValTok{1}\NormalTok{) }\CommentTok{\# filter(month ==1 \& day == 1) Both work in the same way.}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # Source:   lazy query [?? x 19]
## # Database: sqlite 3.33.0 [:memory:]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dbl>
\end{verbatim}

\textbf{Challenge 6}
Your turn: write the same code in SQL (hint: \texttt{filter(condition1,\ condition2)} = \texttt{WHERE\ condition1\ and\ condition2})

\textbf{Additional tips}

Note that R and SQL operators are not exactly alike. R uses \texttt{!=} for \texttt{Not\ equal\ to}. SQL uses \texttt{\textless{}\textgreater{}} or \texttt{!=}. Furthermore, there are some cautions about using \texttt{NULL} (NA; unknown or missing): it should be \texttt{IS\ NULL} or \texttt{IS\ NOT\ NULL} not \texttt{=NULL} or \texttt{!=NULL}.

Another pro-tip is the \href{https://www.w3schools.com/sql/sql_like.asp}{\texttt{LIKE} operator}, which is used in a \texttt{WHERE} statement to find values based on string patterns.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{(origin) }\CommentTok{{-}{-} Distinct values from origin column}
\KeywordTok{FROM}\NormalTok{ flights}
\KeywordTok{WHERE}\NormalTok{ origin }\KeywordTok{LIKE} \StringTok{\textquotesingle{}J\%\textquotesingle{}}\NormalTok{; }\CommentTok{{-}{-} Find any origin values that start with "J"}
\end{Highlighting}
\end{Shaded}

\begin{table}

\caption{\label{tab:unnamed-chunk-15}1 records}
\centering
\begin{tabular}[t]{l}
\hline
origin\\
\hline
JFK\\
\hline
\end{tabular}
\end{table}

\begin{itemize}
\tightlist
\item
  \texttt{arrange} = \texttt{ORDER\ BY}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{arrange}\NormalTok{(carrier, }\KeywordTok{desc}\NormalTok{(arr\_delay)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{show\_query}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## <SQL>
## SELECT *
## FROM `flights`
## ORDER BY `carrier`, `arr_delay` DESC
\end{verbatim}

\textbf{Challenge 7}
Your turn: write the same code in SQL.
Hint: \texttt{arrange(var1,\ desc(var2))} (R) = \texttt{ORDER\ BY\ var1,\ var2\ DESC} (SQL)

\begin{itemize}
\tightlist
\item
  \texttt{summarise} = \texttt{SELECT} \texttt{AS} and \texttt{group\_by} = \texttt{GROUP\ BY}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(month, day) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{delay =} \KeywordTok{mean}\NormalTok{(dep\_delay)) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
\end{verbatim}

\begin{verbatim}
## # Source:   lazy query [?? x 3]
## # Database: sqlite 3.33.0 [:memory:]
## # Groups:   month
##    month   day delay
##    <int> <int> <dbl>
##  1     1     1 11.5 
##  2     1     2 13.9 
##  3     1     3 11.0 
##  4     1     4  8.95
##  5     1     5  5.73
##  6     1     6  7.15
##  7     1     7  5.42
##  8     1     8  2.55
##  9     1     9  2.28
## 10     1    10  2.84
## # ... with more rows
\end{verbatim}

\textbf{Challenge 8}
Your turn: write the same code in SQL (hint: in SQL the order should be \texttt{SELECT\ group\_var1,\ group\_var2,\ AVG(old\_var)\ AS\ new\_var} -\textgreater{} \texttt{FROM} -\textgreater{} \texttt{GROUP\ BY})

\begin{itemize}
\tightlist
\item
  If you find this too challenging, here's some help.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(month, day) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{delay =} \KeywordTok{mean}\NormalTok{(dep\_delay)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{show\_query}\NormalTok{() }\CommentTok{\# Show the SQL equivalent!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## <SQL>
## SELECT `month`, `day`, AVG(`dep_delay`) AS `delay`
## FROM `flights`
## GROUP BY `month`, `day`
\end{verbatim}

\begin{itemize}
\item
  Joins
\item
  Using joins is simpler in R than it is in SQL.
\item
  However, more flexible joins exist in SQL and they are not available in R.

  \begin{itemize}
  \tightlist
  \item
    Joins involving 3+ tables are not supported.
  \item
    Some advanced joins available in SQL are not supported.
  \item
    For more information, check out \href{https://github.com/ianmcook/tidyquery/issues}{\texttt{tidyquery}} to see the latest developments.
  \end{itemize}
\item
  SQL command
\end{itemize}

\texttt{FROM\ one\ table\ LEFT\ JOIN\ another\ table\ ON\ condition\ =\ condition} (\texttt{ON} in SQL = \texttt{BY} in R)

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT} \OperatorTok{*}
\KeywordTok{FROM}\NormalTok{ flights }\KeywordTok{AS}\NormalTok{ f}
\KeywordTok{LEFT} \KeywordTok{JOIN}\NormalTok{ weather }\KeywordTok{AS}\NormalTok{ w }
\KeywordTok{ON}\NormalTok{ f.}\DataTypeTok{year} \OperatorTok{=}\NormalTok{ w.}\DataTypeTok{year} \KeywordTok{AND}\NormalTok{ f.}\DataTypeTok{month} \OperatorTok{=}\NormalTok{ w.}\DataTypeTok{month}
\end{Highlighting}
\end{Shaded}

\begin{table}

\caption{\label{tab:unnamed-chunk-19}Displaying records 1 - 10}
\centering
\begin{tabular}[t]{r|r|r|r|r|r|r|r|r|l|r|l|l|l|r|r|r|r|r|l|r|r|r|r|r|r|r|r|r|r|r|r|r|r}
\hline
year & month & day & dep\_time & sched\_dep\_time & dep\_delay & arr\_time & sched\_arr\_time & arr\_delay & carrier & flight & tailnum & origin & dest & air\_time & distance & hour & minute & time\_hour & origin & year & month & day & hour & temp & dewp & humid & wind\_dir & wind\_speed & wind\_gust & precip & pressure & visib & time\_hour\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 1 & 39.02 & 26.06 & 59.37 & 270 & 10.35702 & NA & 0 & 1012.0 & 10 & 1357020000\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 2 & 39.02 & 26.96 & 61.63 & 250 & 8.05546 & NA & 0 & 1012.3 & 10 & 1357023600\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 3 & 39.02 & 28.04 & 64.43 & 240 & 11.50780 & NA & 0 & 1012.5 & 10 & 1357027200\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 4 & 39.92 & 28.04 & 62.21 & 250 & 12.65858 & NA & 0 & 1012.2 & 10 & 1357030800\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 5 & 39.02 & 28.04 & 64.43 & 260 & 12.65858 & NA & 0 & 1011.9 & 10 & 1357034400\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 6 & 37.94 & 28.04 & 67.21 & 240 & 11.50780 & NA & 0 & 1012.4 & 10 & 1357038000\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 7 & 39.02 & 28.04 & 64.43 & 240 & 14.96014 & NA & 0 & 1012.2 & 10 & 1357041600\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 8 & 39.92 & 28.04 & 62.21 & 250 & 10.35702 & NA & 0 & 1012.2 & 10 & 1357045200\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 9 & 39.92 & 28.04 & 62.21 & 260 & 14.96014 & NA & 0 & 1012.7 & 10 & 1357048800\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 10 & 41.00 & 28.04 & 59.65 & 260 & 13.80936 & NA & 0 & 1012.4 & 10 & 1357052400\\
\hline
\end{tabular}
\end{table}

Can anyone explain why the SQL query written with \texttt{dplyr} and then translated by \texttt{show\_query()} looks so complex compared to the one above? (\href{https://stackoverflow.com/questions/36808295/how-to-remove-duplicate-columns-from-join-in-sql}{Hint})

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{left\_join}\NormalTok{(weather, }\DataTypeTok{by =} \KeywordTok{c}\NormalTok{(}\StringTok{"year"}\NormalTok{, }\StringTok{"month"}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{show\_query}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## <SQL>
## SELECT `LHS`.`year` AS `year`, `LHS`.`month` AS `month`, `LHS`.`day` AS `day.x`, `LHS`.`dep_time` AS `dep_time`, `LHS`.`sched_dep_time` AS `sched_dep_time`, `LHS`.`dep_delay` AS `dep_delay`, `LHS`.`arr_time` AS `arr_time`, `LHS`.`sched_arr_time` AS `sched_arr_time`, `LHS`.`arr_delay` AS `arr_delay`, `LHS`.`carrier` AS `carrier`, `LHS`.`flight` AS `flight`, `LHS`.`tailnum` AS `tailnum`, `LHS`.`origin` AS `origin.x`, `LHS`.`dest` AS `dest`, `LHS`.`air_time` AS `air_time`, `LHS`.`distance` AS `distance`, `LHS`.`hour` AS `hour.x`, `LHS`.`minute` AS `minute`, `LHS`.`time_hour` AS `time_hour.x`, `RHS`.`origin` AS `origin.y`, `RHS`.`day` AS `day.y`, `RHS`.`hour` AS `hour.y`, `RHS`.`temp` AS `temp`, `RHS`.`dewp` AS `dewp`, `RHS`.`humid` AS `humid`, `RHS`.`wind_dir` AS `wind_dir`, `RHS`.`wind_speed` AS `wind_speed`, `RHS`.`wind_gust` AS `wind_gust`, `RHS`.`precip` AS `precip`, `RHS`.`pressure` AS `pressure`, `RHS`.`visib` AS `visib`, `RHS`.`time_hour` AS `time_hour.y`
## FROM `flights` AS `LHS`
## LEFT JOIN `weather` AS `RHS`
## ON (`LHS`.`year` = `RHS`.`year` AND `LHS`.`month` = `RHS`.`month`)
\end{verbatim}

\hypertarget{collect-pull}{%
\subsubsection{Collect (pull)}\label{collect-pull}}

\begin{itemize}
\item
  \texttt{collect()} is used to pull the data. Depending on the data size, it may take a long time to run.
\item
  The following code won't work.
\end{itemize}

\begin{quote}
Error in UseMethod(``collect'') : no applicable method for `collect' applied to an object of class ``c(`LayerInstance', `Layer', `ggproto', `gg')''
\end{quote}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{origin\_flights\_plot \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(origin) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tally}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ origin, }\DataTypeTok{y =}\NormalTok{ n)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  This works.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(origin) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tally}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect}\NormalTok{()}

\NormalTok{origin\_flights\_plot \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(df) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ origin, }\DataTypeTok{y =}\NormalTok{ n))}

\NormalTok{origin\_flights\_plot}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_big_data_files/figure-latex/unnamed-chunk-22-1.pdf}

\hypertarget{disconnect}{%
\subsubsection{Disconnect}\label{disconnect}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{DBI}\OperatorTok{::}\KeywordTok{dbDisconnect}\NormalTok{(con)}
\end{Highlighting}
\end{Shaded}

\hypertarget{things-we-didnt-cover}{%
\subsection{Things we didn't cover}\label{things-we-didnt-cover}}

\hypertarget{subquery}{%
\subsubsection{Subquery}\label{subquery}}

Subquery = a query nested inside a query

This is a hypothetical example inspired by \href{https://www.dofactory.com/sql/subquery}{dofactory blog post}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT}\NormalTok{ names  }\CommentTok{{-}{-} Outer query }
\KeywordTok{FROM}\NormalTok{ consultants}
\KeywordTok{WHERE} \KeywordTok{Id} \KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ ConsultingId}
                \KeywordTok{FROM}\NormalTok{ consulting\_cases }
                \KeywordTok{WHERE} \KeywordTok{category} \OperatorTok{=} \StringTok{\textquotesingle{}r\textquotesingle{}} \KeywordTok{AND} \KeywordTok{category} \OperatorTok{=} \StringTok{\textquotesingle{}sql\textquotesingle{}}\NormalTok{); }\CommentTok{{-}{-} Subquery }
\end{Highlighting}
\end{Shaded}

\hypertarget{common-table-expression-with-clauses}{%
\subsubsection{Common table expression (WITH clauses)}\label{common-table-expression-with-clauses}}

This is just a hypothetical example inspired by \href{https://jamesrledoux.com/code/sql-cte-common-table-expressions}{James LeDoux's blog post}.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{{-}{-} cases about R and SQL from dlab{-}database }
\KeywordTok{WITH}\NormalTok{ r\_sql\_consulting\_cases }\KeywordTok{AS}\NormalTok{ ( }\CommentTok{{-}{-} The name of the CTE expression }
  \CommentTok{{-}{-} The CTE query }
  \KeywordTok{SELECT}
    \KeywordTok{id} 
  \KeywordTok{FROM} 
\NormalTok{    dlab }
  \KeywordTok{WHERE}
\NormalTok{    tags }\KeywordTok{LIKE} \StringTok{\textquotesingle{}\%sql\%\textquotesingle{}}
  \KeywordTok{AND}
\NormalTok{    tags }\KeywordTok{LIKE} \StringTok{\textquotesingle{}\%r\%\textquotesingle{}}
\NormalTok{),}
\CommentTok{{-}{-} count the number of open cases about this consulting category }
\CommentTok{{-}{-} The outer query }
\KeywordTok{SELECT}\NormalTok{ status, }\FunctionTok{COUNT}\NormalTok{(status) }\KeywordTok{AS}\NormalTok{ open\_status\_count}
\KeywordTok{FROM}\NormalTok{ dlab }\KeywordTok{as}\NormalTok{ d }
\KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ r\_sql\_consulting\_cases }\KeywordTok{as}\NormalTok{ r}
  \KeywordTok{ON}\NormalTok{ d.}\KeywordTok{id} \OperatorTok{=}\NormalTok{ r.}\KeywordTok{id} 
\KeywordTok{WHERE}\NormalTok{ status }\OperatorTok{=} \StringTok{\textquotesingle{}open\textquotesingle{}}\NormalTok{; }
\end{Highlighting}
\end{Shaded}

\hypertarget{references-3}{%
\subsection{References}\label{references-3}}

\begin{itemize}
\tightlist
\item
  \href{https://github.com/csv2db/csv2db}{csv2db} - for loading large CSV files into a database
\item
  RStudio, \href{https://db.rstudio.com/}{Database using R}
\item
  Ian Cook, \href{https://github.com/ianmcook/rstudioconf2020/blob/master/bridging_the_gap_between_sql_and_r.pdf}{``Bridging the Gap between SQL and R''} rstudio::conf 2020 slides

  \begin{itemize}
  \tightlist
  \item
    \href{https://www.youtube.com/watch?v=JwP5KdWSgqE\&ab_channel=RStudio}{Video recording}
  \end{itemize}
\item
  Data Carpentry contributors, \href{https://datacarpentry.org/R-ecology-lesson/05-r-and-databases.html}{SQL database and R}, Data Carpentry, September 10, 2019.
\item
  \href{https://cran.r-project.org/web/packages/dbplyr/vignettes/dbplyr.html}{Introduction to dbplyr}
\item
  Josh Erickson, \href{http://dept.stat.lsa.umich.edu/~jerrick/courses/stat701/notes/sql.html}{SQL in R}, STAT 701, University of Michigan
\item
  \href{https://wizardzines.com/zines/sql/}{SQL zine} by Julia Evans
\item
  \href{http://harelba.github.io/q/}{q} - a command line tool that allows direct execution of SQL-like queries on CSVs/TSVs (and any other tabular text files)
\end{itemize}

\hypertarget{spark}{%
\section{Spark}\label{spark}}

\hypertarget{setup-8}{%
\subsection{Setup}\label{setup-8}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Install \texttt{sparklyr} package
\item
  Install \texttt{spark} using \texttt{sparklyr} package
\item
  (If you haven't) install Java 8 (see \href{https://www.java.com/en/download/manual.jsp}{this guideline} from the Java website)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"sparklyr"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"sparklyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: sparklyr
\end{verbatim}

\begin{verbatim}
## 
## Attaching package: 'sparklyr'
\end{verbatim}

\begin{verbatim}
## The following object is masked from 'package:purrr':
## 
##     invoke
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# sparklyr::spark\_install(version = "3.0.0")}
\end{Highlighting}
\end{Shaded}


  \bibliography{book.bib,packages.bib}

\end{document}