% using-r-main.Rnw
\documentclass[paper=a4,headsepline,BCOR=12mm,DIV=11,twoside,open=right,%
titlepage,headings=small,fontsize=10pt,index=totoc,bibliography=totoc,%
captions=tableheading,captions=nooneline]{scrbook}
%\usepackage[utf8]{inputenc}
\usepackage{color}
\usepackage{polyglossia}
\setdefaultlanguage[variant = british, ordinalmonthday = false]{english}
\usepackage{gitinfo2} % remember to setup Git hooks
\usepackage{hologo}
\usepackage{csquotes}
\usepackage{graphicx}
\DeclareGraphicsExtensions{.jpg,.pdf,.png}
\usepackage{animate}
%\usepackage{microtype}
\usepackage[style=authoryear-comp,giveninits,sortcites,maxcitenames=2,%
mincitenames=1,maxbibnames=10,minbibnames=10,backref,uniquename=mininit,%
uniquelist=minyear,sortgiveninits=true,backend=biber]{biblatex}%,refsection=chapter
\usepackage[unicode,hyperindex,bookmarks,pdfview=FitB,%backref,
pdftitle={Learn R ...as you learnt your mother tongue},%
pdfkeywords={R, statistics, data analysis, plotting},%
pdfsubject={R},%
pdfauthor={Pedro J. Aphalo}%
]{hyperref}
\hypersetup{colorlinks,breaklinks,
urlcolor=blue,
linkcolor=blue,
citecolor=blue,
filecolor=blue,
menucolor=blue}
\usepackage{framed}
\usepackage{abbrev}
\usepackage{usingr}
\usepackage{imakeidx}
%%% Adjust graphic design
% New float "example" and corresponding "list of examples"
%\DeclareNewTOC[type=example,types=examples,float,counterwithin=chapter]{loe}
\DeclareNewTOC[name=Box,listname=List of Text Boxes, type=example,types=examples,float,counterwithin=chapter,%
]{lotxb}
% changing the style of float captions
\addtokomafont{caption}{\sffamily\small}
\setkomafont{captionlabel}{\sffamily\bfseries}
\setcapindent{0em}
% finetuning tocs
% The redefinitions below touch internal (@-named) LaTeX macros, hence the
% \makeatletter ... \makeatother wrapper.
\makeatletter
% Entry format for the lists of figures, tables and "example" floats;
% the arguments of \@dottedtocline are {level}{indent}{width of number box}.
\renewcommand*\l@figure{\@dottedtocline{1}{0em}{2.6em}}
\renewcommand*\l@table{\@dottedtocline{1}{0em}{2.6em}}
\renewcommand*\l@example{\@dottedtocline{1}{0em}{2.3em}}
% Width of the box holding page numbers in the tocs.
\renewcommand{\@pnumwidth}{2.66em}
\makeatother
% add pdf bookmarks to tocs
% Hook run before the heading of each table of contents / list of floats:
% start on a fresh (double) page and add a top-level PDF bookmark.
% \list@fname and \@currext are internal macros, so @ must be a letter while
% the hook is being defined.
\makeatletter
\BeforeTOCHead{%
\cleardoublepage
% \edef expands the heading text and the current extension now, while
% \noexpand keeps \pdfbookmark itself from being expanded prematurely.
\edef\@tempa{%
\noexpand\pdfbookmark[0]{\list@fname}{\@currext}%
}\@tempa
}
% Restore the normal category code of @, which was previously left as
% "letter" for the remainder of the document.
\makeatother
% Float placement tuning: allow more floats per page and let floats occupy a
% larger share of a text page than the LaTeX defaults permit.
% Maximum number of floats at the top, at the bottom, and in total per page.
\setcounter{topnumber}{3}
\setcounter{bottomnumber}{3}
\setcounter{totalnumber}{4}
% Maximum fraction of a page that top/bottom floats may fill, and the minimum
% fraction that must remain text.
\renewcommand{\topfraction}{0.90}
\renewcommand{\bottomfraction}{0.90}
\renewcommand{\textfraction}{0.10}
% Minimum fraction of a float-only page that must be filled by floats.
\renewcommand{\floatpagefraction}{0.70}
% Corresponding limits for page-wide floats in two-column layout.
\renewcommand{\dbltopfraction}{0.90}
\renewcommand{\dblfloatpagefraction}{0.70}
\addbibresource{rbooks.bib}
\addbibresource{references.bib}
\makeindex
\begin{document}
% customize chapter format:
%\KOMAoption{headings}{twolinechapter}
\renewcommand*\chapterformat{\thechapter\autodot\hspace{1em}}
% customize dictum format:
\setkomafont{dictumtext}{\itshape\small}
\setkomafont{dictumauthor}{\normalfont}
\renewcommand*\dictumwidth{0.7\linewidth}
\renewcommand*\dictumauthorformat[1]{--- #1}
%\renewcommand*\dictumrule{}
\extratitle{\vspace*{2\baselineskip}%
{\Huge\textsf{\textbf{Learn R}\\ \textsl{\huge\ldots as you learnt your mother tongue}}}}
\title{\Huge{\fontseries{ub}\sffamily Learn R\\{\Large\ldots as you learnt your mother tongue}}}
\subtitle{Git hash: \gitAbbrevHash; Git date: \gitAuthorIsoDate}
\author{Pedro J. Aphalo}
\date{Helsinki, \today}
\publishers{Draft, 95\% done\\Available through \href{https://leanpub.com/learnr}{Leanpub}}
\uppertitleback{\copyright\ 2001--2017 by Pedro J. Aphalo\\
Licensed under one of the \href{http://creativecommons.org/licenses/}{Creative Commons licenses} as indicated, or when not explicitly indicated, under the \href{http://creativecommons.org/licenses/by-sa/4.0/}{CC BY-SA 4.0 license}.}
\lowertitleback{Typeset with \href{http://www.latex-project.org/}{\hologo{XeLaTeX}}\ in Lucida Bright and \textsf{Lucida Sans} using the KOMA-Script book class.\\
The manuscript was written using \href{http://www.r-project.org/}{R} with package knitr. The manuscript was edited in \href{http://www.winedt.com/}{WinEdt} and \href{http://www.rstudio.com/}{RStudio}.
The source files for the whole book are available at \url{https://bitbucket.org/aphalo/using-r}.}
\frontmatter
% knitr setup
<<setup-1, include=FALSE, cache=FALSE>>=
# knitr supplies the opts_knit and opts_chunk objects configured in the
# chunks that follow.
library(knitr)
# NOTE(review): this file selects dev = 'pdf' and never names an svglite
# device; presumably svglite is needed by the child documents -- confirm.
library(svglite)
@
<<setup-2, include=FALSE, cache=FALSE>>=
# Package-level knitr settings: label prefix for unnamed chunks, the LaTeX
# command used to pull in child documents, and the self.contained flag.
opts_knit$set(
  unnamed.chunk.label = "main-chunk",
  child.command = "include",
  self.contained = FALSE
)

# Defaults applied to every chunk: where and how figures are written and
# placed, the font size of code listings, no caching, no code tidying.
opts_chunk$set(
  fig.path = "figure/pos-",
  fig.align = "center",
  fig.show = "hold",
  size = "footnotesize",
  dev = "pdf",
  dev.args = list(family = "ArialMT"),
  cache = FALSE,
  tidy = FALSE
)

# Session options: line width used for printed output, plus replace.assign.
options(replace.assign = TRUE, width = 70)
@
<<fig-setup, include=FALSE, cache=FALSE>>=
# Named presets of figure-related chunk options. Each preset bundles the
# fig.width and fig.height values with the out.width LaTeX length, so that a
# chunk can request a whole layout by name.
opts_fig_very_wide <- list(
  fig.width = 9,
  fig.height = 3.9,
  out.width = ".98\\textwidth"
)
opts_fig_wide <- list(
  fig.width = 7,
  fig.height = 3.9,
  out.width = ".76\\textwidth"
)
opts_fig_wide_square <- list(
  fig.width = 7,
  fig.height = 7,
  out.width = ".76\\textwidth"
)
opts_fig_narrow <- list(
  fig.width = 5.444,
  fig.height = 3.9,
  out.width = ".60\\textwidth"
)
opts_fig_narrow_square <- list(
  fig.width = 5.444,
  fig.height = 5.444,
  out.width = ".60\\textwidth"
)
# "very narrow" and "medium" currently reuse the narrow preset unchanged.
opts_fig_very_narrow <- opts_fig_narrow
opts_fig_medium <- opts_fig_narrow
opts_chunk$set(opts_fig_narrow)
@
<<match-setup, include=FALSE>>=
# Switch off R's warnings about partial matching of attribute names, of
# names used with `$`, and of function argument names.
options(warnPartialMatchAttr = FALSE,
warnPartialMatchDollar = FALSE,
warnPartialMatchArgs = FALSE)
@
<<diagnose-set-up, echo=FALSE, include=FALSE, cache=FALSE>>=
# Flag used as the eval/include/echo setting of the diagnostics chunk that
# follows; FALSE keeps the diagnostics out of the build.
eval_diag <- FALSE

# Return knitr's "unnamed.chunk.label" setting as a one-element subset of
# the full opts_knit settings.
knitter_diag <- function() {
  knit_settings <- opts_knit$get()
  knit_settings["unnamed.chunk.label"]
}

# Collect a few facts about the running R session in a named list: the
# search path, the working directory, the number of loaded DLLs and the
# value of the R_MAX_NUM_DLLS environment variable.
R_diag <- function() {
  dll_count <- length(getLoadedDLLs())
  dll_limit <- Sys.getenv(x = "R_MAX_NUM_DLLS", names = TRUE)
  list(
    "Search path" = search(),
    "Working dir" = getwd(),
    "Number of loaded DLLs" = dll_count,
    "Max. loaded DLLs" = dll_limit
  )
}

# Placeholder for further diagnostics; always returns NULL.
other_diag <- function() {
  NULL
}
@
<<eval=eval_diag, include=eval_diag, echo=eval_diag, cache=FALSE>>=
# Print the knitr, R-session and other diagnostics; this chunk is only
# evaluated and shown when eval_diag is TRUE.
knitter_diag()
R_diag()
other_diag()
@
% \thispagestyle{empty}
% \titleLL
% \clearpage
\maketitle
%\frontmatter
%\begin{titlingpage}
% \maketitle
%\titleLL
%\end{titlingpage}
\tableofcontents
%\listoftables
%\listoffigures
%\mainmatter
\chapter*{Preface}
\dictum[R. P. Boas (1981) Can we make mathematics intelligible?, \emph{American Mathematical Monthly} \textbf{88:} 727--731.]{``Suppose that you want to teach the `cat' concept to a very young child. Do you explain that a cat is a relatively small, primarily carnivorous mammal with retractible claws, a distinctive sonic output, etc.? I'll bet not. You probably show the kid a lot of different cats, saying `kitty' each time, until it gets the idea. To put it more generally, generalizations are best made by abstraction from experience.''}
% Such pauses are not a miss use of our time. To learn a natural language we need to interact with other speakers, we need feedback. In the case of R, we can get feedback both from the outcomes from our utterances to the computer, and from other R users.}
\vspace{2ex}This book covers different aspects of the use of \Rpgrm. It is meant to be used as a tutorial complementing a reference book about \R, or the documentation that accompanies R and the many packages used in the examples. Explanations are rather short and terse, so as to encourage the development of a routine of exploration. This is not an arbitrary decision, this is the normal \emph{modus operandi} of most of us who use R regularly for a variety of different problems.
I do not discuss statistics here, just \Rpgrm as a tool and language for data manipulation and display. The idea is for you to learn the \Rpgrm language like children learn a language: they work out what the rules are, simply by listening to people speak and trying to utter what they want to tell their parents. I do give some explanations and comments, but the idea of these notes is mainly for you to use the numerous examples to find out by yourself the overall patterns and coding philosophy behind the \Rpgrm language. Instead of parents being the sounding board for your first utterances in \Rlang, the computer will play this role. You should look and try to repeat the examples, and then try your own hand and see how the computer responds: does it understand you or not?
When teaching I tend to lean towards challenging students rather than telling a simplified story. I do the same here, because it is what I prefer as a student, and how I learn best myself. Not everybody learns best with the same approach; for me the most limiting factor is for what I listen to, or read, to be in one way or another challenging or entertaining enough to keep my thoughts focused. This I achieve best when making an effort to understand the contents or to follow the thread or plot of a story. So, be warned, reading this book will be about exploring a new world; this book aims to be a travel guide, neither a traveler's account, nor a cookbook of R recipes.
Do not expect to ever know everything about \Rpgrm! \Rpgrm in a broad sense is vast because its capabilities can be expanded with independently developed packages. Currently there are more than ten thousand packages available for free in the Comprehensive R Archive Network (CRAN), the main, but not the only, public repository for R packages. You just need to learn to use what you need to use and to have an idea of what else is available, so that you know where to look for packages when your needs change in the future. And if what you need does not exist, then take the plunge, and write your very own package to share with the world (or not). As \Rpgrm is very popular, there is nowadays lots of information available, plus a helpful and open-minded on-line community willing to help with those difficult problems for which Google will not be of help.
How to read this book? My idea is that you will run all the code examples and try as many other variations as needed until you are sure you understand the basic `rules' of the \Rpgrm language and how each function or command described works. In \Rpgrm for each function, data set, etc.\ there is a help page available. In addition, if you use a front-end like \RStudio, auto-completion is available as well as balloon help on the arguments accepted by functions. For scripts, there is syntax checking of the source code before its execution: \emph{possible} mistakes and even formatting style problems are highlighted in the editor window. Error messages tend to be terse in \Rpgrm, and may require some lateral thinking and/or `experimentation' to understand the real cause behind problems. When you are not sure you understand how some command works, it is useful in many cases to try simple examples for which you know the correct answer and see if you can reproduce them with \Rpgrm.
As with any computer language, in addition to learning the grammar of the language, learning the commonly used writing styles and idioms is extremely important. Computer programs should be readable and easy to understand to humans, in addition to being valid. One aspect of this is consistency. I have tried to be consistent, and to use a clear style that does not diverge much from current usual practice. With time you may develop to some extent a personal style, and this is usually o.k. However, when writing computer code, as for any other text intended for humans to read, strive to stick to a consistent writing style and formatting as they go a long way in making your intentions clear.
As I wrote above there are many different ways of doing things in R, and many of the packages that are most popular nowadays did not exist when I started using R. One could write many different R books with similar content and still use substantially different ways of achieving the same results. I limit myself to packages that are currently popular or that I consider elegantly designed. I have in particular tried to limit myself to packages with similar design philosophies, especially in relation to their interfaces. What is elegant design, and in particular what is a friendly user interface depends strongly on each user's preferences and previous experience. Consequently, the contents of the book are strongly biased by my own preferences. Once again, I encourage readers to take this book as a travel guide, as a starting point for exploring the very many packages, styles and approaches which I have not described.
I will appreciate suggestions for further examples, notification of errors and unclear sections. Many of the examples here have been collected from diverse sources over many years and because of this not all sources are acknowledged. If you recognize any example as yours or someone else's please let me know so that I can add a proper acknowledgement. I warmly thank the students that over the years have asked the questions and posed the problems that have helped me write this text and correct the mistakes and voids of previous versions. I have also received help on on-line forums and in person from numerous people, learnt from archived e-mail list messages, blog posts, books, articles, tutorials, webinars, and by struggling to solve some new problems on my own. In many ways this text owes much more to people who are not authors than to myself. However, as I am the one who has written this version and decided what to include and exclude, as author, I take full responsibility for any errors and inaccuracies.
I have been using \Rpgrm since around 1998 or 1999, but I am still constantly learning new things about \Rpgrm itself and \Rpgrm packages. With time it has replaced in my work as a researcher and teacher several other pieces of software: \pgrmname{SPSS}, \pgrmname{Systat}, \pgrmname{Origin}, \pgrmname{Excel}, and it has become a central piece of the tool set I use for producing lecture slides, notes, books and even web pages. This is to say that it is the most useful piece of software and programming language I have ever learnt to use. Of course, in time it will be replaced by something better, but at the moment it is the ``hot'' thing to learn for anybody with a need to analyse and display data.
\begin{framed}
\noindent\large%
\textbf{I encourage you to approach R, like a child approaches his or her mother tongue when learning to speak:} Do not struggle, just play! If going gets difficult and frustrating, take a break! If you get a new insight, take a break to enjoy the victory!
\end{framed}
\newpage
\begin{framed}
\noindent\large
\textbf{Icons used to mark different content.} Throughout the book text boxes marked with icons present different types of information. First of all, we have \emph{playground} boxes indicated with \playicon\ which contain open-ended exercises---ideas and pieces of R code to play with at the R console. A few of these will require more time to grasp, and are indicated with \advplayicon. Boxes providing general information, usually not directly related to \Rlang as a language, are indicated with \infoicon. Some boxes highlighted with \ilAttention\ give important bits of information that must be remembered when using \Rlang---i.e.\ explain some unusual feature of the language. Finally, some boxes indicated by \ilAdvanced\ give in depth explanations, that may require you to spend time thinking, which in general can be skipped on first reading, but to which you should return at a later, and peaceful, time with a cup of coffee or tea.
\end{framed}
\newpage
\newpage
\begin{infobox}
\noindent
\textbf{Status as of 2016-11-23.} I have updated the manuscript to track package updates since the previous version uploaded six months ago, and added several examples of the new functionality added to packages \ggpmisc, \ggrepel, and \ggplot. I have written new sections on packages \viridis, \pkgname{gganimate}, \pkgname{ggstance}, \pkgname{ggbiplot}, \pkgname{ggforce}, \pkgname{ggtern} and \pkgname{ggalt}. Some of these sections are to be expanded, and additional sections are planned for other recently released packages.
With respect to the chapter \textit{Storing and manipulating data with R} I have put it on hold, except for the introduction, until I can see a soon to be published book covering the same subject. Hadley Wickham has named the set of tools developed by him and his collaborators as \textit{tidyverse} to be described in the book titled \textit{R for Data Science} by Grolemund and Wickham (O'Reilly).
An important update to \ggplot was released last week, and it includes changes to the behavior of some existing functions, especially faceting, which has become extensible through other packages. Several of the new facilities are described in the updated text and code included in this book and this pdf has been generated with up-to-date versions of \ggplot and packages as available today from CRAN, except for \pkgname{ggtern} which was downloaded from Bitbucket minutes ago.
The present update adds about 100 pages to the previous versions. I expect to upload a new update to this manuscript in one or two months' time.
\textbf{Status as of 2017-01-17.} Added ``playground'' exercises to the chapter describing \ggplot, and converted some of the examples that were earlier part of the main text into these playground items. Added icons to help readers quickly distinguish playground sections (\textcolor{blue}{\noticestd{"0055}}), information sections (\textcolor{blue}{\modpicts{"003D}}), warnings about things one needs to be especially aware of (\colorbox{yellow}{\typicons{"E136}}) and boxes with more advanced content that may require longer time/more effort to grasp (\typicons{"E04E}). Added to the sections \code{scales} and examples in the \ggplot chapter details about the use of colors in R and \ggplot. Removed some redundant examples, and updated the section on \code{plotmath}. Added terms to the alphabetical index. Increased line-spacing to avoid uneven spacing with inline code bits.
\textbf{Status as of 2017-02-09.} Wrote section on ggplot2 themes, and on using system- and Google fonts in ggplots with the help of package \pkgname{showtext}. Expanded section on \ggplot's \code{annotation}, and revised some sections in the ``R scripts and Programming'' chapter. Started writing the data chapter. Wrote draft on writing and reading text files. Several other smaller edits to text and a few new examples.
\textbf{Status as of 2017-02-14.} Wrote sections on reading and writing MS-Excel files, files from statistical programs such as SPSS, SyStat, etc., and NetCDF files. Also wrote sections on using URLs to directly read data, and on reading HTML and XML files directly, as well as on using JSON to retrieve measured/logged data from IoT (internet of things) and similar intelligent physical sensors, micro-controller boards and sensor hubs with network access.
\textbf{Status as of 2017-03-25.} Revised and expanded the chapter on plotting maps, adding a section on the manipulation and plotting of image data. Revised and expanded the chapter on extensions to \pkgname{ggplot2}, so that there are no longer empty sections. Wrote short chapter ``If and when R needs help''. Revised and expanded the ``Introduction'' chapter. Added index entries, and additional citations to literature.
\textbf{Status as of 2017-04-04.} Revised and expanded the chapter on using \Rpgrm as a calculator. Revised and expanded the ``Scripts'' chapter. Minor edits to ``Functions'' chapter. Continued writing chapter on data, writing a section on R's native apply functions and added preliminary text for a pipes and tees section. Wrote intro to `tidyverse' and grammar of data manipulation. Added index entries, and a few additional citations to the literature. Spell checking.
\textbf{Status as of 2017-04-08.} Completed writing first draft of chapter on data, writing all the previously missing sections on the ``grammar of data manipulation''. Wrote two extended examples in the same chapter. Added table listing several extensions to \pkgname{ggplot2} not described in the book.
\textbf{Status as of 2017-04-13.} Revised all chapters correcting some spelling mistakes, adding some explanatory text and indexing all functions and operators used. Thoroughly revised the Introduction chapter and the Preface. Expanded section on bar plots (now bar and column plots). Revised section on tile plots. Expanded section on factors in chapter 2, adding examples of reordering of factor labels, and making clearer the difference between the labels of the levels and the levels themselves.
\textbf{Status as of 2017-04-29.} Tested with R 3.4.0. Package \pkgname{gganimate} needs to be installed from GitHub as the updated version is not yet in CRAN. Function \code{gg\_animate()} has been renamed \code{gganimate()}.
\textbf{Status as of 2017-05-14.} Submitted package \pkgname{learnrbook} to CRAN. Revised code in the book
to use this new package. Small fixes after more testing. Added examples of plotting and labeling based on fits with \code{method = "nls"}, including use of the new \code{ggpmisc::stat\_fit\_tidy()}.
\textbf{Status as of 2017-06-11.} Added sections on R-code bench marking and profiling for performance optimization. Added also an example of explicit compilation of a function defined in the R language. Added section on functions \code{assign()}, \code{get()} and \code{mget()}.
\textbf{Status as of 2017-08-12.} Various edits to all chapters. Expanded section on \pkgname{ggpmisc} to include the new functionality added in version 0.2.15.9002: \code{geom\_table} and \code{stat\_fit\_tb}. Added section on package \pkgname{ggbeeswarm}. Added sections on packages \pkgname{magick} and on using \pgrmname{ImageJ} from \Rpgrm. Improved indexing and cross references.
\textbf{Status as of 2017-10-25.} Edited the chapter on using R as a calculator, adding examples on insertion and deletion of members of lists and vectors, and also of use of \code{gl()} and \code{reorder()}. Edited sections on scale limits and added new section on coordinate limits to explain more thoroughly their differences and uses in chapter on plotting with \pkgname{ggplot2}. Added a section on package \pkgname{ggsignif} to the chapter on extensions to \pkgname{ggplot2}. Expanded section on \pkgname{ggpmisc} in the same chapter describing new functionality added in version 0.2.16.
\pkgname{ggplot2} $\geq$ 2.2.1.9000 is required by the current development version of \pkgname{ggpmisc}.
\textbf{Status as of 2017-10-30.} Added section on using pipes with \code{ggplot()} and layers.
\end{infobox}
%\chapter*{Learning like a child}
\mainmatter
<<children-flag, echo=FALSE, include=FALSE, cache=FALSE>>=
# Master switch for the child chapters: the child chunks below use
# `eval=incl_all || FALSE`, so TRUE builds every chapter wired to this flag.
incl_all <- TRUE
@
<<child-r-intro, child='R.intro.Rnw', eval=incl_all || FALSE>>=
@
<<child-r-calc, child='R.as.calculator.Rnw', eval=incl_all || FALSE>>=
@
<<child-r-scripts, child='R.scripts.Rnw', eval=incl_all || FALSE>>=
@
<<child-r-functions, child='R.functions.Rnw', eval=incl_all || FALSE>>=
@
<<child-r-data, child='R.data.Rnw', eval=incl_all || FALSE>>=
@
<<child-r-plotting, child='R.plotting.Rnw', eval=incl_all || FALSE>>=
@
<<child-r-more-plotting, child='R.more.plotting.Rnw', eval=incl_all || FALSE>>=
@
<<child-r-maps, child='R.maps.Rnw', eval= FALSE>>=
@
<<child-r-friends, child='R.friends.Rnw', eval=incl_all || FALSE>>=
@
\chapter{Further reading about R}\label{chap:R:readings}
\dictum[Arthur C. Clarke]{Before you become too entranced with gorgeous gadgets and mesmerizing video displays, let me remind you that information is not knowledge, knowledge is not wisdom, and wisdom is not foresight. Each grows out of the other, and we need them all.}\vskip2ex
\begin{warningbox}
This list will be expanded and, more importantly, reorganized, and short comments added for each book or group of books.
\end{warningbox}
\section{Introductory texts}
\cite{Allerhand2011,Dalgaard2008,Zuur2009,Teetor2011,Peng2017,Paradis2005,Peng2016}
\section{Texts on specific aspects}
\cite{Chang2018,Fox2002,Fox2010,Faraway2004,Faraway2006,Everitt2011,Wickham2017}
\section{Advanced texts}
\cite{Xie2013,Chambers2016,Wickham2015,Wickham2019,Wickham2016,Pinheiro2000,Murrell2011,Matloff2011,Ihaka1996,Venables2000}
\section{Texts for S/R wisdom}
\cite{Burns1998,Burns2011,Burns2012,Bentley1986,Bentley1988}
\backmatter
\printbibliography
\printindex
\end{document}
\appendix
\chapter{Build information}
<<>>=
Sys.info()
@
<<eval=FALSE, echo=FALSE>>=
R.Version()
@
<<>>=
sessionInfo()
@
%
\end{document}