-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathr-cheats.txt
243 lines (203 loc) · 7.11 KB
/
r-cheats.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# attach and detach a package:
library("MouseDivGeno")
detach(package:MouseDivGeno)
# start in 64 bit mode:
R --arch x86_64
# you can check if you're running 64 or 32 like:
if(.Machine$sizeof.pointer == 8) cat("running 64-bit\n")
else if(.Machine$sizeof.pointer == 4) cat("running 32-bit\n")
else cat("witchcraft!\n")
# Stack-trace in R:
traceback()
# Break on error in R:
options(error = recover)
# Turn warnings into errors
options(warn = 2)
# Acts as a code break-point so you can inspect vars
browser()
# trim off the 1st and last chars of a string:
substr("hello", start=2, stop=nchar("hello") - 1)
# the apply most similar to haskell's (map f [1..20])
mapply(f, 1:20)
# sapply is equivalent here (both simplify to a vector when possible); use lapply(1:20, f) if you want a list
sapply(1:20, f)
# Here is the R version of haskell's zipWith
mapply(get("+"), as.list(1 : 10), as.list(31: 40), SIMPLIFY = F)
# apply a function to matrix margins first by row then by column
apply(matrix(1:12, nrow = 3), 1, sum)
apply(matrix(1:12, nrow = 3), 2, sum)
# find difference between neighboring values
diff(1:10) # returns nine 1s
# Run Length Encoding
rle(...)
inverse.rle(...)
# replace file extension with empty string
sub("\\..*$", "", "helloworld.exe")
# find string manipulation functions
help.search(keyword="character")
# Find all installed packages matching a substring
grep("MouseDivGeno",rownames(installed.packages()), value = T)
# file stuff
myPath <- file.path(baseDir, "tmp.txt")
if(file.exists(myPath))
{
con <- file(description = myPath, open = "rt") # open can be: rt, wt, rb, wb, at ...
... do something ...
close(con)
}
dir.create("some-dir-name")
# Setting and getting using character names
assign("varname", matrix(1 : 10))
get("varname")
# setting terminal width
options(width=Sys.getenv("COLUMNS"))
# shuffle a vector
sample(1 : 30)
# machine specific info like maximum integer
.Machine$integer.max
# Get the means of a vector split on several levels.
sapply(split(values, factor), mean)
#################
# Performance Tips #
#################
# crossprod(x,y) is faster than t(x) %*% y
> x = matrix(rnorm(1000), 100, 10)
> y = matrix(rnorm(1000), 100, 10)
> system.time(for(i in 1:10000) { t(x) %*% y })
user system elapsed
0.383 0.006 0.389
> system.time(for(i in 1:10000) { crossprod(x, y) })
user system elapsed
0.165 0.001 0.166
# When subtracting a vector from each *row* of a matrix, subtracting a matrix is faster than apply(x, 1, "-", y) or sweep(x, 2, y)
> x = matrix(rnorm(1000), 100, 10)
> y = rnorm(10)
> system.time(for(i in 1:10000) { apply(x, 1, "-", y) })
user system elapsed
9.390 0.061 9.427
> system.time(for(i in 1:10000) { sweep(x, 2, y) })
user system elapsed
1.178 0.009 1.184
> system.time(for(i in 1:10000) { t(t(x) - y) })
user system elapsed
0.391 0.007 0.398
> system.time(for(i in 1:10000) { x - matrix(y, nrow(x), ncol(x), byrow = T) })
user system elapsed
0.233 0.002 0.235
# Showing that all three methods produce the same result.
> all(t(apply(x, 1, "-", y)) == x - matrix(y, nrow(x), ncol(x), byrow = T))
[1] TRUE
> all(sweep(x, 2, y) == x - matrix(y, nrow(x), ncol(x), byrow = T))
[1] TRUE
# When subtracting a vector from each *column* of a matrix, x - y is faster than apply(x, 2, "-", y) or sweep(x, 1, y)
> x = matrix(rnorm(1000), 100, 10)
> y = rnorm(100)
> system.time(for(i in 1:10000) { apply(x, 2, "-", y) })
user system elapsed
2.985 0.024 3.039
> system.time(for(i in 1:10000) { sweep(x, 1, y) })
user system elapsed
1.180 0.008 1.187
> system.time(for(i in 1:10000) { x - y })
user system elapsed
0.070 0.001 0.071
# Showing that all three methods produce the same result.
> all(apply(x, 2, "-", y) == x - y)
[1] TRUE
> all(sweep(x, 1, y) == x - y)
[1] TRUE
# Fast Linear Model Tips
# Suppose that you have ten predictors in your model.
n = 100
p = 10
y = rnorm(n)
x = matrix(rnorm(n * p), n, p)
mod = lm(y ~ x)
# The Choleski factorization and using solve(crossprod(x)) %*% t(x) %*% y will fail if t(x) %*% x is ill-conditioned.
# Choleski is the fastest method. But the QR decomposition does not calculate t(x) %*% x and can be more stable.
# Basic usage is to make a QR decomposition of x and then call qr.coef(), qr.fitted() or qr.resid().
x_with_intercept = matrix(c(rep(1, n), x), n, p + 1)
qrx = qr(x_with_intercept)
coef = qr.coef(qrx, y)
# Values are the same.
all(mod$coef == coef)
[1] TRUE
###
# The QR decomposition can be saved and used for permutations.
# Normal method using lm().
nperm = 1000
fstat = rep(1, nperm)
system.time(for(i in 1:nperm) {
y = sample(y)
mod = lm(y ~ x)
fstat[i] = summary(mod)$fstatistic[1]
})
user system elapsed
3.321 0.024 3.367
# Using the QR decomposition.
x_with_intercept = matrix(c(rep(1, n), x), n, p + 1)
qrx = qr(x_with_intercept)
sst = crossprod(y - mean(y))
fstat2 = rep(0, nperm)
df1 = ncol(x)
df2 = length(y) - ncol(x) - 1
fstat.factor = df2 / df1
system.time(for(i in 1:nperm) {
y = sample(y)
sse = crossprod(qr.resid(qrx, y))
fstat2[i] = (sst - sse) / sse * fstat.factor
})
user system elapsed
0.118 0.000 0.120
#####
# Indexing through 3-dimensional arrays.
# Because of the way that R stores arrays, it's faster to place the longest dimension in dim[[2]] or dim[[3]].
# To demonstrate this, I make three arrays with the long dimension in dim[[1]], [[2]] & [[3]] and show the timing
# of looping through the longest dimension and doing some work. The savings in this case is a factor of 3.
> y = rnorm(100)
>
> x = array(rnorm(1000), c(10000, 100, 100))
> system.time(for(i in 1:dim(x)[[1]]) {
+ a = x[i,,] %*% y
+ })
user system elapsed
19.377 7.495 25.633
>
> x = array(rnorm(1000), c(100, 10000, 100))
> system.time(for(i in 1:dim(x)[[2]]) {
+ a = x[,i,] %*% y
+ })
user system elapsed
4.568 5.003 8.405
>
> x = array(rnorm(1000), c(100, 100, 10000))
> system.time(for(i in 1:dim(x)[[3]]) {
+ a = x[,,i] %*% y
+ })
user system elapsed
4.522 4.996 8.330
#####
# It's faster to allocate the memory you need before using it. For example, if
# you are adding numbers to a vector and don't know how many you'll have, it's
# better to pre-allocate too much memory.
a = 0
system.time(for(i in 1:10000) {
a = c(a, runif(1))
})
a = rep(0, 10000)
system.time(for(i in 1:10000) {
a[i] = runif(1)
})
Creating binary packages on Windows:
====================================
* Install latest version of R (choosing non-default install location is OK)
* Install latest version of Rtools (refer to http://cran.r-project.org/doc/manuals/R-admin.html#The-Windows-toolset)
* make sure that the R and Rtools bin dirs are on your %PATH%
* If you have your R code in a tarball or zip file you need to unzip it.
* Enter: "Rcmd check <package_src_dir>" to check on any package errors or warnings
* Enter: "Rcmd build --binary <package_src_dir>" on the command line
Package Installation:
====================================
* install from CRAN: install.packages("packageName", repos = "http://cran.r-project.org") NOTE: the repos arg is optional; the default is to prompt for a mirror
* remove packages: remove.packages("packageName")
* find out which libraries packages are installed to: .libPaths()