r-cheats.txt

# attach and detach a package:
library("MouseDivGeno")
detach(package:MouseDivGeno)

# start in 64 bit mode:
R --arch x86_64

# you can check if you're running 64 or 32 like:
# (each `else` must sit on the same line as the preceding closing brace:
# at top level R evaluates a complete `if` as soon as its line ends, so an
# `else` that starts a new line is a syntax error)
if (.Machine$sizeof.pointer == 8) {
  cat("running 64-bit\n")
} else if (.Machine$sizeof.pointer == 4) {
  cat("running 32-bit\n")
} else {
  cat("witchcraft!\n")
}

# Stack-trace in R:
traceback()

# Break on error in R:
options(error = recover)

# Turn warnings into errors
options(warn = 2)

# Acts as a code break-point so you can inspect vars
browser()

# trim off the 1st and last chars of a string:
substr("hello", start=2, stop=nchar("hello") - 1)

# the apply most similar to haskell's (map f [1..20]); always returns a list
# (mapply(f, 1:20) would try to simplify the result to a vector/matrix)
lapply(1:20, f)
# if you would rather have a vector than a list you can do
sapply(1:20, f)
# Here is the R version of haskell's zipWith
# (Map() is mapply() without simplification, so the result is always a list)
Map("+", 1:10, 31:40)
# apply a function to matrix margins first by row then by column
apply(matrix(1:12, nrow = 3), 1, sum)
apply(matrix(1:12, nrow = 3), 2, sum)

# find difference between neighboring values
diff(1:10) # returns nine 1s

# Run Length Encoding
rle(...)
inverse.rle(...)

# replace file extension with empty string
# (only the last ".xyz" is removed, so "archive.tar.gz" becomes "archive.tar";
# tools::file_path_sans_ext() is a ready-made alternative)
sub("\\.[^.]*$", "", "helloworld.exe")

# find string manipulation functions
help.search(keyword="character")

# Find all installed packages matching a substring
# (fixed = TRUE matches the text literally, so a "." in a package name is not
# treated as a regex wildcard)
grep("MouseDivGeno", rownames(installed.packages()), value = TRUE, fixed = TRUE)

# file stuff
# Build a path with file.path(), open a text-mode read connection only when
# the file exists, and close the connection once done with it.
# NOTE(review): baseDir is not defined in this snippet -- set it first.
myPath <- file.path(baseDir, "tmp.txt")
if(file.exists(myPath))
{
  con <- file(description = myPath, open = "rt") # open can be: rt, wt, rb, wb, at ...
  ... do something ...
  close(con)
}
dir.create("some-dir-name")

# Setting and getting using character names
assign("varname", matrix(1 : 10))
get("varname")

# setting terminal width from the shell's COLUMNS variable
# (COLUMNS is frequently not exported to child processes, in which case
# Sys.getenv() returns "" and options(width = "") would raise an error; the
# width is therefore only changed when a valid number in 10..10000 is found)
cols <- suppressWarnings(as.integer(Sys.getenv("COLUMNS")))
if (!is.na(cols) && cols >= 10L && cols <= 10000L) options(width = cols)

# shuffle a vector
sample(1 : 30)

# machine specific info like maximum integer
.Machine$integer.max

# Get the mean of a vector within each level of a grouping factor.
sapply(split(values, factor), mean)

#################
# Performance Tips #
#################

# crossprod(x,y) is faster than t(x) %*% y

> x = matrix(rnorm(1000), 100, 10)
> y = matrix(rnorm(1000), 100, 10)
> system.time(for(i in 1:10000) {  t(x) %*% y })
   user  system elapsed 
  0.383   0.006   0.389 
> system.time(for(i in 1:10000) {  crossprod(x, y) })
   user  system elapsed 
  0.165   0.001   0.166 

# When subtracting a vector from each *row* of a matrix, subtracting a matrix is faster than apply(x, 1, "-", y) or sweep(x, 2, y)

> x = matrix(rnorm(1000), 100, 10)
> y = rnorm(10)
> system.time(for(i in 1:10000) { apply(x, 1, "-", y) })
   user  system elapsed 
  9.390   0.061   9.427
> system.time(for(i in 1:10000) { sweep(x, 2, y) })
   user  system elapsed 
  1.178   0.009   1.184
> system.time(for(i in 1:10000) { t(t(x) - y) })
   user  system elapsed 
  0.391   0.007   0.398 
> system.time(for(i in 1:10000) { x - matrix(y, nrow(x), ncol(x), byrow = T) })
   user  system elapsed 
  0.233   0.002   0.235 

# Showing that all three methods produce the same result.
> all(t(apply(x, 1, "-", y)) == x - matrix(y, nrow(x), ncol(x), byrow = T))
[1] TRUE
> all(sweep(x, 2, y) == x - matrix(y, nrow(x), ncol(x), byrow = T))
[1] TRUE

# When subtracting a vector from each *column* of a matrix, x - y is faster than apply(x, 2, "-", y) or sweep(x, 1, y)

> x = matrix(rnorm(1000), 100, 10)
> y = rnorm(100)
> system.time(for(i in 1:10000) { apply(x, 2, "-", y) })
   user  system elapsed 
  2.985   0.024   3.039 
> system.time(for(i in 1:10000) { sweep(x, 1, y) })
   user  system elapsed 
  1.180   0.008   1.187  
> system.time(for(i in 1:10000) { x - y })
   user  system elapsed 
  0.070   0.001   0.071 

# Showing that all three methods produce the same result.
> all(apply(x, 2, "-", y) == x - y)
[1] TRUE
> all(sweep(x, 1, y) == x - y)
[1] TRUE

# Fast Linear Model Tips
# Suppose that you have ten predictors in your model.
n = 100
p = 10
y = rnorm(n)
x = matrix(rnorm(n * p), n, p)
mod = lm(y ~ x)

# The Choleski factorization and using solve(crossprod(x)) %*% t(x) %*% y will fail if t(x) %*% x is ill-conditioned.
# Choleski is the fastest method.  But the QR decomposition does not calculate t(x) %*% x and can be more stable.
# Basic usage is to make a QR decomposition of x and then call qr.coef(), qr.fitted() or qr.resid().
x_with_intercept = matrix(c(rep(1, n), x), n, p + 1)
qrx = qr(x_with_intercept)
coef = qr.coef(qrx, y)
# Values are the same.
all(mod$coef == coef)
[1] TRUE

###
# The QR decomposition can be saved and used for permutations.

# Normal method using lm().
nperm = 1000
fstat = rep(1, nperm)
system.time(for(i in 1:nperm) {
  y = sample(y)
  mod = lm(y ~ x)
  fstat[i] = summary(mod)$fstatistic[1]
})
   user  system elapsed 
  3.321   0.024   3.367 

# Using the QR decomposition.
x_with_intercept = matrix(c(rep(1, n), x), n, p + 1)
qrx = qr(x_with_intercept)
sst = crossprod(y - mean(y))
fstat2 = rep(0, nperm)
df1 = ncol(x)
df2 = length(y) - ncol(x) - 1
fstat.factor = df2 / df1
system.time(for(i in 1:nperm) {
  y = sample(y)
  sse = crossprod(qr.resid(qrx, y))
  fstat2[i] = (sst - sse) / sse * fstat.factor
})
   user  system elapsed 
  0.118   0.000   0.120 

#####
# Indexing through 3-dimensional arrays.
# Because of the way that R stores arrays, it's faster to place the longest dimension in dim[[2]] or dim[[3]].
# To demonstrate this, I make three arrays with the long dimension in dim[[1]], [[2]] & [[3]] and show the timing
# of looping through the longest dimension and doing some work.  The savings in this case is a factor of 3.

> y = rnorm(100)
> 
> x = array(rnorm(1000), c(10000, 100, 100))
> system.time(for(i in 1:dim(x)[[1]]) {
+  a = x[i,,] %*% y
+ })
   user  system elapsed 
 19.377   7.495  25.633 
> 
> x = array(rnorm(1000), c(100, 10000, 100))
> system.time(for(i in 1:dim(x)[[2]]) {
+  a = x[,i,] %*% y
+ })
   user  system elapsed 
  4.568   5.003   8.405 
> 
> x = array(rnorm(1000), c(100, 100, 10000))
> system.time(for(i in 1:dim(x)[[3]]) {
+  a = x[,,i] %*% y
+ })
   user  system elapsed 
  4.522   4.996   8.330 

#####
# It's faster to allocate the memory you need before using it.  For example, if
# you are adding numbers to a vector and don't know how many you'll have, it's
# better to pre-allocate too much memory.
a = 0
system.time(for(i in 1:10000) {
  a = c(a, runif(1))
})

a = rep(0, 10000)
system.time(for(i in 1:10000) {
  a[i] = runif(1)
})


Creating binary packages on windows:
====================================
 * Install latest version of R (choosing non-default install location is OK)
 * Install latest version of Rtools (refer to http://cran.r-project.org/doc/manuals/R-admin.html#The-Windows-toolset)
 * make sure that the R and Rtools bin dirs are on your %PATH%
 * If you have your R code in a tarball or zip file you need to unzip it.
 * Enter: "Rcmd check <package_src_dir>" to check on any package errors or warnings
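 * Enter: "Rcmd build --binary <package_src_dir>" on the command line (newer versions of R no longer accept "build --binary"; use "R CMD INSTALL --build <package_src_dir>" instead)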

Package Installation:
====================================
 * install from CRAN: install.packages("packageName", repos = "http://cran.r-project.org") NOTE: the repos arg is optional; the default is to prompt for a mirror
 * remove packages: remove.packages("packageName")
 * find out which library directories packages are installed to: .libPaths()