This repository has been archived by the owner on May 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 53
Example
Hai Qian edited this page Oct 8, 2013
·
10 revisions
library(PivotalR)
## get the help for the package
help("PivotalR-package")
## get the help for one method
help("preview,db.Rquery-method")
## get help for a function
help(madlib.lm)
## connect to a local database
db.connect(port = 14526, dbname = "madlib")
## connect to HAWQ
db.connect(port = 18526, dbname = "madlib")
## connect to DCA
db.connect(host = "dca1-mdw1.dan.dh.greenplum.com", user = "gpadmin",
password = "changeme", dbname = "dstraining")
db.list() # list connections
db.objects() # list all tables in connection 1
db.objects("madlibtestdata.dt") # search table in connection 1
## wrapper of a table in connection 1
x <- db.data.frame("madlibtestdata.dt_abalone")
x
## wrapper of a table in connection 2
y <- db.data.frame("madlibtestdata.dt_abalone", conn.id = 2)
y
dim(x)
names(x) # column names
x$rings
content(x$rings)
lookat(x$rings, 10)
lookat(mean(x$rings)) # mean value of a column
## wrapper of MADlib summary
madlib.summary(y)
lookat(x, 10) # look at a sample of table
lookat(sort(x, FALSE, x$id), 10) # ordered by id
lookat(sort(x, FALSE, NULL), 20) # look at a sample ordered randomly
## ----------------------------------------------------------------------
## ----------------------------------------------------------------------
## linear regression Examples --------------------------------
## fit one different model to each group of data with the same sex
fit <- madlib.lm(rings ~ . - id | sex, data = y)
fit # view the result
length(fit)
fit[[1]]
groups(fit)
groups(fit[[1]])
## apply the model onto data in another database
lookat(mean((x$rings - predict(fit, x))^2)) # mean square error
ap <- cbind(x$rings, predict(fit, x)) # combine two columns
plot(lookat(sort(ap, FALSE, NULL), 100)) # plot a random sample
idx <- which(groups(fit)[[1]] == "I")
ap <- cbind(x$rings[x$sex == "I"], predict(fit[[idx]], x[x$sex == "I",]))
plot(lookat(sort(ap, FALSE, NULL), 100)) # plot a random sample
## ----------------------------------------------------------------------
## Pivoting: categorial variable
v <- x
v$sex <- as.factor(v$sex) # specify which column to pivot
v
names(v)
f <- madlib.lm(rings ~ . - id, data = v)
f
lookat(mean((v$rings - predict(f, v))^2)) # mean square error
## ----------------------------------------------------------------------
## generic bagging
fit <- generic.bagging(function(data) {
madlib.lm(rings ~ . - id - sex, data = data)
},
data = y, nbags = 10, fraction = 0.85)
pred <- predict(fit, newdata = x) # make prediction
lookat(mean((x$rings - pred)^2))
## ---------------------------------------------------------------------
## cross-validation
## no transaction lock number limitation
err <- generic.cv(function(data) {
madlib.lm(rings ~ . - id - sex, data = data)
},
predict,
function(predicted, data) {
lookat(mean((data$rings - predicted)^2))
}, data = y)
err
## ----------------------------------------------------------------------
## logistic regression
g <- madlib.glm(rings < 10 ~ . - id - sex, data = x,
family = "binomial")
g
## accuracy
lookat(mean(((x$rings<10) == predict(g, x))))
## ----------------------------------------------------------------------
## array column support
z <- db.data.frame("madlibtestdata.lin_auto_mpg_oi")
lookat(z, 10)
madlib.lm(y ~ x, data = z)
madlib.lm(y ~ x - `x[1]`, data = z) # `x[1]` is a valid R variable name
## ----------------------------------------------------------------------
## deal with NULL values in the table
delete("null_data", conn.id = 1)
w <- as.db.data.frame(null.data, "null_data", conn.id = 1)
dim(w)
lookat(w, 10)
db.objects("null", conn.id = 2)
madlib.lm(sf_mrtg_pct_assets ~ ., data = w) # will fail
for (i in 1:10) w <- w[!is.na(w[i]),] # filter NULL values
w[is.na(w)] <- 20
dim(w)
madlib.lm(sf_mrtg_pct_assets ~ ., data = w)
## ----------------------------------------------------------------------
## ----------------------------------------------------------------------
## ----------------------------------------------------------------------
## quick prototype linear regression
## Comparison:
## normal R script, computation runs in memory
## cannot run on big data
r.linregr <- function (x, y) {
a <- crossprod(x)
b <- crossprod(x, y)
solve(a) %*% b
}
dat <- lookat(x, "all")
r.linregr(as.matrix(cbind(1, dat[,-c(1,2,10)])), dat$rings)
## ------------------------------------------------
## PivotalR script, computation runs in database
## Assume that the feature number is small (row number is big)
## So the crossprod result is small and can be loaded into memory
## Run on the big data in database
linregr <- function (x, y) {
a <- crossprod(x)
b <- crossprod(x, y)
solve(lookat(a, "all")) %*% lookat(b, "all")
}
linregr(db.array(1, x[,-c(1,2,10)]), x$rings)
## ----------------------------------------------------------------------
## ----------------------------------------------------------------------
## compute all eigenvectors in parallel
## can be used for tables with features < 1000
pca <- function (x, center = TRUE, scale = FALSE) {
y <- scale(x, center = center, scale = scale) # centering and scaling
z <- as.db.data.frame(y, verbose = FALSE) # create an intermediate table to save computation
m <- lookat(crossprod(z)) # one scan of the table to compute Z^T * Z
d <- delete(z) # delete the intermediate table
res <- eigen(m) # only this computation is in R
n <- attr(y, "row.number") # save the computation to count rows
## return the result
list(val = sqrt(res$values/(n-1)), # eigenvalues
vec = res$vectors, # columns of this matrix are eigenvectors
center = attr(y, "scaled:center"),
scale = attr(y, "scaled:scale"))
}
dat <- db.data.frame("madlibtestdata.pca_mat_600_100", conn.id = 2)
q <- pca(dat[,-1])
q$val
q$center
## -------------------------------------------------------------
## ARIMA model
## -------------------------------------------------------------
## Time series data has two columns: time stamp and time series value
x <- db.data.frame("tseries")
dim(x)
names(x)
## one can use expressions
s <- madlib.arima(tval+2 ~ I(tid + 1), x, order = c(2,0,1))
s
lookat(sort(s$residuals, F, s$residuals$tid), 10)
## delete all resulting tables
delete(s)
## expressions can also be used in this interface
s1 <- madlib.arima(x$tval + 1, x$tid, order = c(2,0,1))
s1
lookat(sort(s1$residuals, F, s1$residuals$tid_opr), 10)
s1$exec.time
pred <- predict(s1, n.ahead = 10)
lookat(pred)