# PracticalMachLearning-2.R
setwd("D:\\study\\DataScience")
#1. prediction with trees
#pros: easy to interpret; better performance in nonlinear settings
#cons: without pruning/cross-validation can lead to overfitting; results may be variable
#basic algorithm
#start with all variables in one group
#find the variable/split that best separates the outcomes, and divide the data into two groups ("leaves") at that split ("node")
#within each split, find the best variable/split that separates the outcomes
#continue until the groups are too small or sufficiently "pure"
#measures of impurity (p_mk = proportion of class k in leaf m):
#misclassification error: 1 minus the proportion of the most common class in the leaf (0=perfect purity; 0.5=no purity)
#Gini Index: 1-sum{k=1~K}((p_mk)^2) (0=perfect purity; 0.5=no purity)
#Deviance/Information Gain: -sum{k=1~K}((p_mk)*log2(p_mk)) (0=perfect purity; 1=no purity for two classes, since -0.5*log2(0.5)-0.5*log2(0.5)=1)
#Classification Trees use interactions between variables
#Data transformation may be less important.
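#as a quick illustration of the three impurity measures above (a small sketch, not course code):
#consider a hypothetical leaf containing 15 observations of one class and 1 of another
p<-c(15,1)/16                       #class proportions p_mk within the leaf
misclass<-1-max(p)                  #misclassification error = 1/16 ~ 0.06
gini<-1-sum(p^2)                    #Gini index ~ 0.12
info<-(-sum(p*log2(p)))             #deviance/information ~ 0.34
c(misclass=misclass,gini=gini,information=info)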
data(iris)
library(ggplot2)
names(iris)
table(iris$Species)
library(caret) #createDataPartition() comes from caret
inTrain<-createDataPartition(y=iris$Species,p=0.7,list=FALSE)
training<-iris[inTrain,]
testing<-iris[-inTrain,]
dim(training);dim(testing)
#iris petal widths/sepal width
qplot(Petal.Width,Sepal.Width,color=Species,data=training)
#we can see three fairly distinct classes; separating them could be a bit challenging for a linear model.
library(caret)
modFit<-train(Species~.,method='rpart',data=training)
#rpart: Recursive Partitioning and Regression Trees. predict(obj,type="prob")
print(modFit$finalModel)
par(mar=c(2,2,2,2))
plot(modFit$finalModel,uniform=TRUE,main="Classification Tree")
#note that this tree is plotted with a different method than the dendrograms used in exploratory data analysis
text(modFit$finalModel,use.n=TRUE,all=TRUE,cex=0.8)
#prettier plots
if(!require("rattle")){install.packages("rattle")}
library(rattle)
if(!require("rpart.plot")){install.packages("rpart.plot")}
library(rpart.plot)
fancyRpartPlot(modFit$finalModel)
#predict new values
testPredict<-predict(modFit,newdata=testing)
table(testPredict,testing$Species)
#trees can also be used for regression problems (continuous outcomes) -- these are regression trees
#2. Bagging -- short for bootstrap aggregating.
#the basic idea is that when you fit complicated models, sometimes if you average those models together,
#you get a smoother model fit. That gives you a better balance between potential bias and variance in your fit.
#Similar bias for each model and reduced variance
#more useful for non-linear functions
#Basic Idea:
#(1)Resample cases and recalculate predictions
#(2)Average or majority vote
if(!require("ElemStatLearn")){install.packages("ElemStatLearn")}
library(ElemStatLearn); #data(ozone,package="ElemStatLearn")
ozone<-ozone[order(ozone$ozone),]
head(ozone)
dim(ozone)
#predict temperature as a function of ozone.
#Bagged loess(Bagged Local Polynomial Regression Fitting)
ll<-matrix(NA,nrow=10,ncol=155)
for(i in 1:10){
ss<-sample(1:dim(ozone)[1],replace=T) #length(ss) = 111; why sample with replacement here?
#print(ss) #if we sampled the full data without replacement, every training set would be identical and so would every prediction
#Bootstrapping means sampling with replacement: each bootstrap sample has the same size as the original data but a different composition (some rows repeated, some left out)
#if an element of ss is 109, we retrieve the 109th row of ozone (careful: the 109th row counting from the top, not the row labeled "109")
ozone0<-ozone[ss,]; #if the 63rd row is drawn twice, the duplicate is labeled 63.1
ozone0<-ozone0[order(ozone0$ozone),]
loess0<-loess(temperature~ozone,data=ozone0,span=0.2) #the only predictor is "ozone"
ll[i,]<-predict(loess0,newdata=data.frame(ozone=1:155))
}
par(mar=c(5.1,4.1,4.1,2.1))
plot(ozone$ozone,ozone$temperature,pch=19,cex=0.5)
for(i in 1:10){lines(1:155,ll[i,],col="grey",lwd=1)}
lines(1:155,apply(ll,2,median),col="red",lwd=2)
#some models perform bagging for you; in the train function, consider these method options:
#-bagEarth: A bagging wrapper for multivariate adaptive regression splines (MARS) via the earth function
#-treebag;
#-bagFDA:A bagging wrapper for flexible discriminant analysis (FDA) using multivariate adaptive regression splines (MARS) basis functions
#alternatively, you can bag any model you choose using the bag function.
#bag(x, y, B = 10, vars = ncol(x), bagControl = NULL, ...)
if(!require("party")){install.packages("party")}
library(party)
predictors<-data.frame(ozone=ozone$ozone)
temperature=ozone$temperature
treebag <- bag(predictors, temperature, B = 10,
bagControl = bagControl(fit = ctreeBag$fit, #the function (ctree in this case) applied to fit the model
predict = ctreeBag$pred, #a call to the predict function from a trained model
#ctreeBag$pred(object, x): takes an object created by ctreeBag$fit and a new data set x, and returns a new prediction
aggregate = ctreeBag$aggregate)) #the way that we'll put predictions together and take the median
#bagControl: ldaBag, plsBag, nbBag, ctreeBag(conditional regression tree), svmBag, nnetBag
#a function in ctreeBag$pred: treeresponse(object, newdata = NULL, ...): compute statistics for the conditional distribution
#of the response as modelled by the tree. For regression problems, this is just the mean. For
#nominal or ordered responses, estimated conditional class probabilities are returned. Kaplan-Meier
#curves are computed for censored responses. Note that a list with one element for each
#observation is returned.
#predict(treebag,predictors)
plot(ozone$ozone,temperature,col='lightgrey',pch=19)
points(ozone$ozone,predict(treebag$fits[[1]]$fit,predictors),pch=19,col="red")
points(ozone$ozone,predict(treebag$fits[[2]]$fit,predictors),pch=19,col="red")
points(ozone$ozone,predict(treebag,predictors),pch=19,col="blue")
#another example
data(mdrr)
## remove some zero variance predictors and linear dependencies
mdrrDescr <- mdrrDescr[, -nearZeroVar(mdrrDescr)]
mdrrDescr <- mdrrDescr[, -findCorrelation(cor(mdrrDescr), .95)]
basicLDA <- train(mdrrDescr, mdrrClass, "lda")
bagLDA2 <- train(mdrrDescr, mdrrClass, method="bag", B = 10,
bagControl = bagControl(fit = ldaBag$fit,
predict = ldaBag$pred,
aggregate = ldaBag$aggregate),
tuneGrid = data.frame(vars = c((1:10)*10 , ncol(mdrrDescr))))
#The final value used for the model was vars = 20.
#Conditional inference trees vs traditional decision trees
#the main difference seems to be that ctree uses a covariate selection scheme that is
#based on statistical theory (i.e. selection by permutation-based significance tests)
#and thereby avoids a potential bias in rpart, otherwise they seem similar; e.g.
#conditional inference trees can be used as base learners for Random Forests.
#3. Random Forest - an extension to bagging for classification and regression trees
#Process:
#-Bootstrap samples.
#-here is the difference: At each split, we also bootstrap variables. This makes for
#a diverse set of potential trees that can be built.
#-Grow a large number of trees and vote or average those trees
#Pros: Accuracy
#Cons: speed(slow); overfitting
#interpretability (hard): each tree is built on a bootstrap sample with bootstrapped variables at each node, and the final prediction aggregates many such trees
data(iris)
library(ggplot2)
inTrain<-createDataPartition(y=iris$Species, p=0.7, list=FALSE)
training<-iris[inTrain,]
testing<-iris[-inTrain,]
library(caret)
library(randomForest)
modFit<-train(Species~., data=training, method="rf",prox=TRUE) #rf is random forest
#Proximities: The proximities originally formed a NxN matrix. After a tree is grown, put
#all of the data, both training and oob, down the tree. If cases/records k and n are in the
#same terminal node increase their proximity by one. At the end, normalize the
#proximities by dividing by the number of trees.
modFit
#getting a single tree from a forest -- getTree
getTree(modFit$finalModel,k=2)
#result: (terminal means leaf)
#each row corresponds to a particular node/split; the columns describe it:
#split var: which variable we're splitting on. split point: the value at which that variable is split.
#status: is the node terminal (-1) or not (1)
#prediction: what the prediction would be out of that particular split. 0 if the node is not terminal
modFit$finalModel
names(modFit$finalModel)
#class "centers"
irisP<-classCenter(training[,c(3,4)],training$Species,modFit$finalModel$prox)
#classCenter(x, label, prox, nNbr = min(table(label))-1)
#prox: the proximity (or similarity) matrix, assumed to be symmetric with 1 on the
#diagonal and in [0, 1] off the diagonal (the order of row/column must match that of x)
class(irisP) #matrix
irisP
irisP<-as.data.frame(irisP)
irisP$Species<-rownames(irisP)
p<-qplot(Petal.Width,Petal.Length,col=Species,data=training)
p+geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
#predicting new values
pred<-predict(modFit,testing)
testing$predRight<-pred==testing$Species
sum(testing$predRight)/length(testing$predRight)
table(pred,testing$Species)
#Random forests are usually one of the two top-performing algorithms, along with boosting.
#Care should be taken to avoid overfitting (see the rfcv function, or the resampling options in caret's train).
?rfcv #random forest cross-validation
#rfcv(trainx, trainy, cv.fold=5, scale="log", step=0.5,
# mtry=function(p) max(1, floor(sqrt(p))), recursive=FALSE, ...)
set.seed(647)
myiris <- cbind(iris[1:4], matrix(runif(96 * nrow(iris)), nrow(iris), 96))
result <- rfcv(myiris, iris$Species, cv.fold=3)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))
result <- replicate(5, rfcv(myiris, iris$Species), simplify=FALSE)
#replicate(n, expr, simplify = "array")
error.cv <- sapply(result, "[[", "error.cv")
matplot(result[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type="l",
lwd=c(2, rep(1, ncol(error.cv))), col=1, lty=1, log="x",
xlab="Number of variables", ylab="CV Error")
#4. Boosting - take a weighted sum of weak predictors
#iterative, select one classifier at each step, calculate weights based on errors
#upweight missed classifications and select next h
#weight alpha=0.5*ln((1-error)/error)
#Boosting can be used with any subset of classifiers
#one large subclass is gradient boosting
#R libraries: most are available in caret package
#- gbm: boosting with trees
#- mboost: model based boosting
#- ada: statistical boosting based on additive logistic regression
#- gamBoost: boosting generalized additive models
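#before the Wage/gbm example below, a minimal AdaBoost-style sketch with rpart stumps,
#only to make the weighting scheme above concrete (an illustration under simple assumptions,
#not the gbm implementation; the simulated data and stump learner are made up for this demo)
library(rpart)
set.seed(125)
n<-200
x1<-runif(n); x2<-runif(n)
yy<-factor(ifelse(x1+x2+rnorm(n,sd=0.3)>1,"A","B"))
dat<-data.frame(x1,x2,yy)
ypm<-ifelse(yy=="A",1,-1)                      #recode the outcome as +1/-1
w<-rep(1/n,n)                                  #start with equal case weights
Fscore<-rep(0,n)                               #additive score; its sign gives the class
for(m in 1:10){
  stump<-rpart(yy~x1+x2,data=dat,weights=w,control=rpart.control(maxdepth=1))
  h<-ifelse(predict(stump,dat,type="class")=="A",1,-1)
  err<-sum(w*(h!=ypm))/sum(w)                  #weighted error of this weak learner
  err<-min(max(err,1e-10),1-1e-10)             #guard against degenerate 0/1 error
  alpha<-0.5*log((1-err)/err)                  #classifier weight alpha
  w<-w*exp(-alpha*h*ypm); w<-w/sum(w)          #upweight the missed classifications
  Fscore<-Fscore+alpha*h                       #weighted sum of weak predictors
}
table(boosted=ifelse(Fscore>0,"A","B"),truth=yy) #training accuracy of the ensemble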
library(ISLR);data(Wage);library(ggplot2);library(caret)
Wage<-subset(Wage,select=-c(logwage))
inTrain<-createDataPartition(y=Wage$wage,p=0.7,list=FALSE)
training<-Wage[inTrain,]
testing<-Wage[-inTrain,]
library(gbm)
modFit<-train(wage~.,method="gbm",data=training,verbose=FALSE)
modFit
qplot(predict(modFit,testing),wage,data=testing,xlim=c(50,300),ylim=c(50,300))
#5. Model-based Prediction
#Basic idea:
#(1) Assume the data follow a probabilistic model
#(2) Use Bayes' theorem to identify optimal classifiers
#Goal is to build a parametric model for the conditional distribution P(Y=k|X=x)
#A typical approach is to apply Bayes' theorem: P(Y=k|X=x) = (f(x)_k * pi_k)/(sum{l=1~K} f(x)_l * pi_l)
#pi_k is P(Y=k) ; f(x)_k is P(X=x|Y=k)
#typical prior probabilities pi_k are set in advance
#f(x)_k often uses normal distribution (estimate mean and variance from data)
#classify to the class with the highest value of P(Y=k|X=x) #same logic as maximum likelihood classification
#A range of models use this approach
#Linear Discriminant Analysis: assumes f(x)_k is multivariate Gaussian with same covariances (covariances across classes are identical)
#Quadratic Discriminant Analysis: assumes f(x)_k is multivariate Gaussian with different covariances (covariances across classes are different)
#Naive Bayes assumes independence between features for model building (the covariance matrix is a diagonal matrix)
#Model based prediction assumes more complicated versions for the covariance matrix
#Linear discriminant analysis
#log(P(Y=k|X=x)/P(Y=j|X=x)) = log(f(x)_k/f(x)_j) + log(pi_k/pi_j)
#= log(pi_k/pi_j) - 0.5*(mu_k+mu_j)' * sigma^(-1) * (mu_k-mu_j) + x' * sigma^(-1) * (mu_k-mu_j)
#discriminant function: d(x)_k = x' * sigma^(-1) * mu_k - 0.5*mu_k' * sigma^(-1) * mu_k + log(pi_k)
#decide on class based on Y(x)_hat = argmax_k(d(x)_k)
#we usually estimate parameters with MLE
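#to make the discriminant function concrete, a small hand computation on iris
#(a sketch assuming a pooled covariance and priors taken from the class proportions;
#MASS::lda below is the standard implementation)
X<-as.matrix(iris[,1:4])
classes<-levels(iris$Species)
mus<-lapply(classes,function(k) colMeans(X[iris$Species==k,]))
Sp<-Reduce(`+`,lapply(classes,function(k){
  Xk<-scale(X[iris$Species==k,],scale=FALSE)   #center each class around its mean
  t(Xk)%*%Xk
}))/(nrow(X)-length(classes))                  #pooled within-class covariance
Sinv<-solve(Sp)
priors<-table(iris$Species)/nrow(X)
d<-sapply(seq_along(classes),function(k){      #d(x)_k = x'S^-1 mu_k - 0.5 mu_k'S^-1 mu_k + log(pi_k)
  drop(X%*%Sinv%*%mus[[k]])-0.5*drop(t(mus[[k]])%*%Sinv%*%mus[[k]])+log(priors[[k]])
})
table(manual=classes[max.col(d)],truth=iris$Species) #classify to the class with the largest d(x)_k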
data(iris)
cols=c(rgb(1,0,0,alpha=.5),rgb(0,1,0,alpha=.5),rgb(0,0,1,alpha=.5))
pairs(iris[,1:4],col=cols[as.numeric(iris$Species)],pch=19,gap=0)
library(MASS) #lda() comes from MASS
fit <- lda(Species~.,iris)
plot(fit,abbrev=TRUE)
fit
table(predict(fit)$class,iris$Species)
ind <- c(sample(which(iris$Species=="setosa"),5),
sample(which(iris$Species=="versicolor"),5),
sample(which(iris$Species=="virginica"),5))
Train <- iris[ind,]
Test <- iris[-ind,]
fit <- lda(Species~.,Train)
pred <- predict(fit,Test)
table(pred$class,Test$Species)
#Naive Bayes - does something more to simplify the problem
#logic: suppose we have many predictors: we would want to model: P(Y=k|X1,...,Xm)
#By Bayes' theorem: P(Y=k|X1,...,Xm) = [pi_k * P(X1,...,Xm|Y=k)]/sum{l=1~K}(P(X1,...,Xm|Y=l)*pi_l)
# ~ pi_k * P(X1,...,Xm|Y=k)
# = pi_k * P(X1|Y=k) * P(X2,...,Xm|X1,Y=k)
# = pi_k * P(X1|Y=k) * P(X2|X1,Y=k) * P(X3,...,Xm|X1,X2,Y=k)
# = pi_k * P(X1|Y=k) * P(X2|X1,Y=k) * ... * P(Xm|X1,X2,...,Xm-1,Y=k)
# = pi_k * P(X1|Y=k) * P(X2|Y=k) * ... * P(Xm|Y=k), under the naive assumption that the predictors are conditionally independent given Y=k
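#a minimal hand-rolled Gaussian naive Bayes (a sketch: it assumes each predictor is normal
#within each class; the caret/klaR "nb" method below is the standard route)
naiveBayesManual<-function(trainx,trainy,newx){
  classes<-levels(trainy)
  priors<-table(trainy)/length(trainy)
  logpost<-sapply(classes,function(k){
    mu<-colMeans(trainx[trainy==k,,drop=FALSE])
    s<-apply(trainx[trainy==k,,drop=FALSE],2,sd)
    #log(pi_k) + sum_j log P(Xj|Y=k), using the conditional-independence assumption
    log(priors[[k]])+rowSums(sapply(seq_along(mu),function(j) dnorm(newx[,j],mu[j],s[j],log=TRUE)))
  })
  classes[max.col(logpost)]
}
#e.g. table(naiveBayesManual(iris[,1:4],iris$Species,iris[,1:4]),iris$Species)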
data(iris);library(ggplot2)
names(iris)
table(iris$Species)
inTrain<-createDataPartition(iris$Species,p=0.7,list=FALSE)
training<-iris[inTrain,]
testing<-iris[-inTrain,]
dim(training);dim(testing)
library(klaR)
modlda<-train(Species~.,data=training,method="lda")
modnb<-train(Species~.,data=training,method="nb")
plda<-predict(modlda,testing);
pnb<-predict(modnb,testing);
table(plda,pnb)
table(plda,testing$Species)
table(pnb,testing$Species)
#lda performs a little better than naive Bayes here
#comparison of results
equalPredictions = (plda==pnb)
qplot(Petal.Width,Sepal.Width,color=equalPredictions,data=testing)
#5. Regularized Regression
#Basic idea: fit a regression model + penalize (or shrink) large coefficients
#Pros: can help with the bias/variance tradeoff - if certain variables are highly correlated
#with each other, you might not want to include both in a linear regression
#model, since their coefficient estimates will have very high variance; but leaving one of them out might
#slightly bias your model (you lose a little prediction capability);
#can also help with model selection in certain cases, e.g. regularization techniques like the lasso.
# regression subset selection in the prostate dataset
library(ElemStatLearn);data(prostate)
str(prostate)
head(prostate)
covnames <- names(prostate[-(9:10)])
y <- prostate$lpsa
x <- prostate[,covnames]
form <- as.formula(paste("lpsa~", paste(covnames, collapse="+"), sep=""))
class(form) #"formula"
summary(lm(form, data=prostate[prostate$train,]))
set.seed(1)
train.ind <- sample(nrow(prostate), ceiling(nrow(prostate)/2))
y.test <- prostate$lpsa[-train.ind]
x.test <- x[-train.ind,]
y <- prostate$lpsa[train.ind]
x <- x[train.ind,]
p <- length(covnames)
rss <- list() #create an empty list
for (i in 1:p) {
#cat(i) #good idea!
Index <- combn(p,i)
#combn(x, m, FUN = NULL, simplify = TRUE, ...)
#x: vector source for combinations, or integer n for x <- seq_len(n).
#m: number of elements to choose
#cat(paste("index",Index))
rss[[i]] <- apply(Index, 2, function(is){ #apply by columns
form <- as.formula(paste("y~", paste(covnames[is], collapse="+"), sep=""))
#"lcavol" "lweight" "age" "lbph" "svi" "lcp" "gleason" "pgg45"
cat(paste(form),"\n")
isfit <- lm(form, data=x)
yhat <- predict(isfit) #if not newdata, don't have to specify data
train.rss <- sum((y - yhat)^2)
yhat <- predict(isfit, newdata=x.test)
test.rss <- sum((y.test - yhat)^2)
c(train.rss, test.rss)
})
}
#png("./PracticalMachLearning/selection-plots-01.png", height=432, width=432, pointsize=12)
plot(1:p, 1:p, type="n", ylim=range(unlist(rss)), xlim=c(0,p), xlab="number of predictors", ylab="residual sum of squares", main="Prostate cancer data")
for (i in 1:p) {
points(rep(i-0.15, ncol(rss[[i]])), rss[[i]][1, ], col="blue")
points(rep(i+0.15, ncol(rss[[i]])), rss[[i]][2, ], col="red")
}
minrss <- sapply(rss, function(x) min(x[1,]))
lines((1:p)-0.15, minrss, col="blue", lwd=1.7)
minrss <- sapply(rss, function(x) min(x[2,]))
lines((1:p)+0.15, minrss, col="red", lwd=1.7)
legend("topright", c("Train", "Test"), col=c("blue", "red"), pch=1)
#dev.off()
#we can see apparent overfitting in the plot above: the training RSS keeps dropping as predictors are added, while the test RSS stops improving.
#Model selection Approach - Split Samples
#(1)divide data into training/test/validation
#(2)treat validation as test data, train all competing models on the training data
#and pick the best one on validation.
#(3)to appropriately assess performance on new data, apply to test set
#(4)You may re-split and reperform steps 1-3
#Model selection Approach - Decomposing Expected Prediction Error
#Assume Yi= f(Xi)+ei. EPE(lambda) = E[{Y-f(X)_lambda}^2], where f(X)_lambda is the estimate from the
#training data (predicted outcome).
#look at a new data point X=x*:
#E[{Y-f(x*)_lambda}^2] = sigma^2 + {E[f(x*)_lambda]-f(x*)}^2 + var[f(x*)_lambda]
#= irreducible error + Bias^2 + Variance of our estimate
#trading off bias and variance is the idea behind regularized regression
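#a quick Monte Carlo check of this decomposition at a single point x* (a sketch using a
#deliberately biased, intercept-only estimator; the simulated setup is made up for the demo)
set.seed(100)
f<-function(x) sin(2*pi*x); sigma<-0.3; xstar<-0.9
fhat<-replicate(2000,{
  xx<-runif(50); yy<-f(xx)+rnorm(50,0,sigma)
  mean(yy)                                     #intercept-only estimate of f(x*)
})
bias2<-(mean(fhat)-f(xstar))^2
vari<-var(fhat)
epe<-mean((f(xstar)+rnorm(2000,0,sigma)-fhat)^2) #E[{Y*-fhat(x*)}^2] over new noise and new training sets
c(EPE=epe,decomposed=sigma^2+bias2+vari)         #the two numbers should roughly agree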
#Another issue for high-dimensional data -- # of predictors is more than # of records/observations
small=prostate[1:5,]
lm(lpsa~.,data=small) #some coefficients come back as NA: R cannot estimate them because there are more predictors than observations, so the design matrix cannot be inverted
#An approach to this problem
#suppose model is Y=f(X)+e; set f(x)_lambda = x'*beta
#constrain only lambda of the coefficients to be nonzero
#the selection problem is then: having chosen lambda, figure out which lambda coefficients should be nonzero
#Another approach is to regularization for regression
#to control variance, we might regularize/shrink the coefficients:
#PRSS(beta) = sum{j=1~n}((Yj - sum{i=1~m}(beta_i * Xij))^2) + P(lambda;beta), where PRSS is a penalized form of the residual sum of squares
#Penalty reduces complexity, variance.
##
# ridge regression on prostate dataset
# ridge regression penalizes the size of the regression coefficients
# solve: sum{i=1~N}((Yi - beta0 - sum{j=1~p}(betaj * Xij))^2) + lambda * sum{j=1~p}((betaj)^2)
#equivalent to solving: sum{i=1~N}((Yi - beta0 - sum{j=1~p}(betaj * Xij))^2) subject to sum{j=1~p}((betaj)^2) <= s, where s is inversely proportional to lambda
#inclusion of lambda makes the problem non-singular even if X'X is not invertible (collinearity): beta_hat = (X'X + lambda*I)^(-1) * X'y
#In the special case of an orthonormal design matrix: beta_ridge_hat = beta_OLS/(1+lambda) -> This illustrates the essential feature of ridge regression: shrinkage
#Applying the ridge regression penalty has the effect of shrinking the estimates toward zero - introducing bias but reducing the variance of the estimate
#http://web.as.uky.edu/statistics/users/pbreheny/764-F11/notes/9-1.pdf
#The benefits of ridge regression are most striking in the presence of multicollinearity
library(MASS) #lm.ridge() comes from MASS
x1 <- rnorm(20)
x2 <- rnorm(20,mean=x1,sd=.01)
y <- rnorm(20,mean=3+x1+x2)
lm(y~x1+x2)$coef
lm.ridge(y~x1+x2,lambda=1)
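#the closed form above can be checked directly on this toy collinear example (a sketch:
#predictors centered and the intercept handled separately, so it will not match lm.ridge's
#internally rescaled coefficients exactly)
Xc<-scale(cbind(x1,x2),scale=FALSE)              #center the two predictors
yc<-y-mean(y)
lambda<-1
solve(crossprod(Xc)+lambda*diag(2),crossprod(Xc,yc)) #beta_hat=(X'X+lambda*I)^(-1)X'y, shrunken vs lm()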
#The variance of the ridge regression estimate is Var(beta_hat)=sigma^2*W*X'*X*W, where W = (X'X+lambda*I)^(-1)
#the bias of the ridge regression is: Bias(beta_hat) = -lambda*W*beta
#penalized regression can be interpreted in a Bayesian context,
#even if the model we fit is exactly correct and follows the exact distribution we specify, we can always obtain a better estimator by shrinking towards zero
####example from UKY.edu
data(prostate)
prostate <- prostate[,-ncol(prostate)]
fit <- lm.ridge(lpsa~.,prostate,lambda=seq(0,100,len=501))
plot(fit)
fit$lambda[which.min(fit$GCV)] #generalized cross-validation value
## More interesting lambda values
lam <- c(0,exp(seq(log(0.01),log(1e8),len=201)))
fit <- lm.ridge(lpsa~.,prostate,lambda=lam)
## Calculating df
XX <- scale(model.matrix(lpsa~0+.,prostate),center=fit$xm,scale=fit$scales)
l <- eigen(crossprod(XX))$values #crossprod(XX) is equivalent to t(XX)%*%(XX)
fit$df <- numeric(length(fit$lambda))
for (i in 1:length(fit$lambda)) fit$df[i] <- sum(l/(l+fit$lambda[i]))
## Cooler plot
matplot(c(fit$df,0),rbind(coef(fit)[,-1],0),type="l",lwd=2,lty=1,col="slateblue",xlab="Degrees of freedom",ylab="Coefficients",xlim=c(0,max(fit$df)+1))
x <- pretty(c(fit$df,0))
xx <- numeric(length(x))
xx[1] <- expression(infinity)
for (i in 2:(length(x)-1)) xx[i] <- round(fit$lambda[which.min(abs(fit$df-x[i]))])
xx[length(xx)] <- 0
axis(3,at=pretty(c(fit$df,0)),labels=xx)
mtext(expression(lambda),line=2.5)
text(max(x)+0.75,coef(fit)[1,-1],labels=colnames(XX))
## Model selection - use AIC/BIC or cross validation to choose lambda
#In order to apply AIC or BIC to the problem of choosing lambda, we will need an estimate of the degrees of freedom
#Recall that in linear regression: y_hat = H * y, where H was the projection matrix, and tr(H) = p, the degree of freedom
#Ridge regression is also a linear estimator (y_hat= H * y), with H_ridge=X(X'X+lambda*I)^(-1)*X', and df = tr(H)
#we could show that df_ridge=sum(lambda_i/(lambda_i+lambda)), where lambda_i are the eigenvalues of X'X
#The main point is to note that df is a decreasing function of lambda with df = p at lambda = 0 and df = 0 at lambda = Inf.
#AIC = n*log(RSS) + 2df
#BIC = n*log(RSS) + df*log(n)
n <- nrow(prostate)
X <- model.matrix(lpsa~.,prostate)
fit$RSS <- apply(prostate$lpsa-X%*%t(coef(fit)),2,crossprod)
AIC <- n*log(fit$RSS) + 2*fit$df
BIC <- n*log(fit$RSS) + log(n)*fit$df
abline(v=fit$df[which.min(AIC)],col="red")
abline(v=fit$df[which.min(BIC)],col="green")
# leave-one-out ("deleted") residuals: y_i - y(-i)_i_hat = r_i/(1-H_ii)
# calculating H is computationally inefficient. GCV is often used instead, replacing H_ii by the average of all diagonal elements, tr(H)/n
# GCV = 1/n * sum{i}{[(y_i - y_i_hat)/(1-tr(H)/n)]^2}
abline(v=fit$df[which.min(fit$GCV)])
## Other plots
par(mfrow=c(1,2))
plot(fit$lambda,fit$df,log="x",type="l",xlab=expression(lambda),ylab="Degrees of freedom",lwd=3,xaxt="n")
axis(1,at=c(1e-2,1e1,1e4,1e7),labels=c(0.01,10,expression(10^4),expression(10^7)))
plot(fit$df,fit$RSS,type="l",xlab="Degrees of freedom",ylab="RSS",lwd=3)
plot(fit$df,AIC-min(AIC),type="l",ylim=c(0,20),lwd=3,col="red",ylab="Criterion - min",xlab="Degrees of freedom")
lines(fit$df,BIC-min(BIC),type="l",lwd=3,col="green")
g <- fit$GCV*n^2
lines(fit$df,g-min(g),type="l",lwd=3)
par(mfrow=c(1,1))
## Comparison with OLS
fit.ols <- lm(lpsa~.,prostate)
sig2 <- as.numeric(crossprod(fit.ols$residuals)/fit.ols$df.residual)
b.o <- coef(fit.ols)[-1]
se.o <- sqrt(diag(vcov(fit.ols)))[-1]
k <- which.min(fit$GCV)
XX <- cbind(1,scale(model.matrix(lpsa~0+.,prostate),center=fit$xm,scale=fit$scales))
b.r <- coef(fit)[k,-1]
l <- fit$lambda[k]
W <- solve(crossprod(XX)+diag(c(0,rep(l,ncol(XX)-1))))
V <- sig2*W%*%crossprod(XX)%*%W
se.r <- sqrt(diag(V))[-1]/fit$scales
z.o <- b.o/se.o
z.r <- b.r/se.r
########### example from lecture note
library(ElemStatLearn);data(prostate)
covnames <- names(prostate[-(9:10)])
y <- prostate$lpsa
x <- prostate[,covnames]
form <- as.formula(paste("lpsa~", paste(covnames, collapse="+"), sep=""))
class(form) #"formula"
summary(lm(form, data=prostate[prostate$train,]))
set.seed(1)
train.ind <- sample(nrow(prostate), ceiling(nrow(prostate)/2))
y.test <- prostate$lpsa[-train.ind]
x.test <- x[-train.ind,]
y <- prostate$lpsa[train.ind]
x <- x[train.ind,]
p <- length(covnames)
rss <- list() #create an empty list
#lambdas <- seq(0,50,len=10)
lambdas <- seq(0,50,by=0.1)
M <- length(lambdas)
train.rss <- rep(0,M)
test.rss <- rep(0,M)
betas <- matrix(0,ncol(x),M)
library(caret);library(lars);library(MASS)
for(i in 1:M){
Formula <-as.formula(paste("y~",paste(covnames,collapse="+"),sep=""))
fit1 <- lm.ridge(Formula,data=x,lambda=lambdas[i]) #lambdas is the penalty factor
betas[,i] <- fit1$coef
scaledX <- sweep(as.matrix(x),2,fit1$xm) #xm: column means of x matrix
#sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...) #default is substract
scaledX <- sweep(scaledX,2,fit1$scale,"/")
#divide by fit1$scale (R's partial matching finds lm.ridge's $scales component)
#xm<-apply(x,2,mean) # same as fit1$xm
#could almost use: scaledX<-apply(x,2,scale), but scale() divides by the sample (n-1) standard deviation while lm.ridge's scales use an n denominator, so the results differ slightly.
yhat <- scaledX%*%fit1$coef+fit1$ym
train.rss[i] <- sum((y - yhat)^2)
scaledX <- sweep(as.matrix(x.test),2,fit1$xm)
scaledX <- sweep(scaledX,2,fit1$scale,"/")
yhat <- scaledX%*%fit1$coef+fit1$ym
test.rss[i] <- sum((y.test - yhat)^2)
}
par(mfrow=c(1,1))
#png(file="./PracticalMachLearning/selection-plots-02.png", width=432, height=432, pointsize=12)
plot(lambdas,test.rss,type="l",col="red",lwd=2,ylab="RSS",ylim=range(train.rss,test.rss))
lines(lambdas,train.rss,col="blue",lwd=2,lty=2)
best.lambda <- lambdas[which.min(test.rss)]
abline(v=best.lambda+1/9)
legend(30,30,c("Train","Test"),col=c("blue","red"),lty=c(2,1))
#dev.off()
#png(file="./PracticalMachLearning/selection-plots-03.png", width=432, height=432, pointsize=8)
plot(lambdas,betas[1,],ylim=range(betas),type="n",ylab="Coefficients")
for(i in 1:ncol(x))
lines(lambdas,betas[i,],type="b",lty=i,pch=as.character(i))
abline(h=0)
legend("topright",covnames,pch=as.character(1:8),cex = 0.6)
#dev.off()
#from the plot we can see that the betas start out equal to the standard OLS estimates when lambda is 0;
#as lambda increases, all of the coefficients shrink toward 0, because the penalty makes them smaller;
#so the tuning parameter lambda controls the size of the coefficients and the amount of regularization.
#######
# lasso
#http://statweb.stanford.edu/~tibs/lasso/simple.html
#Given a set of input measurements x1, x2 ...xp and an outcome measurement y, the lasso fits a linear model
#yhat=b0 + b1*x1+ b2*x2 + ... bp*xp
#The criterion it uses is: minimize sum{i=1~N}((Yi - beta0 - sum{j=1~p}(betaj * Xij))^2) subject to sum{j=1~p}|beta_j| <= s
#The first sum is taken over observations (cases) in the dataset. The bound "s" is a tuning parameter.
#When "s" is large enough, the constraint has no effect and the solution is just the usual multiple linear least squares regression of y on x1, x2, ...xp.
#However, for smaller values of s (s>=0) the solutions are shrunken versions of the least squares estimates. Often, some of the coefficients bj are exactly zero.
#for orthonormal design matrix, this has a closed form solution:
#beta_j_hat = sign(beta_j_hat_OLS)(|beta_j_hat_OLS|-gamma)^(+),
#where (x)^(+) = x when x>0 and (x)^(+) = 0 when x<=0; gamma is a threshold determined by the bound s
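#the soft-thresholding closed form is easy to write down directly (a sketch assuming an
#orthonormal design; gamma is the threshold implied by the bound s)
softThreshold<-function(beta_ols,gamma) sign(beta_ols)*pmax(abs(beta_ols)-gamma,0)
softThreshold(c(-2.1,0.3,1.5),gamma=0.5)          #small coefficients are set exactly to 0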
#Choosing "s" is like choosing the number of predictors to use in a regression model, and
#cross-validation is a good tool for estimating the best value for "s".
#The computation of the lasso solutions is a quadratic programming problem, and
#can be tackled by standard numerical analysis algorithms. But the least angle
#regression procedure is a better approach.
#The least angle regression procedure follows the same general scheme as
#forward stepwise regression, but doesn't add a predictor fully into the model.
#The coefficient of that predictor is increased only until that predictor is no
#longer the one most correlated with the residual r. Then some other competing
#predictor is invited to "join the club".
#Start with all coefficients bj equal to zero.
#Find the predictor xj most correlated with y
#Increase the coefficient bj in the direction of the sign of its correlation with y. Take residuals r=y-yhat along the way. Stop when some other predictor xk has as much correlation with r as xj has.
#Increase (bj, bk) in their joint least squares direction, until some other predictor xm has as much correlation with the residual r.
#Continue until: all predictors are in the model
library(ElemStatLearn);data(prostate);library(MASS);library(caret)
covnames <- names(prostate[-(9:10)])
y <- prostate$lpsa
x <- prostate[,covnames]
form <- as.formula(paste("lpsa~", paste(covnames, collapse="+"), sep=""))
set.seed(1)
train.ind <- sample(nrow(prostate), ceiling(nrow(prostate)/2))
y.test <- prostate$lpsa[-train.ind]
x.test <- x[-train.ind,]
y <- prostate$lpsa[train.ind]
x <- x[train.ind,]
p <- length(covnames)
rss <- list() #create an empty list
library(lars) #(LARS: Least angle regression)
lasso.fit <- lars(as.matrix(x), y, type="lasso", trace=TRUE)
#png(file="./PracticalMachLearning/selection-plots-04.png", width=432, height=432, pointsize=8)
plot(lasso.fit, breaks=FALSE)
legend("topleft", covnames, pch=8, lty=1:length(covnames), col=1:length(covnames))
#dev.off()
# this plots the cross validation curve
#png(file="./PracticalMachLearning/selection-plots-05.png", width=432, height=432, pointsize=12)
lasso.cv <- cv.lars(as.matrix(x), y, K=10, type="lasso", trace=TRUE)
#dev.off()
## Prostate data
library(glmnet) #fit a GLM with lasso or elasticnet regularization
data(prostate)
prostate <- prostate[,-ncol(prostate)]
X <- model.matrix(lpsa~0+.,prostate)
y <- prostate$lpsa
fit <- glmnet(X,y)
plot(fit)
cvfit <- cv.glmnet(X,y) #Cross-validation for glmnet, default is 10 folds
plot(cvfit)
coef(fit,s=cvfit$lambda.min)
lam <- c(0,exp(seq(log(0.01),log(1e8),len=201)))
fit <- lm.ridge(lpsa~.,prostate,lambda=lam)
l <- apply(coef(fit)[,-1]^2,1,sum)
s <- l/max(l)
matplot(c(s,0),rbind(coef(fit)[,-1],0),type="l",lwd=2,lty=1,col="slateblue",xlab="",ylab="Coefficients",xlim=c(0,1.15),ylim=c(-0.1,0.8),main="Ridge")
text(1.1,coef(fit)[1,-1],labels=colnames(X),cex=0.8)
mtext(expression(sum(beta[j]^2)/max(sum(beta[j]^2))),1,line=3)
cvfit <- cv.glmnet(X,y)
fit <- glmnet(X,y,lambda.min=0,nlambda=501) #note: current glmnet versions name this argument lambda.min.ratio
l <- apply(abs(coef(fit)[-1,]),2,sum)
s <- l/max(l)
matplot(s,t(as.matrix(coef(fit)[-1,])),type="l",lwd=2,lty=1,col="slateblue",xlab="",ylab="Coefficients",xlim=c(0,1.15),ylim=c(-0.1,0.8),main="Lasso")
text(1.1,coef(fit)[-1,length(fit$lambda)],labels=colnames(X),cex=0.8)
mtext(expression(sum(abs(beta[j]))/max(sum(abs(beta[j])))),1,line=3)
## Selection of lambda
b <- coef(fit,cvfit$lambda.min)[-1]
abline(v=sum(abs(b))/max(l),col="gray80")
RSS <- apply(predict(fit,newx=X)-y,2,crossprod)
n <- length(y)
AIC <- n*log(RSS) + 2*fit$df
BIC <- n*log(RSS) + log(n)*fit$df
GCV <- RSS/n/(1-fit$df/n)^2
abline(v=s[which.min(AIC)],col=rgb(1,.5,.5))
abline(v=s[which.min(BIC)],col=rgb(.5,1,.5))
#caret also offers method="relaxo" (the relaxed lasso)