setwd("D:\\study\\DataScience")
#1.Introduction
#Machine Learning is about prediction
#The first step of designing a machine-learning algorithm is probability/sampling
#Process: Question->input data->features->algorithm->parameters->evaluation
#Question: - General question: Can I automatically detect which emails are SPAM and which are not?
# - Concrete: Can I use quantitative characteristics of the emails to classify them as SPAM/HAM?
#input data: The SPAM example is in package kernlab
library(kernlab)
data(spam)
head(spam)
dim(spam)
#features: we can calculate the frequency of particular words.
plot(density(spam$your[spam$type=="nonspam"]),col="blue",main="",xlab="Frequency of 'your'")
lines(density(spam$your[spam$type=="spam"]),col="red")
#x-axis is the frequency with which 'your' appears in the email.
#y-axis is the density, the number of times that frequency appears amongst the emails.
#algorithm: find a cut-off constant C; if frequency of 'your' > C, then predict "spam"
#let's choose the cut-off as 0.5
abline(v=0.5,col="black")
#evaluation
prediction<-ifelse(spam$your>0.5,"spam","nonspam")
table(prediction,spam$type)/length(spam$type)
#accuracy = 0.459+0.292 = 0.751 --> later on, we'll see this is an optimistic (in-sample) estimate of performance
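#(my addition) the same accuracy computed directly, as a quick check:
sum(prediction==spam$type)/length(spam$type)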
#relative importance of steps: question > data > features > algorithms
#frequently, for the question you actually want to answer, the data won't be available, or the
#data will only be available in a tangential form.
#common mistakes in feature selection: automating feature selection in a way that doesn't allow you to understand how those features are actually being applied to make good predictions
#not paying attention to data-specific quirks
#throwing away information unnecessarily
#you can automate feature selection sometimes with care, e.g. semi-supervised learning or deep learning.
#in sample v.s. out of sample errors
library(kernlab);data(spam)
set.seed(333)
smallSpam<-spam[sample(dim(spam)[1],size=10),]
spamLabel<-(smallSpam$type=="spam")+1
plot(smallSpam$capitalAve,col=spamLabel,pch=19)
#capitalAve>2.7 = "spam", capitalAve<2.4 = "nonspam"; capitalAve between 2.40 and 2.45 = "spam"; capitalAve between 2.45 and 2.7 = "nonspam"
#prediction rule1
rule1<-function(x){
  prediction<-rep(NA,length(x))
  prediction[x>2.7]<-"spam"
  prediction[x<2.40]<-"nonspam"
  prediction[x>=2.40&x<=2.45]<-"spam"
  prediction[x>2.45&x<=2.70]<-"nonspam"
  return(prediction)
}
table(rule1(smallSpam$capitalAve),smallSpam$type)
#prediction rule2
rule2<-function(x){
  prediction<-rep(NA,length(x))
  prediction[x>2.8]<-"spam"
  prediction[x<=2.8]<-"nonspam"
  return(prediction)
}
table(rule2(smallSpam$capitalAve),smallSpam$type) #misclassifies one of the 10 observations
table(rule1(spam$capitalAve),spam$type)
table(rule2(spam$capitalAve),spam$type)
sum(rule1(spam$capitalAve)==spam$type)
sum(rule2(spam$capitalAve)==spam$type) #the simpler rule is more accurate on the whole sample
#the reason the simplified rule beats the more complicated rule is overfitting:
#data have two parts: signal and noise. The goal of a predictor is to find signal.
#you can always build a perfect in-sample predictor on any small dataset, but you will capture both signal and noise when you do that.
#prediction study design
#split data into: training set, testing set, validation set.
#in the training set, you pick up features and prediction function using cross-validation
#if there is no validation set, apply only the best model we have to the test set, 1 time.
#if a validation set exists, apply prediction models to the test set and refine, then apply only the best one, 1 time, to the validation set.
#In general, randomly sample training and test.
#if you have a large sample size: 60% training, 20% test, 20% validation.
#if you have a medium sample size: 60% training, 40% test
#small sample size: Do cross validation, report caveat of small sample size
#Your data sets must reflect structure of problem.
#for example, if predictions evolve with time, split training/test in time chunks (called backtesting in finance - to use chunks of data that consist of obs. over time)
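#A minimal base-R sketch (my own illustration, not from the lecture) of a random 60/20/20 split:
set.seed(1)
n<-nrow(spam)
shuffled<-sample(n) #permute the row indices
trainIdx<-shuffled[1:floor(0.6*n)]
testIdx<-shuffled[(floor(0.6*n)+1):floor(0.8*n)]
validIdx<-shuffled[(floor(0.8*n)+1):n]
c(train=length(trainIdx),test=length(testIdx),validation=length(validIdx))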
#type of errors (different name in machine learning)
#as proportions within each true class: TP+FN = 1 (among the diseased) and FP+TN = 1 (among the healthy)
#sensitivity (Pr(positive test|disease) = TP/(TP+FN)) #TP:True Positive(good) | FN:False Negative(bad)
#specificity (Pr(negative test|no disease)=TN/(FP+TN)) #TN:True Negative(good) | FP:False Positive(bad)
#positive predictive value (Pr(disease|positive test)=TP/(TP+FP))
#Negative predictive value (Pr(no disease|negative test)=TN/(FN+TN))
#Accuracy((TP+TN)/(TP+FP+FN+TN))
#The above four can be easily recognized once you draw the 2x2 table.
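#A small helper (my addition) that computes the measures above from the 2x2 counts:
classMetrics<-function(TP,FP,FN,TN){
  c(sensitivity=TP/(TP+FN),specificity=TN/(FP+TN),
    PPV=TP/(TP+FP),NPV=TN/(FN+TN),
    accuracy=(TP+TN)/(TP+FP+FN+TN))
}
classMetrics(TP=99,FP=999,FN=1,TN=98901) #the rare-disease counts worked through below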
#In relation to Bayesian statistics, the sensitivity and specificity are the conditional
#probabilities, the prevalence is the prior, and the positive/negative predicted values are
#the posterior probabilities.
####If we predict a rare event, you have to be aware of how rare that event is.
#when the event is very rare, the positive predictive value will be small even when the screening mechanism is relatively good:
#a large fraction of the positives you call will be false positives.
#This goes back to the idea of knowing what population you're sampling from when you're building a predictive model.
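#Worked example (my own numbers, for illustration): a test with 99% sensitivity and 99% specificity
#for a disease with 0.1% prevalence. Out of 100,000 people: 100 diseased -> TP=99, FN=1;
#99,900 healthy -> FP=999, TN=98,901. PPV = 99/(99+999) ~ 9%: most called positives are false.
prev<-0.001; sens<-0.99; spec<-0.99
sens*prev/(sens*prev+(1-spec)*(1-prev)) #PPV via Bayes' rule, ~0.09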
#for cts data, we use MSE and RMSE(root MSE)
#Common error measures:
#1. MSE/RMSE: cts data, sensitive to outliers
#2. Median absolute deviation: cts data, often more robust (see the quick illustration after this list)
#Sensitivity and Specificity are commonly used when you care about one type of error more than the other type of error.
#3. Sensitivity (recall): if you want few missed positives
#4. Specificity: if you want few negatives called positives
#5. Accuracy: weights false positives/negatives equally
#6. Concordance: One example is Kappa.
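#Quick illustration (my addition) of the robustness claim in point 2 above:
set.seed(2)
err<-c(rnorm(99),50) #99 ordinary errors plus one wild outlier
sqrt(mean(err^2))    #RMSE: badly inflated by the single outlier
median(abs(err))     #median absolute error: barely affected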
#An ROC curve is the most commonly used way to visualize the performance of a binary classifier,
#and AUC(area under the curve) is (arguably) the best way to summarize its performance in a single number.
#ROC curves (receiver operating characteristic curves) -- assess the quality or goodness of a prediction algorithm
#In binary classification, you're predicting one of two categories. The cutoff point you choose gives you different results.
#on x-axis, P(False Positive Rate) or 1-specificity(FP/(FP+TN)=1-TN/(FP+TN))
#y-axis,P(True Positive Rate) or sensitivity,
#every single point along the curve corresponds to exactly one cutoff
#so for a particular cutoff, you get a certain probability of a false positive, i.e. a certain 1-specificity(FP/(FP+TN)=1-TN/(FP+TN))
#from the ROC curve, we can see that sensitivity and specificity are often inversely related.
#The area under ROC curve(AUC) quantifies the goodness of an algorithm.
#AUC = 0.5: random guessing. AUC = 1: perfect classifier(sensitivity and specificity are both 1). AUC>=0.8: good classifier.
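#A hand-rolled ROC sketch (my addition; base R only) for the frequency-of-'your' classifier above:
#each cutoff C gives exactly one (FPR,TPR) point on the curve.
cutoffs<-seq(0,max(spam$your),length.out=100)
tpr<-sapply(cutoffs,function(C) mean(spam$your[spam$type=="spam"]>C))    #sensitivity
fpr<-sapply(cutoffs,function(C) mean(spam$your[spam$type=="nonspam"]>C)) #1-specificity
plot(fpr,tpr,type="l",xlab="1-Specificity",ylab="Sensitivity")
abline(0,1,lty=2) #diagonal = AUC 0.5, random guessing
ord<-order(fpr)
sum(diff(fpr[ord])*(tpr[ord][-1]+tpr[ord][-length(tpr)])/2) #rough AUC via the trapezoid rule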
#Cross Validation - used for detecting relevant features, building models, and estimating parameters.
#we use cross validation to estimate the "test set accuracy" with the training set.
#Approach:
#use the training set, split it into training/test sets(subsampling), build a model on the training set, evaluate on test set.
#repeat above and average the estimated errors.
#Used for:
#Picking variables to include in a model, #picking the type of prediction function to use
#picking the parameters in the prediction function, #comparing different predictors
#random subsampling cross validation
#k-fold cross validation: for example, 3-fold cross validation. (see video 1-8 4:00)
#larger k = less bias, more variance
#smaller k = more bias, less variance
#leave-one-out cross validation: just leave one sample out every time.
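#A minimal sketch (my own) of the split-evaluate-average mechanics of k-fold CV, re-using
#rule2 from above (rule2 has no parameters to fit, so no model is re-trained per fold):
set.seed(333)
k<-10
fold<-sample(rep(1:k,length.out=nrow(spam))) #assign each row to one of k folds
cvAcc<-sapply(1:k,function(i){
  heldOut<-spam[fold==i,]
  mean(rule2(heldOut$capitalAve)==heldOut$type)
})
mean(cvAcc) #cross-validated accuracy estimate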
#Considerations:
#for time series data, data must be used in "chunks" -- you have to get blocks of time that are all
#contiguous, because each time point may depend on all the time points that came before. You would
#ignore a huge, rich structure in the data if you just randomly took samples.
#random sampling must be done without replacement, even when we break our training set up into smaller samples
#random sampling with replacement is called the bootstrap (inference class) -- it underestimates the error (this can be corrected by the 0.632 bootstrap, but that is complicated)
#(http://stats.stackexchange.com/questions/96739/what-is-the-632-rule-in-bootstrapping)
#2.Caret Functionality - Preprocessing(preProcess)
#data splitting(createDataPartition, createResample, createTimeSlices),
#training/testing functions(train, predict)
#model comparison(confusionMatrix)
#machine learning algorithms in R: #Linear Discriminant analysis #Regression #Naive Bayes #SVM: support vector machine
#Classification and Regression Trees #Random Forests #Boosting
#predict function syntax:
#lda(MASS):Linear Discriminant Analysis. predict(obj) #no option needed
#glm(stats): Generalized linear models. predict(obj,type="response")
#gbm(gbm): Generalized Boosted Regression Models. predict(obj,type="response",n.trees)
#mda(mda): Mixture and flexible discriminant analysis. predict(obj,type="posterior")
#rpart(rpart): Recursive Partitioning and Regression Trees. predict(obj,type="prob")
#Weka(RWeka): Waikato Environment for Knowledge Analysis. predict(obj, type="probability")
#LogitBoost(caTools): logitboost classification algorithm. predict(obj,type="raw",nIter)
#SPAM Example: Data Splitting
library(caret);library(kernlab);data(spam)
#A series of test/training partitions are created using createDataPartition
#while createResample creates one or more bootstrap samples(simple random sampling is used)
#createFolds splits the data into k groups
#while createTimeSlices creates cross-validation sample information to be used with time series data.
inTrain<-createDataPartition(y=spam$type,p=0.75,list=FALSE)
#the random sampling is done within the levels of y when y is a factor
#For numeric y, the sample is split into groups based on percentiles and sampling is done within these subgroups.
training<-spam[inTrain,]
testing<-spam[-inTrain,]
dim(training)
dim(testing)
#fit a model
install.packages("e1071")
library(e1071)
#e1071: Misc Functions of the Department of Statistics
set.seed(32343)
modelFit<-train(type~.,data=training,method="glm")
#train(formula, data, ..., weights, subset, na.action, contrasts = NULL)
#result of train function:
#method, modelType, results(a data frame with the training error rate and values of the tuning parameters.),
#bestTune(a data frame with the final parameters), call(the (matched) function call with dots expanded)
#metric(a string that specifies what summary metric will be used to select the optimal model.), control(the list of control parameters.)
#preProcess,finalModel,trainingData,resample,perfNames,maximize(a logical recycled from the function arguments.),
#yLimits,times
modelFit
str(modelFit)
summary(modelFit)
names(modelFit)
modelFit$finalModel
modelFit$metric
modelFit$perfNames
modelFit$bestTune
modelFit$call
modelFit$modelType
modelFit$results
modelFit$maximize
modelFit$yLimits
modelFit$times
#predict
predictions<-predict(modelFit,newdata=testing)
predictions
#table(predictions,testing$type)
confusionMatrix(predictions,testing$type) #an extended version of table
#another regression example
library(mlbench)
data(BostonHousing)
lmFit <- train(medv ~ . + rm:lstat, data = BostonHousing, method = "lm")
library(rpart)
rpartFit <- train(medv ~ .,data = BostonHousing,method = "rpart",tuneLength = 9)
###Data Slicing:
#normal data splitting:
inTrain<-createDataPartition(y=spam$type,p=0.75,list=FALSE)
training<-spam[inTrain,]
testing<-spam[-inTrain,]
#k-fold splitting:
set.seed(32323)
trainFolds<-createFolds(y=spam$type,k=10,list=TRUE, returnTrain=TRUE)
length(spam$type)
#createFolds(y, k = 10, list = TRUE, returnTrain = FALSE)
#list: logical - should the results be in a list (TRUE) or a matrix with the number of rows equal to floor(p * length(y)) and times columns
#returnTrain: a logical. When true, return training set. when false, return test set. This argument only works in conjunction with list = TRUE
sapply(trainFolds,length) #4140-4142 (around 90% of the length of spam$type)
length(spam$type) #4601
trainFolds[[1]][1:10]
trainFolds[[2]][1:10]
trainFolds[[3]][1:10]
trainFolds[[4]][1:10]
trainFolds[[9]][1:10]
trainFolds[[10]][1:10]
set.seed(32323)
testFold<-createFolds(y=spam$type,k=10,list=TRUE, returnTrain=FALSE)
testFold[[1]][1:10]
testFold[[2]][1:10]
testFold[[3]][1:10]
testFold[[4]][1:10]
testFold[[9]][1:10]
testFold[[10]][1:10]
spam$type[-testFold[[10]]]==spam$type[trainFolds[[10]]] #All True's
#Resampling -- Bootstrapping
set.seed(32323)
bootFolds<-createResample(y=spam$type,times=10,list=TRUE) #times: the number of partitions to create
sapply(bootFolds,length)
bootFolds[[1]][1:10]
bootFolds[[2]][1:10]
bootFolds[[3]][1:10]
bootFolds[[4]][1:10]
bootFolds[[9]][1:10]
bootFolds[[10]][1:10]
#time slices (time series):
#createTimeSlices(y, initialWindow, horizon = 1, fixedWindow = TRUE)
#y: For createTimeSlices, these should be in chronological order.
#initialWindow: The initial number of consecutive values in each training set sample
#horizon: The number of consecutive values in test set sample
#fixedWindow: A logical: if FALSE, the training set always starts at the first sample.
set.seed(32323)
tme<-1:1000
timeFolds<-createTimeSlices(y=tme,initialWindow=20,horizon=10) #I want to predict the next 10 samples out after I take the initial window of 20.
names(timeFolds)
timeFolds$train[[1]]
timeFolds$test[[1]]
#training control options
library(caret);library(kernlab);data(spam)
set.seed(32343)
inTrain<-createDataPartition(y=spam$type,p=0.75,list=FALSE)
training<-spam[inTrain,]
testing<-spam[-inTrain,]
modelFit<-train(type~.,data=training,method="glm")
names(modelFit)
modelFit$finalModel
#train.options
args(train.default)
#function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
# metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric == "RMSE", FALSE, TRUE),
# trControl = trainControl(), tuneGrid = NULL, tuneLength = 3)
#we can use "weights" to upweight or downweight certain observations
#in metric: for cts outcomes, we can use RMSE or R^2 (a measure of linear agreement; not that useful for non-linear models like random forests)
#for categorical outcomes, we can use Accuracy or Kappa((Pr(a)-Pr(e))/(1-Pr(e)))
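#Hedged sketch (my addition) of Kappa computed by hand from the modelFit/testing objects above:
predTest<-predict(modelFit,newdata=testing)
tab<-table(predTest,testing$type)
pA<-sum(diag(tab))/sum(tab)                   #Pr(a): observed agreement (accuracy)
pE<-sum(rowSums(tab)*colSums(tab))/sum(tab)^2 #Pr(e): agreement expected by chance
(pA-pE)/(1-pE)                                #Kappa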
#trainControl:
#trainControl(method = "boot", number = ifelse(grepl("cv", method), 10, 25),
# repeats = ifelse(grepl("cv", method), 1, number), p = 0.75, initialWindow = NULL, horizon = 1,
# fixedWindow = TRUE, verboseIter = FALSE, returnData = TRUE, returnResamp = "final",
# savePredictions = FALSE, classProbs = FALSE, summaryFunction = defaultSummary,selectionFunction = "best",
# preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5), index = NULL, indexOut = NULL,
# timingSamps = 0, predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5, alpha = 0.05,
# method = "gls", complete = TRUE), allowParallel = TRUE)
#method: bootstrapping or cross validation.
#boot = bootstrapping, boot632 = bootstrapping(reduce bias) that adjusts for the fact that multiple samples are repeatedly resampled when you're doing sub-sampling
#cv = cross-validation, repeatedcv = repeated cross-validation, LOOCV = leave one out cross validation
#number: number of times to do bootstrapping or cross validation; number of subsamples to take
#repeats: how many times to repeat that subsampling process if you want to be careful about.
#p: size of the training set.
#other parameters depend on the specific problems you're working on. for example, for time course data, initialWindow
#tells you the size of time points in the training data. horizon will be the # of time points that you'll be predicting.
#savePredictions: return the actual predictions from each of the iterations when it's building the model
#summaryFunction: you can return a different summary
#preProcessing, predictionBounds, parallelizing.
#seeds: it's often used to set an overall seed; if you do parallel computation, you can also set a seed for each resampling
#example:
set.seed(1235) #it will generate a set of random numbers that is consistent.
modelFit3<-train(type~.,data=training,method="glm") #if I fit model like this, then when it generates bootstrap samples,
#it will generate those bootstrap samples according to the random numbers that come from the seed 1235
modelFit3
#another example:
set.seed(123)
seeds <- vector(mode = "list", length = 51)
for(i in 1:50) seeds[[i]] <- sample.int(1000, 22)
seeds[[51]] <- sample.int(1000, 1)
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     seeds = seeds)
set.seed(1)
mod <- train(Species ~ ., data = iris,
             method = "knn",
             tuneLength = 12,
             trControl = ctrl)
ctrl2 <- trainControl(method = "adaptive_cv",
                      repeats = 5,
                      verboseIter = TRUE,
                      seeds = seeds)
set.seed(1)
mod2 <- train(Species ~ ., data = iris,
              method = "knn",
              tuneLength = 12,
              trControl = ctrl2)
###plotting predictors - to understand how the data actually look and how the data interact with each other
#things you should be looking for:
#-Imbalance in outcomes/predictors: if a predictor tends to take one value in one outcome group
#and a different value in the other outcome group, that's a good predictor.
#but if you only have three of one outcome and 150 of the other outcome, it's going to be
#very hard to build an accurate classifier between those two classes.
#-Outliers: they might suggest that there are some variables you're missing.
#-Groups of points not explained by any predictor
#-Skewed variables: you may want to transform them to look more nicely normally distributed
#if you're using things like regression models; this may matter less if you're using other
#machine learning methods.
#let's use Wage data in ISLR(introduction to statistical learning) package.
library(ISLR); library(ggplot2); library(caret)
data(Wage)
summary(Wage)
set.seed(12345)
inTrain<-createDataPartition(y=Wage$wage,p=0.7,list=FALSE)
training<-Wage[inTrain,]
testing<-Wage[-inTrain,]
dim(training);dim(testing)
#feature plot
#use caret
library(caret)
featurePlot(x=training[,c("age","education","jobclass")],y=training$wage,plot="pairs")
#outcome(topright) is the wage. #we can see a trend between education and wage
#use the ggplot2 plotting system
qplot(age,wage,data=training)
qplot(age,wage,data=training,color=jobclass)
#so we can see that most of the points in the upper chunk come from the information-based jobs.
#this may help you detect which variable is important in your model.
#add regression smoothers(ggplot2)
qq<-qplot(age,wage,color=education,data=training)
qq+geom_smooth(method="lm",formula=y~x)
#you can see for every different education class, it fits a linear model.
#break wage variable into different categories (Hmisc package)
library(Hmisc) #Harrell Miscellaneous #see Data_Cleaning-3-plyr&dplyr&reshape.R
cutWage<-cut2(training$wage,g=3)
table(cutWage)
#boxplot with cut2
p1<-qplot(cutWage,age,data=training,fill=cutWage,geom=c("boxplot"))
p1
p2<-qplot(cutWage,age,data=training,fill=cutWage,geom=c("boxplot","jitter"))
p2
if(!require("gridExtra")){install.packages("gridExtra")}
library("gridExtra")
grid.arrange(p1,p2,ncol=2)
#we can see a large number of dots in each of the boxes, which suggests any trend may actually
#be real. If you observe just a few dots in one box, that box may not be very representative of
#what the data actually look like.
#Table
t1<-table(cutWage,training$jobclass)
t1
prop.table(t1,1) #proportions along the first dimension -- rows (each row sums to 1)
#Density plots - If you want to see how all the distributions change by group, density plot will be useful
qplot(wage,color=education,data=training,geom="density",lwd=1)
#we can see that <HS Grad is concentrated at lower wages
#college degree and advanced degree each have a small bump on the right-hand side.
#Preprocessing Predictor Variables:
#you need to plot the variables upfront so you can see if there is any sort of weird behavior in those variables
#sometimes, predictors will look very strange or the distribution will be very strange and you might need to
#transform them in order to make them more useful for prediction algorithms
#(particularly model-based algorithms like linear discriminant analysis, naive Bayes, and linear regression)
library(caret); library(kernlab); data(spam)
inTrain<-createDataPartition(y=spam$type,p=0.75,list=FALSE)
training<-spam[inTrain,] #with list=FALSE we get a matrix of indices; with list=TRUE we would subset with inTrain[[1]] instead.
testing<-spam[-inTrain,]
hist(training$capitalAve,main="",xlab="ave. capital run length")
#we can see almost all of the run lengths are very small, but there are a few that are much, much larger.
mean(training$capitalAve)
sd(training$capitalAve)
trainCapAve<-training$capitalAve
trainCapAveS<-(trainCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(trainCapAveS)
sd(trainCapAveS)
#we must adjust the test set with parameters of the training set.
testCapAve<-testing$capitalAve
testCapAveS<-(testCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(testCapAveS)
sd(testCapAveS)
#we could also use preProcess function
names(training)[58] #the 58th column is the actual outcome
preObj<-preProcess(training[,-58],method=c("center","scale")) #record center and scale of every variable in the training set, and defined preObj as preProcess class.
trainCapAves<-predict(preObj,training[,-58])$capitalAve
## Default S3 method:
#preProcess(x, method = c("center", "scale"), thresh = 0.95, pcaComp = NULL,na.remove = TRUE,k = 5,
#knnSummary = mean, outcome = NULL,fudge = .2, numUnique = 3,verbose = FALSE,...)
## S3 method for class 'preProcess'
#predict(object, newdata, ...)
testCapAves<-predict(preObj,testing[,-58])$capitalAve #apply to the test set
mean(testCapAves)
sd(testCapAves)
#we can also pass the preProcess object directly to the train function as an argument
set.seed(32343)
modelFit<-train(type~.,data=training,preProcess=c("center","scale"),method="glm")
modelFit
#Other kinds of transformation -- Box-Cox Transformation
preObj<-preProcess(training[,-58],method=c("BoxCox")) #Box-Cox takes cts data and try to make them look like normal data, by estimating a specific data set of parameters using MLE.
trainCapAves<-predict(preObj,training[,-58])$capitalAve
par(mfrow=c(1,2));
hist(trainCapAves) #you could see the data are more like a normal distribution, but it doesn't take
#care of all of the problems. There's still a stack set of values at zero.
qqnorm(trainCapAves) #normal qq plot #you could see data are chunked down in the bottom.
sum(trainCapAves==0) #245 values are equal to 0; Box-Cox, being a cts transform, doesn't take care of repeated values.
#so it doesn't take care of a lot of the problems that would occur using a variable that's highly skewed.
#we could impute data for these data sets
#We also do this for missing data, prediction algorithms are built not to handle missing data in most cases.
#make some values NA
set.seed(13343)
training$capAve<-training$capitalAve
selectNA<-rbinom(dim(training)[1],size=1,prob=0.05)==1
training$capAve[selectNA]<-NA
#impute and standardize
preObj<-preProcess(training[,-58],method="knnImpute") #will also automatically standardize.
#install.packages("RANN") #required for knnImpute
capAve<-predict(preObj,training[,-58])$capAve
#standardize true values
capAveTruth<-training$capitalAve
capAveTruth<-(capAveTruth-mean(capAveTruth))/sd(capAveTruth)
quantile(capAve-capAveTruth) #most values are close to zero, so the imputation works well
quantile((capAve-capAveTruth)[selectNA])
quantile((capAve-capAveTruth)[!selectNA]) #we can see these values that are not imputed are closer to capAveTruth.
#All the transformations discussed are for cts data. Be careful when transforming factor variables; it's more difficult to know what the right transformation is.
#most machine learning algorithms are designed to deal with binary predictors (not preprocessed)
#and cts data preprocessed to look more normal.
###Covariate Creation
#Covariates are sometimes called predictors and sometimes called features
#They're the variables that you will actually include in your model that you're going to be using
#to combine them to predict whatever outcome that you care about.
#Two levels/steps of covariate creation:
#1. from raw data(text file, image, website) to covariate
#no example
#the more knowledge of the system you have the better the job you will do.
#when in doubt, err on the side of more features
#use Google "feature extraction for [data type]" to search for features
#2. transforming tidy covariates to more useful covariates
#should be done only on the training set
#more necessary for some methods(regression, svms) than for others(classification trees)
#the best approach is through EDA(plotting/tables)
#New covariates should be added to data frame
#be careful about overfitting!
#e.g.:
library(kernlab); data(spam)
spam$capitalAveSq<-spam$capitalAve^2
#a full e.g.
library(ISLR);library(caret);data(Wage)
inTrain<-createDataPartition(y=Wage$wage,p=0.7,list=FALSE)
training<-Wage[inTrain,]
testing<-Wage[-inTrain,]
#common covariates to add: turn qualitative or factor covariates into dummy variables(quantitative)
table(training$jobclass)
dummies<-dummyVars(wage~jobclass,data=training) #outcome~predictor variable
head(predict(dummies,newdata=training))
#removing zero-variability covariates
#nzv: nearZeroVar diagnoses predictors that have one unique value (i.e. are zero variance predictors)
#or predictors that have both of the following characteristics: they have very few unique
#values relative to the number of samples and the ratio of the frequency of the most common value
#to the frequency of the second most common value is large.
#for example, suppose we have a feature that says, for each email: does it have any letters in it at all?
#almost every single email will have at least one letter in it, so that variable will always
#be equal to true. So the feature has no variability and it's probably not going to be useful.
nsv<-nearZeroVar(training,saveMetrics=TRUE) #near-zero variability
#freqRatio: the ratio of frequencies for the most common value over the second most common value
#percentUnique: the percentage of unique data points out of the total number of data points
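#Sketch (my addition) of those two metrics computed by hand for one covariate, e.g. jobclass:
tb<-sort(table(training$jobclass),decreasing=TRUE)
as.numeric(tb[1]/tb[2])                              #freqRatio
100*length(unique(training$jobclass))/nrow(training) #percentUnique
nsv #compare with the nearZeroVar output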
#Spline basis - splines package
library(splines)
bsBasis<-bs(training$age,df=3) #the B-spline function creates a degree-3 polynomial basis.
bsBasis #three basis columns, corresponding roughly to age, age squared, and age cubed terms
#fitting curves with splines
par(mfrow=c(1,1))
plot(training$age,training$wage,pch=19,cex=0.5)
lm1<-lm(wage~bsBasis,data=training)
predictValue<-predict(lm1,newdata=training)
points(training$age,predictValue,col="red",pch=19,cex=0.5)
#splines on the test set
#Important in Machine Learning:
#on the test set, you have to predict the same variable -- you have to create the variable on
#the test set using the exact same procedure that you used on the training set,
#instead of directly applying the bs function to the age variable in the test set, which would create
#a new set of variables on the test set that isn't related to the variables you created on the
#training set and may introduce some bias.
testBasis<-predict(bsBasis,newx=testing$age)
#identical(bsBasis, predict(bsBasis, testing$age))
getAnywhere(predict.bs)
#if you want to fit spline models, use the "gam"(generalized additive model using Splines) method in caret, which allows smoothing of multiple variables.
#caret uses gam from package mgcv
#e.g.:
library(mgcv)
dat <- gamSim(1,n=400,dist="normal",scale=2) #Function used to simulate data sets to illustrate the use of gam and gamm.
test<-train(y~x1, data=dat, method='gam') #note: supply the data as a data frame like this.
test$finalModel$fitted.values
#Using method = "gam" gets you gam() from the mgcv package and using "gamLoess" and "gamSpline" use gam() from the gam package.
#Preprocessing with PCA
inTrain<-createDataPartition(y=spam$type,p=0.75,list=FALSE)
training<-spam[inTrain,]
testing<-spam[-inTrain,]
M<-abs(cor(training[,-58]))
diag(M)<-0
index<-which(M>0.8,arr.ind=T) #could be used in pairs trading
#which (x, arr.ind = FALSE, useNames = TRUE)
#logical; should array indices be returned when x is an array
index
M[32,31]
M[34,31]
paste(rownames(M)[index[,1]], colnames(M)[index[,2]], sep=", ")
names(spam)[c(34,32)]
plot(spam[,34],spam[,32])
#rotate the plot
x<-0.71*training$num415+0.71*training$num857 #0.71 ~ 1/sqrt(2), i.e. a 45-degree rotation
y<-0.71*training$num415-0.71*training$num857
plot(x,y) #we can see that most variability is happening on the x-axis, but most points are
#clustered at 0 on the y-axis (almost all of these points have y=0)
#prcomp
smallSpam<-spam[,c(34,32)]
prComp<-prcomp(smallSpam)
plot(prComp$x[,1],prComp$x[,2]) #similar to the above plot
#PCA on Spam data
typeColor<-((spam$type=="spam")+1) #black ones are ham, red ones are spam
prComp<-prcomp(log10(spam[,-58]+1)) #use log10 to make data look more Gaussian because some variable are normal looking, some others are skewed.
plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab='PC1',ylab='PC2')
#we can see that along PC1, there is separation of the ham messages from the spam messages.
#do the principal component in preProcess
preProc<-preProcess(log10(spam[,-58]+1),method="pca",pcaComp=2) #pcaComp: # of PCs to compute
#The preProcess function normalizes the data and then performs PCA
#this preProcess call is equivalent to prcomp(scale(log10(spam[,-58]+1))) or prcomp(log10(spam[,-58]+1),scale. = TRUE).
#prcomp(x, retx = TRUE, center = TRUE, scale. = FALSE, tol = NULL, ...)
spamPC<-predict(preProc,log10(spam[,-58]+1)) #calculate the values of the new principal components
head(spamPC)
str(spamPC)
plot(spamPC[,1],spamPC[,2],col=typeColor)
#####full steps:
preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
trainPC <- predict(preProc,log10(training[,-58]+1)) #predict on training data
modelFit <- train(training$type ~ .,method="glm",data=trainPC) #note: the data argument is now trainPC
modelFit
testPC<-predict(preProc,log10(testing[,-58]+1))
fitValue<-predict(modelFit,testPC) #we also need to use testPC instead of testing data
confusionMatrix(testing$type,fitValue)
#integrate the PCs into train -- here you don't select the # of PCs yourself
#By default, caret keeps the components that explain 95% of the variance.
modelFit<-train(training$type~.,method="glm",preProcess="pca",data=training)
confusionMatrix(testing$type,predict(modelFit,testing))
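#To pick pcaComp yourself (my addition), inspect the cumulative variance of the earlier prcomp fit:
varExp<-cumsum(prComp$sdev^2)/sum(prComp$sdev^2)
which(varExp>=0.95)[1] #number of PCs needed to reach caret's default 95% threshold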
#watch out for outliers:
#transform first (with logs/Box Cox)
#plot predictors to identify problems
#PCs are more useful for linear-type models
#can make it harder to interpret predictors
###Predicting with Regression
#often poor performance in nonlinear settings
library(caret);data(faithful);
head(faithful)
set.seed(333)
inTrain<-createDataPartition(y=faithful$waiting,p=0.75,list=FALSE)
trainFaith<-faithful[inTrain,]
testFaith<-faithful[-inTrain,]
head(trainFaith)
#fit a linear model
lm1<-lm(eruptions~waiting,data=trainFaith)
summary(lm1)
names(lm1)
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(trainFaith$waiting,lm1$fitted.values,lwd=3)
?lines #lines(x, y = NULL, type = "l", ...)
#predict a new value
#method 1
coef(lm1)[1]+coef(lm1)[2]*6
#method 2
newdata<-data.frame(waiting=6)
predict(lm1,newdata) #newdata should be data frame
predict(lm1,newdata=testFaith)
#predFun<-function(x){predict(lm1,data.frame(waiting=x))}
#predFun(80)
#fitTest<-sapply(testFaith$waiting,predFun)
#as.numeric(fitTest)
#plot predictions
par(mfrow=c(1,2))
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(trainFaith$waiting,predict(lm1),lwd=3)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)
#get training set/test set errors
#root of the summed squared errors (the lecture calls this RMSE; see the note below)
sqrt(sum((lm1$fitted-trainFaith$eruptions)^2))
sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2))
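#Strictly, RMSE averages before taking the square root (my addition, for comparison):
sqrt(mean((lm1$fitted.values-trainFaith$eruptions)^2))
sqrt(mean((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2))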
#prediction intervals
par(mfrow=c(1,1))
pred1<-predict(lm1,newdata=testFaith,interval="prediction") #interval="prediction"
ord<-order(testFaith$waiting) #returns the indices that sort the vector (compare the unordered matlines below)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue")
matlines(testFaith$waiting[ord],pred1[ord,],type="l",col=c(1,2,2),lty=c(1,1,1),lwd=3)
#matlines: Plot the columns of one matrix against the columns of another.
args(matlines)
#function (x, y, type = "l", lty = 1:5, lwd = 1, pch = NULL, col = 1:6, ...)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue")
matlines(testFaith$waiting,pred1,type="l",col=c(1,2,2),lty=c(1,1,1),lwd=3)
#same process with caret
library(caret)
modFit<-train(eruptions~waiting,data=trainFaith,method="lm")
summary(modFit$finalModel)
##multiple regression
library(ISLR);library(ggplot2);library(caret);data(Wage)
names(Wage)
Wage<-subset(Wage,select=-c(logwage))
summary(Wage)
inTrain<-createDataPartition(y=Wage$wage,p=0.7,list=FALSE)
training<-Wage[inTrain,]
testing<-Wage[-inTrain,]
dim(training);dim(testing)
#first thing: feature plot:
featurePlot(x=training[,c("age","education","jobclass")],y=training$wage,plot="pairs")
?featurePlot #A shortcut to produce lattice graphs
pairs(training[,c("age","education","jobclass","wage")]) #the same plot apart from color and ordering
#there seem to be two separate groups in wage vs age, giving us some indication of what to analyze
qplot(age,wage,data=training)
#qplot(age,wage,data=training)+geom_smooth(method="lm",formula=y~x)
qplot(age,wage,data=training,color=jobclass)
qplot(age,wage,data=training,color=education)
#we can see information jobs and advanced degrees account for most of the high-wage people.
?qplot
#fit a linear model
modFit<-train(wage~age+jobclass+education,method="lm",data=training)
finMod<-modFit$finalModel
print(finMod)
par(mfrow=c(2,2))
plot(finMod)
par(mfrow=c(1,1))
plot(finMod,1,pch=19,cex=0.5,col="#00000010") #plot(model, which): which picks the diagnostic plot; 1 = residuals vs fitted
#we could see some outliers above, labeled with numbers. those variables might be variables
#that we want to explore a little bit further to see if we can identify any other predictors
#color by variables not used in the model
qplot(finMod$fitted,finMod$residuals,color=race,data=training)
#we can see that some of the outliers above may be explained by the race variable: white and black
#plot by index
plot(finMod$residuals,pch=19) #index(row number) v.s. residuals
#we can see that high residuals seem to be happening at the highest row numbers.
#we can also see a trend with respect to row numbers.
#whenever you see a trend or an outlier w.r.t. row numbers, it suggests there is a variable missing from your model.
#predicted v.s. truth in test set
pred<-predict(modFit,testing)
qplot(wage,pred,color=year,data=testing)
#if you want to use all covariates
modFitAll<-train(wage~.,data=training,method="lm")
pred<-predict(modFitAll,testing)
qplot(wage,pred,data=testing)