setwd("D:\\study\\DataScience")
#Before start
#1. EDA
#Numerical summaries - means, sds, five-number summaries, correlations.
#Graphical summaries
#- One variable - Boxplots, histograms etc.
#- Two variables - scatterplots.
#- Many variables - interactive graphics.
library(faraway)
data(pima)
pima
summary(pima)
sort(pima$diastolic)
#set all zero values of the five variables to NA, the missing-value code
pima$diastolic[pima$diastolic == 0] <- NA
pima$glucose[pima$glucose == 0] <- NA
pima$triceps[pima$triceps == 0] <- NA
pima$insulin[pima$insulin == 0] <- NA
pima$bmi[pima$bmi == 0] <- NA
pima$test <- factor(pima$test)
summary(pima$test)
levels(pima$test) <- c("negative","positive")
summary(pima)
hist(pima$diastolic)
plot(density(pima$diastolic,na.rm=TRUE))
plot(sort(pima$diastolic),pch=".")
plot(diabetes ~ diastolic,pima) #scatterplot
plot(diabetes ~ test,pima) #boxplots as test is a factor
hist(pima$diastolic)
pairs(pima)
#2. Regression Analysis
#When p=1, it is called simple regression; when p>1, it is called multiple regression
#Objective:
#- Prediction of future observations.
#- Assessment of the effect of, or relationship between, explanatory variables on the response.
#- A general description of data structure.
#e.g. Final and midterm scores in standard units
data(stat500)
stat500 <- data.frame(scale(stat500))
plot(final ~ midterm,stat500)
abline(0,1)
g <- lm(final ~ midterm,stat500)
abline(g$coef,lty=5)
cor(stat500)
#beta_hat = (sum[(xi-x_bar)y])/(sum(xi-x_bar)^2)
#beta_hat = (X'X)^(-1) * X'y;
#E(beta_hat)=(X'X)^(-1) * X'X * beta = beta;
#Var(beta_hat) = (X'X)^(-1) * X' * sigma^2 * I * X * (X'X)^(-1) = (X'X)^(-1) * sigma^2
#e.g.
data(gala)
head(gala)
dim(gala)
gfit <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, data=gala)
summary(gfit)
#(X'X)^(-1) is the unscaled covariance matrix
x <- cbind(1,gala[,-c(1,2)]) #the intercept should be included
y <- gala$Species
x <- as.matrix(x) #Data frames are allowed to contain character variables, which would disallow
#matrix arithmetic. We need to force x into the matrix form.
t(x) %*% x
xtxi <- solve(t(x) %*% x)
xtxi
#A somewhat more direct way to get (X'X)^(-1) is as follows:
gfit <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent,data=gala)
gs <- summary(gfit)
gs$cov.unscaled #same as xtxi
#scaled covariance:
summary(gfit)$cov.unscaled*summary(gfit)$sigma^2 #one way
vcov(gfit) #another way. vcov returns the variance-covariance matrix of the main parameters of a fitted model object.
names(gs)
names(gfit)
gfit$fit
gfit$res
#we could get beta_hat directly
xtxi %*% t(x) %*% y
#or in a computationally efficient and stable manner
solve(t(x) %*% x, t(x) %*% y)
#We can estimate sigma using the estimator in the text
sqrt(sum(gfit$res^2)/(30-6)) #gfit$res is the residual for every observation
#We may also obtain the standard errors for the coefficients
sqrt(diag(xtxi))*60.975 #60.975 is sigma_hat, the residual standard error from the summary
#Finally we may compute R^2
1-sum(gfit$res^2)/sum((y-mean(y))^2)
summary(gfit)
#3. Inference
#F statistic = [(RSS_omega - RSS_OMEGA) / (df_omega - df_OMEGA)] / [RSS_OMEGA/df_OMEGA]
#df_OMEGA = n - q; df_omega = n - p
#Test of all predictors
#F_{p-1,n-p} = [(SYY-RSS)/(p-1)]/[RSS/(n-p)]
data(savings)
savings
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, data=savings)
summary(g)
sum((savings$sr-mean(savings$sr))^2)
sum(g$res^2)
((983.63-650.71)/4)/(650.706/45)
1-pf(5.7558,4,45) # one-tail test #p-value
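#The same F and p-value can be read off a direct comparison with the intercept-only (null) model:
anova(lm(sr ~ 1, data=savings), g)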
#Testing just one predictor
#t statistic = beta_i_hat / se(beta_i_hat)
g2 <- lm(sr ~ pop75 + dpi + ddpi, data=savings)
sum(g2$res^2)
(797.72-650.71)/(650.71/45)
1-pf(10.167,1,45)
sqrt(10.167) #t^2 will give you the F-statistic
2*(1-pt(3.1886,45)) #similar p-value
anova(g2,g)
#Testing a pair of predictors - possible tests include: omit both, omit either one, test a subspace, test an offset
#(1)Testing a subspace
#H0 : beta_j = beta_k
g <- lm(sr ~ .,savings)
gr <- lm(sr ~ I(pop15+pop75)+dpi+ddpi,savings) #The function I() ensures that the argument is evaluated
#rather than interpreted as part of the model formula.
anova(gr,g)
#(2)test whether one of the coefficients can be set to a particular value
##H0 : beta_ddpi = 2
#Such a fixed term in the regression equation is called an offset.
gr <- lm(sr ~ pop15+pop75+dpi+offset(2*ddpi),savings)
#An offset is a term to be added to a linear predictor, such as in a generalised linear model, with
#known coefficient 1 rather than an estimated coefficient.
anova(gr,g)
#t statistic = (beta_hat - c)/se(beta_hat)
tstat <- (0.409695-2)/0.196197
tstat
2*pt(tstat,45) #similar to the pvalue for anova's F statistic
tstat^2 #similar to F statistic in ANOVA
#notice that we couldn't test H0: beta_j * beta_k = 1
#This hypothesis is not linear in the parameters so we can't use our general method
#Concerns about Hypothesis Testing - permutation test
#Since the permutation test does not depend on the assumption of normality, we might regard it as
#superior to the normal theory based value.
g <- lm(sr ~ pop75+dpi,data=savings)
summary(g)
gs <- summary(g)
gs$fstat
#one way to simulate and calculate
permutations <- sapply(1 : 10000, function(i) summary(lm(sample(sr) ~ pop75+dpi,data=savings))$fstat[1])
mean(permutations>gs$fstat[1])
#another way
fstats <- numeric(10000)
for(i in 1:10000){
ge <- lm(sample(sr) ~ pop75+dpi,data=savings)
fstats[i] <- summary(ge)$fstat[1]
}
length(fstats[fstats > gs$fstat[1]])/10000 #strict inequality: the observed statistic itself isn't counted
#(3) Confidence Intervals for beta
#As with testing, we must decide whether to form confidence regions for parameters individually or
#simultaneously. Simultaneous regions are preferable but for more than two dimensions they are difficult to
#display and so there is still some value in computing the one-dimensional confidence intervals.
#simultaneous regions:
#(beta_hat - beta)' X'X (beta_hat - beta) / sigma^2 ~ chisq_p and (n-p)*sigma_hat^2 / sigma^2 ~ chisq_(n-p)
#--> (beta_hat - beta)' X'X (beta_hat - beta) / (p*sigma_hat^2) ~ (chisq_p/p) / (chisq_(n-p)/(n-p)) = F_{p,n-p}
#--> 100(1-alpha)% region: (beta_hat - beta)' X'X (beta_hat - beta) <= p*sigma_hat^2 * F_{p,n-p}^(alpha)
#one-dimensional confidence intervals:
#beta_hat_i + c(-1,1) * t_{alpha/2, n-p} * sigma_hat * sqrt[(X'X)^(-1)_ii]
g <- lm(sr ~ ., savings)
summary(g)
dim(savings) #50 * 5
#individual 95% confidence intervals for the regression parameters of pop75:
qt(0.975,45) #df = n-p = 45 #result: 2.014103; because the t distribution is symmetric, we directly use 0.975
qt(0.025,45) #result: -2.014103 #previously we calculated pt, now we calculate qt.
c(-1.69-2.01*1.08,-1.69+2.01*1.08)
#similarly for growth(ddpi):
c(0.41-2.01*0.196,0.41+2.01*0.196) #the qt_{alpha/2, n-p} * sigma_hat * sqrt((X'X_ii)^(-1)) part is the same
#Notice that this confidence interval is pretty wide in the sense that the upper limit is about 50 times larger
#than the lower limit. This means that we are not really that confident about what the exact effect of growth
#on savings really is.
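#Base R's confint() computes these one-dimensional intervals directly (a quick cross-check):
confint(g) #individual 95% CIs for all coefficients
confint(g, "pop75") #matches the hand computation above up to rounding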
#drawing confidence ellipses
library(ellipse)
plot(ellipse(g,c(2,3)),type="l",xlim=c(-1,0))
#add the origin and the point of the estimates:
points(0,0)
points(g$coef[2],g$coef[3],pch=18)
abline(v=c(-0.461-2.01*0.145,-0.461+2.01*0.145),lty=2)
abline(h=c(-1.69-2.01*1.08,-1.69+2.01*1.08),lty=2)
#Why are these lines not tangential to the ellipse? The reason is that the confidence intervals are
#calculated individually. If we wanted a 95% chance that both intervals contain their true values, then the
#lines would be tangential to the ellipse (drawing that joint region is not straightforward). We prefer the joint test result.
#Examine the correlation of the two predictors:
cor(savings$pop15,savings$pop75) #-0.9084787
#But from the plot, we see that coefficients have a positive correlation.
#The correlation between predictors and the correlation between the coefficients of those predictors are
#often different in sign.
#Intuitively, this can be explained by realizing that two negatively correlated predictors are attempting
#to perform the same job. The more work one does, the less the other can do, hence the positive
#correlation in the coefficients.
#(4)Confidence intervals for predictions
#y_hat_0 = x_0' * beta_hat; var(y_hat_0) = x_0' * (X'X)^(-1) * x_0 * sigma^2
#prediction of a future value --> x_0' * beta_hat + epsilon
#--> y_hat_0 + c(-1,1) * t_{alpha/2, n-p} * sigma_hat * sqrt(1 + x_0' * (X'X)^(-1) * x_0)
#prediction of the mean response --> x_0' * beta_hat
#-->y_hat_0 + c(-1,1) * t_{alpha/2, n-p} * sigma_hat * sqrt(x_0' * (X'X)^(-1) * x_0)
g <- lm(Species ~ Area+Elevation+Nearest+Scruz+Adjacent,data=gala)
#Suppose we want to predict the number of species (of tortoise) on an island with predictors 0.08,93,6.0,12.0,0.34
x0 <- c(1,0.08,93,6.0,12.0,0.34)
y0 <- sum(x0*g$coef) #33.91967
qt(0.975,24) #2.0639
#(X'X)^(-1) matrix
x <- cbind(1,gala[,3:7])
x <- as.matrix(x)
xtxi <- solve(t(x) %*% x) #same as summary(g)$cov.unscaled
#the prediction interval for the single future response
bm <- sqrt(1+x0 %*% xtxi %*% x0) *2.064 * 60.98 #130.0891
c(y0-bm,y0+bm) #-96.16946 164.00879
#the bands for mean response CI:
bm <- sqrt(x0 %*% xtxi %*% x0) *2.064 * 60.98 #32.89005
c(y0-bm,y0+bm) # 1.029614 66.809721
#In the interval for a single future response, negative values can appear even though a count response
#cannot be negative. These impossible values can be avoided by transforming the response, say taking logs,
#or by using a probability model more appropriate to the response
#There is a more direct method for computing the CI:
#the prediction interval for the single future response
predict(g,data.frame(Area=0.08,Elevation=93,Nearest=6.0,Scruz=12, Adjacent=0.34),se=T,interval="prediction") #names are necessary
#the bands for mean response CI:
predict(g,data.frame(Area=0.08,Elevation=93,Nearest=6.0,Scruz=12, Adjacent=0.34),se=T,interval="confidence")
#The width of the mean response interval can then be calculated by multiplying the se for the fit by the
#appropriate t-critical value:
15.934*2.064 #32.888 --> bm
#(5)Orthogonality
#Suppose we can partition X in two, X = [X1|X2] such that X1'X2 = 0. So now
#y = X*beta + epsilon = X1*beta1 + X2*beta2 + epsilon and X'X = [X1'X1, 0; 0, X2'X2]
#then, beta1_hat = (X1'X1)^(-1) * (X1') * y; beta2_hat = (X2'X2)^(-1) * (X2') * y
#Now if we wish to test H0 : beta1 = 0, it should be noted that RSS_Omega/df = sigma_Omega^2 will
#be different depending on whether X2 is included in the model. #Orthogonality is a desirable property.
data(odor)
odor # temp = (Fahrenheit-80)/40
x <- as.matrix(cbind(1,odor[,-1]))
xtx <- t(x) %*% x #The matrix is diagonal.
#What would happen if temp was measured in the original Fahrenheit scale?
#The matrix would still be diagonal but the entry corresponding to temp would change.
g <- lm(odor ~ temp + gas + pack, data=odor)
summary(g,cor=T)
ga <- lm(odor ~ gas + pack, data=odor)
summary(ga)
###The coefficients themselves do not change, but the residual standard error does change slightly, which
#causes small changes in the standard errors of the coefficients, the t-statistics and the p-values, but
#nowhere near enough to change our qualitative conclusions.
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, savings)
summary(g,cor=T)
#Drop pop15 from the model:
gu <- update(g, . ~ . - pop15) #update will update and (by default) re-fit a model.
summary(gu)
#update(object, formula., ..., evaluate = TRUE)
#Pay particular attention to pop75. The effect has now become positive whereas it was negative.
#Granted, in neither case is it significant, but it is not uncommon in other datasets for such sign
#changes to occur and for them to be significant.
#(6)Identifiability
#where X is an n x p matrix. If X'X is singular and cannot be inverted, then there will be infinitely many
#solutions to the normal equations and beta_hat is at least partially unidentifiable.
#Unidentifiability will occur when X is not of full rank - when its columns are linearly dependent,
#e.g. exactly collinear predictors or a supersaturated design (more parameters than cases).
#Suppose we create a new variable for the savings dataset - the percentage of people between 15 and 75:
pa <- 100-savings$pop15-savings$pop75
g <- lm(sr ~ pa + pop15 + pop75 + dpi + ddpi, data=savings)
summary(g)
#pop75 is not defined because of singularities
x <- as.matrix(cbind(1,pa,savings[,-1]))
dimnames(x) <- list(row.names(savings),c("int","pa","p15","p75", "dpi","ddpi")) #dimnames() is a list
#If we didn't know which linear combination was causing the trouble, how would we find out? An eigen
#decomposition of X'X can help:
e <- eigen(t(x) %*% x)
signif(e$values,3) #3 means keep only 3 significant digits
#Only the last eigenvalue is zero, indicating one linear combination is the problem.
#So we look at the last column of the eigenvector
signif(e$vectors,3)
#From the last column of the matrix, we see that 100 - pa - p15 - p75 = 0 is the offending combination.
#Lack of identifiability is obviously a problem but it is usually easy to identify and work around. More
#problematic are cases where we are close to unidentifiability.
pae <- pa +0.001*(runif(50)-0.5)
ge <- lm(sr ~ pae+pop15+pop75+dpi+ddpi,savings)
summary(ge)
#Notice that now all parameters can be estimated, but the standard errors are very large because we cannot
#estimate them in a stable way.
#We hope that epsilon ~ N(0,sigma^2) but
#1. Errors may be heterogeneous (unequal variance).
#2. Errors may be correlated.
#3. Errors may not be normally distributed.
#The last defect is less serious than the first two because even if the errors are not normal, the beta_hat
#will tend to normality due to the power of the central limit theorem.
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, data=savings)
g2 <- lm(sr ~ pop75 + dpi + ddpi, data=savings)
g3 <- lm(sr ~ pop75 + ddpi, data=savings)
g4 <- lm(sr ~ pop75, data=savings)
#Prediction is more stable than parameter estimation.
x0 <- data.frame(pop15=32,pop75=3,dpi=700,ddpi=3)
predict(g,x0)
predict(g2,x0)
predict(g3,x0)
predict(g4,x0)
#Interpretation of beta: suppose we observe a case with x1 + 1 and the same values for the other
#predictors; then the predicted response is increased by beta_1.
#Notice that I have been careful not to say that we have taken a specific individual and increased
#their x1 by 1, rather we have observed a new individual with predictor x1 + 1. To put it another way,
#people with yellow fingers tend to be smokers but making someone's fingers yellow won't make them more
#likely to smoke.
###Prediction is conceptually simpler since interpretation is not an issue but you do need to worry about extrapolation.
##Quantitative extrapolation: Is the new x0 within the range of validity of the model? Is it close to
#the range of the original data? If not, the prediction may be unrealistic.
range(savings$pop75) #0.56 4.70
grid <- seq(0,10,0.1)
p <- predict(g4,data.frame(pop75=grid),se=T)
cv <- qt(0.975,48)
matplot(grid,cbind(p$fit,p$fit-cv*p$se,p$fit+cv*p$se),lty=c(1,2,2), type="l",xlab="pop75",ylab="Saving")
rug(savings$pop75) #A "rug" shows the location of the observed values of pop75
#The confidence bands in the figure become wider as we move away from the range of the data.
#A model may fit well in the range of the data, but outside of that range, the predictions may be very bad.
##Qualitative extrapolation: If the model was built in the past and is to be used for future predictions,
#we must make a difficult judgment as to whether conditions have remained constant enough for this to work.
#4.Errors in Predictors(x) (error in Y is allowed for with epsilon)
#suppose: y_i = eta_i + epsilon_i, x_i = zeta_i + delta_i, where epsilon and delta are independent.
#Suppose the true underlying relationship is: eta_i = beta_0 + beta_1 * zeta_i, but we only see (x_i,y_i).
#Putting it together, we get y_i = beta_0 + beta_1 * zeta_i + (epsilon_i - beta_1 * delta_i)
#let sigma_zeta^2 = [sum(zeta_i - zeta_bar)^2]/n, sigma_{zeta,delta} = cov(zeta, delta)
#where zeta are the true values of X
#Now beta1_hat = [sum(x_i-x_bar)*y]/[sum(x_i-x_bar)^2]
#E[beta1_hat] = beta_1 * (sigma_zeta^2+sigma_{zeta,delta})/(sigma_zeta^2+sigma_{zeta,delta}+sigma_delta^2)
#If there is no relation between zeta and delta, this simplifies to:
#E[beta1_hat] = beta_1*sigma_zeta^2/(sigma_zeta^2+sigma_delta^2) = beta_1* 1/(1+sigma_delta^2/sigma_zeta^2)
#So in general beta_1 will be biased (regardless of the sample size and typically towards zero).
#If sigma_delta^2 is small relative to sigma_zeta^2 then the problem can be ignored.
#In other words, if the variability in the errors of observation of X is small relative to the range
#of X, then we need not be concerned.
#If not, it's a serious problem and other methods such as fitting using orthogonal rather than vertical
#distance in the least squares fit should be considered.
#For prediction, measurement error in the x's is not such a problem since the same error will apply to the
#new x0 and the model used will be the right one.
#True values of the regression coeffs are 0 and 1 respectively.
set.seed(1234)
x <- 10*runif(50)
y <- x+rnorm(50)
gx <- lm(y ~ x)
summary(gx) #intercept is not significant
#Compare the results - notice how the slope has decreased.
z <- x + rnorm(50)
gz <- lm(y ~ z)
summary(gz)
#add even more noise:
z2 <- x+5*rnorm(50)
gz2 <- lm(y ~ z2)
summary(gz2)
#compare:
matplot(cbind(x,z,z2),y,xlab="x",ylab="y")
abline(gx,lty=1)
abline(gz,lty=2)
abline(gz2,lty=5)
legend("bottomright",legend = c("gx","gz","gz2"),lty=c(1,2,5))
#Original x shown with "1", with small error as "2" and with large error as "3".
bc <- numeric(1000)
for(i in 1:1000){
y <- x + rnorm(50)
z <- x + 5*rnorm(50)
g <- lm(y ~ z)
bc[i] <- g$coef[2]
}
summary(bc)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#0.01285 0.18330 0.22110 0.22090 0.26100 0.38700
#Given that the variance of a standard uniform random variable is 1/12, sigma_delta^2 = 25
#and sigma_zeta^2 = 100/12, we'd expect the mean of beta1_hat to be about 0.25.
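#A quick check of that expected attenuation, using E[beta1_hat] = beta_1/(1+sigma_delta^2/sigma_zeta^2):
sigma_zeta2 <- 100/12 #variance of 10*runif(50)
sigma_delta2 <- 25 #variance of 5*rnorm(50)
1/(1 + sigma_delta2/sigma_zeta2) #0.25, close to mean(bc) above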
#5.Generalized Least Squares -- errors have non-constant variance or are correlated
#(1)THE GENERAL CASE -- assume errors are AR(1)
#previously, we assumed that var(epsilon) = sigma^2 * I.
#Instead, we assume var(epsilon) = sigma^2 * SIGMA. --> SIGMA carries the covariance structure.
#sigma^2 is unknown, SIGMA is known.
#then Generalized least squares minimizes (y-X*beta)'*SIGMA^(-1) * (y-X*beta)
#therefore, beta_hat = (X'SIGMA^(-1)X)^(-1) * X'*SIGMA^(-1)*y
#write SIGMA = S*S', where S is a triangular matrix using the Choleski Decomposition
#then minimize (y-X*beta)'* (S*S')^(-1) * (y-X*beta)=(S^(-1)*y-S^(-1)*X*beta)'*I*(S^(-1)*y-S^(-1)*X*beta)
#So GLS is like regressing S^(-1)*y on S^(-1)*X
#--> where y = X*beta + epsilon, S^(-1)*y = S^(-1)*X*beta + S^(-1)*epsilon;
#--> y_new = X_new*beta + epsilon_new
#--> var(epsilon_new) = var(S^(-1)*epsilon) = S^(-1)*var(epsilon)*S'^(-1)
#= S^(-1)*sigma^2*S*S'*S'^(-1) = sigma^2 * I
#--> var(beta_hat)=(X'*SIGMA^(-1)*X)^(-1) * sigma^2
#the practical problem is that SIGMA may not be known.
data(longley)
head(longley)
str(longley)
g <- lm(Employed ~ GNP + Population, data=longley) #here the errors are assumed to be uncorrelated.
summary(g,cor=T)
#notice the correlations of the coefficients are all close to 1 in absolute value
#In data collected over time such as this, successive errors could be correlated.
#assume that the errors take a simple autoregressive form:
#epsilon_{i+1} = rho * epsilon_i + delta_i, where delta_i ~ N(0,tau^2)
#under this assumption, SIGMA_ij = rho^|i-j|
#We can estimate this correlation 'rho' by:
cor(g$res[-1],g$res[-16]) #0.3104092
cor(residuals(g)[-1],residuals(g)[-16])
#We now construct the SIGMA matrix and compute the GLS estimate of beta along with its standard errors.
x <- model.matrix(g)
SIGMA <- diag(16)
SIGMA <- 0.31041^abs(row(SIGMA)-col(SIGMA)) #important!!!! How to create an AR matrix!
SigI <- solve(SIGMA)
xtxi <- solve(t(x) %*% SigI %*% x)
beta <- xtxi %*% t(x) %*% SigI %*% longley$Empl
beta
#Compare with the model output above where the errors are assumed to be uncorrelated.
#Another way to get the same result is to regress S^(-1)*y on S^(-1)*x as we demonstrate here:
sm <- chol(SIGMA) #note the variable is SIGMA; chol() returns the upper-triangular Cholesky factor
smi <- solve(t(sm))
sx <- smi %*% x #S^(-1)*X*beta
sy <- smi %*% longley$Empl #S^(-1)*y
g_refit<-lm(sy ~ sx-1)
g_refit$coef
g_refit$res
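#Standard errors for the GLS beta_hat follow from var(beta_hat) = (X'SIGMA^(-1)X)^(-1) * sigma^2;
#a small completing step (not in the original notes), estimating sigma from the transformed fit:
sqrt(diag(xtxi)) * summary(g_refit)$sigma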
#Our initial estimate of rho is 0.31 but once we fit our GLS model we'd need to re-estimate it
#The process would be iterated until convergence. This is cumbersome. A more convenient approach
#may be found in the nlme package, which contains a GLS fitting function.
library(nlme)
g <- gls(Employed ~ GNP + Population, correlation=corAR1(form= ~Year), data=longley)
#corAR1(value, form, fixed)
#value: the value of the lag 1 autocorrelation, which must be between -1 and 1. Defaults to 0 (no autocorrelation).
#form: a one sided formula of the form ~ t, or ~ t | g, specifying a time covariate t and, optionally, a grouping factor g.
# A covariate for this correlation structure must be integer valued.
# observations with different grouping levels are assumed to be uncorrelated.
#Defaults to ~ 1, which corresponds to using the order of the observations in the data as a covariate, and no groups.
#here, form=~Year is the same as form=~1
#fixed: an optional logical value indicating whether the coefficients should be allowed to
#vary in the optimization, or kept fixed at their initial value.
summary(g) #rho = 0.6441692
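#Since Year advances by one per row here, the default form=~1 should give an equivalent fit;
#a quick check (g_default is our own name):
g_default <- gls(Employed ~ GNP + Population, correlation=corAR1(), data=longley)
summary(g_default) #same estimated Phi (the AR(1) rho) as with form=~Year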
intervals(g) #approximate 95% CIs for the gls fit; note intervals() has no method for plain lm objects
#(2)Weighted Least Squares
#Sometimes the errors are uncorrelated, but have unequal variance where the form of the
#inequality is known. --> SIGMA is diagonal.
#we can write SIGMA = diag(1/w_1, 1/w_2, ..., 1/w_n)
#so S = diag(sqrt(1/w_1),sqrt(1/w_2),...,sqrt(1/w_n)), where the w_i are called the weights,
#and so we can regress sqrt(w_i)*y_i on sqrt(w_i)*x_i
#Therefore, cases with low variability should get a high weight and cases with high variability a low weight.
#examples of such situation:
#Errors proportional to a predictor: var(epsilon_i) ~ x_i suggests w_i = x_i^(-1)
#If y_i are the averages of n_i observations, then var(y_i) = var(epsilon_i) = sigma^2/n_i suggests w_i = n_i.
#fitting the regression with weights
data(strongx)
strongx
#The cross-section(crossx) variable is believed to be linearly related to the inverse of the energy (energy - has already been inverted)
#sd is the standard deviation of the response(y_i, that is, crossx).
g <- lm(crossx ~ energy, strongx, weights=sd^-2) #directly add weights option in lm()
summary(g)
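#As a sanity check of the sqrt(w) transformation described above, refit by hand
#(sw and gm are our own names for this illustration):
sw <- sqrt(strongx$sd^-2)
gm <- lm(sw*crossx ~ 0 + sw + I(sw*energy), data=strongx)
coef(gm) #matches coef(g): the sw column plays the role of the intercept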
#fitting the regression without weights
gu <- lm(crossx ~ energy, strongx)
summary(gu)
#The two fits can be compared
plot(crossx ~ energy, data=strongx)
abline(g)
abline(gu,lty=2)
#(3)Iteratively Reweighted Least Squares
#we may model SIGMA using a small number of parameters (predictors):
# var(epsilon_i) = gamma_0 + gamma_1 * x_i1
#The IRWLS fitting algorithm is (no need to memorize it - we have the rlm function; a minimal sketch follows):
# 1. Start with w_i = 1
# 2. Use least squares to estimate beta.
# 3. Use the residuals to estimate gamma, perhaps by regressing epsilon_hat^2 on x.
# 4. Recompute the weights and go to step 2. (Continue until convergence.)
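#A minimal hand-rolled sketch of that loop, assuming var(epsilon_i) = gamma_0 + gamma_1*x_i
#(illustrative only; irwls_sketch and the fixed iteration count are our own choices - use rlm()/gls() in practice):
irwls_sketch <- function(y, x, maxit=20){
  w <- rep(1, length(y)) #step 1: start with equal weights
  for(it in 1:maxit){
    fit <- lm(y ~ x, weights=w) #step 2: weighted least squares for beta
    vfit <- lm(resid(fit)^2 ~ x) #step 3: estimate gamma by regressing epsilon_hat^2 on x
    v <- pmax(fitted(vfit), 1e-6) #guard against non-positive fitted variances
    w <- 1/v #step 4: recompute the weights and repeat until convergence
  }
  fit
}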
library(foreign)
cdata <- read.dta("http://www.ats.ucla.edu/stat/data/crime.dta")
summary(cdata)
summary(ols <- lm(crime ~ poverty + single, data = cdata))
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(ols, las = 1) #las: numeric in {0,1,2,3}; the style of axis labels.
#0: always parallel to the axis [default],
#1: always horizontal #2: always perpendicular to the axis,
#3: always vertical.
par(opar) #will return to default par setting (1*1 mfrow)
cdata[c(9, 25, 51), 1:2]
d1 <- cooks.distance(ols)
library(MASS)
r <- stdres(ols) #The standardized residuals. !!important!!
a <- cbind(cdata, d1, r)
a[d1 > 4/51, ]
#Now we will look at the residuals. We will generate a new variable called absr1, which is the
#absolute value of the residuals
rabs <- abs(r)
a <- cbind(cdata, d1, r, rabs)
asorted <- a[order(-rabs), ]
asorted[1:10, ]
#Now let's run our first robust regression. Robust regression is done by iterated re-weighted least squares (IRLS).
#There are several weighting functions that can be used for IRLS.
#We are going to first use the Huber weights in this example
# rlm(x, y, weights, ..., w = rep(1, nrow(x)),
# init = "ls", psi = psi.huber,
# scale.est = c("MAD", "Huber", "proposal 2"), k2 = 1.345,
# method = c("M", "MM"), wt.method = c("inv.var", "case"),
# maxit = 20, acc = 1e-4, test.vec = "resid", lqs.control = NULL)
summary(rr.huber <- rlm(crime ~ poverty + single, data = cdata))
#In Huber weighting, observations with small residuals get a weight of 1 and the larger the
#residual, the smaller the weight.
#With bisquare weighting, all cases with a non-zero residual get down-weighted at least
#a little.
hweights <- data.frame(state = cdata$state, resid = rr.huber$resid, weight = rr.huber$w)
hweights2 <- hweights[order(rr.huber$w), ]
hweights2[1:15, ]
#We can see that roughly, as the absolute residual goes down, the weight goes up.
#In other words, cases with large residuals tend to be down-weighted. This output shows
#us that the observation for Mississippi will be down-weighted the most. Florida will also
#be substantially down-weighted. All observations not shown above have a weight of 1.
#In OLS regression, all cases have a weight of 1.
rr.bisquare <- rlm(crime ~ poverty + single, data=cdata, psi = psi.bisquare)
#Psi functions are supplied for the Huber, Hampel and Tukey bisquare proposals as
#psi.huber, psi.hampel and psi.bisquare
summary(rr.bisquare)
biweights <- data.frame(state = cdata$state, resid = rr.bisquare$resid, weight = rr.bisquare$w)
biweights2 <- biweights[order(rr.bisquare$w), ]
biweights2[1:15, ]
#We can see that the weight given to Mississippi is dramatically lower using the bisquare
#weighting function than the Huber weighting function and the parameter estimates from
#these two different weighting methods differ.
# When comparing the results of a regular OLS regression and a robust regression, if the
# results are very different, you will most likely want to use the results from the robust
# regression. Large differences suggest that the model parameters are being highly influenced
# by outliers.
#Huber weights can have difficulties with severe outliers,
#and bisquare weights can have difficulties converging or may yield multiple solutions.
#An alternative approach is to model the variance and jointly estimate the regression and
#weighting parameters using a likelihood-based method. This can be implemented in R using the
#gls() function in the nlme library.
#we last used gls() in 5(1), with correlation=corAR1(form= ~Year)
# AR(1) errors within each Mare
fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
correlation = corAR1(form = ~ 1 | Mare))
# variance increases as a power of the absolute fitted values
fm2 <- update(fm1, weights = varPower())
#6. Testing for Lack of Fit
#How can we tell if a model fits the data?
#If the model is correct then sigma_hat^2 should be an unbiased estimate of sigma^2.
#If a lack of fit is found, then a new model is needed.
#Criteria: If we have a model which is not complex enough to fit the data or simply takes
#the wrong form, then sigma_hat^2 will overestimate sigma^2
#Alternatively, if our model is too complex and overfits the data,
#then sigma_hat^2 will be an underestimate.
#This suggests a possible testing procedure - we should compare sigma_hat^2 to sigma^2.
#(1) sigma^2 is known -- chisq test
#because sigma_hat^2/sigma^2 ~ Chisq_{n-p}/(n-p)
#so there is a lack of fit if (n-p)*sigma_hat^2/sigma^2 > chisq_{n-p,1-alpha}
#Still use the strongx data, as we know the variance almost exactly because each response value
#is the average of a large number of observations
#Because of the way the weights are defined, w_i=1/var(y_i), the known variance is implicitly
#equal to one. There is nothing special about one - we could define w_i=99/var(y_i) and
#the variance would be implicitly 99.
data(strongx)
g <- lm(crossx ~ energy, weights=sd^-2, strongx)
summary(g)
plot(strongx$energy,strongx$crossx,xlab="Energy",ylab="Crossection")
abline(g$coef) #abline(g) also works
chistat<-1.66^2*8 #test stat: Residual standard error: 1.66 on 8 degrees of freedom
#because the known sigma^2 is set to 1, the chi-squared statistic is (n-p)*sigma_hat^2
#if the known sigma^2 were a different number, the statistic would be (n-p)*sigma_hat^2/sigma^2
1-pchisq(chistat,8) #0.0048332
#We conclude that there is a lack of fit. Just because R2 is large does not mean that
#you cannot do better. Add a quadratic term to the model and test again:
g2 <- lm(crossx ~ energy + I(energy^2), weights=sd^-2, strongx)
#g2 <- lm(crossx ~ energy + energy^2, weights=sd^-2, strongx) #without I(), the formula interprets energy^2 as just energy
summary(g2)
tstat2<-0.679^2*7
1-pchisq(tstat2,7) #This time we cannot detect a lack of fit.
x <- seq(0.05,0.35,by=0.01)
lines(x,g2$coef[1]+g2$coef[2]*x+g2$coef[3]*x^2,lty=2) #lty=2: a dashed line on the plot
#(2) sigma^2 is unknown -- estimate one -- F-statistic: F_{n-p-df_pe,df_pe}
#we can partition the RSS into that due to lack of fit(between subject variability)
#and that due to the pure error(within subject variability or the measurement error)
###ANOVA of LACK OF FIT TABLE
#RSS: df = n-p; SS = RSS
#lack of fit: df = n-p-df_pe; SS= RSS-SS_pe; MS = (RSS-SS_pe)/(n-p-df_pe); F = Ratio of MS
#pure error: df_pe; SS = SS_pe; MS=SS_pe/df_pe
#if there are no replicates (duplicate x values), then there is no pure error.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!when there is no replicate!!!!!!!!!!!!!!!!!!!!!!!#
#When there are no replicates (hence no within-subject variance), it may be possible to group the
#responses for similar x (use the cut() function) but this is not straightforward. It is also possible
#to detect lack of fit by less formal, graphical methods.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!if there is replicates!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#Compute the F-statistic = [(RSS-SS_pe)/(n-p-df_pe)] / (SS_pe/df_pe)
#and compare to F_{n-p-df_pe,df_pe} and reject if the statistic is too large.
#The "pure error" estimate of sigma^2 is given by SS_pe/df_pe, that is, MS
#where SS_pe = sum_distinctx (sum_givenx((y_i-y_bar)^2))
# df_pe = sum_distinctx(#replicates - 1)
#sum_givenx((y_i-y_bar)^2) means within-subject variance.
#If you fit a model that assigns one parameter to each group of observations with fixed x
#then the sigma^2 from this model will be the pure error sigma^2 -- group x by factoring.
#This model is just the one-way ANOVA model.
#Another way of looking at this is a comparison between the model of interest and a
#saturated model that assigns a parameter to each unique combination of the predictors.
data(corrosion)
corrosion
g <- lm(loss ~ Fe, data=corrosion)
summary(g) #Residual standard error: 3.058
plot(corrosion$Fe,corrosion$loss,xlab="Iron content",ylab="Weight loss")
abline(g$coef)
#We have an R2 of 97% and an apparently good fit to the data.
#We now fit a model that reserves a parameter for each group of data with the same value of x
#This is accomplished by declaring the predictor to be a factor.
unique(corrosion$Fe) #7 groups: 0.01 0.48 0.71 0.95 1.19 1.44 1.96
ga <- lm(loss ~ factor(Fe), data=corrosion) # -- group x by factoring, which removes the between-group variance;
#all remaining error is pure error!!!!!
#The fitted values are the means in each group - put these on the plot:
points(corrosion$Fe,ga$fit,pch=18)
#We can now compare the two models in the usual way
anova(g,ga) #compare the pure-error estimate (SS_pe/df_pe) with the residual standard error 3.058
#notice the table is not the ANOVA OF LACK OF FIT TABLE above; we only want the pure-error quantities from it.
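#Assembling the lack-of-fit F from the table layout above by hand (our own variable names; values match anova(g,ga)):
rss <- sum(g$res^2)
ss_pe <- sum(ga$res^2) #pure-error SS from the saturated (factor) model
df_pe <- df.residual(ga) #sum over distinct Fe of (replicates - 1)
df_lof <- df.residual(g) - df_pe #lack-of-fit df = (n-p) - df_pe
f <- ((rss - ss_pe)/df_lof)/(ss_pe/df_pe)
c(F=f, p=1-pf(f, df_lof, df_pe)) #same F and p-value as anova(g,ga)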
#The low p-value indicates that we must conclude that there is a lack of fit.
#The reason is that the "pure error" sqrt(11.782/6) = 1.4 is substantially less than the
#regression standard error of 3.058. We might investigate models other than a straight
#line although no obvious alternative is suggested by the plot.
#Before considering other models:
#I would first find out whether the replicates are genuine - perhaps the low pure error SD
#can be explained by some correlation in the measurements.
#When there are replicates, it is impossible to get a perfect fit. In these cases, one
#should realize that the maximum R2 that may be attained might be substantially less than
#100%, and so perceptions of what a good value for R2 is should be downgraded appropriately.
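#A concrete version of that ceiling for the corrosion data: the best attainable R2 leaves only
#the pure-error SS unexplained (a quick illustrative computation using the factor model ga):
1 - sum(ga$res^2)/sum((corrosion$loss - mean(corrosion$loss))^2)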
#Another possible explanation is that an unmeasured third variable is causing the lack of fit.
#These methods are good for detecting lack of fit, but if the null hypothesis is accepted,
#we cannot conclude that we have the true model. After all, it may be that we just did not
#have enough data to detect the inadequacies of the model. All we can say is that the model
#is not contradicted by the data.
#A more general question is how good a fit do you really want? By increasing the complexity
#of the model, it is possible to fit the data more closely. By using as many parameters as data
#points, we can fit the data exactly. Very little is achieved by doing this since we learn
#nothing beyond the data itself and any predictions made using such a model will tend to have
#very high variance.
#For example, we can fit the mean responses for the example above exactly using a sixth order polynomial:
gp <- lm(loss ~ Fe+I(Fe^2)+I(Fe^3)+I(Fe^4)+I(Fe^5)+I(Fe^6),corrosion)
plot(loss ~ Fe, data=corrosion,ylim=c(60,130))
points(corrosion$Fe,ga$fit,pch=18)
grid <- seq(0,2,len=50)
lines(grid,predict(gp,data.frame(Fe=grid)))
summary(gp)$r.squared #0.99653
#The fit of this model is excellent, but it is clearly ridiculous. --> overfitting
#This illustrates the need not to become too focused on measures of fit like R^2.
#7. Diagnostics - influential points, assumption of residuals.
#(1) Residuals and Leverage
#y_hat = X*(X'X)^(-1)*X'*y = H*y, where H is the hat-matrix, which depends only on X
#Residual(epsilon_hat) is different from error:
#epsilon_hat = y-y_hat = (I-H)*y = (I-H)*X*beta + (I-H)*epsilon = (I-H)*epsilon
#var(epsilon_hat) = var((I-H)*epsilon) = (I-H)*sigma^2 (assuming var(epsilon) = sigma^2*I)
#h_i = H_ii are called leverages. The h_i depend only on X.
#var(epsilon_hat_i) = sigma^2*(1-h_i), so that a large leverage for h_i will make
#var(epsilon_hat_i) small, in other words the fit will be "forced" to be close to y_i.
# leverage points have residuals with less variability than residuals from non-leverage points.
# a leverage point is characterized by a fitted value close to the observed response, so its
#residual is likely to be close to zero. (A leverage point can be seen as an x-axis outlier, whereas a
#normal outlier is a y-axis outlier; a high-leverage point that follows the overall trend is a good
#leverage point, while one that deviates from it is a bad leverage point.)
#Facts: sum_{i}(h_i) = p; h_i >= 1/n for any i when the model contains an intercept
#An average value for h_i is p/n and a "rule of thumb" is that leverages of more than 2p/n
#should be looked at more closely.
#Large values of h_i are due to extreme values in X.
#var(y_hat) = var(H*y) = H*sigma^2, so var(y_hat_i) = h_i * sigma^2
data(savings)
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, savings)
plot(g$res,ylab="Residuals",main="Index plot of residuals")
sort(g$res)[c(1,50)]
countries <- row.names(savings)
identify(1:50,g$res,countries) #Click the left mouse button next to the points you
#are interested in to identify them, and press Esc to finish and show the labels.
#identify(x, y = NULL, labels = seq_along(x), pos = FALSE, n = length(x), plot = TRUE, atpen = FALSE, offset = 0.5, tolerance = 0.25, ...)
#Now look at the leverage: We first extract the X-matrix here using model.matrix() and then
#compute and plot the leverages (also called "hat" values)
x <- model.matrix(g)
lev <- hat(x) #get hat values H[i,i] (leverages)
plot(lev,ylab="Leverages",main="Index plot of Leverages")
abline(h=2*5/50) #rule of thumb
sum(lev) #sum_{i}(h_i) = p
names(lev) <- countries
lev[lev > 0.2] ##rule of thumb cutoff
#Alternatively, we can do it interactively like this
identify(1:50,lev,countries)
#(2)Studentized Residuals - a studentized residual is the residual divided by an estimate of its standard deviation.
#This is an important technique in the detection of outliers.
#(internally) studentized residuals: the ith case is included.
#var(epsilon_hat_i) = sigma^2*(1-h_i) --> r_i = epsilon_hat_i/(sigma_hat*sqrt(1-h_i))
#If the model assumptions are correct var(r_i)=1 and corr(r_i,r_j) tends to be small.
#(see 7.8) Studentized residuals are sometimes preferred in residual plots as they have been standardized to have equal variance.
#Note that studentization can only correct for the natural non-constant variance in residuals when the
#errors have constant variance. If there is some underlying heteroscedasticity in the errors (epsilon),
#studentization cannot correct for it.
#studentized residuals for the savings data:
gs <- summary(g)
gs$sigma
stud <- g$res/(gs$sigma * sqrt(1-lev))
plot(stud,ylab="Studentized Residuals",main="Studentized Residuals")
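#Base R computes the same internally studentized residuals directly (a quick cross-check):
rstandard(g) #agrees with stud above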
#(3) An outlier Test -- jackknife residuals -- t dist.
#jackknife (or externally studentized or cross-validated) residuals -- the ith case is excluded.
#t_i = epsilon_i_hat/[sigma_(i)_hat * sqrt(1-h_i)] = r_i * sqrt[(n-p-1)/(n-p-r_i^2)] #(i) means leaving out the ith case
#t_i ~ t_(n-p-1); if the p-value is too small, then case i is an outlier
# Bonferroni correction:
#P(all tests accept) = 1 - P(At least one rejects) >= 1 - sum_{i}P(Test i rejects) = 1 - n*alpha
jack <- rstudent(g)
plot(jack,ylab="Jackknife Residuals",main="Jackknife Residuals")
jack[abs(jack)==max(abs(jack))] #pick up the largest jackknife residual.
#The largest residual of 2.85 is pretty big for a standard normal scale but is it an outlier?
#Compute the Bonferroni critical value:
abs(qt(.05/(50*2),44)) #3.525801 > 2.85. So it's not significant.
#notes:
# Two or more outliers next to each other can hide each other.
# An outlier in one model may not be an outlier in another when the variables have been changed or
# transformed. You will usually need to reinvestigate the question of outliers
# when you change the model.
# for large datasets, we need only worry about clusters of outliers.
#Here is an example of a dataset with multiple outliers.
#Data are available on the log of the surface temperature and the log of the light intensity
#of 47 stars in the star cluster CYG OB1, which is in the direction of Cygnus.
data(star)
head(star)
plot(star$temp,star$light,xlab="log(Temperature)", ylab="log(Light Intensity)")
ga <- lm(light ~ temp, data=star)
gu <- lm(light ~ temp, data=star, subset=(temp>3.6)) #subset argument!!!!
abline(ga) #apparently, the line is distorted by the outliers at the upper left
abline(gu$coef,lty=2)
range(rstudent(ga)) #Are there any outliers in the data? The outlier test does not reveal any.
abs(qt(.05/(47*2),47-1-1)) #critical value 3.499247; the residual range (-2.102756, 2.780525) stays within it, so no outlier is flagged
#This illustrates the problem of multiple outliers. We can visualize the problems here, but for higher
#dimensional data this is much more difficult.
#(4)Influential Observations - an influential point is usually an outlier, a large-leverage point, or both
#measures of influence:
#Change in the coefficients: beta_hat - beta_(i)_hat
#Change in the fit: X*(beta_hat - beta_(i)_hat) = y_hat - y_(i)_hat
###Cook Statistics. D_i = [(beta_hat - beta_(i)_hat)'(X'X)(beta_hat - beta_(i)_hat)] / (p*sigma_hat^2)
#= [y_hat - y_(i)_hat]'[y_hat - y_(i)_hat] / (p*sigma_hat^2)
#= (1/p) * r_i^2 * h_i/(1-h_i)
#The first term, r_i^2, is the residual effect and the second is the leverage.
#The combination of the two leads to influence.
cook <- cooks.distance(g)
plot(cook,ylab="Cooks distances")
identify(1:50,cook,countries)
gl <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(cook < max(cook))) #subset argument!!!
summary(gl)
summary(g)
#What changed? The coefficient for ddpi changed by about 50%. We don't like our estimates to be so
#sensitive to the presence of just one country.
#It would be rather tedious to do this for each country but there's a quicker way:
ginf <- lm.influence(g)
#results of lm.influence:
#hat: a vector containing the diagonal of the 'hat' matrix.
#coefficients: a matrix whose i-th row contains the change in the estimated coefficients which
#results when the i-th case is dropped from the regression.
#sigma: a vector whose i-th element contains the estimate of the residual standard deviation
#obtained when the i-th case is dropped from the regression.
#wt.res: a vector of weighted (or for class glm rather deviance) residuals.
ginf$coef #seems we need to plot it to see it clearly.
plot(ginf$coef[,2],ginf$coef[,3],xlab="Change in pop15 coef", ylab="Change in pop75 coef")
identify(ginf$coef[,2],ginf$coef[,3],countries)
gj <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=!(countries %in% c("Japan","Libya","Ireland")))
summary(gj)
summary(g)
#Notice that the ddpi term is no longer significant and that the R2 value has decreased a lot.
#(5) Residual Plots --> check the assumptions of the model,
#The most important diagnostic plot: Plot epsilon_hat(residual) against y_hat.
#You should also plot epsilon_hat against x_i (for predictors that are both in and out of the model)
#Things to look for are heteroscedasticity and nonlinearity
g <- lm(sr ~ pop15+pop75+dpi+ddpi,savings)
plot(g$fit,g$res,xlab="Fitted",ylab="Residuals") #check both non-constant variance and nonlinearity
abline(h=0)
plot(g$fit,abs(g$res),xlab="Fitted",ylab="|Residuals|") #check for non-constant variance only
#A quick way to check non-constant variance is this regression:
summary(lm(abs(g$res) ~ g$fit))
#This test is not quite right as some weighting should be used and the degrees of freedom should be
#adjusted but there doesn't seem to be a clear problem with non-constant variance.
gr <- rlm(sr ~ pop15+pop75+dpi+ddpi,savings)
plot(gr$fit,gr$res,xlab="Fitted_psi.huber",ylab="Residuals")
gr_ha <- rlm(sr ~ pop15+pop75+dpi+ddpi,savings,psi=psi.hampel)
plot(gr_ha$fit,gr_ha$res,xlab="Fitted_psi.hampel",ylab="Residuals")
gr_bi <- rlm(sr ~ pop15+pop75+dpi+ddpi,savings,psi=psi.bisquare)
plot(gr_bi$fit,gr_bi$res,xlab="Fitted_psi.bisquare",ylab="Residuals")
#artificial example:
par(mfrow=c(3,3))
for(i in 1:9) plot(1:50,rnorm(50)) #Constant Variance
for(i in 1:9) plot(1:50,(1:50)*rnorm(50)) #Strong non-constant variance
for(i in 1:9) plot(1:50,sqrt((1:50))*rnorm(50)) #Mild non-constant variance
for(i in 1:9) plot(1:50,cos((1:50)*pi/25)+rnorm(50)) #Non-linearity
par(mfrow=c(1,1))
#we can see that it is often hard to judge whether an apparent feature is real or just random variation.
#Repeated generation of plots under a model where there is no violation of the assumption
#that the diagnostic plot is designed to check is helpful in making this judgement.
#epsilon_hat against x_i
plot(savings$pop15,g$res,xlab="pop15",ylab="Residuals")
plot(savings$pop75,g$res,xlab="pop75",ylab="Residuals")
plot(savings$dpi,g$res,xlab="dpi",ylab="Residuals")
#from the first plot we can see two groups. Let's compare and test the variances of these two groups
#Given two independent samples from normal distributions, we can test for equal variance using
#the test statistic of the ratio of the two variance.
#The null distribution is a F with degrees of freedom given by the two samples.
var(g$res[savings$pop15 > 35])/var(g$res[savings$pop15 <35])
table(savings$pop15 > 35)
1-pf(2.7851,22,26) #A significant difference is seen
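#Base R's var.test() runs the same variance-ratio F test directly (two-sided by default):
var.test(g$res[savings$pop15 > 35], g$res[savings$pop15 < 35])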
#(6) Deal with Non-Constant Variance
#two approaches to dealing with non-constant variance:
#Weighted least squares is appropriate when the form of the non-constant variance is either
#known exactly or there is some known parametric form.
#Alternatively, one can transform y to h(y) where h() is chosen so that var(h(y)) is constant.
#how to choose h(y):
#consider h(y)=h(Ey)+(y-Ey)h'(Ey) + ...
#var(h(y)) = h'(Ey)^2 * var(y) + ...
#--> For var(h(y)) to be constant we need: h'(Ey) ~ (var(y))^(-0.5),
#solve the ODE: h(y) = integrate(dy/sqrt(var(y))) = integrate(dy/SDy)
#For example if var(y)=var(epsilon) ~ (Ey)^2 then h(y)=log(y) is suggested
#while if var(epsilon) ~ E(y), then h(y) = sqrt(y). Graphically one tends to see SD(y) rather than var(y).
#Personal comment: we could regress var(epsilon) on (Ey)^2 or E(y) (the data would come from repeated sampling).
#A formal test may be good at detecting a particular kind of non-constant variance but have no
#power to detect another. Residual plots are more versatile because unanticipated problems may be spotted.
#Sometimes y_i<=0 for some i in which case the transformations may choke.
#You can try log(y+delta) for some small delta but this makes interpretation difficult.
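#A small simulated illustration (artificial data, our own construction): with multiplicative error,
#SD(y) is roughly proportional to the mean and log(y) stabilizes the variance
mu <- rep(c(2,5,10,20), each=25)
ysim <- mu*exp(0.2*rnorm(100)) #sd roughly proportional to the mean
tapply(ysim, mu, sd) #sd grows with the mean
tapply(log(ysim), mu, sd) #roughly constant (about 0.2) after the log transform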
#Consider the residual-fitted plot for the Galapagos data:
gg <- lm(Species ~ Area + Elevation + Scruz + Nearest + Adjacent, gala)
plot(gg$fit,gg$res,xlab="Fitted",ylab="Residuals", main="Untransformed Response")
#We guess that a square root transformation will give us constant variance:
gs <- lm(sqrt(Species) ~ Area+ Elevation+ Scruz+ Nearest+ Adjacent, gala)
plot(gs$fit,gs$res,xlab="Fitted",ylab="Residuals",main="Square root Response")
#We see in the second plot that the variance is now constant. Our guess at a variance stabilizing
#transformation worked out here, but had it not, we could always have tried something else.
#The poisson distribution is a good model for counts and that distribution has the property that
#the mean is equal to the variance thus suggesting the "square root transformation". It might be even
#better to go with a poisson regression rather than the normal-based regression we are using here.
#There are more formal tests for non-constant variance - for example one could start by regressing
#|epsilon_hat| on y or x_i, but there is a problem in specifying the alternative hypothesis for such a test.
#(7) Deal with Non-Linearity
#(recap: nonlinearity means the residuals show a nonlinear pattern against the fitted values, because the
#linear part has been explained by the linear combination of predictors, the nonlinear part
#would be left in the residual.)
#Lack of fit tests can be used when there is replication which doesn't happen too often, but even
#if you do have it, the tests don't tell you how to improve the model.
#How do we check if the systematic part (Ey = X*beta) of the model is correct?
#1. Plots of epsilon_hat against y_hat and x_i
#2. Plots of y against each xi.
#but what about the effect of other x on the y vs. xi plot?
#two methods:
#(a) Partial Regression (Added Variable) plots can help isolate the effect of xi on y.
#1. Regress y on all x except xi, get residuals delta_hat. This represents y with the other X-effect taken out.
#2. Regress xi on all x except xi, get residuals gamma_hat. This represents xi with the other X-effect taken out.
#3. Plot delta_hat against gamma_hat.
#so the slope of a line fitted to the plot must be beta_i_hat
#(b) Partial residual plots: plot (epsilon_hat + beta_i_hat * x_i) against x_i. -- only plot, no regression
#Again the slope on the plot will be beta_i_hat and the interpretation is the same.
#To see where this comes from, look at the response with the predicted effect of the other X removed:
#x_i*beta_i_hat + epsilon_hat = y_hat + epsilon_hat - sum_{j!=i}(x_j*beta_j_hat)
#= y - sum_{j!=i}(x_j*beta_j_hat)
#which is the response with the predicted effect of the other X removed.
####Rule: Partial residual plots are reckoned to be better for non-linearity detection while added variable
#plots are better for outlier/influential detection.
#e.g.
#First we construct a partial regression (added variable) plot for pop15:
d <- lm(sr ~ pop75 + dpi + ddpi,savings)$res
m <- lm(pop15 ~ pop75 + dpi + ddpi,savings)$res
plot(m,d,xlab="pop15 with the other X-effect taken out",ylab="savings with the other X-effect taken out",main="Partial Regression")
#The slope of a line fitted to the plot is beta_i_hat.
lm(d ~ m)$coef #slope: -4.611931e-01
#Compare the slope of the plot to the original regression and show the line on the plot
g <- lm(sr ~ pop15+pop75+dpi+ddpi,savings)
g$coef #slope of pop15: -0.4611931471 same with above.
abline(0,lm(d ~ m)$coef[2])
abline(0,g$coef['pop15']) #same as above.
# In the partial regression plot, we are looking at the marginal relationship between the response
# and the predictor after the effect of the other predictors has been removed. Multiple regression
# is difficult because we cannot visualize the full relationship because of the high dimensionality
#(a scatterplot of points cannot be drawn against more than one predictor at once).
# The partial regression plot allows us to focus on the relationship between one predictor and the
# response, much as in simple regression.
#A partial residual plot is easier to do:
plot(savings$pop15,g$res+g$coef['pop15']*savings$pop15,xlab="pop'n under 15", ylab="Saving(adjusted)",main="Partial Residual")
abline(0,g$coef['pop15'])
#or more directly:
prplot(g,1)
#compare added variable plots and Partial residual plots
con<-par(mfrow=c(1,2))
#added variable plots are better for outlier/influential detection.
plot(m,d,xlab="pop15 with the other X-effect taken out",ylab="savings with the other X-effect taken out",main="Partial Regression")
abline(0,g$coef['pop15'])
#Partial residual plots are reckoned to be better for non-linearity detection
plot(savings$pop15,g$res+g$coef['pop15']*savings$pop15,xlab="pop'n under 15", ylab="Saving(adjusted)",main="Partial Residual")
abline(0,g$coef['pop15'])
par(con)
#From the partial regression, we see there might be a different relationship in the two groups.
#(the conclusion is similar to the one shown in the 'epsilon_hat against x_i' part)
g1 <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(pop15 > 35))
g2 <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(pop15 < 35))
summary(g1) #the g1 fit is essentially uninformative - almost nothing is significant
summary(g2)
#Higher dimensional plots can also be useful for detecting structure that cannot be seen in two
#dimensions. These are interactive in nature so you need to try them to see how they work.
#Two ideas are:
#1. Spinning - 3D plots where color, point size and rotation are used to give illusion of a third dimension.
#2. Brushing - Two or more plots are linked so that point which are brushed in one plot are highlighted in another.
#These tools look good but it's not clear whether they actually are useful in practice.