<!DOCTYPE html>
<html >
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Chapter 3 Single & Multiple Linear Regression | Machine Learning with R</title>
<meta name="description" content="This book is about using R for machine learning purposes.">
<meta name="generator" content="bookdown and GitBook 2.6.7">
<meta property="og:title" content="Chapter 3 Single & Multiple Linear Regression | Machine Learning with R" />
<meta property="og:type" content="book" />
<meta property="og:description" content="This book is about using R for machine learning purposes." />
<meta name="github-repo" content="fderyckel/machinelearningwithr" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 3 Single & Multiple Linear Regression | Machine Learning with R" />
<meta name="twitter:description" content="This book is about using R for machine learning purposes." />
<meta name="author" content="François de Ryckel">
<meta name="date" content="2019-02-23">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="prev" href="testinference.html">
<link rel="next" href="logistic.html">
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<script src="libs/kePrint-0.0.1/kePrint.js"></script>
<style type="text/css">
div.sourceCode { overflow-x: auto; }
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
code > span.dt { color: #902000; } /* DataType */
code > span.dv { color: #40a070; } /* DecVal */
code > span.bn { color: #40a070; } /* BaseN */
code > span.fl { color: #40a070; } /* Float */
code > span.ch { color: #4070a0; } /* Char */
code > span.st { color: #4070a0; } /* String */
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
code > span.ot { color: #007020; } /* Other */
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
code > span.fu { color: #06287e; } /* Function */
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code > span.cn { color: #880000; } /* Constant */
code > span.sc { color: #4070a0; } /* SpecialChar */
code > span.vs { color: #4070a0; } /* VerbatimString */
code > span.ss { color: #bb6688; } /* SpecialString */
code > span.im { } /* Import */
code > span.va { color: #19177c; } /* Variable */
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code > span.op { color: #666666; } /* Operator */
code > span.bu { } /* BuiltIn */
code > span.ex { } /* Extension */
code > span.pp { color: #bc7a00; } /* Preprocessor */
code > span.at { color: #7d9029; } /* Attribute */
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>
<link rel="stylesheet" href="style.css" type="text/css" />
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><strong><a href="./">Machine Learning with R</a></strong></li>
<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Prerequisites</a><ul>
<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#pre-requisite-and-conventions"><i class="fa fa-check"></i><b>1.1</b> Pre-requisite and conventions</a></li>
<li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#organization"><i class="fa fa-check"></i><b>1.2</b> Organization</a></li>
<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#packages"><i class="fa fa-check"></i><b>1.3</b> Packages</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="testinference.html"><a href="testinference.html"><i class="fa fa-check"></i><b>2</b> Tests and inferences</a><ul>
<li class="chapter" data-level="2.1" data-path="testinference.html"><a href="testinference.html#normality"><i class="fa fa-check"></i><b>2.1</b> Assumption of normality</a><ul>
<li class="chapter" data-level="2.1.1" data-path="testinference.html"><a href="testinference.html#visual-check-of-normality"><i class="fa fa-check"></i><b>2.1.1</b> Visual check of normality</a></li>
<li class="chapter" data-level="2.1.2" data-path="testinference.html"><a href="testinference.html#normality-tests"><i class="fa fa-check"></i><b>2.1.2</b> Normality tests</a></li>
</ul></li>
<li class="chapter" data-level="2.2" data-path="testinference.html"><a href="testinference.html#ttest"><i class="fa fa-check"></i><b>2.2</b> T-tests</a></li>
<li class="chapter" data-level="2.3" data-path="testinference.html"><a href="testinference.html#anova---analyse-of-variance."><i class="fa fa-check"></i><b>2.3</b> ANOVA - Analyse of variance.</a></li>
<li class="chapter" data-level="2.4" data-path="testinference.html"><a href="testinference.html#covariance"><i class="fa fa-check"></i><b>2.4</b> Covariance</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="mlr.html"><a href="mlr.html"><i class="fa fa-check"></i><b>3</b> Single & Multiple Linear Regression</a><ul>
<li class="chapter" data-level="3.1" data-path="mlr.html"><a href="mlr.html#single-variable-regression"><i class="fa fa-check"></i><b>3.1</b> Single variable regression</a></li>
<li class="chapter" data-level="3.2" data-path="mlr.html"><a href="mlr.html#multi-variables-regression"><i class="fa fa-check"></i><b>3.2</b> Multi-variables regression</a><ul>
<li class="chapter" data-level="3.2.1" data-path="mlr.html"><a href="mlr.html#predicting-wine-price-again"><i class="fa fa-check"></i><b>3.2.1</b> Predicting wine price (again!)</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="mlr.html"><a href="mlr.html#model-diagnostic-and-evaluation"><i class="fa fa-check"></i><b>3.3</b> Model diagnostic and evaluation</a></li>
<li class="chapter" data-level="3.4" data-path="mlr.html"><a href="mlr.html#final-example---boston-dataset---with-backward-elimination"><i class="fa fa-check"></i><b>3.4</b> Final example - Boston dataset - with backward elimination</a><ul>
<li class="chapter" data-level="3.4.1" data-path="mlr.html"><a href="mlr.html#model-diagmostic"><i class="fa fa-check"></i><b>3.4.1</b> Model diagmostic</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="mlr.html"><a href="mlr.html#references"><i class="fa fa-check"></i><b>3.5</b> References</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="logistic.html"><a href="logistic.html"><i class="fa fa-check"></i><b>4</b> Logistic Regression</a><ul>
<li class="chapter" data-level="4.1" data-path="logistic.html"><a href="logistic.html#introduction"><i class="fa fa-check"></i><b>4.1</b> Introduction</a></li>
<li class="chapter" data-level="4.2" data-path="logistic.html"><a href="logistic.html#the-logistic-equation."><i class="fa fa-check"></i><b>4.2</b> The logistic equation.</a></li>
<li class="chapter" data-level="4.3" data-path="logistic.html"><a href="logistic.html#performance-of-logistic-regression-model"><i class="fa fa-check"></i><b>4.3</b> Performance of Logistic Regression Model</a></li>
<li class="chapter" data-level="4.4" data-path="logistic.html"><a href="logistic.html#setting-up"><i class="fa fa-check"></i><b>4.4</b> Setting up</a></li>
<li class="chapter" data-level="4.5" data-path="logistic.html"><a href="logistic.html#example-1---graduate-admission"><i class="fa fa-check"></i><b>4.5</b> Example 1 - Graduate Admission</a></li>
<li class="chapter" data-level="4.6" data-path="logistic.html"><a href="logistic.html#example-2---diabetes"><i class="fa fa-check"></i><b>4.6</b> Example 2 - Diabetes</a><ul>
<li class="chapter" data-level="4.6.1" data-path="logistic.html"><a href="logistic.html#accounting-for-missing-values"><i class="fa fa-check"></i><b>4.6.1</b> Accounting for missing values</a></li>
<li class="chapter" data-level="4.6.2" data-path="logistic.html"><a href="logistic.html#imputting-missing-values"><i class="fa fa-check"></i><b>4.6.2</b> Imputting Missing Values</a></li>
<li class="chapter" data-level="4.6.3" data-path="logistic.html"><a href="logistic.html#roc-and-auc"><i class="fa fa-check"></i><b>4.6.3</b> ROC and AUC</a></li>
</ul></li>
<li class="chapter" data-level="4.7" data-path="logistic.html"><a href="logistic.html#references-1"><i class="fa fa-check"></i><b>4.7</b> References</a></li>
</ul></li>
<li class="chapter" data-level="5" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html"><i class="fa fa-check"></i><b>5</b> Softmax and multinomial regressions</a><ul>
<li class="chapter" data-level="5.1" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html#multinomial-logistic-regression"><i class="fa fa-check"></i><b>5.1</b> Multinomial Logistic Regression</a></li>
<li class="chapter" data-level="5.2" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html#references-2"><i class="fa fa-check"></i><b>5.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="gradient-descent.html"><a href="gradient-descent.html"><i class="fa fa-check"></i><b>6</b> Gradient Descent</a><ul>
<li class="chapter" data-level="6.1" data-path="gradient-descent.html"><a href="gradient-descent.html#example-on-functions"><i class="fa fa-check"></i><b>6.1</b> Example on functions</a></li>
<li class="chapter" data-level="6.2" data-path="gradient-descent.html"><a href="gradient-descent.html#example-on-regressions"><i class="fa fa-check"></i><b>6.2</b> Example on regressions</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="knnchapter.html"><a href="knnchapter.html"><i class="fa fa-check"></i><b>7</b> KNN - K Nearest Neighbour</a><ul>
<li class="chapter" data-level="7.1" data-path="knnchapter.html"><a href="knnchapter.html#example-1.-prostate-cancer-dataset"><i class="fa fa-check"></i><b>7.1</b> Example 1. Prostate Cancer dataset</a></li>
<li class="chapter" data-level="7.2" data-path="knnchapter.html"><a href="knnchapter.html#example-2.-wine-dataset"><i class="fa fa-check"></i><b>7.2</b> Example 2. Wine dataset</a><ul>
<li class="chapter" data-level="7.2.1" data-path="knnchapter.html"><a href="knnchapter.html#understand-the-data"><i class="fa fa-check"></i><b>7.2.1</b> Understand the data</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="knnchapter.html"><a href="knnchapter.html#references-3"><i class="fa fa-check"></i><b>7.3</b> References</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="kmeans.html"><a href="kmeans.html"><i class="fa fa-check"></i><b>8</b> Kmeans clustering</a><ul>
<li class="chapter" data-level="8.1" data-path="kmeans.html"><a href="kmeans.html#multinomial-logistic-regression-1"><i class="fa fa-check"></i><b>8.1</b> Multinomial Logistic Regression</a></li>
<li class="chapter" data-level="8.2" data-path="kmeans.html"><a href="kmeans.html#references-4"><i class="fa fa-check"></i><b>8.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="hierclust.html"><a href="hierclust.html"><i class="fa fa-check"></i><b>9</b> Hierarichal Clustering</a><ul>
<li class="chapter" data-level="9.1" data-path="hierclust.html"><a href="hierclust.html#example-on-the-pokemon-dataset"><i class="fa fa-check"></i><b>9.1</b> Example on the Pokemon dataset</a></li>
<li class="chapter" data-level="9.2" data-path="hierclust.html"><a href="hierclust.html#example-on-regressions-1"><i class="fa fa-check"></i><b>9.2</b> Example on regressions</a></li>
<li class="chapter" data-level="9.3" data-path="hierclust.html"><a href="hierclust.html#references-5"><i class="fa fa-check"></i><b>9.3</b> References</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="pca.html"><a href="pca.html"><i class="fa fa-check"></i><b>10</b> Principal Component Analysis</a><ul>
<li class="chapter" data-level="10.1" data-path="pca.html"><a href="pca.html#pca-on-an-easy-example."><i class="fa fa-check"></i><b>10.1</b> PCA on an easy example.</a></li>
<li class="chapter" data-level="10.2" data-path="pca.html"><a href="pca.html#references."><i class="fa fa-check"></i><b>10.2</b> References.</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="trees-and-classification.html"><a href="trees-and-classification.html"><i class="fa fa-check"></i><b>11</b> Trees and Classification</a><ul>
<li class="chapter" data-level="11.1" data-path="trees-and-classification.html"><a href="trees-and-classification.html#introduction-1"><i class="fa fa-check"></i><b>11.1</b> Introduction</a></li>
<li class="chapter" data-level="11.2" data-path="trees-and-classification.html"><a href="trees-and-classification.html#first-example."><i class="fa fa-check"></i><b>11.2</b> First example.</a></li>
<li class="chapter" data-level="11.3" data-path="trees-and-classification.html"><a href="trees-and-classification.html#second-example."><i class="fa fa-check"></i><b>11.3</b> Second Example.</a></li>
<li class="chapter" data-level="11.4" data-path="trees-and-classification.html"><a href="trees-and-classification.html#how-does-a-tree-decide-where-to-split"><i class="fa fa-check"></i><b>11.4</b> How does a tree decide where to split?</a></li>
<li class="chapter" data-level="11.5" data-path="trees-and-classification.html"><a href="trees-and-classification.html#third-example."><i class="fa fa-check"></i><b>11.5</b> Third example.</a></li>
<li class="chapter" data-level="11.6" data-path="trees-and-classification.html"><a href="trees-and-classification.html#references-6"><i class="fa fa-check"></i><b>11.6</b> References</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="random-forest.html"><a href="random-forest.html"><i class="fa fa-check"></i><b>12</b> Random Forest</a><ul>
<li class="chapter" data-level="12.1" data-path="random-forest.html"><a href="random-forest.html#how-does-it-work"><i class="fa fa-check"></i><b>12.1</b> How does it work?</a></li>
<li class="chapter" data-level="12.2" data-path="random-forest.html"><a href="random-forest.html#references-7"><i class="fa fa-check"></i><b>12.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="13" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>13</b> Support Vector Machine</a><ul>
<li class="chapter" data-level="13.1" data-path="svm.html"><a href="svm.html#support-vecotr-regression"><i class="fa fa-check"></i><b>13.1</b> Support Vecotr Regression</a><ul>
<li class="chapter" data-level="13.1.1" data-path="svm.html"><a href="svm.html#create-data"><i class="fa fa-check"></i><b>13.1.1</b> Create data</a></li>
<li class="chapter" data-level="13.1.2" data-path="svm.html"><a href="svm.html#tuning-a-svm-model"><i class="fa fa-check"></i><b>13.1.2</b> Tuning a SVM model</a></li>
<li class="chapter" data-level="13.1.3" data-path="svm.html"><a href="svm.html#discussion-on-parameters"><i class="fa fa-check"></i><b>13.1.3</b> Discussion on parameters</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="svm.html"><a href="svm.html#references-8"><i class="fa fa-check"></i><b>13.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="14" data-path="model-evaluation.html"><a href="model-evaluation.html"><i class="fa fa-check"></i><b>14</b> Model Evaluation</a><ul>
<li class="chapter" data-level="14.1" data-path="model-evaluation.html"><a href="model-evaluation.html#biais-variance-tradeoff"><i class="fa fa-check"></i><b>14.1</b> Biais variance tradeoff</a></li>
<li class="chapter" data-level="14.2" data-path="model-evaluation.html"><a href="model-evaluation.html#bagging"><i class="fa fa-check"></i><b>14.2</b> Bagging</a></li>
<li class="chapter" data-level="14.3" data-path="model-evaluation.html"><a href="model-evaluation.html#crossvalidation"><i class="fa fa-check"></i><b>14.3</b> Cross Validation</a></li>
</ul></li>
<li class="chapter" data-level="15" data-path="case-study-text-classification-spam-and-ham-.html"><a href="case-study-text-classification-spam-and-ham-.html"><i class="fa fa-check"></i><b>15</b> Case Study - Text classification: Spam and Ham.</a></li>
<li class="chapter" data-level="16" data-path="mushroom.html"><a href="mushroom.html"><i class="fa fa-check"></i><b>16</b> Case Study - Mushrooms Classification</a><ul>
<li class="chapter" data-level="16.1" data-path="mushroom.html"><a href="mushroom.html#import-the-data"><i class="fa fa-check"></i><b>16.1</b> Import the data</a></li>
<li class="chapter" data-level="16.2" data-path="mushroom.html"><a href="mushroom.html#tidy-the-data"><i class="fa fa-check"></i><b>16.2</b> Tidy the data</a></li>
<li class="chapter" data-level="16.3" data-path="mushroom.html"><a href="mushroom.html#understand-the-data-1"><i class="fa fa-check"></i><b>16.3</b> Understand the data</a><ul>
<li class="chapter" data-level="16.3.1" data-path="mushroom.html"><a href="mushroom.html#transform-the-data"><i class="fa fa-check"></i><b>16.3.1</b> Transform the data</a></li>
<li class="chapter" data-level="16.3.2" data-path="mushroom.html"><a href="mushroom.html#visualize-the-data"><i class="fa fa-check"></i><b>16.3.2</b> Visualize the data</a></li>
<li class="chapter" data-level="16.3.3" data-path="mushroom.html"><a href="mushroom.html#modeling"><i class="fa fa-check"></i><b>16.3.3</b> Modeling</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="mushroom.html"><a href="mushroom.html#communication"><i class="fa fa-check"></i><b>16.4</b> Communication</a></li>
</ul></li>
<li class="chapter" data-level="17" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html"><i class="fa fa-check"></i><b>17</b> Case study - The adults dataset.</a><ul>
<li class="chapter" data-level="17.1" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#introduction-2"><i class="fa fa-check"></i><b>17.1</b> Introduction</a></li>
<li class="chapter" data-level="17.2" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#import-the-data-1"><i class="fa fa-check"></i><b>17.2</b> Import the data</a></li>
<li class="chapter" data-level="17.3" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#tidy-the-data-1"><i class="fa fa-check"></i><b>17.3</b> Tidy the data</a></li>
</ul></li>
<li class="chapter" data-level="18" data-path="breastcancer.html"><a href="breastcancer.html"><i class="fa fa-check"></i><b>18</b> Case Study - Wisconsin Breast Cancer</a><ul>
<li class="chapter" data-level="18.1" data-path="breastcancer.html"><a href="breastcancer.html#import-the-data-2"><i class="fa fa-check"></i><b>18.1</b> Import the data</a></li>
<li class="chapter" data-level="18.2" data-path="breastcancer.html"><a href="breastcancer.html#tidy-the-data-2"><i class="fa fa-check"></i><b>18.2</b> Tidy the data</a></li>
<li class="chapter" data-level="18.3" data-path="breastcancer.html"><a href="breastcancer.html#understand-the-data-2"><i class="fa fa-check"></i><b>18.3</b> Understand the data</a><ul>
<li class="chapter" data-level="18.3.1" data-path="breastcancer.html"><a href="breastcancer.html#transform-the-data-1"><i class="fa fa-check"></i><b>18.3.1</b> Transform the data</a></li>
<li class="chapter" data-level="18.3.2" data-path="breastcancer.html"><a href="breastcancer.html#pre-process-the-data"><i class="fa fa-check"></i><b>18.3.2</b> Pre-process the data</a></li>
<li class="chapter" data-level="18.3.3" data-path="breastcancer.html"><a href="breastcancer.html#model-the-data-1"><i class="fa fa-check"></i><b>18.3.3</b> Model the data</a></li>
</ul></li>
<li class="chapter" data-level="18.4" data-path="breastcancer.html"><a href="breastcancer.html#references-9"><i class="fa fa-check"></i><b>18.4</b> References</a></li>
</ul></li>
<li class="chapter" data-level="19" data-path="final-words.html"><a href="final-words.html"><i class="fa fa-check"></i><b>19</b> Final Words</a></li>
<li class="chapter" data-level="" data-path="references-10.html"><a href="references-10.html"><i class="fa fa-check"></i>References</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning with R</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="mlr" class="section level1">
<h1><span class="header-section-number">Chapter 3</span> Single & Multiple Linear Regression</h1>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(skimr)
<span class="kw">library</span>(kableExtra) <span class="co"># for the kable_styling function</span>
<span class="kw">library</span>(tibble)
<span class="kw">library</span>(dplyr)
<span class="kw">library</span>(readr)
<span class="kw">library</span>(ggplot2)</code></pre></div>
<div id="single-variable-regression" class="section level2">
<h2><span class="header-section-number">3.1</span> Single variable regression</h2>
<p>The general equation for a linear regression model is</p>
<blockquote>
<p><span class="math inline">\(y^i = \beta_{0} + \beta_{1} x^i + \epsilon^i\)</span></p>
</blockquote>
<p>where:</p>
<ul>
<li><span class="math inline">\(y^i\)</span> is the <span class="math inline">\(i^{th}\)</span> observation of the dependent variable</li>
<li><span class="math inline">\(\beta_{0}\)</span> is the intercept coefficient</li>
<li><span class="math inline">\(\beta_{1}\)</span> is the regression coefficient for the dependent variable</li>
<li><span class="math inline">\(x^i\)</span> is the <span class="math inline">\(i^{th}\)</span> observation of the independent variable</li>
<li><span class="math inline">\(\epsilon^i\)</span> is the error term for the <span class="math inline">\(i^{th}\)</span> observation. It basically is the difference in therm of y between the observed value and the estimated value. It is also called the residuals. A good model minimize these errors.<a href="#fn1" class="footnoteRef" id="fnref1"><sup>1</sup></a></li>
</ul>
<p>Some ways to assess how good our model is:</p>
<ol style="list-style-type: decimal">
<li>compute the SSE (the sum of squared errors)
<ul>
<li>SSE = <span class="math inline">\((\epsilon^1)^2 + (\epsilon^2)^2 + \ldots + (\epsilon^N)^2 = \sum_{i=1}^N (\epsilon^i)^2\)</span></li>
<li>A good model will minimize SSE</li>
<li>problem: SSE depends on N; SSE will naturally increase as N increases</li>
</ul></li>
<li>compute the RMSE (the root mean squared error)
<ul>
<li>RMSE = <span class="math inline">\(\sqrt {\frac {SSE} {N}}\)</span></li>
<li>A good model will also minimize the RMSE</li>
<li>It depends on the unit of the dependent variable. It is like the average error the model is making (in terms of the unit of the dependent variable)</li>
</ul></li>
<li>compute <span class="math inline">\(R^2\)</span>
<ul>
<li>It compares the model to a baseline model</li>
<li><span class="math inline">\(R^2\)</span> is <strong>unitless</strong> and <strong>universally</strong> interpretable</li>
<li>SST is the sum of the squared differences between each observed value and the mean of all the observed values</li>
<li><span class="math inline">\(R^2 = 1 - \frac {SSE} {SST}\)</span></li>
</ul></li>
</ol>
<p>We usually use r-squared to check the performance of a regression.</p>
<p>The conditions and assumptions to have a valid linear model are the same as for the t-test; a quick visual check of most of them is sketched right after this list.</p>
<ul>
<li>linear relationship between dependent and independent variables (scatterplot of dependent vs independent variables, plus scatterplot of residuals vs fitted values). Also check here for outliers: the regression line and the regression coefficients are affected by outliers, so check whether it would make sense to remove them.<br />
</li>
<li>Multivariate normality. Multiple regression assumes that the residuals are normally distributed. Visual check on the Q-Q plot.<br />
</li>
<li>No Multicollinearity. Multiple regression assumes that the independent variables are not highly correlated with each other. Check correlation matrix and correlation plot. This assumption can also be tested using Variance Inflation Factor (VIF) values.</li>
<li>Homoscedasticity. This assumption states that the variance of the error terms is similar across the values of the independent variables. A plot of standardized residuals versus predicted values can show whether points are equally distributed across all values of the independent variables.</li>
</ul>
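<p>A quick way to eyeball most of these assumptions at once is base R’s built-in diagnostics for fitted models; a minimal sketch, assuming <code>model</code> holds any object returned by <code>lm()</code>:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Draw the four standard diagnostic plots: residuals vs fitted,
# normal Q-Q, scale-location, and residuals vs leverage.
par(mfrow = c(2, 2))  # arrange the four plots in a 2x2 grid
plot(model)
par(mfrow = c(1, 1))  # reset the plotting grid</code></pre></div>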
<p>In our first linear regression, we’ll use the <strong>Wine</strong> dataset. Let’s load it and then have a quick look at its structure. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df =<span class="st"> </span><span class="kw">read_csv</span>(<span class="st">"dataset/Wine.csv"</span>)
<span class="kw">skim</span>(df)</code></pre></div>
<pre><code>## Skim summary statistics
## n obs: 25
## n variables: 7
##
## ── Variable type:numeric ───────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## Age 0 25 25 17.2 7.69 5 11
## AGST 0 25 25 16.51 0.68 14.98 16.2
## FrancePop 0 25 25 49694.44 3665.27 43183.57 46584
## HarvestRain 0 25 25 148.56 74.42 38 89
## Price 0 25 25 7.07 0.65 6.2 6.52
## WinterRain 0 25 25 605.28 132.28 376 536
## Year 0 25 25 1965.8 7.69 1952 1960
## p50 p75 p100 hist
## 17 23 31 ▇▆▆▇▆▆▃▆
## 16.53 17.07 17.65 ▂▃▃▇▆▆▆▅
## 50254.97 52894.18 54602.19 ▃▂▃▂▃▃▃▇
## 130 187 292 ▅▇▇▅▆▁▃▅
## 7.12 7.5 8.49 ▇▃▃▇▃▂▂▁
## 600 697 830 ▅▁▂▇▃▃▂▃
## 1966 1972 1978 ▆▃▆▇▆▆▆▇</code></pre>
<p>We use the <code>lm</code> function to create our linear regression model, with <em>AGST</em> as the independent variable and <em>Price</em> as the dependent variable. First, a scatterplot of the two: <img src="machinelearningwithR_files/figure-html/linreg02-plot-1.png" width="672" /></p>
<p>We can see a moderate positive correlation between <code>AGST</code> and <code>Price</code>. The model confirms this.<br />
</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model_lm_df =<span class="st"> </span><span class="kw">lm</span>(Price <span class="op">~</span><span class="st"> </span>AGST, <span class="dt">data =</span> df)
<span class="kw">summary</span>(model_lm_df)</code></pre></div>
<pre><code>##
## Call:
## lm(formula = Price ~ AGST, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78450 -0.23882 -0.03727 0.38992 0.90318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.4178 2.4935 -1.371 0.183710
## AGST 0.6351 0.1509 4.208 0.000335 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4993 on 23 degrees of freedom
## Multiple R-squared: 0.435, Adjusted R-squared: 0.4105
## F-statistic: 17.71 on 1 and 23 DF, p-value: 0.000335</code></pre>
<p>The <code>summary</code> function applied to the model gives us important information. See below for a detailed explanation of it.</p>
<ul>
<li>the stars next to each predictor variable indicate how significant the variable is for our regression model</li>
<li>it also gives us the value of the R^2 coefficient</li>
</ul>
<p>We could have calculated the R^2 value in this way: </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">SSE =<span class="st"> </span><span class="kw">sum</span>(model_lm_df<span class="op">$</span>residuals<span class="op">^</span><span class="dv">2</span>)
SST =<span class="st"> </span><span class="kw">sum</span>((df<span class="op">$</span>Price <span class="op">-</span><span class="st"> </span><span class="kw">mean</span>(df<span class="op">$</span>Price))<span class="op">^</span><span class="dv">2</span>)
r_squared =<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>SSE<span class="op">/</span>SST
r_squared</code></pre></div>
<pre><code>## [1] 0.4350232</code></pre>
<p>The low R^2 indicates that our model does not explain much of the variance of the data.</p>
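<p>In the same spirit, the RMSE defined above can be computed from the SSE; a minimal sketch, reusing the SSE value just calculated:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># RMSE = sqrt(SSE / N), where N is the number of observations
RMSE = sqrt(SSE / nrow(df))
RMSE</code></pre></div>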
<p>We can now plot the observations and the regression line, and see how the linear model fits the data. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(df, <span class="kw">aes</span>(AGST, Price)) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">geom_point</span>(<span class="dt">shape =</span> <span class="dv">1</span>, <span class="dt">col =</span> <span class="st">"blue"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">geom_smooth</span>(<span class="dt">method =</span> <span class="st">"lm"</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg04-graph-1.png" width="672" /> By default, the <code>geom_smooth()</code> will use a 95% confidence interval (which is the grey-er area on the graph). There are 95% chance the line of regression will be within that zone for the whole population.</p>
<p>It is always nice to see how our residuals are distributed.<br />
We use the <code>ggplot2</code> library and the <code>fortify</code> function, which transforms the model object into a data frame usable for plotting. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model1 <-<span class="st"> </span><span class="kw">fortify</span>(model_lm_df)
p <-<span class="st"> </span><span class="kw">ggplot</span>(model1, <span class="kw">aes</span>(.fitted, .resid)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_point</span>()
p <-<span class="st"> </span>p <span class="op">+</span><span class="st"> </span><span class="kw">geom_hline</span>(<span class="dt">yintercept =</span> <span class="dv">0</span>, <span class="dt">col =</span> <span class="st">"red"</span>, <span class="dt">linetype =</span> <span class="st">"dashed"</span>)
p <-<span class="st"> </span>p <span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Fitted values"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Residuals"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Plot of the residuals in function of the fitted values"</span>)
p</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg05_residuals-1.png" width="672" /></p>
<p>Residuals look normal: randomly scattered around the zero line.</p>
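<p>As a numeric complement to this visual check, we could run a normality test on the residuals; a minimal sketch using base R’s Shapiro-Wilk test:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Shapiro-Wilk test on the residuals of our model; a large p-value
# is consistent with normally distributed residuals.
shapiro.test(residuals(model_lm_df))</code></pre></div>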
</div>
<div id="multi-variables-regression" class="section level2">
<h2><span class="header-section-number">3.2</span> Multi-variables regression</h2>
<p>Instead of considering just one variable as a predictor, we’ll add a few more variables to our model with the aim of increasing its predictive ability. In our case, we are expecting an increased r-squared value.</p>
<p>We have to be cautious in adding more variables. Too many variables might give a high <span class="math inline">\(R^2\)</span> on our training data, but that might not carry over to our testing data. This is over-fitting, and we will need to avoid it at all costs. We’ll look at several ways to guard against overfitting.</p>
<p>The general equations can be expressed as</p>
<blockquote>
<p><span class="math inline">\(y^i = \beta_{0} + \beta_{1} x_{1}^i + \beta_{2} x_{2}^i + \ldots + \beta_{k} x_{k}^i + \epsilon^i\)</span></p>
</blockquote>
<p>where there are k predictor variables.</p>
<p>There is a bit of trial and error involved in fitting multiple variables into a model, but a rule of thumb is to include most of the variables (all those that would make sense) and then take out the ones that are not very significant, using <code>summary(modelx)</code>. This pruning can also be automated, as the sketch below illustrates.</p>
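<p>A minimal sketch of automated backward elimination, using base R’s <code>step()</code> function on the wine data (the Boston example at the end of this chapter applies backward elimination as well; <em>Age</em> is left out here because it is perfectly correlated with <em>Year</em>, as shown below):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Start from a model with all the candidate predictors and let step()
# drop terms one at a time as long as the AIC improves.
full_model <- lm(Price ~ Year + WinterRain + AGST + HarvestRain + FrancePop, data = df)
reduced_model <- step(full_model, direction = "backward")
summary(reduced_model)</code></pre></div>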
<p>We are introducing 3 new libraries here besides the usual tidyverse.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(corrr)
<span class="kw">library</span>(corrplot)
<span class="kw">library</span>(leaps)</code></pre></div>
<div id="predicting-wine-price-again" class="section level3">
<h3><span class="header-section-number">3.2.1</span> Predicting wine price (again!)</h3>
<p>We continue here with the same dataset, <em>wine.csv</em>.<br />
First, we can see how each variable is correlated with each of the others.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(corrr)
d <-<span class="st"> </span><span class="kw">correlate</span>(df)</code></pre></div>
<pre><code>##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">d <span class="op">%>%</span><span class="st"> </span><span class="kw">shave</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">fashion</span>()</code></pre></div>
<pre><code>## rowname Year Price WinterRain AGST HarvestRain Age FrancePop
## 1 Year
## 2 Price -.45
## 3 WinterRain .02 .14
## 4 AGST -.25 .66 -.32
## 5 HarvestRain .03 -.56 -.28 -.06
## 6 Age -1.00 .45 -.02 .25 -.03
## 7 FrancePop .99 -.47 -.00 -.26 .04 -.99</code></pre>
<p>By default, R uses the Pearson coefficient of correlation.</p>
<p>Multiple linear regression doesn’t handle multicollinearity well. In that case, we should remove variables that are too highly correlated. <em>Age</em> and <em>Year</em> are perfectly correlated, so one of them should be removed.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">corrplot<span class="op">::</span><span class="kw">corrplot</span>(<span class="kw">cor</span>(df), <span class="dt">type =</span> <span class="st">"lower"</span>, <span class="dt">order =</span> <span class="st">"hclust"</span>, <span class="dt">tl.col =</span> <span class="st">"black"</span>, <span class="dt">sig.level =</span> <span class="fl">0.01</span>)</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/mlr-corrplot01-1.png" width="672" /></p>
<p>So let’s start by using all variables.<br />
</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model2_lm_df <-<span class="st"> </span><span class="kw">lm</span>(Price <span class="op">~</span><span class="st"> </span>Year <span class="op">+</span><span class="st"> </span>WinterRain <span class="op">+</span><span class="st"> </span>AGST <span class="op">+</span><span class="st"> </span>HarvestRain <span class="op">+</span><span class="st"> </span>Age <span class="op">+</span><span class="st"> </span>FrancePop, <span class="dt">data =</span> df)
<span class="kw">summary</span>(model2_lm_df)</code></pre></div>
<pre><code>##
## Call:
## lm(formula = Price ~ Year + WinterRain + AGST + HarvestRain +
## Age + FrancePop, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48179 -0.24662 -0.00726 0.22012 0.51987
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.092e-01 1.467e+02 0.005 0.996194
## Year -5.847e-04 7.900e-02 -0.007 0.994172
## WinterRain 1.043e-03 5.310e-04 1.963 0.064416 .
## AGST 6.012e-01 1.030e-01 5.836 1.27e-05 ***
## HarvestRain -3.958e-03 8.751e-04 -4.523 0.000233 ***
## Age NA NA NA NA
## FrancePop -4.953e-05 1.667e-04 -0.297 0.769578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3019 on 19 degrees of freedom
## Multiple R-squared: 0.8294, Adjusted R-squared: 0.7845
## F-statistic: 18.47 on 5 and 19 DF, p-value: 1.044e-06</code></pre>
<p>While doing so, we notice that the variable <em>Age</em> has an NA coefficient. This is because it is so highly correlated with the variables <em>Year</em> and <em>FrancePop</em>, as we saw in our correlation plot. Also, the variable <em>FrancePop</em> isn’t very predictive of the price of wine. So we can refine our model by taking out these 2 variables, and as we’ll see, it won’t affect our <span class="math inline">\(R^2\)</span> value much. Note that with multiple-variable regression, it is important to look at the <strong>Adjusted R-squared</strong>, as it takes into consideration the number of variables in the model.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model3_lm_df <-<span class="st"> </span><span class="kw">lm</span>(Price <span class="op">~</span><span class="st"> </span>Year <span class="op">+</span><span class="st"> </span>WinterRain <span class="op">+</span><span class="st"> </span>AGST <span class="op">+</span><span class="st"> </span>HarvestRain, <span class="dt">data =</span> df)
<span class="kw">summary</span>(model3_lm_df)</code></pre></div>
<pre><code>##
## Call:
## lm(formula = Price ~ Year + WinterRain + AGST + HarvestRain,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45470 -0.24273 0.00752 0.19773 0.53637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.0248601 16.4434570 2.677 0.014477 *
## Year -0.0239308 0.0080969 -2.956 0.007819 **
## WinterRain 0.0010755 0.0005073 2.120 0.046694 *
## AGST 0.6072093 0.0987022 6.152 5.2e-06 ***
## HarvestRain -0.0039715 0.0008538 -4.652 0.000154 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.295 on 20 degrees of freedom
## Multiple R-squared: 0.8286, Adjusted R-squared: 0.7943
## F-statistic: 24.17 on 4 and 20 DF, p-value: 2.036e-07</code></pre>
<p>We now have a much better r-squared than with only one predictor variable. Also, by choosing better predictor variables, we managed to increase our <em>adjusted r-squared</em>.</p>
<p>Although it is no longer feasible to graph <em>Price</em> as a function of the other variables in 2D, we can still graph our residuals in 2D. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model3 <-<span class="st"> </span><span class="kw">fortify</span>(model3_lm_df)
p <-<span class="st"> </span><span class="kw">ggplot</span>(model3, <span class="kw">aes</span>(.fitted, .resid)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">geom_hline</span>(<span class="dt">yintercept =</span> <span class="dv">0</span>, <span class="dt">col =</span> <span class="st">"red"</span>, <span class="dt">linetype =</span> <span class="st">"dashed"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Fitted values"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Residuals"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Plot of the residuals in function of the fitted values (multiple variables)"</span>)
p</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg07-1.png" width="672" /></p>
<p>The plot of residuals looks pretty normal, with points randomly scattered around the 0 line.</p>
</div>
</div>
<div id="model-diagnostic-and-evaluation" class="section level2">
<h2><span class="header-section-number">3.3</span> Model diagnostic and evaluation</h2>
<p>Let’s first go over the explanation of the <code>summary</code> function applied to a regression model.</p>
<p><strong>Call:</strong> The formula we have used for our model.</p>
<p><strong>Coefficient – Estimate</strong> The coefficient estimate is the value of the coefficient to be used in the equation. The coefficient of each independent variable has a meaning; for example, 0.0010755 for ‘WinterRain’ means that for every 1-unit change in ‘WinterRain’, the value of ‘Price’ increases by 0.0010755, holding the other variables constant.</p>
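<p>To see these coefficients in action, we can feed the fitted model a new observation; a minimal sketch, assuming <code>model3_lm_df</code> from the previous section is still in memory (the values below are made up):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Predict the price of one hypothetical vintage from the fitted coefficients.
new_wine <- tibble(Year = 1970, WinterRain = 600, AGST = 17, HarvestRain = 150)
predict(model3_lm_df, newdata = new_wine)</code></pre></div>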
<p><strong>Coefficient – Standard Error</strong> The coefficient standard error measures how much the coefficient estimate would vary from sample to sample. We want it to be small relative to the estimate for the variable to be a reliable predictor.</p>
<p><strong>Coefficient – t value:</strong> The coefficient t-value measures how many standard errors our coefficient estimate is away from 0. We want this value to be large (in absolute value) so that we can reject the null hypothesis (H0), which is ‘there is no relationship between the dependent and independent variables’.</p>
<p><strong>Coefficient – Pr(>|t|):</strong> The Pr(>|t|) is computed from the t-value. It is used for rejecting the null hypothesis (H0) stated above. Normally, a value less than 0.05 (5%) is considered the cut-off point for rejecting H0.</p>
<p><strong>Residuals:</strong> Residuals are the next component in the model summary. Residuals are the difference between the actual values in the dataset and the values predicted by the model. For the model to be good, the residuals should be normally distributed.</p>
<p><strong>Adjusted R-squared:</strong><br />
Adjusted R-squared is the measure to consider for evaluating model accuracy when the number of independent variables is greater than 1. Adjusted R-squared adjusts for the number of variables in the model and is the preferred measure of model goodness of fit.</p>
<p><strong>F-Statistic:</strong> The F-statistic is used for finding out whether there exists any relationship between our independent (predictor) variables and the dependent (response) variable. Normally, a value of the F-statistic greater than one can be used for rejecting the null hypothesis (H0: there is no relationship between the dependent variable and the independent variables). For our model, the F-statistic of 24.17 is well above one. The p-value of the F-statistic is evaluated the same way we use the Pr(>|t|) value in the coefficients output: as the p-value < 0.05, we can reject the null hypothesis (H0).</p>
<p><strong>R-squared and p-value: there is no established relationship between the two.</strong> R-squared tells how much variation in the response variable is explained by the predictor variables, while the p-value tells whether the predictors used in the model are able to explain the response variable or not. If the p-value < 0.05 (for 95% confidence), then the model is considered to be significant. Four scenarios are possible:</p>
<ol style="list-style-type: decimal">
<li><strong>low R-square</strong> and <strong>low p-value</strong> (p-value <= 0.05): the model doesn’t explain much of the variation in the response variable, but it is still significant as per the p-value, which is better than having no model to explain the response variable at all.</li>
<li><strong>low R-square</strong> and <strong>high p-value</strong> (p-value > 0.05): the model doesn’t explain much variation in the data and is not significant. We should discard such a model; this is the worst scenario.</li>
<li><strong>high R-square</strong> and <strong>low p-value</strong>: the model explains a lot of variation in the data and is also significant. This is the best of the four scenarios, and the model is considered good in this case.</li>
<li><strong>high R-square</strong> and <strong>high p-value</strong>: the model explains a lot of the variance in the data but is not significant. We should not use such a model for predictions.</li>
</ol>
<p>Here are the necessary conditions for a linear regression model to be valid; hence, these are the assumptions made when doing a linear regression.</p>
<ul>
<li><strong>Linear Relationship</strong>.<br />
The plot of the residuals should show the data points randomly scattered around the 0 line.<br />
This plot shows if residuals have non-linear patterns. There could be a non-linear relationship between predictor variables and an outcome variable and the pattern could show up in this plot if the model doesn’t capture the non-linear relationship. If you find equally spread residuals around a horizontal line without distinct patterns, that is a good indication you don’t have non-linear relationships.</li>
</ul>
<div class="figure">
<img src="otherpics/GoodVsBadResidualsPlot.png" alt="Good Vs Bad residuals plot" />
<p class="caption">Good Vs Bad residuals plot</p>
</div>
<p>There isn’t any distinctive pattern in Case 1, but there is a parabola in Case 2, where the non-linear relationship was not explained by the model and was left out in the residuals.</p>
<ul>
<li><p><strong>Multivariate normality</strong>. The multiple linear regression analysis requires that the errors between observed and predicted values (i.e., the residuals of the regression) should be normally distributed. This assumption may be checked by looking at a histogram or a Q-Q-Plot. Normality can also be checked with a goodness of fit test (e.g., the Kolmogorov-Smirnov test), though this test must be conducted on the residuals themselves.<br />
<img src="otherpics/GoodVsBadQQPlot.png" alt="Good Vs Bad residuals Q-Q plot" /></p></li>
<li><strong>No Multicollinearity</strong>. Multicollinearity may be tested with these central criteria:</li>
</ul>
<ol style="list-style-type: decimal">
<li>Correlation matrix. When computing the matrix of Pearson’s bivariate correlations among all independent variables, the correlation coefficients need to be clearly smaller than 1 (in absolute value).</li>
<li>Variance Inflation Factor (VIF) – the variance inflation factor of the linear regression is defined as VIF = 1/T, where tolerance (T) is defined as T = 1 – R². With VIF > 10 there is an indication that multicollinearity may be present; with VIF > 100 there is certainly multicollinearity among the variables. If multicollinearity is found in the data, centering the data (that is, deducting the mean of the variable from each score) might help to solve the problem. However, the simplest way to address the problem is to remove independent variables with high VIF values (see the sketch right after this list).</li>
</ol>
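<p>A minimal sketch of a VIF check, assuming the <code>car</code> package is installed and that <code>model3_lm_df</code> from the wine example is still in memory:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Variance Inflation Factor of each predictor in the wine model;
# by the rule of thumb above, VIF > 10 hints at multicollinearity.
library(car)
vif(model3_lm_df)</code></pre></div>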
<ul>
<li><strong>Homoscedasticity</strong>. A scatterplot of residuals versus predicted values is a good way to check for homoscedasticity. There should be no clear pattern in the distribution; if there is a cone-shaped pattern (as shown below), the data are heteroscedastic. If the data are heteroscedastic, a non-linear data transformation or the addition of a quadratic term might fix the problem.</li>
</ul>
<p>This plot shows whether residuals are spread equally along the ranges of predictors. This is how you can check the assumption of equal variance (homoscedasticity). It’s good if you see a horizontal line with equally (randomly) spread points.<br />
<img src="otherpics/GoodVsBadScalePlot.png" alt="Good vs bad scale-location plot" /></p>
<p>In Case 2, the residuals begin to spread wider along the x-axis. Because the residuals spread wider and wider, the red smooth line is not horizontal and shows a steep angle in Case 2.</p>
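<p>As a formal complement to the visual check, here is a minimal sketch using the Breusch-Pagan test, assuming the <code>lmtest</code> package is installed:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Breusch-Pagan test on the wine model; a small p-value suggests
# heteroscedastic residuals.
library(lmtest)
bptest(model3_lm_df)</code></pre></div>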
</div>
<div id="final-example---boston-dataset---with-backward-elimination" class="section level2">
<h2><span class="header-section-number">3.4</span> Final example - Boston dataset - with backward elimination</h2>
<p>In this last example, we’ll use a more systematic way to find out which variables should be included in our model.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df <-<span class="st"> </span><span class="kw">read_csv</span>(<span class="st">"dataset/Boston.csv"</span>)
skimr<span class="op">::</span><span class="kw">skim</span>(df)</code></pre></div>
<pre><code>## Skim summary statistics
## n obs: 333
## n variables: 14
##
## ── Variable type:numeric ───────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## AGE 0 333 333 68.23 28.13 6 45.4 76.7
## BLACK 0 333 333 359.47 86.58 3.5 376.73 392.05
## CHAS 0 333 333 0.06 0.24 0 0 0
## CRIM 0 333 333 3.36 7.35 0.0063 0.079 0.26
## DIS 0 333 333 3.71 1.98 1.13 2.12 3.09
## INDUS 0 333 333 11.29 7 0.74 5.13 9.9
## LSTAT 0 333 333 12.52 7.07 1.73 7.18 10.97
## MEDV 0 333 333 22.77 9.17 5 17.4 21.6
## NOX 0 333 333 0.56 0.11 0.39 0.45 0.54
## PTRATIO 0 333 333 18.45 2.15 12.6 17.4 19
## RAD 0 333 333 9.63 8.74 1 4 5
## RM 0 333 333 6.27 0.7 3.56 5.88 6.2
## TAX 0 333 333 409.28 170.84 188 279 330
## ZN 0 333 333 10.69 22.67 0 0 0
## p75 p100 hist
## 93.8 100 ▁▂▂▂▂▂▃▇
## 396.24 396.9 ▁▁▁▁▁▁▁▇
## 0 1 ▇▁▁▁▁▁▁▁
## 3.68 73.53 ▇▁▁▁▁▁▁▁
## 5.12 10.71 ▇▆▅▃▂▁▁▁
## 18.1 27.74 ▅▅▃▁▁▇▁▁
## 16.42 37.97 ▅▇▆▃▂▁▁▁
## 25 50 ▂▅▇▇▂▂▁▁
## 0.63 0.87 ▇▇▇▆▃▅▁▁
## 20.2 21.2 ▁▂▁▂▃▃▂▇
## 24 24 ▃▇▂▁▁▁▁▅
## 6.59 8.72 ▁▁▂▇▇▂▁▁
## 666 711 ▅▇▂▅▁▁▁▇
## 12.5 100 ▇▁▁▁▁▁▁▁</code></pre>
<p>Here is the list of variables with their meaning.</p>
<ul>
<li>CRIM per capita crime rate by town</li>
<li>ZN proportion of residential land zoned for lots over 25,000 sq.ft.</li>
<li>INDUS proportion of non-retail business acres per town</li>
<li>CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)</li>
<li>NOX nitric oxides concentration (parts per 10 million)</li>
<li>RM average number of rooms per dwelling</li>
<li>AGE proportion of owner-occupied units built prior to 1940</li>
<li>DIS weighted distances to five Boston employment centres</li>
<li>RAD index of accessibility to radial highways</li>
<li>TAX full-value property-tax rate per $10,000</li>
<li>PTRATIO pupil-teacher ratio by town</li>
<li>BLACK 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town</li>
<li>LSTAT % lower status of the population</li>
<li>MEDV Median value of owner-occupied homes in $1000’s</li>
</ul>
<p>Let’s make the necessary adjustments in variable types.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df<span class="op">$</span>CHAS <-<span class="st"> </span><span class="kw">factor</span>(df<span class="op">$</span>CHAS)</code></pre></div>
<p>A quick check on how correlated our variables are.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">corrplot</span>(<span class="kw">cor</span>(df <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>CHAS)), <span class="dt">type =</span> <span class="st">"lower"</span>, <span class="dt">order =</span> <span class="st">"hclust"</span>, <span class="dt">tl.col =</span> <span class="st">"black"</span>, <span class="dt">sig.level =</span> <span class="fl">0.01</span>)</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg09-1.png" width="672" /></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">correlate</span>(df <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>CHAS)) <span class="op">%>%</span><span class="st"> </span><span class="kw">shave</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">fashion</span>()</code></pre></div>
<pre><code>##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'</code></pre>
<pre><code>## rowname CRIM ZN INDUS NOX RM AGE DIS RAD TAX PTRATIO BLACK
## 1 CRIM
## 2 ZN -.21
## 3 INDUS .42 -.52
## 4 NOX .46 -.50 .75
## 5 RM -.31 .33 -.44 -.34
## 6 AGE .38 -.54 .64 .74 -.25
## 7 DIS -.40 .64 -.70 -.77 .27 -.76
## 8 RAD .67 -.30 .57 .61 -.27 .45 -.48
## 9 TAX .62 -.31 .71 .67 -.36 .51 -.53 .90
## 10 PTRATIO .31 -.38 .39 .19 -.37 .26 -.23 .47 .47
## 11 BLACK -.48 .17 -.34 -.37 .16 -.27 .28 -.41 -.41 -.16
## 12 LSTAT .53 -.39 .61 .60 -.62 .59 -.51 .48 .54 .37 -.36
## 13 MEDV -.41 .34 -.47 -.41 .69 -.36 .25 -.35 -.45 -.48 .34
## LSTAT MEDV
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13 -.74</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">yo <-<span class="st"> </span><span class="kw">correlate</span>(df <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>CHAS)) <span class="op">%>%</span><span class="st"> </span><span class="kw">shave</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">fashion</span>()</code></pre></div>
<pre><code>##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">kable</span>(yo, <span class="dt">format =</span> <span class="st">"html"</span>) <span class="op">%>%</span><span class="st"> </span><span class="kw">kable_styling</span>()</code></pre></div>
<table class="table" style="margin-left: auto; margin-right: auto;">
<thead>
<tr>
<th style="text-align:left;">
rowname
</th>
<th style="text-align:left;">
CRIM
</th>
<th style="text-align:left;">
ZN
</th>
<th style="text-align:left;">
INDUS
</th>
<th style="text-align:left;">
NOX
</th>
<th style="text-align:left;">
RM
</th>
<th style="text-align:left;">
AGE
</th>
<th style="text-align:left;">
DIS
</th>
<th style="text-align:left;">
RAD
</th>
<th style="text-align:left;">
TAX
</th>
<th style="text-align:left;">
PTRATIO
</th>
<th style="text-align:left;">
BLACK
</th>
<th style="text-align:left;">
LSTAT
</th>
<th style="text-align:left;">
MEDV
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
CRIM
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
ZN
</td>
<td style="text-align:left;">
-.21
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
INDUS
</td>
<td style="text-align:left;">
.42
</td>
<td style="text-align:left;">
-.52
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
NOX
</td>
<td style="text-align:left;">
.46
</td>
<td style="text-align:left;">
-.50
</td>
<td style="text-align:left;">
.75
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
RM
</td>
<td style="text-align:left;">
-.31
</td>
<td style="text-align:left;">
.33
</td>
<td style="text-align:left;">
-.44
</td>
<td style="text-align:left;">
-.34
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
AGE
</td>
<td style="text-align:left;">
.38
</td>
<td style="text-align:left;">
-.54
</td>
<td style="text-align:left;">
.64
</td>
<td style="text-align:left;">
.74
</td>
<td style="text-align:left;">
-.25
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
DIS
</td>
<td style="text-align:left;">
-.40
</td>
<td style="text-align:left;">
.64
</td>
<td style="text-align:left;">
-.70
</td>
<td style="text-align:left;">
-.77
</td>
<td style="text-align:left;">
.27
</td>
<td style="text-align:left;">
-.76
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
RAD
</td>
<td style="text-align:left;">
.67
</td>
<td style="text-align:left;">
-.30
</td>
<td style="text-align:left;">
.57
</td>
<td style="text-align:left;">
.61
</td>
<td style="text-align:left;">
-.27
</td>
<td style="text-align:left;">
.45
</td>
<td style="text-align:left;">
-.48
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
TAX
</td>
<td style="text-align:left;">
.62
</td>
<td style="text-align:left;">
-.31
</td>
<td style="text-align:left;">
.71
</td>
<td style="text-align:left;">
.67
</td>
<td style="text-align:left;">
-.36
</td>
<td style="text-align:left;">
.51
</td>
<td style="text-align:left;">
-.53
</td>
<td style="text-align:left;">
.90
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
PTRATIO
</td>
<td style="text-align:left;">
.31
</td>
<td style="text-align:left;">
-.38
</td>
<td style="text-align:left;">
.39
</td>
<td style="text-align:left;">
.19
</td>
<td style="text-align:left;">
-.37