<!DOCTYPE html>
<html >
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Chapter 3 Single & Multiple Linear Regression | Machine Learning with R</title>
<meta name="description" content="This book is about using R for machine learning purposes.">
<meta name="generator" content="bookdown and GitBook 2.6.7">
<meta property="og:title" content="Chapter 3 Single & Multiple Linear Regression | Machine Learning with R" />
<meta property="og:type" content="book" />
<meta property="og:description" content="This book is about using R for machine learning purposes." />
<meta name="github-repo" content="fderyckel/machinelearningwithr" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 3 Single & Multiple Linear Regression | Machine Learning with R" />
<meta name="twitter:description" content="This book is about using R for machine learning purposes." />
<meta name="author" content="François de Ryckel">
<meta name="date" content="2019-02-23">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="prev" href="testinference.html">
<link rel="next" href="logistic.html">
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<script src="libs/kePrint-0.0.1/kePrint.js"></script>
<style type="text/css">
div.sourceCode { overflow-x: auto; }
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
code > span.dt { color: #902000; } /* DataType */
code > span.dv { color: #40a070; } /* DecVal */
code > span.bn { color: #40a070; } /* BaseN */
code > span.fl { color: #40a070; } /* Float */
code > span.ch { color: #4070a0; } /* Char */
code > span.st { color: #4070a0; } /* String */
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
code > span.ot { color: #007020; } /* Other */
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
code > span.fu { color: #06287e; } /* Function */
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code > span.cn { color: #880000; } /* Constant */
code > span.sc { color: #4070a0; } /* SpecialChar */
code > span.vs { color: #4070a0; } /* VerbatimString */
code > span.ss { color: #bb6688; } /* SpecialString */
code > span.im { } /* Import */
code > span.va { color: #19177c; } /* Variable */
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code > span.op { color: #666666; } /* Operator */
code > span.bu { } /* BuiltIn */
code > span.ex { } /* Extension */
code > span.pp { color: #bc7a00; } /* Preprocessor */
code > span.at { color: #7d9029; } /* Attribute */
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>
<link rel="stylesheet" href="style.css" type="text/css" />
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><strong><a href="./">Machine Learning with R</a></strong></li>
<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Prerequisites</a><ul>
<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#pre-requisite-and-conventions"><i class="fa fa-check"></i><b>1.1</b> Pre-requisite and conventions</a></li>
<li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#organization"><i class="fa fa-check"></i><b>1.2</b> Organization</a></li>
<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#packages"><i class="fa fa-check"></i><b>1.3</b> Packages</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="testinference.html"><a href="testinference.html"><i class="fa fa-check"></i><b>2</b> Tests and inferences</a><ul>
<li class="chapter" data-level="2.1" data-path="testinference.html"><a href="testinference.html#normality"><i class="fa fa-check"></i><b>2.1</b> Assumption of normality</a><ul>
<li class="chapter" data-level="2.1.1" data-path="testinference.html"><a href="testinference.html#visual-check-of-normality"><i class="fa fa-check"></i><b>2.1.1</b> Visual check of normality</a></li>
<li class="chapter" data-level="2.1.2" data-path="testinference.html"><a href="testinference.html#normality-tests"><i class="fa fa-check"></i><b>2.1.2</b> Normality tests</a></li>
</ul></li>
<li class="chapter" data-level="2.2" data-path="testinference.html"><a href="testinference.html#ttest"><i class="fa fa-check"></i><b>2.2</b> T-tests</a></li>
<li class="chapter" data-level="2.3" data-path="testinference.html"><a href="testinference.html#anova---analyse-of-variance."><i class="fa fa-check"></i><b>2.3</b> ANOVA - Analyse of variance.</a></li>
<li class="chapter" data-level="2.4" data-path="testinference.html"><a href="testinference.html#covariance"><i class="fa fa-check"></i><b>2.4</b> Covariance</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="mlr.html"><a href="mlr.html"><i class="fa fa-check"></i><b>3</b> Single & Multiple Linear Regression</a><ul>
<li class="chapter" data-level="3.1" data-path="mlr.html"><a href="mlr.html#single-variable-regression"><i class="fa fa-check"></i><b>3.1</b> Single variable regression</a></li>
<li class="chapter" data-level="3.2" data-path="mlr.html"><a href="mlr.html#multi-variables-regression"><i class="fa fa-check"></i><b>3.2</b> Multi-variables regression</a><ul>
<li class="chapter" data-level="3.2.1" data-path="mlr.html"><a href="mlr.html#predicting-wine-price-again"><i class="fa fa-check"></i><b>3.2.1</b> Predicting wine price (again!)</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="mlr.html"><a href="mlr.html#model-diagnostic-and-evaluation"><i class="fa fa-check"></i><b>3.3</b> Model diagnostic and evaluation</a></li>
<li class="chapter" data-level="3.4" data-path="mlr.html"><a href="mlr.html#final-example---boston-dataset---with-backward-elimination"><i class="fa fa-check"></i><b>3.4</b> Final example - Boston dataset - with backward elimination</a><ul>
<li class="chapter" data-level="3.4.1" data-path="mlr.html"><a href="mlr.html#model-diagmostic"><i class="fa fa-check"></i><b>3.4.1</b> Model diagmostic</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="mlr.html"><a href="mlr.html#references"><i class="fa fa-check"></i><b>3.5</b> References</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="logistic.html"><a href="logistic.html"><i class="fa fa-check"></i><b>4</b> Logistic Regression</a><ul>
<li class="chapter" data-level="4.1" data-path="logistic.html"><a href="logistic.html#introduction"><i class="fa fa-check"></i><b>4.1</b> Introduction</a></li>
<li class="chapter" data-level="4.2" data-path="logistic.html"><a href="logistic.html#the-logistic-equation."><i class="fa fa-check"></i><b>4.2</b> The logistic equation.</a></li>
<li class="chapter" data-level="4.3" data-path="logistic.html"><a href="logistic.html#performance-of-logistic-regression-model"><i class="fa fa-check"></i><b>4.3</b> Performance of Logistic Regression Model</a></li>
<li class="chapter" data-level="4.4" data-path="logistic.html"><a href="logistic.html#setting-up"><i class="fa fa-check"></i><b>4.4</b> Setting up</a></li>
<li class="chapter" data-level="4.5" data-path="logistic.html"><a href="logistic.html#example-1---graduate-admission"><i class="fa fa-check"></i><b>4.5</b> Example 1 - Graduate Admission</a></li>
<li class="chapter" data-level="4.6" data-path="logistic.html"><a href="logistic.html#example-2---diabetes"><i class="fa fa-check"></i><b>4.6</b> Example 2 - Diabetes</a><ul>
<li class="chapter" data-level="4.6.1" data-path="logistic.html"><a href="logistic.html#accounting-for-missing-values"><i class="fa fa-check"></i><b>4.6.1</b> Accounting for missing values</a></li>
<li class="chapter" data-level="4.6.2" data-path="logistic.html"><a href="logistic.html#imputting-missing-values"><i class="fa fa-check"></i><b>4.6.2</b> Imputting Missing Values</a></li>
<li class="chapter" data-level="4.6.3" data-path="logistic.html"><a href="logistic.html#roc-and-auc"><i class="fa fa-check"></i><b>4.6.3</b> ROC and AUC</a></li>
</ul></li>
<li class="chapter" data-level="4.7" data-path="logistic.html"><a href="logistic.html#references-1"><i class="fa fa-check"></i><b>4.7</b> References</a></li>
</ul></li>
<li class="chapter" data-level="5" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html"><i class="fa fa-check"></i><b>5</b> Softmax and multinomial regressions</a><ul>
<li class="chapter" data-level="5.1" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html#multinomial-logistic-regression"><i class="fa fa-check"></i><b>5.1</b> Multinomial Logistic Regression</a></li>
<li class="chapter" data-level="5.2" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html#references-2"><i class="fa fa-check"></i><b>5.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="gradient-descent.html"><a href="gradient-descent.html"><i class="fa fa-check"></i><b>6</b> Gradient Descent</a><ul>
<li class="chapter" data-level="6.1" data-path="gradient-descent.html"><a href="gradient-descent.html#example-on-functions"><i class="fa fa-check"></i><b>6.1</b> Example on functions</a></li>
<li class="chapter" data-level="6.2" data-path="gradient-descent.html"><a href="gradient-descent.html#example-on-regressions"><i class="fa fa-check"></i><b>6.2</b> Example on regressions</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="knnchapter.html"><a href="knnchapter.html"><i class="fa fa-check"></i><b>7</b> KNN - K Nearest Neighbour</a><ul>
<li class="chapter" data-level="7.1" data-path="knnchapter.html"><a href="knnchapter.html#example-1.-prostate-cancer-dataset"><i class="fa fa-check"></i><b>7.1</b> Example 1. Prostate Cancer dataset</a></li>
<li class="chapter" data-level="7.2" data-path="knnchapter.html"><a href="knnchapter.html#example-2.-wine-dataset"><i class="fa fa-check"></i><b>7.2</b> Example 2. Wine dataset</a><ul>
<li class="chapter" data-level="7.2.1" data-path="knnchapter.html"><a href="knnchapter.html#understand-the-data"><i class="fa fa-check"></i><b>7.2.1</b> Understand the data</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="knnchapter.html"><a href="knnchapter.html#references-3"><i class="fa fa-check"></i><b>7.3</b> References</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="kmeans.html"><a href="kmeans.html"><i class="fa fa-check"></i><b>8</b> Kmeans clustering</a><ul>
<li class="chapter" data-level="8.1" data-path="kmeans.html"><a href="kmeans.html#multinomial-logistic-regression-1"><i class="fa fa-check"></i><b>8.1</b> Multinomial Logistic Regression</a></li>
<li class="chapter" data-level="8.2" data-path="kmeans.html"><a href="kmeans.html#references-4"><i class="fa fa-check"></i><b>8.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="hierclust.html"><a href="hierclust.html"><i class="fa fa-check"></i><b>9</b> Hierarichal Clustering</a><ul>
<li class="chapter" data-level="9.1" data-path="hierclust.html"><a href="hierclust.html#example-on-the-pokemon-dataset"><i class="fa fa-check"></i><b>9.1</b> Example on the Pokemon dataset</a></li>
<li class="chapter" data-level="9.2" data-path="hierclust.html"><a href="hierclust.html#example-on-regressions-1"><i class="fa fa-check"></i><b>9.2</b> Example on regressions</a></li>
<li class="chapter" data-level="9.3" data-path="hierclust.html"><a href="hierclust.html#references-5"><i class="fa fa-check"></i><b>9.3</b> References</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="pca.html"><a href="pca.html"><i class="fa fa-check"></i><b>10</b> Principal Component Analysis</a><ul>
<li class="chapter" data-level="10.1" data-path="pca.html"><a href="pca.html#pca-on-an-easy-example."><i class="fa fa-check"></i><b>10.1</b> PCA on an easy example.</a></li>
<li class="chapter" data-level="10.2" data-path="pca.html"><a href="pca.html#references."><i class="fa fa-check"></i><b>10.2</b> References.</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="trees-and-classification.html"><a href="trees-and-classification.html"><i class="fa fa-check"></i><b>11</b> Trees and Classification</a><ul>
<li class="chapter" data-level="11.1" data-path="trees-and-classification.html"><a href="trees-and-classification.html#introduction-1"><i class="fa fa-check"></i><b>11.1</b> Introduction</a></li>
<li class="chapter" data-level="11.2" data-path="trees-and-classification.html"><a href="trees-and-classification.html#first-example."><i class="fa fa-check"></i><b>11.2</b> First example.</a></li>
<li class="chapter" data-level="11.3" data-path="trees-and-classification.html"><a href="trees-and-classification.html#second-example."><i class="fa fa-check"></i><b>11.3</b> Second Example.</a></li>
<li class="chapter" data-level="11.4" data-path="trees-and-classification.html"><a href="trees-and-classification.html#how-does-a-tree-decide-where-to-split"><i class="fa fa-check"></i><b>11.4</b> How does a tree decide where to split?</a></li>
<li class="chapter" data-level="11.5" data-path="trees-and-classification.html"><a href="trees-and-classification.html#third-example."><i class="fa fa-check"></i><b>11.5</b> Third example.</a></li>
<li class="chapter" data-level="11.6" data-path="trees-and-classification.html"><a href="trees-and-classification.html#references-6"><i class="fa fa-check"></i><b>11.6</b> References</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="random-forest.html"><a href="random-forest.html"><i class="fa fa-check"></i><b>12</b> Random Forest</a><ul>
<li class="chapter" data-level="12.1" data-path="random-forest.html"><a href="random-forest.html#how-does-it-work"><i class="fa fa-check"></i><b>12.1</b> How does it work?</a></li>
<li class="chapter" data-level="12.2" data-path="random-forest.html"><a href="random-forest.html#references-7"><i class="fa fa-check"></i><b>12.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="13" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>13</b> Support Vector Machine</a><ul>
<li class="chapter" data-level="13.1" data-path="svm.html"><a href="svm.html#support-vecotr-regression"><i class="fa fa-check"></i><b>13.1</b> Support Vecotr Regression</a><ul>
<li class="chapter" data-level="13.1.1" data-path="svm.html"><a href="svm.html#create-data"><i class="fa fa-check"></i><b>13.1.1</b> Create data</a></li>
<li class="chapter" data-level="13.1.2" data-path="svm.html"><a href="svm.html#tuning-a-svm-model"><i class="fa fa-check"></i><b>13.1.2</b> Tuning a SVM model</a></li>
<li class="chapter" data-level="13.1.3" data-path="svm.html"><a href="svm.html#discussion-on-parameters"><i class="fa fa-check"></i><b>13.1.3</b> Discussion on parameters</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="svm.html"><a href="svm.html#references-8"><i class="fa fa-check"></i><b>13.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="14" data-path="model-evaluation.html"><a href="model-evaluation.html"><i class="fa fa-check"></i><b>14</b> Model Evaluation</a><ul>
<li class="chapter" data-level="14.1" data-path="model-evaluation.html"><a href="model-evaluation.html#biais-variance-tradeoff"><i class="fa fa-check"></i><b>14.1</b> Biais variance tradeoff</a></li>
<li class="chapter" data-level="14.2" data-path="model-evaluation.html"><a href="model-evaluation.html#bagging"><i class="fa fa-check"></i><b>14.2</b> Bagging</a></li>
<li class="chapter" data-level="14.3" data-path="model-evaluation.html"><a href="model-evaluation.html#crossvalidation"><i class="fa fa-check"></i><b>14.3</b> Cross Validation</a></li>
</ul></li>
<li class="chapter" data-level="15" data-path="case-study-text-classification-spam-and-ham-.html"><a href="case-study-text-classification-spam-and-ham-.html"><i class="fa fa-check"></i><b>15</b> Case Study - Text classification: Spam and Ham.</a></li>
<li class="chapter" data-level="16" data-path="mushroom.html"><a href="mushroom.html"><i class="fa fa-check"></i><b>16</b> Case Study - Mushrooms Classification</a><ul>
<li class="chapter" data-level="16.1" data-path="mushroom.html"><a href="mushroom.html#import-the-data"><i class="fa fa-check"></i><b>16.1</b> Import the data</a></li>
<li class="chapter" data-level="16.2" data-path="mushroom.html"><a href="mushroom.html#tidy-the-data"><i class="fa fa-check"></i><b>16.2</b> Tidy the data</a></li>
<li class="chapter" data-level="16.3" data-path="mushroom.html"><a href="mushroom.html#understand-the-data-1"><i class="fa fa-check"></i><b>16.3</b> Understand the data</a><ul>
<li class="chapter" data-level="16.3.1" data-path="mushroom.html"><a href="mushroom.html#transform-the-data"><i class="fa fa-check"></i><b>16.3.1</b> Transform the data</a></li>
<li class="chapter" data-level="16.3.2" data-path="mushroom.html"><a href="mushroom.html#visualize-the-data"><i class="fa fa-check"></i><b>16.3.2</b> Visualize the data</a></li>
<li class="chapter" data-level="16.3.3" data-path="mushroom.html"><a href="mushroom.html#modeling"><i class="fa fa-check"></i><b>16.3.3</b> Modeling</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="mushroom.html"><a href="mushroom.html#communication"><i class="fa fa-check"></i><b>16.4</b> Communication</a></li>
</ul></li>
<li class="chapter" data-level="17" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html"><i class="fa fa-check"></i><b>17</b> Case study - The adults dataset.</a><ul>
<li class="chapter" data-level="17.1" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#introduction-2"><i class="fa fa-check"></i><b>17.1</b> Introduction</a></li>
<li class="chapter" data-level="17.2" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#import-the-data-1"><i class="fa fa-check"></i><b>17.2</b> Import the data</a></li>
<li class="chapter" data-level="17.3" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#tidy-the-data-1"><i class="fa fa-check"></i><b>17.3</b> Tidy the data</a></li>
</ul></li>
<li class="chapter" data-level="18" data-path="breastcancer.html"><a href="breastcancer.html"><i class="fa fa-check"></i><b>18</b> Case Study - Wisconsin Breast Cancer</a><ul>
<li class="chapter" data-level="18.1" data-path="breastcancer.html"><a href="breastcancer.html#import-the-data-2"><i class="fa fa-check"></i><b>18.1</b> Import the data</a></li>
<li class="chapter" data-level="18.2" data-path="breastcancer.html"><a href="breastcancer.html#tidy-the-data-2"><i class="fa fa-check"></i><b>18.2</b> Tidy the data</a></li>
<li class="chapter" data-level="18.3" data-path="breastcancer.html"><a href="breastcancer.html#understand-the-data-2"><i class="fa fa-check"></i><b>18.3</b> Understand the data</a><ul>
<li class="chapter" data-level="18.3.1" data-path="breastcancer.html"><a href="breastcancer.html#transform-the-data-1"><i class="fa fa-check"></i><b>18.3.1</b> Transform the data</a></li>
<li class="chapter" data-level="18.3.2" data-path="breastcancer.html"><a href="breastcancer.html#pre-process-the-data"><i class="fa fa-check"></i><b>18.3.2</b> Pre-process the data</a></li>
<li class="chapter" data-level="18.3.3" data-path="breastcancer.html"><a href="breastcancer.html#model-the-data-1"><i class="fa fa-check"></i><b>18.3.3</b> Model the data</a></li>
</ul></li>
<li class="chapter" data-level="18.4" data-path="breastcancer.html"><a href="breastcancer.html#references-9"><i class="fa fa-check"></i><b>18.4</b> References</a></li>
</ul></li>
<li class="chapter" data-level="19" data-path="final-words.html"><a href="final-words.html"><i class="fa fa-check"></i><b>19</b> Final Words</a></li>
<li class="chapter" data-level="" data-path="references-10.html"><a href="references-10.html"><i class="fa fa-check"></i>References</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning with R</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="mlr" class="section level1">
<h1><span class="header-section-number">Chapter 3</span> Single & Multiple Linear Regression</h1>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(skimr)
<span class="kw">library</span>(kableExtra) <span class="co"># for the kable_styling function</span>
<span class="kw">library</span>(tibble)
<span class="kw">library</span>(dplyr)
<span class="kw">library</span>(readr)
<span class="kw">library</span>(ggplot2)</code></pre></div>
<div id="single-variable-regression" class="section level2">
<h2><span class="header-section-number">3.1</span> Single variable regression</h2>
<p>The general equation for a linear regression model is</p>
<blockquote>
<p><span class="math inline">\(y^i = \beta_{0} + \beta_{1} x^i + \epsilon^i\)</span></p>
</blockquote>
<p>where:</p>
<ul>
<li><span class="math inline">\(y^i\)</span> is the <span class="math inline">\(i^{th}\)</span> observation of the dependent variable</li>
<li><span class="math inline">\(\beta_{0}\)</span> is the intercept coefficient</li>
<li><span class="math inline">\(\beta_{1}\)</span> is the regression coefficient for the dependent variable</li>
<li><span class="math inline">\(x^i\)</span> is the <span class="math inline">\(i^{th}\)</span> observation of the independent variable</li>
<li><span class="math inline">\(\epsilon^i\)</span> is the error term for the <span class="math inline">\(i^{th}\)</span> observation. It basically is the difference in therm of y between the observed value and the estimated value. It is also called the residuals. A good model minimize these errors.<a href="#fn1" class="footnoteRef" id="fnref1"><sup>1</sup></a></li>
</ul>
<p>Some ways to assess how good our model is:</p>
<ol style="list-style-type: decimal">
<li>compute the SSE (the sum of squared errors)
<ul>
<li>SSE = <span class="math inline">\((\epsilon^1)^2 + (\epsilon^2)^2 + \ldots + (\epsilon^N)^2 = \sum_{i=1}^N (\epsilon^i)^2\)</span></li>
<li>A good model will minimize SSE</li>
<li>problem: SSE depends on N; SSE will naturally increase as N increases</li>
</ul></li>
<li>compute the RMSE (the root mean squared error)
<ul>
<li>RMSE = <span class="math inline">\(\sqrt {\frac {SSE} {N}}\)</span></li>
<li>A good model will also minimize the RMSE</li>
<li>It depends on the unit of the dependent variable. It is like the average error the model is making (in terms of the unit of the dependent variable)</li>
</ul></li>
<li>compute <span class="math inline">\(R^2\)</span>
<ul>
<li>It compares the model to a baseline model</li>
<li><span class="math inline">\(R^2\)</span> is <strong>unitless</strong> and <strong>universally</strong> interpretable</li>
<li>SST is the sum of the squared differences between each observed value and the mean of all the observed values</li>
<li><span class="math inline">\(R^2 = 1 - \frac {SSE} {SST}\)</span></li>
</ul></li>
</ol>
<p>We usually use r-squared to check the performance of a regression.</p>
<p>The conditions and assumptions to have a valid linear model are the same as for the t-test; a quick visual check of most of them is sketched right after this list.</p>
<ul>
<li>linear relationship between dependent and independent variables (scatterplot of dependent vs independent variables, plus scatterplot of residuals vs fitted values). Also check here for outliers: the regression line and the regression coefficients are affected by outliers, so check whether it would make sense to remove them.<br />
</li>
<li>Multivariate normality. Multiple regression assumes that the residuals are normally distributed. Visual check on the Q-Q plot.<br />
</li>
<li>No Multicollinearity. Multiple regression assumes that the independent variables are not highly correlated with each other. Check correlation matrix and correlation plot. This assumption can also be tested using Variance Inflation Factor (VIF) values.</li>
<li>Homoscedasticity. This assumption states that the variance of the error terms is similar across the values of the independent variables. A plot of standardized residuals versus predicted values can show whether points are equally distributed across all values of the independent variables.</li>
</ul>
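<p>A quick way to eyeball most of these assumptions at once is base R’s built-in diagnostics for fitted models; a minimal sketch, assuming <code>model</code> holds any object returned by <code>lm()</code>:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Draw the four standard diagnostic plots: residuals vs fitted,
# normal Q-Q, scale-location, and residuals vs leverage.
par(mfrow = c(2, 2))  # arrange the four plots in a 2x2 grid
plot(model)
par(mfrow = c(1, 1))  # reset the plotting grid</code></pre></div>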
<p>In our first linear regression, we’ll use the <strong>Wine</strong> dataset. Let’s load it and then have a quick look at its structure. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df =<span class="st"> </span><span class="kw">read_csv</span>(<span class="st">"dataset/Wine.csv"</span>)
<span class="kw">skim</span>(df)</code></pre></div>
<pre><code>## Skim summary statistics
## n obs: 25
## n variables: 7
##
## ── Variable type:numeric ───────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## Age 0 25 25 17.2 7.69 5 11
## AGST 0 25 25 16.51 0.68 14.98 16.2
## FrancePop 0 25 25 49694.44 3665.27 43183.57 46584
## HarvestRain 0 25 25 148.56 74.42 38 89
## Price 0 25 25 7.07 0.65 6.2 6.52
## WinterRain 0 25 25 605.28 132.28 376 536
## Year 0 25 25 1965.8 7.69 1952 1960
## p50 p75 p100 hist
## 17 23 31 ▇▆▆▇▆▆▃▆
## 16.53 17.07 17.65 ▂▃▃▇▆▆▆▅
## 50254.97 52894.18 54602.19 ▃▂▃▂▃▃▃▇
## 130 187 292 ▅▇▇▅▆▁▃▅
## 7.12 7.5 8.49 ▇▃▃▇▃▂▂▁
## 600 697 830 ▅▁▂▇▃▃▂▃
## 1966 1972 1978 ▆▃▆▇▆▆▆▇</code></pre>
<p>We use the <code>lm</code> function to create our linear regression model, with <em>AGST</em> as the independent variable and <em>Price</em> as the dependent variable. First, a scatterplot of the two: <img src="machinelearningwithR_files/figure-html/linreg02-plot-1.png" width="672" /></p>
<p>We can see a moderate positive correlation between <code>AGST</code> and <code>Price</code>. The model confirms this.<br />
</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model_lm_df =<span class="st"> </span><span class="kw">lm</span>(Price <span class="op">~</span><span class="st"> </span>AGST, <span class="dt">data =</span> df)
<span class="kw">summary</span>(model_lm_df)</code></pre></div>
<pre><code>##
## Call:
## lm(formula = Price ~ AGST, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78450 -0.23882 -0.03727 0.38992 0.90318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.4178 2.4935 -1.371 0.183710
## AGST 0.6351 0.1509 4.208 0.000335 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4993 on 23 degrees of freedom
## Multiple R-squared: 0.435, Adjusted R-squared: 0.4105
## F-statistic: 17.71 on 1 and 23 DF, p-value: 0.000335</code></pre>
<p>The <code>summary</code> function applied to the model gives us important information. See below for a detailed explanation of it.</p>
<ul>
<li>the stars next to each predictor variable indicate how significant the variable is for our regression model</li>
<li>it also gives us the value of the R^2 coefficient</li>
</ul>
<p>We could have calculated the R^2 value in this way: </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">SSE =<span class="st"> </span><span class="kw">sum</span>(model_lm_df<span class="op">$</span>residuals<span class="op">^</span><span class="dv">2</span>)
SST =<span class="st"> </span><span class="kw">sum</span>((df<span class="op">$</span>Price <span class="op">-</span><span class="st"> </span><span class="kw">mean</span>(df<span class="op">$</span>Price))<span class="op">^</span><span class="dv">2</span>)
r_squared =<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>SSE<span class="op">/</span>SST
r_squared</code></pre></div>
<pre><code>## [1] 0.4350232</code></pre>
<p>The low R^2 indicates that our model does not explain much of the variance of the data.</p>
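<p>In the same spirit, the RMSE defined above can be computed from the SSE; a minimal sketch, reusing the SSE value just calculated:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># RMSE = sqrt(SSE / N), where N is the number of observations
RMSE = sqrt(SSE / nrow(df))
RMSE</code></pre></div>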
<p>We can now plot the observations and the regression line, and see how the linear model fits the data. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(df, <span class="kw">aes</span>(AGST, Price)) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">geom_point</span>(<span class="dt">shape =</span> <span class="dv">1</span>, <span class="dt">col =</span> <span class="st">"blue"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">geom_smooth</span>(<span class="dt">method =</span> <span class="st">"lm"</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg04-graph-1.png" width="672" /> By default, the <code>geom_smooth()</code> will use a 95% confidence interval (which is the grey-er area on the graph). There are 95% chance the line of regression will be within that zone for the whole population.</p>
<p>It is always nice to see how our residuals are distributed.<br />
We use the <code>ggplot2</code> library and the <code>fortify</code> function, which transforms the model object into a data frame usable for plotting. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model1 <-<span class="st"> </span><span class="kw">fortify</span>(model_lm_df)
p <-<span class="st"> </span><span class="kw">ggplot</span>(model1, <span class="kw">aes</span>(.fitted, .resid)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_point</span>()
p <-<span class="st"> </span>p <span class="op">+</span><span class="st"> </span><span class="kw">geom_hline</span>(<span class="dt">yintercept =</span> <span class="dv">0</span>, <span class="dt">col =</span> <span class="st">"red"</span>, <span class="dt">linetype =</span> <span class="st">"dashed"</span>)
p <-<span class="st"> </span>p <span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Fitted values"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Residuals"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Plot of the residuals in function of the fitted values"</span>)
p</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg05_residuals-1.png" width="672" /></p>
<p>Residuals look normal: randomly scattered around the zero line.</p>
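<p>As a numeric complement to this visual check, we could run a normality test on the residuals; a minimal sketch using base R’s Shapiro-Wilk test:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Shapiro-Wilk test on the residuals of our model; a large p-value
# is consistent with normally distributed residuals.
shapiro.test(residuals(model_lm_df))</code></pre></div>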
</div>
<div id="multi-variables-regression" class="section level2">
<h2><span class="header-section-number">3.2</span> Multi-variables regression</h2>
<p>Instead of considering just one variable as a predictor, we’ll add a few more variables to our model with the aim of increasing its predictive ability. In our case, we are expecting an increased r-squared value.</p>
<p>We have to be cautious in adding more variables. Too many variables might give a high <span class="math inline">\(R^2\)</span> on our training data, but that might not carry over to our testing data. This is over-fitting, and we will need to avoid it at all costs. We’ll look at several ways to guard against overfitting.</p>
<p>The general equations can be expressed as</p>
<blockquote>
<p><span class="math inline">\(y^i = \beta_{0} + \beta_{1} x_{1}^i + \beta_{2} x_{2}^i + \ldots + \beta_{k} x_{k}^i + \epsilon^i\)</span></p>
</blockquote>
<p>where there are k predictor variables.</p>
<p>There is a bit of trial and error involved in fitting multiple variables into a model, but a rule of thumb is to include most of the variables (all those that would make sense) and then take out the ones that are not very significant, using <code>summary(modelx)</code>. This pruning can also be automated, as the sketch below illustrates.</p>
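<p>A minimal sketch of automated backward elimination, using base R’s <code>step()</code> function on the wine data (the Boston example at the end of this chapter applies backward elimination as well; <em>Age</em> is left out here because it is perfectly correlated with <em>Year</em>, as shown below):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Start from a model with all the candidate predictors and let step()
# drop terms one at a time as long as the AIC improves.
full_model <- lm(Price ~ Year + WinterRain + AGST + HarvestRain + FrancePop, data = df)
reduced_model <- step(full_model, direction = "backward")
summary(reduced_model)</code></pre></div>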
<p>We are introducing 3 new libraries here besides the usual tidyverse.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(corrr)
<span class="kw">library</span>(corrplot)
<span class="kw">library</span>(leaps)</code></pre></div>
<div id="predicting-wine-price-again" class="section level3">
<h3><span class="header-section-number">3.2.1</span> Predicting wine price (again!)</h3>
<p>We continue here with the same dataset, <em>wine.csv</em>.<br />
First, we can see how each variable is correlated with each of the others.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(corrr)
d <-<span class="st"> </span><span class="kw">correlate</span>(df)</code></pre></div>
<pre><code>##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">d <span class="op">%>%</span><span class="st"> </span><span class="kw">shave</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">fashion</span>()</code></pre></div>
<pre><code>## rowname Year Price WinterRain AGST HarvestRain Age FrancePop
## 1 Year
## 2 Price -.45
## 3 WinterRain .02 .14
## 4 AGST -.25 .66 -.32
## 5 HarvestRain .03 -.56 -.28 -.06
## 6 Age -1.00 .45 -.02 .25 -.03
## 7 FrancePop .99 -.47 -.00 -.26 .04 -.99</code></pre>
<p>By default, R uses the Pearson coefficient of correlation.</p>
<p>Multiple linear regression doesn’t handle multicollinearity well. In that case, we should remove variables that are too highly correlated. <em>Age</em> and <em>Year</em> are perfectly correlated, so one of them should be removed.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">corrplot<span class="op">::</span><span class="kw">corrplot</span>(<span class="kw">cor</span>(df), <span class="dt">type =</span> <span class="st">"lower"</span>, <span class="dt">order =</span> <span class="st">"hclust"</span>, <span class="dt">tl.col =</span> <span class="st">"black"</span>, <span class="dt">sig.level =</span> <span class="fl">0.01</span>)</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/mlr-corrplot01-1.png" width="672" /></p>
<p>So let’s start by using all variables.<br />
</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model2_lm_df <-<span class="st"> </span><span class="kw">lm</span>(Price <span class="op">~</span><span class="st"> </span>Year <span class="op">+</span><span class="st"> </span>WinterRain <span class="op">+</span><span class="st"> </span>AGST <span class="op">+</span><span class="st"> </span>HarvestRain <span class="op">+</span><span class="st"> </span>Age <span class="op">+</span><span class="st"> </span>FrancePop, <span class="dt">data =</span> df)
<span class="kw">summary</span>(model2_lm_df)</code></pre></div>
<pre><code>##
## Call:
## lm(formula = Price ~ Year + WinterRain + AGST + HarvestRain +
## Age + FrancePop, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48179 -0.24662 -0.00726 0.22012 0.51987
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.092e-01 1.467e+02 0.005 0.996194
## Year -5.847e-04 7.900e-02 -0.007 0.994172
## WinterRain 1.043e-03 5.310e-04 1.963 0.064416 .
## AGST 6.012e-01 1.030e-01 5.836 1.27e-05 ***
## HarvestRain -3.958e-03 8.751e-04 -4.523 0.000233 ***
## Age NA NA NA NA
## FrancePop -4.953e-05 1.667e-04 -0.297 0.769578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3019 on 19 degrees of freedom
## Multiple R-squared: 0.8294, Adjusted R-squared: 0.7845
## F-statistic: 18.47 on 5 and 19 DF, p-value: 1.044e-06</code></pre>
<p>While doing so, we notice that the variable <em>Age</em> has an NA coefficient. This is because it is so highly correlated with the variables <em>Year</em> and <em>FrancePop</em>, as we saw in our correlation plot. Also, the variable <em>FrancePop</em> isn’t very predictive of the price of wine. So we can refine our model by taking out these 2 variables, and as we’ll see, it won’t affect our <span class="math inline">\(R^2\)</span> value much. Note that with multiple-variable regression, it is important to look at the <strong>Adjusted R-squared</strong>, as it takes into consideration the number of variables in the model.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model3_lm_df <-<span class="st"> </span><span class="kw">lm</span>(Price <span class="op">~</span><span class="st"> </span>Year <span class="op">+</span><span class="st"> </span>WinterRain <span class="op">+</span><span class="st"> </span>AGST <span class="op">+</span><span class="st"> </span>HarvestRain, <span class="dt">data =</span> df)
<span class="kw">summary</span>(model3_lm_df)</code></pre></div>
<pre><code>##
## Call:
## lm(formula = Price ~ Year + WinterRain + AGST + HarvestRain,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45470 -0.24273 0.00752 0.19773 0.53637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.0248601 16.4434570 2.677 0.014477 *
## Year -0.0239308 0.0080969 -2.956 0.007819 **
## WinterRain 0.0010755 0.0005073 2.120 0.046694 *
## AGST 0.6072093 0.0987022 6.152 5.2e-06 ***
## HarvestRain -0.0039715 0.0008538 -4.652 0.000154 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.295 on 20 degrees of freedom
## Multiple R-squared: 0.8286, Adjusted R-squared: 0.7943
## F-statistic: 24.17 on 4 and 20 DF, p-value: 2.036e-07</code></pre>
<p>We now have a much better r-squared than with only one predictor variable. Also, by choosing better predictor variables, we managed to increase our <em>adjusted r-squared</em>.</p>
<p>Although it is no longer feasible to graph <em>Price</em> as a function of the other variables in 2D, we can still graph our residuals in 2D. </p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">model3 <-<span class="st"> </span><span class="kw">fortify</span>(model3_lm_df)
p <-<span class="st"> </span><span class="kw">ggplot</span>(model3, <span class="kw">aes</span>(.fitted, .resid)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">geom_hline</span>(<span class="dt">yintercept =</span> <span class="dv">0</span>, <span class="dt">col =</span> <span class="st">"red"</span>, <span class="dt">linetype =</span> <span class="st">"dashed"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Fitted values"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Residuals"</span>) <span class="op">+</span><span class="st"> </span>
<span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Plot of the residuals in function of the fitted values (multiple variables)"</span>)
p</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg07-1.png" width="672" /></p>
<p>The plot of residuals looks pretty normal, with points randomly scattered around the 0 line.</p>
</div>
</div>
<div id="model-diagnostic-and-evaluation" class="section level2">
<h2><span class="header-section-number">3.3</span> Model diagnostic and evaluation</h2>
<p>Let’s first go over the explanation of the <code>summary</code> function applied to a regression model.</p>
<p><strong>Call:</strong> The formula we have used for our model.</p>
<p><strong>Coefficient – Estimate</strong> The coefficient estimate is the value of the coefficient to be used in the equation. The coefficient of each independent variable has a meaning; for example, 0.0010755 for ‘WinterRain’ means that for every 1-unit change in ‘WinterRain’, the value of ‘Price’ increases by 0.0010755, holding the other variables constant.</p>
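<p>To see these coefficients in action, we can feed the fitted model a new observation; a minimal sketch, assuming <code>model3_lm_df</code> from the previous section is still in memory (the values below are made up):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Predict the price of one hypothetical vintage from the fitted coefficients.
new_wine <- tibble(Year = 1970, WinterRain = 600, AGST = 17, HarvestRain = 150)
predict(model3_lm_df, newdata = new_wine)</code></pre></div>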
<p><strong>Coefficient – Standard Error</strong> The coefficient standard error measures how much the coefficient estimate would vary from sample to sample. We want it to be small relative to the estimate for the variable to be a reliable predictor.</p>
<p><strong>Coefficient – t value:</strong> The coefficient t-value measures how many standard errors our coefficient estimate is away from 0. We want this value to be large (in absolute value) so that we can reject the null hypothesis (H0), which is ‘there is no relationship between the dependent and independent variables’.</p>
<p><strong>Coefficient – Pr(>|t|):</strong> The Pr(>|t|) is computed from the t-value. It is used for rejecting the null hypothesis (H0) stated above. Normally, a value less than 0.05 (5%) is considered the cut-off point for rejecting H0.</p>
<p><strong>Residuals:</strong> Residuals are the next component in the model summary. Residuals are the difference between the actual values in the dataset and the values predicted by the model. For the model to be good, the residuals should be normally distributed.</p>
<p><strong>Adjusted R-squared:</strong><br />
Adjusted R-squared is the measure to consider for evaluating model accuracy when the number of independent variables is greater than 1. Adjusted R-squared adjusts for the number of variables in the model and is the preferred measure of model goodness of fit.</p>
<p><strong>F-Statistic:</strong> The F-statistic is used for finding out whether there exists any relationship between our independent (predictor) variables and the dependent (response) variable. Normally, a value of the F-statistic greater than one can be used for rejecting the null hypothesis (H0: there is no relationship between the dependent variable and the independent variables). For our model, the F-statistic of 24.17 is well above one. The p-value of the F-statistic is evaluated the same way we use the Pr(>|t|) value in the coefficients output: as the p-value < 0.05, we can reject the null hypothesis (H0).</p>
<p><strong>R-squared and p-value: there is no established relationship between the two.</strong> R-squared tells how much variation in the response variable is explained by the predictor variables, while the p-value tells whether the predictors used in the model are able to explain the response variable or not. If the p-value < 0.05 (for 95% confidence), then the model is considered to be significant. Four scenarios are possible:</p>
<ol style="list-style-type: decimal">
<li><strong>low R-square</strong> and <strong>low p-value</strong> (p-value <= 0.05): the model doesn’t explain much of the variation in the response variable, but it is still significant as per the p-value, which is better than having no model to explain the response variable at all.</li>
<li><strong>low R-square</strong> and <strong>high p-value</strong> (p-value > 0.05): the model doesn’t explain much variation in the data and is not significant. We should discard such a model; this is the worst scenario.</li>
<li><strong>high R-square</strong> and <strong>low p-value</strong>: the model explains a lot of variation in the data and is also significant. This is the best of the four scenarios, and the model is considered good in this case.</li>
<li><strong>high R-square</strong> and <strong>high p-value</strong>: the model explains a lot of the variance in the data but is not significant. We should not use such a model for predictions.</li>
</ol>
<p>Here are the necessary conditions for a linear regression model to be valid; hence, these are the assumptions made when doing a linear regression.</p>
<ul>
<li><strong>Linear Relationship</strong>.<br />
The plot of the residuals should show the data points randomly scattered around the 0 line.<br />
This plot shows if residuals have non-linear patterns. There could be a non-linear relationship between predictor variables and an outcome variable and the pattern could show up in this plot if the model doesn’t capture the non-linear relationship. If you find equally spread residuals around a horizontal line without distinct patterns, that is a good indication you don’t have non-linear relationships.</li>
</ul>
<div class="figure">
<img src="otherpics/GoodVsBadResidualsPlot.png" alt="Good Vs Bad residuals plot" />
<p class="caption">Good Vs Bad residuals plot</p>
</div>
<p>There isn’t any distinctive pattern in Case 1, but there is a parabola in Case 2, where the non-linear relationship was not explained by the model and was left out in the residuals.</p>
<ul>
<li><p><strong>Multivariate normality</strong>. The multiple linear regression analysis requires that the errors between observed and predicted values (i.e., the residuals of the regression) should be normally distributed. This assumption may be checked by looking at a histogram or a Q-Q-Plot. Normality can also be checked with a goodness of fit test (e.g., the Kolmogorov-Smirnov test), though this test must be conducted on the residuals themselves.<br />
<img src="otherpics/GoodVsBadQQPlot.png" alt="Good Vs Bad residuals Q-Q plot" /></p></li>
<li><strong>No Multicollinearity</strong>. Multicollinearity may be tested with these central criteria:</li>
</ul>
<ol style="list-style-type: decimal">
<li>Correlation matrix. When computing the matrix of Pearson’s bivariate correlations among all independent variables, the correlation coefficients need to be clearly smaller than 1 (in absolute value).</li>
<li>Variance Inflation Factor (VIF) – the variance inflation factor of the linear regression is defined as VIF = 1/T, where tolerance (T) is defined as T = 1 – R². With VIF > 10 there is an indication that multicollinearity may be present; with VIF > 100 there is certainly multicollinearity among the variables. If multicollinearity is found in the data, centering the data (that is, deducting the mean of the variable from each score) might help to solve the problem. However, the simplest way to address the problem is to remove independent variables with high VIF values (see the sketch right after this list).</li>
</ol>
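<p>A minimal sketch of a VIF check, assuming the <code>car</code> package is installed and that <code>model3_lm_df</code> from the wine example is still in memory:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Variance Inflation Factor of each predictor in the wine model;
# by the rule of thumb above, VIF > 10 hints at multicollinearity.
library(car)
vif(model3_lm_df)</code></pre></div>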
<ul>
<li><strong>Homoscedasticity</strong>. A scatterplot of residuals versus predicted values is a good way to check for homoscedasticity. There should be no clear pattern in the distribution; if there is a cone-shaped pattern (as shown below), the data are heteroscedastic. If the data are heteroscedastic, a non-linear data transformation or the addition of a quadratic term might fix the problem.</li>
</ul>
<p>This plot shows whether residuals are spread equally along the ranges of predictors. This is how you can check the assumption of equal variance (homoscedasticity). It’s good if you see a horizontal line with equally (randomly) spread points.<br />
<img src="otherpics/GoodVsBadScalePlot.png" alt="Good vs bad scale-location plot" /></p>
<p>In Case 2, the residuals begin to spread wider along the x-axis. Because the residuals spread wider and wider, the red smooth line is not horizontal and shows a steep angle in Case 2.</p>
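<p>As a formal complement to the visual check, here is a minimal sketch using the Breusch-Pagan test, assuming the <code>lmtest</code> package is installed:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Breusch-Pagan test on the wine model; a small p-value suggests
# heteroscedastic residuals.
library(lmtest)
bptest(model3_lm_df)</code></pre></div>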
</div>
<div id="final-example---boston-dataset---with-backward-elimination" class="section level2">
<h2><span class="header-section-number">3.4</span> Final example - Boston dataset - with backward elimination</h2>
<p>In this last example, we’ll use a more systematic way to find out which variables should be included in our model.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df <-<span class="st"> </span><span class="kw">read_csv</span>(<span class="st">"dataset/Boston.csv"</span>)
skimr<span class="op">::</span><span class="kw">skim</span>(df)</code></pre></div>
<pre><code>## Skim summary statistics
## n obs: 333
## n variables: 14
##
## ── Variable type:numeric ───────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## AGE 0 333 333 68.23 28.13 6 45.4 76.7
## BLACK 0 333 333 359.47 86.58 3.5 376.73 392.05
## CHAS 0 333 333 0.06 0.24 0 0 0
## CRIM 0 333 333 3.36 7.35 0.0063 0.079 0.26
## DIS 0 333 333 3.71 1.98 1.13 2.12 3.09
## INDUS 0 333 333 11.29 7 0.74 5.13 9.9
## LSTAT 0 333 333 12.52 7.07 1.73 7.18 10.97
## MEDV 0 333 333 22.77 9.17 5 17.4 21.6
## NOX 0 333 333 0.56 0.11 0.39 0.45 0.54
## PTRATIO 0 333 333 18.45 2.15 12.6 17.4 19
## RAD 0 333 333 9.63 8.74 1 4 5
## RM 0 333 333 6.27 0.7 3.56 5.88 6.2
## TAX 0 333 333 409.28 170.84 188 279 330
## ZN 0 333 333 10.69 22.67 0 0 0
## p75 p100 hist
## 93.8 100 ▁▂▂▂▂▂▃▇
## 396.24 396.9 ▁▁▁▁▁▁▁▇
## 0 1 ▇▁▁▁▁▁▁▁
## 3.68 73.53 ▇▁▁▁▁▁▁▁
## 5.12 10.71 ▇▆▅▃▂▁▁▁
## 18.1 27.74 ▅▅▃▁▁▇▁▁
## 16.42 37.97 ▅▇▆▃▂▁▁▁
## 25 50 ▂▅▇▇▂▂▁▁
## 0.63 0.87 ▇▇▇▆▃▅▁▁
## 20.2 21.2 ▁▂▁▂▃▃▂▇
## 24 24 ▃▇▂▁▁▁▁▅
## 6.59 8.72 ▁▁▂▇▇▂▁▁
## 666 711 ▅▇▂▅▁▁▁▇
## 12.5 100 ▇▁▁▁▁▁▁▁</code></pre>
<p>Here is the list of variables with their meaning.</p>
<ul>
<li>CRIM per capita crime rate by town</li>
<li>ZN proportion of residential land zoned for lots over 25,000 sq.ft.</li>
<li>INDUS proportion of non-retail business acres per town</li>
<li>CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)</li>
<li>NOX nitric oxides concentration (parts per 10 million)</li>
<li>RM average number of rooms per dwelling</li>
<li>AGE proportion of owner-occupied units built prior to 1940</li>
<li>DIS weighted distances to five Boston employment centres</li>
<li>RAD index of accessibility to radial highways</li>
<li>TAX full-value property-tax rate per $10,000</li>
<li>PTRATIO pupil-teacher ratio by town</li>
<li>BLACK 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town</li>
<li>LSTAT % lower status of the population</li>
<li>MEDV Median value of owner-occupied homes in $1000’s</li>
</ul>
<p>Let’s make the necessary adjustments in variable types.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df<span class="op">$</span>CHAS <-<span class="st"> </span><span class="kw">factor</span>(df<span class="op">$</span>CHAS)</code></pre></div>
<p>A quick check on how correlated our variables are.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">corrplot</span>(<span class="kw">cor</span>(df <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>CHAS)), <span class="dt">type =</span> <span class="st">"lower"</span>, <span class="dt">order =</span> <span class="st">"hclust"</span>, <span class="dt">tl.col =</span> <span class="st">"black"</span>, <span class="dt">sig.level =</span> <span class="fl">0.01</span>)</code></pre></div>
<p><img src="machinelearningwithR_files/figure-html/linreg09-1.png" width="672" /></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">correlate</span>(df <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>CHAS)) <span class="op">%>%</span><span class="st"> </span><span class="kw">shave</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">fashion</span>()</code></pre></div>
<pre><code>##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'</code></pre>
<pre><code>## rowname CRIM ZN INDUS NOX RM AGE DIS RAD TAX PTRATIO BLACK
## 1 CRIM
## 2 ZN -.21
## 3 INDUS .42 -.52
## 4 NOX .46 -.50 .75
## 5 RM -.31 .33 -.44 -.34
## 6 AGE .38 -.54 .64 .74 -.25
## 7 DIS -.40 .64 -.70 -.77 .27 -.76
## 8 RAD .67 -.30 .57 .61 -.27 .45 -.48
## 9 TAX .62 -.31 .71 .67 -.36 .51 -.53 .90
## 10 PTRATIO .31 -.38 .39 .19 -.37 .26 -.23 .47 .47
## 11 BLACK -.48 .17 -.34 -.37 .16 -.27 .28 -.41 -.41 -.16
## 12 LSTAT .53 -.39 .61 .60 -.62 .59 -.51 .48 .54 .37 -.36
## 13 MEDV -.41 .34 -.47 -.41 .69 -.36 .25 -.35 -.45 -.48 .34
## LSTAT MEDV
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13 -.74</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">yo <-<span class="st"> </span><span class="kw">correlate</span>(df <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>CHAS)) <span class="op">%>%</span><span class="st"> </span><span class="kw">shave</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">fashion</span>()</code></pre></div>
<pre><code>##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">kable</span>(yo, <span class="dt">format =</span> <span class="st">"html"</span>) <span class="op">%>%</span><span class="st"> </span><span class="kw">kable_styling</span>()</code></pre></div>
<table class="table" style="margin-left: auto; margin-right: auto;">
<thead>
<tr>
<th style="text-align:left;">
rowname
</th>
<th style="text-align:left;">
CRIM
</th>
<th style="text-align:left;">
ZN
</th>
<th style="text-align:left;">
INDUS
</th>
<th style="text-align:left;">
NOX
</th>
<th style="text-align:left;">
RM
</th>
<th style="text-align:left;">
AGE
</th>
<th style="text-align:left;">
DIS
</th>
<th style="text-align:left;">
RAD
</th>
<th style="text-align:left;">
TAX
</th>
<th style="text-align:left;">
PTRATIO
</th>
<th style="text-align:left;">
BLACK
</th>
<th style="text-align:left;">
LSTAT
</th>
<th style="text-align:left;">
MEDV
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
CRIM
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
ZN
</td>
<td style="text-align:left;">
-.21
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
INDUS
</td>
<td style="text-align:left;">
.42
</td>
<td style="text-align:left;">
-.52
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
NOX
</td>
<td style="text-align:left;">
.46
</td>
<td style="text-align:left;">
-.50
</td>
<td style="text-align:left;">
.75
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
RM
</td>
<td style="text-align:left;">
-.31
</td>
<td style="text-align:left;">
.33
</td>
<td style="text-align:left;">
-.44
</td>
<td style="text-align:left;">
-.34
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
AGE
</td>
<td style="text-align:left;">
.38
</td>
<td style="text-align:left;">
-.54
</td>
<td style="text-align:left;">
.64
</td>
<td style="text-align:left;">
.74
</td>
<td style="text-align:left;">
-.25
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
DIS
</td>
<td style="text-align:left;">
-.40
</td>
<td style="text-align:left;">
.64
</td>
<td style="text-align:left;">
-.70
</td>
<td style="text-align:left;">
-.77
</td>
<td style="text-align:left;">
.27
</td>
<td style="text-align:left;">
-.76
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
RAD
</td>
<td style="text-align:left;">
.67
</td>
<td style="text-align:left;">
-.30
</td>
<td style="text-align:left;">
.57
</td>
<td style="text-align:left;">
.61
</td>
<td style="text-align:left;">
-.27
</td>
<td style="text-align:left;">
.45
</td>
<td style="text-align:left;">
-.48
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
TAX
</td>
<td style="text-align:left;">
.62
</td>
<td style="text-align:left;">
-.31
</td>
<td style="text-align:left;">
.71
</td>
<td style="text-align:left;">
.67
</td>
<td style="text-align:left;">
-.36
</td>
<td style="text-align:left;">
.51
</td>
<td style="text-align:left;">
-.53
</td>
<td style="text-align:left;">
.90
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
<td style="text-align:left;">
</td>
</tr>
<tr>
<td style="text-align:left;">
PTRATIO
</td>
<td style="text-align:left;">
.31
</td>
<td style="text-align:left;">
-.38
</td>
<td style="text-align:left;">
.39
</td>
<td style="text-align:left;">
.19
</td>
<td style="text-align:left;">
-.37