-
Notifications
You must be signed in to change notification settings - Fork 0
/
week1_assi1_sol2_Final.r
144 lines (109 loc) · 4.83 KB
/
week1_assi1_sol2_Final.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
# """Week1 Assi1 Sol2.ipynb
#
# Automatically generated by Colaboratory.
#
# Original file is located at
# https://colab.research.google.com/drive/1TS5EIWCpSPLTFcDhZdcOAxI__nJpxttQ
# """
###########################################################################
## Week-1, Homework-1, Sol-2
## Sreya Dhar
## Created: Sept 4, 2020
## Edited: Sept 14, 2020
###########################################################################
## Workspace setup: point the session at the folder holding the course files.
## The former blanket `rm(list = ls())` reset was removed: it deletes whatever
## the caller has in the global environment when this script is sourced, yet
## leaves attached packages, options and the working directory untouched, so it
## never produced a genuinely clean session. Restart R to get one.
data_dir <- "C:/File E/EAS 506 Statistical Mining I/Week 1"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  ## `data_dir` is an absolute, machine-specific path. When it is absent, keep
  ## the current working directory instead of aborting the whole script, so the
  ## analysis can still be run from a folder that contains the data file.
  warning(
    "Data directory '", data_dir, "' not found; staying in ", getwd(),
    call. = FALSE
  )
}
## installing all the libraries in R kernel
# install.packages("ISLR")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("repr")
## importing the libraries in R kernel
library(ISLR)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling)
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
library(lmtest)
## ---- Multiple regression analysis on the preprocessed Auto dataset ----
## Qs. 2 (a) Which predictors appear to have a significant relationship to the
## response?
## The working directory is already set before the libraries are loaded, so the
## duplicate setwd() call that used to sit here has been removed.
## NOTE(review): despite its ".Rdata" extension the file is parsed as a plain
## text table by read.table() -- presumably it was written with write.table();
## confirm against the preprocessing script.
auto_file <- "Auto_S.Rdata"
if (!file.exists(auto_file)) {
  ## Fail early with the searched location rather than read.table()'s generic
  ## "cannot open file" connection error.
  stop("Cannot find '", auto_file, "' in ", getwd(), call. = FALSE)
}
Auto_S <- read.table(auto_file)
## Per-column summary statistics of the loaded data.
## NOTE(review): the original comment mentioned removing the 'name' variable;
## nothing is dropped here, so that presumably happened during preprocessing.
summary(Auto_S)
## ---- Exploration of the Auto dataset ----
## The regression models fitted later in this script add ':' interaction terms;
## several combinations of variables are tried over multiple iterations to get
## to the best performing model.
##
## Start with a chart matrix of Auto_S (described by the author as the scaled
## dataset): correlation values combined with histograms and scatter plots of
## the individual features.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
chart.Correlation(Auto_S, histogram = TRUE, pch = 15)
## ---- Single-predictor baseline regressions ----
## mpg explained by model year only.
year_reg <- lm(mpg ~ year, data = Auto_S)
summary(year_reg)

## Diagnostic plots for the year-only fit, arranged on a 2 x 2 grid.
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(2, 2))
plot(year_reg)

## mpg explained by the number of cylinders only.
cyl_reg <- lm(mpg ~ cylinders, data = Auto_S)
summary(cyl_reg)
## ---- Interaction modelling begins ----
## model_1: additive baseline using every other column of Auto_S as predictor.
model_1 <- lm(mpg ~ ., data = Auto_S)
summary(model_1)

## Diagnostic plots for the baseline, arranged on a 2 x 2 grid.
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(2, 2))
plot(model_1)
## model_2: all main effects plus two pairwise interactions.
model_2 <- lm(
  mpg ~ . + displacement:weight + acceleration:horsepower,
  data = Auto_S
)
summary(model_2)

## model_3: model_2 extended with year:origin and acceleration:weight.
model_3 <- lm(
  mpg ~ . + year:origin + displacement:weight +
    acceleration:horsepower + acceleration:weight,
  data = Auto_S
)
summary(model_3)

## model_4: drops the two acceleration interactions used in model_3 and tries
## acceleration/year/displacement interactions with origin and year instead.
model_4 <- lm(
  mpg ~ . + year:origin + displacement:weight +
    acceleration:origin + acceleration:year +
    year:displacement + displacement:origin,
  data = Auto_S
)
summary(model_4)
## model_5: model_4 plus weight:origin and weight:year.
model_5 <- lm(
  mpg ~ . + year:origin + displacement:weight +
    acceleration:origin + acceleration:year +
    year:displacement + displacement:origin +
    weight:origin + weight:year,
  data = Auto_S
)
summary(model_5)

## model_6: model_5 plus horsepower:displacement and weight:horsepower.
model_6 <- lm(
  mpg ~ . + year:origin + displacement:weight +
    acceleration:origin + acceleration:year +
    year:displacement + displacement:origin +
    weight:origin + weight:year +
    horsepower:displacement + weight:horsepower,
  data = Auto_S
)
summary(model_6)
## model_7: model_6 plus year:horsepower.
model_7 <- lm(
  mpg ~ . + year:origin + displacement:weight +
    acceleration:origin + acceleration:year +
    year:displacement + displacement:origin +
    weight:origin + weight:year +
    horsepower:displacement + weight:horsepower +
    year:horsepower,
  data = Auto_S
)
summary(model_7)

## model_8: model_7 plus acceleration:weight -- the largest model explored.
model_8 <- lm(
  mpg ~ . + year:origin + displacement:weight +
    acceleration:origin + acceleration:year +
    year:displacement + displacement:origin +
    weight:origin + weight:year +
    horsepower:displacement + weight:horsepower +
    year:horsepower + acceleration:weight,
  data = Auto_S
)
summary(model_8)

## Diagnostic plots for model_8 (the 2 x 2 layout set earlier is still active),
## followed by a Breusch-Pagan test to check for heteroscedasticity.
plot(model_8)
bptest(model_8)
## ---- Comparison of regression models (ANOVA) ----
## Each single-predictor fit against the all-main-effects baseline ...
anova(year_reg, model_1)
anova(cyl_reg, model_1)
## ... and the baseline against the largest interaction model.
anova(model_1, model_8)
## ---- Comparison of regression models via Cook's distance ----
## Four panels (2 x 2), one per model, drawn on a common y-axis so that the
## influence of individual observations can be compared across models.
##
## The axis used to be fixed at c(0, 0.06), which silently drops every point
## whose Cook's distance exceeds 0.06 -- i.e. the most influential observations.
## The upper limit now grows to the largest finite distance across the four
## models when needed, and stays at 0.06 otherwise (same picture as before).
cooks_all <- c(
  cooks.distance(model_1),
  cooks.distance(model_3),
  cooks.distance(model_6),
  cooks.distance(model_8)
)
## Non-finite values (NA/NaN/Inf) are ignored so they cannot break the limits.
cooks_ylim <- c(0, max(0.06, cooks_all[is.finite(cooks_all)]))

par(mfrow = c(2, 2))
plot(cooks.distance(model_1), pch = 16, col = "blue", ylim = cooks_ylim)
plot(cooks.distance(model_3), pch = 16, col = "blue", ylim = cooks_ylim)
plot(cooks.distance(model_6), pch = 16, col = "blue", ylim = cooks_ylim)
plot(cooks.distance(model_8), pch = 16, col = "blue", ylim = cooks_ylim)
## end ##