-
Notifications
You must be signed in to change notification settings - Fork 0
/
week1_assi1_sol1.R
196 lines (147 loc) · 7.68 KB
/
week1_assi1_sol1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# -*- coding: utf-8 -*-
# """Week1 Assi1 Sol1.ipynb
#
# Automatically generated by Colaboratory.
#
# Original file is located at
# https://colab.research.google.com/drive/1Mv7zC7skLGUl1kE-OjcXCIuAz95xwyFT
# """
###########################################################################
## Week-1, Homework-1, Sol-1
## Sreya Dhar
## Created: Sept 4, 2020
## Edited: Sept 14, 2020
###########################################################################
## Start from an empty workspace so objects left over from an earlier session
## cannot leak into this analysis.
rm(list = ls())
## Switch to the coursework folder when it exists on this machine, so that
## relative paths used later in the script resolve inside it. On any other
## machine stay in the current directory instead of aborting on a missing path.
work_dir <- "C:/File E/EAS 506 Statistical Mining I/Week 1"
if (dir.exists(work_dir)) {
  setwd(work_dir)
} else {
  warning("Working directory not found, staying in: ", getwd(), call. = FALSE)
}
## installing all the libraries in R kernel
# install.packages("ISLR")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("repr")
## importing the libraries in R kernel
library(ISLR)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling)
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
library(gplots)
## ---- Exploratory Data Analysis on the 'Auto' dataset ----
# Column names of the Auto dataset
names(Auto)
# Quick structural overview of every column
glimpse(Auto)
# Per-column status report (data type, unique values and related counts)
status(Auto)
# Turn every factor column into numbers; in Auto this targets the 'name' column,
# which becomes its integer level code
Auto_1 <- mutate_if(Auto, is.factor, as.numeric)
# Numeric profile of each variable: range, mean, standard deviation, variation
profiling_num(Auto_1)
## Histograms of every variable to visualise how the data are distributed
options(repr.plot.res = 200, repr.plot.width = 7, repr.plot.height = 7)
plot_num(Auto_1)
## Detailed description of each column, including its lowest, highest and most
## frequent values
describe(Auto_1)
## Summary statistics for all variables
summary(Auto_1)
## First six rows of the dataset, shown as a numeric matrix
head(data.matrix(Auto_1), n = 6L)
## Pairwise scatter plots: one panel for each pair of variables, to reveal
## relationships between them
options(repr.plot.res = 200, repr.plot.width = 10, repr.plot.height = 10)
pairs(Auto_1, main = "Pairwise plot")
## Drop the 'name' column before exploring further: it identifies the car and
## does not influence any other variable in the dataset. The column is selected
## by name rather than by position, so the step stays correct if the column
## order ever changes.
Auto_C <- Auto_1[, setdiff(names(Auto_1), "name"), drop = FALSE]
## Correlation chart matrix, combined with histograms and scatter plots of the
## different features. Pearson correlation values appear in the upper triangle.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
## The stray empty argument of the original call has been removed; it matched
## the optional 'method' parameter and left it at its default, so the chart is
## unchanged.
chart.Correlation(Auto_C, histogram = TRUE, pch = 15, density = TRUE)
## Correlation matrix of the cleaned columns; both corrplot figures below use it
mod_cor <- cor(Auto_C)
## Figure width, height and resolution for the first correlation plot
options(repr.plot.res = 200, repr.plot.width = 5, repr.plot.height = 5)
## Lower-triangular correlation plot: circle colour and size encode the
## strength of each correlation
corrplot(mod_cor, type = "lower", method = "circle")
## Full correlation matrix drawn as a coloured heatmap
options(repr.plot.res = 200, repr.plot.width = 5, repr.plot.height = 5)
corrplot(mod_cor, method = "color")
## Heatmap of the observations after standardising every column
## (mean = 0, sd = 1); centring and scaling are the defaults of scale()
options(repr.plot.res = 200, repr.plot.width = 7, repr.plot.height = 7)
Auto_S <- as.data.frame(scale(Auto_C))
heatmap.2(
  as.matrix(Auto_S),
  col = bluered(100),
  scale = "none",
  trace = "none",
  density.info = "none"
)
## Summary statistics of the dataset without the 'name' variable
summary(Auto_C)
# Violin plots with embedded boxplots: mpg vs. origin.
# The grouping factor is built from Auto$origin and plotted against Auto_C,
# which holds the same rows in the same order (only a column was removed).
# Levels are given explicitly so each label is tied to its code instead of
# depending on which codes happen to be present.
origin_C <- factor(
  Auto$origin,
  levels = 1:3,
  labels = c("American", "European", "Japanese")
)
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
# ggplot() replaces the deprecated qplot() shortcut; the layers are the same.
ggplot(Auto_C, aes(x = origin_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "blue") +
  xlab("Different Origin")
# Violin plots with embedded boxplots: mpg vs. number of cylinders
cylinders_C <- factor(
  Auto$cylinders,
  levels = c(3, 4, 5, 6, 8),
  labels = c("c_3", "c_4", "c_5", "c_6", "c_8")
)
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
ggplot(Auto_C, aes(x = cylinders_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "red") +
  xlab("Number of cylinders")
# Horsepower at a glance, in three side-by-side panels: a boxplot, a histogram
# with its density curve, and a histogram on the log scale
par(mfrow = c(1, 3))
# Kernel density estimate, overlaid on the second panel below
dens <- density(Auto_C$horsepower)
options(repr.plot.width = 12, repr.plot.height = 4, repr.plot.res = 300)
boxplot(Auto_C$horsepower, main = "Detecting horsepower outliers",
        xlab = "horsepower ", ylab = "value")
# probability = TRUE puts the histogram on the density scale so the curve
# added by lines(dens) is comparable with the bars
hist(Auto_C$horsepower, breaks = 15, xlab = "horsepower distribution ",
     main = "Histogram of horsepower", probability = TRUE)
lines(dens)
hist(log(Auto_C$horsepower), breaks = 15, xlab = "horsepower log distribution ",
     main = "Histogram of horsepower in log scale")
## Identify the horsepower outliers using the boxplot rule
outl_hp <- boxplot.stats(Auto_C$horsepower)$out       # the outlying values
outl_hpw <- which(Auto_C$horsepower %in% outl_hp)     # their row positions
outl_hp
## Replace exactly the detected outliers with the median horsepower. The rows
## come from outl_hpw rather than from a hard-coded cut-off, so the replacement
## always matches what boxplot.stats() reported. An empty index leaves the data
## untouched. The median is evaluated before the assignment, i.e. on the
## original values.
Auto_C[outl_hpw, "horsepower"] <- median(Auto_C$horsepower)
# Cleaned Auto dataset
Auto_C
## Relationship between mpg and model year: the plots show that performance
## (mpg) increases as the years go by.
par(mfrow = c(1, 2))
options(repr.plot.res = 200, repr.plot.width = 14, repr.plot.height = 5)
# Left panel: scatter plot of mpg against year
plot(mpg ~ year, data = Auto_C, xlab = "year", ylab = "Performance in mpg")
# Overlay a lowess trend line on the scatter plot to confirm the pattern
lines(lowess(Auto_C$year, Auto_C$mpg), col = "red", lwd = 2)
# Right panel: one box per year, showing how the mpg range shifts over time
boxplot(mpg ~ year, data = Auto_C, xlab = "Year", ylab = "Performance in mpg")
################## weight variable: original scale vs. log scale ##################
par(mfrow = c(1, 2))
options(repr.plot.res = 200, repr.plot.width = 14, repr.plot.height = 5)
# Left panel: mpg against weight on the original scale, with a lowess trend line
plot(mpg ~ weight, data = Auto_C, xlab = "weight", ylab = "Performance in mpg")
lines(lowess(Auto_C$weight, Auto_C$mpg), col = "red", lwd = 2)
# Right panel: the same relationship after taking logs of both variables
plot(log(mpg) ~ log(weight), data = Auto_C,
     xlab = "log of weight", ylab = "log of mpg")
lines(lowess(log(Auto_C$weight), log(Auto_C$mpg)), col = "red", lwd = 2)
###################################################################################################
## mpg by origin, comparing two model years side by side
options(repr.plot.width = 5, repr.plot.height = 4, repr.plot.res = 300)
# The two model years to compare (two-digit values, as stored in 'year')
compare_years <- c(70, 82)
# %in% keeps every row from either year. `year == c(70, 82)` would recycle the
# vector along the rows and silently drop about half of the matching cars.
fil_Auto_C1 <- dplyr::filter(Auto_C, year %in% compare_years)
# Explicit levels tie each label to its code, so the labelling cannot shift or
# fail when a code is missing from the filtered rows.
fil_Auto_C1$origin <- factor(
  fil_Auto_C1$origin,
  levels = 1:3,
  labels = c("American", "European", "Japanese")
)
# Year labels are derived from the filter values, so the legend always names
# the years that were actually selected.
fil_Auto_C1$year <- factor(
  fil_Auto_C1$year,
  levels = compare_years,
  labels = paste0("19", compare_years)
)
# Dodged violins per origin, filled by year, with a narrow boxplot for each
comb_plot <- ggplot(fil_Auto_C1, aes(x = origin, y = mpg, fill = year)) +
  geom_violin(position = position_dodge(0.8), alpha = 0.5) +
  geom_boxplot(aes(color = year), width = 0.07, position = position_dodge(0.6)) +
  xlab("Origin") +
  ylab("mpg")
comb_plot
######################### Scaling ###################################################################
## Before starting linear regression, normalise the Auto_C data to a min-max
## scale: each column becomes (x - min) / (max - min).
## The per-column extremes get their own names so that base::max and base::min
## are not shadowed. vapply() walks the columns directly and guarantees one
## numeric value per column.
col_max <- vapply(Auto_C, max, numeric(1))
col_min <- vapply(Auto_C, min, numeric(1))
Auto_S <- as.data.frame(
  scale(Auto_C, center = col_min, scale = col_max - col_min)
)
## Quick check of the rescaled mpg column
mean(Auto_S$mpg)
sd(Auto_S$mpg)
## Submit the cleaned dataset as an *.RData file.
## save() writes R's own serialised format, which load() can read back.
## write.table() would only produce a plain-text table under an .Rdata name.
save(Auto_S, file = "Auto_S.Rdata")
# To restore the object later: load("Auto_S.Rdata")
## end ##