-
Notifications
You must be signed in to change notification settings - Fork 0
/
week1_assi1_sol3.r
230 lines (175 loc) · 8.83 KB
/
week1_assi1_sol3.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# -*- coding: utf-8 -*-
# """Week1 Assi1 Sol3.ipynb
#
# Automatically generated by Colaboratory.
#
# Original file is located at
# https://colab.research.google.com/drive/1BG-9PZCVBVll3X94DMFrw0I27Wa5VDD4
# """
###########################################################################
## Week-1, Homework-1, Sol-3
## Sreya Dhar
## Created: Sept 4, 2020
## Edited: Sept 14, 2020
###########################################################################
rm(list = ls()) ## clearing working environment
# Set working directory to where csv file is located
setwd("C:/File E/EAS 506 Statistical Mining I/Week 1")
## installing all the libaries in R kernel
# install.packages("MASS")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("hrbrthemes")
## importing the libraries in R kernel
library(MASS)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling)
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
library(ggplot2)
library(reshape2)
library(hrbrthemes)
library(gplots)
## overall view of Auto dataset
glimpse(Boston)
## Display several statistical parameters including datatype and unique values in variables
status(Boston)
## describing the metric table of the variables which includes range, mean, standard deviation and variation
profiling_num(Boston)
## """Qs. 3 (a) Make pairwise scatterplots of the predictors"""
## pair wise plot :: between any two variable to show the correlation between variables
options(repr.plot.width=10, repr.plot.height=10, repr.plot.res = 300)
pairs(Boston)
## plotting the correlation values on chart matrix which also combined with histogram and scatter plots of different features.
options(repr.plot.width=10, repr.plot.height=10, repr.plot.res = 300)
chart.Correlation(Boston, histogram=TRUE, pch=15)
## overall summary of the data with several statistical parameters
summary(Boston)
## detecting the outliers from medv variable
options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 200)
boxplot(Boston$medv, main="Detecting medv outliers", xlab="medv", ylab="value")
## identifying the outliers from medv variable
out_medv <- boxplot.stats(Boston$medv)$out ## outliers values
outl_medv <- which(Boston$medv %in% c(out_medv)) ## no. of observations (index:: row)
out_medv
## replacing the outliers by the median value of horsepower
Boston[Boston$medv >=38.7, "medv"] <- mean(Boston$medv)
Boston
## Showing the correlation plots in lower triangular matrix :: intensities can be visualize by color range variation
L <- cor(Boston)
corrplot(L, method = "circle", type = "lower")
# rquery.cormat(Boston, type="flatten", graph=FALSE)
options(repr.plot.width=6, repr.plot.height=6, repr.plot.res = 200)
Boston_h <- as.data.frame(scale(Boston,center=TRUE,scale=TRUE))
heatmap.2(as.matrix(Boston_h), scale = "none", col = bluered(100), trace = "none", density.info = "none")
options(repr.plot.width=7, repr.plot.height=7, repr.plot.res = 200)
plot_num(Boston)
## Visualizing how diferent predictors in Boston dataset has been correlated with each other
Boston_N = Boston
Boston_N$chas <- factor(Boston$chas, labels = c("far river", "near river"))
Boston_N$rad <- cut(Boston_N$rad, c(0, 15, 200), labels=c("near HW", "far HW"))
Boston_N$tax <- cut(Boston_N$tax, c(0, 550, 100000), labels=c("lowTax", "highTax"))
Boston_N
options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 230)
xyplot(crim ~black| tax*rad, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
xyplot(crim ~medv| tax*rad, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
# xyplot(crim ~ptratio | tax*rad, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
xyplot(dis ~nox | rad*tax, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
xyplot(lstat ~ age | rad*tax, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
## """Qs 3 (b) Are any of the predictors associated with per capita crime rate?"""
# convert objects into molten dataframe with crime data, variable and respective values
crimrate <- melt(Boston, id="crim") # convert objects into molten dataframe.
# plot each feature against crim rate to visualise the relation
ggplot(data=crimrate, aes(x=value, y=crim))+
facet_wrap(vars(variable), scales="free", nrow = 4)+
geom_point(col='red')+ theme_bw()+
ggtitle("Plot of crime rate with other variables") +
xlab("Values") + ylab("Crime rate")
## min-max scaling on boston dataset prior to regression
max <- apply(Boston , 2 , max)
min <- apply(Boston, 2 , min)
Boston_S <- as.data.frame(scale(Boston, center = min, scale = max - min))
## regression model on scaled boston dataset with all the predictors
model_1 = lm(crim~., data = Boston_S)
summary(model_1)
options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 200)
par(mfrow = c(2,2))
plot(model_1,)
## regression model on scaled boston dataset with the significant predictors
Boston_S_mod <- Boston_S[, -c(3,4,6,7,10,11,13)]
Boston_S_mod
model_2 = lm(crim~., data = Boston_S_mod)
summary(model_2)
## regression model on scaled boston dataset with the interaction
model_3 = lm(crim~.+dis:rad+dis:medv+rad:medv+rad:zn, data = Boston_S_mod)
summary(model_3)
options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 200)
par(mfrow = c(2,2))
plot(model_3,)
## removing influencial outliers
Boston_S_mod1 <- Boston_S_mod[-c(381,419,406,411),]
dim(Boston_S_mod1)
## regression model after removing ouliers with interactions
model_4 = lm(crim~.+dis:rad+dis:medv+rad:medv+rad:zn, data = Boston_S_mod1)
summary(model_4)
options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 230)
par(mfrow = c(2,2))
plot(model_4,)
## """3 (c) Do any of the suburbs of Boston appear to have particularly high crime rates?
## Tax rates? Pupil-teacher ratios?
## """
## Relationship between Crime rates and zonal area
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(crim ~ zn, data = Boston, xlab = "Zone", ylab = "Crime Rate")
boxplot(crim ~ zn, data = Boston, xlab = "Zone", ylab = "Crime Rate (zoomed)", xlim=c(0,6))
boxplot(log10(crim) ~ (zn), data = Boston, xlab = "Zone ", ylab = "Crime Rate (log scale)")
## Relationship between Crime rates and zonal area
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(tax ~ zn, data = Boston, xlab = "Zone", ylab = "Tax")
boxplot(tax ~ zn, data = Boston, xlab = "Zone", ylab = "Tax (zoomed)", xlim=c(0,6))
boxplot(log10(tax) ~ (zn), data = Boston, xlab = "Zone ", ylab = "Tax (log scale)")
## Relationship between Crime rates and zonal area << extra >>
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(ptratio ~ zn, data = Boston, xlab = "Zone", ylab = "pupil-teacher ratio")
boxplot(ptratio ~ zn, data = Boston, xlab = "Zone", ylab = "pupil-teacher ratio (zoomed)", xlim=c(0,6))
boxplot(log10(ptratio) ~ (zn), data = Boston, xlab = "Zone ", ylab = "pupil-teacher ratio (log scale)")
## Relationship between tax and zonal area << extra >>
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(crim ~ tax, data = Boston, xlab = "Tax", ylab = "Crime Rate")
boxplot(crim ~ tax, data = Boston, xlab = "Tax", ylab = "Crime Rate (zoomed)", xlim=c(60,66))
boxplot(log10(crim) ~ (tax), data = Boston, xlab = "Tax ", ylab = "Crime Rate (log scale)")
## Relationship between ptratio and zonal area
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(crim ~ ptratio, data = Boston, xlab = "pupil-teacher ratio", ylab = "Crime Rate" )
boxplot(crim ~ ptratio, data = Boston, xlab = "pupil-teacher ratio", ylab = "Crime Rate (zoomed)", xlim= c(40,45))
boxplot(log10(crim) ~ (ptratio), data = Boston, xlab = "pupil-teacher ratio ", ylab = "Crime Rate (log scale)" )
## Comment on the range of each predictor.
data.frame(min=sapply(Boston,min),max=sapply(Boston,max)
, range=(sapply(Boston, max)- sapply(Boston, min)), sd=sapply(Boston,sd))
## """3 (d) In this data set, how many of the suburbs average more than seen rooms per
## dwelling? More than eight rooms per dwelling?
## """
par(mfrow = c(1,1))
options(repr.plot.width=8, repr.plot.height=5, repr.plot.res = 200)
boxplot(rm ~ zn, data = Boston, xlab = "Zone", ylab = "Rooms" )
abline(7,0, type="l", lty=2, lwd=1, col='red')
abline(8,0, type="l", lty=2, lwd=1, col='blue')
rm_7 <- filter(Boston, rm > 7 )
nrow(rm_7)
rm_8 <- filter(Boston, rm > 8 )
nrow(rm_8)
rm_8$zn
sapply(rm_8, function(x) length(unique(x)))
## end ##