week1_assi1_sol3.r

# -*- coding: utf-8 -*-
# """Week1 Assi1 Sol3.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1BG-9PZCVBVll3X94DMFrw0I27Wa5VDD4
# """

###########################################################################
## Week-1, Homework-1, Sol-3
## Sreya Dhar 
## Created: Sept 4, 2020
## Edited: Sept 14, 2020
###########################################################################

rm(list = ls()) ## clearing working environment

# Set working directory to where csv file is located
setwd("C:/File E/EAS 506 Statistical Mining I/Week 1")

## installing all the libaries in R kernel

# install.packages("MASS")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("hrbrthemes")

## importing the libraries in R kernel
library(MASS)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling) 
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
library(ggplot2)
library(reshape2)
library(hrbrthemes)
library(gplots)

## overall view of Auto dataset
glimpse(Boston)

## Display several statistical parameters including datatype and unique values in variables 
status(Boston)

## describing the metric table of the variables which includes range, mean, standard deviation and variation
profiling_num(Boston)

## """Qs. 3 (a) Make pairwise scatterplots of the predictors"""

## pair wise plot :: between any two variable to show the correlation between variables
options(repr.plot.width=10, repr.plot.height=10, repr.plot.res = 300)
pairs(Boston)

## plotting the correlation values on chart matrix which also combined with histogram and scatter plots of different features.
options(repr.plot.width=10, repr.plot.height=10, repr.plot.res = 300)
chart.Correlation(Boston, histogram=TRUE, pch=15)

## overall summary of the data with several statistical parameters 
summary(Boston)

## detecting the outliers from medv variable
options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 200)
boxplot(Boston$medv, main="Detecting medv outliers", xlab="medv", ylab="value")

## identifying the outliers from medv variable
out_medv <- boxplot.stats(Boston$medv)$out ## outliers values
outl_medv <- which(Boston$medv %in% c(out_medv)) ## no. of observations (index:: row)
out_medv

## replacing the outliers by the median value of horsepower 
Boston[Boston$medv >=38.7, "medv"] <- mean(Boston$medv)
Boston


## Showing the correlation plots in lower triangular matrix :: intensities can be visualize by color range variation 
L <- cor(Boston)
corrplot(L, method = "circle",  type = "lower")

# rquery.cormat(Boston, type="flatten", graph=FALSE)

options(repr.plot.width=6, repr.plot.height=6, repr.plot.res = 200)
Boston_h <- as.data.frame(scale(Boston,center=TRUE,scale=TRUE))
heatmap.2(as.matrix(Boston_h), scale = "none", col = bluered(100), trace = "none", density.info = "none")

options(repr.plot.width=7, repr.plot.height=7, repr.plot.res = 200)
plot_num(Boston)

## Visualizing how diferent predictors in Boston dataset has been correlated with each other
Boston_N = Boston
Boston_N$chas <- factor(Boston$chas, labels = c("far river", "near river"))
Boston_N$rad <- cut(Boston_N$rad, c(0, 15, 200), labels=c("near HW", "far HW"))
Boston_N$tax <- cut(Boston_N$tax, c(0, 550, 100000), labels=c("lowTax", "highTax"))

Boston_N

options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 230)
xyplot(crim ~black| tax*rad, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
xyplot(crim ~medv| tax*rad, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
# xyplot(crim ~ptratio | tax*rad, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
xyplot(dis ~nox | rad*tax, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))
xyplot(lstat ~ age | rad*tax, data=Boston_N, type = c("p", "smooth"), groups = chas, auto.key = list(columns=2))


## """Qs 3 (b) Are any of the predictors associated with per capita crime rate?"""

# convert objects into molten dataframe with crime data, variable and respective values
crimrate <- melt(Boston, id="crim") # convert objects into molten dataframe.

# plot each feature against crim rate to visualise the relation
ggplot(data=crimrate, aes(x=value, y=crim))+ 
facet_wrap(vars(variable), scales="free", nrow = 4)+ 
geom_point(col='red')+ theme_bw()+ 
ggtitle("Plot of crime rate with other variables") +
xlab("Values") + ylab("Crime rate")

## min-max scaling on boston dataset prior to regression
max <- apply(Boston , 2 , max)
min <- apply(Boston, 2 , min)
Boston_S <- as.data.frame(scale(Boston, center = min, scale = max - min))

## regression model on scaled boston dataset with all the predictors 
model_1 = lm(crim~., data = Boston_S)
summary(model_1)

options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 200)
par(mfrow = c(2,2))
plot(model_1,)

## regression model on scaled boston dataset with the significant predictors 
Boston_S_mod <- Boston_S[, -c(3,4,6,7,10,11,13)]

Boston_S_mod

model_2 = lm(crim~., data = Boston_S_mod)
summary(model_2)

## regression model on scaled boston dataset with the interaction
model_3 = lm(crim~.+dis:rad+dis:medv+rad:medv+rad:zn, data = Boston_S_mod)
summary(model_3)

options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 200)
par(mfrow = c(2,2))
plot(model_3,)

## removing influencial outliers 
Boston_S_mod1 <- Boston_S_mod[-c(381,419,406,411),]
dim(Boston_S_mod1)

## regression model after removing ouliers with interactions
model_4 = lm(crim~.+dis:rad+dis:medv+rad:medv+rad:zn, data = Boston_S_mod1)
summary(model_4)

options(repr.plot.width=5, repr.plot.height=5, repr.plot.res = 230)
par(mfrow = c(2,2))
plot(model_4,)

## """3 (c) Do any of the suburbs of Boston appear to have particularly high crime rates?
## Tax rates? Pupil-teacher ratios?
## """

## Relationship between Crime rates and zonal area
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(crim ~ zn, data = Boston, xlab = "Zone", ylab = "Crime Rate")
boxplot(crim ~ zn, data = Boston, xlab = "Zone", ylab = "Crime Rate (zoomed)", xlim=c(0,6))
boxplot(log10(crim) ~ (zn), data = Boston, xlab = "Zone ", ylab = "Crime Rate (log scale)")

## Relationship between Crime rates and zonal area
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(tax ~ zn, data = Boston, xlab = "Zone", ylab = "Tax")
boxplot(tax ~ zn, data = Boston, xlab = "Zone", ylab = "Tax (zoomed)", xlim=c(0,6))
boxplot(log10(tax) ~ (zn), data = Boston, xlab = "Zone ", ylab = "Tax (log scale)")

## Relationship between Crime rates and zonal area << extra >>
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(ptratio ~ zn, data = Boston, xlab = "Zone", ylab = "pupil-teacher ratio")
boxplot(ptratio ~ zn, data = Boston, xlab = "Zone", ylab = "pupil-teacher ratio (zoomed)", xlim=c(0,6))
boxplot(log10(ptratio) ~ (zn), data = Boston, xlab = "Zone ", ylab = "pupil-teacher ratio (log scale)")

## Relationship between tax and zonal area << extra >>
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(crim ~ tax, data = Boston, xlab = "Tax", ylab = "Crime Rate")
boxplot(crim ~ tax, data = Boston, xlab = "Tax", ylab = "Crime Rate (zoomed)", xlim=c(60,66))
boxplot(log10(crim) ~ (tax), data = Boston, xlab = "Tax ", ylab = "Crime Rate (log scale)")

## Relationship between ptratio and zonal area
options(repr.plot.width=12, repr.plot.height=4, repr.plot.res = 230)
par(mfrow = c(1,3))
boxplot(crim ~ ptratio, data = Boston, xlab = "pupil-teacher ratio", ylab = "Crime Rate" )
boxplot(crim ~ ptratio, data = Boston, xlab = "pupil-teacher ratio", ylab = "Crime Rate (zoomed)", xlim= c(40,45))
boxplot(log10(crim) ~ (ptratio), data = Boston, xlab = "pupil-teacher ratio ", ylab = "Crime Rate (log scale)" )

## Comment on the range of each predictor.

data.frame(min=sapply(Boston,min),max=sapply(Boston,max)
, range=(sapply(Boston, max)- sapply(Boston, min)), sd=sapply(Boston,sd))

## """3 (d) In this data set, how many of the suburbs average more than seen rooms per
## dwelling? More than eight rooms per dwelling?
## """
par(mfrow = c(1,1))
options(repr.plot.width=8, repr.plot.height=5, repr.plot.res = 200)
boxplot(rm ~ zn, data = Boston, xlab = "Zone", ylab = "Rooms" )
abline(7,0, type="l", lty=2, lwd=1, col='red')
abline(8,0, type="l", lty=2, lwd=1, col='blue')

rm_7 <- filter(Boston, rm > 7 )
nrow(rm_7)

rm_8 <- filter(Boston, rm > 8 )
nrow(rm_8)

rm_8$zn

sapply(rm_8, function(x) length(unique(x)))



## end ##