# week1_assi1_sol1.R

# -*- coding: utf-8 -*-
# """Week1 Assi1 Sol1.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1Mv7zC7skLGUl1kE-OjcXCIuAz95xwyFT
# """

###########################################################################
## Week-1, Homework-1, Sol-1
## Sreya Dhar 
## Created: Sept 4, 2020
## Edited: Sept 14, 2020
###########################################################################

# NOTE: the former `rm(list = ls())` call was dropped on purpose. It wipes the
# caller's global environment whenever this script is sourced, yet it does not
# detach packages or reset options, so it never gave a truly clean session.
# Restart R instead when a clean session is required.

# Machine-specific folder used as working directory (relative paths such as
# the output file written at the end of this script are resolved against it).
# Only switch to it when it exists, so the script keeps running from the
# current working directory on any other machine instead of aborting.
work_dir <- "C:/File E/EAS 506 Statistical Mining I/Week 1"
if (dir.exists(work_dir)) {
  setwd(work_dir)
}

## installing all the libraries in the R kernel

# install.packages("ISLR")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("repr")

## importing the libraries in R kernel

library(ISLR)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling) 
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
library(gplots)

## """# Exploratory Data Analysis on 'Auto' Dataset"""

# Column names of the Auto data set
names(Auto)

## Compact structural overview (one line per column)
glimpse(Auto)

## Per-column status table produced by funModeling::status()
status(Auto)

# Turn every factor column (in practice: 'name') into its numeric codes
Auto_1 <- mutate_if(Auto, is.factor, as.numeric)


## Numeric profiling table produced by funModeling::profiling_num()
profiling_num(Auto_1)

## Histogram of every numeric column to inspect the distributions
options(
  repr.plot.width = 7,
  repr.plot.height = 7,
  repr.plot.res = 200
)
plot_num(Auto_1)

## Detailed per-variable description of the data set (describe())
describe(Auto_1)

## Base-R summary statistics for every column
summary(Auto_1)

## First six rows of the data, shown as a numeric matrix
Auto_1 %>%
  data.matrix() %>%
  head()

## Pairwise scatter-plot matrix: every variable plotted against every other
## one to get a first impression of the relationships between them.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
pairs(Auto_1, main = "Pairwise plot")

## Drop the 'name' variable for the remaining analysis. The column is selected
## by name instead of by position (it used to be removed as column 9), so the
## right column is dropped even if the column order ever changes.
Auto_C <- Auto_1[, setdiff(names(Auto_1), "name"), drop = FALSE]


## Correlation chart matrix combining correlation values, histograms and
## scatter plots of the remaining variables. No `method` is given, so the
## function's default correlation method is used. The stray empty argument
## that used to sit between `pch` and `density` has been removed.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
chart.Correlation(Auto_C, histogram = TRUE, pch = 15, density = TRUE)


## Figure size (width, height) and resolution for the next plot
options(
  repr.plot.width = 5,
  repr.plot.height = 5,
  repr.plot.res = 200
)

## Correlation matrix drawn as circles, lower triangle only; colour encodes
## the value of each correlation
mod_cor <- Auto_C %>% cor()
corrplot(
  mod_cor,
  method = "circle",
  type = "lower"
)

## Same correlation matrix drawn as a coloured tile map
options(
  repr.plot.width = 5,
  repr.plot.height = 5,
  repr.plot.res = 200
)
corrplot(mod_cor, method = "color")

## Clustered heat map of the standardised data (each column centred and
## scaled via scale() before plotting)
options(
  repr.plot.width = 7,
  repr.plot.height = 7,
  repr.plot.res = 200
)
Auto_S <- Auto_C %>%
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame()
heatmap.2(
  as.matrix(Auto_S),
  scale = "none",
  col = bluered(100),
  trace = "none",
  density.info = "none"
)

## Summary statistics of the data set without the 'name' variable
summary(Auto_C)

# Violin plot (with a narrow box plot drawn inside) of mpg by origin.
# ggplot() is used instead of qplot(), which is deprecated in recent ggplot2
# releases. Levels are spelled out so each label is tied to a fixed code.
origin_C <- factor(
  Auto$origin,
  levels = 1:3,
  labels = c("American", "European", "Japanese")
)
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
ggplot(Auto_C, aes(x = origin_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "blue") +
  xlab("Different Origin")

# Violin plot (with a narrow box plot drawn inside) of mpg by number of
# cylinders.
cylinders_C <- factor(
  Auto$cylinders,
  levels = c(3, 4, 5, 6, 8),
  labels = c("c_3", "c_4", "c_5", "c_6", "c_8")
)
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 230)
ggplot(Auto_C, aes(x = cylinders_C, y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "red") +
  xlab("Number of cylinders")

# Horsepower at a glance, three panels side by side:
#   1. box plot (to spot outliers),
#   2. density-scaled histogram with a kernel density curve on top,
#   3. histogram of log(horsepower).
par(mfrow = c(1, 3))
dens <- density(Auto_C$horsepower)
options(repr.plot.width = 12, repr.plot.height = 4, repr.plot.res = 300)
boxplot(Auto_C$horsepower, main = "Detecting horsepower outliers",
        xlab = "horsepower ", ylab = "value")
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
# The histogram must be on the density scale for the curve below to line up.
hist(Auto_C$horsepower, breaks = 15, xlab = "horsepower distribution ",
     main = "Histogram of horsepower", probability = TRUE)
lines(dens)
hist(log(Auto_C$horsepower), breaks = 15, xlab = "horsepower log distribution ",
     main = "Histogram of horsepower in log scale")


## Identify the horsepower outliers with the box-plot rule (boxplot.stats)
outl_hp <- boxplot.stats(Auto_C$horsepower)$out ## outlier values
outl_hpw <- which(Auto_C$horsepower %in% outl_hp) ## row indices of outliers
outl_hp

## Replace exactly the detected outliers by the median horsepower. The rows
## come from the detection step above instead of a hard-coded `>= 200`
## threshold, so detection and replacement can no longer disagree. The median
## is evaluated on the data as it is before any value is replaced.
if (length(outl_hpw) > 0) {
  Auto_C[outl_hpw, "horsepower"] <- median(Auto_C$horsepower)
}

# Cleaned Auto data set
Auto_C

## mpg against model year: scatter plot with a smoothed trend line next to
## one box plot of mpg per year.
par(mfrow = c(1, 2))
options(
  repr.plot.width = 14,
  repr.plot.height = 5,
  repr.plot.res = 200
)
# Left panel: raw points
plot(
  mpg ~ year,
  data = Auto_C,
  xlab = "year",
  ylab = "Performance in mpg"
)
# LOWESS smoother drawn over the points to make the trend visible
with(Auto_C, lines(lowess(year, mpg), col = "red", lwd = 2))
# Right panel: distribution of mpg within each year
boxplot(
  mpg ~ year,
  data = Auto_C,
  xlab = "Year",
  ylab = "Performance in mpg"
)

################## exploring weight variable from normal scale to logscale ########################
par(mfrow = c(1, 2))
options(
  repr.plot.width = 14,
  repr.plot.height = 5,
  repr.plot.res = 200
)
# Left panel: mpg against weight on the original scale, with LOWESS smoother
plot(
  mpg ~ weight,
  data = Auto_C,
  xlab = "weight",
  ylab = "Performance in mpg"
)
with(Auto_C, lines(lowess(weight, mpg), col = "red", lwd = 2))
# Right panel: the same relationship on the log-log scale
plot(
  log(mpg) ~ log(weight),
  data = Auto_C,
  xlab = "log of weight",
  ylab = "log of mpg"
)
with(Auto_C, lines(lowess(log(weight), log(mpg)), col = "red", lwd = 2))

###################################################################################################

options(repr.plot.width = 5, repr.plot.height = 4, repr.plot.res = 300)

## Keep the cars of model years 70 and 82 only. `%in%` is required here:
## `year == c(70, 82)` recycles the two values along the rows (row 1 is
## compared with 70, row 2 with 82, row 3 with 70, ...) and therefore silently
## drops about half of the matching cars.
fil_Auto_C1 <- filter(Auto_C, year %in% c(70, 82))

## Readable factor labels. Levels are given explicitly so that every label is
## tied to its code even if one category is missing from the subset; the year
## labels now match the filtered values (70 -> "1970", 82 -> "1982").
fil_Auto_C1$origin <- factor(
  fil_Auto_C1$origin,
  levels = 1:3,
  labels = c("American", "European", "Japanese")
)
fil_Auto_C1$year <- factor(
  fil_Auto_C1$year,
  levels = c(70, 82),
  labels = c("1970", "1982")
)

## mpg by origin, split by year: violins with narrow box plots on top.
## (The former `mutate(day = ...)` steps created a column that was never used
## by the plot and have been removed.)
comb_plot <- ggplot(fil_Auto_C1, aes(x = origin, y = mpg, fill = year)) +
  geom_violin(position = position_dodge(0.8), alpha = 0.5) +
  geom_boxplot(aes(color = year), width = 0.07, position = position_dodge(0.6)) +
  xlab("Origin") +
  ylab("mpg")

comb_plot

######################### Scaling ###################################################################
## Min-max normalisation of Auto_C: every column is mapped onto [0, 1] via
## (x - min) / (max - min).
##
## The per-column extremes are stored as `col_min` / `col_max` rather than in
## variables called `min` / `max`, which would shadow the base functions of
## the same name. vapply() works column by column on the data frame (no
## matrix coercion as with apply()) and guarantees one number per column.
col_min <- vapply(Auto_C, min, numeric(1))
col_max <- vapply(Auto_C, max, numeric(1))
Auto_S <- as.data.frame(
  scale(Auto_C, center = col_min, scale = col_max - col_min)
)

## Quick check of the rescaled mpg column
mean(Auto_S$mpg)
sd(Auto_S$mpg)


## """Submit the cleaned dataset as an *.RData file."""

# Store the scaled data frame in R's own serialised format. write.table()
# would put a plain-text table into a file that merely carries the .Rdata
# extension, and load() cannot read such a file.
save(Auto_S, file = "Auto_S.Rdata")

# To restore the object `Auto_S` in a later session:
# load("Auto_S.Rdata")

## end ##