Daniel Carpenter & Sonaxy Mohanty October 2022
- Packages
- General Data Prep
- Exploratory Data Analysis
- 1 (a) - OLS Model
- 1 (b) - PLS Model
- 1 (c) - LASSO Model
- 1 (d) - Model Variants
- Summary Table of Model Performance
- References
# Data Wrangling
library(tidyverse)
# Modeling
library(MASS)
library(caret) # Modeling variants like SVM
library(earth) # Modeling with Mars
library(pls) #Modeling with PLS
library(glmnet) #Modeling with LASSO
# Aesthetics
library(knitr)
library(cowplot) # multiple ggplots on one plot with plot_grid()
library(scales)
library(kableExtra)
library(ggplot2)
#Hold-out Validation
library(caTools)
#Data Correlation
library(GGally)
library(regclass)
#RMSE Calculation
library(Metrics)
#p-value for OLS model
library(broom)
#ncvTest
library(car)
For general data preparation, please see the conceptual steps below. See the .rmd file for detailed code.
# Convert all character data to factor
hd <- read.csv('housingData.csv', stringsAsFactors = TRUE) %>%
# Create new variables: age, ageSinceRemodel, and ageofGarage
dplyr::mutate(age = YrSold - YearBuilt,
ageSinceRemodel = YrSold - YearRemodAdd,
ageofGarage = ifelse(is.na(GarageYrBlt), age, YrSold - GarageYrBlt)) %>%
# remove some columns used in the above calculations
dplyr::select(!c(Id,YrSold ,
MoSold, YearBuilt, YearRemodAdd))
Make a data set of the numeric variables, hd.numericRaw:
hd.numericRaw <- hd %>%
#selecting all the numeric data
dplyr::select_if(is.numeric) %>%
#converting the data frame to tibble
as_tibble()
Make a data set of the factor variables, hd.factorRaw:
hd.factorRaw <- hd %>%
#selecting all the factor data
dplyr::select_if(is.factor) %>%
#converting the data frame to tibble
as_tibble()
For each column with missing data, impute missing values with PMM:

- Imputation completed with our created function imputeWithPMM()
- Applies the function to columns with missing data via dynamic dplyr logic
- Note: see the seeImputation() function from prior homework 4 to visualize the imputation (not shown here for simplicity in viewing)
# Create function to impute via `PMM`
imputeWithPMM <- function(colWithMissingData) {
# Using the mice package
suppressMessages(library(mice))
# Discover the missing rows
isMissing <- is.na(colWithMissingData)
# Create data frame to pass to PMM imputation function from the mice package
df <- data.frame(x = rexp(length(colWithMissingData)), # meaningless x to help show variation
y = colWithMissingData,
missing = isMissing)
# imputation by PMM
df[isMissing, "y"] <- mice.impute.pmm( df$y,
!df$missing,
df$x)
return(df$y)
}
# Apply `PMM` function to numeric data containing null values
# Data to store imputed values with PMM method
hd.Imputed <- hd
# Which columns have NA's?
colNamesWithNulls <- colnames(hd.numericRaw[ , colSums(is.na(hd.numericRaw)) != 0])
colNamesWithNulls
## [1] "LotFrontage" "MasVnrArea" "GarageYrBlt"
numberOfColsWithNulls = length(colNamesWithNulls)
# For each of the numeric columns with null values
for (colWithNullsNum in 1:numberOfColsWithNulls) {
# The name of the column with null values
nameOfThisColumn <- colNamesWithNulls[colWithNullsNum]
# Get the actual data of the column with nulls
colWithNulls <- hd[, nameOfThisColumn]
# Impute the missing values with PMM
imputedValues <- imputeWithPMM(colWithNulls)
# Now store the data in the original new frame
hd.Imputed[, nameOfThisColumn] <- imputedValues
# Save a visualization of the imputation
pmmVisual <- seeImputation(data.frame(y = colWithNulls),
data.frame(y = imputedValues),
nameOfThisColumn )
fileToSave = paste0('OutputPMM/Imputation_With_PMM_', nameOfThisColumn, '.pdf')
print(paste0('For imputation results of ', nameOfThisColumn, ', see ', fileToSave))
dir.create("OutputPMM/")
ggsave(pmmVisual, filename = fileToSave,
height = 11, width = 8.5)
}
## [1] "For imputation results of LotFrontage, see OutputPMM/Imputation_With_PMM_LotFrontage.pdf"
## [1] "For imputation results of MasVnrArea, see OutputPMM/Imputation_With_PMM_MasVnrArea.pdf"
## [1] "For imputation results of GarageYrBlt, see OutputPMM/Imputation_With_PMM_GarageYrBlt.pdf"
Overview: Create an "Other" bin for factor columns with over 4 unique values

- Applied to any factor column (previously character) with over 4 unique values
- Applies the fct_lump_n() function to columns via dynamic dplyr logic (a one-column illustration is sketched just below)
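A minimal illustration of the lumping on a single column (output not shown; Neighborhood is used here purely as an example):

# Keep the 4 most frequent Neighborhood levels and lump the rest into "Other"
table(fct_lump_n(hd$Neighborhood, n = 4))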
hd.Cleaned <- hd.Imputed # For final cleaned data
# Get list of factors and the number of unique values
factorCols <- as.data.frame(t(hd.factorRaw %>% summarise_all(n_distinct)))
# We are going to collapse factor columns with more than 4 unique values
# So there will be 4 of the original levels, and 1 containing 'Other'
# This is the threshold
factorThreshold = 4
# Get a list of the factors we are going to collapse
colsWithManyFactors <- rownames(factorCols %>% filter(V1 > factorThreshold))
# Show a summary of how many factors will be collapsed
numberOfColsWithManyFactors = length(colsWithManyFactors)
paste('Before cleaning, there are', numberOfColsWithManyFactors, 'factor columns with more than',
factorThreshold, 'unique values')
## [1] "Before cleaning, there are 14 factor columns with more than 4 unique values"
# Collapse the affected factors in the original data (the one that already has imputation)
## for each factor column that we are about to collapse
for (collapsedColNum in 1:numberOfColsWithManyFactors) {
# The name of the column with null values
nameOfThisColumn <- colsWithManyFactors[collapsedColNum]
# Get the actual data of the column with nulls
colWithManyFactors <- hd[, nameOfThisColumn]
# lumps all levels except for the n most frequent
hd.Cleaned[, nameOfThisColumn] <- fct_lump_n(colWithManyFactors,
n=factorThreshold)
}
# Check to see if the factor lumping worked
factorColsCleaned <- t(hd.Cleaned %>%
select_if(is.factor) %>%
summarise_all(n_distinct))
paste('After cleaning, there are', sum(factorColsCleaned > factorThreshold, na.rm = TRUE),
"columns with more than", factorThreshold, "unique values (omitting NA's)")
## [1] "After cleaning, there are 14 columns with more than 4 unique values (omitting NA's)"
Overview: Using the numeric data frame, remove some outliers from each column without dwindling the entire data set. See the steps below to create the data frame hd.CleanedNoOutliers.

- Please note that NOT all models use this data set with removed outliers.
- Only models which are sensitive to outliers use this data frame without outliers.
- For example, the linear model uses this data frame.

Outlier removal steps below:

- Since there are so many outliers in each column, we are only going to remove some of them.
- If you count the number of outliers by column, 75% of the columns contain fewer than 50 outliers.
- However, some contain up to 200. Since removing ALL outliers would reduce the size of the data to fewer than 300 observations, we only remove outliers from columns that have fewer than 50 of them.
hd.CleanedNoOutliers <- hd.Cleaned
# Remove up to 75% of the outliers in the data set
# this is the 3rd quartile of number of outliers.
k_outliers = 50
numOutliers = c() # to store the number of outliers per column
theColNames <- colnames(hd.Cleaned)
for (colNum in 1:ncol(hd.Cleaned)) {
theCol <- hd.Cleaned[, colNum]
nrowBefore = length(theCol)
colName <- theColNames[colNum]
# Only consider numeric
if (is.numeric(theCol)) {
# Identify the outliers in the column
# Source: https://www.geeksforgeeks.org/remove-outliers-from-data-set-in-r/
columnOutliers <- boxplot.stats(hd.CleanedNoOutliers[, colNum])$out
numOutliers <- c(numOutliers, length(columnOutliers))
# Now remove k outliers from the column
if (length(columnOutliers) < k_outliers) {
hd.CleanedNoOutliers <- hd.CleanedNoOutliers %>%
# If this syntax looks weird, it is just referencing a column in the
# data set using dplyr piping. See below for more info:
# https://stackoverflow.com/questions/48062213/dplyr-using-column-names-as-function-arguments
# https://stackoverflow.com/questions/72673381/column-names-as-variables-in-dplyr-select-v-filter
filter( !( get({{colName}}) %in% columnOutliers ) )
}
}
}
paste0('Of the columns with outliers, removed up to 75th percentile of num. outliers.')
## [1] "Of the columns with outliers, removed up to 75th percentile of num. outliers."
paste0('See that the 75th percentile of columns with outliers contain ',
paste0(summary(numOutliers)[5]), ' outliers')
## [1] "See that the 75th percentile of columns with outliers contain 51.75 outliers"
hist(hd.CleanedNoOutliers$SalePrice,
col = 'skyblue4',
main = 'Distribution of Sale Price of houses',
xlab = 'House Price')
- After removing the desired outliers, we can see that the distribution of SalePrice (plotted above) looks much closer to a normal distribution, with a few remaining outliers in the tail.
ggcorr(hd.CleanedNoOutliers, geom='blank', label=T, label_size=3, hjust=1,
size=3, layout.exp=2) +
geom_point(size = 4, aes(color = coefficient > 0, alpha = abs(coefficient) >= 0.5)) +
scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
guides(color = F, alpha = F)
- We can see that SalePrice has strong correlations with GarageArea, GarageCars, TotRmsAbvGrd, FullBath, GrLivArea, X1stFlrSF, TotalBsmtSF, and OverallQual.
- Since we have deleted some of the outlier values during data pre-processing, we use 10% of the data as a test set and the remaining 90% as a training set.
idx <- sample(nrow(hd.CleanedNoOutliers), nrow(hd.CleanedNoOutliers)*0.1)
test <- hd.CleanedNoOutliers[idx,]
train <- hd.CleanedNoOutliers[-idx,]
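The split above is random; a minimal sketch of how it could be made reproducible by fixing a seed first (the seed value is an assumption, not part of the original analysis):

# Hypothetical seed so the 90/10 hold-out split is reproducible between runs
set.seed(2022)
idx   <- sample(nrow(hd.CleanedNoOutliers), nrow(hd.CleanedNoOutliers) * 0.1)
test  <- hd.CleanedNoOutliers[idx, ]
train <- hd.CleanedNoOutliers[-idx, ]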
- Linear model containing:
  - Independent variables: GarageArea + GarageCars + TotRmsAbvGrd + FullBath + GrLivArea + X1stFlrSF + TotalBsmtSF + OverallQual
  - Predicted variable: log(SalePrice)
ols.mdl1 <- lm(log(SalePrice) ~ GarageArea + GarageCars + TotRmsAbvGrd
+ FullBath + GrLivArea + X1stFlrSF + TotalBsmtSF + OverallQual, data=train)
summary(ols.mdl1)
##
## Call:
## lm(formula = log(SalePrice) ~ GarageArea + GarageCars + TotRmsAbvGrd +
## FullBath + GrLivArea + X1stFlrSF + TotalBsmtSF + OverallQual,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60845 -0.07292 0.00978 0.08654 0.47000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.065e+01 3.432e-02 310.397 < 2e-16 ***
## GarageArea 2.275e-04 5.427e-05 4.192 3.1e-05 ***
## GarageCars 5.000e-02 1.609e-02 3.108 0.00196 **
## TotRmsAbvGrd 5.790e-04 6.649e-03 0.087 0.93064
## FullBath 4.116e-02 1.386e-02 2.968 0.00309 **
## GrLivArea 2.737e-04 2.583e-05 10.598 < 2e-16 ***
## X1stFlrSF -3.049e-06 2.790e-05 -0.109 0.91300
## TotalBsmtSF 2.164e-04 2.433e-05 8.897 < 2e-16 ***
## OverallQual 8.004e-02 6.239e-03 12.829 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1361 on 724 degrees of freedom
## Multiple R-squared: 0.8066, Adjusted R-squared: 0.8044
## F-statistic: 377.3 on 8 and 724 DF, p-value: < 2.2e-16
AIC(ols.mdl1)
## [1] -833.129
BIC(ols.mdl1)
## [1] -787.1575
ols.mdl1.RMSE <- rmse(actual=log(train$SalePrice), predicted=ols.mdl1$fitted.values)
VIF(ols.mdl1)
## GarageArea GarageCars TotRmsAbvGrd FullBath GrLivArea X1stFlrSF
## 3.747991 4.191478 2.992405 1.961489 4.132558 2.622933
## TotalBsmtSF OverallQual
## 2.653477 2.050188
- For Model 1: Adjusted R-squared is 0.8044, AIC is -833.13, BIC is -787.16, and the training RMSE (on the log scale) is 0.1352.
- We will still try to improve on this model.
- No multicollinearity detected (all VIF values are below 5).
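One natural next check, not reported in the original output, is how Model 1 does on the 10% hold-out set; a minimal sketch using the Metrics package already loaded above:

# Hedged out-of-sample check: RMSE of Model 1 on the held-out test rows
ols.mdl1.testRMSE <- rmse(actual    = log(test$SalePrice),
                          predicted = predict(ols.mdl1, newdata = test))
ols.mdl1.testRMSE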
- This model is based on Principal Component Analysis:
  - Uses the numeric data for Principal Component Analysis
  - Then appends the factor data without NULL values
  - Finally, uses stepAIC() to select the best model
#Get cleaned `numeric` and `factor` `data frames` for OLS Model
# After cleaning, two data sets that contain..
## Numeric data ---------------------------------------------------
hd.numericClean.OLS <- train %>% select_if(is.numeric)
## Factors -------------------------------------------------------
hd.factorClean.OLS <- train %>% dplyr::select(where(is.factor))
# Removing any columns with NA
removeColsWithNA <- function(df) {
return( df[ , colSums(is.na(df)) == 0] )
}
hd.factorClean.OLS <- removeColsWithNA(hd.factorClean.OLS)
paste('Num. factor cols. removed due to null values:',
ncol(train %>% dplyr::select(where(is.factor)) ) - ncol(hd.factorClean.OLS) )
## [1] "Num. factor cols. removed due to null values: 16"
paste(ncol(hd.factorClean.OLS), 'factor cols. remain')
## [1] "22 factor cols. remain"
# Principal component analysis on numeric data
#to remove zero variance columns from the dataset, using the apply expression,
#setting variance not equal to zero
pc.house <- prcomp(hd.numericClean.OLS[ , which(apply(hd.numericClean.OLS, 2, var) != 0)] %>%
dplyr::select(-SalePrice), # do not include response var
center = TRUE, # Mean centered
scale = TRUE # Z-Score standardized
)
# See first 10 cumulative proportions
pc.house.summary <- summary(pc.house)
pc.house.summary$importance[, 1:10]
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.60037 1.860578 1.703664 1.408879 1.145132 1.134045
## Proportion of Variance 0.23317 0.119370 0.100090 0.068450 0.045220 0.044350
## Cumulative Proportion 0.23317 0.352540 0.452630 0.521070 0.566290 0.610640
## PC7 PC8 PC9 PC10
## Standard deviation 1.047496 1.03631 1.011789 0.9602641
## Proportion of Variance 0.037840 0.03703 0.035300 0.0318000
## Cumulative Proportion 0.648470 0.68551 0.720810 0.7526000
Now we choose the number of PCs that explain 75% of the variation.

- Note this threshold is just a judgement call; there is no particular significance behind 75%.
cumPropThreshold = 0.75 # The threshold
numPCs <- sum(pc.house.summary$importance['Cumulative Proportion', ] < cumPropThreshold)
paste0('There are ', numPCs, ' principal components that explain up to ', cumPropThreshold*100,
'% of the variation in the data')
## [1] "There are 9 principal components that explain up to 75% of the variation in the data"
chosenPCs <- as.data.frame(pc.house$x[, 1:numPCs])
#Join on the factor data
df.ols <- cbind(SalePrice = hd.numericClean.OLS$SalePrice, chosenPCs, hd.factorClean.OLS)
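An optional visual check of the 75% cutoff chosen above (not part of the original output), plotting the cumulative proportion of variance against the component number:

# Cumulative variance explained, with the chosen threshold marked
cumVar <- pc.house.summary$importance['Cumulative Proportion', ]
plot(cumVar, type = 'b',
     xlab = 'Principal component',
     ylab = 'Cumulative proportion of variance explained')
abline(h = cumPropThreshold, lty = 2)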
- Linear model containing:
  - Principal components explaining 75% of variation in the numeric data
  - Non-null factor data
  - Predicted variable: log(SalePrice)
- Then use stepAIC() to identify which variables are actually important for the model
# Fit data using PC's, non-null factors
fit.ols <- lm(log(SalePrice) ~ ., data = df.ols)
# Reduce to only important variables
ols.mdl2 <- stepAIC(fit.ols, direction="both")
- Reporting all the variables of the best model (Model 2):

Coefficient estimates:
# Reporting the variables for best model
ols.mdl2.sum <- summary(ols.mdl2)
# Coefficient estimates of the model
ols.mdl2.sum$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.167879e+01 0.064646815 1.806553e+02 0.000000e+00
## PC1 7.680389e-02 0.003028370 2.536146e+01 4.143975e-100
## PC3 -5.107985e-02 0.004420829 -1.155436e+01 2.686925e-28
## PC4 -1.952798e-02 0.003187462 -6.126497e+00 1.528491e-09
## PC5 8.143573e-03 0.004321925 1.884247e+00 5.996236e-02
## PC6 -3.810332e-02 0.004016721 -9.486176e+00 4.047656e-20
## PC7 -1.116983e-02 0.003823769 -2.921156e+00 3.604130e-03
## PC8 -2.792595e-02 0.004060648 -6.877214e+00 1.396364e-11
## PC9 1.158340e-02 0.003876572 2.988053e+00 2.909807e-03
## MSZoningRH -1.392549e-01 0.049301439 -2.824561e+00 4.874716e-03
## MSZoningRL -5.759490e-02 0.023394921 -2.461855e+00 1.407172e-02
## MSZoningRM -9.890075e-02 0.025652819 -3.855356e+00 1.266424e-04
## LandContourHLS 7.917568e-02 0.030573862 2.589652e+00 9.815075e-03
## LandContourLow 3.513932e-02 0.037692352 9.322666e-01 3.515331e-01
## LandContourLvl -5.242978e-03 0.019679779 -2.664145e-01 7.900016e-01
## LotConfigCulDSac 2.783362e-02 0.018119557 1.536109e+00 1.249815e-01
## LotConfigInside -6.812083e-06 0.010575115 -6.441616e-04 9.994862e-01
## LotConfigother -3.144235e-02 0.020838671 -1.508846e+00 1.318073e-01
## NeighborhoodNAmes -1.857504e-02 0.017338075 -1.071344e+00 2.843988e-01
## NeighborhoodOldTown -6.095598e-02 0.023297984 -2.616363e+00 9.086623e-03
## Neighborhoodother -4.113109e-02 0.021351639 -1.926367e+00 5.447939e-02
## NeighborhoodOther -1.113604e-02 0.013611536 -8.181328e-01 4.135709e-01
## Condition1Feedr 5.739541e-02 0.027807442 2.064031e+00 3.939756e-02
## Condition1Norm 9.461088e-02 0.022540873 4.197303e+00 3.063794e-05
## Condition1RR 4.302262e-02 0.032743979 1.313909e+00 1.893244e-01
## Condition1Other 6.419995e-02 0.043290673 1.482997e+00 1.385432e-01
## BldgType2fmCon -1.681753e-02 0.041021856 -4.099650e-01 6.819621e-01
## BldgTypeDuplex 1.548195e-03 0.053342446 2.902370e-02 9.768543e-01
## BldgTypeTwnhs -2.535442e-02 0.030062593 -8.433877e-01 3.993113e-01
## BldgTypeTwnhsE 3.987977e-02 0.023658778 1.685622e+00 9.233208e-02
## HouseStyle1Story -4.902167e-02 0.016020651 -3.059905e+00 2.302086e-03
## HouseStyle2Story 1.429854e-02 0.017784937 8.039692e-01 4.216986e-01
## HouseStyleSLvl -1.188674e-02 0.022889846 -5.193018e-01 6.037210e-01
## HouseStyleOther -2.535574e-02 0.022987493 -1.103023e+00 2.704112e-01
## RoofStyleHip 1.042899e-02 0.010873944 9.590810e-01 3.378623e-01
## RoofStyleother 1.266007e-01 0.031702550 3.993391e+00 7.230766e-05
## Exterior1stMetalSd 5.051822e-02 0.046030122 1.097504e+00 2.728138e-01
## Exterior1stVinylSd -2.244993e-03 0.044645448 -5.028492e-02 9.599103e-01
## Exterior1stWd Sdng -5.019232e-02 0.029083733 -1.725787e+00 8.484476e-02
## Exterior1stOther 2.299730e-02 0.022168043 1.037408e+00 2.999185e-01
## Exterior2ndMetalSd -3.080838e-02 0.046496157 -6.626005e-01 5.078133e-01
## Exterior2ndVinylSd 3.765969e-02 0.045466447 8.282964e-01 4.077963e-01
## Exterior2ndWd Sdng 5.924821e-02 0.029647129 1.998447e+00 4.607010e-02
## Exterior2ndOther -7.704962e-03 0.021868124 -3.523376e-01 7.246954e-01
## ExterQualAvg -2.630914e-02 0.013589523 -1.935987e+00 5.328769e-02
## ExterQualBelowAvg 3.952126e-02 0.057980661 6.816284e-01 4.957085e-01
## FoundationCBlock -1.609751e-04 0.016553811 -9.724353e-03 9.922441e-01
## Foundationother 7.077953e-03 0.035278504 2.006308e-01 8.410479e-01
## FoundationPConc 3.719023e-02 0.018743130 1.984206e+00 4.763868e-02
## Heatingother 6.036279e-02 0.034290756 1.760323e+00 7.880734e-02
## CentralAirY 7.812578e-02 0.025443477 3.070562e+00 2.222598e-03
## KitchenQualAvg -2.971360e-02 0.011663032 -2.547674e+00 1.106562e-02
## KitchenQualBelowAvg -5.326673e-02 0.029900390 -1.781473e+00 7.528611e-02
## FunctionalMaj2 -2.169423e-01 0.071642698 -3.028114e+00 2.554958e-03
## FunctionalMin1 8.985229e-02 0.044632592 2.013154e+00 4.449611e-02
## FunctionalMin2 1.118952e-01 0.044095585 2.537560e+00 1.138734e-02
## FunctionalMod 2.452407e-02 0.072125766 3.400182e-01 7.339489e-01
## FunctionalTyp 1.710032e-01 0.036930262 4.630435e+00 4.380181e-06
## PavedDriveP 3.604131e-03 0.030223854 1.192479e-01 9.051146e-01
## PavedDriveY 6.053860e-02 0.018705601 3.236389e+00 1.269860e-03
p-values:
# p-values of the model
glance(ols.mdl2)$p.value
## value
## 0
Adjusted R-squared:
ols.mdl2.sum$adj.r.squared
## [1] 0.8929468
AIC:
AIC(ols.mdl2)
## [1] -1226.397
BIC:
BIC(ols.mdl2)
## [1] -945.9715
VIF:
VIF(ols.mdl2)
## GVIF Df GVIF^(1/(2*Df))
## PC1 4.480179 1 2.116643
## PC3 4.098099 1 2.024376
## PC4 1.456950 1 1.207042
## PC5 1.769590 1 1.330259
## PC6 1.499033 1 1.224350
## PC7 1.159031 1 1.076583
## PC8 1.279314 1 1.131068
## PC9 1.111431 1 1.054244
## MSZoning 3.438020 3 1.228527
## LandContour 1.517353 3 1.071966
## LotConfig 1.386462 3 1.055969
## Neighborhood 5.929401 4 1.249184
## Condition1 1.757911 4 1.073062
## BldgType 6.947589 4 1.274176
## HouseStyle 5.349893 4 1.233227
## RoofStyle 1.393339 2 1.086461
## Exterior1st 5754.773889 4 2.951233
## Exterior2nd 5663.217047 4 2.945323
## ExterQual 3.519854 2 1.369718
## Foundation 5.306185 3 1.320678
## Heating 1.816291 1 1.347698
## CentralAir 2.071578 1 1.439298
## KitchenQual 3.136071 2 1.330750
## Functional 2.054221 5 1.074644
## PavedDrive 1.707506 2 1.143117
RMSE:
ols.mdl2.RMSE <- rmse(actual=log(df.ols$SalePrice), predicted=ols.mdl2$fitted.values)
ols.mdl2.RMSE
## [1] 0.09645116
- So we can say that the OLS regression model using PCA followed by stepAIC is better than the first OLS model, based on its higher adjusted R-squared value.
- There is also no multicollinearity found in the model, as the adjusted GVIF^(1/(2*Df)) values are all below about 3.
# Key diagnostics for OLS: lm final summary table
ols.mdl1.sum <- summary(ols.mdl1)
# Get the RMSE and R Squared of the model
keyDiagnostics.ols.mdl1 <- data.frame(Model = 'OLS',
Notes = 'lm',
Hyperparameters = 'N/A',
RMSE = ols.mdl1.RMSE,
Rsquared = ols.mdl1.sum$adj.r.squared)
# Show output
keyDiagnostics.ols.mdl1 %>%
knitr::kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
OLS | lm | N/A | 0.135216 | 0.8044233 |
# Key diagnostics for OLS: lm + PCA + stepAIC final summary table
# Get the RMSE and R Squared of the model
keyDiagnostics.ols.mdl2 <- data.frame(Model = 'OLS',
Notes = 'lm + PCA + stepAIC',
Hyperparameters = 'N/A',
RMSE = ols.mdl2.RMSE,
Rsquared = ols.mdl2.sum$adj.r.squared)
# Show output
keyDiagnostics.ols.mdl2 %>%
knitr::kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
OLS | lm + PCA + stepAIC | N/A | 0.0964512 | 0.8929468 |
A linear regression model is considered fit if the below assumptions are met:
- Residuals should follow normal distribution
- There should be no heteroscedasticity
- There should be no multicollinearity
hist(ols.mdl2$residuals,
col = 'skyblue4',
main = 'Histogram of Residuals',
xlab = 'Residuals')
We can see that the residuals are normally distributed, with a slightly longer left tail, possibly due to the presence of outliers.
par(mfrow=c(2,2)) #combining multiple plots together
plot(ols.mdl2)
- From the Residuals vs Fitted plot, we can see there are points above and below the 0 line.
- There is also a very slight curvature pattern towards the end, which indicates that there may be a systematic lack of fit.
- The mean of the residuals is almost zero, which implies there is no bias involved.
- From the Normal Q-Q plot, we can see that most of the points are very close to the dotted line, indicating that the residuals follow a normal distribution, except for some points which might be outliers and may be affecting the fit of the regression line.
- The Scale-Location plot shows the red line is roughly horizontal across the plot, but the spread of magnitudes looks unequal; at some fitted values (e.g., between 11.5 and 12.5) there are more residuals than at others, indicating some heteroskedasticity.
- From the Residuals vs Leverage plot, we can see that there are no influential points close to the Cook's distance line in our regression model. We need to check influencePlot() to see if we are missing any leverage points.
influencePlot(ols.mdl2)
## StudRes Hat CookD
## 102 -5.5330720 0.06894614 0.036192140
## 330 -7.1066901 0.03219393 0.026082146
## 332 1.3649856 0.48649265 0.029381761
## 479 0.3970509 0.44261022 0.002089044
## 741 -2.9684939 0.39659053 0.095419948
## 755 2.8387736 0.39888107 0.088198659
- We can now see some highly influential points among the fitted values.
#ncv Test
ncvTest(ols.mdl2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 10.53958, Df = 1, p = 0.0011685
Since the p-value is less than the significance level (0.05), we reject the null hypothesis of constant error variance, which indicates heteroscedasticity.
VIF(ols.mdl2)
## GVIF Df GVIF^(1/(2*Df))
## PC1 4.480179 1 2.116643
## PC3 4.098099 1 2.024376
## PC4 1.456950 1 1.207042
## PC5 1.769590 1 1.330259
## PC6 1.499033 1 1.224350
## PC7 1.159031 1 1.076583
## PC8 1.279314 1 1.131068
## PC9 1.111431 1 1.054244
## MSZoning 3.438020 3 1.228527
## LandContour 1.517353 3 1.071966
## LotConfig 1.386462 3 1.055969
## Neighborhood 5.929401 4 1.249184
## Condition1 1.757911 4 1.073062
## BldgType 6.947589 4 1.274176
## HouseStyle 5.349893 4 1.233227
## RoofStyle 1.393339 2 1.086461
## Exterior1st 5754.773889 4 2.951233
## Exterior2nd 5663.217047 4 2.945323
## ExterQual 3.519854 2 1.369718
## Foundation 5.306185 3 1.320678
## Heating 1.816291 1 1.347698
## CentralAir 2.071578 1 1.439298
## KitchenQual 3.136071 2 1.330750
## Functional 2.054221 5 1.074644
## PavedDrive 1.707506 2 1.143117
Generally, VIF values greater than about 5 to 10 indicate problematic multicollinearity; here the adjusted GVIF^(1/(2*Df)) values are all below about 3, so we do not see problematic multicollinearity in our model.
Improving the current model:

- To improve our model, we could remove some influential observations and then refit the regression model to the data.
- We could re-build the model with new predictors.
- We could also apply a variable transformation such as Box-Cox (see the sketch below), or use more flexible models like SVM, PCR, etc., and see how they perform.
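A hedged sketch of the Box-Cox idea mentioned above, profiling the power transformation of the untransformed response with MASS::boxcox; the Model 1 predictor set is reused here purely for illustration. A lambda near 0 would support the log transform already in use.

# Profile the Box-Cox log-likelihood for SalePrice on the training data
bc <- MASS::boxcox(lm(SalePrice ~ GarageArea + GarageCars + TotRmsAbvGrd + FullBath +
                        GrLivArea + X1stFlrSF + TotalBsmtSF + OverallQual, data = train))
bc$x[which.max(bc$y)]  # lambda with the highest log-likelihood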
- Using the whole data set after PMM imputation and factor-level collapsing, without omitting any outliers
- Using the predictors GarageArea, GarageCars, TotRmsAbvGrd, FullBath, GrLivArea, X1stFlrSF, TotalBsmtSF, and OverallQual, which have strong correlations with the response variable SalePrice
#creating a PLS model to predict the log of the sale price
#using cross-validation (note: k is not a plsr argument, so the default of 10 random segments is used, as shown in the output)
pls.model <- plsr(log(SalePrice) ~ GarageArea + GarageCars + TotRmsAbvGrd
+ FullBath + GrLivArea + X1stFlrSF + TotalBsmtSF + OverallQual,
data=hd.Cleaned, scale=TRUE, validation='CV', k=5)
- Hyperparameter tuning to determine the number of PLS components with RMSE as the error metric
#report chart
summary(pls.model)
## Data: X dimension: 1000 8
## Y dimension: 1000 1
## Fit method: kernelpls
## Number of components considered: 8
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.3633 0.1695 0.1532 0.1506 0.1494 0.1492 0.1491
## adjCV 0.3633 0.1695 0.1531 0.1505 0.1493 0.1491 0.1490
## 7 comps 8 comps
## CV 0.1491 0.1491
## adjCV 0.1490 0.1490
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 54.34 62.66 74.93 79.61 83.32 95.82 97.73
## log(SalePrice) 78.36 82.58 83.18 83.49 83.60 83.60 83.60
## 8 comps
## X 100.0
## log(SalePrice) 83.6
plot(RMSEP(pls.model),legendpos="topright")
- From the table, we can see that with 6 PLS components the cross-validated RMSEP drops to about 0.1491, and adding further components leaves it essentially unchanged.
- Although we are eyeballing the CV curve, the plot suggests that fitting 4 PLS components is enough, because adding 2 more components makes very little difference in the CV error.
- We therefore use the final model with four PLS components to make predictions (a programmatic way to choose the component count is sketched below).
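As a hedged alternative to eyeballing the CV curve, the pls package offers a heuristic for choosing the number of components; this was not used in the original analysis.

# One-sigma heuristic on the cross-validated error curve
selectNcomp(pls.model, method = "onesigma", plot = TRUE)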
final.pls <- plsr(log(SalePrice) ~ GarageArea + GarageCars + TotRmsAbvGrd
+ FullBath + GrLivArea + X1stFlrSF + TotalBsmtSF + OverallQual,
ncomp = 4, data=hd.Cleaned, scale=TRUE, validation='CV', k=5)
plot(final.pls, plottype = "scores", comps = 1:4)
- From the above plot, we can see that by using only four PLS components we can describe about 80% of the variation in the response variable.
- Metric calculations:
beta.pls <- drop(coef(final.pls))
resid.pls <- drop(final.pls$resid)[,4]  # training residuals using 4 components
rss.pls <- sum(resid.pls^2)/(1000-4)    # residual variance, RSS/(n - ncomp); note this is NOT an R-squared
rmse.pls <- sqrt(mean(resid.pls^2))
ncomps.pls <- final.pls$ncomp
# Key diagnostics for PLS final summary table
# Get the RMSE of the model (the Rsquared column below actually stores rss.pls, the residual variance)
keyDiagnostics.pls <- data.frame(Model = 'PLS',
Notes = 'pls',
Hyperparameters = paste('ncomp = ', ncomps.pls),
RMSE = rmse.pls,
Rsquared = rss.pls)
# Show output
keyDiagnostics.pls %>%
knitr::kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
PLS | pls | ncomp = 4 | 0.1474771 | 0.0218368 |
- If we now compare our preferred OLS model and the PLS model on the basis of RMSE, the chosen OLS model (RMSE of about 0.0965) actually has a lower training RMSE than the PLS model (about 0.1475).
- Also note that the value reported in the Rsquared column for the PLS model is not an adjusted R-squared at all; it is the residual variance RSS/(n - ncomp), about 0.022. The training summary above shows that four PLS components explain about 83.5% of the variation in log(SalePrice).
- We know that adjusted R-squared identifies the percentage of variance in the response that is explained by the predictors. PCA finds composite variables of the predictors that maximally explain the variability of the predictors themselves, whereas PLS finds composite variables of the predictors that are most predictive of the response, so the two approaches trade off explained variance and predictive error differently.
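For a proper R-squared of the PLS fit, the pls package can report it directly; a hedged check that was not part of the original output:

# Training R-squared by number of components (namespaced to avoid caret's R2)
pls::R2(final.pls, estimate = "train")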
- We first set up our cross-validation strategy
- Then create a data frame with PMM-imputed values and only whole columns without NA (outliers are not omitted)
- Then we train the model using glmnet, which actually fits the elastic net
# After cleaning, two data sets that contain..
## Numeric data ---------------------------------------------------
hd.numericClean <- hd.Cleaned %>% select_if(is.numeric)
## Factors -------------------------------------------------------
hd.factorClean <- hd.Cleaned %>% dplyr::select(where(is.factor))
# Removing any columns with NA
removeColsWithNA <- function(df) {
return( df[ , colSums(is.na(df)) == 0] )
}
hd.factorClean <- removeColsWithNA(hd.factorClean)
paste('Num. factor cols. removed due to null values:',
ncol(hd.Cleaned %>% dplyr::select(where(is.factor)) ) - ncol(hd.factorClean) )
## [1] "Num. factor cols. removed due to null values: 16"
paste(ncol(hd.factorClean), 'factor cols. remain')
## [1] "22 factor cols. remain"
ctrl <- trainControl(method = "repeatedcv",
number = 5, # 5 fold cross validation
repeats = 2 # 2 repeats
)
# The data (PMM imputed values, and only whole columns without NA. Does not omit outliers)
df.lasso <- cbind(SalePrice = hd.numericClean$SalePrice,
hd.numericClean, hd.factorClean)
# Train and tune the elastic net (LASSO)
fit.lasso <- train(data = df.lasso,
log(SalePrice) ~ .,
method = "glmnet", # Elastic net
preProc = c("center","scale"), # Center and scale data
tuneLength = 10, #10 values of alpha and 10 lamda values for each
trControl = ctrl)
# Function to get the best hypertuned parameters
get_best_result = function(caret_fit) {
best = which(rownames(caret_fit$results) == rownames(caret_fit$bestTune))
best_result = caret_fit$results[best, ]
rownames(best_result) = NULL
best_result
}
result.lasso <- get_best_result(fit.lasso)
- The variables with non-zero coefficients of the final model:
lasso.coeff <- drop(coef(fit.lasso$finalModel, fit.lasso$bestTune$lambda))
lasso.coeff[lasso.coeff != 0]
## (Intercept) MSSubClass LotFrontage LotArea
## 12.0024696188 -0.0007921493 0.0026478399 0.0230760239
## OverallQual OverallCond MasVnrArea BsmtFinSF1
## 0.0830224519 0.0472047528 0.0007427026 0.0308870569
## BsmtFinSF2 TotalBsmtSF X1stFlrSF LowQualFinSF
## 0.0025371083 0.0429378869 0.0005909439 -0.0003123244
## GrLivArea BsmtFullBath HalfBath BedroomAbvGr
## 0.1293293597 0.0097350716 0.0003792692 -0.0014670944
## KitchenAbvGr Fireplaces GarageCars GarageArea
## -0.0082947665 0.0238248778 0.0241271452 0.0205714766
## WoodDeckSF OpenPorchSF EncPorchSF PoolArea
## 0.0054585299 0.0071790500 0.0113333888 0.0002923492
## age ageSinceRemodel MSZoningRH MSZoningRM
## -0.0489369396 -0.0120919238 -0.0023181212 -0.0255125817
## LotShapeIR3 LotShapeReg LandContourHLS LotConfigCulDSac
## -0.0005678139 -0.0012719892 0.0040202817 0.0041892575
## LandSlopeMod NeighborhoodOldTown Neighborhoodother NeighborhoodOther
## 0.0025765062 -0.0048725734 -0.0037200921 0.0014452209
## Condition1Norm BldgTypeDuplex BldgTypeTwnhs HouseStyleSLvl
## 0.0142184896 -0.0007300906 -0.0083334148 0.0001146792
## HouseStyleOther RoofStyleother Exterior1stWd Sdng Exterior1stOther
## -0.0028496468 0.0089202148 -0.0009953817 0.0018786546
## Exterior2ndVinylSd ExterQualAvg ExterQualBelowAvg ExterCondAvg
## 0.0029727720 -0.0045240096 -0.0051768565 0.0015378096
## ExterCondBelowAvg FoundationPConc HeatingQCAvg HeatingQCBelowAvg
## -0.0010470425 0.0173137043 -0.0060194630 -0.0003914879
## CentralAirY KitchenQualAvg KitchenQualBelowAvg FunctionalMaj2
## 0.0096502825 -0.0071298261 -0.0026199641 -0.0111519085
## FunctionalTyp PavedDriveY
## 0.0123207943 0.0060043777
# Gather key diagnostics for summary table
# Get the RMSE and R Squared of the model
hyperparameters.lasso = list('Alpha' = result.lasso$alpha,
'Lambda' = result.lasso$lambda)
keyDiagnostics.lasso <- data.frame(Model = 'Lasso',
Notes = 'caret and elasticnet',
Hyperparameters = paste('Alpha =', hyperparameters.lasso$Alpha, ',',
'Lambda =', hyperparameters.lasso$Lambda))
keyDiagnostics.lasso <- cbind(keyDiagnostics.lasso,
RMSE = result.lasso$RMSE,
Rsquared =result.lasso$Rsquared
)
# Show output
keyDiagnostics.lasso %>% knitr::kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
Lasso | caret and elasticnet | Alpha = 0.8 , Lambda = 0.00385954380548551 | 0.1003116 | 0.9240285 |
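Two optional diagnostics, not part of the original output, that caret makes easy for this fit:

# Tuning profile across the alpha/lambda grid, and the most important variables
plot(fit.lasso)
plot(varImp(fit.lasso), top = 20)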
- Uses the numeric data for Principal Component Analysis
  - Data includes outliers
  - Chose the number of PCs that explain 75% of the variation; this is a general judgement call to keep the number of principal components low
- Then appends the factor columns without NULL values and SalePrice to the data
- Finally, uses stepAIC() to select the best model
- See interpretation at the end
# Get cleaned `numeric` and `factor` `data frames`
# After cleaning, two data sets that contain..
## Numeric data ---------------------------------------------------
hd.numericClean <- hd.Cleaned %>% select_if(is.numeric)
## Factors -------------------------------------------------------
hd.factorClean <- hd.Cleaned %>% dplyr::select(where(is.factor))
# Removing any columns with NA
removeColsWithNA <- function(df) {
return( df[ , colSums(is.na(df)) == 0] )
}
hd.factorClean <- removeColsWithNA(hd.factorClean)
paste('Num. factor cols. removed due to null values:',
ncol(hd.Cleaned %>% dplyr::select(where(is.factor)) ) - ncol(hd.factorClean) )
## [1] "Num. factor cols. removed due to null values: 16"
paste(ncol(hd.factorClean), 'factor cols. remain')
## [1] "22 factor cols. remain"
# Perform PCA
# Principal component analysis on numeric data
pc.house <- prcomp(hd.numericClean %>% dplyr::select(-SalePrice), # do not include response var
center = TRUE, # Mean centered
scale = TRUE # Z-Score standardized
)
# See first 10 cumulative proportions
pc.house.summary <- summary(pc.house)
pc.house.summary$importance[, 1:10]
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.654699 1.857165 1.661636 1.412696 1.198246 1.105965
## Proportion of Variance 0.213560 0.104520 0.083670 0.060480 0.043510 0.037070
## Cumulative Proportion 0.213560 0.318080 0.401740 0.462220 0.505730 0.542790
## PC7 PC8 PC9 PC10
## Standard deviation 1.084074 1.064364 1.030999 1.008343
## Proportion of Variance 0.035610 0.034330 0.032210 0.030810
## Cumulative Proportion 0.578410 0.612740 0.644950 0.675760
# Now we choose number of PC's that explain 75% of the variation
# Note this threshold is just a judgement call. No significance behind 75%
cumPropThreshold = 0.75 # The threshold
numPCs <- sum(pc.house.summary$importance['Cumulative Proportion', ] < cumPropThreshold)
paste0('There are ', numPCs, ' principal components that explain up to ', cumPropThreshold*100,
'% of the variation in the data')
## [1] "There are 12 principal components that explain up to 75% of the variation in the data"
chosenPCs <- as.data.frame(pc.house$x[, 1:numPCs])
Join on the factor data and SalePrice:
df.pcr <- cbind(SalePrice = hd.numericClean$SalePrice, chosenPCs, hd.factorClean)
- Linear model containing:
  - Principal components explaining 75% of variation in the numeric data
  - Non-null factor data
  - Predicted variable: log(SalePrice)
- Then use stepAIC() to identify which variables are actually important for the model
# Fit data using PC's, non-null factors
fit.pcr <- lm(log(SalePrice) ~ ., data = df.pcr)
# Reduce to only important variables
fit.pcrReduced <- stepAIC(fit.pcr, direction="both")
# Key diagnostics for final summary table
est.pcr <- summary(fit.pcrReduced)
# Get the RMSE and R Squared of the model
keyDiagnostics.pcr <- data.frame(Model = 'PCR',
Notes = 'lm, prcomp, and stepAIC',
Hyperparameters = 'N/A',
RMSE = sqrt(mean(fit.pcrReduced$residuals^2)),
Rsquared = est.pcr$adj.r.squared)
# Show output
keyDiagnostics.pcr %>% kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
PCR | lm, prcomp, and stepAIC | N/A | 0.1013773 | 0.9177087 |
View results of the stepAIC model:
summary(fit.pcrReduced)
##
## Call:
## lm(formula = log(SalePrice) ~ PC1 + PC3 + PC4 + PC5 + PC7 + PC9 +
## PC12 + MSZoning + LandContour + LotConfig + Condition1 +
## BldgType + HouseStyle + RoofStyle + Exterior1st + ExterQual +
## ExterCond + Foundation + Heating + CentralAir + KitchenQual +
## Functional + PavedDrive, data = df.pcr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69458 -0.06013 0.00372 0.06641 0.31110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.788178 0.054476 216.391 < 2e-16 ***
## PC1 0.097826 0.002396 40.828 < 2e-16 ***
## PC3 -0.054099 0.003786 -14.290 < 2e-16 ***
## PC4 -0.020065 0.003028 -6.627 5.74e-11 ***
## PC5 -0.042290 0.004500 -9.397 < 2e-16 ***
## PC7 0.034703 0.004108 8.447 < 2e-16 ***
## PC9 0.005958 0.003659 1.628 0.103791
## PC12 0.016985 0.003810 4.458 9.28e-06 ***
## MSZoningRH -0.073277 0.040064 -1.829 0.067713 .
## MSZoningRL -0.038153 0.020292 -1.880 0.060378 .
## MSZoningRM -0.118066 0.021931 -5.384 9.21e-08 ***
## LandContourHLS 0.075187 0.026685 2.818 0.004939 **
## LandContourLow -0.002292 0.028183 -0.081 0.935184
## LandContourLvl -0.014637 0.018166 -0.806 0.420593
## LotConfigCulDSac 0.039882 0.015134 2.635 0.008544 **
## LotConfigInside 0.001906 0.009101 0.209 0.834115
## LotConfigother -0.007785 0.019206 -0.405 0.685323
## Condition1Feedr 0.048342 0.024685 1.958 0.050488 .
## Condition1Norm 0.092076 0.020398 4.514 7.16e-06 ***
## Condition1RR 0.048199 0.029206 1.650 0.099218 .
## Condition1Other 0.024039 0.031188 0.771 0.441036
## BldgType2fmCon 0.050671 0.031238 1.622 0.105122
## BldgTypeDuplex 0.026637 0.028144 0.946 0.344163
## BldgTypeTwnhs -0.037597 0.025399 -1.480 0.139137
## BldgTypeTwnhsE 0.010759 0.018634 0.577 0.563835
## HouseStyle1Story -0.077090 0.013371 -5.766 1.10e-08 ***
## HouseStyle2Story -0.002235 0.014679 -0.152 0.879026
## HouseStyleSLvl -0.028825 0.019922 -1.447 0.148247
## HouseStyleOther -0.053210 0.019369 -2.747 0.006125 **
## RoofStyleHip 0.015960 0.009352 1.707 0.088240 .
## RoofStyleother 0.100418 0.024695 4.066 5.17e-05 ***
## Exterior1stMetalSd 0.025403 0.012732 1.995 0.046306 *
## Exterior1stVinylSd 0.022663 0.011374 1.993 0.046602 *
## Exterior1stWd Sdng -0.005745 0.013388 -0.429 0.667924
## Exterior1stOther 0.030404 0.011676 2.604 0.009359 **
## ExterQualAvg -0.037275 0.011643 -3.201 0.001413 **
## ExterQualBelowAvg -0.109878 0.046851 -2.345 0.019220 *
## ExterCondAvg 0.021911 0.011581 1.892 0.058802 .
## ExterCondBelowAvg -0.001738 0.032734 -0.053 0.957664
## FoundationCBlock 0.005009 0.014363 0.349 0.727338
## Foundationother 0.031231 0.025444 1.227 0.219959
## FoundationPConc 0.052120 0.016508 3.157 0.001643 **
## Heatingother 0.035643 0.025347 1.406 0.159996
## CentralAirY 0.068068 0.018695 3.641 0.000286 ***
## KitchenQualAvg -0.022613 0.010379 -2.179 0.029606 *
## KitchenQualBelowAvg -0.039207 0.025778 -1.521 0.128607
## FunctionalMaj2 -0.215700 0.061728 -3.494 0.000497 ***
## FunctionalMin1 0.026899 0.039008 0.690 0.490618
## FunctionalMin2 0.024274 0.038114 0.637 0.524345
## FunctionalMod 0.005062 0.044792 0.113 0.910040
## FunctionalTyp 0.095708 0.032005 2.990 0.002858 **
## PavedDriveP -0.000974 0.025345 -0.038 0.969353
## PavedDriveY 0.050551 0.016182 3.124 0.001839 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1042 on 947 degrees of freedom
## Multiple R-squared: 0.922, Adjusted R-squared: 0.9177
## F-statistic: 215.2 on 52 and 947 DF, p-value: < 2.2e-16
Please note all interpretations below are approximate, since the stochastic PMM imputation (and hence the exact variables stepAIC() selects) can vary slightly between runs.
Model performance evaluation:

- See that around 28 of the variables cannot be explained by random chance, with a probability of 90% or more (see significance codes above).
- Standard errors range from about ±1-5%, with an average around 2%. Larger values may indicate higher uncertainty in the estimated coefficients.
- This model explains around 92% of the variation in log(SalePrice). See the Adjusted R-squared for reference.
- Note this model may exhibit selection bias, since the data excludes factor columns with null values.
- This model would likely do well for prediction of log(SalePrice), given the small range of standard errors, the high adjusted R-squared, and the number of significant variables. It would obviously not do well for inference, given that we are using principal components that mask the original numeric variables.
Practical significance evaluation:

- The principal components together contribute positively about 20% to the sale price of the home.
- Residential Medium Density zoning (MSZoningRM) reduces the home price by around 12%, with a standard error of around 2%.
- If the exterior quality is below average (ExterQualBelowAvg), it reduces the home price by around 12%, with a standard error of around 5%.
- If the functionality of the home has 2 major deductions (FunctionalMaj2), it reduces the home price by around 20%, with a standard error of around 6%, while typical functionality (FunctionalTyp) increases the home sale price by nearly 10%, with a standard error of 3%.
- See the other coefficients for the remaining variables.
Note that the function predictedVsObserved() was created to compare predicted vs. observed values from a model. It uses ggplot2 and the model output to display the following. See interpretation below.
# Function to compare predicted vs. actual (observed) regression outputs
predictedVsObserved <- function(predicted, observed, modelName, outcomeName = 'Log(SalePrice)') {
## Create data set for predicted vs. actuals
comparison <- data.frame(observed = observed,
predicted = predicted) %>%
# Row index
mutate(ID = row_number()) %>%
# Put in single column
pivot_longer(cols = c('observed', 'predicted'),
names_to = 'metric',
values_to = 'value')
# Plot --- Observed vs. Actuals across all variables in data
variationScatter <- comparison %>%
ggplot(aes(x = ID,
y = value,
color = metric
)
) +
geom_point(alpha = 0.5, size = 1) +
labs(title = 'Variation in Predicted vs. Observed Data',
subtitle = paste('Model:', modelName),
x = 'X', y = outcomeName) +
theme_minimal() + theme(legend.title = element_blank(),
legend.position = 'top') +
scale_color_manual(values = c('grey60', 'palegreen3'))
print(variationScatter)
# Limit for x and y axis for scatter of predicted vs. observed
axisLim = c( min(c(predicted, observed)), max(c(predicted, observed)) )
# Simple comparison of data
plot(x = observed,
y = predicted,
main = paste(modelName, 'Model - Actual (Observed) vs. Predicted\n'),
xlab = paste('Observed Values -', outcomeName),
ylab = paste('Predicted Values -', outcomeName),
pch = 16,
cex = 0.75,
col = alpha('steelblue3', 1/4),
xlim = axisLim,
ylim = axisLim
)
# Add the Predicted vs. actual line
abline(lm(predicted ~ observed), col = 'steelblue3', lwd = 2)
mtext('Predicted ~ Actual', side = 3, adj = 1, col = 'steelblue3')
# Add line for perfectly fit model
abline(0,1, col = alpha('tomato3', 0.8), lwd = 2)
mtext('Perfectly Fit Model', side = 1, adj = 0, col = 'tomato3')
}
View results of the PCR Model:

- See that the variation in the predicted data very closely resembles the variation in the observed data.
- Implication? This model fits its own data well, but what is not known is whether it can predict out-of-sample data.
- Note that the data (blue) deviates slightly from the perfect-fit line (red), indicating that the model's predictions are slightly skewed relative to the actual data.
# How do the predicted vs. Actuals Compare?
predictedVsObserved(observed = log(df.pcr$SalePrice),
predicted = predict(fit.pcrReduced),
modelName = 'PCR')
ctrl <- trainControl(method = "repeatedcv",
number = 5, # 5 fold cross validation
repeats = 2 # 2 repeats
)
# The data (PMM imputed values, and only whole columns without NA. Does not omit outliers)
df.svm <- cbind(SalePrice = hd.numericClean$SalePrice,
hd.numericClean, hd.factorClean)
- Predicted variable:
log(SalePrice)
- Dependent variables: non-null factor data (collapsed if over 4
unique values), and all numeric data (
pmm
imputed if needed). Includes outliers
# Train and tune the SVM
fit.svm <- train(data = df.svm,
log(SalePrice) ~ .,
method = "svmRadial", # Radial kernel
tuneLength = 9, # 9 values of the cost function
preProc = c("center","scale"), # Center and scale data
trControl = ctrl)
- Note all numbers mentioned below are approximate
- See that the R Squared of the model is around 0.86, and RMSE is 0.14
- See that the model predicts the data well.
- Also, note that the model predicts the data with less error than the linear model. See this from the RMSE or scatter plot of predicted values.
# Gather key diagnostics for summary table
# Get the RMSE and R Squared of the model
hyperparameters.svm = list('C' = fit.svm[["finalModel"]]@param[["C"]],
'Epsilon' = fit.svm[["finalModel"]]@param[["epsilon"]])
keyDiagnostics.svm <- data.frame(Model = 'SVM',
Notes = 'caret and svmRadial',
Hyperparameters = paste('C =', hyperparameters.svm$C, ',',
'Epsilon =', hyperparameters.svm$Epsilon)
)
keyDiagnostics.svm <- cbind(keyDiagnostics.svm,
fit.svm$results %>%
filter(C == hyperparameters.svm$C) %>%
dplyr::select(RMSE, Rsquared)
)
# Show output
keyDiagnostics.svm %>% knitr::kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
SVM | caret and svmRadial | C = 4 , Epsilon = 0.1 | 0.1400092 | 0.8540507 |
# Final model?
fit.svm$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 4
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.00757293697362188
##
## Number of Support Vectors : 670
##
## Objective Function Value : -161.6045
## Training error : 0.012257
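An optional look at the tuning results (not in the original output); caret's plot method shows the cross-validated RMSE across the tuned cost values:

# RMSE profile over the cost grid explored by train()
plot(fit.svm)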
# How do the predicted vs. Actuals Compare?
predictedVsObserved(observed = log(df.svm$SalePrice),
predicted = predict(fit.svm, df.svm),
modelName = 'SVM')
- Predicted variable:
log(SalePrice)
- Dependent variables: non-null factor data (collapsed if over 4
unique values), and all numeric data (
pmm
imputed if needed). Includes outliers
# Train and tune the MARS model
fit.mars <- train(data = df.svm, # note this is fine since data is the same for this model
log(SalePrice) ~ .,
method = "earth", # Radial kernel
tuneLength = 9, # 9 values of the cost function
preProc = c("center","scale"), # Center and scale data
trControl = ctrl
)
# Key diagnostics for final model
# Get the RMSE and R Squared of the model
hyperparameters.mars = list('degree' = fit.mars[["bestTune"]][["degree"]],
'nprune' = fit.mars[["bestTune"]][["nprune"]])
keyDiagnostics.mars <- data.frame(Model = 'MARS',
Notes = 'caret and earth',
Hyperparameters = paste('Degree =', hyperparameters.mars$degree, ',',
'nprune =', hyperparameters.mars$nprune)
)
keyDiagnostics.mars <- cbind(keyDiagnostics.mars,
fit.mars$results %>%
filter(degree == hyperparameters.mars$degree,
nprune == hyperparameters.mars$nprune) %>%
dplyr::select(RMSE, Rsquared)
)
# Show output
keyDiagnostics.mars %>% kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
MARS | caret and earth | Degree = 1 , nprune = 17 | 0.109485 | 0.909223 |
- See that the model overall performs very well, and in fact performs similarly to the PCR model (in terms of RMSE and Adjusted R Squared).
- Again, unsure if the model would do well for prediction of out of sample data, but fits this data extremely well.
# Final model?
fit.mars$finalModel
## Selected 17 of 21 terms, and 10 of 94 predictors (nprune=17)
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: GrLivArea, age, OverallQual, TotalBsmtSF, OverallCond, LotArea, ...
## Number of terms at each degree of interaction: 1 16 (additive model)
## GCV 0.011145 RSS 10.42157 GRSq 0.9155756 RSq 0.9208976
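An optional check of which of the 10 retained predictors drive the MARS fit (not part of the original output):

# Relative variable importance from the earth model, via caret
plot(varImp(fit.mars), top = 10)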
# How do the predicted vs. Actuals Compare?
predicted.mars = fit.mars[["finalModel"]][["fitted.values"]]
colnames(predicted.mars) <- 'predicted'
predictedVsObserved(observed = log(df.svm$SalePrice),
predicted = predicted.mars,
modelName = 'MARS')
# Add the key diagnostics here
rbind(keyDiagnostics.ols.mdl1,
keyDiagnostics.ols.mdl2,
keyDiagnostics.pls,
keyDiagnostics.lasso,
keyDiagnostics.pcr,
keyDiagnostics.svm,
keyDiagnostics.mars
) %>%
# Round to 4 digits across numeric data
mutate_if(is.numeric, round, digits = 4) %>%
# Spit out kable table
kable()
Model | Notes | Hyperparameters | RMSE | Rsquared |
---|---|---|---|---|
OLS | lm | N/A | 0.1352 | 0.8044 |
OLS | lm + PCA + stepAIC | N/A | 0.0965 | 0.8929 |
PLS | pls | ncomp = 4 | 0.1475 | 0.0218 |
Lasso | caret and elasticnet | Alpha = 0.8 , Lambda = 0.00385954380548551 | 0.1003 | 0.9240 |
PCR | lm, prcomp, and stepAIC | N/A | 0.1014 | 0.9177 |
SVM | caret and svmRadial | C = 4 , Epsilon = 0.1 | 0.1400 | 0.8541 |
MARS | caret and earth | Degree = 1 , nprune = 17 | 0.1095 | 0.9092 |