we first load the train data and the test data without the "id" and "date" columns (the data is already splitted into two datasets "train" and "test" with different extensions)
library(readxl)
house_train <- read.csv("home_data-train .csv",
header = FALSE,
stringsAsFactors = FALSE)[, -c(1:2)]
house_test <- data.frame(read_excel("HomePrices-Test.xlsx",
sheet = "Sheet1"))[, -c(1:2)]
names(house_train) <- names(house_test)
str(house_train)
## 'data.frame': 19998 obs. of 19 variables:
## $ price : int 221900 538000 180000 604000 510000 1225000 257500 291850 229500 323000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
library(ggplot2)
ggplot(house_train) + geom_density(aes(x = price))
it seems that the price variable distribution is skewed and this indicates that we have to get log(price)
ggplot(house_train) + geom_density(aes(x = log(price)))
we then create a correlation map to know which variables affect the price of a house
library(corrplot)
library(dplyr)
cor_mat <- cor(house_train) %>% round(digits = 2)
corrplot(cor_mat, method = 'number',
tl.srt = 45, tl.col = 'black')
#method = "shade", type = "upper", shade.col = NA, diag = FALSE
# 2d density plot
#normal geom_point + stat_density2d(aes(color = ..level..), #geom = "raster" / "polygon")
it seems that some variables have weak correlation with the price, so we remove them
unnecessary <- c('sqft_lot', 'condition', 'yr_built', 'yr_renovated',
'zipcode', 'long', 'lat', 'sqft_lot15', 'sqft_basement')
train <- house_train[, -which(names(house_train) %in% unnecessary)]
test <- house_test[, -which(names(house_test) %in% unnecessary)]
then before fitting the model we define functions to extract RSquared and RMSE values from our model. RMSE = root-mean-square error "the lower the better" RSQ = RSquared "the greater the better"
rmse <- function(y, x){
sqrt(mean((y - x)^2))
}
rsq <- function(y, x) {
1 - sum((y - x) ^ 2) / sum((y - mean(y)) ^ 2)
}
library(caret)
control <- trainControl(method = "cv", number = 10)
# train the model
model <- train(log(price) ~ ., data = train, method = "lm",
trControl = control, verbose = FALSE)
model
## Linear Regression
##
## 19998 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 17999, 17998, 17998, 17998, 17999, 17999, ...
## Resampling results:
##
## RMSE Rsquared
## 0.342187 0.5818805
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
##
summary(model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat, verbose = FALSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.67311 -0.24658 0.00735 0.23401 1.31939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.118e+01 2.190e-02 510.555 < 2e-16 ***
## bedrooms -6.731e-03 3.267e-03 -2.060 0.0394 *
## bathrooms -2.098e-02 5.390e-03 -3.892 9.96e-05 ***
## sqft_living 2.761e-04 7.407e-06 37.271 < 2e-16 ***
## floors 6.629e-02 6.383e-03 10.385 < 2e-16 ***
## waterfront 3.543e-01 2.988e-02 11.857 < 2e-16 ***
## view 6.736e-02 3.650e-03 18.456 < 2e-16 ***
## grade 1.716e-01 3.688e-03 46.542 < 2e-16 ***
## sqft_above -1.494e-04 7.452e-06 -20.048 < 2e-16 ***
## sqft_living15 1.006e-04 6.011e-06 16.732 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3421 on 19988 degrees of freedom
## Multiple R-squared: 0.5822, Adjusted R-squared: 0.582
## F-statistic: 3094 on 9 and 19988 DF, p-value: < 2.2e-16
vimp <- varImp(model)
vimp
## lm variable importance
##
## Overall
## grade 100.000
## sqft_living 79.160
## sqft_above 40.438
## view 36.860
## sqft_living15 32.985
## waterfront 22.024
## floors 18.715
## bathrooms 4.119
## bedrooms 0.000
plot(vimp)
surprisingly number of bedrooms has no effect at all on the price, meanwhile the grade is the most important variable in the model.