# PCA_PLS_Stepwise
library(caret) # for createFolds()
library(MASS)  # for stepwise regression (stepAIC)
library(pls)   # for pcr() and plsr()
# Simulate data with strong multicollinearity
set.seed(1)
n <- 1000 # number of observations
m <- 100 # number of predictors
X <- matrix(rnorm(n * m), nrow = n, ncol = m)
# Introduce multicollinearity:
X[, 2] <- X[, 1] + rnorm(n, sd = 1)
X[, 3] <- X[, 1] + X[, 6] + rnorm(n, sd = 1)
X[, 5] <- X[, 4] + rnorm(n, sd = 1)
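# Optional sanity check: the constructed columns should correlate strongly
# with their sources (e.g. X1 with X2 and X3, X4 with X5)
print(round(cor(X[, 1:6]), 2))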
# Generate the true coefficients (intercept = 15); most entries are zero
true_beta <- rep(c(1.5, 0, 4, 0, -.5, 0, 0, 0, -2, 0), m/10)
y <- 15 + X %*% true_beta + rnorm(n, sd = 1)
# Give names to X columns (X1, X2, ..., Xm)
colnames(X) <- paste0("X", 1:m)
# Create K folds
K <- 10
folds <- createFolds(y, k = K, list = TRUE)
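# createFolds() returns a list of K index vectors; each is used once below
# as the held-out test set (sizes should be roughly n/K each)
print(sapply(folds, length))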
pca_mse <- matrix(0, nrow = m, ncol = K)
plsr_mse <- matrix(0, nrow = m, ncol = K)
ols_mse <- numeric(K)
stepwise_mse <- numeric(K)
stepwise_p <- numeric(K) # number of selected predictors (excluding the intercept)
# Evaluate the models for each fold
for (i in 1:K) {
  cat("##### Fold", i, "#####\n")
  # Split into training and test data for fold i
  test_indices <- folds[[i]]
  X_test  <- X[test_indices, , drop = FALSE]
  y_test  <- y[test_indices]
  X_train <- X[-test_indices, , drop = FALSE]
  y_train <- y[-test_indices]
  # Full OLS regression on all m predictors
  lm_fit <- lm(y_train ~ ., data = as.data.frame(X_train))
  yhat <- predict(lm_fit, as.data.frame(X_test))
  ols_mse[i] <- mean((y_test - yhat)^2)
  cat("OLS MSE:", ols_mse[i], "\n")
  # Stepwise regression (AIC-based, both directions)
  stepwise_model <- stepAIC(lm_fit, direction = "both", trace = FALSE)
  stepwise_p[i] <- length(coef(stepwise_model)) - 1
  stepwise_mse[i] <- mean((y_test - predict(stepwise_model, as.data.frame(X_test)))^2)
  cat("Stepwise:", stepwise_p[i], "predictors, MSE:", stepwise_mse[i], "\n")
  # PCA regression (PCAR), evaluated for every number of components 1:m
  pcr_model <- pcr(y_train ~ X_train, ncomp = m, scale = TRUE, validation = "none")
  pca_mse[, i] <- colMeans((y_test - predict(pcr_model, X_test, ncomp = 1:m))^2)
  cat("Best PCAR:", which.min(pca_mse[, i]), "components, MSE:", min(pca_mse[, i]), "\n")
  # PLS regression (PLSR), evaluated for every number of components 1:m
  plsr_model <- plsr(y_train ~ X_train, ncomp = m, scale = TRUE, validation = "none")
  plsr_mse[, i] <- colMeans((y_test - predict(plsr_model, X_test, ncomp = 1:m))^2)
  cat("Best PLSR:", which.min(plsr_mse[, i]), "components, MSE:", min(plsr_mse[, i]), "\n")
}
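# Aside (sketch): pls can also run the component-selection cross-validation
# internally; the manual loop above is kept so that OLS and stepwise are
# scored on the same folds
pcr_cv <- pcr(as.vector(y) ~ X, ncomp = m, scale = TRUE, validation = "CV")
print(selectNcomp(pcr_cv, method = "onesigma", plot = FALSE))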
# Combine the PLSR and PCAR MSE values into a matrix
mse_matrix <- cbind(rowMeans(plsr_mse), rowMeans(pca_mse))
# Visualize ###################################################################
par(mfrow = c(1, 2))
# Plot the mean MSE curves for PLSR and PCAR across component counts
matplot(1:m, mse_matrix, type = "b", pch = 1:2, col = c("blue", "red"), lty = 1:2,
        xlab = "Number of Components/Predictors", ylab = "Mean Squared Error (MSE)",
        main = "MSE for OLS, PLSR, PCAR and Stepwise Regression")
# One point for the full-OLS performance
points(m, mean(ols_mse), col = "black", pch = 5, cex = 1.5)
# K points for the performance of stepwise regression on each fold
points(stepwise_p, stepwise_mse, col = "darkgreen", pch = 8, cex = 1.5)
# Add a legend
legend("topright", legend = c("PLSR", "PCAR", "OLS", "Stepwise"),
       col = c("blue", "red", "black", "darkgreen"), pch = c(1, 2, 5, 8), lty = c(1, 2, NA, NA))
# Zoom in on the region around the optimum
matplot(1:m, rowMeans(plsr_mse), type = "b", pch = 1, col = "blue", lty = 1,
        ylim = range(stepwise_mse),
        xlim = c(which.min(rowMeans(plsr_mse)), max(stepwise_p)),
        xlab = "Number of Components/Predictors", ylab = "Mean Squared Error (MSE)",
        main = "Zoomed View: PLSR vs. Stepwise")
# K points for the performance of stepwise regression on each fold
points(stepwise_p, stepwise_mse, col = "darkgreen", pch = 8, cex = 1.5)
# Add a legend
legend("topleft", legend = c("PLSR", "Stepwise"),
       col = c("blue", "darkgreen"), pch = c(1, 8), lty = c(1, NA))
# Summarize ####################################################################
print(data.frame(
  Method = c("Full OLS", "Stepwise", "PCAR", "PLSR"),
  Mean_MSE = c(mean(ols_mse), mean(stepwise_mse), min(rowMeans(pca_mse)), min(rowMeans(plsr_mse))),
  # For stepwise regression, Components is the average number of selected
  # predictors across the K folds.
  Components = c(m, mean(stepwise_p), which.min(rowMeans(pca_mse)), which.min(rowMeans(plsr_mse)))
))