-
Notifications
You must be signed in to change notification settings - Fork 0
/
regressions_and_assumptions.py
145 lines (120 loc) · 4.98 KB
/
regressions_and_assumptions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_ljungbox
from statsmodels.stats.outliers_influence import variance_inflation_factor
class AssumptionChecks:
    """Visual and statistical diagnostics for linear-regression assumptions.

    Every method is a static helper that draws a plot with
    seaborn/matplotlib and/or prints a test result. None of them returns a
    value.
    """

    @staticmethod
    def check_linearity(X, y, predictions):
        """Show a pairplot of the predictors together with the response.

        Args:
            X: DataFrame of predictors; every column is plotted.
            y: Response, concatenated column-wise onto ``X``.
            predictions: Not used by this check; kept so the signature
                stays compatible with existing callers.
        """
        sns.pairplot(pd.concat([X, y], axis=1))
        plt.suptitle('Linearity Check: Pairplot', y=1.02)
        plt.show()

    @staticmethod
    def check_normality(residuals):
        """Plot a residual histogram and print the Shapiro-Wilk p-value.

        Args:
            residuals: Model residuals.
        """
        sns.histplot(residuals, bins=30, kde=True)
        plt.title('Normality Check: Residuals Histogram')
        plt.show()
        _, p_value = stats.shapiro(residuals)
        print(f"Shapiro-Wilk normality test p-value: {p_value:.5f}")

    @staticmethod
    def check_homoscedasticity(X, predictions, residuals):
        """Plot residuals against predictions, annotated with a BP test.

        Args:
            X: Design matrix handed to ``het_breuschpagan`` as the
                explanatory variables.
            predictions: Fitted values (x-axis of the scatter plot).
            residuals: Model residuals (y-axis of the scatter plot).
        """
        sns.scatterplot(x=predictions, y=residuals)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted values')
        plt.ylabel('Residuals')
        plt.title('Homoscedasticity Check')
        # The test is only attempted when X has at least two columns.
        # NOTE(review): presumably because het_breuschpagan needs a constant
        # plus at least one regressor -- confirm against statsmodels docs.
        if X.shape[1] > 1:
            bp_test = het_breuschpagan(residuals, X)
            bp_test_result = f'BP Test Statistic: {bp_test[0]:.5f}, p-value: {bp_test[1]:.5f}'
        else:
            bp_test_result = 'Breusch-Pagan test not applicable for univariate regression'
        plt.annotate(bp_test_result, xy=(0.05, 0.95), xycoords='axes fraction')
        plt.show()

    @staticmethod
    def check_independence(residuals):
        """Plot residuals in index order and print the Ljung-Box p-value.

        Args:
            residuals: Model residuals, in observation order.
        """
        plt.plot(residuals)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.title('Independence Check: Residuals vs. Index')
        plt.show()

        lb_result = acorr_ljungbox(residuals, lags=[10])
        if isinstance(lb_result, pd.DataFrame):
            # Current statsmodels returns a DataFrame. Tuple-unpacking it
            # would yield the column *names*, not the statistics, so read
            # the p-value column explicitly.
            p_value = lb_result["lb_pvalue"].iloc[0]
        else:
            # Legacy return shape: (statistics, p-values) arrays.
            p_value = lb_result[1][0]
        print(f"Ljung-Box test p-value: {p_value:.5f}")

    @staticmethod
    def check_multicollinearity(X):
        """Print per-column VIFs and show a correlation heatmap.

        Args:
            X: DataFrame of predictors; every column gets a VIF entry and
                takes part in the correlation matrix.
        """
        vif_data = pd.DataFrame()
        vif_data["feature"] = X.columns
        design = X.values  # hoisted so the array is extracted only once
        vif_data["VIF"] = [
            variance_inflation_factor(design, i) for i in range(X.shape[1])
        ]
        print("Variance Inflation Factor (VIF)")
        print(vif_data)
        correlation_matrix = X.corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix Heatmap')
        plt.show()
def plot_regression_model(X, y, predictions):
    """Plot observed vs. predicted values for a single-predictor model.

    A column named ``const`` is ignored when other columns are present.
    Without this, a design matrix carrying an intercept column would never
    satisfy the one-predictor condition, and the first column plotted would
    be the constant rather than the predictor.

    Args:
        X: DataFrame of predictors, optionally including a ``const`` column.
        y: Observed response values.
        predictions: Fitted values aligned with ``X``.
    """
    features = X
    if X.shape[1] > 1 and "const" in X.columns:
        features = X.drop(columns="const")

    if features.shape[1] == 1:
        predictor = features.iloc[:, 0]
        sns.scatterplot(x=predictor, y=y, color='blue', label='Observed')
        sns.scatterplot(x=predictor, y=predictions, color='red', label='Predicted')
        sns.regplot(x=predictor, y=predictions, color='green', label='Regression Line', scatter=False)
        plt.title('Regression Model')
        plt.legend()
    else:
        print("Multivariate plot not available for more than one predictor variable.")
    plt.show()
def make_predictions(model, new_data):
    """Predict with ``model`` on ``new_data`` after adding an intercept column.

    Args:
        model: Object exposing ``predict(exog)``.
        new_data: New observations. A DataFrame that already has a
            ``const`` column is passed through unchanged.

    Returns:
        Whatever ``model.predict`` returns for the augmented data.
    """
    if isinstance(new_data, pd.DataFrame):
        if "const" in new_data.columns:
            exog = new_data
        else:
            # The default has_constant='skip' silently adds nothing when any
            # existing column is constant -- always the case for a single
            # row -- which leaves the data without the intercept column.
            exog = sm.add_constant(new_data, has_constant="add")
    else:
        # Non-DataFrame input keeps the original default behaviour.
        exog = sm.add_constant(new_data)
    return model.predict(exog)
def linear_regression_and_check_assumptions(X, y):
    """Fit an OLS model with an intercept and run every diagnostic on it.

    Each diagnostic is best-effort: an exception raised by one of them is
    printed and the remaining diagnostics still run.

    Args:
        X: Predictors; must expose ``.empty``.
        y: Response; must expose ``.empty``.

    Returns:
        dict with keys ``"Model Summary"``, ``"Predictions"`` and
        ``"Residuals"``.

    Raises:
        ValueError: If ``X`` or ``y`` is ``None`` or empty.
    """
    if X is None or y is None:
        raise ValueError("Input data (X, y) cannot be None or empty")
    if X.empty or y.empty:
        raise ValueError("Input data (X, y) cannot be None or empty")

    # From here on X is the design matrix including the constant column.
    X = sm.add_constant(X)
    fitted = sm.OLS(y, X).fit()
    fitted_values = fitted.predict(X)
    resid = y - fitted_values

    results = {
        "Model Summary": fitted.summary(),
        "Predictions": fitted_values,
        "Residuals": resid,
    }

    def report_durbin_watson():
        dw_test = durbin_watson(resid)
        print(f"Durbin-Watson statistic: {dw_test:.5f} (Values close to 2 suggest no autocorrelation)")

    # (error-message prefix, diagnostic) pairs, executed in this order.
    diagnostics = (
        ("Error in checking linearity",
         lambda: AssumptionChecks.check_linearity(X, y, fitted_values)),
        ("Error in checking normality",
         lambda: AssumptionChecks.check_normality(resid)),
        ("Error in checking homoscedasticity",
         lambda: AssumptionChecks.check_homoscedasticity(X, fitted_values, resid)),
        ("Error in checking independence",
         lambda: AssumptionChecks.check_independence(resid)),
        ("Error in checking multicollinearity",
         lambda: AssumptionChecks.check_multicollinearity(X)),
        ("Error in Durbin-Watson test",
         report_durbin_watson),
        ("Error in plotting regression model",
         lambda: plot_regression_model(X, y, fitted_values)),
    )
    for failure_prefix, run_diagnostic in diagnostics:
        try:
            run_diagnostic()
        except Exception as exc:
            print(f"{failure_prefix}: {exc}")

    return results
if __name__ == "__main__":
    # Demo run on synthetic data with a fixed seed for reproducibility.
    np.random.seed(0)
    # NOTE(review): X2 is exactly 2 * X1 here, so the two predictors are
    # perfectly collinear -- confirm this is intentional for the demo.
    X = pd.DataFrame({
        'X1': np.linspace(0, 100, 100),
        'X2': np.linspace(0, 200, 100),
    })
    noise = np.random.normal(0, 10, size=100)
    y = 3 * X['X1'] + 2 * X['X2'] + noise

    results = linear_regression_and_check_assumptions(X, y)
    print(results["Model Summary"])