# This project involves predicting whether a loan application is approved or not
# First we undertake EDA, then create a prediction model
# Dataset: https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/
# __________________________________________________________________________________________
# Setup
import pandas as pd
import numpy as np # For mathematical calculations
import seaborn as sns # For data visualization
import matplotlib.pyplot as plt # For plotting graphs
import warnings
warnings.filterwarnings("ignore") # Ignore any warnings
import os
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "images"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True) # Ensure the images directory exists before saving
# Function to save figures
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=450):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
train = pd.read_csv('TrainData-LoanPrediction.csv')
test = pd.read_csv('TestData-LoanPrediction.csv')
train_original = train.copy() #copy of train data, to preserve original data from changes
test_original = test.copy()
print(train.columns)#check the columns
print(train.dtypes) #check datatypes
print(train.shape, test.shape) #check the number of rows and columns
# __________________________________________________________________________________________
# Univariate Analysis
# Close all graph plots
plt.close('all')
#Categorical counts of Loan_Status
print("Loan_Status counts: \n",train['Loan_Status'].value_counts())
# 422 approved and 192 not approved
#Categorical proportions of Loan_Status
print("Loan_Status proportions: \n",train['Loan_Status'].value_counts(normalize = True))
# ~69% approved and ~31% not approved
#plot Loan_Status
plt.figure(1)
train['Loan_Status'].value_counts().plot.bar(color =('dodgerblue','hotpink')).set_xticklabels( ('Yes', 'No') )
plt.xticks(rotation=0)
plt.suptitle('Loan Status', fontsize=20)
save_fig("Uni_Loan_Status")
# About a third of our loans are classified as not approved, so the classes are imbalanced; we should therefore perform stratified CV.
#Status and background of Applicants
plt.figure(2)
#Plot 1
plt.subplot(221)
train['Gender'].value_counts(normalize=True).plot.bar(figsize=(20,11),title= 'Gender', color =('dodgerblue','hotpink'))
plt.ylim((0,1))
plt.xticks(rotation=0)
#Plot 2
plt.subplot(222)
train['Married'].value_counts(normalize=True).plot.bar(title= 'Married', color =('dodgerblue','hotpink'))
plt.ylim((0,1))
plt.xticks(rotation=0)
#Plot 3
plt.subplot(223)
train['Self_Employed'].value_counts(normalize=True).plot.bar(title= 'Self Employed', color =('dodgerblue','hotpink'))
plt.ylim((0,1))
plt.xticks(rotation=0)
#Plot 4
plt.subplot(224)
train['Credit_History'].value_counts(normalize=True).plot.bar(title= 'Credit History Meeting Requirements', color =('dodgerblue','hotpink')).set_xticklabels( ('Yes', 'No') )
plt.ylim((0,1))
plt.xticks(rotation=0)
#Details
plt.suptitle('Background of Applicants', fontsize=20)
save_fig("Uni_Background_of_Applicants")
plt.figure(3)
#Plot 1
plt.subplot(131)
train['Dependents'].value_counts(normalize=True).plot.bar(figsize=(15,6), title= 'Dependents',color =('dodgerblue'))
plt.xticks(rotation=0)
#Plot 2
plt.subplot(132)
train['Education'].value_counts(normalize=True).plot.bar(title= 'Education',color =('dodgerblue','hotpink'))
plt.xticks(rotation=0)
#Plot 3
plt.subplot(133)
train['Property_Area'].value_counts(normalize=True).plot.bar(title= 'Property_Area', color =('dodgerblue'))
plt.xticks(rotation=0)
#Details
plt.suptitle('Further Background of Applicants', fontsize=20)
save_fig("Uni_Further_Background_of_Applicants")
plt.figure(4)
plt.subplot(121)
sns.histplot(train['ApplicantIncome'], bins=50, kde=True)
plt.subplot(122)
train['ApplicantIncome'].plot.box(figsize=(16,5))
#plt.ylim((0,30000))
plt.suptitle('Applicant Income', fontsize=20)
save_fig("Uni_Applicant_Income")
plt.figure(5)
plt.subplot(121)
sns.histplot(train['CoapplicantIncome'], kde=True)
plt.subplot(122)
train['CoapplicantIncome'].plot.box(figsize=(16,5))
plt.suptitle('Coapplicant Income', fontsize=20)
save_fig("Uni_Coapplicant_Income")
plt.figure(6)
plt.subplot(121)
df=train.dropna()
sns.histplot(df['LoanAmount'], kde=True)
plt.subplot(122)
train['LoanAmount'].plot.box(figsize=(16,5))
plt.suptitle('Loan Amount (Thousands)', fontsize=20)
save_fig("Uni_Loan_Amount")
plt.show()
# __________________________________________________________________________________________
# Bivariate Analysis
#Plot relationship between categorical features and loan status
#The crosstab function tabulates pairs of variables like a matrix/pivot table
Gender = pd.crosstab(train['Gender'],train['Loan_Status'])
Married = pd.crosstab(train['Married'],train['Loan_Status'])
Dependents = pd.crosstab(train['Dependents'],train['Loan_Status'])
Education = pd.crosstab(train['Education'],train['Loan_Status'])
Self_Employed = pd.crosstab(train['Self_Employed'],train['Loan_Status'])
Credit_History = pd.crosstab(train['Credit_History'],train['Loan_Status'])
Property_Area = pd.crosstab(train['Property_Area'],train['Loan_Status'])
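# For illustration, each crosstab is a small pivot table: one row per category of the
# feature, one column per Loan_Status value, with counts in the cells. For example
# (layout only; the actual counts depend on the data):
print(Credit_History)
# Loan_Status       N    Y
# Credit_History
# 0.0             ...  ...
# 1.0             ...  ...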
#Plot of 7 figures
Gender.div(Gender.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
save_fig("Bivariate_Gender")
Married.div(Married.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
save_fig("Bivariate_Married")
Dependents.div(Dependents.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
save_fig("Bivariate_Dependents")
Education.div(Education.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
save_fig("Bivariate_Education")
Self_Employed.div(Self_Employed.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
save_fig("Bivariate_Self_Employed")
Credit_History.div(Credit_History.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
save_fig("Bivariate_Credit_History")
Property_Area.div(Property_Area.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
save_fig("Bivariate_Property_Area")
"""
# __________________________________________________________________________________________
# Create a Loop to Scale up - Practice
1. Store dataframe as a dynamic variable for FeaturesALoanStatus
2. print as a chart
3. Save the file too
4. Bonus being able to do a subplot.
FeaturesALoanStatus = ['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area']
for col in train.columns[1:]:
if col in FeaturesALoanStatus:
print(pd.crosstab(train[col],train['Loan_Status']))
pd.crosstab(train[col],train['Loan_Status']))
"""
#Applicant income variable in categories
bins = [0,2500,4000,6000,81000] # pd.cut intervals are right-closed by default: (0,2500], (2500,4000], (4000,6000], (6000,81000]
groupA = ['Low','Average','High', 'Very high']
train['Income_bin'] = pd.cut(train['ApplicantIncome'],bins,labels = groupA)
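# For illustration, pd.cut assigns each value to its (right-closed) interval label,
# e.g. an income of 2500 falls in 'Low' ((0, 2500]) while 2501 falls in 'Average':
print(pd.cut([2500, 2501, 6000, 6001], bins, labels=groupA))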
Income_bin = pd.crosstab(train['Income_bin'],train['Loan_Status'])
#analyze the corresponding loan status for each bin
Income_bin.div(Income_bin.sum(1).astype(float), axis = 0).plot(title= 'Applicant Income',kind = "bar", stacked = True)
plt.xlabel('ApplicantIncome')
plt.ylabel('Percentage')
save_fig("Bivariant_ApplicantIncome")
#Coapplicant income variable in categories
bins = [0,1000,3000,42000]
groupC = ['Low','Average','High']
train['Coapplicant_Income_bin'] = pd.cut(train['CoapplicantIncome'],bins,labels = groupC)
Coapplicant_Income_bin = pd.crosstab(train['Coapplicant_Income_bin'],train['Loan_Status'])
Coapplicant_Income_bin.div(Coapplicant_Income_bin.sum(1).astype(float), axis = 0).plot(title= 'Co-applicant Income',kind = "bar", stacked = True)
plt.xlabel('CoapplicantIncome')
plt.ylabel('Percentage')
save_fig("Bivariant_CoapplicantIncome")
"""It shows that the lower the coapplicant’s income the chances of loan approval are higher.
But this does sound right. The possible reason behind this may be that most of the
applicants don’t have any coapplicant so the coapplicant income for such applicants is 0
and hence the loan approval is not dependent on it. So we can make a new variable in which
we will combine the applicant’s and coapplicant’s income to visualize the combined effect of income on loan approval."""
train['Total_Income'] = train['ApplicantIncome'] + train['CoapplicantIncome']
bins = [0,2500,4000,6000,81000]
groupT = ['Low','Average','High', 'Very high']
train['Total_Income_bin'] = pd.cut(train['Total_Income'],bins,labels = groupT)
Total_Income_bin = pd.crosstab(train['Total_Income_bin'],train['Loan_Status'])
Total_Income_bin.div(Total_Income_bin.sum(1).astype(float), axis = 0).plot(title= 'Total Income',kind = "bar", stacked = True)
plt.xlabel('Total_Income')
plt.ylabel('Percentage')
save_fig("Bivariant_Total_Income")
#visualize the Loan amount variable
bins = [0,100,200,700]
groupL = ['Low','Average','High']
train['LoanAmount_bin'] = pd.cut(train['LoanAmount'], bins, labels = groupL)
LoanAmount_bin = pd.crosstab(train['LoanAmount_bin'],train['Loan_Status'])
LoanAmount_bin.div(LoanAmount_bin.sum(1).astype(float), axis=0).plot(title= 'Loan Amount', kind="bar", stacked=True)
plt.xlabel('LoanAmount')
plt.ylabel('Percentage')
save_fig("Bivariant_LoanAmount")
#drop the bins which we created for the exploration part
train = train.drop(['Income_bin', 'Coapplicant_Income_bin', 'LoanAmount_bin', 'Total_Income_bin', 'Total_Income'], axis=1)
#Change the '3+' in the Dependents variable to 3 and cast to numeric
train['Dependents'].replace('3+', 3,inplace=True)
test['Dependents'].replace('3+', 3,inplace=True)
train['Dependents'] = train['Dependents'].astype(float)
test['Dependents'] = test['Dependents'].astype(float)
#Convert the target variable's categories into 0 and 1 so we can compute its correlation with numerical variables: replace N with 0 and Y with 1
train['Loan_Status'].replace('N', 0,inplace=True)
train['Loan_Status'].replace('Y', 1,inplace=True)
#create heat map to visualize correlation between all the numerical variables
matrix = train.select_dtypes(include=np.number).corr() # correlate numeric columns only; corr() cannot handle the remaining object columns
fig, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(matrix, vmax=.8, square=True, cmap="BuPu").set_title('Heatmap to show correlation between features')
save_fig("Bivariate_HeatMap")
plt.show()
#most correlated variables are (ApplicantIncome - LoanAmount) and (Credit_History - Loan_Status).
#LoanAmount is also correlated with CoapplicantIncome
# __________________________________________________________________________________________
# Data Cleaning (treating missing values & Outliers)
# Missing data and outliers can have an adverse effect on model performance.
# The models below are built with the open-source scikit-learn library
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
train = pd.read_csv('TrainData-LoanPrediction.csv')
test = pd.read_csv('TestData-LoanPrediction.csv')
train_original = train.copy() #copy of train data, to preserve original data from changes
test_original = test.copy()
print(train.isnull().sum())
#Fill numerical values with the mean and categorical values with the mode.
train['Gender'].fillna(train['Gender'].mode()[0], inplace=True)
train['Married'].fillna(train['Married'].mode()[0], inplace=True)
train['Dependents'].fillna(train['Dependents'].mode()[0], inplace=True)
train['Self_Employed'].fillna(train['Self_Employed'].mode()[0], inplace=True)
train['Credit_History'].fillna(train['Credit_History'].mode()[0], inplace=True)
print(train['Loan_Amount_Term'].value_counts()) #loan amount term 360 is the mode
train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0], inplace=True)
#Use the median to fill the null values in LoanAmount: as we saw earlier, LoanAmount has
#outliers, so the mean is not the proper approach since it is highly affected by them
train['LoanAmount'].fillna(train['LoanAmount'].median(), inplace=True)
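# A quick illustration of why the median is preferred here: with right-skewed data the
# mean sits well above the median (exact values depend on the data):
print("LoanAmount mean:  ", train['LoanAmount'].mean())
print("LoanAmount median:", train['LoanAmount'].median())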
#treat missing values in test data as we did for training data
test['Gender'].fillna(train['Gender'].mode()[0], inplace=True)
test['Dependents'].fillna(train['Dependents'].mode()[0], inplace=True)
test['Self_Employed'].fillna(train['Self_Employed'].mode()[0], inplace=True)
test['Credit_History'].fillna(train['Credit_History'].mode()[0], inplace=True)
test['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0], inplace=True)
test['LoanAmount'].fillna(train['LoanAmount'].median(), inplace=True)
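# Optional sanity check that the columns we filled no longer have missing values:
print(test.isnull().sum())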
# Undertaken to improve the accuracy of predicting loan approvals:
# remove the skewness in LoanAmount with a log transformation; it barely affects smaller values but shrinks larger ones.
train['LoanAmount_log'] = np.log(train['LoanAmount'])
test['LoanAmount_log'] = np.log(test['LoanAmount'])
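# A quick sanity check that the log transform reduced skewness (values closer to 0
# indicate a more symmetric distribution; exact numbers depend on the data):
print("Skewness before log:", train['LoanAmount'].skew())
print("Skewness after log: ", train['LoanAmount_log'].skew())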
# __________________________________________________________________________________________
# Model Training
#Loan_ID is not needed for training, so drop it
train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status #Series dtype
#get_dummies converts categorical data to 0/1 indicator columns, which the model can learn from
X = pd.get_dummies(X)
train = pd.get_dummies(train)
test = pd.get_dummies(test)
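# For illustration, get_dummies turns one categorical column into one 0/1 indicator
# column per category (e.g. Property_Area -> Property_Area_Rural / _Semiurban / _Urban).
# A minimal sketch on a hypothetical toy frame:
demo = pd.DataFrame({'Property_Area': ['Urban', 'Rural', 'Semiurban']})
print(pd.get_dummies(demo))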
#Split train data into train and validation sets (random_state fixed for reproducibility)
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=1)
#Set hyperparameters and train the model
model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                           intercept_scaling=1, max_iter=10000, penalty='l2',
                           random_state=1, solver='liblinear', tol=0.0001,
                           verbose=0, warm_start=False)
model.fit(x_train, y_train)
#Validate the trained model on the validation set
pred_cv = model.predict(x_cv)
score = accuracy_score(y_cv,pred_cv)
print("Model Accuracy = ",score)
#Score was ~0.77 on one run (the exact value depends on the split)
#prediction
pred_test = model.predict(test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_cv, pred_cv)
print(cm)
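# Accuracy alone can be misleading with imbalanced classes, so it may also help to look
# at per-class precision and recall (a quick sketch using sklearn's classification report):
from sklearn.metrics import classification_report
print(classification_report(y_cv, pred_cv))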
# __________________________________________________________________________________________
# Stratified K-Fold
i = 1
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for train_index, test_index in kf.split(X, y):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.loc[train_index], X.loc[test_index]
    ytr, yvl = y[train_index], y[test_index]
    model = LogisticRegression(random_state=1, solver='lbfgs', max_iter=10000)
    model.fit(xtr, ytr)
    pred_val = model.predict(xvl) # per-fold validation predictions (kept separate from pred_test)
    score = accuracy_score(yvl, pred_val)
    print('accuracy_score', score)
    i += 1
kfpred_test = model.predict(test) # test-set predictions from the final fold's model
kfpred = model.predict_proba(xvl)[:,1] # validation probabilities from the final fold
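# The loop above prints one accuracy per fold; aggregating them gives a single, more
# stable estimate. A minimal sketch (re-running the same CV and collecting the scores):
scores = []
for train_index, test_index in kf.split(X, y):
    xtr, xvl = X.loc[train_index], X.loc[test_index]
    ytr, yvl = y[train_index], y[test_index]
    m = LogisticRegression(random_state=1, solver='lbfgs', max_iter=10000)
    m.fit(xtr, ytr)
    scores.append(accuracy_score(yvl, m.predict(xvl)))
print("Mean CV accuracy:", np.mean(scores))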