-
Notifications
You must be signed in to change notification settings - Fork 0
/
titanic.py
114 lines (92 loc) · 3.96 KB
/
titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
# Importing the dataset
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file exists.
df = pd.read_csv("/Users/pb/Downloads/titanic/train.csv")

# Encoding column 'Sex': replace the string labels with integer codes
label_encoder = LabelEncoder()
df["Sex"] = label_encoder.fit_transform(df["Sex"])

# Handling missing values in the "Age" column by imputing the column mean.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the df['Age'] selection: that form is chained
# assignment, which newer pandas versions warn about and which does not
# update df when Copy-on-Write is enabled.
# NOTE(review): the mean is taken over all rows before the train/test split,
# so test rows influence the imputed value.
mean_age = df["Age"].mean()
df["Age"] = df["Age"].fillna(mean_age)

# Splitting into X and y
# Features: every column except the target and the identifier / free-text /
# unused columns listed here.
X = df.drop(["Survived", "Embarked", "PassengerId", "Name", "Ticket", "Cabin"], axis=1)
y = df["Survived"]  # Target

# Split the data into training and test sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create an instance of the Random Forest model (100 trees, with the
# out-of-bag score computed during fitting)
model = RandomForestClassifier(n_estimators=100, oob_score=True)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: " + str(accuracy * 100) + "%")

# Accuracy on the training data itself, as a percentage with two decimals.
# (The earlier bare model.score(...) call discarded its result and was dropped.)
acc_random_forest = round(model.score(X_train, y_train) * 100, 2)

# Scale to a percentage first and round afterwards: rounding before the
# multiplication can print float artefacts such as 81.17999999999999.
print("oob score:", round(model.oob_score_ * 100, 2), "%")
print("acc_random_forest", acc_random_forest)
# Predict probabilities for the training data and keep only the column for
# the positive class (index 1)
# NOTE(review): these scores come from the same rows the model was fitted on,
# so the ROC-AUC and the curves derived from them are likely optimistic.
y_scores = model.predict_proba(X_train)[:, 1]

# ROC-AUC score
r_a_score = roc_auc_score(y_train, y_scores)
print("ROC-AUC-Score:", r_a_score)

# Confusion Matrix, built from 3-fold cross-validated predictions on the
# training set. The matrix was previously computed and thrown away; print it
# so the script actually reports it.
predictions = cross_val_predict(model, X_train, y_train, cv=3)
print("Confusion Matrix:\n", confusion_matrix(y_train, predictions))

# Precision and Recall
print("Precision:", precision_score(y_train, predictions))
print("Recall:", recall_score(y_train, predictions))

# F1 Score (previously computed and discarded; now reported)
print("F1 Score:", f1_score(y_train, predictions))

# Precision, Recall and Threshold values used by the plots below
precision, recall, threshold = precision_recall_curve(y_train, y_scores)
def plot_precision_and_recall(precision, recall, threshold):
    """Draw precision and recall against the decision threshold.

    Both curves go onto the current pyplot figure: precision as a red solid
    line and recall as a blue line.

    Args:
        precision: Sequence of precision values.
        recall: Sequence of recall values.
        threshold: Sequence of thresholds used as the x-axis.
    """
    # (values, format string, legend label) for each curve, in draw order.
    curves = (
        (precision, "r-", "precision"),
        (recall, "b", "recall"),
    )
    for values, fmt, name in curves:
        # The final element of each metric is dropped before plotting,
        # presumably because the metric arrays are one longer than `threshold`.
        plt.plot(threshold, values[:-1], fmt, label=name, linewidth=5)

    plt.xlabel("threshold", fontsize=19)
    plt.legend(loc="upper right", fontsize=19)
    plt.ylim([0, 1])
# Open a new 14x7 figure first so the curves are drawn on it, then display it.
plt.figure(figsize=(14, 7))
plot_precision_and_recall(precision, recall, threshold)
plt.show()
# Precision vs Recall Plot
def plot_precision_vs_recall(precision, recall):
    """Draw the precision-recall curve on the current pyplot figure.

    Recall is plotted on the x-axis and precision on the y-axis, as a green
    dashed line.

    Args:
        precision: Sequence of precision values (y-axis).
        recall: Sequence of recall values (x-axis), same length as
            ``precision``.
    """
    plt.plot(recall, precision, "g--", linewidth=2.5)
    # The labels must follow the plot() argument order above: x is recall and
    # y is precision. They were previously swapped.
    plt.xlabel("recall", fontsize=19)
    plt.ylabel("precision", fontsize=19)
    plt.axis([0, 1.5, 0, 1.5])
# Open a new 14x7 figure, draw the precision-vs-recall curve on it, display it.
plt.figure(figsize=(14, 7))
plot_precision_vs_recall(precision, recall)
plt.show()
# True positive rate vs False positive rate Plot
# Compute the ROC curve points from the training labels and the
# positive-class scores; the two rates are plotted against each other below.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_scores)
def plot_roc_curve(false_positive_rate, true_positive_rate, label=None):
    """Draw a ROC curve and a red diagonal on the current pyplot figure.

    Args:
        false_positive_rate: Sequence of FPR values (x-axis).
        true_positive_rate: Sequence of TPR values (y-axis).
        label: Optional label attached to the ROC line.
    """
    # End points of the diagonal from (0, 0) to (1, 1).
    diagonal = [0, 1]

    plt.plot(false_positive_rate, true_positive_rate, label=label, linewidth=2)
    plt.plot(diagonal, diagonal, 'r', linewidth=4)

    # Clamp both axes to the unit interval.
    plt.axis([0, 1, 0, 1])

    plt.ylabel('True Positive Rate (TPR)', fontsize=16)
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
# Open a new 14x7 figure, draw the ROC curve on it, and display it.
plt.figure(figsize=(14, 7))
plot_roc_curve(false_positive_rate, true_positive_rate)
plt.show()