-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
122 lines (93 loc) · 4.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import argparse
from tqdm import tqdm
import time
import utils
import adaboost
def cross_validation_adaboost(X, y, fold_indices, n_iterations):
"""
Perform k-fold cross-validation for AdaBoost and return the mean and standard deviation of errors.
"""
errors = []
for train_index, val_index in fold_indices:
X_train, X_val = X[train_index], X[val_index]
y_train, y_val = y[train_index], y[val_index]
model = adaboost.AdaBoost(n_iterations=n_iterations)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
accuracy = np.sum(np.equal(y_val, y_pred)) / len(y_val)
error = 1 - accuracy
errors.append(error)
mean_error = np.mean(errors)
std_error = np.std(errors)
return mean_error, std_error
def eval_adaboost(X_train, X_test, y_train, y_test, n_iterations):
"""
Evaluate AdaBoost on the test set and return the error.
"""
model = adaboost.AdaBoost(n_iterations)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = np.sum(np.equal(y_test, y_pred)) / len(y_test)
error = 1 - accuracy
return error
def load_and_clean_data(filepath):
"""
Load and clean the dataset.
"""
columns = [i for i in range(1, 10)]
columns.append("target")
data = pd.read_csv(filepath, names=columns)
# Define the mapping for replacements
replace_map = {'x': 1, 'o': -1, 'b': 0, 'positive': 1, 'negative': -1}
# Apply the mapping to each column
for col in data.columns:
data[col] = data[col].map(replace_map).astype(int)
return data
def main(filepath, test_size, k_folds, max_iterations):
# Load and preprocess data
data = load_and_clean_data(filepath)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
# Split data into training and testing sets
X_train, X_test, y_train, y_test = utils.train_test_split(X, y, test_size=test_size, random_seed=42)
# Generate fold indices for cross-validation
fold_indices = utils.kfold_indices(X_train, k_folds, random_seed=42)
# Perform cross-validation and evaluation
errors_means_train = []
errors_stds_train = []
errors_test = []
time_to_train = []
for n_iterations in tqdm(range(1, max_iterations + 1), desc ="Progress"):
start = time.time()
mean_error, std_error = cross_validation_adaboost(X_train, y_train, fold_indices, n_iterations)
time_to_train.append(time.time() - start)
errors_means_train.append(mean_error)
errors_stds_train.append(std_error)
test_error = eval_adaboost(X_train, X_test, y_train, y_test, n_iterations)
errors_test.append(test_error)
# Plot the errors
utils.plot_errors(errors_means_train, errors_stds_train, errors_test)
utils.plot_time(time_to_train)
print(f"Last error: {errors_test[-1]}")
min_val_error = min(errors_means_train)
acceptable_error_threshold = min_val_error * 1.05
for i, score in enumerate(errors_means_train):
if score <= acceptable_error_threshold:
best_n_estimators = i
break
print(f"Best number of estimators: {best_n_estimators}")
best_test_error = eval_adaboost(X_train, X_test, y_train, y_test, best_n_estimators)
print(f"Best estimators error: {best_test_error}")
print(f"Best estimators accuracy: {1 - best_test_error}")
print("Evaluation process completed successfully!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="AdaBoost Cross-Validation")
parser.add_argument('--filepath', type=str, required=True, help='Path to the dataset file')
parser.add_argument('--test_size', type=float, default=0.2, help='Proportion of the dataset to include in the test split')
parser.add_argument('--k_folds', type=int, default=5, help='Number of folds for cross-validation')
parser.add_argument('--max_iterations', type=int, default=1000, help='Maximum number of boosting iterations')
args = parser.parse_args()
main(filepath=args.filepath, test_size=args.test_size, k_folds=args.k_folds, max_iterations=args.max_iterations)