-
Notifications
You must be signed in to change notification settings - Fork 1
/
susy.py
75 lines (57 loc) · 3.33 KB
/
susy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import argparse
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from time import time
from falkon import Falkon
from utility.kernel import *
def main(path, semi_supervised, kernel_function, max_iterations, gpu):
    """Train and evaluate a Falkon kernel classifier on a SUSY-style dataset.

    Parameters
    ----------
    path : str
        Path to a NumPy ``.npy`` file; column 0 holds the binary label
        (0/1), the remaining columns are the features.
    semi_supervised : float
        Fraction in [0, 1] of training labels to blank out (set to 0) to
        simulate a semi-supervised setting.
    kernel_function : str
        Kernel name forwarded to the project's ``Kernel`` wrapper.
    max_iterations : int
        Cap on the optimizer iterations inside ``Falkon``.
    gpu : bool
        Whether kernel evaluations run on the GPU.
    """
    # loading dataset as ndarray
    dataset = np.load(path).astype(np.float32)
    print("Dataset loaded ({} points, {} features per point)".format(dataset.shape[0], dataset.shape[1] - 1))

    # map labels from {0, 1} to {-1, 1}, the range the solver expects
    dataset[:, 0] = (2 * dataset[:, 0]) - 1

    # defining train and test set (random_state=None: a fresh split each run)
    x_train, x_test, y_train, y_test = train_test_split(dataset[:, 1:], dataset[:, 0], test_size=0.2, random_state=None)
    print("Train and test set defined (test: {} + , train: {} +, {} -)".format(np.sum(y_test == 1.), np.sum(y_train == 1.), np.sum(y_train == -1.)))

    # removing some labels if semi_supervised > 0 (0 marks an unlabelled point)
    labels_removed = int(len(y_train) * semi_supervised)
    if labels_removed > 0:
        y_train[np.random.choice(len(y_train), labels_removed, replace=False)] = 0
        print("{} labels removed".format(labels_removed))

    # removing the mean and scaling to unit variance; statistics are fit on
    # the training split only to avoid leaking test-set information
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    print("Standardization done")

    # choosing kernel function
    kernel = Kernel(kernel_function=kernel_function, gpu=gpu)

    # fitting falkon
    print("Starting falkon fitting routine...")
    falkon = Falkon(nystrom_length=10000, gamma=1e-6, kernel_fun=kernel.get_kernel(), kernel_param=4, optimizer_max_iter=max_iterations, gpu=gpu)
    start_ = time()
    falkon.fit(x_train, y_train)
    print("Fitting time: {:.3f} seconds".format(time() - start_))

    # testing falkon: accuracy on the sign of the scores, AUC on raw scores
    print("Starting falkon testing routine...")
    y_pred = falkon.predict(x_test)
    accuracy = accuracy_score(y_test, np.sign(y_pred))
    auc = roc_auc_score(y_test, y_pred)
    print("Accuracy: {:.3f} - AUC: {:.3f}".format(accuracy, auc))
if __name__ == '__main__':
    # Command-line entry point: parse options and delegate to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", metavar='path', type=str, help='path of the dataset used for this test')
    parser.add_argument("--kernel", metavar='ker', type=str, default='gaussian', help='choose the kernel function')
    parser.add_argument("--semi_supervised", metavar='ss', type=float, default=0., help='percentage of elements [0, 1] to remove the label')
    parser.add_argument("--max_iterations", type=int, default=20, help="specify the maximum number of iterations during the optimization")
    # BUG FIX: `type=bool` converts any non-empty string (including "False")
    # to True, so `--gpu False` silently enabled the GPU. `store_true` gives
    # a proper on/off flag with the same default (False).
    parser.add_argument("--gpu", action='store_true', help='enable the GPU')
    args = parser.parse_args()
    main(path=args.dataset, kernel_function=args.kernel, semi_supervised=args.semi_supervised, max_iterations=args.max_iterations, gpu=args.gpu)