-
Notifications
You must be signed in to change notification settings - Fork 1
/
semi_million.py
121 lines (101 loc) · 5.77 KB
/
semi_million.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import argparse
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from time import time
from falkon import Falkon
from utility.kernel import Kernel
def labelling(y_pred, balance_constraint, lam0, theta0, max_violation, max_iterations=200):
    """Search for a multiplier whose induced labels satisfy the balance constraint.

    At every iteration the labels are recomputed with ``get_best_labels`` for
    the current multiplier, and the violation is measured as
    ``sum(labels) - len(y_pred) * balance_constraint``. The search stops as
    soon as ``abs(violation) < max_violation``.

    Until a labelling with negative violation AND one with non-negative
    violation have both been seen, the multiplier is moved by
    ``step * violation`` and the step is shrunk by a factor 0.9. Once both are
    available, the multiplier is set to the value where the two objectives
    ``sum((y_pred - y)**2) + lam * sum(y)`` are equal (plane intersection).

    Args:
        y_pred: array of functional margins (must expose ``.shape``).
        balance_constraint: target value for ``sum(labels) / len(y_pred)``.
        lam0: initial multiplier.
        theta0: initial step size for the multiplier update.
        max_violation: tolerance on the absolute violation.
        max_iterations: upper bound on the number of iterations.

    Returns:
        A tuple ``(best_labels, lam, idx)``. ``best_labels`` is the last
        labelling computed and ``idx`` the zero-based index of the last
        iteration executed (both ``None`` if the loop never ran). When the
        tolerance is met, ``lam`` is the multiplier that produced
        ``best_labels``; when the iterations are exhausted, ``lam`` has
        already been updated once more after ``best_labels`` was computed.
    """
    multiplier = lam0
    step = theta0
    idx = None
    labels = None
    below = None  # last labelling whose violation was negative
    above = None  # last labelling whose violation was non-negative

    for idx in range(max_iterations):
        labels = get_best_labels(y_pred, multiplier)
        violation = np.sum(labels) - (y_pred.shape[0] * balance_constraint)

        if abs(violation) < max_violation:
            break

        if violation < 0:
            below = labels
        else:
            above = labels

        if below is None or above is None:
            # only one side of the constraint seen so far: damped step
            multiplier = multiplier + (step * violation)
            step = step * 0.9
            continue

        # plane intersection
        loss_gap = np.sum(np.power(y_pred - below, 2)) - np.sum(np.power(y_pred - above, 2))
        label_gap = np.sum(above) - np.sum(below)
        multiplier = loss_gap / label_gap

    return labels, multiplier, idx
def get_best_labels(functional_margin, lam):
    """Choose, element-wise, the label in {-1, +1} with the lower penalised cost.

    The cost of labelling a margin ``f`` as +1 is ``(f - 1)**2 + lam`` and the
    cost of labelling it as -1 is ``(f + 1)**2 - lam``. The label +1 is chosen
    only when its cost is strictly lower; ties resolve to -1.

    Args:
        functional_margin: margins as an ndarray or any array-like (list,
            tuple, scalar); converted with ``np.asarray`` so plain Python
            sequences are accepted as well.
        lam: multiplier added to the +1 cost and subtracted from the -1 cost.

    Returns:
        Float array with the same shape as the input, holding ``1.0`` or
        ``-1.0`` per element.
    """
    # np.asarray is a no-op for ndarrays and makes lists/tuples usable below
    margins = np.asarray(functional_margin)
    positive_cost = np.power(margins - 1, 2) + lam
    negative_cost = np.power(margins + 1, 2) - lam
    # boolean mask -> {0, 1} -> {-1.0, +1.0}
    return 2.0 * (negative_cost > positive_cost) - 1
def main(path, kernel_function, max_iterations, gpu):
    """Run the semi-supervised Falkon experiment and print its metrics.

    Steps, in order:
      1. load the dataset and cut it into train / test by fixed row ranges;
      2. split the train rows into a labeled (30%) and an unlabeled (70%) part;
      3. binarise the targets to {-1, +1} and standardise the features;
      4. train a first model on the labeled part and report test metrics;
      5. run the annealing loop: relabel the unlabeled part with ``labelling``
         and refit on labeled + unlabeled data, with the weight of the
         unlabeled examples growing at each pass;
      6. report the final test metrics.

    Args:
        path: file passed to ``np.load``; column 0 is used as the target and
            the remaining columns as features.
        kernel_function: kernel name forwarded to ``Kernel``.
        max_iterations: forwarded to ``Falkon`` as ``optimizer_max_iter``.
        gpu: forwarded to both ``Kernel`` and ``Falkon``.

    Returns:
        None. All results are reported with ``print``.
    """
    # loading dataset as ndarray
    dataset = np.load(path).astype(np.float32)
    print("Dataset loaded ({} points, {} features per point)".format(dataset.shape[0], dataset.shape[1] - 1))

    # defining train and test set
    # NOTE(review): the row ranges are hard-coded, so the file is presumably
    # expected to hold at least 515345 rows -- confirm against the dataset used.
    x_train = dataset[0:463715, 1:]
    x_test = dataset[463715:515345, 1:]
    y_train = dataset[0:463715, 0]
    y_test = dataset[463715:515345, 0]
    print("Train and test set defined")

    # creating the unsupervised part of the dataset (using x_train)
    labeled_ids, unlabeled_ids = train_test_split(range(x_train.shape[0]), test_size=0.7, random_state=42)
    x_labeled, y_labeled = x_train[labeled_ids, :], y_train[labeled_ids]
    x_unlabeled, y_unlabeled = x_train[unlabeled_ids, :], y_train[unlabeled_ids]
    print("Labeled examples {}, Unlabeled examples {}".format(x_labeled.shape[0], x_unlabeled.shape[0]))

    # labels binarization (-1 from 1922 to 2002, 1 from 2002 to 2011) -- balanced (labeled) dataset
    y_labeled, y_test = (y_labeled >= 2002).astype(np.float32), (y_test >= 2002).astype(np.float32)
    y_unlabeled = (y_unlabeled >= 2002).astype(np.float32)
    # map {0, 1} to {-1, +1}
    y_labeled, y_unlabeled = (2 * y_labeled) - 1, (2 * y_unlabeled) - 1
    y_test = (2 * y_test) - 1

    # removing the mean and scaling to unit variance
    x_scaler = StandardScaler()
    x_scaler.fit(x_train)  # using labeled + unlabeled part
    x_labeled, x_unlabeled = x_scaler.transform(x_labeled), x_scaler.transform(x_unlabeled)
    x_test = x_scaler.transform(x_test)
    print("Standardization done")

    # choosing kernel function
    kernel = Kernel(kernel_function=kernel_function, gpu=gpu)

    # training (labeled examples only)
    print("First training...")
    falkon = Falkon(nystrom_length=round(np.sqrt(x_labeled.shape[0])), gamma=1e-6, kernel_fun=kernel.get_kernel(), kernel_param=6, optimizer_max_iter=max_iterations, gpu=gpu)
    falkon.fit(x_labeled, y_labeled)
    functional_margin = falkon.predict(x_test)

    # initial Accuracy, AUC_ROC
    accuracy = accuracy_score(y_test, np.sign(functional_margin))
    auc_roc = roc_auc_score(y_test, functional_margin)
    print("Accuracy: {:.4f} - AUC: {:.4f}".format(accuracy, auc_roc))

    print("Annealing loop...")
    # margins of the first model on the unlabeled part seed the first relabelling
    functional_margin = falkon.predict(x_unlabeled)
    falkon = Falkon(nystrom_length=10000, gamma=1e-6, kernel_fun=kernel.get_kernel(), kernel_param=6, optimizer_max_iter=max_iterations, gpu=gpu)
    balance_constraint = (2 * 0.5) - 1  # 2r - 1
    tic = time()
    # the weight given to the unlabeled examples grows at every pass
    for idx, weight in enumerate([0.1, 0.15, 0.25, 1.]):
        print(" -> iteration {}".format(idx+1))
        lam0 = ((2/x_unlabeled.shape[0])*np.sum(functional_margin)) - (2*balance_constraint)
        # tolerance on the balance violation: 0.5% of the unlabeled examples
        y_u, lam, _iter = labelling(functional_margin, balance_constraint, lam0, 1., int(x_unlabeled.shape[0]*0.005))
        print(" -> [debug info] balance constraint {:.2}".format(np.divide(np.sum(y_u), x_unlabeled.shape[0])))
        print(" -> [debug info] lambda from {:.3e} to {:.3e} in {} iterations".format(lam0, lam, _iter+1))
        print(" -> [debug info] wrong labels {}".format(np.sum(y_u != y_unlabeled)))
        # labeled examples keep weight 1, unlabeled ones get the current weight
        sample_weights = ([1.] * x_labeled.shape[0]) + ([weight] * x_unlabeled.shape[0])
        falkon.fit(np.vstack((x_labeled, x_unlabeled)), np.concatenate((y_labeled, y_u)).astype(np.float32), sample_weights=sample_weights)
        functional_margin = falkon.predict(x_unlabeled)
    print("Annealing done in {:.3} seconds".format(time()-tic))

    # testing falkon
    print("Starting falkon testing routine...")
    # one prediction pass over the test set; its result feeds both metrics
    functional_margin = falkon.predict(x_test)
    accuracy = accuracy_score(y_test, np.sign(functional_margin))
    auc_roc = roc_auc_score(y_test, functional_margin)
    print("Accuracy: {:.3f} - AUC_ROC: {:.3f}".format(accuracy, auc_roc))
def parse_bool_flag(text):
    """Convert a command-line string into a bool.

    ``type=bool`` is not usable with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so ``--gpu False`` would
    silently enable the GPU. This converter interprets the text instead.

    Args:
        text: raw string received from the command line.

    Returns:
        ``True`` for 1/true/t/yes/y/on, ``False`` for 0/false/f/no/n/off or an
        empty string (case-insensitive, surrounding whitespace ignored).

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    value = text.strip().lower()
    if value in ("1", "true", "t", "yes", "y", "on"):
        return True
    if value in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(text))


if __name__ == '__main__':
    # command-line entry point: parse the options and run the experiment
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", metavar='path', type=str, help='path of the dataset used for this test')
    parser.add_argument("--kernel", metavar='ker', type=str, default='gaussian', help='choose the kernel function')
    parser.add_argument("--max_iterations", type=int, default=20, help="specify the maximum number of iterations during the optimization")
    # parse_bool_flag (not bool) so that "--gpu False" really disables the GPU
    parser.add_argument("--gpu", type=parse_bool_flag, default=False, help='enable the GPU')

    args = parser.parse_args()

    main(path=args.dataset, kernel_function=args.kernel, max_iterations=args.max_iterations, gpu=args.gpu)