learn.py
import json
import os
import sqlite3
from datetime import datetime
import numpy as np
from scipy.optimize import minimize
from scipy.stats import norm
from sklearn.model_selection import KFold, train_test_split
from xdg.BaseDirectory import xdg_data_home
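# Learn a distance metric over bliss song features from survey triplets: each
# training_triplet row names two similar songs and an "odd one out", and we fit
# a linear transform L (equivalently a Mahalanobis matrix M = L @ L.T) that
# makes the similar pair the closest, by maximum likelihood. The learned M is
# then written back into the bliss-rs config for blissify to use.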
config_path = os.path.join(xdg_data_home, "bliss-rs/config.json")
with open(config_path, "r") as f:
    config = json.load(f)
database_path = config["database_path"]
con = sqlite3.connect(database_path)
cur = con.cursor()
# Absolutely shameless code
query = cur.execute(
    """
    select song_id, training_triplet.id, 1 as song_number, feature, feature.feature_index from feature
    inner join training_triplet on feature.song_id = training_triplet.song_1_id
    union all
    select song_id, training_triplet.id, 2 as song_number, feature, feature.feature_index from feature
    inner join training_triplet on feature.song_id = training_triplet.song_2_id
    union all
    select song_id, training_triplet.id, 3 as song_number, feature, feature.feature_index from feature
    inner join training_triplet on feature.song_id = training_triplet.odd_one_out_id
    order by training_triplet.id, song_number, feature.feature_index
    """
)
# Materialize the cursor into a list, so we can iterate over it more than once.
query = list(query)
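# Each row is (song_id, triplet_id, song_number, feature_value, feature_index);
# the ORDER BY guarantees each song's features arrive in feature_index order,
# so collecting them in sequence rebuilds the song's feature vector.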
triplets = []
ids = set(t for _, t, _, _, _ in query)
for triplet_id in ids:
    current_triplet_list = [(i, p, f) for i, tid, p, f, _ in query if tid == triplet_id]
    song1_features = np.array([f for _, p, f in current_triplet_list if p == 1])
    song2_features = np.array([f for _, p, f in current_triplet_list if p == 2])
    song3_features = np.array([f for _, p, f in current_triplet_list if p == 3])
    triplets.append(np.array([song1_features, song2_features, song3_features]))
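# Each entry of triplets is a (3, n_features) array: rows 0 and 1 are the
# similar pair, row 2 is the odd one out.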
def d_metric(x1, x2, L=None):
    # Argument-order adapter, so the learned metric can be passed around as a
    # plain d(x1, x2) distance function.
    return d(L, x1, x2)


def d(L, x1, x2):
    # Distance induced by the (flattened) linear transform L:
    # d(x1, x2) = sqrt((x1 - x2)^T L L^T (x1 - x2))
    L = L.reshape(len(x1), len(x1))
    sqrd = ((x1 - x2).dot(L.dot(np.transpose(L)))).dot(x1 - x2)
    return np.sqrt(sqrd)


def grad_d(L, x1, x2):
    # Chain rule: d = sqrt(d^2), hence grad d = grad(d^2) / (2 d).
    return grad_d_squared(L, x1, x2) / (2 * d(L, x1, x2))


def grad_d_squared(L, x1, x2):
    # Gradient of (x1 - x2)^T L L^T (x1 - x2) with respect to L is
    # 2 (x1 - x2)(x1 - x2)^T L, returned flattened to match L's layout.
    L = L.reshape(len(x1), len(x1))
    grad = 2 * np.outer(x1 - x2, x1 - x2).dot(L)
    return grad.ravel()
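# Note: optimizing over L rather than over M = L @ L.T directly keeps M
# positive semi-definite by construction, so d is always a valid
# (pseudo-)metric.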
# x3 is the odd one out: the survey says x1 and x2 are the similar pair, so
# d(x1, x2) should be the smallest of the three pairwise distances.
def delta(L, x1, x2, x3, sigma, second_batch=False):
    if second_batch:
        return (d(L, x1, x3) - d(L, x1, x2)) / sigma
    return (d(L, x2, x3) - d(L, x1, x2)) / sigma


def grad_delta(L, x1, x2, x3, sigma, second_batch=False):
    if second_batch:
        return (grad_d(L, x1, x3) - grad_d(L, x1, x2)) / sigma
    return (grad_d(L, x2, x3) - grad_d(L, x1, x2)) / sigma
def p(L, x1, x2, x3, sigma, second_batch=False):
    # Probability that the triplet constraint holds, modelled as the normal
    # CDF of the distance margin.
    cdf = norm.cdf(delta(L, x1, x2, x3, sigma, second_batch))
    if cdf == 0:
        # A zero probability makes log_p blow up; print the offending margin.
        print(delta(L, x1, x2, x3, sigma, second_batch))
    return cdf


def grad_p(L, x1, x2, x3, sigma, second_batch=False):
    # Phi'(x) = phi(x), then the chain rule through delta.
    return norm.pdf(delta(L, x1, x2, x3, sigma, second_batch)) * grad_delta(
        L, x1, x2, x3, sigma, second_batch
    )


def log_p(L, x1, x2, x3, sigma, second_batch=False):
    return np.log(p(L, x1, x2, x3, sigma, second_batch))


def grad_log_p(L, x1, x2, x3, sigma, second_batch=False):
    # grad log p = grad p / p
    return grad_p(L, x1, x2, x3, sigma, second_batch) / p(
        L, x1, x2, x3, sigma, second_batch
    )
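# Each triplet (x1, x2, x3) with odd one out x3 yields two constraints, hence
# the two "batches": d(x2, x3) > d(x1, x2) (first) and d(x1, x3) > d(x1, x2)
# (second). The loss below sums the negative log-likelihood over both.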
def opti_fun(L, X, sigma, l):
    # Negative log-likelihood over both constraint batches, plus an L2
    # (Frobenius) penalty l * ||L||^2 to regularize the transform.
    batch_1 = -sum(np.array([log_p(L, x1, x2, x3, sigma) for x1, x2, x3 in X]))
    batch_2 = -sum(np.array([log_p(L, x1, x2, x3, sigma, True) for x1, x2, x3 in X]))
    return batch_1 + batch_2 + l * np.sum(L**2)


def grad_opti_fun(L, X, sigma, l):
    batch_1 = -np.sum(
        np.array([grad_log_p(L, x1, x2, x3, sigma) for x1, x2, x3 in X]),
        0,
    )
    batch_2 = -np.sum(
        np.array([grad_log_p(L, x1, x2, x3, sigma, True) for x1, x2, x3 in X]),
        0,
    )
    # The penalty l * sum(L**2) differentiates to 2 * l * L.
    return batch_1 + batch_2 + 2 * l * L
def percentage_preserved_distances(L, X):
    # Fraction of triplets for which the metric agrees with the survey, i.e.
    # the similar pair really is the closest of the three pairs.
    count = 0
    for x1, x2, x3 in X:
        d1 = d(L.ravel(), x1, x2)  # distance within the similar pair
        d2 = d(L.ravel(), x2, x3)  # distances to the odd one out
        d3 = d(L.ravel(), x1, x3)
        if (d1 < d2) and (d1 < d3):
            count = count + 1
    return count / len(X)
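# With L = identity, d reduces to the plain Euclidean distance, so calling
# percentage_preserved_distances(L0, ...) below measures the untrained baseline.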
def optimize(L0, X, sigma2, l, method):
    l_dim = len(X[0][0])
    res = minimize(
        opti_fun,
        L0,
        args=(X, sigma2, l),
        jac=grad_opti_fun,
        method=method,
    )
    L = np.reshape(res.x, [l_dim, l_dim])
    return (res, L)
# Methods that converged:
# - L-BFGS-B
# - Newton-CG gave the best results on survey_features, but took forever
# - SLSQP completes, but with middling results (around 2% better than usual)
# - trust-constr brought no more improvement than the rest
method = "L-BFGS-B"
X = np.array(triplets)
l_dim = len(X[0][0])
sigma2 = 2
L0 = np.identity(l_dim).ravel()
L_init = L0
design, test = train_test_split(X, test_size=0.2)
# lambdas = [10, 50, 100, 200, 500, 1000, 2500, 5000]
lambdas = [0.0, 0.001, 0.01, 0.1, 1, 50, 100, 500, 1000, 5000]
accuracies = [[] for _ in lambdas]
accuracies_euclidean = []
print("Started {}".format(datetime.now()))
kf = KFold(n_splits=5)
rounds = 0
# Cross-validate on the design split only, so the held-out test set plays no
# part in selecting lambda.
for train_index, test_index in kf.split(design):
    rounds = rounds + 1
    X_train, X_test = design[train_index], design[test_index]
    print("Doing fold {}...".format(rounds))
    accuracies_euclidean.append(percentage_preserved_distances(L0, X_test))
    print("Euclidean accuracy is {}".format(accuracies_euclidean[-1]))
    for i, l in enumerate(lambdas):
        res, L = optimize(L_init, X_train, sigma2, l, method)
        print(f"Optimizing was a success? {res.success}")
        accuracy = percentage_preserved_distances(L, X_test)
        accuracies[i].append(accuracy)
        print("Done for lambda = {}, accuracy is {}".format(l, accuracy))
mean_accuracies = np.array(
    [np.mean(local_accuracies) for local_accuracies in accuracies]
)
idx = mean_accuracies.argmax()
max_accuracy = mean_accuracies[idx]
l = lambdas[idx]
print("Mean accuracy for euclidean is: {}".format(np.mean(accuracies_euclidean)))
print("Best accuracy is {} for lambda = {}\n".format(max_accuracy, l))
res, L = optimize(L_init, design, sigma2, l, method)
print("At the end of the day:")
print(
    "Accuracy for non-trained metric on the test set: {}".format(
        percentage_preserved_distances(L0, test)
    )
)
print(
    "Accuracy for trained metric on the test set: {}".format(
        percentage_preserved_distances(L, test)
    )
)
# Retrain on every triplet before saving the final matrix.
res, L_total = optimize(L_init, X, sigma2, l, method)
M = L_total.dot(L_total.transpose())
np.save("L_total", L_total)
np.save("M", M)
with open(config_path, "w") as f:
    config["m"] = {
        "v": 1,
        "dim": M.shape,
        "data": M.ravel().tolist(),
    }
    json.dump(config, f, indent=2)
# If you want to load M back from the config:
loaded_M = np.array(config["m"]["data"]).reshape(config["m"]["dim"])
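# Hypothetical usage sketch (a and b are placeholder feature vectors, not
# defined in this script): the distance between two songs under the learned
# metric is the usual Mahalanobis form:
#   dist = np.sqrt((a - b) @ loaded_M @ (a - b))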
print(f"Done {datetime.now()}, but was it a success? {res}")
print(
    f"""
The configuration file has been saved at {config_path}. You can use it
with `blissify playlist 300 --mahalanobis`.
"""
)