# -*- coding: utf-8 -*-
import time
import os
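# Pin the BLAS/OpenMP thread pools to a single thread. These variables must be
# set before numpy (and anything else linked against BLAS) is imported to take effect.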
NUM_THREADS = "1"
os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
import numpy as np
from scipy import special
from numba import njit
class MyCTMP:
def __init__(self, rating_GroupForUser, rating_GroupForMovie,
num_movies, num_words, num_topics, user_size, lamb, e, f, alpha, tau, kappa, bernoulli_p, iter_infer):
"""
Arguments:
num_movies: Number of unique words in the corpus (length of the vocabulary).
num_topics: Number of topics shared by the whole corpus.
alpha: Hyperparameter for prior on topic mixture theta.
iter_infer: Number of iterations of FW algorithm
"""
self.rating_GroupForUser = rating_GroupForUser
self.rating_GroupForMovie = rating_GroupForMovie
self.num_movies = num_movies
self.num_words = num_words
self.num_topics = num_topics
self.user_size = user_size
self.lamb = lamb
self.e = e
self.f = f
self.alpha = alpha
self.tau = tau
self.kappa = kappa
self.bernoulli_p = bernoulli_p
self.updatect = 1
self.iter_infer = iter_infer
        # Initialize beta (topics): random here; an LDA-produced beta can be loaded instead:
# self.beta = np.load('./input-data/beta.npy')
self.beta = np.random.rand(self.num_topics, self.num_words) + 1e-10
beta_norm = self.beta.sum(axis=1)
self.beta /= beta_norm[:, np.newaxis]
        # Initialize theta (topic proportions): random here; an LDA-produced theta can be loaded instead:
# self.theta = np.load('./input-data/theta.npy')
self.theta = np.random.rand(self.num_movies, self.num_topics) + 1e-10
theta_norm = self.theta.sum(axis=1)
self.theta /= theta_norm[:, np.newaxis]
# Initialize mu (topic offsets)
self.mu = np.copy(self.theta) # + np.random.normal(0, self.lamb, self.theta.shape)
# Initialize phi (rating's variational parameter)
self.phi = self.get_phi()
# Initialize shp, rte (user's variational parameters)
self.shp = np.ones((self.user_size, self.num_topics)) * self.e
self.rte = np.ones((self.user_size, self.num_topics)) * self.f
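        # Shapes after initialization: beta is (num_topics, num_words); theta and mu
        # are (num_movies, num_topics); shp and rte are (user_size, num_topics);
        # phi is a list of blocks of shape (<=1000, num_movies, num_topics), see get_phi.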
    def get_phi(self):
        """
        Phi conceptually has shape (user_size, num_movies, num_topics), which is a 3D
        matrix of shape (138493, 25900, 50) for ORIGINAL || (1915, 639, 50) for REDUCED.
        For the ORIGINAL data it is not possible (memory-wise) to store that 3D matrix
        in a single numpy array, so it is cut into blocks of at most 1000 users each,
        collected in a list that becomes self.phi.
        """
        phi_matrices = list()
        # Full blocks of 1000 users. Each block must be a freshly allocated array:
        # appending one shared array object would make all blocks alias the same memory.
        num_full_blocks = self.user_size // 1000
        for _ in range(num_full_blocks):
            phi_matrices.append(np.zeros(shape=(1000, self.num_movies, self.num_topics)))
        # Last, partially filled block (only if user_size is not a multiple of 1000)
        remaining_block_size = self.user_size % 1000
        if remaining_block_size > 0:
            phi_matrices.append(np.zeros(shape=(remaining_block_size, self.num_movies, self.num_topics)))
        return phi_matrices
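    # Block-indexing convention used throughout: user u lives at
    # self.phi[u // 1000][u % 1000, :, :]. For example, with user_size = 2500 the
    # list holds blocks of 1000, 1000 and 500 users, and user 1203 is row 203 of block 1.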
def run_EM(self, wordids, wordcts, GLOB_ITER):
""" Click to read more
First does an E step on given wordids and wordcts to update theta,
then uses the result to update betas in M step.
"""
self.GLOB_ITER = GLOB_ITER
# E - expectation step
self.e_step(wordids, wordcts)
# M - maximization step
self.m_step(wordids, wordcts)
    def e_step(self, wordids, wordcts):
        """ Does E step: updates theta, mu, phi, shp, rte for all documents and users. """
        # --->> UPDATE phi, shp, rte
        s = time.time()
        mu_sum = self.mu.sum(axis=0)
        for u in range(self.user_size):
            if len(self.rating_GroupForUser[u]) == 0:
                # If the user didn't like any movie, there is nothing to update
                continue
            movies_for_u = self.rating_GroupForUser[u]  # list of movie ids liked by user u
            phi_block = self.phi[u // 1000]  # 3D block of the phi list that holds this user
            usr = u % 1000  # row of this user inside the block
            # Multiplicative update of phi over the user's rated movies, normalized over topics
            phi_uj = np.exp(np.log(self.mu[movies_for_u, :]) + special.psi(self.shp[u, :]) - np.log(self.rte[u, :]))
            phi_uj_norm = phi_uj / phi_uj.sum(axis=1)[:, np.newaxis]
            phi_block[usr, movies_for_u, :] = phi_uj_norm
            # Update the user's shp and rte
            self.shp[u, :] = self.e + phi_uj_norm.sum(axis=0)
            self.rte[u, :] = self.f + mu_sum
        e = time.time()
        print("users time:", e - s)
        # --->> UPDATE theta, mu
        d_s = time.time()
        # Normalization denominator for mu
        norm_mu = (self.shp / self.rte).sum(axis=0)
        for d in range(self.num_movies):
            self.theta[d, :] = self.update_theta(wordids[d], wordcts[d], d)
            self.mu[d, :] = self.update_mu(norm_mu, d)
        d_e = time.time()
        print("movies time:", d_e - d_s)
    def update_mu(self, norm_mu, d):
        mu_users = self.rating_GroupForMovie[d]
        if len(mu_users) == 0:
            # No user rated this movie, so mu falls back to theta
            return np.copy(self.theta[d, :])

        def get_phi(x):
            # Fetch user x's phi row for movie d from the blocked phi list
            phi_ = self.phi[x // 1000]
            usr = x % 1000
            return phi_[usr, d, :]

        rating_phi = sum(map(get_phi, mu_users))
        # Positive root of the per-topic quadratic lamb*mu^2 + (norm_mu - lamb*theta)*mu - rating_phi = 0
        temp = -1 * norm_mu + self.lamb * self.theta[d, :]
        delta = temp ** 2 + 4 * self.lamb * rating_phi
        return (temp + np.sqrt(delta)) / (2 * self.lamb)
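    # Where the closed form in update_mu comes from: zeroing the derivative of
    #   rating_phi_k * log(mu_k) - norm_mu_k * mu_k - (lamb / 2) * (mu_k - theta_k) ** 2
    # gives rating_phi_k / mu_k - norm_mu_k - lamb * (mu_k - theta_k) = 0, i.e.
    # lamb * mu_k ** 2 + (norm_mu_k - lamb * theta_k) * mu_k - rating_phi_k = 0,
    # whose positive root is the value returned above.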
    @staticmethod
    @njit
    def x_(cts, beta, alpha, lamb, mu, tt):
        # Full MAP objective at tt: log-likelihood + Dirichlet log-prior + Gaussian tie to mu.
        # The prior term sums log(tt) over topics so the function returns a scalar,
        # matching the gradient (alpha - 1) / theta used in t_ below.
        return np.dot(cts, np.log(np.dot(tt, beta))) + (alpha - 1) * np.sum(np.log(tt)) \
               - (lamb / 2) * np.linalg.norm(tt - mu, ord=2) ** 2
    @staticmethod
    @njit
    def t_(cts, beta, theta, mu, x, p, T_lower, T_upper, alpha, lamb, t):
        # ======== Stochastic gradients ==========
        # G_1: gradient of the likelihood + Dirichlet-prior part, scaled by 1/p
        G_1 = (np.dot(beta, cts / x) + (alpha - 1) / theta) / p
        # G_2: gradient of the Gaussian mu-tie part, scaled by 1/(1 - p)
        G_2 = (-1 * lamb * (theta - mu)) / (1 - p)
        step = 1.0 / (t + 1)  # Frank-Wolfe step size
        # ======== Lower approximation ==========
        if np.random.rand() < p:
            T_lower[0] += 1
        else:
            T_lower[1] += 1
        ft_lower = T_lower[0] * G_1 + T_lower[1] * G_2
        index_lower = np.argmax(ft_lower)
        theta_lower = np.copy(theta)
        theta_lower *= 1 - step
        theta_lower[index_lower] += step
        # ======== Upper approximation ==========
        if np.random.rand() < p:
            T_upper[0] += 1
        else:
            T_upper[1] += 1
        ft_upper = T_upper[0] * G_1 + T_upper[1] * G_2
        index_upper = np.argmax(ft_upper)
        theta_upper = np.copy(theta)
        theta_upper *= 1 - step
        theta_upper[index_upper] += step
        return theta_lower, theta_upper, index_lower, index_upper, step
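    # In t_, T_lower / T_upper count how many times each part of the objective has
    # been drawn (index 0: likelihood + Dirichlet term, index 1: Gaussian mu term),
    # so ft = T[0] * G_1 + T[1] * G_2 is the gradient of the Bernoulli-randomized
    # surrogate, and taking its argmax coordinate is exactly the linear-maximization
    # step of Frank-Wolfe over the unit simplex.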
def update_theta(self, ids, cts, d):
""" Click to read more
Updates theta for a given document using BOPE algorithm (i.e, MAP Estimation With Bernoulli Randomness).
Arguments:
ids: an element of wordids, corresponding to a document.
cts: an element of wordcts, corresponding to a document.
Returns updated theta.
"""
        cts = cts.astype("float64")
        # Slice beta down to the words that occur in this document
        beta = self.beta[:, ids]
        # Current theta and mu for this document
        theta = self.theta[d, :]
        mu = self.mu[d, :]
        # x_j = sum_{k=1}^K theta_k * beta_{kj}
        x = np.dot(theta, beta)
        # Parameter of the Bernoulli distribution: likelihood vs. prior
        p = self.bernoulli_p
        # Counts of how many times the likelihood and prior parts have been chosen
        # (arrays rather than lists so numba handles them cleanly)
        T_lower = np.array([1, 0])
        T_upper = np.array([0, 1])
        for t in range(1, self.iter_infer):
            # Build lower/upper stochastic approximations and take one FW step for each (JITed)
            theta_lower, theta_upper, index_lower, index_upper, step = self.t_(
                cts, beta, theta, mu, x, p, T_lower, T_upper, self.alpha, self.lamb, t)
            # Keep whichever candidate scores higher on the true objective (JITed)
            x_l = self.x_(cts, beta, self.alpha, self.lamb, mu, theta_lower)
            x_u = self.x_(cts, beta, self.alpha, self.lamb, mu, theta_upper)
            if x_l >= x_u:
                theta = theta_lower
                x = x + step * (beta[index_lower, :] - x)
            else:
                theta = theta_upper
                x = x + step * (beta[index_upper, :] - x)
        return theta
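    # Note on the incremental update of x above: a Frank-Wolfe step moves theta to
    # (1 - step) * theta + step * e_i for a simplex vertex e_i, so x = theta.dot(beta)
    # can be refreshed in O(len(ids)) as x + step * (beta[i, :] - x) instead of
    # recomputing the full dot product.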
def m_step(self, wordids, wordcts):
""" Does m step: update global variables beta """
# Compute intermediate beta which is denoted as "unit beta"
beta = np.zeros((self.num_topics, self.num_words))
for d in range(self.num_movies):
beta[:, wordids[d]] += np.outer(self.theta[d], wordcts[d])
        # Keep only the word columns that received non-zero mass
        beta_sum = beta.sum(axis=0)
        ids = np.where(beta_sum != 0)[0]
        unit_beta = beta[:, ids]
        # Normalize the intermediate beta
        unit_beta_norm = unit_beta.sum(axis=1)
        unit_beta /= unit_beta_norm[:, np.newaxis]
        # Online update of beta with decaying step size rhot
rhot = pow(self.tau + self.updatect, -self.kappa)
self.beta *= (1 - rhot)
self.beta[:, ids] += unit_beta * rhot
self.updatect += 1
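

# Minimal smoke test on synthetic data (a sketch, not part of the original
# pipeline): all shapes and hyperparameter values below are arbitrary
# illustrations chosen only to exercise the code paths.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    num_movies, num_words, num_topics, user_size = 20, 50, 5, 30
    # Each user likes a few random movies; invert to get per-movie user lists
    rating_GroupForUser = [list(rng.choice(num_movies, size=3, replace=False)) for _ in range(user_size)]
    rating_GroupForMovie = [[] for _ in range(num_movies)]
    for u, movies in enumerate(rating_GroupForUser):
        for m in movies:
            rating_GroupForMovie[m].append(u)
    # Bag-of-words per movie: unique word ids and their counts
    wordids = [rng.choice(num_words, size=10, replace=False) for _ in range(num_movies)]
    wordcts = [rng.integers(1, 5, size=10) for _ in range(num_movies)]
    model = MyCTMP(rating_GroupForUser, rating_GroupForMovie,
                   num_movies, num_words, num_topics, user_size,
                   lamb=1.0, e=0.3, f=0.3, alpha=1.1, tau=1.0, kappa=0.9,
                   bernoulli_p=0.7, iter_infer=20)
    for it in range(2):
        model.run_EM(wordids, wordcts, it)
    print("theta rows sum to 1:", np.allclose(model.theta.sum(axis=1), 1.0))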