import matplotlib.pyplot as plt
import numpy as np
import os
PLOT_COLORS = ['red', 'green', 'blue', 'orange'] # Colors for your plots
K = 4 # Number of Gaussians in the mixture model
NUM_TRIALS = 3 # Number of trials to run (can be adjusted for debugging)
UNLABELED = -1 # Cluster label for unlabeled data points (do not change)
def main(is_semi_supervised, trial_num):
print('Running {} EM algorithm...'
.format('semi-supervised' if is_semi_supervised else 'unsupervised'))
# Load dataset
train_path = os.path.join('.', 'train.csv')
x_all, z_all = load_gmm_dataset(train_path)
# Split into labeled and unlabeled examples
labeled_idxs = (z_all != UNLABELED).squeeze()
x_tilde = x_all[labeled_idxs, :] # Labeled examples
z_tilde = z_all[labeled_idxs, :] # Corresponding labels
x = x_all[~labeled_idxs, :] # Unlabeled examples
# (1) Initialize mu and sigma by splitting the n_examples data points uniformly at random
# into K groups, then calculating the sample mean and covariance for each group
# (2) Initialize phi to place equal probability on each Gaussian
# phi should be a numpy array of shape (K,)
# (3) Initialize the w values to place equal probability on each Gaussian
# w should be a numpy array of shape (m, K)
n, d = x.shape
group = np.random.choice(K, n)
    mu = [np.mean(x[group == g, :], axis=0) for g in range(K)]
    sigma = [np.cov(x[group == g, :].T) for g in range(K)]
    phi = np.full((K,), fill_value=1. / K, dtype=np.float32)
    w = np.full((n, K), fill_value=1. / K, dtype=np.float32)
if is_semi_supervised:
w = run_semi_supervised_em(x, x_tilde, z_tilde, w, phi, mu, sigma)
else:
w = run_em(x, w, phi, mu, sigma)
# Plot your predictions
z_pred = np.zeros(n)
if w is not None: # Just a placeholder for the starter code
for i in range(n):
z_pred[i] = np.argmax(w[i])
plot_gmm_preds(x, z_pred, is_semi_supervised, plot_id=trial_num)
def run_em(x, w, phi, mu, sigma, max_iter=1000):
"""
See inline comments for instructions.
Args:
x: Design matrix of shape (n_examples, dim).
w: Initial weight matrix of shape (n_examples, k).
phi: Initial mixture prior, of shape (k,).
mu: Initial cluster means, list of k arrays of shape (dim,).
sigma: Initial cluster covariances, list of k arrays of shape (dim, dim)
max_iter: Max iterations. No need to change this
Returns:
Updated weight matrix of shape (n_examples, k) resulting from EM algorithm.
More specifically, w[i, j] should contain the probability of
example x^(i) belonging to the j-th Gaussian in the mixture.
"""
# No need to change any of these parameters
eps = 1e-3 # Convergence threshold
# Stop when the absolute change in log-likelihood is < eps
# See below for explanation of the convergence criterion
it = 0
ll = prev_ll = None
while it < max_iter and (prev_ll is None or np.abs(ll - prev_ll) >= eps):
# (1) E-step: Update your estimates in w
# (2) M-step: Update the model parameters phi, mu, and sigma
# (3) Compute the log-likelihood of the data to check for convergence.
# By log-likelihood, we mean `ll = sum_x[log(sum_z[p(x|z) * p(z)])]`.
# We define convergence by the first iteration where abs(ll - prev_ll) < eps.
# Hint: For debugging, recall part (a). We showed that ll should be monotonically increasing.
        w = e_step(x, w, phi, mu, sigma)  # E-step: update the posterior estimates q(z) stored in w
        phi, mu, sigma = m_step(x, w, mu, sigma)  # phi is not passed in; m_step re-estimates it from w
prev_ll = ll
ll = log_likelihood(x, phi, mu, sigma)
it += 1
print('[iter:{:03d}, log-likelihood:{:.4f}]'.format(it,ll))
return w
def run_semi_supervised_em(x, x_tilde, z_tilde, w, phi, mu, sigma, max_iter=1000):
"""
See inline comments for instructions.
Args:
x: Design matrix of unlabeled examples of shape (n_examples_unobs, dim).
x_tilde: Design matrix of labeled examples of shape (n_examples_obs, dim).
z_tilde: Array of labels of shape (n_examples_obs, 1).
w: Initial weight matrix of shape (n_examples, k).
phi: Initial mixture prior, of shape (k,).
mu: Initial cluster means, list of k arrays of shape (dim,).
sigma: Initial cluster covariances, list of k arrays of shape (dim, dim)
max_iter: Max iterations. No need to change this
Returns:
Updated weight matrix of shape (n_examples, k) resulting from semi-supervised EM algorithm.
More specifically, w[i, j] should contain the probability of
example x^(i) belonging to the j-th Gaussian in the mixture.
"""
# No need to change any of these parameters
alpha = 20. # Weight for the labeled examples
eps = 1e-3 # Convergence threshold
# Stop when the absolute change in log-likelihood is < eps
# See below for explanation of the convergence criterion
it = 0
ll = prev_ll = None
while it < max_iter and (prev_ll is None or np.abs(ll - prev_ll) >= eps):
# (1) E-step: Update your estimates in w
# (2) M-step: Update the model parameters phi, mu, and sigma
# (3) Compute the log-likelihood of the data to check for convergence.
# Hint: Make sure to include alpha in your calculation of ll.
# Hint: For debugging, recall part (a). We showed that ll should be monotonically increasing.
w = e_step(x, w, phi, mu, sigma)
phi, mu, sigma = m_step_ss(x, x_tilde, z_tilde, w, phi, mu, sigma, alpha)
prev_ll = ll
ll = log_likelihood(x, phi, mu, sigma)
ll += alpha * log_likelihood(x_tilde, phi, mu, sigma, z_tilde)
it += 1
print('[iter:{:03d}, log-likelihood:{:.4f}]'.format(it,ll))
return w
# *** START CODE HERE ***
def e_step(x, w, phi, mu, sigma):
""" E-step for both unsupervised and semi-supervised EM."""
    n, d = x.shape
k = len(mu)
for i in range(n):
for j in range(k):
            w[i, j] = p_x_given_z(x[i], mu[j], sigma[j]) * phi[j]
    w /= np.sum(w, axis=1, keepdims=True)
return w
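

# A vectorized alternative to e_step (a sketch only; the assignment does not
# require it). It computes the same responsibilities, w[i, j] proportional to
# p(x_i | z = j) * phi[j], with one broadcasted pass per cluster instead of a
# double Python loop. The name `e_step_vectorized` is illustrative, not part
# of the starter code.
def e_step_vectorized(x, phi, mu, sigma):
    """Vectorized E-step sketch: returns responsibilities of shape (n, k)."""
    n, d = x.shape
    k = len(mu)
    w = np.zeros((n, k))
    for j in range(k):
        diff = x - mu[j]                                    # (n, d)
        sigma_inv = np.linalg.inv(sigma[j])
        # Squared Mahalanobis distance of every example to cluster j at once.
        maha = np.sum(diff.dot(sigma_inv) * diff, axis=1)   # (n,)
        norm_const = 1. / ((2. * np.pi) ** (d / 2) * np.sqrt(np.linalg.det(sigma[j])))
        w[:, j] = phi[j] * norm_const * np.exp(-0.5 * maha)
    w /= np.sum(w, axis=1, keepdims=True)
    return w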
def m_step(x, w, mu, sigma):
""" M-Step for unsupervised EM"""
n, d = x.shape
k = len(mu)
    phi = np.mean(w, axis=0)
    for j in range(k):
        w_j = w[:, j:j + 1]
        mu[j] = np.sum(w_j * x, axis=0) / np.sum(w_j)
        sigma[j] = np.zeros_like(sigma[j])
for i in range(n):
x_minus_mu = x[i] - mu[j]
sigma[j] += w[i,j] * np.outer(x_minus_mu, x_minus_mu)
sigma[j] /= np.sum(w_j)
return phi, mu, sigma
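

# The covariance loop in m_step above can also be written without the inner
# Python loop (a sketch, not required by the assignment), using the same names
# as inside the loop over j:
#
#     diff = x - mu[j]                                  # (n, d)
#     sigma[j] = (w_j * diff).T.dot(diff) / np.sum(w_j)  # (d, d)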
def m_step_ss(x, x_tilde, z_tilde, w, phi, mu, sigma, alpha):
""" M-Step for semi-supervised EM."""
n, _ = x.shape
n_tilde, _ = x_tilde.shape
k = len(mu)
    w_colsums = np.sum(w, axis=0)
k_counts = [np.sum(z_tilde == j) for j in range(k)]
for j in range(k):
phi[j] = (w_colsums[j] + alpha * k_counts[j]) / (n + alpha * n_tilde)
        w_j = w[:, j:j + 1]
        mu[j] = ((np.sum(w_j * x, axis=0)
                  + alpha * np.sum(x_tilde[(z_tilde == j).squeeze(), :], axis=0))
                 / (np.sum(w_j) + alpha * k_counts[j]))
sigma[j] = np.zeros_like(sigma[j])
for i in range(n):
x_minus_mu = x[i] - mu[j]
sigma[j] += w[i,j] * np.outer(x_minus_mu,x_minus_mu)
for i in range(n_tilde):
if z_tilde[i] == j:
x_minus_mu = x_tilde[i] - mu[j]
                sigma[j] += alpha * np.outer(x_minus_mu, x_minus_mu)
        sigma[j] /= (np.sum(w_j) + alpha * k_counts[j])
return phi, mu, sigma
def log_likelihood(x, phi, mu, sigma, z=None):
""" Get log-likelihood of the data 'x' given model parameters
'phi', 'mu', and 'sigma'.
"""
n, d = x.shape
k = len(phi)
ll = 0.
for i in range(n):
        if z is None:  # unsupervised case: marginalize over clusters
            p_x = 0.
            for j in range(k):
                p_x += p_x_given_z(x[i], mu[j], sigma[j]) * phi[j]
        else:  # supervised case: the cluster label z[i] is observed
            j = int(z[i])
            p_x = p_x_given_z(x[i], mu[j], sigma[j]) * phi[j]
        ll += np.log(p_x)
    return ll
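

# An optional, numerically safer variant of log_likelihood (a sketch, not part
# of the assignment). Summing raw densities before taking the log can underflow
# when the densities are tiny; the usual remedy is the log-sum-exp trick over
# per-cluster log-densities. The helper names below are illustrative only.
def _log_gaussian(x_i, mu_j, sigma_j):
    """Log-density of a single example under one Gaussian component."""
    d = len(x_i)
    diff = x_i - mu_j
    _, logdet = np.linalg.slogdet(sigma_j)
    return -0.5 * (d * np.log(2. * np.pi) + logdet
                   + diff.dot(np.linalg.solve(sigma_j, diff)))


def log_likelihood_stable(x, phi, mu, sigma):
    """Unsupervised log-likelihood computed with the log-sum-exp trick."""
    n, _ = x.shape
    k = len(phi)
    ll = 0.
    for i in range(n):
        log_terms = np.array([np.log(phi[j]) + _log_gaussian(x[i], mu[j], sigma[j])
                              for j in range(k)])
        m = np.max(log_terms)
        ll += m + np.log(np.sum(np.exp(log_terms - m)))
    return ll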
def p_x_given_z(x, mu, sigma):
""" Get probability of a single example 'x' given model parameters
'mu' and 'sigma' (corresponding to cluster z = j).
"""
d = len(x)
assert d == len(mu) and sigma.shape == (d, d), 'Shape mismatch.'
    c = 1. / ((2. * np.pi) ** (d / 2) * np.sqrt(np.linalg.det(sigma)))
x_minus_mu = x - mu
sigma_inv = np.linalg.inv(sigma)
p_val = c * np.exp(-.5 * x_minus_mu.dot(sigma_inv).dot(x_minus_mu.T))
return p_val
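
# A quick correctness check for p_x_given_z (a hedged suggestion; it assumes
# SciPy is installed, which the rest of this script does not require, so it is
# left commented out):
#
#     from scipy.stats import multivariate_normal
#     x0, mu0, sigma0 = np.zeros(2), np.zeros(2), np.eye(2)
#     assert np.isclose(p_x_given_z(x0, mu0, sigma0),
#                       multivariate_normal.pdf(x0, mean=mu0, cov=sigma0))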
def plot_gmm_preds(x, z, with_supervision, plot_id):
"""Plot GMM predictions on a 2D dataset `x` with labels `z`.
Write to the output directory, including `plot_id`
in the name, and appending 'ss' if the GMM had supervision.
NOTE: You do not need to edit this function.
"""
plt.figure(figsize=(12, 8))
plt.title('{} GMM Predictions'.format('Semi-supervised' if with_supervision else 'Unsupervised'))
plt.xlabel('x_1')
plt.ylabel('x_2')
for x_1, x_2, z_ in zip(x[:, 0], x[:, 1], z):
color = 'gray' if z_ < 0 else PLOT_COLORS[int(z_)]
alpha = 0.25 if z_ < 0 else 0.75
plt.scatter(x_1, x_2, marker='.', c=color, alpha=alpha)
file_name = 'pred{}_{}.pdf'.format('_ss' if with_supervision else '', plot_id)
save_path = os.path.join('.', file_name)
plt.savefig(save_path)
def load_gmm_dataset(csv_path):
"""Load dataset for Gaussian Mixture Model.
Args:
csv_path: Path to CSV file containing dataset.
Returns:
x: NumPy array shape (n_examples, dim)
        z: NumPy array shape (n_examples, 1)
NOTE: You do not need to edit this function.
"""
# Load headers
with open(csv_path, 'r') as csv_fh:
headers = csv_fh.readline().strip().split(',')
# Load features and labels
x_cols = [i for i in range(len(headers)) if headers[i].startswith('x')]
z_cols = [i for i in range(len(headers)) if headers[i] == 'z']
x = np.loadtxt(csv_path, delimiter=',', skiprows=1, usecols=x_cols, dtype=float)
z = np.loadtxt(csv_path, delimiter=',', skiprows=1, usecols=z_cols, dtype=float)
if z.ndim == 1:
z = np.expand_dims(z, axis=-1)
return x, z
if __name__ == '__main__':
np.random.seed(229)
# Run NUM_TRIALS trials to see how different initializations
# affect the final predictions with and without supervision
for t in range(NUM_TRIALS):
main(is_semi_supervised=False, trial_num=t)
main(is_semi_supervised=True, trial_num=t)
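
# Usage note: as written, the script expects a `train.csv` file in the working
# directory (columns starting with 'x' for features and a 'z' column for labels,
# with z = -1 marking unlabeled rows), and it saves one plot per trial, e.g.
# pred_0.pdf (unsupervised) and pred_ss_0.pdf (semi-supervised), to the same
# directory.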