-
Notifications
You must be signed in to change notification settings - Fork 0
/
softmax_cifar10_pytorch.py
396 lines (308 loc) · 16.1 KB
/
softmax_cifar10_pytorch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
# -*- coding: utf-8 -*-
"""softmax_cifar10_Pytorch.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1xsLijt54UEl0PsNr3bdQ2kfEhROsy483
# Building regularized softmax regression model for CIFAR10 using PyTorch
"""
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
"""# The CIFAR10 dataset
- Download and normalize the CIFAR10 dataset from torchvision
- Split the CIFAR10 data into train, validation and test set
- Set the batch size for processing these datasets
- Build the dataloaders for train, validation, and test set which will be used in the training loop
- Define the string class labels (targets are numeric 0-9)
"""
# --- CIFAR10 data pipeline: download, normalize, split, and batch ---
# mean and std for the RGB channels in CIFAR10
tmean = [0.49139968, 0.48215841, 0.44653091]
tstd = [0.24703223, 0.24348513, 0.26158784]
# transform the 32x32x3 images into a tensor after normalizing
# each channel using the parameters above
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize(tmean, tstd)])
# download and transform the trainset and testset for training
trainset = torchvision.datasets.CIFAR10(root='./data',train=True,download=True,transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data',train=False,download=True,transform=transform)
#split trainset into a train and a val set (90-10 split)
# NOTE(review): random_split is used only to obtain two disjoint shuffled
# index lists (tr.indices / v.indices); the samplers below do the actual
# subsetting against the full trainset.
lengths = [int(p * len(trainset)) for p in [0.9,0.1]]
tr,v = torch.utils.data.random_split(trainset,lengths)
train_sampler = torch.utils.data.SubsetRandomSampler(tr.indices)
val_sampler = torch.utils.data.SubsetRandomSampler(v.indices)
# set batch size and set up the data generators for train, val, test sets
batch_size = 128
# Train and val loaders both wrap `trainset`; each sampler restricts its
# loader to one side of the 90-10 split. The test loader is unshuffled.
trainloader = torch.utils.data.DataLoader(trainset,batch_size=batch_size,sampler=train_sampler)
valloader = torch.utils.data.DataLoader(trainset,batch_size=batch_size,sampler=val_sampler)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size)
print("Number of training batches = ",len(trainloader))
print("Number of validation batches = ",len(valloader))
print("Number of test batches = ",len(testloader))
# define the output classes
# (index i of this tuple is the human-readable name for numeric label i)
classes = ('plane', 'car', 'bird', 'cat',
'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
"""# Visualize the training data"""
# Grab one batch of (already normalized) training images and labels.
Xtr,ytr = next(iter(trainloader))
# make a 8x8 grid and display 64 images from the first batch of training data
# (assumes batch_size >= 64, which holds for the batch_size=128 above)
rows,cols = 8,8
fig = plt.figure(figsize=(8,8),constrained_layout=True)
for i in range(0,rows*cols):
    fig.add_subplot(rows,cols,i+1)
    # CHW tensor -> HWC array so matplotlib can render it.
    tmp = np.transpose(Xtr[i].numpy(),(1,2,0))
    # Undo the per-channel normalization, then rescale to displayable 0-255.
    plt.imshow(((tmp*tstd + tmean)*255).astype(np.uint8))
    plt.xticks([])
    plt.yticks([])
    plt.title(classes[ytr[i].numpy()])
"""# The softmax function"""
def softmax(X):
    """Return the row-wise softmax of a 2-D tensor ``X``.

    Subtracts the per-row maximum before exponentiating so that large
    logits do not overflow ``exp`` to inf (standard numerical-stability
    trick); the shift cancels in the ratio, leaving the result unchanged.
    """
    # softmax(x) == softmax(x - max(x)) and exp(x - max) never overflows.
    shifted = X - X.max(dim=1, keepdim=True).values
    X_exp = torch.exp(shifted)
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition
"""# The SoftmaxRegression model (20 points)
- the __init__ function takes the number of inputs, number of outputs, a learning rate lr, and a weight decay wd (L2 regularization strength).
- set the learning rate and weight decay of the model
- build the network using torch.nn.Sequential() composed of the Flatten() function and a Linear() layer with num_inputs and num_outputs.
- initialize the weights of the linear layer from a zero-mean Gaussian with standard deviation 0.01. You can access the linear layer as self.net[1]
- initialize the bias of the linear layer to be 0
- the forward function returns the softmax of the affine transform of the flattened input with the linear layer
- the loss function reshapes the prediction yhat and the true labels y into 1D tensors, and then calls the built in torch.nn.functional.cross_entropy() function to calculate the softmax loss with reduction = 'mean' if averaged is set to True.
- the predict function takes a batch of images X and runs the forward function to get the softmax, and return the index of the class with the highest probability (use .argmax())
- the configure_optimizers function that is a call to torch.optim.SGD() specifying the parameters to be updated, the weight decay, and the learning rate.
"""
import torch.optim as optim
class SoftmaxRegression(nn.Module):
    """Regularized softmax (multinomial logistic) regression.

    Architecture: Flatten -> Linear(num_inputs, num_outputs) -> softmax.

    Parameters
    ----------
    num_inputs : int
        Flattened input dimension (3*32*32 for CIFAR10).
    num_outputs : int
        Number of classes.
    lr : float
        SGD learning rate, consumed by configure_optimizers().
    wd : float
        Weight decay (L2 regularization strength) for SGD.
    """

    def __init__(self, num_inputs, num_outputs, lr, wd):
        super().__init__()
        # Set the learning rate and weight decay
        self.lr = lr
        self.wd = wd
        # Build the network: flatten the image, then one affine layer.
        self.net = nn.Sequential(
            nn.Flatten(),  # Flatten the input images
            nn.Linear(num_inputs, num_outputs, bias=True)  # Linear layer
        )
        # Initialize weights from a zero-mean Gaussian with std 0.01.
        nn.init.normal_(self.net[1].weight, mean=0, std=0.01)
        # Initialize bias to be 0
        nn.init.constant_(self.net[1].bias, val=0)

    def forward(self, X):
        # Return per-class *probabilities*: softmax of the affine scores.
        return F.softmax(self.net(X), dim=1)

    def loss(self, yhat, y, averaged=True):
        """Cross-entropy loss for probability predictions ``yhat``.

        BUG FIX: ``forward`` returns softmax probabilities, but the old
        code passed them to ``F.cross_entropy``, which expects raw logits
        and applies its own log-softmax — i.e. softmax was applied twice,
        distorting the loss and its gradients. The correct cross-entropy
        for already-normalized outputs is NLL of the log-probabilities.
        """
        # Reshape the predictions and labels into 2D scores / 1D targets.
        yhat = yhat.view(-1, yhat.size(1))
        y = y.view(-1)
        # Clamp away exact zeros so log() cannot produce -inf.
        log_probs = torch.log(yhat.clamp_min(torch.finfo(yhat.dtype).tiny))
        return F.nll_loss(log_probs, y, reduction='mean' if averaged else 'sum')

    def predict(self, X):
        # Index of the most probable class for each sample in the batch.
        return torch.argmax(self.forward(X), dim=1)

    def configure_optimizers(self):
        # SGD over all parameters; weight_decay implements the L2 penalty.
        return optim.SGD(self.parameters(), lr=self.lr, weight_decay=self.wd)
"""# The training loop (50 points)
Complete the implementation of the function train_model which takes an initialized softmax model, a train set loader, a val set loader, and the number of epochs to train.
- Initialize train_loss and val_loss tensors to store the training set and val set losses for each epoch. We will plot them at the end
- Configure the optimizer associated with the model
- for each epoch until num_epochs do:
- initialize a running train loss and a running val loss (so we can accumulate losses over each batch)
- for each (X,y) in trainloader:
- zero gradients in optimizer
- compute output of model on X
- compute loss on output and y
- perform backward() step on loss
- update parameters with optimizer.step()
- accumulate running train loss with loss
- set train loss for epoch to be running train loss/number of train set batches
- with torch.no_grad() (do not update gradients during the evaluation on the val set)
- for each (valX,valy) in valloader:
- compute output of model on valX
- compute loss on output and valy
- accumulate running val loss with loss
- set val loss for epoch to be running val loss/number of val batches
- Return model, train loss, val loss
If you are using a GPU, then remember to map X and y, as well as valX and valy, to the device, using to(device) method. If you wish, you can also implement early termination of the outer train loop when the val loss exceeds the train set loss a given number of times (say 10, or 20).
"""
import torch
def train_model(model, trainloader, valloader, num_epochs, device='cuda' if torch.cuda.is_available() else 'cpu', early_stop_patience=None):
    """Train ``model`` and record per-epoch train/validation losses.

    Runs the standard SGD loop (zero grad, forward, loss, backward, step)
    over ``trainloader``, then evaluates on ``valloader`` without
    gradients. Returns ``(model, train_losses, val_losses)`` where the
    loss tensors have length ``num_epochs`` (trailing entries remain 0
    when early stopping fires before the final epoch).
    """
    model.to(device)
    # One slot per epoch, filled as training progresses.
    epoch_train = torch.zeros(num_epochs)
    epoch_val = torch.zeros(num_epochs)
    optimizer = model.configure_optimizers()
    # Early-stopping bookkeeping: epochs since the best validation loss.
    stale_epochs = 0
    best_val = float('inf')
    for ep in range(num_epochs):
        # --- training pass ---
        model.train()
        tr_total = 0.0
        for X, y in trainloader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            batch_loss = model.loss(model(X), y)
            batch_loss.backward()
            optimizer.step()
            tr_total += batch_loss.item()
        mean_tr = tr_total / len(trainloader)
        # --- validation pass (no gradient tracking) ---
        model.eval()
        va_total = 0.0
        with torch.no_grad():
            for valX, valy in valloader:
                valX, valy = valX.to(device), valy.to(device)
                va_total += model.loss(model(valX), valy).item()
        mean_va = va_total / len(valloader)
        epoch_train[ep] = mean_tr
        epoch_val[ep] = mean_va
        print(f'Epoch [{ep + 1}/{num_epochs}] Train Loss: {mean_tr:.4f} Val Loss: {mean_va:.4f}')
        # Optional early stopping once val loss stops improving.
        if early_stop_patience is not None:
            if mean_va < best_val:
                best_val, stale_epochs = mean_va, 0
            else:
                stale_epochs += 1
                if stale_epochs >= early_stop_patience:
                    print(f'Early stopping after {ep + 1} epochs.')
                    break
    return model, epoch_train, epoch_val
"""# Test the training loop
- run this cell only after you have completed the function above.
"""
# Run on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current device:", device)
# set learning rate and weight decay
lr=1e-2
wd=1e-3
num_epochs = 100
# Baseline model: one linear layer over the flattened 3x32x32 image.
model1 = SoftmaxRegression(3*32*32,10,lr=lr, wd=wd).to(device)
model1,train_loss,val_loss = train_model(model1,trainloader,valloader,num_epochs)
# Learning curves: train vs. validation loss per epoch.
plt.plot(torch.arange(len(train_loss)),train_loss, label="train_loss")
plt.plot(torch.arange(len(val_loss)),val_loss, label="val_loss")
plt.legend()
plt.show()
"""# Build models for various learning rates and weight decays
- model2: lr=1e-3, wd=1e-3, num_epochs = 100
- model3: lr=1e-3, wd=1e-2, num_epochs = 100
- model4: lr=1e-1, wd=1e-2, num_epochs = 100
- model5: lr=1e-5, wd=1e-2, num_epochs = 100
"""
# Build model2..model5: identical architecture (3072 inputs, 10 classes),
# differing only in learning rate and weight decay.
lr2, wd2, num_epochs2 = 1e-3, 1e-3, 100
lr3, wd3, num_epochs3 = 1e-3, 1e-2, 100
lr4, wd4, num_epochs4 = 1e-1, 1e-2, 100
lr5, wd5, num_epochs5 = 1e-5, 1e-2, 100
# Instantiate each model on the active device with its hyperparameters.
model2 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr2, wd=wd2).to(device)
model3 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr3, wd=wd3).to(device)
model4 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr4, wd=wd4).to(device)
model5 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr5, wd=wd5).to(device)
"""# Evaluate the performance of models (20 points)
- for each model, use the sklearn metrics functions to calculate on the test set
- confusion matrix
- accuracy
- classification report
- build the function getTopKAcc() to calculate top_k_accuracy using the sklearn top_k_accuracy_score function with k = 1,2,3
"""
import numpy as np
import torch
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, top_k_accuracy_score
# Define a function to calculate top-k accuracy
def getTopKAcc(model, testloader, top_k, device=None):
    """Top-k accuracy of ``model`` on ``testloader`` via sklearn.

    Parameters
    ----------
    model : nn.Module whose forward returns per-class softmax probabilities
    testloader : iterable of (X, y) batches
    top_k : int -- the k in top-k accuracy
    device : optional torch device; defaults to the model's own device.
        (Backward-compatible addition that removes the old implicit
        dependency on a global ``device`` variable.)
    """
    if device is None:
        device = next(model.parameters()).device
    model.eval()
    all_scores = []
    all_labels = []
    with torch.no_grad():
        for X, y in testloader:
            X, y = X.to(device), y.to(device)  # Move data to device
            # BUG FIX: forward() already returns softmax probabilities, so
            # the old torch.softmax(...) here applied softmax twice.
            # (Harmless for the ranking, but redundant and misleading.)
            scores = model(X)
            all_scores.append(scores.cpu().numpy())
            all_labels.append(y.cpu().numpy())
    scores = np.concatenate(all_scores)
    labels = np.concatenate(all_labels)
    # Pass labels explicitly so sklearn does not fail if some class is
    # absent from the labels seen in this loader.
    return top_k_accuracy_score(labels, scores, k=top_k, labels=np.arange(scores.shape[1]))
# Evaluate model1 to model5
# Each model is scored on the test set: confusion matrix, accuracy,
# per-class classification report, and top-k accuracy for k = 1, 2, 3.
models = [model1, model2, model3, model4, model5]
for i, model in enumerate(models, start=1):
    print(f"Model {i}:")
    # Calculate confusion matrix
    test_labels = []
    test_predictions = []
    model.eval()
    with torch.no_grad():
        # Collect hard (argmax) predictions for every test batch.
        for X, y in testloader:
            X, y = X.to(device), y.to(device)  # Move data to device
            preds = model(X)
            test_labels.extend(y.cpu().numpy())
            test_predictions.extend(torch.argmax(preds, dim=1).cpu().numpy())
    cm = confusion_matrix(test_labels, test_predictions)
    # Calculate accuracy
    accuracy = accuracy_score(test_labels, test_predictions)
    # Calculate classification report
    # (reported with numeric target names "0".."9", not the class strings)
    class_report = classification_report(test_labels, test_predictions, target_names=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
    # Calculate top k accuracies (k=1, 2, 3)
    top_1_acc = getTopKAcc(model, testloader, 1)
    top_2_acc = getTopKAcc(model, testloader, 2)
    top_3_acc = getTopKAcc(model, testloader, 3)
    print("Confusion Matrix:")
    print(cm)
    print("\nAccuracy:", accuracy)
    print("\nClassification Report:")
    print(class_report)
    print(f"Top-1 Accuracy: {top_1_acc:.4f}")
    print(f"Top-2 Accuracy: {top_2_acc:.4f}")
    print(f"Top-3 Accuracy: {top_3_acc:.4f}\n")
"""# Best performing model (10 points)
- what is the learning rate and weight decay associated with the best performing model?
- comment on the effect of changing learning rate and weight decay on the basis of the five models you have built.
- use the visualization code below to understand the structure of the learned models under the different hyperparameters. Do you see a pattern in the coefficients as a function of learning rate, weight decay? Explain.
# Visualizing the learned models
"""
# Visualize the learned weights for each class
def visualize_model(model):
    """Render each class's learned weight vector as a 32x32 RGB image."""
    # Pull the linear-layer weights: one 3072-dim row per class. [10, 3072]
    weights = model.net[1].weight.data.cpu().numpy()
    # Recover the image layout: [10, 3, 32, 32] -> [10, 32, 32, 3] so that
    # matplotlib receives height x width x channels.
    weights = weights.reshape(10, 3, 32, 32).transpose(0, 2, 3, 1)
    w_min, w_max = weights.min(), weights.max()
    class_names = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    plt.figure(figsize=(8, 8))
    for idx, name in enumerate(class_names):
        plt.subplot(2, 5, idx + 1)
        # Min-max rescale the weights into 0-255 so they render as pixels.
        img = 255.0 * (weights[idx] - w_min) / (w_max - w_min)
        plt.imshow(img.astype('uint8'), cmap='viridis')
        plt.axis('off')
        plt.title(name)
    plt.tight_layout()
    plt.show()
# YOUR CODE HERE for visualizing model1, ..., model5
# comment on what patterns you see
# Are the coefficients of the best performing model more interpretable than the others?
# Show the learned per-class weight images for models 1 through 5.
for idx, mdl in enumerate(models, start=1):
    print(f"Visualization for Model {idx}:")
    visualize_model(mdl)
"""Learning rate and weight decay can be the imporatnat hyperparameter .By changing these hyperparameters models can be used to diagnose an underfit, overfit, or well-fit model.Models 1 (lr=1e-4) and 5 (lr=1e-5) have very small learning rates. They might converge very slowly, and there's a risk of getting stuck in local minima during training. These models might not reach their full potential in terms of accuracy.
Model 2 (lr=1e-3) has a moderate learning rate. It likely converges faster than Models 1 and 5 but requires careful tuning to avoid overshooting and instability.
Model 4 (lr=1e-1) has a relatively high learning rate. It might converge quickly but could be susceptible to overshooting and oscillations. However, if properly tuned, it can yield excellent results.In above models we can see with change in them there is a change in accuracy and TOP-1,TOP-2,TOP-3 accuracy . And using visualition we can see which model gives the better result (can be seen above) using visualize_model(model).
"""