-
Notifications
You must be signed in to change notification settings - Fork 0
/
softmax_cifar10_pytorch.py
396 lines (308 loc) · 16.1 KB
/
softmax_cifar10_pytorch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
# -*- coding: utf-8 -*-
"""softmax_cifar10_Pytorch.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1xsLijt54UEl0PsNr3bdQ2kfEhROsy483
# Building regularized softmax regression model for CIFAR10 using PyTorch
"""
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
"""# The CIFAR10 dataset
- Download and normalize the CIFAR10 dataset from torchvision
- Split the CIFAR10 data into train, validation and test set
- Set the batch size for processing these datasets
- Build the dataloaders for train, validation, and test set which will be used in the training loop
- Define the string class labels (targets are numeric 0-9)
"""
# --- CIFAR10 data pipeline: download, normalize, split, and batch ---
# mean and std for the RGB channels in CIFAR10
tmean = [0.49139968, 0.48215841, 0.44653091]
tstd = [0.24703223, 0.24348513, 0.26158784]
# transform the 32x32x3 images into a tensor after normalizing
# each channel using the parameters above
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize(tmean, tstd)])
# download and transform the trainset and testset for training
trainset = torchvision.datasets.CIFAR10(root='./data',train=True,download=True,transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data',train=False,download=True,transform=transform)
#split trainset into a train and a val set (90-10 split)
# NOTE(review): random_split is used only to obtain two disjoint shuffled
# index lists (tr.indices / v.indices); the samplers below do the actual
# subsetting against the full trainset.
lengths = [int(p * len(trainset)) for p in [0.9,0.1]]
tr,v = torch.utils.data.random_split(trainset,lengths)
train_sampler = torch.utils.data.SubsetRandomSampler(tr.indices)
val_sampler = torch.utils.data.SubsetRandomSampler(v.indices)
# set batch size and set up the data generators for train, val, test sets
batch_size = 128
# Train and val loaders both wrap `trainset`; each sampler restricts its
# loader to one side of the 90-10 split. The test loader is unshuffled.
trainloader = torch.utils.data.DataLoader(trainset,batch_size=batch_size,sampler=train_sampler)
valloader = torch.utils.data.DataLoader(trainset,batch_size=batch_size,sampler=val_sampler)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size)
print("Number of training batches = ",len(trainloader))
print("Number of validation batches = ",len(valloader))
print("Number of test batches = ",len(testloader))
# define the output classes
# (index i of this tuple is the human-readable name for numeric label i)
classes = ('plane', 'car', 'bird', 'cat',
'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
"""# Visualize the training data"""
# Grab one batch of (already normalized) training images and labels.
Xtr,ytr = next(iter(trainloader))
# make a 8x8 grid and display 64 images from the first batch of training data
# (assumes batch_size >= 64, which holds for the batch_size=128 above)
rows,cols = 8,8
fig = plt.figure(figsize=(8,8),constrained_layout=True)
for i in range(0,rows*cols):
    fig.add_subplot(rows,cols,i+1)
    # CHW tensor -> HWC array so matplotlib can render it.
    tmp = np.transpose(Xtr[i].numpy(),(1,2,0))
    # Undo the per-channel normalization, then rescale to displayable 0-255.
    plt.imshow(((tmp*tstd + tmean)*255).astype(np.uint8))
    plt.xticks([])
    plt.yticks([])
    plt.title(classes[ytr[i].numpy()])
"""# The softmax function"""
def softmax(X):
    """Return the row-wise softmax of a 2-D tensor ``X``.

    Subtracts the per-row maximum before exponentiating so that large
    logits do not overflow ``exp`` to inf (standard numerical-stability
    trick); the shift cancels in the ratio, leaving the result unchanged.
    """
    # softmax(x) == softmax(x - max(x)) and exp(x - max) never overflows.
    shifted = X - X.max(dim=1, keepdim=True).values
    X_exp = torch.exp(shifted)
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition
"""# The SoftmaxRegression model (20 points)
- the __init__ function takes the number of inputs, number of outputs, a learning rate lr, and a weight decay wd (L2 regularization strength).
- set the learning rate and weight decay of the model
- build the network using torch.nn.Sequential() composed of the Flatten() function and a Linear() layer with num_inputs and num_outputs.
- initialize the weights of the linear layer from a zero-mean Gaussian with standard deviation 0.01. You can access the linear layer as self.net[1]
- initialize the bias of the linear layer to be 0
- the forward function returns the softmax of the affine transform of the flattened input with the linear layer
- the loss function reshapes the prediction yhat and the true labels y into 1D tensors, and then calls the built in torch.nn.functional.cross_entropy() function to calculate the softmax loss with reduction = 'mean' if averaged is set to True.
- the predict function takes a batch of images X and runs the forward function to get the softmax, and return the index of the class with the highest probability (use .argmax())
- the configure_optimizers function that is a call to torch.optim.SGD() specifying the parameters to be updated, the weight decay, and the learning rate.
"""
import torch.optim as optim
class SoftmaxRegression(nn.Module):
    """Regularized softmax (multinomial logistic) regression.

    Architecture: Flatten -> Linear(num_inputs, num_outputs) -> softmax.

    Parameters
    ----------
    num_inputs : int
        Flattened input dimension (3*32*32 for CIFAR10).
    num_outputs : int
        Number of classes.
    lr : float
        SGD learning rate, consumed by configure_optimizers().
    wd : float
        Weight decay (L2 regularization strength) for SGD.
    """

    def __init__(self, num_inputs, num_outputs, lr, wd):
        super().__init__()
        # Set the learning rate and weight decay
        self.lr = lr
        self.wd = wd
        # Build the network: flatten the image, then one affine layer.
        self.net = nn.Sequential(
            nn.Flatten(),  # Flatten the input images
            nn.Linear(num_inputs, num_outputs, bias=True)  # Linear layer
        )
        # Initialize weights from a zero-mean Gaussian with std 0.01.
        nn.init.normal_(self.net[1].weight, mean=0, std=0.01)
        # Initialize bias to be 0
        nn.init.constant_(self.net[1].bias, val=0)

    def forward(self, X):
        # Return per-class *probabilities*: softmax of the affine scores.
        return F.softmax(self.net(X), dim=1)

    def loss(self, yhat, y, averaged=True):
        """Cross-entropy loss for probability predictions ``yhat``.

        BUG FIX: ``forward`` returns softmax probabilities, but the old
        code passed them to ``F.cross_entropy``, which expects raw logits
        and applies its own log-softmax — i.e. softmax was applied twice,
        distorting the loss and its gradients. The correct cross-entropy
        for already-normalized outputs is NLL of the log-probabilities.
        """
        # Reshape the predictions and labels into 2D scores / 1D targets.
        yhat = yhat.view(-1, yhat.size(1))
        y = y.view(-1)
        # Clamp away exact zeros so log() cannot produce -inf.
        log_probs = torch.log(yhat.clamp_min(torch.finfo(yhat.dtype).tiny))
        return F.nll_loss(log_probs, y, reduction='mean' if averaged else 'sum')

    def predict(self, X):
        # Index of the most probable class for each sample in the batch.
        return torch.argmax(self.forward(X), dim=1)

    def configure_optimizers(self):
        # SGD over all parameters; weight_decay implements the L2 penalty.
        return optim.SGD(self.parameters(), lr=self.lr, weight_decay=self.wd)
"""# The training loop (50 points)
Complete the implementation of the function train_model which takes an initialized softmax model, a train set loader, a val set loader, and the number of epochs to train.
- Initialize train_loss and val_loss tensors to store the training set and val set losses for each epoch. We will plot them at the end
- Configure the optimizer associated with the model
- for each epoch until num_epochs do:
- initialize a running train loss and a running val loss (so we can accumulate losses over each batch)
- for each (X,y) in trainloader:
- zero gradients in optimizer
- compute output of model on X
- compute loss on output and y
- perform backward() step on loss
- update parameters with optimizer.step()
- accumulate running train loss with loss
- set train loss for epoch to be running train loss/number of train set batches
- with torch.no_grad() (do not update gradients during the evaluation on the val set)
- for each (valX,valy) in valloader:
- compute output of model on valX
- compute loss on output and valy
- accumulate running val loss with loss
- set val loss for epoch to be running val loss/number of val batches
- Return model, train loss, val loss
If you are using a GPU, then remember to map X and y, as well as valX and valy, to the device, using to(device) method. If you wish, you can also implement early termination of the outer train loop when the val loss exceeds the train set loss a given number of times (say 10, or 20).
"""
import torch
def train_model(model, trainloader, valloader, num_epochs, device='cuda' if torch.cuda.is_available() else 'cpu', early_stop_patience=None):
    """Train ``model`` and record per-epoch train/validation losses.

    Runs the standard SGD loop (zero grad, forward, loss, backward, step)
    over ``trainloader``, then evaluates on ``valloader`` without
    gradients. Returns ``(model, train_losses, val_losses)`` where the
    loss tensors have length ``num_epochs`` (trailing entries remain 0
    when early stopping fires before the final epoch).
    """
    model.to(device)
    # One slot per epoch, filled as training progresses.
    epoch_train = torch.zeros(num_epochs)
    epoch_val = torch.zeros(num_epochs)
    optimizer = model.configure_optimizers()
    # Early-stopping bookkeeping: epochs since the best validation loss.
    stale_epochs = 0
    best_val = float('inf')
    for ep in range(num_epochs):
        # --- training pass ---
        model.train()
        tr_total = 0.0
        for X, y in trainloader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            batch_loss = model.loss(model(X), y)
            batch_loss.backward()
            optimizer.step()
            tr_total += batch_loss.item()
        mean_tr = tr_total / len(trainloader)
        # --- validation pass (no gradient tracking) ---
        model.eval()
        va_total = 0.0
        with torch.no_grad():
            for valX, valy in valloader:
                valX, valy = valX.to(device), valy.to(device)
                va_total += model.loss(model(valX), valy).item()
        mean_va = va_total / len(valloader)
        epoch_train[ep] = mean_tr
        epoch_val[ep] = mean_va
        print(f'Epoch [{ep + 1}/{num_epochs}] Train Loss: {mean_tr:.4f} Val Loss: {mean_va:.4f}')
        # Optional early stopping once val loss stops improving.
        if early_stop_patience is not None:
            if mean_va < best_val:
                best_val, stale_epochs = mean_va, 0
            else:
                stale_epochs += 1
                if stale_epochs >= early_stop_patience:
                    print(f'Early stopping after {ep + 1} epochs.')
                    break
    return model, epoch_train, epoch_val
"""# Test the training loop
- run this cell only after you have completed the function above.
"""
# Run on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current device:", device)
# set learning rate and weight decay
lr=1e-2
wd=1e-3
num_epochs = 100
# Baseline model: one linear layer over the flattened 3x32x32 image.
model1 = SoftmaxRegression(3*32*32,10,lr=lr, wd=wd).to(device)
model1,train_loss,val_loss = train_model(model1,trainloader,valloader,num_epochs)
# Learning curves: train vs. validation loss per epoch.
plt.plot(torch.arange(len(train_loss)),train_loss, label="train_loss")
plt.plot(torch.arange(len(val_loss)),val_loss, label="val_loss")
plt.legend()
plt.show()
"""# Build models for various learning rates and weight decays
- model2: lr=1e-3, wd=1e-3, num_epochs = 100
- model3: lr=1e-3, wd=1e-2, num_epochs = 100
- model4: lr=1e-1, wd=1e-2, num_epochs = 100
- model5: lr=1e-5, wd=1e-2, num_epochs = 100
"""
# Build model2..model5: identical architecture (3072 inputs, 10 classes),
# differing only in learning rate and weight decay.
lr2, wd2, num_epochs2 = 1e-3, 1e-3, 100
lr3, wd3, num_epochs3 = 1e-3, 1e-2, 100
lr4, wd4, num_epochs4 = 1e-1, 1e-2, 100
lr5, wd5, num_epochs5 = 1e-5, 1e-2, 100
# Instantiate each model on the active device with its hyperparameters.
model2 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr2, wd=wd2).to(device)
model3 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr3, wd=wd3).to(device)
model4 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr4, wd=wd4).to(device)
model5 = SoftmaxRegression(3 * 32 * 32, 10, lr=lr5, wd=wd5).to(device)
"""# Evaluate the performance of models (20 points)
- for each model, use the sklearn metrics functions to calculate on the test set
- confusion matrix
- accuracy
- classification report
- build the function getTopKAcc() to calculate top_k_accuracy using the sklearn top_k_accuracy_score function with k = 1,2,3
"""
import numpy as np
import torch
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, top_k_accuracy_score
# Define a function to calculate top-k accuracy
def getTopKAcc(model, testloader, top_k, device=None):
    """Top-k accuracy of ``model`` on ``testloader`` via sklearn.

    Parameters
    ----------
    model : nn.Module whose forward returns per-class softmax probabilities
    testloader : iterable of (X, y) batches
    top_k : int -- the k in top-k accuracy
    device : optional torch device; defaults to the model's own device.
        (Backward-compatible addition that removes the old implicit
        dependency on a global ``device`` variable.)
    """
    if device is None:
        device = next(model.parameters()).device
    model.eval()
    all_scores = []
    all_labels = []
    with torch.no_grad():
        for X, y in testloader:
            X, y = X.to(device), y.to(device)  # Move data to device
            # BUG FIX: forward() already returns softmax probabilities, so
            # the old torch.softmax(...) here applied softmax twice.
            # (Harmless for the ranking, but redundant and misleading.)
            scores = model(X)
            all_scores.append(scores.cpu().numpy())
            all_labels.append(y.cpu().numpy())
    scores = np.concatenate(all_scores)
    labels = np.concatenate(all_labels)
    # Pass labels explicitly so sklearn does not fail if some class is
    # absent from the labels seen in this loader.
    return top_k_accuracy_score(labels, scores, k=top_k, labels=np.arange(scores.shape[1]))
# Evaluate model1 to model5
# Each model is scored on the test set: confusion matrix, accuracy,
# per-class classification report, and top-k accuracy for k = 1, 2, 3.
models = [model1, model2, model3, model4, model5]
for i, model in enumerate(models, start=1):
    print(f"Model {i}:")
    # Calculate confusion matrix
    test_labels = []
    test_predictions = []
    model.eval()
    with torch.no_grad():
        # Collect hard (argmax) predictions for every test batch.
        for X, y in testloader:
            X, y = X.to(device), y.to(device)  # Move data to device
            preds = model(X)
            test_labels.extend(y.cpu().numpy())
            test_predictions.extend(torch.argmax(preds, dim=1).cpu().numpy())
    cm = confusion_matrix(test_labels, test_predictions)
    # Calculate accuracy
    accuracy = accuracy_score(test_labels, test_predictions)
    # Calculate classification report
    # (reported with numeric target names "0".."9", not the class strings)
    class_report = classification_report(test_labels, test_predictions, target_names=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
    # Calculate top k accuracies (k=1, 2, 3)
    top_1_acc = getTopKAcc(model, testloader, 1)
    top_2_acc = getTopKAcc(model, testloader, 2)
    top_3_acc = getTopKAcc(model, testloader, 3)
    print("Confusion Matrix:")
    print(cm)
    print("\nAccuracy:", accuracy)
    print("\nClassification Report:")
    print(class_report)
    print(f"Top-1 Accuracy: {top_1_acc:.4f}")
    print(f"Top-2 Accuracy: {top_2_acc:.4f}")
    print(f"Top-3 Accuracy: {top_3_acc:.4f}\n")
"""# Best performing model (10 points)
- what is the learning rate and weight decay associated with the best performing model?
- comment on the effect of changing learning rate and weight decay on the basis of the five models you have built.
- use the visualization code below to understand the structure of the learned models under the different hyperparameters. Do you see a pattern in the coefficients as a function of learning rate, weight decay? Explain.
# Visualizing the learned models
"""
# Visualize the learned weights for each class
def visualize_model(model):
    """Render each class's learned weight vector as a 32x32 RGB image."""
    # Pull the linear-layer weights: one 3072-dim row per class. [10, 3072]
    weights = model.net[1].weight.data.cpu().numpy()
    # Recover the image layout: [10, 3, 32, 32] -> [10, 32, 32, 3] so that
    # matplotlib receives height x width x channels.
    weights = weights.reshape(10, 3, 32, 32).transpose(0, 2, 3, 1)
    w_min, w_max = weights.min(), weights.max()
    class_names = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    plt.figure(figsize=(8, 8))
    for idx, name in enumerate(class_names):
        plt.subplot(2, 5, idx + 1)
        # Min-max rescale the weights into 0-255 so they render as pixels.
        img = 255.0 * (weights[idx] - w_min) / (w_max - w_min)
        plt.imshow(img.astype('uint8'), cmap='viridis')
        plt.axis('off')
        plt.title(name)
    plt.tight_layout()
    plt.show()
# YOUR CODE HERE for visualizing model1, ..., model5
# comment on what patterns you see
# Are the coefficients of the best performing model more interpretable than the others?
# Show the learned per-class weight images for models 1 through 5.
for idx, mdl in enumerate(models, start=1):
    print(f"Visualization for Model {idx}:")
    visualize_model(mdl)
"""Learning rate and weight decay can be the imporatnat hyperparameter .By changing these hyperparameters models can be used to diagnose an underfit, overfit, or well-fit model.Models 1 (lr=1e-4) and 5 (lr=1e-5) have very small learning rates. They might converge very slowly, and there's a risk of getting stuck in local minima during training. These models might not reach their full potential in terms of accuracy.
Model 2 (lr=1e-3) has a moderate learning rate. It likely converges faster than Models 1 and 5 but requires careful tuning to avoid overshooting and instability.
Model 4 (lr=1e-1) has a relatively high learning rate. It might converge quickly but could be susceptible to overshooting and oscillations. However, if properly tuned, it can yield excellent results.In above models we can see with change in them there is a change in accuracy and TOP-1,TOP-2,TOP-3 accuracy . And using visualition we can see which model gives the better result (can be seen above) using visualize_model(model).
"""