ML3 - Machine Learning in Practice; Generalization¶

Goals¶

  • Be able to build and train a simple multi-layer perceptron neural network in PyTorch
  • Understand what it means for a model to generalize
  • Know how and why to prevent overfitting by splitting the dataset into training and validation sets
  • Know how and why to hold out a test set
  • Have a basic understanding of bias and variance as they relate to underfitting and overfitting
In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda') # nvidia/cuda
elif torch.backends.mps.is_available():
    device = torch.device('mps') # apple
else:
    device = torch.device('cpu') # no acceleration

print(f'Using device: {device}')
Using device: mps
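Every model and batch of data below is moved onto this device with .to(device); tensors have to live on the same device to be used together. A quick illustrative example:

In [ ]:
# Illustrative: tensors are created on the CPU by default and must be moved explicitly
t = torch.ones(3)
t_dev = t.to(device)    # copy to the selected device (cuda / mps / cpu)
print(t_dev.device)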

Autograd in PyTorch¶

Generate some data:

In [2]:
# Set random seed for reproducibility
torch.manual_seed(41)
np.random.seed(41)

x_np = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_true_np = 3.0 * x_np + 2.0
noise = np.random.randn(5) * 1.0
y_np = y_true_np + noise

# Visualize the data
plt.figure(figsize=(8, 5))
sns.scatterplot(x=x_np, y=y_np)
plt.xlabel('x')
plt.ylabel('y')
plt.show()
[Figure: scatter plot of the noisy (x, y) data]
In [9]:
print(f"Data points (x, y):")
for i in range(5):
    print(f"  ({x_np[i]:.1f}, {y_np[i]:.2f})")

# Convert to PyTorch tensors
x = torch.from_numpy(x_np).float()
y = torch.from_numpy(y_np).float()

# Parameters (requires_grad=True enables autograd)
w = torch.tensor(0.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

# Forward pass: compute prediction
y_pred = w * x + b

# Compute loss (MSE)
loss = ((y_pred - y) ** 2).mean()

# Gradients are None until backward() has been called
print(w.grad)
print(b.grad)

# Backward pass: compute gradients
loss.backward()

# print(f"Prediction: {y_pred.item()}")
print(f"Loss: {loss.item()}")
print(f"Gradient w.r.t. w: {w.grad.item()}")
print(f"Gradient w.r.t. b: {b.grad.item()}")
Data points (x, y):
  (1.0, 4.73)
  (2.0, 8.10)
  (3.0, 11.25)
  (4.0, 13.07)
  (5.0, 17.57)
None
None
Loss: 138.83682250976562
Gradient w.r.t. w: -77.9301986694336
Gradient w.r.t. b: -21.890644073486328

Let's check that we agree with PyTorch; these are the analytical gradients we computed in ML1:

In [10]:
def grad_wrt_w(yhat, x, y):
    return (2/len(y)) * np.sum((yhat - y) * x)
    
def grad_wrt_b(yhat, x, y):
    return (2/len(y)) * np.sum(yhat - y)

dw = grad_wrt_w(y_pred.detach().numpy(), x.detach().numpy(), y.detach().numpy())
print(f"Analytical gradient wrt w: {dw}")

db = grad_wrt_b(y_pred.detach().numpy(), x.detach().numpy(), y.detach().numpy())
print(f"Analytical gradient wrt b: {db}")
Analytical gradient wrt w: -77.9301986694336
Analytical gradient wrt b: -21.890644073486328
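They match. As one extra sanity check (a sketch using the x, y, w, and b tensors defined above, not something PyTorch requires), the same gradients can be approximated numerically with central finite differences:

In [ ]:
# Sketch: numerically approximate dL/dw and dL/db around the current w=0, b=0
def mse_at(w_val, b_val):
    return (((w_val * x + b_val) - y) ** 2).mean().item()

eps = 1e-3
dw_numeric = (mse_at(w.item() + eps, b.item()) - mse_at(w.item() - eps, b.item())) / (2 * eps)
db_numeric = (mse_at(w.item(), b.item() + eps) - mse_at(w.item(), b.item() - eps)) / (2 * eps)
print(dw_numeric, db_numeric)  # should be close to -77.93 and -21.89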

Gradient Descent in PyTorch¶

Now let's see what a gradient descent loop looks like:

In [11]:
plt.ion()

# Hyperparameters
learning_rate = 0.01
num_epochs = 10

# Parameters (requires_grad=True enables autograd)
w = torch.tensor(0.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

print(f"Initial: w={w.item():.4f}, b={b.item():.4f}")

# Training loop
for epoch in range(num_epochs):
    # "Forward pass": compute predictions
    y_pred = w * x + b
    
    # Compute loss (MSE)
    loss = ((y_pred - y) ** 2).mean()
    
    # Backward pass: compute gradients with autograd
    loss.backward()
    
    # Manual gradient descent update
    with torch.no_grad():  # Don't track these operations in autograd
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad
    
    # zero out the gradients for next iteration
    w.grad.zero_()
    b.grad.zero_()
    
    # Show progress
    print(f"Epoch {epoch+1:3d}: Loss={loss.item():.4f}, w={w.item():.4f}, b={b.item():.4f}")

    plt.figure(figsize=(10, 5))
    
    # Left plot: Current fit
    plt.subplot(1, 2, 1)
    sns.scatterplot(x=x_np, y=y_np, s=100, color='blue')
    # plt.plot(x_np, y_np, 'r--', alpha=0.5, label='True line (y=3x+2)')
    plt.plot(x_np, y_pred.detach().numpy(), 'g-', linewidth=2, label=f'Current fit (y={w.item():.2f}x+{b.item():.2f})')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(f'Epoch {epoch+1}/{num_epochs}')
    plt.ylim(0, 18)
    
    # Right plot: Loss over time
    if epoch == 0:
        loss_history = [loss.item()]
    else:
        loss_history.append(loss.item())
    
    plt.subplot(1, 2, 2)
    plt.plot(range(1, len(loss_history)+1), loss_history, 'b-', linewidth=2)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.grid(True, alpha=0.3)
    plt.show()
    import time
    time.sleep(1)
    
Initial: w=0.0000, b=0.0000
Epoch   1: Loss=138.8368, w=0.7793, b=0.2189
Epoch   2: Loss=81.0656, w=1.3740, b=0.3867
Epoch   3: Loss=47.3990, w=1.8278, b=0.5154
Epoch   4: Loss=27.7794, w=2.1741, b=0.6143
Epoch   5: Loss=16.3458, w=2.4382, b=0.6905
Epoch   6: Loss=9.6824, w=2.6397, b=0.7493
Epoch   7: Loss=5.7990, w=2.7933, b=0.7949
Epoch   8: Loss=3.5356, w=2.9104, b=0.8303
Epoch   9: Loss=2.2163, w=2.9996, b=0.8579
Epoch  10: Loss=1.4471, w=3.0675, b=0.8797
[Per-epoch fit and training-loss plots not shown]
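In practice we rarely write the parameter update by hand; torch.optim handles it for us, and we'll rely on it for the rest of the notebook. A minimal sketch of the same loop using optim.SGD (fresh parameters, same data and hyperparameters as above, plots omitted):

In [ ]:
# Minimal sketch: the same gradient descent loop, letting optim.SGD do the update
w = torch.tensor(0.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)
optimizer = optim.SGD([w, b], lr=learning_rate)

for epoch in range(num_epochs):
    y_pred = w * x + b                  # forward pass
    loss = ((y_pred - y) ** 2).mean()   # MSE loss
    optimizer.zero_grad()               # clear old gradients
    loss.backward()                     # backward pass
    optimizer.step()                    # replaces the manual w -= lr * w.grad update

print(f"After {num_epochs} epochs: w={w.item():.4f}, b={b.item():.4f}")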

Part 1: Load CIFAR-10 and Display Sample Images¶

In [13]:
# Define CIFAR-10 class names
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Normalize the data to (-1, 1)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])

# Load CIFAR-10
cifar10_train = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)

# Use a small subset for faster training
subset_size = 2000  # Small subset for demonstration
indices = torch.randperm(len(cifar10_train))[:subset_size]
data_subset = torch.utils.data.Subset(cifar10_train, indices)

# Display one image from each class
fig, axes = plt.subplots(2, 5, figsize=(14, 6))
axes = axes.flatten()

# Create a dictionary to track which classes we've seen
class_indices = {i: None for i in range(len(classes))}

# Find one image from each class
for idx, (image, label) in enumerate(data_subset):
    if class_indices[label] is None:
        class_indices[label] = idx
    if all(v is not None for v in class_indices.values()):
        break

# Plot one image from each class
for class_idx in range(len(classes)):
    image, label = data_subset[class_indices[class_idx]]
    # Denormalize for display: multiply by std and add mean
    image = image * 0.5 + 0.5  # undo normalization
    image = image.permute(1, 2, 0).numpy()  # Convert to HWC format
    axes[class_idx].imshow(image)
    axes[class_idx].set_title(f'{classes[label]}')
    axes[class_idx].axis('off')

plt.tight_layout()
plt.show()
[Figure: one sample image from each CIFAR-10 class]
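A side note on the Normalize transform used above: ToTensor first scales pixel values to [0, 1], and Normalize with mean 0.5 and std 0.5 per channel then maps them to [-1, 1] via (x - mean) / std, which is why the plotting code undoes it with image * 0.5 + 0.5. A tiny illustrative check:

In [ ]:
# Illustrative: (x - 0.5) / 0.5 maps [0, 1] pixel values onto [-1, 1]
raw_pixels = torch.tensor([0.0, 0.5, 1.0])
print((raw_pixels - 0.5) / 0.5)   # tensor([-1., 0., 1.])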

Question: Suppose we're trying to learn a linear classifier for 10 classes that takes raw RGB pixels as input.

  • What is the size of the weight matrix $W$?
  • What is the size of the bias vector $b$?
  • How many parameters does the model have?
In [14]:
data_subset[0][0].shape # an input datapoint (image)
Out[14]:
torch.Size([3, 32, 32])
In [17]:
3 * 32 * 32
Out[17]:
3072
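Putting those numbers together, a worked count for the question above: the classifier needs one weight per (class, input feature) pair plus one bias per class.

In [ ]:
# Worked parameter count for a 10-class linear classifier on raw CIFAR-10 pixels
input_size = 3 * 32 * 32              # 3072 input features per image
num_classes = 10
w_params = num_classes * input_size   # W is 10 x 3072 = 30,720 weights
b_params = num_classes                # b has 10 entries
print(w_params + b_params)            # 30,730 -- matches the model summary in Part 2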

Part 2: Linear Classifier (Single-layer Neural Network)¶

In [20]:
# Define a simple linear classifier
class LinearClassifier(nn.Module):

    # the constructor initializes the "layers" - in this case, 
    # just one "fully connected" or linear layer
    def __init__(self, input_size=3*32*32, num_classes=10):
        super(LinearClassifier, self).__init__()
        self.fc = nn.Linear(input_size, num_classes) # our only layer
    
    def forward(self, x):
        x = x.view(x.size(0), -1) # unroll the pixels into a vector
        
        # Apply linear layer to get y (raw un-normalized scores)
        x = self.fc(x)
        return x

# Create model and move to device
linear_model = LinearClassifier().to(device)
print(f'Linear model created:')
print(linear_model)
print(f'Number of parameters: {sum(p.numel() for p in linear_model.parameters()):,}')
Linear model created:
LinearClassifier(
  (fc): Linear(in_features=3072, out_features=10, bias=True)
)
Number of parameters: 30,730
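To make the forward pass concrete, a quick illustrative shape check (using the linear_model and device defined above): a batch of 4 random tensors shaped like CIFAR-10 images goes in, and 10 raw class scores per image come out.

In [ ]:
# Illustrative shape check with a random batch of 4 fake "images"
dummy = torch.randn(4, 3, 32, 32).to(device)
scores = linear_model(dummy)   # view() flattens to (4, 3072), fc maps to (4, 10)
print(scores.shape)            # torch.Size([4, 10])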
In [21]:
def train_model(model, dataset, num_epochs=10, batch_size=128, learning_rate=0.01):

    # takes care of loading data and creating batches for us
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 

    # calculates softmax + cross-entropy loss for us:
    loss_fn = nn.CrossEntropyLoss()

    # takes care of the gradient updates for us:
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # we'll track some history:
    train_losses = []
    train_accuracy = []

    for epoch in range(num_epochs):
        model.train() # set model to "Train" mode
        total_loss = 0.0
        num_batches = 0
        num_correct = 0
        
        for batch_idx, (images, labels) in enumerate(loader):
            # Move data to device
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(images) # compute the model outputs
            loss = loss_fn(outputs, labels) # calculate the loss
            
            # Backward pass and optimization
            optimizer.zero_grad()   # Clear old gradients
            loss.backward()         # Compute gradients
            optimizer.step()        # Update parameters
            
            total_loss += loss.item()
            num_batches += 1

            # calculate accuracy
            _, predicted = torch.max(outputs.data, 1)
            num_correct += (predicted == labels).sum().item()
        
        avg_loss = total_loss / num_batches
        train_losses.append(avg_loss)

        avg_acc = num_correct / len(dataset)
        train_accuracy.append(avg_acc)
        if epoch % 5 == 0:
            print(f'Epoch {epoch+1:2d}: Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}')

    return train_losses, train_accuracy
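One detail worth calling out: nn.CrossEntropyLoss expects the raw, un-normalized scores (logits) straight from the model together with integer class labels; it applies log-softmax internally, which is why neither model has a softmax layer. A small illustrative check:

In [ ]:
# Illustrative: CrossEntropyLoss(logits, labels) equals -log softmax(correct class)
loss_fn = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 0.5, -1.0]])          # raw scores for 3 classes, batch of 1
label = torch.tensor([0])                          # correct class is index 0
manual = -torch.log_softmax(logits, dim=1)[0, 0]   # -log p(correct class)
print(loss_fn(logits, label).item(), manual.item())  # same value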
In [22]:
# Train the linear classifier

# Create model and move to device
linear_model = LinearClassifier().to(device)
print(f'Linear model created:')
print(linear_model)
print(f'Number of parameters: {sum(p.numel() for p in linear_model.parameters()):,}')

linear_train_losses, linear_train_accs = train_model(
    linear_model, data_subset, 
    num_epochs=50, 
    batch_size=128, 
    learning_rate=0.01
)

print(f'Linear Classifier - Final Train Accuracy: {linear_train_accs[-1]:.2f}')
Linear model created:
LinearClassifier(
  (fc): Linear(in_features=3072, out_features=10, bias=True)
)
Number of parameters: 30,730
Epoch  1: Loss = 2.1366, Accuracy = 0.2370
Epoch  6: Loss = 1.8165, Accuracy = 0.4045
Epoch 11: Loss = 1.7216, Accuracy = 0.4425
Epoch 16: Loss = 1.6588, Accuracy = 0.4780
Epoch 21: Loss = 1.6092, Accuracy = 0.4965
Epoch 26: Loss = 1.5720, Accuracy = 0.5065
Epoch 31: Loss = 1.5368, Accuracy = 0.5215
Epoch 36: Loss = 1.5011, Accuracy = 0.5435
Epoch 41: Loss = 1.4748, Accuracy = 0.5490
Epoch 46: Loss = 1.4525, Accuracy = 0.5610
Linear Classifier - Final Train Accuracy: 0.56
In [23]:
# Visualize linear classifier training
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Plot 1: Training loss
ax = axes[0]
ax.plot(linear_train_losses, marker='o', linewidth=2, markersize=4)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Training Loss', fontsize=12)
ax.set_title('Linear Classifier: Training Loss', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)

# Plot 2: Accuracy
ax = axes[1]
ax.plot(linear_train_accs, marker='o', linewidth=2, markersize=4, color='tab:orange')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Training Accuracy', fontsize=12)
ax.set_title('Linear Classifier: Training Accuracy', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1])

plt.tight_layout()
plt.savefig('linear_classifier_training.png', dpi=100, bbox_inches='tight')
plt.show()
[Figure: linear classifier training loss and training accuracy curves]

Part 3: Multi-Layer Perceptron (MLP) Classifier¶

In [24]:
# Define a multi-layer perceptron
class MLPClassifier(nn.Module):
    def __init__(self, input_size=3*32*32, hidden_sizes=[512, 256], num_classes=10):
        super(MLPClassifier, self).__init__()
        
        layers = []
        prev_size = input_size
        
        # Hidden layers with ReLU activation
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.ReLU())
            prev_size = hidden_size
        
        # Output layer
        layers.append(nn.Linear(prev_size, num_classes))
        
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        # Flatten the input
        x = x.view(x.size(0), -1)
        # Pass through network
        x = self.network(x)
        return x
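Before training it, a quick worked parameter count for this architecture (hidden sizes 512 and 256): each nn.Linear layer contributes in_features * out_features weights plus out_features biases.

In [ ]:
# Worked parameter count for hidden_sizes=[512, 256] and 10 output classes
layer_params = [
    3072 * 512 + 512,   # first hidden layer: 1,573,376
    512 * 256 + 256,    # second hidden layer:  131,328
    256 * 10 + 10,      # output layer:           2,570
]
print(sum(layer_params))  # 1,707,274 -- matches the summary printed below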
In [25]:
# Create MLP model and move to device
mlp_model = MLPClassifier(input_size=3*32*32, hidden_sizes=[512, 256], num_classes=10).to(device)
print(f'MLP model created:')
print(mlp_model)
print(f'Number of parameters: {sum(p.numel() for p in mlp_model.parameters()):,}')
print('Training MLP Classifier...')
print('='*50)

# Train the MLP classifier
mlp_train_losses, mlp_train_accs = train_model(
    mlp_model, data_subset,
    num_epochs=50, 
    batch_size=128, 
    learning_rate=0.1
)

print('='*50)
print(f'MLP Classifier - Final Train Accuracy: {mlp_train_accs[-1]:.2f}')
MLP model created:
MLPClassifier(
  (network): Sequential(
    (0): Linear(in_features=3072, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
)
Number of parameters: 1,707,274
Training MLP Classifier...
==================================================
Epoch  1: Loss = 2.2412, Accuracy = 0.1920
Epoch  6: Loss = 1.7231, Accuracy = 0.4160
Epoch 11: Loss = 1.4202, Accuracy = 0.5205
Epoch 16: Loss = 1.0770, Accuracy = 0.6710
Epoch 21: Loss = 0.8201, Accuracy = 0.7630
Epoch 26: Loss = 0.6024, Accuracy = 0.8270
Epoch 31: Loss = 0.4648, Accuracy = 0.8900
Epoch 36: Loss = 0.1478, Accuracy = 0.9795
Epoch 41: Loss = 0.3919, Accuracy = 0.9270
Epoch 46: Loss = 0.0446, Accuracy = 0.9990
==================================================
MLP Classifier - Final Train Accuracy: 1.00
In [26]:
# Visualize MLP classifier training
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Plot 1: Training loss
ax = axes[0]
ax.plot(mlp_train_losses, marker='o', linewidth=2, markersize=4)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Training Loss', fontsize=12)
ax.set_title('MLP Classifier: Training Loss', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)

# Plot 2: Train accuracy
ax = axes[1]
ax.plot(mlp_train_accs, marker='o', linewidth=2, markersize=4, color='tab:orange')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Train Accuracy', fontsize=12)
ax.set_title('MLP Classifier: Train Accuracy', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1])

plt.tight_layout()
plt.savefig('mlp_classifier_training.png', dpi=100, bbox_inches='tight')
plt.show()
[Figure: MLP training loss and training accuracy curves]

Wow, we crushed it!!¶

Let's see how this model does on some more CIFAR-10 images!

In [27]:
# Load CIFAR-10 test data
cifar10_test = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)

# Use a small subset for faster evaluation
subset_size = 2000  # Small subset for demonstration
indices = torch.randperm(len(cifar10_test))[:subset_size]
test_subset = torch.utils.data.Subset(cifar10_test, indices)
In [28]:
def evaluate_model(model, dataset, batch_size=128):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            
            num_correct += (predicted == labels).sum().item()
            num_samples += labels.size(0)
            
    accuracy = num_correct / num_samples
    return accuracy
In [29]:
print("Linear model evaluation accuracy:", evaluate_model(linear_model, test_subset))
Linear model evaluation accuracy: 0.348

Hmm... I thought we had over 50% accuracy!

What about the MLP?

In [30]:
print("MLP evaluation accuracy:", evaluate_model(mlp_model, test_subset))
MLP evaluation accuracy: 0.396

uh oh...

What we're seeing here is that our models are failing to generalize.
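The MLP fits its 2,000 training images almost perfectly but classifies only about 40% of unseen test images correctly; that gap is overfitting. Per the goals above, the standard remedy is to hold out part of the training data as a validation set and monitor it while training. A minimal sketch using torch.utils.data.random_split (the 1600/400 split sizes here are illustrative):

In [ ]:
from torch.utils.data import random_split

# Quantify the generalization gap: accuracy on the data the MLP was trained on vs. unseen data
print("MLP train accuracy:", evaluate_model(mlp_model, data_subset))
print("MLP test accuracy: ", evaluate_model(mlp_model, test_subset))

# Carve a validation set out of the training subset (1600 train / 400 validation)
train_split, val_split = random_split(
    data_subset, [1600, 400],
    generator=torch.Generator().manual_seed(41)   # reproducible split
)

# Train only on train_split and watch val_split to catch overfitting, e.g.:
# model = MLPClassifier().to(device)
# train_model(model, train_split, num_epochs=50, learning_rate=0.1)
# print("Validation accuracy:", evaluate_model(model, val_split))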