EMNIST Baseline#

Carrying over the setup from the beginner workshop, we introduce baseline model training for the EMNIST dataset. We will build on this approach throughout the series.

import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from torchvision.io import ImageReadMode
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path
import matplotlib.pyplot as plt
data_dir = f"/scratch/{os.environ['USER']}/data"
model_path = f"/scratch/{os.environ['USER']}/model.pt"

# Model and Training
batch_size = 128        # input batch size for training (default: 64)
test_batch_size = 1000  # input batch size for testing (default: 1000)
num_workers = 10        # parallel data loading to speed things up
lr = 1.0                # learning rate (default: 1.0)
gamma = 0.7             # learning rate step gamma (default: 0.7)
no_cuda = False         # disables CUDA training (default: False)
seed = 42               # random seed (default: 42)
log_interval = 10       # how many batches to wait before logging training status (default: 10)
save_model = False      # save the trained model (default: False)

# additional derived settings
use_cuda = not no_cuda and torch.cuda.is_available()
torch.manual_seed(seed)
device = torch.device("cuda" if use_cuda else "cpu")

print("Device:", device)
Device: cuda

Dataset#

In the beginner series, we saw how to set up a custom PyTorch Dataset class for the ImageNet dataset. For EMNIST we don't need one: torchvision provides a ready-made dataset class, and we reuse that loading code here.

# Loading the data

# PyTorch provides a number of pre-defined dataset classes, and
# EMNIST is one of them. torchvision will download the data automatically,
# but only if it is not already present in data_dir.
data_train = datasets.EMNIST(data_dir, split='balanced', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ]))

data_test = datasets.EMNIST(data_dir, split='balanced', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ]))

# define pytorch dataloaders for training and testing
train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
test_loader = DataLoader(data_test, batch_size=test_batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

# set aside one test batch for later spot checks
image_gen = iter(test_loader)
test_img, test_trg = next(image_gen)
print("Training dataset:", train_loader.dataset)
print("Testing dataset:", test_loader.dataset)
Training dataset: Dataset EMNIST
    Number of datapoints: 112800
    Root location: /scratch/dane2/data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )
Testing dataset: Dataset EMNIST
    Number of datapoints: 18800
    Root location: /scratch/dane2/data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )
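
A quick sanity check that is not part of the original workshop code: plot a few characters from the test batch we just set aside. It only uses the test_img/test_trg batch and the matplotlib.pyplot import from above; note that EMNIST stores characters transposed relative to the usual reading orientation, hence the .T.

# visualize a few characters from the saved test batch (sketch)
fig, axes = plt.subplots(1, 8, figsize=(12, 2))
for i, ax in enumerate(axes):
    ax.imshow(test_img[i, 0].T, cmap="gray")  # transpose so the character reads upright
    ax.set_title(int(test_trg[i]))            # integer class label (0-46)
    ax.axis("off")
plt.show()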

Model definition#

In the EMNIST notebook, we developed CNN architectures for hand-written characters. We copy our best architecture here: it expects single-channel (grayscale) 28x28 inputs and outputs scores for the 47 classes of the EMNIST balanced split.

class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()

        # feature encoder
        self.feature_extractor = nn.Sequential(
            # block 1
            nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=1, bias=False),
            nn.LazyBatchNorm2d(),
            nn.ReLU(),
            # 2
            nn.Conv2d(in_channels=4, out_channels=4, kernel_size=3, stride=1, bias=False),
            nn.LazyBatchNorm2d(),
            nn.ReLU(),
            # 3
            nn.Conv2d(in_channels=4, out_channels=4, kernel_size=3, stride=1, bias=False),
            nn.LazyBatchNorm2d(),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # flatten just as with the linear classifier
            nn.Flatten()
        )
        
        # linear classification head -- the EMNIST balanced split has 47 classes
        self.classifier = nn.LazyLinear(47)

    def forward(self, x):
        x = self.feature_extractor(x)
        x = self.classifier(x)
        return x
    
    def num_params(self):
        return sum(p.numel() for p in self.parameters())

# Create the model
model = Classifier()
# let's make sure we can run a batch of data through the model
with torch.no_grad():
    x, y = next(iter(train_loader))
    y_hat = model(x)
    
y_hat.shape, y_hat, y_hat.sum(axis=-1)
(torch.Size([128, 47]),
 tensor([[-0.2897, -0.5617,  0.0628,  ..., -1.2602,  0.8728,  0.3598],
         [ 0.8654, -0.0719,  0.6095,  ..., -0.0504, -0.4102, -0.1677],
         [ 0.2731, -0.2552,  0.0274,  ...,  0.0960,  0.7097,  0.4219],
         ...,
         [-0.2629,  0.0098,  0.0789,  ..., -0.7038,  0.5512, -1.0838],
         [ 0.2294,  0.5040, -0.4386,  ..., -0.5136,  0.5362,  0.4131],
         [-0.1829, -0.6301,  1.2044,  ..., -1.4666, -0.5272, -0.3675]]),
 tensor([-1.1857,  7.7833,  3.6772, -1.0103,  7.2764,  3.0098,  6.7484, -2.4809,
         16.5917,  7.4026,  1.1977,  7.7264,  1.4142,  8.6851,  5.8889,  0.8193,
         11.8244,  9.6126,  3.6529,  3.7360,  6.5334,  5.4083,  2.3121,  4.2810,
          3.9888,  0.9701,  6.4130,  3.1773,  7.5547,  9.3368,  4.6726,  8.8772,
         10.0957,  4.4073,  5.9341, -1.3580,  1.0337,  4.1729,  7.2586,  5.7577,
          0.5533,  7.2799,  0.6950,  6.7011, -4.1594,  4.3148,  7.4138,  1.8631,
         -1.8655,  6.9863, -0.7021,  8.2847,  4.9730, -1.9363,  6.9793,  1.7842,
         13.1547,  7.6535,  6.8881,  9.3821,  8.7600,  5.9675,  3.2702, -0.2760,
          2.1410,  5.2800,  4.0835,  0.3117,  8.4077,  4.7187,  1.8049,  0.7824,
          7.1940, 10.2863,  7.8565,  6.0385, -1.9799,  6.3945,  0.3884,  0.3304,
          5.6073,  9.4075, 10.4626, -0.7341,  3.5893,  2.9042,  6.0140,  4.1800,
          7.2520, -0.9992,  8.0851,  7.3866, 10.2711,  5.5463,  2.7001,  7.8807,
          7.6535,  7.2546,  7.2469,  4.9917,  0.8115, 12.6958,  3.4918,  4.0801,
          8.3012,  2.5887,  1.9310,  4.5293,  4.0950,  4.1541,  6.9728,  7.7243,
          3.2597,  8.9427,  3.4462, -2.8556,  7.1138,  4.0573,  1.0241,  6.8103,
          2.1410, 11.8644,  2.2207,  1.7766,  4.6555,  4.4057,  7.7114,  6.6785]))
model
Classifier(
  (feature_extractor): Sequential(
    (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (4): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (7): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Flatten(start_dim=1, end_dim=-1)
  )
  (classifier): Linear(in_features=484, out_features=47, bias=True)
)
print("Number of parameters:", model.num_params())
Number of parameters: 23143
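
Where does in_features=484 come from? Each 3x3 convolution without padding trims one pixel from each side, so the 28x28 input shrinks to 26, 24, and then 22 pixels per side; the 2x2 max-pooling halves that to 11, and flattening 4 channels of 11x11 feature maps gives 4 * 11 * 11 = 484 features. We can confirm this with a dummy forward pass (a sketch using the model we just built):

# confirm the flattened feature size: 28 -> 26 -> 24 -> 22, then 11 after pooling
with torch.no_grad():
    dummy = torch.zeros(1, 1, 28, 28)
    print(model.feature_extractor(dummy).shape)  # expected: torch.Size([1, 484])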

Training and testing loops#

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('\r\tTrain epoch {}: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()), end='')
            
def test(model, device, test_loader, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # index of the highest logit
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\rTest epoch {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        epoch,
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
def train_and_test(model, dl_train, dl_test, save_name=model_path, epochs=5):
    # set up the optimizer and learning-rate schedule
    optimizer = optim.Adadelta(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

    for epoch in range(1, epochs + 1):
        train(model, device, dl_train, optimizer, epoch)
        test(model, device, dl_test, epoch)
        scheduler.step()

    if save_model:
        torch.save(model.state_dict(), save_name)
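
If save_model is set to True, the trained weights end up at model_path and can be restored into a fresh Classifier later. A minimal sketch (it assumes a checkpoint exists at model_path; running a dummy batch through the new model first materializes the lazy layers so load_state_dict sees concrete shapes):

# restore a saved checkpoint into a fresh model (assumes save_model was True)
restored = Classifier().to(device)
with torch.no_grad():
    restored(torch.zeros(1, 1, 28, 28, device=device))  # materialize lazy layers
restored.load_state_dict(torch.load(model_path, map_location=device))
restored.eval()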

Run Training#

model = Classifier().to(device)
model
Classifier(
  (feature_extractor): Sequential(
    (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (1): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (4): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (7): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Flatten(start_dim=1, end_dim=-1)
  )
  (classifier): LazyLinear(in_features=0, out_features=47, bias=True)
)
train_and_test(model, train_loader, test_loader)
Test epoch 1: Average loss: 0.6545, Accuracy: 15048/18800 (80.04%)
Test epoch 2: Average loss: 0.5639, Accuracy: 15416/18800 (82.00%)
Test epoch 3: Average loss: 0.5422, Accuracy: 15651/18800 (83.25%)
Test epoch 4: Average loss: 0.5266, Accuracy: 15735/18800 (83.70%)
Test epoch 5: Average loss: 0.5081, Accuracy: 15831/18800 (84.21%)
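
As a final spot check, we can run the trained model on the test batch we set aside at the start (test_img, test_trg); the batch accuracy should be roughly in line with the ~84% reported above. A minimal sketch:

# evaluate the trained model on the batch set aside earlier
model.eval()
with torch.no_grad():
    logits = model(test_img.to(device))
preds = logits.argmax(dim=1).cpu()
print(f"Accuracy on the saved test batch: {(preds == test_trg).float().mean().item():.2%}")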