EMNIST Baseline#
Copying from the beginner workshop, we set up baseline model training for the EMNIST dataset. We will build on this approach throughout the series.
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from torchvision.io import ImageReadMode
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path
import matplotlib.pyplot as plt
data_dir = f"/scratch/{os.environ['USER']}/data"
model_path = f"/scratch/{os.environ['USER']}/model.pt"
# Model and Training
batch_size = 128        # input batch size for training (default: 64)
test_batch_size = 1000  # input batch size for testing (default: 1000)
num_workers = 10        # parallel data loading to speed things up
lr = 1.0                # learning rate (default: 1.0)
gamma = 0.7             # learning rate step gamma (default: 0.7)
no_cuda = False         # disables CUDA training (default: False)
seed = 42               # random seed (default: 42)
log_interval = 10       # how many batches to wait before logging training status (default: 10)
save_model = False      # save the trained model (default: False)
# additional derived settings
use_cuda = not no_cuda and torch.cuda.is_available()
torch.manual_seed(seed)
device = torch.device("cuda" if use_cuda else "cpu")
print("Device:", device)
Device: cuda
Dataset#
In the beginner series, we saw how to load EMNIST using PyTorch's built-in dataset classes. We will copy the result here.
# Loading the data
# Pytorch provides a number of pre-defined dataset classes
# EMNIST is one of them! Pytorch will automatically download the data.
# It will only download if the data is not already present.
data_train = datasets.EMNIST(data_dir, split='balanced', train=True, download=True,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.1307,), (0.3081,))
                             ]))
data_test = datasets.EMNIST(data_dir, split='balanced', train=False,
                            transform=transforms.Compose([
                                transforms.ToTensor(),
                                transforms.Normalize((0.1307,), (0.3081,))
                            ]))
# define pytorch dataloaders for training and testing
train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
test_loader = DataLoader(data_test, batch_size=test_batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
# save a test batch for later testing
image_gen = iter(test_loader)
test_img, test_trg = next(image_gen)
print("Training dataset:", train_loader.dataset)
print("Testing dataset:", test_loader.dataset)
Training dataset: Dataset EMNIST
    Number of datapoints: 112800
    Root location: /scratch/dane2/data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )
Testing dataset: Dataset EMNIST
    Number of datapoints: 18800
    Root location: /scratch/dane2/data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )
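Since matplotlib is imported but not otherwise used in this baseline, here is an optional sanity-check cell (not part of the original workflow) that plots a few characters from the saved test batch. EMNIST images come out transposed relative to how we usually view them, so we transpose each image before plotting.
# Optional sanity check: visualize a few characters from the saved test batch.
# EMNIST images are stored transposed, so transpose before plotting.
fig, axes = plt.subplots(1, 8, figsize=(12, 2))
for ax, img, trg in zip(axes, test_img, test_trg):
    ax.imshow(img.squeeze().T, cmap="gray")   # drop the channel dim, then transpose
    ax.set_title(f"label {int(trg)}")
    ax.axis("off")
plt.show()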
Model definition#
In the beginner workshop's EMNIST notebook, we developed CNN architectures for hand-written characters. We will copy our best architecture here: it takes single-channel grayscale inputs and outputs scores for the 47 classes of the EMNIST balanced split.
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()

        # feature encoder
        self.feature_extractor = nn.Sequential(
            # block 1
            nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=1, bias=False),
            nn.LazyBatchNorm2d(),
            nn.ReLU(),
            # 2
            nn.Conv2d(in_channels=4, out_channels=4, kernel_size=3, stride=1, bias=False),
            nn.LazyBatchNorm2d(),
            nn.ReLU(),
            # 3
            nn.Conv2d(in_channels=4, out_channels=4, kernel_size=3, stride=1, bias=False),
            nn.LazyBatchNorm2d(),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # flatten just as with the linear classifier
            nn.Flatten()
        )

        # linear classification head -- the EMNIST balanced split has 47 classes
        self.classifier = nn.LazyLinear(47)

    def forward(self, x):
        x = self.feature_extractor(x)
        x = self.classifier(x)
        return x

    def num_params(self):
        # total number of parameters in this model
        return sum(p.numel() for p in self.parameters())
# Create the model
model = Classifier()
# let's make sure we can run a batch of data through the model
with torch.no_grad():
    x, y = next(iter(train_loader))
    y_hat = model(x)
y_hat.shape, y_hat, y_hat.sum(axis=-1)
(torch.Size([128, 47]),
tensor([[-0.2897, -0.5617, 0.0628, ..., -1.2602, 0.8728, 0.3598],
[ 0.8654, -0.0719, 0.6095, ..., -0.0504, -0.4102, -0.1677],
[ 0.2731, -0.2552, 0.0274, ..., 0.0960, 0.7097, 0.4219],
...,
[-0.2629, 0.0098, 0.0789, ..., -0.7038, 0.5512, -1.0838],
[ 0.2294, 0.5040, -0.4386, ..., -0.5136, 0.5362, 0.4131],
[-0.1829, -0.6301, 1.2044, ..., -1.4666, -0.5272, -0.3675]]),
tensor([-1.1857, 7.7833, 3.6772, -1.0103, 7.2764, 3.0098, 6.7484, -2.4809,
16.5917, 7.4026, 1.1977, 7.7264, 1.4142, 8.6851, 5.8889, 0.8193,
11.8244, 9.6126, 3.6529, 3.7360, 6.5334, 5.4083, 2.3121, 4.2810,
3.9888, 0.9701, 6.4130, 3.1773, 7.5547, 9.3368, 4.6726, 8.8772,
10.0957, 4.4073, 5.9341, -1.3580, 1.0337, 4.1729, 7.2586, 5.7577,
0.5533, 7.2799, 0.6950, 6.7011, -4.1594, 4.3148, 7.4138, 1.8631,
-1.8655, 6.9863, -0.7021, 8.2847, 4.9730, -1.9363, 6.9793, 1.7842,
13.1547, 7.6535, 6.8881, 9.3821, 8.7600, 5.9675, 3.2702, -0.2760,
2.1410, 5.2800, 4.0835, 0.3117, 8.4077, 4.7187, 1.8049, 0.7824,
7.1940, 10.2863, 7.8565, 6.0385, -1.9799, 6.3945, 0.3884, 0.3304,
5.6073, 9.4075, 10.4626, -0.7341, 3.5893, 2.9042, 6.0140, 4.1800,
7.2520, -0.9992, 8.0851, 7.3866, 10.2711, 5.5463, 2.7001, 7.8807,
7.6535, 7.2546, 7.2469, 4.9917, 0.8115, 12.6958, 3.4918, 4.0801,
8.3012, 2.5887, 1.9310, 4.5293, 4.0950, 4.1541, 6.9728, 7.7243,
3.2597, 8.9427, 3.4462, -2.8556, 7.1138, 4.0573, 1.0241, 6.8103,
2.1410, 11.8644, 2.2207, 1.7766, 4.6555, 4.4057, 7.7114, 6.6785]))
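As the sums above show, the rows of y_hat do not add up to 1: the network outputs raw logits, not probabilities. F.cross_entropy applies a log-softmax internally during training, but if you want probabilities for inspection you can apply a softmax yourself (a small optional check, not in the original notebook):
# Convert logits to class probabilities for inspection (optional).
probs = F.softmax(y_hat, dim=-1)
print(probs.shape, probs.sum(dim=-1)[:5])  # each row now sums to 1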
model
Classifier(
  (feature_extractor): Sequential(
    (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (4): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (7): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Flatten(start_dim=1, end_dim=-1)
  )
  (classifier): Linear(in_features=484, out_features=47, bias=True)
)
print("Number of parameters:", model.num_params())
Number of parameters: 23143
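To see where those parameters live, here is a small optional sketch that breaks the count down by top-level submodule:
# Optional: per-submodule parameter breakdown (most weights sit in the linear head).
for name, module in model.named_children():
    print(name, sum(p.numel() for p in module.parameters()), "parameters")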
Training and testing loops#
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    losses = []
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('\r\tTrain epoch {}: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()), end='')
def test(model, device, test_loader, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the highest logit
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\rTest epoch {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        epoch,
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
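For intuition, the accuracy bookkeeping in test works like this on a tiny made-up example (hypothetical numbers, not part of the original notebook):
# Tiny illustration of the accuracy bookkeeping used in test().
logits = torch.tensor([[0.1, 2.0, -1.0],
                       [1.5, 0.2,  0.3]])  # two samples, three classes
target = torch.tensor([1, 2])              # true labels
pred = logits.argmax(dim=1, keepdim=True)  # predicted classes: [[1], [0]]
print(pred.eq(target.view_as(pred)).sum().item())  # 1 correct prediction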
def train_and_test(model, dl_train, dl_test, save_name=model_path, epochs=5):
    optimizer = optim.Adadelta(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
    for epoch in range(1, epochs + 1):
        train(model, device, dl_train, optimizer, epoch)
        test(model, device, dl_test, epoch)
        scheduler.step()

    if save_model:
        torch.save(model.state_dict(), save_name)
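With lr=1.0 and gamma=0.7, the StepLR scheduler multiplies the learning rate by 0.7 after every epoch. A quick sketch (optional, not in the original) of how the rate evolves over the five epochs used below:
# How the learning rate decays under StepLR(step_size=1, gamma=0.7) (sketch).
current_lr = lr
for epoch in range(1, 6):
    print(f"epoch {epoch}: lr = {current_lr:.4f}")
    current_lr *= gamma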
Run Training#
model = Classifier().to(device)
model
Classifier(
  (feature_extractor): Sequential(
    (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (1): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (4): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (7): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Flatten(start_dim=1, end_dim=-1)
  )
  (classifier): LazyLinear(in_features=0, out_features=47, bias=True)
)
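Note that this freshly constructed model still shows LazyBatchNorm2d(0, ...) and LazyLinear(in_features=0, ...): the lazy layers only infer their sizes from the first batch that passes through the model, which happens during the first training step below.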
train_and_test(model, train_loader, test_loader)
Test epoch 1: Average loss: 0.6545, Accuracy: 15048/18800 (80.04%)
Test epoch 2: Average loss: 0.5639, Accuracy: 15416/18800 (82.00%)
Test epoch 3: Average loss: 0.5422, Accuracy: 15651/18800 (83.25%)
Test epoch 4: Average loss: 0.5266, Accuracy: 15735/18800 (83.70%)
Test epoch 5: Average loss: 0.5081, Accuracy: 15831/18800 (84.21%)
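The baseline reaches roughly 84% test accuracy after five epochs. If save_model had been set to True, the trained weights would have been written to model_path; here is a sketch of how they could be restored later, assuming the same Classifier definition is available:
# Sketch: restore a saved checkpoint (only applicable if save_model=True was used above).
model = Classifier().to(device)
with torch.no_grad():
    x, _ = next(iter(train_loader))
    model(x.to(device))  # dry-run one batch so the lazy layers materialize their shapes
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()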