Building the network#

The torch.nn subpackage in PyTorch contains many neural network building blocks called “modules”, all subclasses of nn.Module. We can compose these in arbitrary ways to build network architectures tailored to a given problem.

import torch
import torch.nn as nn

# do everything on gpu unless we explicitly say otherwise
torch.set_default_device('cuda')

The basics#

We saw examples like this in earlier notebooks:

model = nn.Sequential(
    nn.Linear(10,10),
    nn.Tanh(),
    nn.Linear(10,10),
    nn.Tanh(),
    nn.Linear(10,3),
    nn.Sigmoid()
)

# printing the model shows the layers
model
Sequential(
  (0): Linear(in_features=10, out_features=10, bias=True)
  (1): Tanh()
  (2): Linear(in_features=10, out_features=10, bias=True)
  (3): Tanh()
  (4): Linear(in_features=10, out_features=3, bias=True)
  (5): Sigmoid()
)

nn.Sequential, nn.Linear, nn.Tanh, and nn.Sigmoid are all examples of modules. There are many more. You can see a full list here: https://pytorch.org/docs/stable/nn.html
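For instance, torch.nn also includes convolutional, normalization, recurrent, and regularization layers. Just to illustrate that these are ordinary modules (the specific layers below are arbitrary examples, not used elsewhere in this notebook):

# a few other common modules, constructed here only as examples
conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
norm = nn.BatchNorm1d(num_features=10)
drop = nn.Dropout(p=0.2)
print(conv, norm, drop, sep='\n')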

Callable. All modules are callable, meaning they can be evaluated like a function:

layer = nn.Linear(4,5)
x = torch.randn(7, 4)
layer(x)
tensor([[ 0.2067, -0.1036, -0.0760,  0.1359,  0.1805],
        [-0.7918,  0.2130,  0.5677, -0.1730,  0.6101],
        [ 0.3045, -0.1712, -0.2181,  0.1504, -0.0715],
        [-0.0092, -0.1449, -0.2185,  0.2841,  0.0868],
        [ 1.0998, -0.0064, -0.1398,  0.6454,  0.4840],
        [ 0.0100, -0.4746, -0.8649,  0.5077, -0.4945],
        [ 0.8390, -0.2072, -0.4667,  0.6762,  0.2141]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
layer = nn.Tanh()
layer(x)
tensor([[ 0.0721, -0.4746,  0.3835, -0.3337],
        [-0.8418, -0.8637, -0.7904, -0.7616],
        [ 0.8242, -0.2777,  0.2174, -0.1672],
        [ 0.4278, -0.6650, -0.3149,  0.0979],
        [ 0.0996,  0.6908,  0.6997,  0.4332],
        [ 0.9625, -0.8417, -0.5027,  0.7079],
        [ 0.2415,  0.0522,  0.6528,  0.5939]], device='cuda:0')
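Calling a module like layer(x) runs its forward method (plus any registered hooks), so the two give the same result here; calling the module directly is the preferred form:

# layer(x) is equivalent to layer.forward(x) plus any hooks;
# always call the module rather than invoking forward directly
torch.allclose(layer(x), layer.forward(x))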

Changing device. Modules can be moved between devices with to(). Unlike the tensor version of to(), this operation modifies the module in place.

layer = nn.Linear(4,5)
print("Before:", layer.weight.device)
layer.to('cpu')
print("After:", layer.weight.device)
Before: cuda:0
After: cpu

All nested modules also move:

model = nn.Sequential(
    nn.Linear(10,10),
    nn.Tanh(),
    nn.Linear(10,3)
)

print("Before:", model[0].weight.device)
model.to('cpu')
print("After:", model[0].weight.device)

# back on gpu for later
model.to('cuda')
Before: cuda:0
After: cpu
Sequential(
  (0): Linear(in_features=10, out_features=10, bias=True)
  (1): Tanh()
  (2): Linear(in_features=10, out_features=3, bias=True)
)
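Modules don’t carry a .device attribute of their own; a common way to check where a model currently lives is to inspect one of its parameters:

# check the device of a model by looking at one of its parameters
next(model.parameters()).device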

Saving/loading. Model weights can be saved to and loaded from disk. There are a few ways to do this. The recommended way is to save only the weights, via the model’s “state dict”:

for k, v in model.state_dict().items():
    print(k, v.shape)
0.weight torch.Size([10, 10])
0.bias torch.Size([10])
2.weight torch.Size([3, 10])
2.bias torch.Size([3])
torch.save(model.state_dict(), 'model_weights.pt')
# PyTorch serializes the weights with pickle (inside a zip archive)
!head -n 3 model_weights.pt
PKmodel_weights/data.pklFZZZZZZZZ�ccollections
OrderedDict
q)Rq(0.weightqctorch._utils
# some time later...
model2 = nn.Sequential(
    nn.Linear(10,10),
    nn.Tanh(),
    nn.Linear(10,3)
)

model2.load_state_dict(torch.load('model_weights.pt'))
<All keys matched successfully>
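If the weights were saved on a GPU but need to be loaded on a CPU-only machine, torch.load takes a map_location argument (a small sketch, not run in this notebook):

# remap storages to the CPU at load time, e.g. on a machine without a GPU
state = torch.load('model_weights.pt', map_location='cpu')
model2.load_state_dict(state)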

Using the state dict required that we instantiate the model first. Alternatively, we can save the entire model, structure included:

torch.save(model, 'model.pt')
model2 = torch.load('model.pt')
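Note that loading a whole pickled model requires the original class definition to be importable, and on recent PyTorch versions torch.load defaults to weights_only=True, which refuses arbitrary pickled objects. In that case you may need something like the following (only for files you trust):

# explicitly allow full unpickling when loading a whole model
model2 = torch.load('model.pt', weights_only=False)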

eval/train modes. Some layers need to behave differently at training time and at evaluation time. These modes are toggled with the train() and eval() methods:

layer = nn.Dropout(0.5)

# the default mode is "training"
x = torch.randn(3, 5)
print(x)
layer(x)
tensor([[ 0.1891,  0.3391,  0.8269,  1.6241,  0.7730],
        [ 0.1412,  0.7913, -0.5472,  0.4971, -0.1926],
        [-0.9877, -0.7166, -0.0323, -0.2046,  1.9819]], device='cuda:0')
tensor([[ 0.3782,  0.0000,  0.0000,  3.2482,  0.0000],
        [ 0.0000,  1.5826, -1.0944,  0.9942, -0.3851],
        [-0.0000, -1.4331, -0.0000, -0.0000,  0.0000]], device='cuda:0')
# switch to eval:
layer.eval()
layer(x)
tensor([[ 0.1891,  0.3391,  0.8269,  1.6241,  0.7730],
        [ 0.1412,  0.7913, -0.5472,  0.4971, -0.1926],
        [-0.9877, -0.7166, -0.0323, -0.2046,  1.9819]], device='cuda:0')
# switch back to train
layer.train()
layer(x)
tensor([[ 0.3782,  0.6783,  1.6537,  3.2482,  1.5461],
        [ 0.2823,  0.0000, -1.0944,  0.0000, -0.3851],
        [-0.0000, -1.4331, -0.0646, -0.4092,  3.9638]], device='cuda:0')
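In a real training script these calls typically bracket the training and validation phases. A minimal sketch, assuming a model, loss_fn, optimizer, and dataloaders that are not defined in this notebook:

# sketch of the usual pattern; train_loader, val_loader, loss_fn,
# and optimizer are placeholders assumed to exist elsewhere
model.train()                    # dropout/batchnorm behave as in training
for x, y in train_loader:
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    optimizer.step()

model.eval()                     # dropout/batchnorm behave as at evaluation
with torch.no_grad():            # no gradients needed for validation
    for x, y in val_loader:
        val_loss = loss_fn(model(x), y)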

Writing custom modules#

You can make your own modules. To do so, subclass nn.Module and define the __init__ and forward methods. Custom modules can then be used just like any built-in module.

class NeuralNetwork(nn.Module):
    def __init__(self):
        """
        The __init__ method defines all of the modules/parameters that will 
        appear in the model.
        """
        super().__init__()
        self.flatten = nn.Flatten()
        self.encoder = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(256,1)
        )

    def forward(self, x):
        """
        Define how to get from the input to the output. 
        You can use arbitrary python code here so long as the 
        tensor operations are differentiable. 
        """
        x = self.flatten(x)
        h = self.encoder(x)
        y = self.classifier(h)
        return y
    
model = NeuralNetwork()
model
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (encoder): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): ReLU()
  )
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=1, bias=True)
  )
)
# simulate a batch of grayscale images:
x = torch.randn(5, 1, 28, 28)

model(x)
tensor([[0.1180],
        [0.0702],
        [0.0694],
        [0.1116],
        [0.1236]], device='cuda:0', grad_fn=<AddmmBackward0>)
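Because NeuralNetwork subclasses nn.Module, it gets all of the machinery described above for free: moving between devices, train/eval modes, and state dicts with nested parameter names:

# the custom model's state dict uses nested names like 'encoder.0.weight'
for k, v in model.state_dict().items():
    print(k, v.shape)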

You can customize your network however you see fit. For example, say we had a problem where the network took two images as input and made some decision about them. We could do something like this:

class PairNetwork(nn.Module):
    def __init__(self):
        """
        The __init__ method defines all of the modules/parameters that will 
        appear in the model.
        """
        super().__init__()
        self.flatten = nn.Flatten()
        self.encoder = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(2*256,1)  # double the representation size
        )

    def forward(self, x1, x2):
        """
        Define how to get from the input to the output. 
        You can use arbitrary python code here so long as the 
        tensor operations are differentiable. 
        """
        x1 = self.flatten(x1)
        h1 = self.encoder(x1)
        
        x2 = self.flatten(x2)
        h2 = self.encoder(x2)
        
        # fuse the representations
        h = torch.cat([h1, h2], dim=-1)
        
        y = self.classifier(h)
        return y
    
pair_model = PairNetwork()
pair_model
PairNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (encoder): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): ReLU()
  )
  (classifier): Sequential(
    (0): Linear(in_features=512, out_features=1, bias=True)
  )
)
# simulate a batch of grayscale images:
x1 = torch.randn(5, 1, 28, 28)
x2 = torch.randn(5, 1, 28, 28)

pair_model(x1, x2)
tensor([[ 0.0418],
        [ 0.0130],
        [-0.0402],
        [ 0.0025],
        [ 0.0232]], device='cuda:0', grad_fn=<AddmmBackward0>)
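Note that both inputs pass through the same self.encoder, so the two branches share weights (a Siamese-style design); only the classifier sees the doubled representation. A quick way to convince yourself is to count parameters, which should exceed the single-input model’s count by only the wider classifier:

# the encoder is shared between branches, so the parameter count is only
# slightly larger than the single-input NeuralNetwork (wider classifier)
sum(p.numel() for p in pair_model.parameters())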

Tracking parameters. PyTorch automatically tracks all of the parameters that appear in your custom model. This is what allows PyTorch to optimize the network during training. It also lets you get diagnostic information, such as the number of parameters in your model:

num_pars = sum([p.numel() for p in model.parameters()])
print("Number of parameters:", num_pars)
Number of parameters: 796161
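parameters() has relatives that can be handy for diagnostics; for example, named_parameters() pairs each tensor with its name, and filtering on requires_grad counts only the trainable weights:

# list parameter names and shapes, and count only the trainable ones
for name, p in model.named_parameters():
    print(name, tuple(p.shape), p.requires_grad)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable)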