6. Dive into Deep Learning (《动手学深度学习》): Convolutional Neural Networks

1 From Fully Connected Layers to Convolutions

Two properties a computer-vision network should have:

  1. Translation invariance: a leaf that falls from the tree to the ground is still a leaf
  2. Locality: one eye and another eye make a pair only when they appear on the same face

To satisfy these two properties, neural networks introduce the convolutional layer.

Review: convolution is defined as follows:

  1. Continuous case: $(f*g)(x)=\int f(z)g(x-z)\,dz$
  2. Discrete case: $(f*g)(i)=\sum_a f(a)g(i-a)$
  3. Two-dimensional tensors: $(f*g)(i,j)=\sum_a\sum_b f(a,b)g(i-a,j-b)$

Note the distinction between convolution and cross-correlation: what deep learning frameworks call "convolution" is actually cross-correlation (the kernel is not flipped), but since the kernel weights are learned, the two yield equivalent models.
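
As a worked instance of what the frameworks actually compute (cross-correlation): sliding $K=\begin{pmatrix}0&1\\2&3\end{pmatrix}$ over $X=\begin{pmatrix}0&1&2\\3&4&5\\6&7&8\end{pmatrix}$ gives $\begin{pmatrix}19&25\\37&43\end{pmatrix}$; the top-left entry, for example, is $0\times0+1\times1+3\times2+4\times3=19$.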

2 Convolutions for Images

Plain convolution

A convolutional layer cross-correlates the input with the kernel weights and adds a scalar bias to produce the output.

# PyTorch
import torch
from torch import nn
from d2l import torch as d2l

def corr2d(X, K):  #@save
    """计算二维互相关运算"""
    h, w = K.shape
    Y = torch.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y

# Custom 2D convolutional layer
class Conv2D(nn.Module):
    def __init__(self, kernel_size):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(kernel_size))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        return corr2d(x, self.weight) + self.bias

# The kernel `K` detects vertical edges
K = torch.tensor([[1.0, -1.0]])
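
A quick usage check, following the book's edge-detection example: build a 6×8 image whose middle four columns are black, then corr2d with K marks the two vertical edges.

X = torch.ones((6, 8))
X[:, 2:6] = 0  # middle four columns black (0), the rest white (1)
Y = corr2d(X, K)
# Each row of Y is [0, 1, 0, 0, 0, -1, 0]:
# 1 at the white-to-black edge, -1 at the black-to-white edge, 0 elsewhere
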
# Tensorflow

import tensorflow as tf
from d2l import tensorflow as d2l

def corr2d(X, K):  #@save
    """计算二维互相关运算"""
    h, w = K.shape
    Y = tf.Variable(tf.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1)))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j].assign(tf.reduce_sum(
                X[i: i + h, j: j + w] * K))
    return Y

# Custom 2D convolutional layer
class Conv2D(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()

    def build(self, kernel_size):
        # Note: Keras calls build() lazily with the input shape on first use
        initializer = tf.random_normal_initializer()
        self.weight = self.add_weight(name='w', shape=kernel_size,
                                      initializer=initializer)
        self.bias = self.add_weight(name='b', shape=(1, ),
                                    initializer=initializer)

    def call(self, inputs):
        return corr2d(inputs, self.weight) + self.bias

# The kernel `K` detects vertical edges
K = tf.constant([[1.0, -1.0]])

The output of a convolutional layer is sometimes called a feature map: it can be viewed as a transformer that maps the input to the spatial dimensions of the next layer.

Receptive field: the region of the input image that a pixel on a layer's output feature map maps back to. For example, after two stacked $3\times3$ convolutions the receptive field is $5\times5$; after three it is $7\times7$.
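
With stride 1 throughout, each additional layer enlarges the receptive field by $k-1$, i.e. $r_L = 1 + L(k-1)$; for $k=3$ this gives $r_2 = 5$ and $r_3 = 7$, matching the example above.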

3 Padding and Stride

Padding adds extra rows and columns (typically zeros) around the image border so the kernel has enough room to slide over boundary pixels.

To keep the output the same size as the input, set total padding = kernel size − 1.

Kernel height and width are usually odd, e.g. 1, 3, 5, or 7:

  • Benefit 1: padding splits symmetrically left/right and top/bottom, with padding=(k-1)/2 per side
  • Benefit 2: an odd-sized kernel has a well-defined center, making it more sensitive to edges and lines

Stride: how far the convolution window slides vertically/horizontally at each step (used for computational efficiency or to downsample).
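
In general, for an $n_h\times n_w$ input, a $k_h\times k_w$ kernel, total padding $(p_h,p_w)$, and stride $(s_h,s_w)$, the output shape is

$\lfloor(n_h-k_h+p_h+s_h)/s_h\rfloor\times\lfloor(n_w-k_w+p_w+s_w)/s_w\rfloor$

For example, the $8\times8$ input used below with kernel_size=3, padding=1 per side ($p_h=p_w=2$ in total), and stride 2 gives $\lfloor(8-3+2+2)/2\rfloor=4$, i.e. a $4\times4$ output.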

# PyTorch
import torch
from torch import nn

# For convenience, we define a helper that computes the convolutional layer's output.
# It initializes the layer's weights and expands/contracts the input and output dims
def comp_conv2d(conv2d, X):
    # Here (1, 1) means batch size and number of channels are both 1
    X = X.reshape((1, 1) + X.shape)
    Y = conv2d(X)
    # Strip the first two dimensions: batch size and channels
    return Y.reshape(Y.shape[2:])

X = torch.rand(size=(8, 8))

# padding 1 per side; stride 2
conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1, stride=2)
comp_conv2d(conv2d, X).shape  # torch.Size([4, 4])

# no vertical padding, horizontal padding 1 per side; stride 3 vertically, 4 horizontally
conv2d = nn.Conv2d(1, 1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))
comp_conv2d(conv2d, X).shape  # torch.Size([2, 2])

# Tensorflow
import tensorflow as tf

# For convenience, we define a helper that computes the convolutional layer's output.
# It initializes the layer's weights and expands/contracts the input and output dims
def comp_conv2d(conv2d, X):
    # The added leading/trailing dims mean batch size and channel count are both 1
    X = tf.reshape(X, (1, ) + X.shape + (1, ))
    Y = conv2d(X)
    # Strip the batch and channel dimensions
    return tf.reshape(Y, Y.shape[1:3])

X = tf.random.uniform(shape=(8, 8))

# 'same' padding: convolution starts with the filter center over the image corner; stride 2
conv2d = tf.keras.layers.Conv2D(1, kernel_size=3, padding='same', strides=2)
comp_conv2d(conv2d, X).shape  # TensorShape([4, 4])

# 'valid' padding: convolve only where the filter lies entirely inside the image;
# stride 3 vertically, 4 horizontally
conv2d = tf.keras.layers.Conv2D(1, kernel_size=(3,5), padding='valid',
                                strides=(3, 4))
comp_conv2d(conv2d, X).shape  # TensorShape([2, 1])

4 Multiple Channels

The two-dimensional tensors analyzed so far can be viewed as grayscale images. A color image carries an extra channel dimension, so the convolution kernel needs a matching adjustment:

With multiple input channels, each channel is cross-correlated with its own kernel slice, and the results are summed (see the sketch below).
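
A minimal sketch of the multi-input-channel case, reusing the corr2d defined in section 2 (this follows the book's corr2d_multi_in):

def corr2d_multi_in(X, K):
    # Cross-correlate each input channel of X with the matching kernel
    # slice of K, then sum the per-channel results
    return sum(corr2d(x, k) for x, k in zip(X, K))

X = torch.tensor([[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]],
                  [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]])
K = torch.tensor([[[0.0, 1.0], [2.0, 3.0]], [[1.0, 2.0], [3.0, 4.0]]])
corr2d_multi_in(X, K)  # tensor([[ 56.,  72.], [104., 120.]])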

As networks grow deeper, the number of output channels typically increases while the spatial resolution is reduced, trading resolution for greater channel depth.

With multiple output channels, different kernels are stacked, each implementing its own feature-extraction logic (see the sketch below).
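
For multiple output channels, each output channel gets its own 3D kernel (following the book's corr2d_multi_in_out):

def corr2d_multi_in_out(X, K):
    # K has shape (c_o, c_i, k_h, k_w); iterate over the output-channel
    # dimension, where each K[i] produces one output channel
    return torch.stack([corr2d_multi_in(X, k) for k in K], 0)

K = torch.stack((K, K + 1, K + 2), 0)  # build 3 output channels from the K above
corr2d_multi_in_out(X, K).shape  # torch.Size([3, 2, 2])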

$1\times1$ convolution kernel

  • No longer extracts correlated features among adjacent pixels
  • Computation happens across channels, so it can be viewed as a fully connected layer applied at every pixel (see the sketch after this list)
  • Commonly used to adjust the number of channels and to control model complexity
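
To see why, flatten the spatial dimensions: the $1\times1$ convolution reduces to a matrix multiplication over channels. A sketch following the book's corr2d_multi_in_out_1x1:

def corr2d_multi_in_out_1x1(X, K):
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))   # each pixel becomes a column of channel values
    K = K.reshape((c_o, c_i))     # the 1x1 kernels form a c_o x c_i weight matrix
    Y = torch.matmul(K, X)        # a fully connected layer applied at every pixel
    return Y.reshape((c_o, h, w))

One can check that its output matches corr2d_multi_in_out whenever the kernel's spatial size is $1\times1$.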

5 Pooling Layers

Pooling (汇聚/池化) layers downsample feature maps: they reduce spatial resolution and make the representation less sensitive to small translations of the input. The two common variants are maximum pooling and average pooling; a sketch follows.
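
A minimal sketch of 2D pooling in the same style as corr2d above (this follows the book's pool2d):

def pool2d(X, pool_size, mode='max'):
    p_h, p_w = pool_size
    Y = torch.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            if mode == 'max':
                Y[i, j] = X[i: i + p_h, j: j + p_w].max()
            elif mode == 'avg':
                Y[i, j] = X[i: i + p_h, j: j + p_w].mean()
    return Y

X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
pool2d(X, (2, 2))  # tensor([[4., 5.], [7., 8.]])

Unlike a convolutional layer, a pooling layer has no learnable parameters.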

6 The LeNet Network

LeNet, one of the earliest published convolutional neural networks, was proposed by Yann LeCun in 1989.

It was built for handwritten-digit recognition and was once widely deployed in ATMs to read the digits on checks.

Implementation (PyTorch):

# PyTorch: implement LeNet and apply it to Fashion-MNIST
import torch
from torch import nn
from d2l import torch as d2l

# Compared with the original, the final Gaussian activation is removed
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120), nn.Sigmoid(),
    nn.Linear(120, 84), nn.Sigmoid(),
    nn.Linear(84, 10))

X = torch.rand(size=(1, 1, 28, 28), dtype=torch.float32)
for layer in net: # print each layer's output shape
    X = layer(X)
    print(layer.__class__.__name__,'output shape: \t',X.shape)

def evaluate_accuracy_gpu(net, data_iter, device=None): #@save
    """使用GPU计算模型在数据集上的精度"""
    if isinstance(net, nn.Module):
        net.eval()  # Set to evaluation mode
        if not device:
            device = next(iter(net.parameters())).device
    # Number of correct predictions, number of predictions
    metric = d2l.Accumulator(2)
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                # Required for BERT fine-tuning (introduced later)
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]

#@save
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """用GPU训练模型(在第六章定义)"""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, number of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

lr, num_epochs = 0.9, 10
train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
# loss 0.476, train acc 0.820, test acc 0.812
# 103100.0 examples/sec on cuda:0

Implementation (TensorFlow):

# TensorFlow: implement LeNet and apply it to Fashion-MNIST
import tensorflow as tf
from d2l import tensorflow as d2l

# Compared with the original, the final Gaussian activation is removed
def net():
    return tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(filters=6, kernel_size=5, activation='sigmoid',
                               padding='same'),
        tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
        tf.keras.layers.Conv2D(filters=16, kernel_size=5,
                               activation='sigmoid'),
        tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(120, activation='sigmoid'),
        tf.keras.layers.Dense(84, activation='sigmoid'),
        tf.keras.layers.Dense(10)])

X = tf.random.uniform((1, 28, 28, 1))
for layer in net().layers:
    X = layer(X) # print each layer's output shape
    print(layer.__class__.__name__, 'output shape: \t', X.shape)

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

class TrainCallback(tf.keras.callbacks.Callback):  #@save
    """一个以可视化的训练进展的回调"""
    def __init__(self, net, train_iter, test_iter, num_epochs, device_name):
        self.timer = d2l.Timer()
        self.animator = d2l.Animator(
            xlabel='epoch', xlim=[1, num_epochs], legend=[
                'train loss', 'train acc', 'test acc'])
        self.net = net
        self.train_iter = train_iter
        self.test_iter = test_iter
        self.num_epochs = num_epochs
        self.device_name = device_name

    def on_epoch_begin(self, epoch, logs=None):
        self.timer.start()

    def on_epoch_end(self, epoch, logs):
        self.timer.stop()
        test_acc = self.net.evaluate(
            self.test_iter, verbose=0, return_dict=True)['accuracy']
        metrics = (logs['loss'], logs['accuracy'], test_acc)
        self.animator.add(epoch + 1, metrics)
        if epoch == self.num_epochs - 1:
            batch_size = next(iter(self.train_iter))[0].shape[0]
            num_examples = batch_size * tf.data.experimental.cardinality(
                self.train_iter).numpy()
            print(f'loss {metrics[0]:.3f}, train acc {metrics[1]:.3f}, '
                  f'test acc {metrics[2]:.3f}')
            print(f'{num_examples / self.timer.avg():.1f} examples/sec on '
                  f'{str(self.device_name)}')

#@save
def train_ch6(net_fn, train_iter, test_iter, num_epochs, lr, device):
    """用GPU训练模型(在第六章定义)"""
    device_name = device._device_name
    strategy = tf.distribute.OneDeviceStrategy(device_name)
    with strategy.scope():
        optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        net = net_fn()
        net.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    callback = TrainCallback(net, train_iter, test_iter, num_epochs,
                             device_name)
    net.fit(train_iter, epochs=num_epochs, verbose=0, callbacks=[callback])
    return net

lr, num_epochs = 0.9, 10
train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
#loss 0.467, train acc 0.824, test acc 0.812
#53671.6 examples/sec on /GPU:0

Results: the final loss and accuracy numbers are recorded in the comments following each train_ch6 call above (training-curve figures omitted).
