PyG (PyTorch Geometric),基于 PyTorch 编写和训练图神经网络 (GNN)
- 集成了各种针对图或其他不规则结构的深度学习方法
- 包括易用小批量加载器,可在许多小型和单个巨型图上运行
- 多 GPU 支持,torch.compile 支持,DataPipe 支持
- 内置大量基准数据集、支持神经网络的设计和评估(GraphGym)
1 快速入门
1.1 图数据的处理
PyG 中的每个图都由一个 Data
对象表示
定义一个简单的未加权无向图:
import torch
from torch_geometric.data import Data
# Each column of edge_index is one directed edge (row 0: source node,
# row 1: target node); the two undirected edges 0-1 and 1-2 are therefore
# listed once per direction, giving four columns.
edge_index = torch.tensor([[0, 1, 1, 2],
[1, 0, 2, 1]], dtype=torch.long)
# Node feature matrix: three nodes with one feature each.
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)
data = Data(x=x, edge_index=edge_index)
# Data(edge_index=[2, 4], x=[3, 1])
data.validate(raise_on_error=True) # Sanity-check the Data object
- edge_index
通常为二维张量,定义了所有边的源节点和目标节点;edge_index
中每一列代表一条边,第一行包含源节点的索引,第二行包含目标节点的索引
- 尽管该图只有两条边,但需要在
edge_index
中定义四列来说明边的两个方向
- $4\times 2$ 的索引元组可以通过转置
.t().contiguous()
转化为 $2\times 4$ 的edge_index
- edge_index
的表示形式也被称为COO
格式(坐标格式),常用于表示稀疏矩阵
- PyG 不区分有向图和无向图,并将无向图视为有向图的特殊情况
Data
对象的常见属性与方法:
# Commonly used attributes and methods of a Data object.
data.num_nodes # number of nodes
data.num_edges # number of edges
data.num_node_features # number of features per node
data.num_edge_features # number of features per edge
data.has_isolated_nodes() # whether the graph contains isolated nodes
data.has_self_loops() # whether the graph contains self-loops
data.is_directed() # whether the edges of the graph are directed
# Move the graph to the GPU when one is available; otherwise stay on the
# CPU so the snippet also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
1.2 常见基准图数据集
PyG 包含大量常见的基准数据集
- 所有 Planetoid 文献引用网络数据集(Cora、Citeseer、Pubmed)
- TUDatasets 中的所有图分类数据集及其清理版本、QM7 和 QM9 数据集
- 少量 3D 网格/点云数据集,如 FAUST、ModelNet10/40 和 ShapeNet
加载 ENZYMES 数据集:
from torch_geometric.datasets import TUDataset
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
# The ENZYMES dataset contains 600 graphs that fall into 6 classes.
len(dataset) # 600
dataset.num_classes # 6
dataset.num_node_features # 3
dataset[0] # Inspect the basic information of the first graph
# Data(edge_index=[2, 168], x=[37, 3], y=[1])
dataset[0].is_undirected() # True
1.3 图数据的批次与并行化
小批量(mini-batch)是深度学习模型实现扩展性和并行化的关键
PyG 通过创建稀疏块对角邻接矩阵(由 edge_index
定义一个包含多个独立子图的巨型图)并在节点维度中连接特征矩阵 $X$ 和目标矩阵 $Y$,实现小批量的并行化。
这种组合允许将不同数量的节点和边划分到一个批次中:
$$
\begin{split}\mathbf{A} = \begin{bmatrix} \mathbf{A}_1 & & \\ & \ddots & \\ & & \mathbf{A}_n \end{bmatrix}, \qquad \mathbf{X} = \begin{bmatrix} \mathbf{X}_1 \\ \vdots \\ \mathbf{X}_n \end{bmatrix}, \qquad \mathbf{Y} = \begin{bmatrix} \mathbf{Y}_1 \\ \vdots \\ \mathbf{Y}_n \end{bmatrix}\end{split}
$$
该过程被封装在 torch_geometric.loader.DataLoader
中,调用方式如下:
from torch_geometric.utils import scatter
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
# Load the dataset and wrap it in a DataLoader with batch_size=32.
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for data_batch in loader:
    data_batch  # DataBatch(batch=[1082], edge_index=[2, 4066], x=[1082, 21], y=[32])
    data_batch.num_graphs  # 32

    # Average the node features of every graph in the batch: the `batch`
    # attribute is the index that groups rows of `x` for the reduction.
    node_to_graph = data_batch.batch
    x = scatter(data_batch.x, node_to_graph, dim=0, reduce='mean')
    x.size()  # torch.Size([32, 21])
data_batch.batch
是一个列向量,将当前批次中的每个节点映射到其所属图的编号
$$ \mathrm{batch} = {\begin{bmatrix} 0 & \cdots & 0 & 1 & \cdots & n - 2 & n -1 & \cdots & n - 1 \end{bmatrix}}^{\top} $$
- 邻接矩阵以稀疏方式保存非零条目(有效边),因此没有额外的内存开销
1.4 图数据的转换和增强
将点云数据集转换为图数据集:
import torch_geometric.transforms as T  # provides T.KNNGraph / T.RandomJitter used below
from torch_geometric.datasets import ShapeNet

# 1. Plain loading: the samples carry point positions and labels only.
dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'])
dataset[0]  # Data(pos=[2518, 3], y=[2518])

# 2. Nearest-neighbour conversion: KNNGraph(k=6) is passed as pre_transform,
#    after which the sample also has an edge_index.
dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'],
                   pre_transform=T.KNNGraph(k=6))
dataset[0]  # Data(edge_index=[2, 15108], pos=[2518, 3], y=[2518])

# 3. Nearest-neighbour conversion plus noise augmentation via `transform`.
dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'],
                   pre_transform=T.KNNGraph(k=6),
                   transform=T.RandomJitter(0.01))
dataset[0]  # Data(edge_index=[2, 15108], pos=[2518, 3], y=[2518])
- ShapeNet 点云数据集,包含约 17,000 个 3D 形状点云,涵盖 16 个形状类别
- pre_transform
在将数据保存到磁盘之前对其进行转换(从而加快加载时间)
- 使用
transform
参数对Data
对象添加噪声(对节点的 3D 位置进行小幅随机平移)
2 基础案例
前置知识:GCN_基于图卷积网络的半监督学习
2.1 节点嵌入可视化
数据集:Zachary 空手道俱乐部网络
- 描述了空手道俱乐部 34 名成员的社交网络,包含成员间的互动链接
- 最终目的是线性分离社区并正确分类大多数节点,同时进行可视化
- 环境准备与绘图辅助函数定义
import os
import torch
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
# 辅助函数:图可视化
def visualize_graph(G, color):
    """Draw graph ``G`` on a 7x7 figure with nodes colored by ``color``.

    Args:
        G: Graph passed to ``nx.spring_layout`` and ``nx.draw_networkx``.
        color: Per-node color values, mapped through the "Set2" colormap.
    """
    plt.figure(figsize=(7, 7))
    # Hide the tick marks on both axes.
    for hide_ticks in (plt.xticks, plt.yticks):
        hide_ticks([])
    # The spring layout is computed with a fixed seed (42).
    layout = nx.spring_layout(G, seed=42)
    nx.draw_networkx(
        G,
        pos=layout,
        with_labels=False,
        node_color=color,
        cmap="Set2",
    )
    plt.show()
# 辅助函数:节点嵌入可视化
def visualize_embedding(h, color, epoch=None, loss=None):
    """Scatter-plot the first two columns of the embedding ``h``.

    Args:
        h: Tensor of embeddings; it is detached, moved to the CPU and
            converted to NumPy before plotting.
        color: Per-point color values, mapped through the "Set2" colormap.
        epoch: Optional epoch number shown in the x-axis label.
        loss: Optional loss object; ``loss.item()`` is shown in the label.

    The x-axis label is only drawn when both ``epoch`` and ``loss`` are given.
    """
    plt.figure(figsize=(7, 7))
    plt.xticks([])
    plt.yticks([])
    points = h.detach().cpu().numpy()
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=140, c=color, cmap="Set2")
    has_caption = not (epoch is None or loss is None)
    if has_caption:
        caption = f'Epoch: {epoch}, Loss: {loss.item():.4f}'
        plt.xlabel(caption, fontsize=16)
    plt.show()
- 数据加载与初步探索
from torch_geometric.datasets import KarateClub
from torch_geometric.utils import to_networkx
# Load the KarateClub dataset and print its basic statistics.
dataset = KarateClub()
print(f'Number of graphs: {len(dataset)}') # 1
print(f'Number of features: {dataset.num_features}') # 34
print(f'Number of classes: {dataset.num_classes}') # 4
# Every node has 34 features and is assigned to one of 4 communities.
data = dataset[0]
print(data)
# Data(edge_index=[2, 156], x=[34, 34], y=[34], train_mask=[34])
# The `data` object holds 4 attributes:
# (1) `edge_index` holds the information about the graph connectivity
# (2) the node features are called `x` (34 nodes, each with a 34-dim feature vector)
# (3) the node labels are called `y` (each node is assigned to exactly one class)
# (4) the additional attribute `train_mask` marks the nodes whose community assignment is known
print(f'Number of nodes: {data.num_nodes}') # 34
print(f'Number of edges: {data.num_edges}') # 156
print(f'Average node degree: {(data.num_edges) / data.num_nodes:.2f}') # 4.59
print(f'Number of training nodes: {data.train_mask.sum()}') # 4
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}') # 0.12
print(f'Contains isolated nodes: {data.has_isolated_nodes()}') # False
print(f'Contains self-loops: {data.has_self_loops()}') # False
print(f'Is undirected: {data.is_undirected()}') # True
# Visualize the graph, coloring the nodes by their label.
G = to_networkx(data, to_undirected=True)
visualize_graph(G, color=data.y)
- GCN 建模与嵌入表示可视化
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv
import time
from IPython.display import Javascript # Restrict height of output cell.
# Runs a JavaScript snippet that calls google.colab.output.setIframeHeight
# with maxHeight 430 -- presumably only effective inside Google Colab.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 430})'''))
class GCN(torch.nn.Module):
    """Three GCNConv layers with tanh activations plus a linear classifier.

    Layer widths are ``dataset.num_features -> 4 -> 4 -> 2``; the classifier
    maps the 2-d embedding to ``dataset.num_classes`` outputs. The sizes are
    read from the module-level ``dataset``.
    """

    def __init__(self):
        super().__init__()
        # Seed is set before the layers are created.
        torch.manual_seed(1234)
        in_dim, out_dim = dataset.num_features, dataset.num_classes
        self.conv1 = GCNConv(in_dim, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        self.classifier = Linear(2, out_dim)

    def forward(self, x, edge_index):
        """Return ``(out, h)``: classifier output and final GNN embedding."""
        h = x
        # Each convolution is followed by tanh; the output of the last one
        # is the final GNN embedding space.
        for conv in (self.conv1, self.conv2, self.conv3):
            h = conv(h, edge_index).tanh()
        # Apply the final (linear) classifier on top of the embedding.
        return self.classifier(h), h
# Module-level model, loss and optimizer.
model = GCN()
criterion = torch.nn.CrossEntropyLoss() # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Define optimizer.
def train(data):
    """Run one optimisation step and return ``(loss, h)``.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``. The
    loss is computed only on the nodes selected by ``data.train_mask``.
    """
    optimizer.zero_grad()
    logits, embedding = model(data.x, data.edge_index)
    mask = data.train_mask
    loss = criterion(logits[mask], data.y[mask])
    loss.backward()
    optimizer.step()
    return loss, embedding
for epoch in range(401):
    loss, h = train(data)
    if epoch % 10 != 0:
        continue
    # Every 10th epoch: plot the current node embedding, then pause briefly.
    visualize_embedding(h, color=data.y, epoch=epoch, loss=loss)
    time.sleep(0.3)
2.2 简单节点分类任务
数据集:Cora 引文数据集
- 共2708个样本点,每个样本点都是一篇科学论文
- 论文分为7类(案例/遗传算法/神经网络/概率方法/强化学习/规则学习/理论)
- 每篇论文都由一个1433维的词向量表示,即每个样本点具有1433个特征
- 每篇论文都至少引用了一篇其他论文,或者被引用,即不存在孤立节点
- GCN 模型的定义:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
    """Two-layer GCN that outputs per-node log-softmax scores.

    Layer widths are ``dataset.num_node_features -> 16 -> dataset.num_classes``,
    read from the module-level ``dataset``.
    """

    def __init__(self):
        super().__init__()
        hidden_dim = 16
        self.conv1 = GCNConv(dataset.num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, dataset.num_classes)

    def forward(self, data):
        """Return log-softmax (over dim 1) of the second convolution's output."""
        edges = data.edge_index
        hidden = F.relu(self.conv1(data.x, edges))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edges)
        return F.log_softmax(logits, dim=1)
- 定义了两个
GCNConv
层,它们在网络的前向传播中被调用
- 非线性激活函数为 ReLU,最终输出各类别上的 log-softmax 分布(与 NLL 损失配合),用于节点分类任务
- 数据加载和模型训练:
from torch_geometric.datasets import Planetoid
dataset = Planetoid(root='/tmp/Cora', name='Cora')

# Configuration: pick the device, then move model and graph onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training: 200 epochs, loss computed on the training nodes only.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    train_out, train_y = out[data.train_mask], data.y[data.train_mask]
    loss = F.nll_loss(train_out, train_y)
    loss.backward()
    optimizer.step()
- 模型评估
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def visualize(h, color):
    """Project ``h`` to 2-D with t-SNE and scatter-plot the result.

    Args:
        h: Tensor; detached, moved to the CPU and converted to NumPy before
            being handed to ``TSNE(n_components=2).fit_transform``.
        color: Per-point color values, mapped through the "Set2" colormap.
    """
    features = h.detach().cpu().numpy()
    projected = TSNE(n_components=2).fit_transform(features)
    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])
    xs, ys = projected[:, 0], projected[:, 1]
    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()
model.eval()
# Keep the full per-node model output: t-SNE needs a 2-D feature matrix,
# whereas the argmax prediction below is only a 1-D vector of class indices.
out = model(data)
pred = out.argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}') # Accuracy: 0.8150
# t-SNE visualization of the model output; the labels are moved to the CPU
# because `data` may live on the GPU and matplotlib needs host data.
visualize(out, color=data.y.cpu())
2.3 简单的图分类任务
数据集:TUDatasets - MUTAG 数据集
- 数据集提供了 188 个不同的图,任务是将每个图进行二分类预测
- 使用前 150 个图作为训练图,同时使用剩余的图进行测试
- 数据加载与预处理
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
dataset = TUDataset(root='data/TUDataset', name='MUTAG')
# Seed before shuffling, then split: first 150 graphs for training, the
# remaining graphs for testing.
torch.manual_seed(12345)
dataset = dataset.shuffle()
train_dataset = dataset[:150]
test_dataset = dataset[150:]
# Build mini-batch loaders over the graphs.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
- 模型定义与训练
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
class GCN(torch.nn.Module):
    """Graph classifier: three GCNConv layers, mean-pool readout, linear head.

    Input and output sizes are read from the module-level ``dataset``.

    Args:
        hidden_channels: Width of all three convolution layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Seed is set before the layers are created.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        """Return the output of the final linear layer for each graph."""
        # 1. Node embeddings: ReLU after the first two convolutions only.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)

        # 2. Readout: the mean of all node embeddings is the graph embedding.
        graph_emb = global_mean_pool(h, batch)  # [batch_size, hidden_channels]

        # 3. Final classifier, with dropout active in training mode only.
        graph_emb = F.dropout(graph_emb, p=0.5, training=self.training)
        return self.lin(graph_emb)
# Module-level model, optimizer and loss.
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
def train():
    """Train the module-level ``model`` for one pass over ``train_loader``."""
    model.train()
    for batch in train_loader:
        logits = model(batch.x, batch.edge_index, batch.batch)
        criterion(logits, batch.y).backward()
        optimizer.step()
        # Gradients are cleared after the step, ready for the next batch.
        optimizer.zero_grad()
def test(loader):
    """Return the fraction of correctly classified graphs in ``loader``.

    Args:
        loader: Iterable of batches; ``loader.dataset`` supplies the total
            number of graphs used as the denominator.
    """
    model.eval()
    num_correct = 0
    for batch in loader:
        # The predicted class is the index of the largest output.
        pred = model(batch.x, batch.edge_index, batch.batch).argmax(dim=1)
        num_correct += int((pred == batch.y).sum())
    return num_correct / len(loader.dataset)
# Train for 170 epochs, reporting train and test accuracy after each one.
for epoch in range(1, 171):
    train()
    train_acc, test_acc = test(train_loader), test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
# Epoch: 170, Train Acc: 0.8000, Test Acc: 0.7632
2.4 缩放图神经网络
大规模图存在的内存消耗问题:
- 此前图神经网络均以全批量(full-batch)方式训练:所有节点的隐藏表示被并行计算,并可直接传递到下一层复用
- 随着图的尺寸增加,该方案的内存消耗会呈爆炸式增长(对于一个 1000 万节点且隐藏特征维度为 128 的图,每层需要消耗约 5GB 的 GPU 显存)
- Cluster-GCN 方法将图预先划分为可小批量操作的子图,以适合大规模图
案例实践:针对大规模图进行 GCN 模型训练
- 数据加载和小批量样本的制作
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.loader import ClusterData, ClusterLoader
dataset = Planetoid(root='data/Planetoid', name='PubMed', transform=NormalizeFeatures())
# Graph to partition. Without this assignment `data` would still refer to
# whatever graph an earlier section left behind.
data = dataset[0]
# The PubMed dataset has roughly 19717 nodes and 88648 edges.
torch.manual_seed(12345)
# PyTorch Geometric provides a two-stage implementation of Cluster-GCN:
# 1. `ClusterData` converts a `Data` object into a dataset of subgraphs
#    with `num_parts` partitions.
# 2. Given a `batch_size`, `ClusterLoader` implements the stochastic
#    partitioning scheme that creates the mini-batches.
cluster_data = ClusterData(data, num_parts=128)
train_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True)
- 模型结构的定义与训练
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from IPython.display import Javascript
# Runs a JavaScript snippet that calls google.colab.output.setIframeHeight
# with maxHeight 300 -- presumably only effective inside Google Colab.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))
class GCN(torch.nn.Module):
    """Two-layer GCN for node classification, returning raw class scores.

    Input and output sizes are read from the module-level ``dataset``.

    Args:
        hidden_channels: Width of the hidden layer.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Seed is set before the layers are created.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        """Return the output of the second convolution (no softmax applied)."""
        hidden = self.conv1(x, edge_index).relu()
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)
# Module-level model, optimizer and loss.
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
def train():
    """Train the module-level ``model`` for one pass over ``train_loader``.

    One optimisation step is taken per mini-batch; the loss only uses the
    nodes selected by that mini-batch's ``train_mask``.
    """
    model.train()
    for sub_data in train_loader:
        logits = model(sub_data.x, sub_data.edge_index)
        mask = sub_data.train_mask
        criterion(logits[mask], sub_data.y[mask]).backward()
        optimizer.step()
        # Gradients are cleared after the step, ready for the next batch.
        optimizer.zero_grad()
def test():
    """Return ``[train_acc, val_acc, test_acc]`` on the module-level ``data``.

    The model is run once on ``data``; each accuracy is the share of
    correct predictions among the nodes selected by the respective mask.
    """
    model.eval()
    # The predicted class is the index of the largest output.
    pred = model(data.x, data.edge_index).argmax(dim=1)
    masks = (data.train_mask, data.val_mask, data.test_mask)
    return [
        int((pred[mask] == data.y[mask]).sum()) / int(mask.sum())
        for mask in masks
    ]
# Train for 50 epochs and report the three accuracies after each one.
for epoch in range(1, 51):
    # train() returns nothing, so its result is not bound to a name.
    train()
    train_acc, val_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Train: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')
# Epoch: 050, Train: 0.9833, Val Acc: 0.8120, Test Acc: 0.7870
2.5 更多示例与技巧
[Node Classification Instrumented with Weights&Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/pyg/8_Node_Classification_%28with_W&B%29.ipynb)