How to Convert TensorFlow 1.x Code to PyTorch (Using the Graph Attention Network (GAT) as an Example)
In an earlier post I walked through the official TensorFlow implementation of the Graph Attention Network. Since I am more familiar with PyTorch, I decided to rewrite it as a PyTorch version.
If you are not yet familiar with the Graph Attention Network, you may want to read the TensorFlow version first; the earlier walkthrough is here:
The converted code has been uploaded to GitHub at:
The official Graph Attention Network code uses TensorFlow 1.x and is available at:
Now let's get into the main content.
1. The typical TensorFlow 1.x modeling workflow:
Prepare the training data
Define the computation graph (including placeholders)
Define the main training function, the loss computation, and the optimizer
Define a Session; parameter initialization and the actual forward and backward passes all happen inside the Session (a minimal skeleton follows this list)
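To make these four steps concrete, here is a minimal TensorFlow 1.x sketch of the workflow (the toy regression data and variable names are illustrative and not part of the GAT code):

# A minimal TensorFlow 1.x skeleton of the four steps above; the toy regression
# data and variable names are made up for illustration, not part of the GAT code.
import numpy as np
import tensorflow as tf

x_data = np.random.rand(100, 8).astype(np.float32)     # 1. training data
y_data = np.random.rand(100, 1).astype(np.float32)

x_ph = tf.placeholder(tf.float32, shape=(None, 8))     # 2. computation graph with placeholders
y_ph = tf.placeholder(tf.float32, shape=(None, 1))
pred = tf.layers.dense(x_ph, 1)

loss = tf.reduce_mean(tf.square(pred - y_ph))          # 3. loss and optimizer
train_op = tf.train.AdamOptimizer(0.01).minimize(loss)

with tf.Session() as sess:                              # 4. initialization and the actual
    sess.run(tf.global_variables_initializer())         #    forward/backward passes run here
    _, loss_val = sess.run([train_op, loss],
                           feed_dict={x_ph: x_data, y_ph: y_data})
    print(loss_val)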
2. Converting the TensorFlow code to PyTorch
The rest of the data-processing code is identical; the main places that need to change are the following:
2.1 Loading the data
In TensorFlow the labels have to be one-hot encoded, whereas in PyTorch they do not. In load_data:
def load_data(dataset_str): # {'pubmed', 'citeseer', 'cora'}
    """Load data."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    # PyTorch labels do not need one-hot encoding
    my_labels = np.where(labels == 1)[1]
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_my_labels_mask = sample_mask(idx_train, my_labels.shape[0])
    val_my_labels_mask = sample_mask(idx_val, my_labels.shape[0])
    test_my_labels_mask = sample_mask(idx_test, my_labels.shape[0])
    train_my_labels = my_labels[train_my_labels_mask]
    val_my_labels = my_labels[val_my_labels_mask]
    test_my_labels = my_labels[test_my_labels_mask]

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    print(adj.shape)
    print(features.shape)

    data_dict = {
        'adj': adj,
        'features': features,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'train_mask': train_mask,
        'val_mask': val_mask,
        'test_mask': test_mask,
        'train_my_labels': train_my_labels,
        'val_my_labels': val_my_labels,
        'test_my_labels': test_my_labels,
        'my_labels': my_labels
    }
    return data_dict
We use np.where() to pull out, for each one-hot row, the index of the value 1 (that is, the class label), and then split these indices into training, validation, and test labels.
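As a tiny illustration (the 3-class one-hot rows below are made up), np.where gives exactly the integer class indices that PyTorch losses such as nn.CrossEntropyLoss expect:

import numpy as np

labels = np.array([[1, 0, 0],
                   [0, 0, 1],
                   [0, 1, 0]])        # one-hot rows, as used by the TensorFlow code
class_idx = np.where(labels == 1)[1]  # column index of the 1 in each row
print(class_idx)                      # [0 2 1] -- integer labels for PyTorch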
As an aside, when a function has many values to return, it is cleaner to pack them into a dictionary and return that dictionary; this is also more in keeping with common Python style.
2.2 Building the attention layer
In TensorFlow:
import tensorflow as tf

conv1d = tf.layers.conv1d

def attn_head(seq, out_sz, bias_mat, activation, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('my_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)

        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        vals = tf.matmul(coefs, seq_fts)
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + conv1d(seq, ret.shape[-1], 1)  # activation
            else:
                ret = ret + seq

        return activation(ret)  # activation
In TensorFlow you can simply call the relevant APIs to do the computation, but in PyTorch, whether you are defining your own layer or a whole model, you generally have to build it first and then use it. The rewritten code is as follows:
import torch
import torch.nn as nn


class Attn_head(nn.Module):
    def __init__(self,
                 in_channel,
                 out_sz,
                 bias_mat,
                 in_drop=0.0,
                 coef_drop=0.0,
                 activation=None,
                 residual=False):
        super(Attn_head, self).__init__()
        self.in_channel = in_channel
        self.out_sz = out_sz
        self.bias_mat = bias_mat
        self.in_drop = in_drop
        self.coef_drop = coef_drop
        self.activation = activation
        self.residual = residual
        # 1x1 convolutions play the role of tf.layers.conv1d in the original code
        self.conv1 = nn.Conv1d(self.in_channel, self.out_sz, 1)
        self.conv2_1 = nn.Conv1d(self.out_sz, 1, 1)
        self.conv2_2 = nn.Conv1d(self.out_sz, 1, 1)
        self.res_conv = nn.Conv1d(self.in_channel, self.out_sz, 1)
        self.leakyrelu = nn.LeakyReLU()
        self.softmax = nn.Softmax(dim=1)
        # in PyTorch, Dropout's parameter p is the probability that a unit is zeroed out
        self.in_dropout = nn.Dropout()
        self.coef_dropout = nn.Dropout()

    def forward(self, x):
        seq = x
        if self.in_drop != 0.0:
            seq = self.in_dropout(x)
        seq_fts = self.conv1(seq)
        f_1 = self.conv2_1(seq_fts)
        f_2 = self.conv2_2(seq_fts)
        logits = f_1 + torch.transpose(f_2, 2, 1)
        logits = self.leakyrelu(logits)
        coefs = self.softmax(logits + self.bias_mat)
        if self.coef_drop != 0.0:
            coefs = self.coef_dropout(coefs)
        if self.in_drop != 0.0:
            seq_fts = self.in_dropout(seq_fts)
        ret = torch.matmul(coefs, torch.transpose(seq_fts, 2, 1))
        ret = torch.transpose(ret, 2, 1)
        if self.residual:
            if seq.shape[1] != ret.shape[1]:
                ret = ret + self.res_conv(seq)
            else:
                ret = ret + seq
        return self.activation(ret)
The class must inherit from nn.Module; the relevant parameters and layers are initialized in __init__, and the forward computation is carried out in forward.
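As a quick sanity check of the layer above, here is a small usage sketch; the node count, channel sizes, and all-zero bias matrix are toy values chosen for illustration (nn.Conv1d expects inputs shaped (batch, channels, nodes)):

# Toy shape check for Attn_head; all sizes below are made up for illustration.
import torch
import torch.nn as nn

nb_nodes, in_channel, out_sz = 5, 16, 8
x = torch.rand(1, in_channel, nb_nodes)        # (batch, channels, nodes)
bias_mat = torch.zeros(1, nb_nodes, nb_nodes)  # 0 keeps an edge; a large negative value masks it

layer = Attn_head(in_channel=in_channel, out_sz=out_sz, bias_mat=bias_mat,
                  in_drop=0.6, coef_drop=0.6, activation=nn.ELU(), residual=True)
out = layer(x)
print(out.shape)                               # torch.Size([1, 8, 5])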
2.3 Building the model
With the attention layer in place, we can now build the model. The TensorFlow code:
def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop,
              bias_mat, hid_units, n_heads, activation=tf.nn.elu, residual=False):
    attns = []
    for _ in range(n_heads[0]):
        attns.append(layers.attn_head(inputs, bias_mat=bias_mat,
                                      out_sz=hid_units[0], activation=activation,
                                      in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
    h_1 = tf.concat(attns, axis=-1)
    for i in range(1, len(hid_units)):
        h_old = h_1
        attns = []
        for _ in range(n_heads[i]):
            attns.append(layers.attn_head(h_1, bias_mat=bias_mat,
                                          out_sz=hid_units[i], activation=activation,
                                          in_drop=ffd_drop, coef_drop=attn_drop, residual=residual))
        h_1 = tf.concat(attns, axis=-1)
    out = []
    for i in range(n_heads[-1]):
        out.append(layers.attn_head(h_1, bias_mat=bias_mat,
                                    out_sz=nb_classes, activation=lambda x: x,
                                    in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
    logits = tf.add_n(out) / n_heads[-1]

    return logits
The rewritten PyTorch code:
import numpy as np
import torch
import torch.nn as nn
from layer import *


class GAT(nn.Module):
    def __init__(self,
                 nb_classes,
                 nb_nodes,
                 attn_drop,
                 ffd_drop,
                 bias_mat,
                 hid_units,
                 n_heads,
                 residual=False):
        super(GAT, self).__init__()
        self.nb_classes = nb_classes
        self.nb_nodes = nb_nodes
        self.attn_drop = attn_drop
        self.ffd_drop = ffd_drop
        self.bias_mat = bias_mat
        self.hid_units = hid_units
        self.n_heads = n_heads
        self.residual = residual
        # Cora input features have 1433 dimensions; the hidden representation concatenates
        # n_heads[0] heads of hid_units[0] units each, which must equal the 64 channels
        # hard-coded as attn2's input below.
        self.attn1 = Attn_head(in_channel=1433, out_sz=self.hid_units[0],
                               bias_mat=self.bias_mat, in_drop=self.ffd_drop,
                               coef_drop=self.attn_drop, activation=nn.ELU(),
                               residual=self.residual)
        self.attn2 = Attn_head(in_channel=64, out_sz=self.nb_classes,
                               bias_mat=self.bias_mat, in_drop=self.ffd_drop,
                               coef_drop=self.attn_drop, activation=nn.ELU(),
                               residual=self.residual)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        attns = []
        for _ in range(self.n_heads[0]):
            attns.append(self.attn1(x))
        h_1 = torch.cat(attns, dim=1)
        out = self.attn2(h_1)
        logits = torch.transpose(out.view(self.nb_classes, -1), 1, 0)
        logits = self.softmax(logits)
        return logits
Unlike the TensorFlow code, here we define only two attention layers. Also note that when we define the layers in __init__, the input and output dimensions must be known in advance and filled in; if the actual sizes seen in forward do not match the predefined dimensions, an error is raised.
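To make the dimension bookkeeping concrete, here is a sketch of instantiating the model for Cora-sized data (1433 input features, 7 classes, 2708 nodes). The hyper-parameter values below (8 hidden units, 8 + 1 heads, dropout of 0.6) are the usual GAT settings and are assumptions for illustration, not values stated above:

# Illustrative instantiation for Cora; hid_units[0] * n_heads[0] = 8 * 8 = 64
# must match the in_channel=64 hard-coded for attn2 in the class above.
import torch
import torch.nn as nn

nb_nodes, ft_size, nb_classes = 2708, 1433, 7
hid_units, n_heads = [8], [8, 1]

bias_mat = torch.zeros(1, nb_nodes, nb_nodes)   # all-zero mask just for the shape check
model = GAT(nb_classes=nb_classes, nb_nodes=nb_nodes, attn_drop=0.6, ffd_drop=0.6,
            bias_mat=bias_mat, hid_units=hid_units, n_heads=n_heads)

x = torch.rand(1, ft_size, nb_nodes)            # features laid out as (batch, channels, nodes)
logits = model(x)
print(logits.shape)                             # torch.Size([2708, 7])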
2.4 Training, validation, and testing
First, let's see how this is defined in TensorFlow:
with tf.Graph().as_default():
    with tf.name_scope('input'):
        ftr_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, ft_size))
        bias_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, nb_nodes))
        lbl_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes, nb_classes))
        msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes))
        attn_drop = tf.placeholder(dtype=tf.float32, shape=())
        ffd_drop = tf.placeholder(dtype=tf.float32, shape=())
        is_train = tf.placeholder(dtype=tf.bool, shape=())

    logits = model.inference(ftr_in, nb_classes, nb_nodes, is_train,
                             attn_drop, ffd_drop,
                             bias_mat=bias_in,
                             hid_units=hid_units, n_heads=n_heads,
                             residual=residual, activation=nonlinearity)
    log_resh = tf.reshape(logits, [-1, nb_classes])
    lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
    msk_resh = tf.reshape(msk_in, [-1])
    loss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
    accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)
    train_op = model.training(loss, lr, l2_coef)
    saver = tf.train.Saver()

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    vlss_mn = np.inf
    vacc_mx = 0.0
    curr_step = 0

    with tf.Session() as sess:
        sess.run(init_op)

        train_loss_avg = 0
        train_acc_avg = 0
        val_loss_avg = 0
        val_acc_avg = 0

        for epoch in range(nb_epochs):
            print("epoch: ", epoch)
            tr_step = 0
            tr_size = features.shape[0]

            while tr_step * batch_size < tr_size:
                _, loss_value_tr, acc_tr = sess.run([train_op, loss, accuracy],
                    feed_dict={
                        ftr_in: features[tr_step*batch_size:(tr_step+1)*batch_size],
                        bias_in: biases[tr_step*batch_size:(tr_step+1)*batch_size],
                        lbl_in: y_train[tr_step*batch_size:(tr_step+1)*batch_size],
                        msk_in: train_mask[tr_step*batch_size:(tr_step+1)*batch_size],
                        is_train: True,
                        attn_drop: 0.6, ffd_drop: 0.6})
                train_loss_avg += loss_value_tr
                train_acc_avg += acc_tr
                tr_step += 1

            vl_step = 0
            vl_size = features.shape[0]

            while vl_step * batch_size < vl_size:
                loss_value_vl, acc_vl = sess.run([loss, accuracy],
                    feed_dict={
                        ftr_in: features[vl_step*batch_size:(vl_step+1)*batch_size],
                        bias_in: biases[vl_step*batch_size:(vl_step+1)*batch_size],
                        lbl_in: y_val[vl_step*batch_size:(vl_step+1)*batch_size],
                        msk_in: val_mask[vl_step*batch_size:(vl_step+1)*batch_size],
                        is_train: False,
                        attn_drop: 0.0, ffd_drop: 0.0})
                val_loss_avg += loss_value_vl
                val_acc_avg += acc_vl
                vl_step += 1

            print('Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f' %
                  (train_loss_avg/tr_step, train_acc_avg/tr_step,
                   val_loss_avg/vl_step, val_acc_avg/vl_step))

            if val_acc_avg/vl_step >= vacc_mx or val_loss_avg/vl_step <= vlss_mn:
                if val_acc_avg/vl_step >= vacc_mx and val_loss_avg/vl_step <= vlss_mn:
                    vacc_early_model = val_acc_avg/vl_step
                    vlss_early_model = val_loss_avg/vl_step
                    saver.save(sess, checkpt_file)
                vacc_mx = np.max((val_acc_avg/vl_step, vacc_mx))
                vlss_mn = np.min((val_loss_avg/vl_step, vlss_mn))
                curr_step = 0
            else:
                curr_step += 1
                if curr_step == patience:
                    print('Early stop! Min loss: ', vlss_mn, ', Max accuracy: ', vacc_mx)
                    print('Early stop model validation loss: ', vlss_early_model, ', accuracy: ', vacc_early_model)
                    break

            train_loss_avg = 0
            train_acc_avg = 0
            val_loss_avg = 0
            val_acc_avg = 0

        ts_size = features.shape[0]
        ts_step = 0
        ts_loss = 0.0
        ts_acc = 0.0

        while ts_step * batch_size < ts_size:
            loss_value_ts, acc_ts = sess.run([loss, accuracy],
                feed_dict={
                    ftr_in: features[ts_step*batch_size:(ts_step+1)*batch_size],
                    bias_in: biases[ts_step*batch_size:(ts_step+1)*batch_size],
                    lbl_in: y_test[ts_step*batch_size:(ts_step+1)*batch_size],
                    msk_in: test_mask[ts_step*batch_size:(ts_step+1)*batch_size],
                    is_train: False,
                    attn_drop: 0.0, ffd_drop: 0.0})
            ts_loss += loss_value_ts
            ts_acc += acc_ts
            ts_step += 1

        print('Test loss:', ts_loss/ts_step, '; Test accuracy:', ts_acc/ts_step)

        sess.close()
In short: build the graph, then run everything inside a Session.
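The PyTorch side of training is not shown above, so purely as a rough, illustrative counterpart, a training and validation loop for the converted model might look like the sketch below. It assumes the GAT module and the load_data dictionary from the earlier sections; the placeholder bias_mat, the optimizer settings (Adam, learning rate 0.005, weight decay 5e-4), and the epoch count are common GAT defaults chosen here for illustration, not taken from this post.

# Illustrative PyTorch training loop; assumes GAT and load_data defined above.
import numpy as np
import torch
import torch.nn as nn

data = load_data('cora')
features = torch.FloatTensor(np.array(data['features'].todense())).t().unsqueeze(0)  # (1, ft_size, nb_nodes)
nb_nodes = features.shape[2]
# In practice bias_mat would be built from data['adj'] (large negative values where there is no edge);
# an all-zero tensor is used here only so the sketch is self-contained.
bias_mat = torch.zeros(1, nb_nodes, nb_nodes)

train_mask = torch.BoolTensor(data['train_mask'])
val_mask = torch.BoolTensor(data['val_mask'])
train_labels = torch.LongTensor(data['train_my_labels'])
val_labels = torch.LongTensor(data['val_my_labels'])

model = GAT(nb_classes=7, nb_nodes=nb_nodes, attn_drop=0.6, ffd_drop=0.6,
            bias_mat=bias_mat, hid_units=[8], n_heads=[8, 1])
# Note: the GAT above already ends in a softmax, while CrossEntropyLoss applies
# log-softmax itself; for a cleaner loss you may prefer to return raw scores from forward.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    logits = model(features)                           # (nb_nodes, nb_classes)
    loss = criterion(logits[train_mask], train_labels) # only training nodes contribute
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        logits = model(features)
        val_loss = criterion(logits[val_mask], val_labels)
        val_acc = (logits[val_mask].argmax(dim=1) == val_labels).float().mean()
    print('epoch %d | train loss %.4f | val loss %.4f | val acc %.4f'
          % (epoch, loss.item(), val_loss.item(), val_acc.item()))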