Seq2Seq Code Implementation


In this chapter, we will use an RNN to build a seq2seq (sequence-to-sequence) model that translates English into Chinese. The dataset is the English-Chinese dataset made up of bilingual sentence pairs from the Tatoeba project.

Data Preprocessing

The raw corpus needs preprocessing, so we first import the necessary packages and modules. Note that after installing nltk for the first time, tokenization depends on the punkt resource, which therefore needs to be downloaded.

import os
import jieba
import re
import random
from opencc import OpenCC
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaxim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

True

Tokenization

This part tokenizes the data. Two lists, en and cn, hold the source language (English) and the target language (Chinese) respectively; entries at the same index are an English sentence and its Chinese translation.

cc = OpenCC('t2s')   # t2s: Traditional to Simplified; s2t: Simplified to Traditional
en, cn = [], []
data_dir = './data/cmn-eng/cmn.txt'
with open(data_dir, 'r', encoding='utf-8') as f:
    for line in f.readlines():
        sentence = re.split('\t', line)   # sentence[0] is the English sentence, sentence[1] is the Chinese sentence
        sentence = list(filter(None, sentence))
        en_sentence = ''
        for word in word_tokenize(sentence[0]):
            en_sentence += word.lower() + ' '
        en.append(en_sentence)
        
        cn_sentence = ''
        for word in list(jieba.cut(sentence[1])):
            word = re.sub(r'[ \n\t\r]', '', word)
            if word == '':
                continue
            cn_sentence += cc.convert(word) + ' '
        cn.append(cn_sentence)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\kaxim\AppData\Local\Temp\jieba.cache
Loading model cost 0.429 seconds.
Prefix dict has been built successfully.

Check the tokenized result:

en[8000], cn[8000]
('thanks for the memories . ', '感谢 那些 回忆 。 ')

Building the Dictionaries

Since a machine-translation dataset consists of language pairs, we need to build separate dictionaries for the source language and the target language, which makes the later conversion to one-hot vectors convenient. In these dictionaries, low-frequency words (those that appear only once) are treated as the same unknown token ("<UNK>"). In addition, we add a few special tokens: the padding token ("<PAD>") used to pad the sequences of a minibatch to the same length, plus the beginning-of-sequence token ("<BOS>") and the end-of-sequence token ("<EOS>"). These special tokens are commonly used in natural language processing tasks.
Each dictionary comes in two forms:
int2word: maps an integer to the corresponding word
word2int: maps a word to the corresponding integer; it is the one-to-one inverse of the previous dictionary

# English
words = {}
for sentence in en:
    _sentence = re.split('[ \n\t\r ]', sentence)
    _sentence = list(filter(None, _sentence))
    for word in _sentence:
        words[word] = words.get(word, 0) + 1

words = sorted(words.items(), key=lambda d: d[1], reverse=True)  # sort by frequency
words = [word for word, count in words if count >= 2]
words = ['<PAD>', '<BOS>', '<EOS>', '<UNK>'] + words

word2int_en, int2word_en = {}, {}
for index,word in enumerate(words):
    word2int_en[word] = index
    int2word_en[index] = word

# Chinese
words = {}
for sentence in cn:
    _sentence = re.split('[ \n\t\r ]', sentence)
    _sentence = list(filter(None, _sentence))
    for word in _sentence:
        words[word] = words.get(word, 0) + 1

words = sorted(words.items(), key=lambda d: d[1], reverse=True)  # sort by frequency
words = [word for word, count in words if count >= 2]
words = ['<PAD>', '<BOS>', '<EOS>', '<UNK>'] + words

word2int_cn, int2word_cn = {}, {}
for index, word in enumerate(words):
    word2int_cn[word] = index
    int2word_cn[index] = word

Let us display part of each dictionary. Since Python dictionaries do not support slicing, we define a helper function that slices a dictionary:

def dict_slice(adict, index):
    keys = adict.keys()
    dict_slice = {}
    for k in list(keys)[: index]:
        dict_slice[k] = adict[k]
    return dict_slice

The English dictionary:

dict_slice(word2int_en, 15), dict_slice(int2word_en, 15)
({'<PAD>': 0,
  '<BOS>': 1,
  '<EOS>': 2,
  '<UNK>': 3,
  '.': 4,
  'i': 5,
  'the': 6,
  'to': 7,
  'you': 8,
  'a': 9,
  '?': 10,
  'is': 11,
  'tom': 12,
  "n't": 13,
  'he': 14},
 {0: '<PAD>',
  1: '<BOS>',
  2: '<EOS>',
  3: '<UNK>',
  4: '.',
  5: 'i',
  6: 'the',
  7: 'to',
  8: 'you',
  9: 'a',
  10: '?',
  11: 'is',
  12: 'tom',
  13: "n't",
  14: 'he'})

The Chinese dictionary:

dict_slice(word2int_cn, 15), dict_slice(int2word_cn, 15)
({'<PAD>': 0,
  '<BOS>': 1,
  '<EOS>': 2,
  '<UNK>': 3,
  '。': 4,
  '我': 5,
  '的': 6,
  '了': 7,
  '你': 8,
  '他': 9,
  '?': 10,
  '在': 11,
  '汤姆': 12,
  '是': 13,
  '吗': 14},
 {0: '<PAD>',
  1: '<BOS>',
  2: '<EOS>',
  3: '<UNK>',
  4: '。',
  5: '我',
  6: '的',
  7: '了',
  8: '你',
  9: '他',
  10: '?',
  11: '在',
  12: '汤姆',
  13: '是',
  14: '吗'})

Building the Training Data

Split the data into a training set, a validation set and a test set:

  • Training set: 25,000 sentences
  • Validation set: 1,000 sentences
  • Test set: 1,965 sentences
sentences = []
for en_sentence, cn_sentence in zip(en, cn):
    # Drop sentence pairs with too many unknown tokens (2 or more unknown English tokens, or 3 or more unknown Chinese tokens)
    tokens = re.split('[ \n\t\r  ]', en_sentence)
    tokens = list(filter(None, tokens))
    count = 0
    for token in tokens:
        index = word2int_en.get(token, 3)
        if index == 3:
            count += 1
    if count >= 2:
        continue
    
    tokens = re.split('[ \n\t\r  ]', cn_sentence)
    tokens = list(filter(None, tokens))
    count = 0
    for token in tokens:
        index = word2int_cn.get(token, 3)
        if index == 3:
            count += 1
    if count >= 3:
        continue
    sentences.append(en_sentence + '\t' + cn_sentence)
    
sentences = list(set(sentences))     # deduplicate
random.seed(2022)
random.shuffle(sentences)

train_set = sentences[:25000]
validation_set = sentences[25000:26000]
test_set = sentences[26000:]
print(len(test_set))
1965

Defining the Dataset

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# First, define a transform that pads every sentence to the same length so the model can be trained in batches
class LabelTransform(object):
    def __init__(self, size, pad):
        self.size = size
        self.pad = pad
        
    def __call__(self, label):
        label = np.pad(label, (0, (self.size - label.shape[0])), mode='constant', constant_values=self.pad)
        return label

# dataset
class EN2CNDataset(Dataset):
    def __init__(self, max_output_len, set_name):
        self.word2int_en, self.int2word_en = word2int_en, int2word_en
        self.word2int_cn, self.int2word_cn = word2int_cn, int2word_cn
        self.data = set_name
        
        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)
        self.transform = LabelTransform(max_output_len, self.word2int_en['<PAD>'])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        # Separate the English and Chinese parts
        sentences = self.data[index]
        sentences = re.split('[\t\n]', sentences)
        sentences = list(filter(None, sentences))
        # print(sentences)
        assert len(sentences) == 2
        
        # Prepare the special tokens
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']
        
        # Prepend <BOS>, append <EOS>, and replace out-of-vocabulary subwords (words) with <UNK>
        en, cn = [BOS], [BOS]
        # Split the sentence into subwords and replace each one with its integer index
        sentence = re.split(' ', sentences[0])
        sentence = list(filter(None, sentence))
        # print(f'en: {sentence}')
        for word in sentence:
            en.append(self.word2int_en.get(word, UNK))
        en.append(EOS)
        
        # Chinese
        sentence = re.split(' ', sentences[1])
        sentence = list(filter(None, sentence))
        # print(f'cn: {sentence}')
        for word in sentence:
            cn.append(self.word2int_cn.get(word, UNK))
        cn.append(EOS)
        
        en, cn = np.asarray(en), np.asarray(cn)
        # print(en, cn)
        
        # Pad both sentences to the same length with <PAD>
        en, cn = self.transform(en), self.transform(cn)
        # print(en, cn)
        en, cn =  torch.LongTensor(en), torch.LongTensor(cn)
        
        return en, cn
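
As a quick sanity check (a minimal sketch, not part of the training pipeline; the variable names are just for illustration), we can wrap the training split in this dataset and inspect one padded sample; max_output_len = 40 matches the configuration used later.

check_dataset = EN2CNDataset(40, train_set)
en_sample, cn_sample = check_dataset[0]
print(en_sample.shape, cn_sample.shape)   # both torch.Size([40]) after padding
print(en_sample[:10])                     # <BOS>, word indices, <EOS>, then <PAD> (index 0)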

The Encoder-Decoder Model

Encoder

The encoder of the seq2seq model is an RNN. For each input token, the Encoder produces an output vector and a hidden state, and the hidden state is passed on to the Decoder. In other words, the Encoder reads the input sequence step by step and outputs a single vector (its final hidden state).

Parameters:

  • en_vocab_size: the size of the English dictionary, i.e. the number of English subwords
  • emb_dim: the embedding dimension; it compresses a one-hot word vector down to the given dimension, serving to reduce dimensionality and condense information, and a pre-trained word embedding can be used here
  • hid_dim: the dimension of the RNN outputs and hidden states
  • n_layers: how many RNN layers are stacked
  • dropout: the probability of zeroing out a unit, used to prevent overfitting; it is generally applied during training and disabled at test time

Encoder input:

  • a sequence of English word indices

Outputs:

  • outputs: all outputs of the topmost RNN layer, which can be processed with attention
  • hidden: the final hidden state of every layer, which is passed to the Decoder for decoding
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input):
        # input = [batch size, sequence len]
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid dim * directions]
        # hidden =  [num_layers * directions, batch size  , hid dim]
        # outputs is the output of the topmost RNN layer
        return outputs, hidden
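
As an illustration (not part of the original pipeline), here is a quick shape check of the Encoder; the vocabulary size and hyperparameters below follow the configuration used later in this post, while the batch size and sequence length are arbitrary:

enc = Encoder(en_vocab_size=4397, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
dummy_input = torch.randint(0, 4397, (8, 20))   # [batch size, sequence len]
outputs, hidden = enc(dummy_input)
print(outputs.shape)   # torch.Size([8, 20, 1024]) -> hid_dim * 2 directions
print(hidden.shape)    # torch.Size([4, 8, 512])   -> n_layers * 2 directions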

Decoder

The Decoder is another RNN. In the simplest seq2seq decoder, only the final hidden state of each Encoder layer is used for decoding: it serves as the Decoder's initial hidden state. In this section we build this simplest Decoder; you can also try feeding the Encoder outputs through an attention mechanism and adding the result to the Decoder input.

Parameters:

  • cn_vocab_size: the size of the Chinese dictionary, i.e. the number of Chinese subwords
  • emb_dim: the embedding dimension; it compresses a one-hot word vector down to the given dimension, serving to reduce dimensionality and condense information, and a pre-trained word embedding can be used here
  • hid_dim: the dimension of the RNN outputs and hidden states
  • n_layers: how many RNN layers are stacked
  • dropout: the probability of zeroing out a unit, used to prevent overfitting; it is generally applied during training and disabled at test time

Decoder input:

  • the integer index of the word decoded at the previous step

Outputs:

  • hidden: the hidden state updated from the input and the previous hidden state
  • output: a score for every word indicating how likely it is to be the result of this decoding step
class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2    # because the Encoder is a bidirectional RNN
        self.n_layers = n_layers
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, self.n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab = nn.Sequential(
            nn.Linear(self.hid_dim, self.hid_dim * 4),
            nn.Linear(self.hid_dim * 4, self.cn_vocab_size),
        )
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden):
        # input = [batch size]
        # hidden = [n layers, batch size, hid dim * 2]
        # The Decoder is always unidirectional, so directions = 1
        input = input.unsqueeze(1)                       # input(batch_size, 1)
        embedded = self.dropout(self.embedding(input))   # embedded(batch_size, 1, emb_dim)
        output, hidden = self.rnn(embedded, hidden)
        # output(batch_size, 1, hid_dim * 2)  hidden(n_layers, batch_size, hid_dim * 2)
        
        # Map the RNN output to a score (logit) for every word in the vocabulary
        prediction = self.embedding2vocab(output.squeeze(1))
        # prediction(batch_size, cn_vocab_size)
        return prediction, hidden
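
Similarly, a minimal shape check of a single decoding step (illustrative only), assuming the hidden state has already been reshaped to [n_layers, batch size, hid_dim * 2] as the Seq2Seq class below does:

dec = Decoder(cn_vocab_size=6798, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
prev_token = torch.randint(0, 6798, (8,))   # [batch size], the previously decoded word indices
init_hidden = torch.zeros(2, 8, 1024)       # [n_layers, batch size, hid_dim * 2]
prediction, new_hidden = dec(prev_token, init_hidden)
print(prediction.shape)   # torch.Size([8, 6798])
print(new_hidden.shape)   # torch.Size([2, 8, 1024])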

Seq2Seq

This part assembles the full seq2seq model by joining the Encoder and the Decoder. In short, the Encoder consumes the input and produces its outputs and final hidden state; the hidden state is handed to the Decoder; at every step the Decoder's output is fed back into the Decoder as the next input, and this continues until <EOS> is decoded.

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, input, target, teacher_forcing_ratio):
        # input  = [batch size, input len]
        # target = [batch size, target len]
        # teacher_forcing_ratio is the probability of using the ground truth as the next Decoder input
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        
        # Prepare a tensor to store the outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).cuda()
        # Feed the input into the Encoder
        encoder_outputs, hidden = self.encoder(input)
        # The Encoder's final hidden state is used to initialise the Decoder
        # encoder_outputs is mainly useful for attention
        # Because the Encoder is a bidirectional RNN, the hidden states of the two directions of each layer are concatenated
        # hidden = [num_layers * directions, batch size, hid dim] --> [num_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # Take the <BOS> token
        input = target[:, 0]
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            # Decide whether to use the ground truth for this step (teacher forcing)
            teacher_force = random.random() <= teacher_forcing_ratio
            # Take the most probable word
            top1 = output.argmax(1)
            # With teacher forcing feed the ground truth, otherwise feed the model's own prediction
            input = target[:, t] if teacher_force else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds
    
    def inference(self, input, target):
        # input  = [batch size, input len]
        # target = [batch size, target len]
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        
        # Prepare a tensor to store the outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).cuda()
        # Feed the input into the Encoder
        encoder_outputs, hidden = self.encoder(input)
        # The Encoder's final hidden state is used to initialise the Decoder
        # encoder_outputs is mainly useful for attention
        # Because the Encoder is a bidirectional RNN, the hidden states of the two directions of each layer are concatenated
        # hidden = [num_layers * directions, batch size, hid dim] --> [num_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # Take the <BOS> token
        input = target[:, 0]
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            # Take the most probable word
            top1 = output.argmax(1)
            input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds
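
The only non-obvious step above is merging the hidden states of the bidirectional Encoder. The following standalone sketch (shapes only, random tensor, no GPU needed) replays that reshape and concatenation:

n_layers, batch_size, hid_dim = 2, 8, 512
hidden = torch.randn(n_layers * 2, batch_size, hid_dim)   # as returned by the bidirectional Encoder
hidden = hidden.view(n_layers, 2, batch_size, -1)         # [num_layers, directions, batch size, hid dim]
hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
print(hidden.shape)   # torch.Size([2, 8, 1024]) -> matches the hidden size the Decoder expects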

Utils

Saving the model

def save_model(model, optimizer, store_model_path, step):
    torch.save(model.state_dict(), f'{store_model_path}/model_{step}.ckpt')

Loading the model

def load_model(model, load_model_path):
    print(f'Load model from {load_model_path}')
    model.load_state_dict(torch.load(f'{load_model_path}.ckpt'))
    return model

Building the model

def build_model(config, en_vocab_size, cn_vocab_size):
    # Build the model
    encoder = Encoder(en_vocab_size, config.emb_dim, config.hid_dim, config.n_layers, config.dropout)
    decoder = Decoder(cn_vocab_size, config.emb_dim, config.hid_dim, config.n_layers, config.dropout)
    model = Seq2Seq(encoder, decoder)
    print(model)
    model = model.cuda()
    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    print(optimizer)
    if config.load_model:
        model = load_model(model, config.load_model_path)
        model = model.cuda()

    return model, optimizer

Converting indices back to sentences

def tokens2sentence(outputs, int2word):
    sentences = []
    for tokens in outputs:
        sentence = []
        for token in tokens:
            word = int2word[int(token)]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)

    return sentences
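
For example (a hypothetical batch, using the index-to-word mapping built earlier), decoding stops at <EOS>, so the padding is never reached:

batch = torch.LongTensor([[1, 5, 4, 2, 0, 0]])   # <BOS> i . <EOS> <PAD> <PAD>
print(tokens2sentence(batch, int2word_en))       # [['<BOS>', 'i', '.']]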

Computing the BLEU score

import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def computebleu(sentences, targets):
    score = 0 
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        tmp = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                tmp.append(token)
            else:
                tmp += [word for word in token]
        return tmp 

    for sentence, target in zip(sentences, targets):
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1, 0, 0, 0))                                                                                          

    return score
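
Illustrative usage (hypothetical sentences): cut_token splits multi-character Chinese words into individual characters, so the BLEU-1 score is effectively computed at the character level.

pred   = [['我', '爱', '你', '。']]
target = [['我', '喜欢', '你', '。']]   # '喜欢' is split into '喜' and '欢' before scoring
print(computebleu(pred, target))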

Iterating over the dataloader

def infinite_iter(data_loader):
    it = iter(data_loader)
    while True:
        try:
            ret = next(it)
            yield ret
        except StopIteration:
            it = iter(data_loader)
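
A small demonstration (with a toy DataLoader, not the translation data): the generator restarts the underlying loader whenever it is exhausted, so the training loop can simply call next() a fixed number of times.

toy_loader = DataLoader(list(range(5)), batch_size=2)
toy_iter = infinite_iter(toy_loader)
for _ in range(4):
    print(next(toy_iter))   # tensor([0, 1]), tensor([2, 3]), tensor([4]), tensor([0, 1])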

Training and Testing

In practice, training a good machine-translation model requires a large corpus and a long training run. The dataset used here is small and training is fast. The training and testing functions are defined as follows:

def train(model, optimizer, train_iter, loss_function, total_steps, summary_steps, train_dataset, teacher_forcing_ratio):
    model.train()
    model.zero_grad()
    losses = []
    loss_sum = 0.0
    for step in range(summary_steps):
        sources, targets = next(train_iter)
        sources, targets = sources.cuda(), targets.cuda()
        outputs, preds = model(sources, targets, teacher_forcing_ratio)
        # the first token of targets is <BOS>, so it is skipped
        outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
        targets = targets[:, 1:].reshape(-1)
        loss = loss_function(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        loss_sum += loss.item()
        if (step + 1) % 5 == 0:
            loss_sum = loss_sum / 5
            print ("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f} ".format(total_steps + step + 1, loss_sum, np.exp(loss_sum)), end=" ")
            losses.append(loss_sum)
            loss_sum = 0.0

    return model, optimizer, losses

def test(model, dataloader, loss_function):
    model.eval()
    loss_sum, bleu_score= 0.0, 0.0
    n = 0
    result = []
    for sources, targets in dataloader:
        sources, targets = sources.cuda(), targets.cuda()
        batch_size = sources.size(0)
        outputs, preds = model.inference(sources, targets)
        # the first token of targets is <BOS>, so it is skipped
        outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
        targets = targets[:, 1:].reshape(-1)

        loss = loss_function(outputs, targets)
        loss_sum += loss.item()

        # Convert the predictions back to text
        targets = targets.view(sources.size(0), -1)
        preds = tokens2sentence(preds, dataloader.dataset.int2word_cn)
        sources = tokens2sentence(sources, dataloader.dataset.int2word_en)
        targets = tokens2sentence(targets, dataloader.dataset.int2word_cn)
        for source, pred, target in zip(sources, preds, targets):
            result.append((source, pred, target))
        # Compute the BLEU score
        bleu_score += computebleu(preds, targets)

        n += batch_size

    return loss_sum / len(dataloader), bleu_score / n, result

Training Procedure

Train first, then evaluate:

def train_process(config):
    # Prepare the training data
    train_dataset = EN2CNDataset(config.max_output_len, train_set)
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    train_iter = infinite_iter(train_loader)
    # Prepare the validation data
    val_dataset = EN2CNDataset(config.max_output_len, validation_set)
    val_loader = DataLoader(val_dataset, batch_size=1)
    # Build the model
    model, optimizer = build_model(config, train_dataset.en_vocab_size, train_dataset.cn_vocab_size)
    loss_function = nn.CrossEntropyLoss(ignore_index=0)

    train_losses, val_losses, bleu_scores = [], [], []
    total_steps = 0
    while (total_steps < config.num_steps):
        # Train the model
        model, optimizer, loss = train(model, optimizer, train_iter, loss_function, total_steps, config.summary_steps, train_dataset, config.teacher_forcing_ratio)
        train_losses += loss
        # Evaluate on the validation set
        val_loss, bleu_score, result = test(model, val_loader, loss_function)
        val_losses.append(val_loss)
        bleu_scores.append(bleu_score)

        total_steps += config.summary_steps
        print ("\r", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, blue score: {:.3f}  ".format(total_steps, val_loss, np.exp(val_loss), bleu_score))

        # Save the model and its outputs
        if total_steps % config.store_steps == 0 or total_steps >= config.num_steps:
            save_model(model, optimizer, config.store_model_path, total_steps)
            with open(f'{config.store_model_path}/output_{total_steps}.txt', 'w') as f:
                for line in result:
                    print (line, file=f)

    return train_losses, val_losses, bleu_scores

Testing the model

def test_process(config):
    # Prepare the test data
    test_dataset = EN2CNDataset(config.max_output_len, test_set)
    test_loader = DataLoader(test_dataset, batch_size=1)
    # Build the model
    model, optimizer = build_model(config, test_dataset.en_vocab_size, test_dataset.cn_vocab_size)
    print ("Finish build model")
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    model.eval()
    # Evaluate on the test set
    test_loss, bleu_score, result = test(model, test_loader, loss_function)
    # Save the results
    with open(f'./log/cmn-eng/test_output.txt', 'w') as f:
        for line in result:
            print(line, file=f)
    return test_loss, bleu_score

Config

  • The hyperparameter settings for the experiment
class configurations(object):
    def __init__(self):
        self.batch_size = 64
        self.emb_dim = 256
        self.hid_dim = 512
        self.n_layers = 2
        self.dropout = 0.5
        self.learning_rate = 0.00005
        self.teacher_forcing_ratio = 0.8
        self.max_output_len = 40              # maximum length of the output sentence
        self.num_steps = 12000                # total number of training steps
        self.store_steps = 300                # save the model every this many steps
        self.summary_steps = 300              # evaluate (check for overfitting) every this many steps
        self.load_model = False               # whether to load a saved model
        self.store_model_path = "./log/ckpt"      # where to save models
        self.load_model_path = "./log/ckpt/model_12000"           # which model to load, e.g. "./ckpt/model_{step}"
        self.data_path = "./data/cmn-eng"          # where the data is stored
        self.attention = False                # whether to use an attention mechanism

Train the model:

config = configurations()
train_losses, val_losses, bleu_scores = train_process(config)
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4397, 256)
    (rnn): GRU(256, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6798, 256)
    (rnn): GRU(256, 1024, num_layers=2, batch_first=True, dropout=0.5)
    (embedding2vocab): Sequential(
      (0): Linear(in_features=1024, out_features=4096, bias=True)
      (1): Linear(in_features=4096, out_features=6798, bias=True)
    )
    (dropout): Dropout(p=0.5, inplace=False)
  )
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 5e-05
    weight_decay: 0
)
 train [300] loss: 0.911, Perplexity: 2.488  


 val [300] loss: 5.124, Perplexity: 168.024, blue score: 0.186  
 val [600] loss: 4.827, Perplexity: 124.873, blue score: 0.233  
 val [900] loss: 4.643, Perplexity: 103.901, blue score: 0.278  
 val [1200] loss: 4.538, Perplexity: 93.484, blue score: 0.291  
 val [1500] loss: 4.514, Perplexity: 91.273, blue score: 0.288  
 val [1800] loss: 4.457, Perplexity: 86.216, blue score: 0.307  
 val [2100] loss: 4.504, Perplexity: 90.364, blue score: 0.313  
 val [2400] loss: 4.392, Perplexity: 80.830, blue score: 0.333  
 val [2700] loss: 4.389, Perplexity: 80.539, blue score: 0.329  
 val [3000] loss: 4.380, Perplexity: 79.800, blue score: 0.342  
 val [3300] loss: 4.319, Perplexity: 75.126, blue score: 0.357  
 val [3600] loss: 4.245, Perplexity: 69.757, blue score: 0.358  
 val [3900] loss: 4.262, Perplexity: 70.927, blue score: 0.371  
 val [4200] loss: 4.241, Perplexity: 69.500, blue score: 0.375  
 val [4500] loss: 4.234, Perplexity: 68.983, blue score: 0.387  
 val [4800] loss: 4.186, Perplexity: 65.791, blue score: 0.385  
 val [5100] loss: 4.122, Perplexity: 61.675, blue score: 0.396  
 val [5400] loss: 4.162, Perplexity: 64.201, blue score: 0.399  
 val [5700] loss: 4.148, Perplexity: 63.323, blue score: 0.410  
 val [6000] loss: 4.065, Perplexity: 58.286, blue score: 0.404  
 val [6300] loss: 4.089, Perplexity: 59.695, blue score: 0.410  
 val [6600] loss: 4.076, Perplexity: 58.931, blue score: 0.417  
 val [6900] loss: 4.103, Perplexity: 60.544, blue score: 0.423  
 val [7200] loss: 4.102, Perplexity: 60.452, blue score: 0.426  
 val [7500] loss: 4.092, Perplexity: 59.838, blue score: 0.429  
 val [7800] loss: 4.029, Perplexity: 56.186, blue score: 0.433  
 val [8100] loss: 4.057, Perplexity: 57.809, blue score: 0.440  
 val [8400] loss: 4.023, Perplexity: 55.880, blue score: 0.440  
 val [8700] loss: 4.025, Perplexity: 55.962, blue score: 0.445  
 val [9000] loss: 4.043, Perplexity: 57.014, blue score: 0.450  
 val [9300] loss: 4.020, Perplexity: 55.690, blue score: 0.453  
 val [9600] loss: 4.030, Perplexity: 56.256, blue score: 0.460  
 val [9900] loss: 4.032, Perplexity: 56.365, blue score: 0.465  
 val [10200] loss: 4.056, Perplexity: 57.730, blue score: 0.454  
 val [10500] loss: 4.001, Perplexity: 54.639, blue score: 0.465  
 val [10800] loss: 4.055, Perplexity: 57.664, blue score: 0.461  
 val [11100] loss: 4.007, Perplexity: 55.003, blue score: 0.470  
 val [11400] loss: 4.063, Perplexity: 58.153, blue score: 0.468  
 val [11700] loss: 4.042, Perplexity: 56.917, blue score: 0.474  
 val [12000] loss: 4.072, Perplexity: 58.686, blue score: 0.475  

Test the model:

Before running this step, set the path of the model to load in config and set load_model to True.

config.load_model = True
test_loss, bleu_score = test_process(config)
print(f'test loss: {test_loss}, bleu_score: {bleu_score}')
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4397, 256)
    (rnn): GRU(256, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6798, 256)
    (rnn): GRU(256, 1024, num_layers=2, batch_first=True, dropout=0.5)
    (embedding2vocab): Sequential(
      (0): Linear(in_features=1024, out_features=4096, bias=True)
      (1): Linear(in_features=4096, out_features=6798, bias=True)
    )
    (dropout): Dropout(p=0.5, inplace=False)
  )
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 5e-05
    weight_decay: 0
)
Load model from ./log/ckpt/model_12000
Finish build model


test loss: 4.043186987083377, bleu_score: 0.470621067287946

Visualizing the Training Process

The training loss curve:

import matplotlib.pyplot as plt
plt.figure()
plt.plot(train_losses)
plt.xlabel('iterations')
plt.ylabel('loss')
plt.title('train loss')
plt.show()

The validation loss curve:

import matplotlib.pyplot as plt
plt.figure()
plt.plot(val_losses)
plt.xlabel('iterations')
plt.ylabel('loss')
plt.title('validation loss')
plt.show()

BLEU score

import matplotlib.pyplot as plt
plt.figure()
plt.plot(bleu_scores)
plt.xlabel('iterations')
plt.ylabel('BLEU score')
plt.title('BLEU score')
plt.show()



文章作者: 不才叶某
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 不才叶某 !
评论
  目录