From f91528a96e4fac8d7ee87868877e2c001e540973 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 1 Apr 2020 13:36:35 +0800 Subject: [PATCH 01/13] Add seq2seq --- seq2seq/args.py | 134 +++++++++++++++++++++++ seq2seq/download.py | 54 +++++++++ seq2seq/predict.py | 0 seq2seq/reader.py | 218 +++++++++++++++++++++++++++++++++++++ seq2seq/run.sh | 17 +++ seq2seq/seq2seq.yaml | 83 ++++++++++++++ seq2seq/seq2seq_attn.py | 235 ++++++++++++++++++++++++++++++++++++++++ seq2seq/seq2seq_base.py | 197 +++++++++++++++++++++++++++++++++ seq2seq/train.py | 110 +++++++++++++++++++ 9 files changed, 1048 insertions(+) create mode 100644 seq2seq/args.py create mode 100644 seq2seq/download.py create mode 100644 seq2seq/predict.py create mode 100644 seq2seq/reader.py create mode 100644 seq2seq/run.sh create mode 100644 seq2seq/seq2seq.yaml create mode 100644 seq2seq/seq2seq_attn.py create mode 100644 seq2seq/seq2seq_base.py create mode 100644 seq2seq/train.py diff --git a/seq2seq/args.py b/seq2seq/args.py new file mode 100644 index 00000000000000..9c3911932fd0a6 --- /dev/null +++ b/seq2seq/args.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import distutils.util


def parse_args():
    """Parse and return the command line arguments for seq2seq train/infer.

    Returns:
        argparse.Namespace with data paths, model hyper-parameters and
        runtime switches.

    Fixes vs. original: the ``--beam_size`` help text was a copy-paste of
    the inference-file help, and ``--optimizer`` help misspelled "support".
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # Data files: parallel corpora are identified by a common prefix plus a
    # language suffix (e.g. train.en / train.vi).
    parser.add_argument(
        "--train_data_prefix", type=str, help="file prefix for train data")
    parser.add_argument(
        "--eval_data_prefix", type=str, help="file prefix for eval data")
    parser.add_argument(
        "--test_data_prefix", type=str, help="file prefix for test data")
    parser.add_argument(
        "--vocab_prefix", type=str, help="file prefix for vocab")
    parser.add_argument("--src_lang", type=str, help="source language suffix")
    parser.add_argument("--tar_lang", type=str, help="target language suffix")

    # NOTE: type=eval turns the literal strings "True"/"False" into bools;
    # kept for CLI backward compatibility.
    parser.add_argument(
        "--attention",
        type=eval,
        default=False,
        help="Whether use attention model")

    parser.add_argument(
        "--optimizer",
        type=str,
        default='adam',
        help="optimizer to use, only support [sgd|adam]")

    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.001,
        help="learning rate for optimizer")

    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="layers number of encoder and decoder")
    parser.add_argument(
        "--hidden_size",
        type=int,
        default=100,
        help="hidden size of encoder and decoder")
    parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
    parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")

    parser.add_argument(
        "--batch_size", type=int, help="batch size of each step")

    parser.add_argument(
        "--max_epoch", type=int, default=12, help="max epoch for the training")

    parser.add_argument(
        "--max_len",
        type=int,
        default=50,
        help="max length for source and target sentence")
    parser.add_argument(
        "--dropout", type=float, default=0.0, help="drop probability")
    parser.add_argument(
        "--init_scale",
        type=float,
        default=0.0,
        help="init scale for parameter")
    parser.add_argument(
        "--max_grad_norm",
        type=float,
        default=5.0,
        help="max grad norm for global norm clip")

    parser.add_argument(
        "--model_path",
        type=str,
        default='model',
        help="model path for model to save")

    parser.add_argument(
        "--reload_model", type=str, help="reload model to inference")

    parser.add_argument(
        "--infer_file", type=str, help="file name for inference")
    parser.add_argument(
        "--infer_output_file",
        type=str,
        default='infer_output',
        help="file name for inference output")
    parser.add_argument(
        "--beam_size", type=int, default=10, help="beam size for beam search")

    parser.add_argument(
        '--use_gpu',
        type=eval,
        default=False,
        help='Whether using gpu [True|False]')

    parser.add_argument(
        '--eager_run', type=eval, default=False, help='Whether to use dygraph')

    parser.add_argument(
        "--enable_ce",
        action='store_true',
        help="The flag indicating whether to run the task "
        "for continuous evaluation.")

    parser.add_argument(
        "--profile", action='store_true', help="Whether enable the profile.")
    # NOTE: profiler args, used for benchmark
    parser.add_argument(
        "--profiler_path",
        type=str,
        default='./seq2seq.profile',
        help="the profiler output file path. (used for benchmark)")
    args = parser.parse_args()
    return args
+# See the License for the specific language governing permissions and +# limitations under the License. +''' +Script for downloading training data. +''' +import os +import urllib +import sys + +if sys.version_info >= (3, 0): + import urllib.request +import zipfile + +URLLIB = urllib +if sys.version_info >= (3, 0): + URLLIB = urllib.request + +remote_path = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi' +base_path = 'data' +tar_path = os.path.join(base_path, 'en-vi') +filenames = [ + 'train.en', 'train.vi', 'tst2012.en', 'tst2012.vi', 'tst2013.en', + 'tst2013.vi', 'vocab.en', 'vocab.vi' +] + + +def main(arguments): + print("Downloading data......") + + if not os.path.exists(tar_path): + if not os.path.exists(base_path): + os.mkdir(base_path) + os.mkdir(tar_path) + + for filename in filenames: + url = remote_path + '/' + filename + tar_file = os.path.join(tar_path, filename) + URLLIB.urlretrieve(url, tar_file) + print("Downloaded sucess......") + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) diff --git a/seq2seq/predict.py b/seq2seq/predict.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/seq2seq/reader.py b/seq2seq/reader.py new file mode 100644 index 00000000000000..e562b6e94d8429 --- /dev/null +++ b/seq2seq/reader.py @@ -0,0 +1,218 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os +import io +import sys +import numpy as np + +Py3 = sys.version_info[0] == 3 + +UNK_ID = 0 + + +def _read_words(filename): + data = [] + with io.open(filename, "r", encoding='utf-8') as f: + if Py3: + return f.read().replace("\n", "").split() + else: + return f.read().decode("utf-8").replace(u"\n", u"").split() + + +def read_all_line(filenam): + data = [] + with io.open(filename, "r", encoding='utf-8') as f: + for line in f.readlines(): + data.append(line.strip()) + + +def _build_vocab(filename): + + vocab_dict = {} + ids = 0 + with io.open(filename, "r", encoding='utf-8') as f: + for line in f.readlines(): + vocab_dict[line.strip()] = ids + ids += 1 + + print("vocab word num", ids) + + return vocab_dict + + +def _para_file_to_ids(src_file, tar_file, src_vocab, tar_vocab): + + src_data = [] + with io.open(src_file, "r", encoding='utf-8') as f_src: + for line in f_src.readlines(): + arra = line.strip().split() + ids = [src_vocab[w] if w in src_vocab else UNK_ID for w in arra] + ids = ids + + src_data.append(ids) + + tar_data = [] + with io.open(tar_file, "r", encoding='utf-8') as f_tar: + for line in f_tar.readlines(): + arra = line.strip().split() + ids = [tar_vocab[w] if w in tar_vocab else UNK_ID for w in arra] + + ids = [1] + ids + [2] + + tar_data.append(ids) + + return src_data, tar_data + + +def filter_len(src, tar, max_sequence_len=50): + new_src = [] + new_tar = [] + + for id1, id2 in zip(src, tar): + if len(id1) > max_sequence_len: + id1 = id1[:max_sequence_len] + if len(id2) > max_sequence_len + 2: + id2 = id2[:max_sequence_len + 2] + + new_src.append(id1) + new_tar.append(id2) + + return new_src, new_tar + + +def raw_data(src_lang, + tar_lang, + vocab_prefix, + train_prefix, + eval_prefix, + test_prefix, + max_sequence_len=50): + + src_vocab_file = vocab_prefix + "." 
+ src_lang + tar_vocab_file = vocab_prefix + "." + tar_lang + + src_train_file = train_prefix + "." + src_lang + tar_train_file = train_prefix + "." + tar_lang + + src_eval_file = eval_prefix + "." + src_lang + tar_eval_file = eval_prefix + "." + tar_lang + + src_test_file = test_prefix + "." + src_lang + tar_test_file = test_prefix + "." + tar_lang + + src_vocab = _build_vocab(src_vocab_file) + tar_vocab = _build_vocab(tar_vocab_file) + + train_src, train_tar = _para_file_to_ids( src_train_file, tar_train_file, \ + src_vocab, tar_vocab ) + train_src, train_tar = filter_len( + train_src, train_tar, max_sequence_len=max_sequence_len) + eval_src, eval_tar = _para_file_to_ids( src_eval_file, tar_eval_file, \ + src_vocab, tar_vocab ) + + test_src, test_tar = _para_file_to_ids( src_test_file, tar_test_file, \ + src_vocab, tar_vocab ) + + return ( train_src, train_tar), (eval_src, eval_tar), (test_src, test_tar),\ + (src_vocab, tar_vocab) + + +def raw_mono_data(vocab_file, file_path): + + src_vocab = _build_vocab(vocab_file) + + test_src, test_tar = _para_file_to_ids( file_path, file_path, \ + src_vocab, src_vocab ) + + return (test_src, test_tar) + + +def get_data_iter(raw_data, + batch_size, + mode='train', + enable_ce=False, + cache_num=20): + + src_data, tar_data = raw_data + + data_len = len(src_data) + + index = np.arange(data_len) + if mode == "train" and not enable_ce: + np.random.shuffle(index) + + def to_pad_np(data, source=False): + max_len = 0 + bs = min(batch_size, len(data)) + for ele in data: + if len(ele) > max_len: + max_len = len(ele) + + ids = np.ones((bs, max_len), dtype='int64') * 2 + mask = np.zeros((bs), dtype='int32') + + for i, ele in enumerate(data): + ids[i, :len(ele)] = ele + if not source: + mask[i] = len(ele) - 1 + else: + mask[i] = len(ele) + + return ids, mask + + b_src = [] + + if mode != "train": + cache_num = 1 + for j in range(data_len): + if len(b_src) == batch_size * cache_num: + # build batch size + + # sort + if mode == 'infer': + 
new_cache = b_src + else: + new_cache = sorted(b_src, key=lambda k: len(k[0])) + + for i in range(cache_num): + batch_data = new_cache[i * batch_size:(i + 1) * batch_size] + src_cache = [w[0] for w in batch_data] + tar_cache = [w[1] for w in batch_data] + src_ids, src_mask = to_pad_np(src_cache, source=True) + tar_ids, tar_mask = to_pad_np(tar_cache) + yield (src_ids, src_mask, tar_ids, tar_mask) + + b_src = [] + + b_src.append((src_data[index[j]], tar_data[index[j]])) + if len(b_src) == batch_size * cache_num or mode == 'infer': + if mode == 'infer': + new_cache = b_src + else: + new_cache = sorted(b_src, key=lambda k: len(k[0])) + + for i in range(cache_num): + batch_end = min(len(new_cache), (i + 1) * batch_size) + batch_data = new_cache[i * batch_size:batch_end] + src_cache = [w[0] for w in batch_data] + tar_cache = [w[1] for w in batch_data] + src_ids, src_mask = to_pad_np(src_cache, source=True) + tar_ids, tar_mask = to_pad_np(tar_cache) + yield (src_ids, src_mask, tar_ids, tar_mask) diff --git a/seq2seq/run.sh b/seq2seq/run.sh new file mode 100644 index 00000000000000..2fe8b7a0700ae4 --- /dev/null +++ b/seq2seq/run.sh @@ -0,0 +1,17 @@ +python train.py \ + --src_lang en --tar_lang vi \ + --attention True \ + --num_layers 2 \ + --hidden_size 512 \ + --src_vocab_size 17191 \ + --tar_vocab_size 7709 \ + --batch_size 128 \ + --dropout 0.2 \ + --init_scale 0.1 \ + --max_grad_norm 5.0 \ + --train_data_prefix data/en-vi/train \ + --eval_data_prefix data/en-vi/tst2012 \ + --test_data_prefix data/en-vi/tst2013 \ + --vocab_prefix data/en-vi/vocab \ + --use_gpu True \ + --model_path attention_models \ No newline at end of file diff --git a/seq2seq/seq2seq.yaml b/seq2seq/seq2seq.yaml new file mode 100644 index 00000000000000..8e0edb72f2faa9 --- /dev/null +++ b/seq2seq/seq2seq.yaml @@ -0,0 +1,83 @@ +# used for continuous evaluation +enable_ce: False + +eager_run: False + +# The frequency to save trained models when training. 
+save_step: 10000 +# The frequency to fetch and print output when training. +print_step: 100 +# path of the checkpoint, to resume the previous training +init_from_checkpoint: "" +# path of the pretrain model, to better solve the current task +init_from_pretrain_model: "" +# path of trained parameter, to make prediction +init_from_params: "trained_params/step_100000/" +# the directory for saving model +save_model: "trained_models" +# the directory for saving inference model. +inference_model_dir: "infer_model" +# Set seed for CE or debug +random_seed: None +# The pattern to match training data files. +training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de" +# The pattern to match validation data files. +validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de" +# The pattern to match test data files. +predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de" +# The file to output the translation results of predict_file to. +output_file: "predict.txt" +# The path of vocabulary file of source language. +src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000" +# The path of vocabulary file of target language. +trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000" +# The , and tokens in the dictionary. +special_token: ["", "", ""] +# max length of sequences +max_length: 256 + +# whether to use cuda +use_cuda: True + +# args for reader, see reader.py for details +token_delimiter: " " +use_token_batch: True +pool_size: 200000 +sort_type: "pool" +shuffle: True +shuffle_batch: True +batch_size: 4096 + +# Hyparams for training: +# the number of epoches for training +epoch: 30 +# the hyper parameters for Adam optimizer. +# This static learning_rate will be multiplied to the LearningRateScheduler +# derived learning rate the to get the final learning rate. +learning_rate: 0.001 + + +# Hyparams for generation: +# the parameters for beam search. +beam_size: 5 +max_out_len: 256 +# the number of decoded sentences to output. 
+n_best: 1 + +# Hyparams for model: +# These following five vocabularies related configurations will be set +# automatically according to the passed vocabulary path and special tokens. +# size of source word dictionary. +src_vocab_size: 10000 +# size of target word dictionay +trg_vocab_size: 10000 +# index for token +bos_idx: 0 +# index for token +eos_idx: 1 +# index for token +unk_idx: 2 +embed_dim: 512 +hidden_size: 512 +num_layers: 2 +dropout: 0.1 diff --git a/seq2seq/seq2seq_attn.py b/seq2seq/seq2seq_attn.py new file mode 100644 index 00000000000000..b71018e479bc7c --- /dev/null +++ b/seq2seq/seq2seq_attn.py @@ -0,0 +1,235 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import UniformInitializer
from paddle.fluid.dygraph import Embedding, Linear, Layer
from paddle.fluid.layers import BeamSearchDecoder

from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
from model import Model, Loss
from seq2seq_base import Encoder


class AttentionLayer(Layer):
    """Luong-style attention: project the decoder state, score it against
    the encoder outputs, and project the concatenated context + state."""

    def __init__(self, hidden_size, bias=False, init_scale=0.1):
        super(AttentionLayer, self).__init__()
        self.input_proj = Linear(
            hidden_size,
            hidden_size,
            param_attr=ParamAttr(initializer=UniformInitializer(
                low=-init_scale, high=init_scale)),
            bias_attr=bias)
        # input is [context; hidden] hence 2*hidden_size columns
        self.output_proj = Linear(
            hidden_size + hidden_size,
            hidden_size,
            param_attr=ParamAttr(initializer=UniformInitializer(
                low=-init_scale, high=init_scale)),
            bias_attr=bias)

    def forward(self, hidden, encoder_output, encoder_padding_mask):
        # hidden: decoder state for the current step; scores are computed
        # as a dot product against every encoder position.
        query = self.input_proj(hidden)
        attn_scores = layers.matmul(
            layers.unsqueeze(query, [1]), encoder_output, transpose_y=True)
        # encoder_padding_mask holds large negative values at pad positions,
        # so softmax assigns them ~0 probability.
        if encoder_padding_mask is not None:
            attn_scores = layers.elementwise_add(attn_scores,
                                                 encoder_padding_mask)
        attn_scores = layers.softmax(attn_scores)
        attn_out = layers.squeeze(
            layers.matmul(attn_scores, encoder_output), [1])
        attn_out = layers.concat([attn_out, hidden], 1)
        attn_out = self.output_proj(attn_out)
        return attn_out


class DecoderCell(RNNCell):
    """Stacked LSTM decoder cell with input feeding and attention.

    State structure is ``[lstm_states, input_feed]`` where ``input_feed``
    is the previous step's attentional output.
    """

    def __init__(self,
                 num_layers,
                 input_size,
                 hidden_size,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(DecoderCell, self).__init__()
        self.dropout_prob = dropout_prob
        # use add_sublayer to add multi-layers
        self.lstm_cells = []
        for i in range(num_layers):
            self.lstm_cells.append(
                self.add_sublayer(
                    "lstm_%d" % i,
                    BasicLSTMCell(
                        # first layer consumes [embedding; input_feed],
                        # hence input_size + hidden_size
                        input_size=input_size + hidden_size
                        if i == 0 else hidden_size,
                        hidden_size=hidden_size)))
        self.attention_layer = AttentionLayer(hidden_size)

    def forward(self,
                step_input,
                states,
                encoder_output,
                encoder_padding_mask=None):
        lstm_states, input_feed = states
        new_lstm_states = []
        # input feeding: concat previous attentional output onto the input
        step_input = layers.concat([step_input, input_feed], 1)
        for i, lstm_cell in enumerate(self.lstm_cells):
            out, new_lstm_state = lstm_cell(step_input, lstm_states[i])
            step_input = layers.dropout(
                out, self.dropout_prob) if self.dropout_prob > 0 else out
            new_lstm_states.append(new_lstm_state)
        out = self.attention_layer(step_input, encoder_output,
                                   encoder_padding_mask)
        # `out` doubles as next step's input_feed
        return out, [new_lstm_states, out]


class Decoder(Layer):
    """Embedding + attentional stacked-LSTM + output projection."""

    def __init__(self,
                 vocab_size,
                 embed_dim,
                 hidden_size,
                 num_layers,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(Decoder, self).__init__()
        self.embedder = Embedding(
            size=[vocab_size, embed_dim],
            param_attr=ParamAttr(initializer=UniformInitializer(
                low=-init_scale, high=init_scale)))
        # NOTE(review): init_scale is passed positionally into DecoderCell's
        # dropout_prob slot and dropout_prob is never forwarded — confirm
        # whether this is intended.
        self.lstm_attention = RNN(DecoderCell(num_layers, embed_dim,
                                              hidden_size, init_scale),
                                  is_reverse=False,
                                  time_major=False)
        self.output_layer = Linear(
            hidden_size,
            vocab_size,
            param_attr=ParamAttr(initializer=UniformInitializer(
                low=-init_scale, high=init_scale)),
            bias_attr=False)

    def forward(self, target, decoder_initial_states, encoder_output,
                encoder_padding_mask):
        inputs = self.embedder(target)
        decoder_output, _ = self.lstm_attention(
            inputs,
            initial_states=decoder_initial_states,
            encoder_output=encoder_output,
            encoder_padding_mask=encoder_padding_mask)
        predict = self.output_layer(decoder_output)
        return predict


class AttentionModel(Model):
    """Seq2seq with attention: Encoder (from seq2seq_base) + attentional
    Decoder; forward returns (logits, target_padding_mask)."""

    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 embed_dim,
                 hidden_size,
                 num_layers,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(AttentionModel, self).__init__()
        self.hidden_size = hidden_size
        self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
                               num_layers, dropout_prob, init_scale)
        self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
                               num_layers, dropout_prob, init_scale)

    def forward(self, src, src_length, trg, trg_length):
        # encoder
        encoder_output, encoder_final_state = self.encoder(src, src_length)

        # decoder initial states: use input_feed and the structure is
        # [[h,c] * num_layers, input_feed], consistent with DecoderCell.states
        decoder_initial_states = [
            encoder_final_state,
            self.decoder.lstm_attention.cell.get_initial_states(
                batch_ref=encoder_output, shape=[self.hidden_size])
        ]
        # attention mask to avoid paying attention on padddings
        src_mask = layers.sequence_mask(
            src_length,
            maxlen=layers.shape(src)[1],
            dtype=encoder_output.dtype)
        encoder_padding_mask = (src_mask - 1.0) * 1e9
        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

        # decoder with attentioon
        predict = self.decoder(trg, decoder_initial_states, encoder_output,
                               encoder_padding_mask)

        # for target padding mask
        mask = layers.sequence_mask(
            trg_length, maxlen=layers.shape(trg)[1], dtype=predict.dtype)
        return predict, mask


class AttentionInferModel(AttentionModel):
    """Inference variant: shares parameters with AttentionModel and decodes
    with beam search via DynamicDecode."""

    def __init__(self,
                 vocab_size,
                 embed_dim,
                 hidden_size,
                 num_layers,
                 dropout_prob=0.,
                 bos_id=0,
                 eos_id=1,
                 beam_size=4,
                 max_out_len=256):
        # forward everything except beam params to the parent constructor
        args = dict(locals())
        args.pop("self")
        args.pop("__class__", None)  # py3
        self.beam_size = args.pop("beam_size")
        self.max_out_len = args.pop("max_out_len")
        super(AttentionInferModel, self).__init__(**args)
        # dynamic decoder for inference
        decoder = BeamSearchDecoder(
            self.decoder.lstm_attention.cell,
            start_token=bos_id,
            end_token=eos_id,
            beam_size=beam_size,
            embedding_fn=self.decoder.embedder,
            output_fn=self.decoder.output_layer)
        self.beam_search_decoder = DynamicDecode(
            decoder, max_step_num=max_out_len, is_test=True)

    def forward(self, src, src_length):
        # encoding
        encoder_output, encoder_final_state = self.encoder(src, src_length)

        # decoder initial states
        decoder_initial_states = [
            encoder_final_state,
            self.decoder.lstm_attention.cell.get_initial_states(
                batch_ref=encoder_output, shape=[self.hidden_size])
        ]
        # attention mask to avoid paying attention on padddings
        src_mask = layers.sequence_mask(
            src_length,
            maxlen=layers.shape(src)[1],
            dtype=encoder_output.dtype)
        encoder_padding_mask = (src_mask - 1.0) * 1e9
        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

        # Tile the batch dimension with beam_size
        encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_output, self.beam_size)
        encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_padding_mask, self.beam_size)

        # dynamic decoding with beam search
        rs, _ = self.beam_search_decoder(
            inits=decoder_initial_states,
            encoder_output=encoder_output,
            encoder_padding_mask=encoder_padding_mask)
        return rs
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import UniformInitializer
from paddle.fluid.dygraph import Embedding, Linear, Layer
from paddle.fluid.layers import BeamSearchDecoder
from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
from model import Model, Loss


class CrossEntropyCriterion(Loss):
    """Masked, length-normalized cross entropy over decoder logits."""

    def __init__(self):
        super(CrossEntropyCriterion, self).__init__()

    def forward(self, outputs, labels):
        # outputs comes from BaseModel/AttentionModel.forward as
        # (logits, target_padding_mask); labels[0] holds the gold ids.
        (predict, mask), label = outputs, labels[0]

        cost = layers.softmax_with_cross_entropy(
            logits=predict, label=label, soft_label=False)
        # zero out padded target positions before reducing
        masked_cost = layers.elementwise_mul(cost, mask, axis=0)
        # mean over batch, then sum over time steps
        batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0])
        seq_cost = layers.reduce_sum(batch_mean_cost)
        return seq_cost


class EncoderCell(RNNCell):
    """Stacked LSTM cell with inter-layer dropout; also reused as the
    decoder cell of the non-attention model (see DecoderCell alias)."""

    def __init__(self,
                 num_layers,
                 input_size,
                 hidden_size,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(EncoderCell, self).__init__()
        self.dropout_prob = dropout_prob
        # use add_sublayer to add multi-layers
        self.lstm_cells = []
        for i in range(num_layers):
            self.lstm_cells.append(
                self.add_sublayer(
                    "lstm_%d" % i,
                    BasicLSTMCell(
                        input_size=input_size if i == 0 else hidden_size,
                        hidden_size=hidden_size,
                        param_attr=ParamAttr(initializer=UniformInitializer(
                            low=-init_scale, high=init_scale)))))

    def forward(self, step_input, states):
        # run the stack bottom-up; dropout applies between layers only
        new_states = []
        for i, lstm_cell in enumerate(self.lstm_cells):
            out, new_state = lstm_cell(step_input, states[i])
            step_input = layers.dropout(
                out, self.dropout_prob) if self.dropout_prob > 0 else out
            new_states.append(new_state)
        return step_input, new_states

    @property
    def state_shape(self):
        return [cell.state_shape for cell in self.lstm_cells]


class Encoder(Layer):
    """Embedding + stacked unidirectional LSTM encoder."""

    def __init__(self,
                 vocab_size,
                 embed_dim,
                 hidden_size,
                 num_layers,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(Encoder, self).__init__()
        self.embedder = Embedding(
            size=[vocab_size, embed_dim],
            param_attr=ParamAttr(initializer=UniformInitializer(
                low=-init_scale, high=init_scale)))
        # NOTE(review): init_scale is passed positionally into EncoderCell's
        # dropout_prob slot and dropout_prob is never forwarded — confirm
        # whether this is intended.
        self.stack_lstm = RNN(EncoderCell(num_layers, embed_dim, hidden_size,
                                          init_scale),
                              is_reverse=False,
                              time_major=False)

    def forward(self, sequence, sequence_length):
        inputs = self.embedder(sequence)
        encoder_output, encoder_state = self.stack_lstm(
            inputs, sequence_length=sequence_length)
        return encoder_output, encoder_state


# The plain decoder cell is structurally identical to the encoder cell.
DecoderCell = EncoderCell


class Decoder(Layer):
    """Embedding + stacked-LSTM + vocabulary projection (no attention)."""

    def __init__(self,
                 vocab_size,
                 embed_dim,
                 hidden_size,
                 num_layers,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(Decoder, self).__init__()
        self.embedder = Embedding(
            size=[vocab_size, embed_dim],
            param_attr=ParamAttr(initializer=UniformInitializer(
                low=-init_scale, high=init_scale)))
        # NOTE(review): same positional init_scale/dropout_prob concern as
        # in Encoder above.
        self.stack_lstm = RNN(DecoderCell(num_layers, embed_dim, hidden_size,
                                          init_scale),
                              is_reverse=False,
                              time_major=False)
        self.output_layer = Linear(
            hidden_size,
            vocab_size,
            param_attr=ParamAttr(initializer=UniformInitializer(
                low=-init_scale, high=init_scale)),
            bias_attr=False)

    def forward(self, target, decoder_initial_states):
        inputs = self.embedder(target)
        decoder_output, _ = self.stack_lstm(
            inputs, initial_states=decoder_initial_states)
        predict = self.output_layer(decoder_output)
        return predict


class BaseModel(Model):
    """Vanilla seq2seq: decoder is initialized from the encoder's final
    states; forward returns (logits, target_padding_mask)."""

    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 embed_dim,
                 hidden_size,
                 num_layers,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(BaseModel, self).__init__()
        self.hidden_size = hidden_size
        self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
                               num_layers, dropout_prob, init_scale)
        self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
                               num_layers, dropout_prob, init_scale)

    def forward(self, src, src_length, trg, trg_length):
        # encoder
        encoder_output, encoder_final_states = self.encoder(src, src_length)

        # decoder
        predict = self.decoder(trg, encoder_final_states)

        # for target padding mask
        mask = layers.sequence_mask(
            trg_length, maxlen=layers.shape(trg)[1], dtype=predict.dtype)
        return predict, mask


class BaseInferModel(BaseModel):
    """Inference variant of BaseModel decoding with beam search."""

    def __init__(self,
                 vocab_size,
                 embed_dim,
                 hidden_size,
                 num_layers,
                 dropout_prob=0.,
                 bos_id=0,
                 eos_id=1,
                 beam_size=4,
                 max_out_len=256):
        # forward everything except beam params to the parent constructor
        args = dict(locals())
        args.pop("self")
        args.pop("__class__", None)  # py3
        self.beam_size = args.pop("beam_size")
        self.max_out_len = args.pop("max_out_len")
        super(BaseInferModel, self).__init__(**args)
        # dynamic decoder for inference
        decoder = BeamSearchDecoder(
            self.decoder.stack_lstm.cell,
            start_token=bos_id,
            end_token=eos_id,
            beam_size=beam_size,
            embedding_fn=self.decoder.embedder,
            output_fn=self.decoder.output_layer)
        self.beam_search_decoder = DynamicDecode(
            decoder, max_step_num=max_out_len, is_test=True)

    def forward(self, src, src_length):
        # encoding
        encoder_output, encoder_final_states = self.encoder(src, src_length)
        # dynamic decoding with beam search
        rs, _ = self.beam_search_decoder(inits=encoder_final_states)
        return rs
import logging
import os
import six
import sys
# make sibling top-level packages (model, callbacks, metrics) importable
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import time
import contextlib
from functools import partial

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from paddle.fluid.io import DataLoader
from paddle.fluid.dygraph_grad_clip import GradClipByGlobalNorm

import reader
from args import parse_args
from seq2seq_base import BaseModel, CrossEntropyCriterion
from seq2seq_attn import AttentionModel
from model import Input, set_device
from callbacks import ProgBarLogger
from metrics import Metric


class PPL(Metric):
    # placeholder perplexity metric — not implemented yet
    pass


def do_train(args):
    """Train the attention seq2seq model.

    NOTE(review): in its current state this runs a smoke test on random
    data and calls ``exit(0)``; the real data pipeline below is dead code.
    """
    device = set_device("gpu" if args.use_gpu else "cpu")
    fluid.enable_dygraph(device)  #if args.eager_run else None

    # define model
    inputs = [
        Input(
            [None, None], "int64", name="src_word"),
        Input(
            [None], "int64", name="src_length"),
        Input(
            [None, None], "int64", name="trg_word"),
        Input(
            [None], "int64", name="trg_length"),
    ]
    labels = [Input([None, None, 1], "int64", name="label"), ]

    # embed_dim is deliberately set equal to hidden_size here
    model = AttentionModel(args.src_vocab_size, args.tar_vocab_size,
                           args.hidden_size, args.hidden_size, args.num_layers,
                           args.dropout)

    model.prepare(
        fluid.optimizer.Adam(
            learning_rate=args.learning_rate,
            parameter_list=model.parameters()),
        CrossEntropyCriterion(),
        inputs=inputs,
        labels=labels)

    # fixed-size random smoke-test configuration
    batch_size = 32
    src_seq_len = 10
    trg_seq_len = 12
    iter_num = 10

    def random_generator():
        # yields random (src, src_length, trg, trg_length, label) batches;
        # ids start at 2 to stay clear of the <bos>/<eos> ids
        for i in range(iter_num):
            src = np.random.randint(2, args.src_vocab_size,
                                    (batch_size, src_seq_len)).astype("int64")
            src_length = np.random.randint(1, src_seq_len,
                                           (batch_size, )).astype("int64")
            trg = np.random.randint(2, args.tar_vocab_size,
                                    (batch_size, trg_seq_len)).astype("int64")
            trg_length = np.random.randint(1, trg_seq_len,
                                           (batch_size, )).astype("int64")
            label = np.random.randint(
                1, trg_seq_len, (batch_size, trg_seq_len, 1)).astype("int64")
            yield src, src_length, trg, trg_length, label

    model.fit(train_data=random_generator, log_freq=1)
    # NOTE(review): everything past this exit is unreachable dead code;
    # data_loaders are never actually constructed (placeholders only).
    exit(0)

    data_loaders = [None, None]
    data_files = [args.training_file, args.validation_file
                  ] if args.validation_file else [args.training_file]
    train_loader, eval_loader = data_loaders

    model.fit(train_data=train_loader,
              eval_data=None,
              epochs=1,
              eval_freq=1,
              save_freq=1,
              verbose=2)


if __name__ == "__main__":
    args = parse_args()
    do_train(args)
batch, and generate the + corresponding position data and attention bias. + """ + inst_lens = np.array([len(inst) for inst in insts], dtype="int64") + max_len = np.max(inst_lens) + inst_data = np.array( + [inst + [pad_id] * (max_len - len(inst)) for inst in insts], + dtype="int64") + return inst_data, inst_lens + + +class SortType(object): + GLOBAL = 'global' + POOL = 'pool' + NONE = "none" + + +class Converter(object): + def __init__(self, vocab, beg, end, unk, delimiter, add_beg, add_end): + self._vocab = vocab + self._beg = beg + self._end = end + self._unk = unk + self._delimiter = delimiter + self._add_beg = add_beg + self._add_end = add_end + + def __call__(self, sentence): + return ([self._beg] if self._add_beg else []) + [ + self._vocab.get(w, self._unk) + for w in sentence.split(self._delimiter) + ] + ([self._end] if self._add_end else []) + + +class ComposedConverter(object): + def __init__(self, converters): + self._converters = converters + + def __call__(self, fields): + return [ + converter(field) + for field, converter in zip(fields, self._converters) + ] + + +class SentenceBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self._batch_size = batch_size + + def append(self, info): + self.batch.append(info) + if len(self.batch) == self._batch_size: + tmp = self.batch + self.batch = [] + return tmp + + +class TokenBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self.max_len = -1 + self._batch_size = batch_size + + def append(self, info): + cur_len = info.max_len + max_len = max(self.max_len, cur_len) + if max_len * (len(self.batch) + 1) > self._batch_size: + result = self.batch + self.batch = [info] + self.max_len = cur_len + return result else: - return f.read().decode("utf-8").replace(u"\n", u"").split() - - -def read_all_line(filenam): - data = [] - with io.open(filename, "r", encoding='utf-8') as f: - for line in f.readlines(): - data.append(line.strip()) - - -def _build_vocab(filename): - - 
vocab_dict = {} - ids = 0 - with io.open(filename, "r", encoding='utf-8') as f: - for line in f.readlines(): - vocab_dict[line.strip()] = ids - ids += 1 - - print("vocab word num", ids) - - return vocab_dict - - -def _para_file_to_ids(src_file, tar_file, src_vocab, tar_vocab): - - src_data = [] - with io.open(src_file, "r", encoding='utf-8') as f_src: - for line in f_src.readlines(): - arra = line.strip().split() - ids = [src_vocab[w] if w in src_vocab else UNK_ID for w in arra] - ids = ids - - src_data.append(ids) - - tar_data = [] - with io.open(tar_file, "r", encoding='utf-8') as f_tar: - for line in f_tar.readlines(): - arra = line.strip().split() - ids = [tar_vocab[w] if w in tar_vocab else UNK_ID for w in arra] - - ids = [1] + ids + [2] - - tar_data.append(ids) - - return src_data, tar_data - - -def filter_len(src, tar, max_sequence_len=50): - new_src = [] - new_tar = [] - - for id1, id2 in zip(src, tar): - if len(id1) > max_sequence_len: - id1 = id1[:max_sequence_len] - if len(id2) > max_sequence_len + 2: - id2 = id2[:max_sequence_len + 2] - - new_src.append(id1) - new_tar.append(id2) - - return new_src, new_tar - + self.max_len = max_len + self.batch.append(info) -def raw_data(src_lang, - tar_lang, - vocab_prefix, - train_prefix, - eval_prefix, - test_prefix, - max_sequence_len=50): - src_vocab_file = vocab_prefix + "." + src_lang - tar_vocab_file = vocab_prefix + "." + tar_lang +class SampleInfo(object): + def __init__(self, i, max_len, min_len): + self.i = i + self.min_len = min_len + self.max_len = max_len - src_train_file = train_prefix + "." + src_lang - tar_train_file = train_prefix + "." + tar_lang - src_eval_file = eval_prefix + "." + src_lang - tar_eval_file = eval_prefix + "." + tar_lang +class MinMaxFilter(object): + def __init__(self, max_len, min_len, underlying_creator): + self._min_len = min_len + self._max_len = max_len + self._creator = underlying_creator - src_test_file = test_prefix + "." + src_lang - tar_test_file = test_prefix + "." 
+ tar_lang - - src_vocab = _build_vocab(src_vocab_file) - tar_vocab = _build_vocab(tar_vocab_file) - - train_src, train_tar = _para_file_to_ids( src_train_file, tar_train_file, \ - src_vocab, tar_vocab ) - train_src, train_tar = filter_len( - train_src, train_tar, max_sequence_len=max_sequence_len) - eval_src, eval_tar = _para_file_to_ids( src_eval_file, tar_eval_file, \ - src_vocab, tar_vocab ) - - test_src, test_tar = _para_file_to_ids( src_test_file, tar_test_file, \ - src_vocab, tar_vocab ) - - return ( train_src, train_tar), (eval_src, eval_tar), (test_src, test_tar),\ - (src_vocab, tar_vocab) - - -def raw_mono_data(vocab_file, file_path): - - src_vocab = _build_vocab(vocab_file) - - test_src, test_tar = _para_file_to_ids( file_path, file_path, \ - src_vocab, src_vocab ) - - return (test_src, test_tar) - - -def get_data_iter(raw_data, - batch_size, - mode='train', - enable_ce=False, - cache_num=20): - - src_data, tar_data = raw_data - - data_len = len(src_data) - - index = np.arange(data_len) - if mode == "train" and not enable_ce: - np.random.shuffle(index) - - def to_pad_np(data, source=False): - max_len = 0 - bs = min(batch_size, len(data)) - for ele in data: - if len(ele) > max_len: - max_len = len(ele) - - ids = np.ones((bs, max_len), dtype='int64') * 2 - mask = np.zeros((bs), dtype='int32') - - for i, ele in enumerate(data): - ids[i, :len(ele)] = ele - if not source: - mask[i] = len(ele) - 1 - else: - mask[i] = len(ele) - - return ids, mask - - b_src = [] - - if mode != "train": - cache_num = 1 - for j in range(data_len): - if len(b_src) == batch_size * cache_num: - # build batch size - - # sort - if mode == 'infer': - new_cache = b_src + def append(self, info): + if info.max_len > self._max_len or info.min_len < self._min_len: + return + else: + return self._creator.append(info) + + @property + def batch(self): + return self._creator.batch + + +class Seq2SeqDataset(Dataset): + def __init__(self, + src_vocab_fpath, + trg_vocab_fpath, + fpattern, + 
field_delimiter="\t", + token_delimiter=" ", + start_mark="", + end_mark="", + unk_mark="", + only_src=False, + trg_fpattern=None): + # convert str to bytes, and use byte data + # field_delimiter = field_delimiter.encode("utf8") + # token_delimiter = token_delimiter.encode("utf8") + # start_mark = start_mark.encode("utf8") + # end_mark = end_mark.encode("utf8") + # unk_mark = unk_mark.encode("utf8") + self._src_vocab = self.load_dict(src_vocab_fpath) + self._trg_vocab = self.load_dict(trg_vocab_fpath) + self._bos_idx = self._src_vocab[start_mark] + self._eos_idx = self._src_vocab[end_mark] + self._unk_idx = self._src_vocab[unk_mark] + self._only_src = only_src + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter + self.load_src_trg_ids(fpattern, trg_fpattern) + + def load_src_trg_ids(self, fpattern, trg_fpattern=None): + src_converter = Converter( + vocab=self._src_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=False, + add_end=False) + + trg_converter = Converter( + vocab=self._trg_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=False, + add_end=False) + + converters = ComposedConverter([src_converter, trg_converter]) + + self._src_seq_ids = [] + self._trg_seq_ids = [] + self._sample_infos = [] + + slots = [self._src_seq_ids, self._trg_seq_ids] + lens = [] + for i, line in enumerate(self._load_lines(fpattern, trg_fpattern)): + lens = [] + for field, slot in zip(converters(line), slots): + slot.append(field) + lens.append(len(field)) + # self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) + self._sample_infos.append(SampleInfo(i, lens[0], lens[0])) + + def _load_lines(self, fpattern, trg_fpattern=None): + fpaths = glob.glob(fpattern) + fpaths = sorted(fpaths) # TODO: Add custum sort + assert len(fpaths) > 0, "no matching file to the provided data path" + + if trg_fpattern is None: + for 
fpath in fpaths: + # with io.open(fpath, "rb") as f: + with io.open(fpath, "r", encoding="utf8") as f: + for line in f: + fields = line.strip("\n").split(self._field_delimiter) + yield fields + else: + # separated source and target language data files + # assume we can get aligned data by sort the two language files + # TODO: Need more rigorous check + trg_fpaths = glob.glob(trg_fpattern) + trg_fpaths = sorted(trg_fpaths) + assert len(fpaths) == len( + trg_fpaths + ), "the number of source language data files must equal \ + with that of source language" + + for fpath, trg_fpath in zip(fpaths, trg_fpaths): + # with io.open(fpath, "rb") as f: + # with io.open(trg_fpath, "rb") as trg_f: + with io.open(fpath, "r", encoding="utf8") as f: + with io.open(trg_fpath, "r", encoding="utf8") as trg_f: + for line in zip(f, trg_f): + fields = [field.strip("\n") for field in line] + yield fields + + @staticmethod + def load_dict(dict_path, reverse=False): + word_dict = {} + # with io.open(dict_path, "rb") as fdict: + with io.open(dict_path, "r", encoding="utf8") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip("\n") + else: + word_dict[line.strip("\n")] = idx + return word_dict + + def get_vocab_summary(self): + return len(self._src_vocab), len( + self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx + + def __getitem__(self, idx): + return (self._src_seq_ids[idx], self._trg_seq_ids[idx] + ) if self._trg_seq_ids else self._src_seq_ids[idx] + + def __len__(self): + return len(self._sample_infos) + + +class Seq2SeqBatchSampler(BatchSampler): + def __init__(self, + dataset, + batch_size, + pool_size=10000, + sort_type=SortType.NONE, + min_length=0, + max_length=100, + shuffle=False, + shuffle_batch=False, + use_token_batch=False, + clip_last_batch=False, + seed=None): + for arg, value in locals().items(): + if arg != "self": + setattr(self, "_" + arg, value) + self._random = np.random + self._random.seed(seed) + # for multi-devices + 
self._nranks = ParallelEnv().nranks + self._local_rank = ParallelEnv().local_rank + self._device_id = ParallelEnv().dev_id + + def __iter__(self): + # global sort or global shuffle + if self._sort_type == SortType.GLOBAL: + infos = sorted( + self._dataset._sample_infos, key=lambda x: x.max_len) + else: + if self._shuffle: + infos = self._dataset._sample_infos + self._random.shuffle(infos) else: - new_cache = sorted(b_src, key=lambda k: len(k[0])) - - for i in range(cache_num): - batch_data = new_cache[i * batch_size:(i + 1) * batch_size] - src_cache = [w[0] for w in batch_data] - tar_cache = [w[1] for w in batch_data] - src_ids, src_mask = to_pad_np(src_cache, source=True) - tar_ids, tar_mask = to_pad_np(tar_cache) - yield (src_ids, src_mask, tar_ids, tar_mask) - - b_src = [] - - b_src.append((src_data[index[j]], tar_data[index[j]])) - if len(b_src) == batch_size * cache_num or mode == 'infer': - if mode == 'infer': - new_cache = b_src + infos = self._dataset._sample_infos + + if self._sort_type == SortType.POOL: + reverse = True + for i in range(0, len(infos), self._pool_size): + # to avoid placing short next to long sentences + reverse = not reverse + infos[i:i + self._pool_size] = sorted( + infos[i:i + self._pool_size], + key=lambda x: x.max_len, + reverse=reverse) + + batches = [] + batch_creator = TokenBatchCreator( + self. 
+ _batch_size) if self._use_token_batch else SentenceBatchCreator( + self._batch_size * self._nranks) + batch_creator = MinMaxFilter(self._max_length, self._min_length, + batch_creator) + + for info in infos: + batch = batch_creator.append(info) + if batch is not None: + batches.append(batch) + + if not self._clip_last_batch and len(batch_creator.batch) != 0: + batches.append(batch_creator.batch) + + if self._shuffle_batch: + self._random.shuffle(batches) + + if not self._use_token_batch: + # when producing batches according to sequence number, to confirm + # neighbor batches which would be feed and run parallel have similar + # length (thus similar computational cost) after shuffle, we as take + # them as a whole when shuffling and split here + batches = [[ + batch[self._batch_size * i:self._batch_size * (i + 1)] + for i in range(self._nranks) + ] for batch in batches] + batches = list(itertools.chain.from_iterable(batches)) + + # for multi-device + for batch_id, batch in enumerate(batches): + if batch_id % self._nranks == self._local_rank: + batch_indices = [info.i for info in batch] + yield batch_indices + if self._local_rank > len(batches) % self._nranks: + yield batch_indices + + def __len__(self): + if not self._use_token_batch: + batch_number = ( + len(self._dataset) + self._batch_size * self._nranks - 1) // ( + self._batch_size * self._nranks) else: - new_cache = sorted(b_src, key=lambda k: len(k[0])) - - for i in range(cache_num): - batch_end = min(len(new_cache), (i + 1) * batch_size) - batch_data = new_cache[i * batch_size:batch_end] - src_cache = [w[0] for w in batch_data] - tar_cache = [w[1] for w in batch_data] - src_ids, src_mask = to_pad_np(src_cache, source=True) - tar_ids, tar_mask = to_pad_np(tar_cache) - yield (src_ids, src_mask, tar_ids, tar_mask) + batch_number = 100 + return batch_number diff --git a/seq2seq/seq2seq_attn.py b/seq2seq/seq2seq_attn.py index b71018e479bc7c..599d25e5beaf35 100644 --- a/seq2seq/seq2seq_attn.py +++ 
b/seq2seq/seq2seq_attn.py @@ -41,9 +41,10 @@ def __init__(self, hidden_size, bias=False, init_scale=0.1): bias_attr=bias) def forward(self, hidden, encoder_output, encoder_padding_mask): - query = self.input_proj(hidden) + # query = self.input_proj(hidden) + encoder_output = self.input_proj(encoder_output) attn_scores = layers.matmul( - layers.unsqueeze(query, [1]), encoder_output, transpose_y=True) + layers.unsqueeze(hidden, [1]), encoder_output, transpose_y=True) if encoder_padding_mask is not None: attn_scores = layers.elementwise_add(attn_scores, encoder_padding_mask) @@ -73,7 +74,9 @@ def __init__(self, BasicLSTMCell( input_size=input_size + hidden_size if i == 0 else hidden_size, - hidden_size=hidden_size))) + hidden_size=hidden_size, + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale))))) self.attention_layer = AttentionLayer(hidden_size) def forward(self, @@ -107,8 +110,8 @@ def __init__(self, size=[vocab_size, embed_dim], param_attr=ParamAttr(initializer=UniformInitializer( low=-init_scale, high=init_scale))) - self.lstm_attention = RNN(DecoderCell(num_layers, embed_dim, - hidden_size, init_scale), + self.lstm_attention = RNN(DecoderCell( + num_layers, embed_dim, hidden_size, dropout_prob, init_scale), is_reverse=False, time_major=False) self.output_layer = Linear( diff --git a/seq2seq/seq2seq_base.py b/seq2seq/seq2seq_base.py index f56a873e1460dc..ae2cb4bbba4c31 100644 --- a/seq2seq/seq2seq_base.py +++ b/seq2seq/seq2seq_base.py @@ -86,7 +86,7 @@ def __init__(self, param_attr=ParamAttr(initializer=UniformInitializer( low=-init_scale, high=init_scale))) self.stack_lstm = RNN(EncoderCell(num_layers, embed_dim, hidden_size, - init_scale), + dropout_prob, init_scale), is_reverse=False, time_major=False) @@ -114,7 +114,7 @@ def __init__(self, param_attr=ParamAttr(initializer=UniformInitializer( low=-init_scale, high=init_scale))) self.stack_lstm = RNN(DecoderCell(num_layers, embed_dim, hidden_size, - init_scale), + 
dropout_prob, init_scale), is_reverse=False, time_major=False) self.output_layer = Linear( diff --git a/seq2seq/train.py b/seq2seq/train.py index 3ca8ae61a97df9..70f9315923a96a 100644 --- a/seq2seq/train.py +++ b/seq2seq/train.py @@ -17,8 +17,7 @@ import six import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import time -import contextlib +import random from functools import partial import numpy as np @@ -34,16 +33,17 @@ from seq2seq_attn import AttentionModel from model import Input, set_device from callbacks import ProgBarLogger -from metrics import Metric - - -class PPL(Metric): - pass +from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_train_input def do_train(args): device = set_device("gpu" if args.use_gpu else "cpu") - fluid.enable_dygraph(device) #if args.eager_run else None + fluid.enable_dygraph(device) if args.eager_run else None + + if args.enable_ce: + fluid.default_main_program().random_seed = 102 + fluid.default_startup_program().random_seed = 102 + args.shuffle = False # define model inputs = [ @@ -58,6 +58,45 @@ def do_train(args): ] labels = [Input([None, None, 1], "int64", name="label"), ] + # def dataloader + data_loaders = [None, None] + data_prefixes = [args.train_data_prefix, args.eval_data_prefix + ] if args.eval_data_prefix else [args.train_data_prefix] + for i, data_prefix in enumerate(data_prefixes): + dataset = Seq2SeqDataset( + fpattern=data_prefix + "." + args.src_lang, + trg_fpattern=data_prefix + "." + args.tar_lang, + src_vocab_fpath=args.vocab_prefix + "." + args.src_lang, + trg_vocab_fpath=args.vocab_prefix + "." 
+ args.tar_lang, + token_delimiter=None, + start_mark="", + end_mark="", + unk_mark="") + (args.src_vocab_size, args.trg_vocab_size, bos_id, eos_id, + unk_id) = dataset.get_vocab_summary() + batch_sampler = Seq2SeqBatchSampler( + dataset=dataset, + use_token_batch=False, + batch_size=args.batch_size, + pool_size=args.batch_size * 20, + sort_type=SortType.POOL, + shuffle=args.shuffle) + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + feed_list=None if fluid.in_dygraph_mode() else + [x.forward() for x in inputs + labels], + collate_fn=partial( + prepare_train_input, + bos_id=bos_id, + eos_id=eos_id, + pad_id=eos_id), + num_workers=0, + return_list=True) + data_loaders[i] = data_loader + train_loader, eval_loader = data_loaders + model = AttentionModel(args.src_vocab_size, args.tar_vocab_size, args.hidden_size, args.hidden_size, args.num_layers, args.dropout) @@ -69,39 +108,12 @@ def do_train(args): CrossEntropyCriterion(), inputs=inputs, labels=labels) - - batch_size = 32 - src_seq_len = 10 - trg_seq_len = 12 - iter_num = 10 - - def random_generator(): - for i in range(iter_num): - src = np.random.randint(2, args.src_vocab_size, - (batch_size, src_seq_len)).astype("int64") - src_length = np.random.randint(1, src_seq_len, - (batch_size, )).astype("int64") - trg = np.random.randint(2, args.tar_vocab_size, - (batch_size, trg_seq_len)).astype("int64") - trg_length = np.random.randint(1, trg_seq_len, - (batch_size, )).astype("int64") - label = np.random.randint( - 1, trg_seq_len, (batch_size, trg_seq_len, 1)).astype("int64") - yield src, src_length, trg, trg_length, label - - model.fit(train_data=random_generator, log_freq=1) - exit(0) - - data_loaders = [None, None] - data_files = [args.training_file, args.validation_file - ] if args.validation_file else [args.training_file] - train_loader, eval_loader = data_loaders - model.fit(train_data=train_loader, - eval_data=None, + eval_data=eval_loader, epochs=1, eval_freq=1, 
save_freq=1, + log_freq=1, verbose=2) From e4e393c8c860326a1a55b4981407bd6468865958 Mon Sep 17 00:00:00 2001 From: guosheng Date: Sat, 4 Apr 2020 23:23:38 +0800 Subject: [PATCH 03/13] Add Additive Attention followed by GRU --- seq2seq/seq2seq_add_attn.py | 353 ++++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 seq2seq/seq2seq_add_attn.py diff --git a/seq2seq/seq2seq_add_attn.py b/seq2seq/seq2seq_add_attn.py new file mode 100644 index 00000000000000..710429b80289c4 --- /dev/null +++ b/seq2seq/seq2seq_add_attn.py @@ -0,0 +1,353 @@ +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit + +from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell +from model import Model, Loss + + +class ConvBNPool(fluid.dygraph.Layer): + def __init__(self, + out_ch, + channels, + act="relu", + is_test=False, + pool=True, + use_cudnn=True): + super(ConvBNPool, self).__init__() + self.pool = pool + + filter_size = 3 + conv_std_0 = (2.0 / (filter_size**2 * channels[0]))**0.5 + conv_param_0 = fluid.ParamAttr( + initializer=fluid.initializer.Normal(0.0, conv_std_0)) + + conv_std_1 = (2.0 / (filter_size**2 * channels[1]))**0.5 + conv_param_1 = fluid.ParamAttr( + initializer=fluid.initializer.Normal(0.0, conv_std_1)) + + self.conv_0_layer = Conv2D( + channels[0], + out_ch[0], + 3, + padding=1, + param_attr=conv_param_0, + bias_attr=False, + act=None, + use_cudnn=use_cudnn) + self.bn_0_layer = BatchNorm(out_ch[0], act=act, is_test=is_test) + self.conv_1_layer = Conv2D( + out_ch[0], + num_filters=out_ch[1], + filter_size=3, + padding=1, + param_attr=conv_param_1, + bias_attr=False, + act=None, + use_cudnn=use_cudnn) + self.bn_1_layer = BatchNorm(out_ch[1], act=act, is_test=is_test) + + if self.pool: + self.pool_layer = Pool2D( + pool_size=2, + pool_type='max', + pool_stride=2, + use_cudnn=use_cudnn, + ceil_mode=True) + + def forward(self, inputs): + conv_0 = 
self.conv_0_layer(inputs) + bn_0 = self.bn_0_layer(conv_0) + conv_1 = self.conv_1_layer(bn_0) + bn_1 = self.bn_1_layer(conv_1) + if self.pool: + bn_pool = self.pool_layer(bn_1) + + return bn_pool + return bn_1 + + +class OCRConv(fluid.dygraph.Layer): + def __init__(self, is_test=False, use_cudnn=True): + super(OCRConv, self).__init__() + self.conv_bn_pool_1 = ConvBNPool( + [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn) + self.conv_bn_pool_2 = ConvBNPool( + [32, 32], [16, 32], is_test=is_test, use_cudnn=use_cudnn) + self.conv_bn_pool_3 = ConvBNPool( + [64, 64], [32, 64], is_test=is_test, use_cudnn=use_cudnn) + self.conv_bn_pool_4 = ConvBNPool( + [128, 128], [64, 128], + is_test=is_test, + pool=False, + use_cudnn=use_cudnn) + + def forward(self, inputs): + inputs_1 = self.conv_bn_pool_1(inputs) + inputs_2 = self.conv_bn_pool_2(inputs_1) + inputs_3 = self.conv_bn_pool_3(inputs_2) + inputs_4 = self.conv_bn_pool_4(inputs_3) + + return inputs_4 + + +class GRUCell(RNNCell): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + origin_mode=False, + init_size=None): + super(GRUCell, self).__init__() + + self.input_proj = Linear( + 768, size * 3, param_attr=param_attr, bias_attr=False) + + self.gru_unit = GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + + self.size = size + self.is_reverse = is_reverse + + def forward(self, inputs, states): + # step_outputs, new_states = cell(step_inputs, states) + # for GRUCell, `step_outputs` and `new_states` both are hidden + x = self.input_proj(inputs) + hidden, _, _ = self.gru_unit(x, states) + return hidden, hidden + + +class DecoderCell(RNNCell): + def __init__(self, size): + self.gru = GRUCell(size) + self.attention = SimpleAttention(size) + self.fc_1_layer = Linear( + input_dim=size * 2, output_dim=size * 3, 
bias_attr=False) + self.fc_2_layer = Linear( + input_dim=size, output_dim=size * 3, bias_attr=False) + + def forward(self, inputs, states, encoder_vec, encoder_proj): + context = self.attention(encoder_vec, encoder_proj, states) + fc_1 = self.fc_1_layer(context) + fc_2 = self.fc_2_layer(inputs) + decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2) + h, _ = self.gru(decoder_inputs, states) + return h, h + + +class Decoder(fluid.dygraph.Layer): + def __init__(self, size, num_classes): + super(Decoder, self).__init__() + self.embedder = Embedding(size=[num_classes, size]) + self.gru_attention = RNN(DecoderCell(size), + is_reverse=False, + time_major=False) + self.output_layer = Linear(size, num_classes, bias_attr=False) + + def forward(self, target, decoder_initial_states, encoder_vec, + encoder_proj): + inputs = self.embedder(target) + decoder_output, _ = self.gru_attention( + inputs, + initial_states=decoder_initial_states, + encoder_vec=encoder_vec, + encoder_proj=encoder_proj) + predict = self.output_layer(decoder_output) + return predict + + +class EncoderNet(fluid.dygraph.Layer): + def __init__(self, + batch_size, + decoder_size, + rnn_hidden_size=200, + is_test=False, + use_cudnn=True): + super(EncoderNet, self).__init__() + self.rnn_hidden_size = rnn_hidden_size + para_attr = fluid.ParamAttr( + initializer=fluid.initializer.Normal(0.0, 0.02)) + bias_attr = fluid.ParamAttr( + initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0) + self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn) + + self.fc_1_layer = Linear( + 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) + self.fc_2_layer = Linear( + 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) + self.gru_forward_layer = DynamicGRU( + size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu') + self.gru_backward_layer = DynamicGRU( + size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + 
candidate_activation='relu', + is_reverse=True) + + self.encoded_proj_fc = Linear( + rnn_hidden_size * 2, decoder_size, bias_attr=False) + + def forward(self, inputs): + conv_features = self.ocr_convs(inputs) + transpose_conv_features = fluid.layers.transpose( + conv_features, perm=[0, 3, 1, 2]) + + sliced_feature = fluid.layers.reshape( + transpose_conv_features, [ + -1, transpose_conv_features.shape[1], + transpose_conv_features.shape[2] * + transpose_conv_features.shape[3] + ], + inplace=False) + + fc_1 = self.fc_1_layer(sliced_feature) + + fc_2 = self.fc_2_layer(sliced_feature) + + gru_forward = self.gru_forward_layer(fc_1) + + gru_backward = self.gru_backward_layer(fc_2) + + encoded_vector = fluid.layers.concat( + input=[gru_forward, gru_backward], axis=2) + + encoded_proj = self.encoded_proj_fc(encoded_vector) + + return gru_backward, encoded_vector, encoded_proj + + +class SimpleAttention(fluid.dygraph.Layer): + def __init__(self, decoder_size): + super(SimpleAttention, self).__init__() + + self.fc_1 = Linear( + decoder_size, decoder_size, act=None, bias_attr=False) + self.fc_2 = Linear(decoder_size, 1, act=None, bias_attr=False) + + def forward(self, encoder_vec, encoder_proj, decoder_state): + + decoder_state_fc = self.fc_1(decoder_state) + + decoder_state_proj_reshape = fluid.layers.reshape( + decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]], + inplace=False) + decoder_state_expand = fluid.layers.expand( + decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1]) + concated = fluid.layers.elementwise_add(encoder_proj, + decoder_state_expand) + concated = fluid.layers.tanh(x=concated) + attention_weight = self.fc_2(concated) + weights_reshape = fluid.layers.reshape( + x=attention_weight, shape=[concated.shape[0], -1], inplace=False) + + weights_reshape = fluid.layers.softmax(weights_reshape) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weights_reshape, axis=0) + + context = fluid.layers.reduce_sum(scaled, dim=1) + + return context + + 
+class GRUDecoderWithAttention(fluid.dygraph.Layer): + def __init__(self, encoder_size, decoder_size, num_classes): + super(GRUDecoderWithAttention, self).__init__() + self.simple_attention = SimpleAttention(decoder_size) + + self.fc_1_layer = Linear( + input_dim=encoder_size * 2, + output_dim=decoder_size * 3, + bias_attr=False) + self.fc_2_layer = Linear( + input_dim=decoder_size, + output_dim=decoder_size * 3, + bias_attr=False) + self.gru_unit = GRUUnit( + size=decoder_size * 3, param_attr=None, bias_attr=None) + self.out_layer = Linear( + input_dim=decoder_size, + output_dim=num_classes + 2, + bias_attr=None, + act='softmax') + + self.decoder_size = decoder_size + + def forward(self, + current_word, + encoder_vec, + encoder_proj, + decoder_boot, + inference=False): + current_word = fluid.layers.reshape( + current_word, [-1, current_word.shape[2]], inplace=False) + + context = self.simple_attention(encoder_vec, encoder_proj, + decoder_boot) + fc_1 = self.fc_1_layer(context) + fc_2 = self.fc_2_layer(current_word) + decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2) + + h, _, _ = self.gru_unit(decoder_inputs, decoder_boot) + out = self.out_layer(h) + + return out, h + + +class OCRAttention(fluid.dygraph.Layer): + def __init__(self, batch_size, num_classes, encoder_size, decoder_size, + word_vector_dim): + super(OCRAttention, self).__init__() + self.encoder_net = EncoderNet(batch_size, decoder_size) + self.fc = Linear( + input_dim=encoder_size, + output_dim=decoder_size, + bias_attr=False, + act='relu') + self.embedding = Embedding( + [num_classes + 2, word_vector_dim], dtype='float32') + self.gru_decoder_with_attention = GRUDecoderWithAttention( + encoder_size, decoder_size, num_classes) + self.batch_size = batch_size + + def forward(self, inputs, label_in): + gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs) + backward_first = fluid.layers.slice( + gru_backward, axes=[1], starts=[0], ends=[1]) + backward_first = 
fluid.layers.reshape( + backward_first, [-1, backward_first.shape[2]], inplace=False) + + decoder_boot = self.fc(backward_first) + + label_in = fluid.layers.reshape(label_in, [-1], inplace=False) + trg_embedding = self.embedding(label_in) + + trg_embedding = fluid.layers.reshape( + trg_embedding, [self.batch_size, -1, trg_embedding.shape[1]], + inplace=False) + + pred_temp = [] + for i in range(trg_embedding.shape[1]): + current_word = fluid.layers.slice( + trg_embedding, axes=[1], starts=[i], ends=[i + 1]) + out, decoder_boot = self.gru_decoder_with_attention( + current_word, encoded_vector, encoded_proj, decoder_boot) + pred_temp.append(out) + pred_temp = fluid.layers.concat(pred_temp, axis=1) + + batch_size = trg_embedding.shape[0] + seq_len = trg_embedding.shape[1] + prediction = fluid.layers.reshape( + pred_temp, shape=[batch_size, seq_len, -1]) + + return prediction From 27afc2867473221ef0f72cf715f7f2f8c0e31414 Mon Sep 17 00:00:00 2001 From: guosheng Date: Sun, 5 Apr 2020 04:21:41 +0800 Subject: [PATCH 04/13] Update Additive Attention followed by GRU --- seq2seq/seq2seq_add_attn.py | 268 ++++++++++++++---------------------- seq2seq/train_ocr.py | 140 +++++++++++++++++++ 2 files changed, 244 insertions(+), 164 deletions(-) create mode 100644 seq2seq/train_ocr.py diff --git a/seq2seq/seq2seq_add_attn.py b/seq2seq/seq2seq_add_attn.py index 710429b80289c4..ca0a4739f0c268 100644 --- a/seq2seq/seq2seq_add_attn.py +++ b/seq2seq/seq2seq_add_attn.py @@ -1,8 +1,9 @@ import numpy as np import paddle.fluid as fluid +import paddle.fluid.layers as layers from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit -from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell +from text import DynamicDecode, RNN, RNNCell from model import Model, Loss @@ -91,82 +92,70 @@ def forward(self, inputs): return inputs_4 +class SimpleAttention(fluid.dygraph.Layer): + def __init__(self, decoder_size): + super(SimpleAttention, self).__init__() + + self.fc1 
= Linear(decoder_size, decoder_size, bias_attr=False) + self.fc2 = Linear(decoder_size, 1, bias_attr=False) + + def forward(self, encoder_vec, encoder_proj, decoder_state): + decoder_state = self.fc1(decoder_state) + decoder_state = fluid.layers.unsqueeze(decoder_state, [1]) + + mix = fluid.layers.elementwise_add(encoder_proj, decoder_state) + mix = fluid.layers.tanh(x=mix) + + attn_score = self.fc2(mix) + attn_scores = layers.squeeze(attn_score, [2]) + attn_scores = fluid.layers.softmax(attn_scores) + + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=attn_scores, axis=0) + + context = fluid.layers.reduce_sum(scaled, dim=1) + return context + + class GRUCell(RNNCell): def __init__(self, - size, + input_size, + hidden_size, param_attr=None, bias_attr=None, - is_reverse=False, gate_activation='sigmoid', candidate_activation='tanh', - origin_mode=False, - init_size=None): + origin_mode=False): super(GRUCell, self).__init__() - - self.input_proj = Linear( - 768, size * 3, param_attr=param_attr, bias_attr=False) + self.hidden_size = hidden_size + self.fc_layer = Linear( + input_size, + hidden_size * 3, + param_attr=param_attr, + bias_attr=False) self.gru_unit = GRUUnit( - size * 3, + hidden_size * 3, param_attr=param_attr, bias_attr=bias_attr, activation=candidate_activation, gate_activation=gate_activation, origin_mode=origin_mode) - self.size = size - self.is_reverse = is_reverse - def forward(self, inputs, states): # step_outputs, new_states = cell(step_inputs, states) # for GRUCell, `step_outputs` and `new_states` both are hidden - x = self.input_proj(inputs) + x = self.fc_layer(inputs) hidden, _, _ = self.gru_unit(x, states) return hidden, hidden - -class DecoderCell(RNNCell): - def __init__(self, size): - self.gru = GRUCell(size) - self.attention = SimpleAttention(size) - self.fc_1_layer = Linear( - input_dim=size * 2, output_dim=size * 3, bias_attr=False) - self.fc_2_layer = Linear( - input_dim=size, output_dim=size * 3, bias_attr=False) - - def 
forward(self, inputs, states, encoder_vec, encoder_proj): - context = self.attention(encoder_vec, encoder_proj, states) - fc_1 = self.fc_1_layer(context) - fc_2 = self.fc_2_layer(inputs) - decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2) - h, _ = self.gru(decoder_inputs, states) - return h, h - - -class Decoder(fluid.dygraph.Layer): - def __init__(self, size, num_classes): - super(Decoder, self).__init__() - self.embedder = Embedding(size=[num_classes, size]) - self.gru_attention = RNN(DecoderCell(size), - is_reverse=False, - time_major=False) - self.output_layer = Linear(size, num_classes, bias_attr=False) - - def forward(self, target, decoder_initial_states, encoder_vec, - encoder_proj): - inputs = self.embedder(target) - decoder_output, _ = self.gru_attention( - inputs, - initial_states=decoder_initial_states, - encoder_vec=encoder_vec, - encoder_proj=encoder_proj) - predict = self.output_layer(decoder_output) - return predict + @property + def state_shape(self): + return [self.hidden_size] class EncoderNet(fluid.dygraph.Layer): def __init__(self, - batch_size, decoder_size, rnn_hidden_size=200, is_test=False, @@ -179,21 +168,24 @@ def __init__(self, initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0) self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn) - self.fc_1_layer = Linear( - 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) - self.fc_2_layer = Linear( - 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) - self.gru_forward_layer = DynamicGRU( - size=rnn_hidden_size, - param_attr=para_attr, - bias_attr=bias_attr, - candidate_activation='relu') - self.gru_backward_layer = DynamicGRU( - size=rnn_hidden_size, - param_attr=para_attr, - bias_attr=bias_attr, - candidate_activation='relu', - is_reverse=True) + self.gru_forward_layer = RNN( + cell=GRUCell( + input_size=128 * 6, # channel * h + hidden_size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + 
candidate_activation='relu'), + is_reverse=False, + time_major=False) + self.gru_backward_layer = RNN( + cell=GRUCell( + input_size=128 * 6, # channel * h + hidden_size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu'), + is_reverse=True, + time_major=False) self.encoded_proj_fc = Linear( rnn_hidden_size * 2, decoder_size, bias_attr=False) @@ -211,13 +203,9 @@ def forward(self, inputs): ], inplace=False) - fc_1 = self.fc_1_layer(sliced_feature) + gru_forward, _ = self.gru_forward_layer(sliced_feature) - fc_2 = self.fc_2_layer(sliced_feature) - - gru_forward = self.gru_forward_layer(fc_1) - - gru_backward = self.gru_backward_layer(fc_2) + gru_backward, _ = self.gru_backward_layer(sliced_feature) encoded_vector = fluid.layers.concat( input=[gru_forward, gru_backward], axis=2) @@ -227,88 +215,50 @@ def forward(self, inputs): return gru_backward, encoded_vector, encoded_proj -class SimpleAttention(fluid.dygraph.Layer): - def __init__(self, decoder_size): - super(SimpleAttention, self).__init__() - - self.fc_1 = Linear( - decoder_size, decoder_size, act=None, bias_attr=False) - self.fc_2 = Linear(decoder_size, 1, act=None, bias_attr=False) - - def forward(self, encoder_vec, encoder_proj, decoder_state): - - decoder_state_fc = self.fc_1(decoder_state) - - decoder_state_proj_reshape = fluid.layers.reshape( - decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]], - inplace=False) - decoder_state_expand = fluid.layers.expand( - decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1]) - concated = fluid.layers.elementwise_add(encoder_proj, - decoder_state_expand) - concated = fluid.layers.tanh(x=concated) - attention_weight = self.fc_2(concated) - weights_reshape = fluid.layers.reshape( - x=attention_weight, shape=[concated.shape[0], -1], inplace=False) - - weights_reshape = fluid.layers.softmax(weights_reshape) - scaled = fluid.layers.elementwise_mul( - x=encoder_vec, y=weights_reshape, axis=0) - - context = 
fluid.layers.reduce_sum(scaled, dim=1) - - return context +class DecoderCell(RNNCell): + def __init__(self, encoder_size, decoder_size): + super(DecoderCell, self).__init__() + self.attention = SimpleAttention(decoder_size) + self.gru_cell = GRUCell( + input_size=encoder_size * 2 + + decoder_size, # encoded_vector.shape[-1] + embed_size + hidden_size=decoder_size) + + def forward(self, current_word, states, encoder_vec, encoder_proj): + context = self.attention(encoder_vec, encoder_proj, states) + decoder_inputs = layers.concat([current_word, context], axis=1) + hidden, _ = self.gru_cell(decoder_inputs, states) + return hidden, hidden class GRUDecoderWithAttention(fluid.dygraph.Layer): def __init__(self, encoder_size, decoder_size, num_classes): super(GRUDecoderWithAttention, self).__init__() - self.simple_attention = SimpleAttention(decoder_size) - - self.fc_1_layer = Linear( - input_dim=encoder_size * 2, - output_dim=decoder_size * 3, - bias_attr=False) - self.fc_2_layer = Linear( - input_dim=decoder_size, - output_dim=decoder_size * 3, - bias_attr=False) - self.gru_unit = GRUUnit( - size=decoder_size * 3, param_attr=None, bias_attr=None) + self.gru_attention = RNN(DecoderCell(encoder_size, decoder_size), + is_reverse=False, + time_major=False) self.out_layer = Linear( input_dim=decoder_size, output_dim=num_classes + 2, bias_attr=None, act='softmax') - self.decoder_size = decoder_size - - def forward(self, - current_word, - encoder_vec, - encoder_proj, - decoder_boot, - inference=False): - current_word = fluid.layers.reshape( - current_word, [-1, current_word.shape[2]], inplace=False) - - context = self.simple_attention(encoder_vec, encoder_proj, - decoder_boot) - fc_1 = self.fc_1_layer(context) - fc_2 = self.fc_2_layer(current_word) - decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2) - - h, _, _ = self.gru_unit(decoder_inputs, decoder_boot) - out = self.out_layer(h) - - return out, h + def forward(self, inputs, decoder_initial_states, encoder_vec, + 
encoder_proj): + out, _ = self.gru_attention( + inputs, + initial_states=decoder_initial_states, + encoder_vec=encoder_vec, + encoder_proj=encoder_proj) + predict = self.out_layer(out) + return predict -class OCRAttention(fluid.dygraph.Layer): - def __init__(self, batch_size, num_classes, encoder_size, decoder_size, +class OCRAttention(Model): + def __init__(self, num_classes, encoder_size, decoder_size, word_vector_dim): super(OCRAttention, self).__init__() - self.encoder_net = EncoderNet(batch_size, decoder_size) + self.encoder_net = EncoderNet(decoder_size) self.fc = Linear( input_dim=encoder_size, output_dim=decoder_size, @@ -318,36 +268,26 @@ def __init__(self, batch_size, num_classes, encoder_size, decoder_size, [num_classes + 2, word_vector_dim], dtype='float32') self.gru_decoder_with_attention = GRUDecoderWithAttention( encoder_size, decoder_size, num_classes) - self.batch_size = batch_size def forward(self, inputs, label_in): gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs) - backward_first = fluid.layers.slice( - gru_backward, axes=[1], starts=[0], ends=[1]) - backward_first = fluid.layers.reshape( - backward_first, [-1, backward_first.shape[2]], inplace=False) - - decoder_boot = self.fc(backward_first) - label_in = fluid.layers.reshape(label_in, [-1], inplace=False) + decoder_boot = self.fc(gru_backward[:, 0]) trg_embedding = self.embedding(label_in) + prediction = self.gru_decoder_with_attention( + trg_embedding, decoder_boot, encoded_vector, encoded_proj) - trg_embedding = fluid.layers.reshape( - trg_embedding, [self.batch_size, -1, trg_embedding.shape[1]], - inplace=False) + return prediction - pred_temp = [] - for i in range(trg_embedding.shape[1]): - current_word = fluid.layers.slice( - trg_embedding, axes=[1], starts=[i], ends=[i + 1]) - out, decoder_boot = self.gru_decoder_with_attention( - current_word, encoded_vector, encoded_proj, decoder_boot) - pred_temp.append(out) - pred_temp = fluid.layers.concat(pred_temp, axis=1) - 
batch_size = trg_embedding.shape[0] - seq_len = trg_embedding.shape[1] - prediction = fluid.layers.reshape( - pred_temp, shape=[batch_size, seq_len, -1]) +class CrossEntropyCriterion(Loss): + def __init__(self): + super(CrossEntropyCriterion, self).__init__() - return prediction + def forward(self, outputs, labels): + predict, (label, mask) = outputs[0], labels + + loss = layers.cross_entropy(predict, label=label, soft_label=False) + loss = layers.elementwise_mul(loss, mask, axis=0) + loss = layers.reduce_sum(loss) + return loss diff --git a/seq2seq/train_ocr.py b/seq2seq/train_ocr.py new file mode 100644 index 00000000000000..2dd7835b225825 --- /dev/null +++ b/seq2seq/train_ocr.py @@ -0,0 +1,140 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import print_function + +import os +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import paddle.fluid.profiler as profiler +import paddle.fluid as fluid + +import data_reader + +from paddle.fluid.dygraph.base import to_variable +import argparse +import functools +from utility import add_arguments, print_arguments, get_attention_feeder_data +from model import Input, set_device +from nets import OCRAttention, CrossEntropyCriterion +from eval import evaluate + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 32, "Minibatch size.") +add_arg('epoch_num', int, 30, "Epoch number.") +add_arg('lr', float, 0.001, "Learning rate.") +add_arg('lr_decay_strategy', str, "", "Learning rate decay strategy.") +add_arg('log_period', int, 200, "Log period.") +add_arg('save_model_period', int, 2000, "Save model period. '-1' means never saving the model.") +add_arg('eval_period', int, 2000, "Evaluate period. 
'-1' means never evaluating the model.") +add_arg('save_model_dir', str, "./output", "The directory the model to be saved to.") +add_arg('train_images', str, None, "The directory of images to be used for training.") +add_arg('train_list', str, None, "The list file of images to be used for training.") +add_arg('test_images', str, None, "The directory of images to be used for test.") +add_arg('test_list', str, None, "The list file of images to be used for training.") +add_arg('init_model', str, None, "The init model file of directory.") +add_arg('use_gpu', bool, True, "Whether use GPU to train.") +add_arg('parallel', bool, False, "Whether use parallel training.") +add_arg('profile', bool, False, "Whether to use profiling.") +add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for better performance test.") +add_arg('skip_test', bool, False, "Whether to skip test phase.") +# model hyper paramters +add_arg('encoder_size', int, 200, "Encoder size.") +add_arg('decoder_size', int, 128, "Decoder size.") +add_arg('word_vector_dim', int, 128, "Word vector dim.") +add_arg('num_classes', int, 95, "Number classes.") +add_arg('gradient_clip', float, 5.0, "Gradient clip value.") +add_arg('dynamic', bool, False, "Whether to use dygraph.") + + +def train(args): + device = set_device("gpu" if args.use_gpu else "cpu") + fluid.enable_dygraph(device) if args.dynamic else None + + ocr_attention = OCRAttention(encoder_size=args.encoder_size, decoder_size=args.decoder_size, + num_classes=args.num_classes, word_vector_dim=args.word_vector_dim) + LR = args.lr + if args.lr_decay_strategy == "piecewise_decay": + learning_rate = fluid.layers.piecewise_decay([200000, 250000], [LR, LR * 0.1, LR * 0.01]) + else: + learning_rate = LR + optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, parameter_list=ocr_attention.parameters()) + # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(args.gradient_clip) + + inputs = [ + Input([None, 1, 48, 384], 
"float32", name="pixel"), + Input([None, None], "int64", name="label_in"), + ] + labels = [ + Input([None, None], "int64", name="label_out"), + Input([None, None], "float32", name="mask")] + + ocr_attention.prepare(optimizer, CrossEntropyCriterion(), inputs=inputs, labels=labels) + + + train_reader = data_reader.data_reader( + args.batch_size, + shuffle=True, + images_dir=args.train_images, + list_file=args.train_list, + data_type='train') + + # test_reader = data_reader.data_reader( + # args.batch_size, + # images_dir=args.test_images, + # list_file=args.test_list, + # data_type="test") + + # if not os.path.exists(args.save_model_dir): + # os.makedirs(args.save_model_dir) + total_step = 0 + epoch_num = args.epoch_num + for epoch in range(epoch_num): + batch_id = 0 + total_loss = 0.0 + + for data in train_reader(): + + total_step += 1 + data_dict = get_attention_feeder_data(data) + pixel = data_dict["pixel"] + label_in = data_dict["label_in"].reshape([pixel.shape[0], -1]) + label_out = data_dict["label_out"].reshape([pixel.shape[0], -1]) + mask = data_dict["mask"].reshape(label_out.shape).astype("float32") + + avg_loss = ocr_attention.train(inputs=[pixel, label_in], labels=[label_out, mask])[0] + total_loss += avg_loss + + if True:#batch_id > 0 and batch_id % args.log_period == 0: + print("epoch: {}, batch_id: {}, loss {}".format(epoch, batch_id, + total_loss / args.batch_size / args.log_period)) + total_loss = 0.0 + + batch_id += 1 + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + if args.profile: + if args.use_gpu: + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + train(args) + else: + with profiler.profiler("CPU", sorted_key='total') as cpuprof: + train(args) + else: + train(args) \ No newline at end of file From 3500061d750b15c8d2abb39403041e59ef136de8 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 8 Apr 2020 02:15:18 +0800 Subject: [PATCH 05/13] Add seq2seq infer --- seq2seq/predict.py | 126 
++++++++++++++++++++++++++++++++++++++++ seq2seq/reader.py | 5 ++ seq2seq/seq2seq_attn.py | 10 +++- seq2seq/seq2seq_base.py | 10 +++- seq2seq/train.py | 19 +++--- 5 files changed, 155 insertions(+), 15 deletions(-) diff --git a/seq2seq/predict.py b/seq2seq/predict.py index e69de29bb2d1d6..fef74cfb22fcfe 100644 --- a/seq2seq/predict.py +++ b/seq2seq/predict.py @@ -0,0 +1,126 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import io +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import random +from functools import partial + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.layers.utils import flatten +from paddle.fluid.io import DataLoader + +from model import Input, set_device +from args import parse_args +from seq2seq_base import BaseInferModel +from seq2seq_attn import AttentionInferModel +from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_infer_input + + +def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, + output_eos=False): + """ + Post-process the decoded sequence. 
+ """ + eos_pos = len(seq) - 1 + for i, idx in enumerate(seq): + if idx == eos_idx: + eos_pos = i + break + seq = [ + idx for idx in seq[:eos_pos + 1] + if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx) + ] + return seq + + +def do_predict(args): + device = set_device("gpu" if args.use_gpu else "cpu") + fluid.enable_dygraph(device) if args.eager_run else None + + # define model + inputs = [ + Input( + [None, None], "int64", name="src_word"), + Input( + [None], "int64", name="src_length"), + ] + + # def dataloader + dataset = Seq2SeqDataset( + fpattern=args.infer_file, + src_vocab_fpath=args.vocab_prefix + "." + args.src_lang, + trg_vocab_fpath=args.vocab_prefix + "." + args.tar_lang, + token_delimiter=None, + start_mark="", + end_mark="", + unk_mark="") + trg_idx2word = Seq2SeqDataset.load_dict( + dict_path=args.vocab_prefix + "." + args.tar_lang, reverse=True) + (args.src_vocab_size, args.trg_vocab_size, bos_id, eos_id, + unk_id) = dataset.get_vocab_summary() + batch_sampler = Seq2SeqBatchSampler( + dataset=dataset, use_token_batch=False, batch_size=args.batch_size) + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + feed_list=None + if fluid.in_dygraph_mode() else [x.forward() for x in inputs], + collate_fn=partial( + prepare_infer_input, bos_id=bos_id, eos_id=eos_id, pad_id=eos_id), + num_workers=0, + return_list=True) + + model_maker = AttentionInferModel if args.attention else BaseInferModel + model = model_maker( + args.src_vocab_size, + args.tar_vocab_size, + args.hidden_size, + args.hidden_size, + args.num_layers, + args.dropout, + bos_id=bos_id, + eos_id=eos_id, + beam_size=args.beam_size, + max_out_len=256) + + model.prepare(inputs=inputs) + + # load the trained model + assert args.reload_model, ( + "Please set reload_model to load the infer model.") + model.load(args.reload_model) + + # TODO(guosheng): use model.predict when support variant length + with io.open(args.infer_output_file, 'w', 
encoding='utf-8') as f: + for data in data_loader(): + finished_seq = model.test(inputs=flatten(data))[0] + finished_seq = np.transpose(finished_seq, [0, 2, 1]) + for ins in finished_seq: + for beam_idx, beam in enumerate(ins): + id_list = post_process_seq(beam, bos_id, eos_id) + word_list = [trg_idx2word[id] for id in id_list] + sequence = " ".join(word_list) + "\n" + f.write(sequence) + break + + +if __name__ == "__main__": + args = parse_args() + do_predict(args) diff --git a/seq2seq/reader.py b/seq2seq/reader.py index 145acec295fab1..6f007bdde0ca16 100644 --- a/seq2seq/reader.py +++ b/seq2seq/reader.py @@ -33,6 +33,11 @@ def prepare_train_input(insts, bos_id, eos_id, pad_id): return src, src_length, trg[:, :-1], trg_length, trg[:, 1:, np.newaxis] +def prepare_infer_input(insts, bos_id, eos_id, pad_id): + src, src_length = pad_batch_data(insts, pad_id=pad_id) + return src, src_length + + def pad_batch_data(insts, pad_id): """ Pad the instances to the max sequence length in batch, and generate the diff --git a/seq2seq/seq2seq_attn.py b/seq2seq/seq2seq_attn.py index 599d25e5beaf35..c5baee74e83281 100644 --- a/seq2seq/seq2seq_attn.py +++ b/seq2seq/seq2seq_attn.py @@ -90,7 +90,10 @@ def forward(self, for i, lstm_cell in enumerate(self.lstm_cells): out, new_lstm_state = lstm_cell(step_input, lstm_states[i]) step_input = layers.dropout( - out, self.dropout_prob) if self.dropout_prob > 0 else out + out, + self.dropout_prob, + dropout_implementation='upscale_in_train' + ) if self.dropout_prob > 0 else out new_lstm_states.append(new_lstm_state) out = self.attention_layer(step_input, encoder_output, encoder_padding_mask) @@ -180,7 +183,8 @@ def forward(self, src, src_length, trg, trg_length): class AttentionInferModel(AttentionModel): def __init__(self, - vocab_size, + src_vocab_size, + trg_vocab_size, embed_dim, hidden_size, num_layers, @@ -192,6 +196,8 @@ def __init__(self, args = dict(locals()) args.pop("self") args.pop("__class__", None) # py3 + self.bos_id = 
args.pop("bos_id") + self.eos_id = args.pop("eos_id") self.beam_size = args.pop("beam_size") self.max_out_len = args.pop("max_out_len") super(AttentionInferModel, self).__init__(**args) diff --git a/seq2seq/seq2seq_base.py b/seq2seq/seq2seq_base.py index ae2cb4bbba4c31..b37b871b03f8f7 100644 --- a/seq2seq/seq2seq_base.py +++ b/seq2seq/seq2seq_base.py @@ -63,7 +63,10 @@ def forward(self, step_input, states): for i, lstm_cell in enumerate(self.lstm_cells): out, new_state = lstm_cell(step_input, states[i]) step_input = layers.dropout( - out, self.dropout_prob) if self.dropout_prob > 0 else out + out, + self.dropout_prob, + dropout_implementation='upscale_in_train' + ) if self.dropout_prob > 0 else out new_states.append(new_state) return step_input, new_states @@ -163,7 +166,8 @@ def forward(self, src, src_length, trg, trg_length): class BaseInferModel(BaseModel): def __init__(self, - vocab_size, + src_vocab_size, + trg_vocab_size, embed_dim, hidden_size, num_layers, @@ -175,6 +179,8 @@ def __init__(self, args = dict(locals()) args.pop("self") args.pop("__class__", None) # py3 + self.bos_id = args.pop("bos_id") + self.eos_id = args.pop("eos_id") self.beam_size = args.pop("beam_size") self.max_out_len = args.pop("max_out_len") super(BaseInferModel, self).__init__(**args) diff --git a/seq2seq/train.py b/seq2seq/train.py index 70f9315923a96a..23dfccb5ad3c5f 100644 --- a/seq2seq/train.py +++ b/seq2seq/train.py @@ -14,25 +14,20 @@ import logging import os -import six import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import random from functools import partial import numpy as np -import paddle import paddle.fluid as fluid -from paddle.fluid.dygraph import to_variable from paddle.fluid.io import DataLoader -from paddle.fluid.dygraph_grad_clip import GradClipByGlobalNorm -import reader +from model import Input, set_device +from callbacks import ProgBarLogger from args import parse_args from seq2seq_base import BaseModel, 
CrossEntropyCriterion from seq2seq_attn import AttentionModel -from model import Input, set_device -from callbacks import ProgBarLogger from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_train_input @@ -97,9 +92,10 @@ def do_train(args): data_loaders[i] = data_loader train_loader, eval_loader = data_loaders - model = AttentionModel(args.src_vocab_size, args.tar_vocab_size, - args.hidden_size, args.hidden_size, args.num_layers, - args.dropout) + model_maker = AttentionModel if args.attention else BaseModel + model = model_maker(args.src_vocab_size, args.tar_vocab_size, + args.hidden_size, args.hidden_size, args.num_layers, + args.dropout) model.prepare( fluid.optimizer.Adam( @@ -110,9 +106,10 @@ def do_train(args): labels=labels) model.fit(train_data=train_loader, eval_data=eval_loader, - epochs=1, + epochs=args.max_epoch, eval_freq=1, save_freq=1, + save_dir=args.model_path, log_freq=1, verbose=2) From ae47e2a8758e149b39f264961a64dacdcc1067b1 Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 14 Apr 2020 13:49:29 +0800 Subject: [PATCH 06/13] Refine seq2seq --- seq2seq/README.md | 180 ++++++++++++++++++++++ seq2seq/reader.py | 155 +++++++++++++------ seq2seq/run.sh | 2 + seq2seq/seq2seq_add_attn.py | 293 ------------------------------------ seq2seq/train.py | 56 +------ seq2seq/train_ocr.py | 140 ----------------- transformer/reader.py | 1 - 7 files changed, 300 insertions(+), 527 deletions(-) create mode 100644 seq2seq/README.md delete mode 100644 seq2seq/seq2seq_add_attn.py delete mode 100644 seq2seq/train_ocr.py diff --git a/seq2seq/README.md b/seq2seq/README.md new file mode 100644 index 00000000000000..ef8bfd17fffc60 --- /dev/null +++ b/seq2seq/README.md @@ -0,0 +1,180 @@ +运行本目录下的范例模型需要安装PaddlePaddle Fluid 1.7版。如果您的 PaddlePaddle 安装版本低于此要求,请按照[安装文档](https://www.paddlepaddle.org.cn/#quick-start)中的说明更新 PaddlePaddle 安装版本。 + +# Sequence to Sequence (Seq2Seq) + +以下是本范例模型的简要目录结构及说明: + +``` +. 
+├── README.md # 文档,本文件 +├── args.py # 训练、预测以及模型参数配置程序 +├── reader.py # 数据读入程序 +├── download.py # 数据下载程序 +├── train.py # 训练主程序 +├── infer.py # 预测主程序 +├── run.sh # 默认配置的启动脚本 +├── infer.sh # 默认配置的解码脚本 +├── attention_model.py # 带注意力机制的翻译模型程序 +└── base_model.py # 无注意力机制的翻译模型程序 +``` + +## 简介 + +Sequence to Sequence (Seq2Seq),使用编码器-解码器(Encoder-Decoder)结构,用编码器将源序列编码成vector,再用解码器将该vector解码为目标序列。Seq2Seq 广泛应用于机器翻译,自动对话机器人,文档摘要自动生成,图片描述自动生成等任务中。 + +本目录包含Seq2Seq的一个经典样例:机器翻译,实现了一个base model(不带attention机制),一个带attention机制的翻译模型。Seq2Seq翻译模型,模拟了人类在进行翻译类任务时的行为:先解析源语言,理解其含义,再根据该含义来写出目标语言的语句。更多关于机器翻译的具体原理和数学表达式,我们推荐参考飞桨官网[机器翻译案例](https://www.paddlepaddle.org.cn/documentation/docs/zh/user_guides/nlp_case/machine_translation/README.cn.html)。 + +## 模型概览 + +本模型中,在编码器方面,我们采用了基于LSTM的多层的RNN encoder;在解码器方面,我们使用了带注意力(Attention)机制的RNN decoder,并同时提供了一个不带注意力机制的解码器实现作为对比。在预测时我们使用柱搜索(beam search)算法来生成翻译的目标语句。 + +## 数据介绍 + +本教程使用[IWSLT'15 English-Vietnamese data ](https://nlp.stanford.edu/projects/nmt/)数据集中的英语到越南语的数据作为训练语料,tst2012的数据作为开发集,tst2013的数据作为测试集 + +### 数据获取 + +``` +python download.py +``` + +## 模型训练 + +`run.sh`包含训练程序的主函数,要使用默认参数开始训练,只需要简单地执行: + +``` +sh run.sh +``` + +默认使用带有注意力机制的RNN模型,可以通过修改 `attention` 参数为False来训练不带注意力机制的RNN模型。 + +```sh +export CUDA_VISIBLE_DEVICES=0 + +python train.py \ + --src_lang en --tar_lang vi \ + --attention True \ + --num_layers 2 \ + --hidden_size 512 \ + --src_vocab_size 17191 \ + --tar_vocab_size 7709 \ + --batch_size 128 \ + --dropout 0.2 \ + --init_scale 0.1 \ + --max_grad_norm 5.0 \ + --train_data_prefix data/en-vi/train \ + --eval_data_prefix data/en-vi/tst2012 \ + --test_data_prefix data/en-vi/tst2013 \ + --vocab_prefix data/en-vi/vocab \ + --use_gpu True \ + --model_path ./attention_models +``` + +训练程序会在每个epoch训练结束之后,save一次模型。 + + +默认使用动态图模式进行训练,可以通过设置 `eager_run` 参数为False来以静态图模式进行训练,如下: + +```sh +export CUDA_VISIBLE_DEVICES=0 + +python train.py \ + --src_lang en --tar_lang vi \ + --attention True \ + --num_layers 2 \ + --hidden_size 512 \ + 
--src_vocab_size 17191 \ + --tar_vocab_size 7709 \ + --batch_size 128 \ + --dropout 0.2 \ + --init_scale 0.1 \ + --max_grad_norm 5.0 \ + --train_data_prefix data/en-vi/train \ + --eval_data_prefix data/en-vi/tst2012 \ + --test_data_prefix data/en-vi/tst2013 \ + --vocab_prefix data/en-vi/vocab \ + --use_gpu True \ + --model_path ./attention_models \ + --eager_run False +``` + +## 模型预测 + +当模型训练完成之后, 可以利用infer.sh的脚本进行预测,默认使用beam search的方法进行预测,加载第10个epoch的模型进行预测,对test的数据集进行解码 + +``` +sh infer.sh +``` + +如果想预测别的数据文件,只需要将 --infer_file参数进行修改。 + +```sh +export CUDA_VISIBLE_DEVICES=0 + +python infer.py \ + --attention True \ + --src_lang en --tar_lang vi \ + --num_layers 2 \ + --hidden_size 512 \ + --src_vocab_size 17191 \ + --tar_vocab_size 7709 \ + --batch_size 128 \ + --dropout 0.2 \ + --init_scale 0.1 \ + --max_grad_norm 5.0 \ + --vocab_prefix data/en-vi/vocab \ + --infer_file data/en-vi/tst2013.en \ + --reload_model attention_models/epoch_10 \ + --infer_output_file attention_infer_output/infer_output.txt \ + --beam_size 10 \ + --use_gpu True +``` + +和训练类似,预测时同样可以以静态图模式进行,如下: + +```sh +export CUDA_VISIBLE_DEVICES=0 + +python infer.py \ + --attention True \ + --src_lang en --tar_lang vi \ + --num_layers 2 \ + --hidden_size 512 \ + --src_vocab_size 17191 \ + --tar_vocab_size 7709 \ + --batch_size 128 \ + --dropout 0.2 \ + --init_scale 0.1 \ + --max_grad_norm 5.0 \ + --vocab_prefix data/en-vi/vocab \ + --infer_file data/en-vi/tst2013.en \ + --reload_model attention_models/epoch_10 \ + --infer_output_file attention_infer_output/infer_output.txt \ + --beam_size 10 \ + --use_gpu True + --eager_run False +``` + +## 效果评价 + +使用 [*multi-bleu.perl*](https://github.com/moses-smt/mosesdecoder.git) 工具来评价模型预测的翻译质量,使用方法如下: + +```sh +mosesdecoder/scripts/generic/multi-bleu.perl tst2013.vi < infer_output.txt +``` + +每个模型分别训练了10次,单次取第10个epoch保存的模型进行预测,取beam_size=10。效果如下(为了便于观察,对10次结果按照升序进行了排序): + +``` +> no attention +tst2012 BLEU: +[10.75 10.85 10.9 10.94 10.97 11.01 11.01 11.04 11.13 
11.4] +tst2013 BLEU: +[10.71 10.71 10.74 10.76 10.91 10.94 11.02 11.16 11.21 11.44] + +> with attention +tst2012 BLEU: +[21.14 22.34 22.54 22.65 22.71 22.71 23.08 23.15 23.3 23.4] +tst2013 BLEU: +[23.41 24.79 25.11 25.12 25.19 25.24 25.39 25.61 25.61 25.63] +``` diff --git a/seq2seq/reader.py b/seq2seq/reader.py index 6f007bdde0ca16..ebdbb47266e2c4 100644 --- a/seq2seq/reader.py +++ b/seq2seq/reader.py @@ -17,13 +17,58 @@ from __future__ import print_function import glob +import six +import os import io -import numpy as np import itertools +from functools import partial + +import numpy as np +import paddle.fluid as fluid from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.io import BatchSampler, DataLoader, Dataset +def create_data_loader(args, device, for_train=True): + data_loaders = [None, None] + data_prefixes = [args.train_data_prefix, args.eval_data_prefix + ] if args.eval_data_prefix else [args.train_data_prefix] + for i, data_prefix in enumerate(data_prefixes): + dataset = Seq2SeqDataset( + fpattern=data_prefix + "." + args.src_lang, + trg_fpattern=data_prefix + "." + args.tar_lang, + src_vocab_fpath=args.vocab_prefix + "." + args.src_lang, + trg_vocab_fpath=args.vocab_prefix + "." 
+ args.tar_lang, + token_delimiter=None, + start_mark="", + end_mark="", + unk_mark="", + max_length=args.max_len if i == 0 else None, + truncate=True) + (args.src_vocab_size, args.tar_vocab_size, bos_id, eos_id, + unk_id) = dataset.get_vocab_summary() + batch_sampler = Seq2SeqBatchSampler( + dataset=dataset, + use_token_batch=False, + batch_size=args.batch_size, + pool_size=args.batch_size * 20, + sort_type=SortType.POOL, + shuffle=False if args.enable_ce else True) + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + collate_fn=partial( + prepare_train_input, + bos_id=bos_id, + eos_id=eos_id, + pad_id=eos_id), + num_workers=0, + return_list=True) + data_loaders[i] = data_loader + return data_loaders + + def prepare_train_input(insts, bos_id, eos_id, pad_id): src, src_length = pad_batch_data( [inst[0] for inst in insts], pad_id=pad_id) @@ -118,10 +163,11 @@ def append(self, info): class SampleInfo(object): - def __init__(self, i, max_len, min_len): + def __init__(self, i, lens): self.i = i - self.min_len = min_len - self.max_len = max_len + # to be consistent with origianl reader implementation + self.min_len = lens[0] + self.max_len = lens[0] class MinMaxFilter(object): @@ -131,9 +177,8 @@ def __init__(self, max_len, min_len, underlying_creator): self._creator = underlying_creator def append(self, info): - if info.max_len > self._max_len or info.min_len < self._min_len: - return - else: + if (self._min_len is None or info.min_len >= self._min_len) and ( + self._max_len is None or info.max_len <= self._max_len): return self._creator.append(info) @property @@ -151,22 +196,30 @@ def __init__(self, start_mark="", end_mark="", unk_mark="", - only_src=False, - trg_fpattern=None): - # convert str to bytes, and use byte data - # field_delimiter = field_delimiter.encode("utf8") - # token_delimiter = token_delimiter.encode("utf8") - # start_mark = start_mark.encode("utf8") - # end_mark = end_mark.encode("utf8") - # unk_mark = 
unk_mark.encode("utf8") - self._src_vocab = self.load_dict(src_vocab_fpath) - self._trg_vocab = self.load_dict(trg_vocab_fpath) + trg_fpattern=None, + byte_data=False, + min_length=None, + max_length=None, + truncate=False): + if byte_data: + # The WMT16 bpe data used here seems including bytes can not be + # decoded by utf8. Thus convert str to bytes, and use byte data + field_delimiter = field_delimiter.encode("utf8") + token_delimiter = token_delimiter.encode("utf8") + start_mark = start_mark.encode("utf8") + end_mark = end_mark.encode("utf8") + unk_mark = unk_mark.encode("utf8") + self._byte_data = byte_data + self._src_vocab = self.load_dict(src_vocab_fpath, byte_data=byte_data) + self._trg_vocab = self.load_dict(trg_vocab_fpath, byte_data=byte_data) self._bos_idx = self._src_vocab[start_mark] self._eos_idx = self._src_vocab[end_mark] self._unk_idx = self._src_vocab[unk_mark] - self._only_src = only_src self._field_delimiter = field_delimiter self._token_delimiter = token_delimiter + self._min_length = min_length + self._max_length = max_length + self._truncate = truncate self.load_src_trg_ids(fpattern, trg_fpattern) def load_src_trg_ids(self, fpattern, trg_fpattern=None): @@ -195,26 +248,32 @@ def load_src_trg_ids(self, fpattern, trg_fpattern=None): self._sample_infos = [] slots = [self._src_seq_ids, self._trg_seq_ids] - lens = [] for i, line in enumerate(self._load_lines(fpattern, trg_fpattern)): - lens = [] - for field, slot in zip(converters(line), slots): - slot.append(field) - lens.append(len(field)) - # self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) - self._sample_infos.append(SampleInfo(i, lens[0], lens[0])) + fields = converters(line) + lens = [len(field) for field in fields] + sample = SampleInfo(i, lens) + if (self._min_length is None or + sample.min_len >= self._min_length) and ( + self._max_length is None or + sample.max_len <= self._max_length or self._truncate): + for field, slot in zip(fields, slots): + 
slot.append(field[:self._max_length] if self._truncate and + self._max_length is not None else field) + self._sample_infos.append(sample) def _load_lines(self, fpattern, trg_fpattern=None): fpaths = glob.glob(fpattern) fpaths = sorted(fpaths) # TODO: Add custum sort assert len(fpaths) > 0, "no matching file to the provided data path" + (f_mode, f_encoding, + endl) = ("rb", None, b"\n") if self._byte_data else ("r", "utf8", + "\n") if trg_fpattern is None: for fpath in fpaths: - # with io.open(fpath, "rb") as f: - with io.open(fpath, "r", encoding="utf8") as f: + with io.open(fpath, f_mode, encoding=f_encoding) as f: for line in f: - fields = line.strip("\n").split(self._field_delimiter) + fields = line.strip(endl).split(self._field_delimiter) yield fields else: # separated source and target language data files @@ -228,24 +287,24 @@ def _load_lines(self, fpattern, trg_fpattern=None): with that of source language" for fpath, trg_fpath in zip(fpaths, trg_fpaths): - # with io.open(fpath, "rb") as f: - # with io.open(trg_fpath, "rb") as trg_f: - with io.open(fpath, "r", encoding="utf8") as f: - with io.open(trg_fpath, "r", encoding="utf8") as trg_f: + with io.open(fpath, f_mode, encoding=f_encoding) as f: + with io.open( + trg_fpath, f_mode, encoding=f_encoding) as trg_f: for line in zip(f, trg_f): - fields = [field.strip("\n") for field in line] + fields = [field.strip(endl) for field in line] yield fields @staticmethod - def load_dict(dict_path, reverse=False): + def load_dict(dict_path, reverse=False, byte_data=False): word_dict = {} - # with io.open(dict_path, "rb") as fdict: - with io.open(dict_path, "r", encoding="utf8") as fdict: + (f_mode, f_encoding, + endl) = ("rb", None, b"\n") if byte_data else ("r", "utf8", "\n") + with io.open(dict_path, f_mode, encoding=f_encoding) as fdict: for idx, line in enumerate(fdict): if reverse: - word_dict[idx] = line.strip("\n") + word_dict[idx] = line.strip(endl) else: - word_dict[line.strip("\n")] = idx + 
word_dict[line.strip(endl)] = idx return word_dict def get_vocab_summary(self): @@ -266,19 +325,21 @@ def __init__(self, batch_size, pool_size=10000, sort_type=SortType.NONE, - min_length=0, - max_length=100, + min_length=None, + max_length=None, shuffle=False, shuffle_batch=False, use_token_batch=False, clip_last_batch=False, - seed=None): + distribute_mode=True, + seed=0): for arg, value in locals().items(): if arg != "self": setattr(self, "_" + arg, value) self._random = np.random self._random.seed(seed) # for multi-devices + self._distribute_mode = distribute_mode self._nranks = ParallelEnv().nranks self._local_rank = ParallelEnv().local_rank self._device_id = ParallelEnv().dev_id @@ -337,11 +398,14 @@ def __iter__(self): # for multi-device for batch_id, batch in enumerate(batches): - if batch_id % self._nranks == self._local_rank: + if not self._distribute_mode or ( + batch_id % self._nranks == self._local_rank): batch_indices = [info.i for info in batch] yield batch_indices - if self._local_rank > len(batches) % self._nranks: - yield batch_indices + if self._distribute_mode and len(batches) % self._nranks != 0: + if self._local_rank >= len(batches) % self._nranks: + # use previous data to pad + yield batch_indices def __len__(self): if not self._use_token_batch: @@ -349,5 +413,6 @@ def __len__(self): len(self._dataset) + self._batch_size * self._nranks - 1) // ( self._batch_size * self._nranks) else: - batch_number = 100 + # TODO(guosheng): fix the uncertain length + batch_number = 1 return batch_number diff --git a/seq2seq/run.sh b/seq2seq/run.sh index 2fe8b7a0700ae4..4872fc996a8a86 100644 --- a/seq2seq/run.sh +++ b/seq2seq/run.sh @@ -1,3 +1,5 @@ +export CUDA_VISIBLE_DEVICES=0 + python train.py \ --src_lang en --tar_lang vi \ --attention True \ diff --git a/seq2seq/seq2seq_add_attn.py b/seq2seq/seq2seq_add_attn.py deleted file mode 100644 index ca0a4739f0c268..00000000000000 --- a/seq2seq/seq2seq_add_attn.py +++ /dev/null @@ -1,293 +0,0 @@ -import numpy as 
np -import paddle.fluid as fluid -import paddle.fluid.layers as layers -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit - -from text import DynamicDecode, RNN, RNNCell -from model import Model, Loss - - -class ConvBNPool(fluid.dygraph.Layer): - def __init__(self, - out_ch, - channels, - act="relu", - is_test=False, - pool=True, - use_cudnn=True): - super(ConvBNPool, self).__init__() - self.pool = pool - - filter_size = 3 - conv_std_0 = (2.0 / (filter_size**2 * channels[0]))**0.5 - conv_param_0 = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, conv_std_0)) - - conv_std_1 = (2.0 / (filter_size**2 * channels[1]))**0.5 - conv_param_1 = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, conv_std_1)) - - self.conv_0_layer = Conv2D( - channels[0], - out_ch[0], - 3, - padding=1, - param_attr=conv_param_0, - bias_attr=False, - act=None, - use_cudnn=use_cudnn) - self.bn_0_layer = BatchNorm(out_ch[0], act=act, is_test=is_test) - self.conv_1_layer = Conv2D( - out_ch[0], - num_filters=out_ch[1], - filter_size=3, - padding=1, - param_attr=conv_param_1, - bias_attr=False, - act=None, - use_cudnn=use_cudnn) - self.bn_1_layer = BatchNorm(out_ch[1], act=act, is_test=is_test) - - if self.pool: - self.pool_layer = Pool2D( - pool_size=2, - pool_type='max', - pool_stride=2, - use_cudnn=use_cudnn, - ceil_mode=True) - - def forward(self, inputs): - conv_0 = self.conv_0_layer(inputs) - bn_0 = self.bn_0_layer(conv_0) - conv_1 = self.conv_1_layer(bn_0) - bn_1 = self.bn_1_layer(conv_1) - if self.pool: - bn_pool = self.pool_layer(bn_1) - - return bn_pool - return bn_1 - - -class OCRConv(fluid.dygraph.Layer): - def __init__(self, is_test=False, use_cudnn=True): - super(OCRConv, self).__init__() - self.conv_bn_pool_1 = ConvBNPool( - [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn) - self.conv_bn_pool_2 = ConvBNPool( - [32, 32], [16, 32], is_test=is_test, use_cudnn=use_cudnn) - self.conv_bn_pool_3 = ConvBNPool( - [64, 64], 
[32, 64], is_test=is_test, use_cudnn=use_cudnn) - self.conv_bn_pool_4 = ConvBNPool( - [128, 128], [64, 128], - is_test=is_test, - pool=False, - use_cudnn=use_cudnn) - - def forward(self, inputs): - inputs_1 = self.conv_bn_pool_1(inputs) - inputs_2 = self.conv_bn_pool_2(inputs_1) - inputs_3 = self.conv_bn_pool_3(inputs_2) - inputs_4 = self.conv_bn_pool_4(inputs_3) - - return inputs_4 - - -class SimpleAttention(fluid.dygraph.Layer): - def __init__(self, decoder_size): - super(SimpleAttention, self).__init__() - - self.fc1 = Linear(decoder_size, decoder_size, bias_attr=False) - self.fc2 = Linear(decoder_size, 1, bias_attr=False) - - def forward(self, encoder_vec, encoder_proj, decoder_state): - decoder_state = self.fc1(decoder_state) - decoder_state = fluid.layers.unsqueeze(decoder_state, [1]) - - mix = fluid.layers.elementwise_add(encoder_proj, decoder_state) - mix = fluid.layers.tanh(x=mix) - - attn_score = self.fc2(mix) - attn_scores = layers.squeeze(attn_score, [2]) - attn_scores = fluid.layers.softmax(attn_scores) - - scaled = fluid.layers.elementwise_mul( - x=encoder_vec, y=attn_scores, axis=0) - - context = fluid.layers.reduce_sum(scaled, dim=1) - return context - - -class GRUCell(RNNCell): - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation='sigmoid', - candidate_activation='tanh', - origin_mode=False): - super(GRUCell, self).__init__() - self.hidden_size = hidden_size - self.fc_layer = Linear( - input_size, - hidden_size * 3, - param_attr=param_attr, - bias_attr=False) - - self.gru_unit = GRUUnit( - hidden_size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - - def forward(self, inputs, states): - # step_outputs, new_states = cell(step_inputs, states) - # for GRUCell, `step_outputs` and `new_states` both are hidden - x = self.fc_layer(inputs) - hidden, _, _ = self.gru_unit(x, states) - return hidden, 
hidden - - @property - def state_shape(self): - return [self.hidden_size] - - -class EncoderNet(fluid.dygraph.Layer): - def __init__(self, - decoder_size, - rnn_hidden_size=200, - is_test=False, - use_cudnn=True): - super(EncoderNet, self).__init__() - self.rnn_hidden_size = rnn_hidden_size - para_attr = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02)) - bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0) - self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn) - - self.gru_forward_layer = RNN( - cell=GRUCell( - input_size=128 * 6, # channel * h - hidden_size=rnn_hidden_size, - param_attr=para_attr, - bias_attr=bias_attr, - candidate_activation='relu'), - is_reverse=False, - time_major=False) - self.gru_backward_layer = RNN( - cell=GRUCell( - input_size=128 * 6, # channel * h - hidden_size=rnn_hidden_size, - param_attr=para_attr, - bias_attr=bias_attr, - candidate_activation='relu'), - is_reverse=True, - time_major=False) - - self.encoded_proj_fc = Linear( - rnn_hidden_size * 2, decoder_size, bias_attr=False) - - def forward(self, inputs): - conv_features = self.ocr_convs(inputs) - transpose_conv_features = fluid.layers.transpose( - conv_features, perm=[0, 3, 1, 2]) - - sliced_feature = fluid.layers.reshape( - transpose_conv_features, [ - -1, transpose_conv_features.shape[1], - transpose_conv_features.shape[2] * - transpose_conv_features.shape[3] - ], - inplace=False) - - gru_forward, _ = self.gru_forward_layer(sliced_feature) - - gru_backward, _ = self.gru_backward_layer(sliced_feature) - - encoded_vector = fluid.layers.concat( - input=[gru_forward, gru_backward], axis=2) - - encoded_proj = self.encoded_proj_fc(encoded_vector) - - return gru_backward, encoded_vector, encoded_proj - - -class DecoderCell(RNNCell): - def __init__(self, encoder_size, decoder_size): - super(DecoderCell, self).__init__() - self.attention = SimpleAttention(decoder_size) - self.gru_cell = GRUCell( - 
input_size=encoder_size * 2 + - decoder_size, # encoded_vector.shape[-1] + embed_size - hidden_size=decoder_size) - - def forward(self, current_word, states, encoder_vec, encoder_proj): - context = self.attention(encoder_vec, encoder_proj, states) - decoder_inputs = layers.concat([current_word, context], axis=1) - hidden, _ = self.gru_cell(decoder_inputs, states) - return hidden, hidden - - -class GRUDecoderWithAttention(fluid.dygraph.Layer): - def __init__(self, encoder_size, decoder_size, num_classes): - super(GRUDecoderWithAttention, self).__init__() - self.gru_attention = RNN(DecoderCell(encoder_size, decoder_size), - is_reverse=False, - time_major=False) - self.out_layer = Linear( - input_dim=decoder_size, - output_dim=num_classes + 2, - bias_attr=None, - act='softmax') - - def forward(self, inputs, decoder_initial_states, encoder_vec, - encoder_proj): - out, _ = self.gru_attention( - inputs, - initial_states=decoder_initial_states, - encoder_vec=encoder_vec, - encoder_proj=encoder_proj) - predict = self.out_layer(out) - return predict - - -class OCRAttention(Model): - def __init__(self, num_classes, encoder_size, decoder_size, - word_vector_dim): - super(OCRAttention, self).__init__() - self.encoder_net = EncoderNet(decoder_size) - self.fc = Linear( - input_dim=encoder_size, - output_dim=decoder_size, - bias_attr=False, - act='relu') - self.embedding = Embedding( - [num_classes + 2, word_vector_dim], dtype='float32') - self.gru_decoder_with_attention = GRUDecoderWithAttention( - encoder_size, decoder_size, num_classes) - - def forward(self, inputs, label_in): - gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs) - - decoder_boot = self.fc(gru_backward[:, 0]) - trg_embedding = self.embedding(label_in) - prediction = self.gru_decoder_with_attention( - trg_embedding, decoder_boot, encoded_vector, encoded_proj) - - return prediction - - -class CrossEntropyCriterion(Loss): - def __init__(self): - super(CrossEntropyCriterion, self).__init__() - - 
def forward(self, outputs, labels): - predict, (label, mask) = outputs[0], labels - - loss = layers.cross_entropy(predict, label=label, soft_label=False) - loss = layers.elementwise_mul(loss, mask, axis=0) - loss = layers.reduce_sum(loss) - return loss diff --git a/seq2seq/train.py b/seq2seq/train.py index 23dfccb5ad3c5f..9e809b0000052e 100644 --- a/seq2seq/train.py +++ b/seq2seq/train.py @@ -28,7 +28,7 @@ from args import parse_args from seq2seq_base import BaseModel, CrossEntropyCriterion from seq2seq_attn import AttentionModel -from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_train_input +from reader import create_data_loader def do_train(args): @@ -38,7 +38,6 @@ def do_train(args): if args.enable_ce: fluid.default_main_program().random_seed = 102 fluid.default_startup_program().random_seed = 102 - args.shuffle = False # define model inputs = [ @@ -54,64 +53,25 @@ def do_train(args): labels = [Input([None, None, 1], "int64", name="label"), ] # def dataloader - data_loaders = [None, None] - data_prefixes = [args.train_data_prefix, args.eval_data_prefix - ] if args.eval_data_prefix else [args.train_data_prefix] - for i, data_prefix in enumerate(data_prefixes): - dataset = Seq2SeqDataset( - fpattern=data_prefix + "." + args.src_lang, - trg_fpattern=data_prefix + "." + args.tar_lang, - src_vocab_fpath=args.vocab_prefix + "." + args.src_lang, - trg_vocab_fpath=args.vocab_prefix + "." 
+ args.tar_lang, - token_delimiter=None, - start_mark="", - end_mark="", - unk_mark="") - (args.src_vocab_size, args.trg_vocab_size, bos_id, eos_id, - unk_id) = dataset.get_vocab_summary() - batch_sampler = Seq2SeqBatchSampler( - dataset=dataset, - use_token_batch=False, - batch_size=args.batch_size, - pool_size=args.batch_size * 20, - sort_type=SortType.POOL, - shuffle=args.shuffle) - data_loader = DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - places=device, - feed_list=None if fluid.in_dygraph_mode() else - [x.forward() for x in inputs + labels], - collate_fn=partial( - prepare_train_input, - bos_id=bos_id, - eos_id=eos_id, - pad_id=eos_id), - num_workers=0, - return_list=True) - data_loaders[i] = data_loader - train_loader, eval_loader = data_loaders + train_loader, eval_loader = create_data_loader(args, device) model_maker = AttentionModel if args.attention else BaseModel model = model_maker(args.src_vocab_size, args.tar_vocab_size, args.hidden_size, args.hidden_size, args.num_layers, args.dropout) - + optimizer = fluid.optimizer.Adam( + learning_rate=args.learning_rate, parameter_list=model.parameters()) + optimizer._grad_clip = fluid.clip.GradientClipByGlobalNorm( + clip_norm=args.max_grad_norm) model.prepare( - fluid.optimizer.Adam( - learning_rate=args.learning_rate, - parameter_list=model.parameters()), - CrossEntropyCriterion(), - inputs=inputs, - labels=labels) + optimizer, CrossEntropyCriterion(), inputs=inputs, labels=labels) model.fit(train_data=train_loader, eval_data=eval_loader, epochs=args.max_epoch, eval_freq=1, save_freq=1, save_dir=args.model_path, - log_freq=1, - verbose=2) + log_freq=1) if __name__ == "__main__": diff --git a/seq2seq/train_ocr.py b/seq2seq/train_ocr.py deleted file mode 100644 index 2dd7835b225825..00000000000000 --- a/seq2seq/train_ocr.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import print_function - -import os -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -import paddle.fluid.profiler as profiler -import paddle.fluid as fluid - -import data_reader - -from paddle.fluid.dygraph.base import to_variable -import argparse -import functools -from utility import add_arguments, print_arguments, get_attention_feeder_data -from model import Input, set_device -from nets import OCRAttention, CrossEntropyCriterion -from eval import evaluate - -parser = argparse.ArgumentParser(description=__doc__) -add_arg = functools.partial(add_arguments, argparser=parser) -# yapf: disable -add_arg('batch_size', int, 32, "Minibatch size.") -add_arg('epoch_num', int, 30, "Epoch number.") -add_arg('lr', float, 0.001, "Learning rate.") -add_arg('lr_decay_strategy', str, "", "Learning rate decay strategy.") -add_arg('log_period', int, 200, "Log period.") -add_arg('save_model_period', int, 2000, "Save model period. '-1' means never saving the model.") -add_arg('eval_period', int, 2000, "Evaluate period. 
'-1' means never evaluating the model.") -add_arg('save_model_dir', str, "./output", "The directory the model to be saved to.") -add_arg('train_images', str, None, "The directory of images to be used for training.") -add_arg('train_list', str, None, "The list file of images to be used for training.") -add_arg('test_images', str, None, "The directory of images to be used for test.") -add_arg('test_list', str, None, "The list file of images to be used for training.") -add_arg('init_model', str, None, "The init model file of directory.") -add_arg('use_gpu', bool, True, "Whether use GPU to train.") -add_arg('parallel', bool, False, "Whether use parallel training.") -add_arg('profile', bool, False, "Whether to use profiling.") -add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for better performance test.") -add_arg('skip_test', bool, False, "Whether to skip test phase.") -# model hyper paramters -add_arg('encoder_size', int, 200, "Encoder size.") -add_arg('decoder_size', int, 128, "Decoder size.") -add_arg('word_vector_dim', int, 128, "Word vector dim.") -add_arg('num_classes', int, 95, "Number classes.") -add_arg('gradient_clip', float, 5.0, "Gradient clip value.") -add_arg('dynamic', bool, False, "Whether to use dygraph.") - - -def train(args): - device = set_device("gpu" if args.use_gpu else "cpu") - fluid.enable_dygraph(device) if args.dynamic else None - - ocr_attention = OCRAttention(encoder_size=args.encoder_size, decoder_size=args.decoder_size, - num_classes=args.num_classes, word_vector_dim=args.word_vector_dim) - LR = args.lr - if args.lr_decay_strategy == "piecewise_decay": - learning_rate = fluid.layers.piecewise_decay([200000, 250000], [LR, LR * 0.1, LR * 0.01]) - else: - learning_rate = LR - optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, parameter_list=ocr_attention.parameters()) - # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(args.gradient_clip) - - inputs = [ - Input([None, 1, 48, 384], 
"float32", name="pixel"), - Input([None, None], "int64", name="label_in"), - ] - labels = [ - Input([None, None], "int64", name="label_out"), - Input([None, None], "float32", name="mask")] - - ocr_attention.prepare(optimizer, CrossEntropyCriterion(), inputs=inputs, labels=labels) - - - train_reader = data_reader.data_reader( - args.batch_size, - shuffle=True, - images_dir=args.train_images, - list_file=args.train_list, - data_type='train') - - # test_reader = data_reader.data_reader( - # args.batch_size, - # images_dir=args.test_images, - # list_file=args.test_list, - # data_type="test") - - # if not os.path.exists(args.save_model_dir): - # os.makedirs(args.save_model_dir) - total_step = 0 - epoch_num = args.epoch_num - for epoch in range(epoch_num): - batch_id = 0 - total_loss = 0.0 - - for data in train_reader(): - - total_step += 1 - data_dict = get_attention_feeder_data(data) - pixel = data_dict["pixel"] - label_in = data_dict["label_in"].reshape([pixel.shape[0], -1]) - label_out = data_dict["label_out"].reshape([pixel.shape[0], -1]) - mask = data_dict["mask"].reshape(label_out.shape).astype("float32") - - avg_loss = ocr_attention.train(inputs=[pixel, label_in], labels=[label_out, mask])[0] - total_loss += avg_loss - - if True:#batch_id > 0 and batch_id % args.log_period == 0: - print("epoch: {}, batch_id: {}, loss {}".format(epoch, batch_id, - total_loss / args.batch_size / args.log_period)) - total_loss = 0.0 - - batch_id += 1 - - -if __name__ == '__main__': - args = parser.parse_args() - print_arguments(args) - if args.profile: - if args.use_gpu: - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: - train(args) - else: - with profiler.profiler("CPU", sorted_key='total') as cpuprof: - train(args) - else: - train(args) \ No newline at end of file diff --git a/transformer/reader.py b/transformer/reader.py index 66fb8dc02b99f3..8b2d8fa028aff2 100644 --- a/transformer/reader.py +++ b/transformer/reader.py @@ -289,7 +289,6 @@ def __init__(self, 
start_mark="", end_mark="", unk_mark="", - only_src=False, trg_fpattern=None, byte_data=False): if byte_data: From bc039c5959b8d3fad3791285108a0a26f824e379 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 20 Apr 2020 17:41:01 +0800 Subject: [PATCH 07/13] Add greedy search. Add PPL metric. --- seq2seq/predict.py | 7 ++- seq2seq/reader.py | 48 ++++++++++++++------- seq2seq/seq2seq_attn.py | 95 ++++++++++++++++++++++++++++++++++++++--- seq2seq/seq2seq_base.py | 13 +++--- seq2seq/train.py | 74 ++++++++++++++++++++++++++++++-- 5 files changed, 204 insertions(+), 33 deletions(-) diff --git a/seq2seq/predict.py b/seq2seq/predict.py index fef74cfb22fcfe..c51eed2d9e0b59 100644 --- a/seq2seq/predict.py +++ b/seq2seq/predict.py @@ -28,7 +28,7 @@ from model import Input, set_device from args import parse_args from seq2seq_base import BaseInferModel -from seq2seq_attn import AttentionInferModel +from seq2seq_attn import AttentionInferModel, AttentionGreedyInferModel from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_infer_input @@ -87,7 +87,8 @@ def do_predict(args): num_workers=0, return_list=True) - model_maker = AttentionInferModel if args.attention else BaseInferModel + # model_maker = AttentionInferModel if args.attention else BaseInferModel + model_maker = AttentionGreedyInferModel if args.attention else BaseInferModel model = model_maker( args.src_vocab_size, args.tar_vocab_size, @@ -111,6 +112,8 @@ def do_predict(args): with io.open(args.infer_output_file, 'w', encoding='utf-8') as f: for data in data_loader(): finished_seq = model.test(inputs=flatten(data))[0] + finished_seq = finished_seq[:, :, np.newaxis] if len( + finished_seq.shape == 2) else finished_seq finished_seq = np.transpose(finished_seq, [0, 2, 1]) for ins in finished_seq: for beam_idx, beam in enumerate(ins): diff --git a/seq2seq/reader.py b/seq2seq/reader.py index ebdbb47266e2c4..a6fa73faf24496 100644 --- a/seq2seq/reader.py +++ b/seq2seq/reader.py @@ -44,7 +44,8 @@ def 
create_data_loader(args, device, for_train=True): end_mark="", unk_mark="", max_length=args.max_len if i == 0 else None, - truncate=True) + truncate=True, + trg_add_bos_eos=True) (args.src_vocab_size, args.tar_vocab_size, bos_id, eos_id, unk_id) = dataset.get_vocab_summary() batch_sampler = Seq2SeqBatchSampler( @@ -53,7 +54,8 @@ def create_data_loader(args, device, for_train=True): batch_size=args.batch_size, pool_size=args.batch_size * 20, sort_type=SortType.POOL, - shuffle=False if args.enable_ce else True) + shuffle=False if args.enable_ce else True, + distribute_mode=True if i == 0 else False) data_loader = DataLoader( dataset=dataset, batch_sampler=batch_sampler, @@ -73,7 +75,7 @@ def prepare_train_input(insts, bos_id, eos_id, pad_id): src, src_length = pad_batch_data( [inst[0] for inst in insts], pad_id=pad_id) trg, trg_length = pad_batch_data( - [[bos_id] + inst[1] + [eos_id] for inst in insts], pad_id=pad_id) + [inst[1] for inst in insts], pad_id=pad_id) trg_length = trg_length - 1 return src, src_length, trg[:, :-1], trg_length, trg[:, 1:, np.newaxis] @@ -165,9 +167,24 @@ def append(self, info): class SampleInfo(object): def __init__(self, i, lens): self.i = i - # to be consistent with origianl reader implementation - self.min_len = lens[0] - self.max_len = lens[0] + self.lens = lens + + def get_ranges(self, min_length=None, max_length=None, truncate=False): + ranges = [] + # source + if (min_length is None or self.lens[0] >= min_length) and ( + max_length is None or self.lens[0] <= max_length or truncate): + end = max_length if truncate and max_length else self.lens[0] + ranges.append([0, end]) + # target + if len(self.lens) == 2: + if (min_length is None or self.lens[1] >= min_length) and ( + max_length is None or self.lens[1] <= max_length + 2 or + truncate): + end = max_length + 2 if truncate and max_length else self.lens[ + 1] + ranges.append([0, end]) + return ranges if len(ranges) == len(self.lens) else None class MinMaxFilter(object): @@ -197,6 
+214,7 @@ def __init__(self, end_mark="", unk_mark="", trg_fpattern=None, + trg_add_bos_eos=False, byte_data=False, min_length=None, max_length=None, @@ -220,6 +238,7 @@ def __init__(self, self._min_length = min_length self._max_length = max_length self._truncate = truncate + self._trg_add_bos_eos = trg_add_bos_eos self.load_src_trg_ids(fpattern, trg_fpattern) def load_src_trg_ids(self, fpattern, trg_fpattern=None): @@ -238,8 +257,8 @@ def load_src_trg_ids(self, fpattern, trg_fpattern=None): end=self._eos_idx, unk=self._unk_idx, delimiter=self._token_delimiter, - add_beg=False, - add_end=False) + add_beg=True if self._trg_add_bos_eos else False, + add_end=True if self._trg_add_bos_eos else False) converters = ComposedConverter([src_converter, trg_converter]) @@ -252,13 +271,12 @@ def load_src_trg_ids(self, fpattern, trg_fpattern=None): fields = converters(line) lens = [len(field) for field in fields] sample = SampleInfo(i, lens) - if (self._min_length is None or - sample.min_len >= self._min_length) and ( - self._max_length is None or - sample.max_len <= self._max_length or self._truncate): - for field, slot in zip(fields, slots): - slot.append(field[:self._max_length] if self._truncate and - self._max_length is not None else field) + field_ranges = sample.get_ranges(self._min_length, + self._max_length, self._truncate) + if field_ranges: + for field, field_range, slot in zip(fields, field_ranges, + slots): + slot.append(field[field_range[0]:field_range[1]]) self._sample_infos.append(sample) def _load_lines(self, fpattern, trg_fpattern=None): diff --git a/seq2seq/seq2seq_attn.py b/seq2seq/seq2seq_attn.py index c5baee74e83281..507c72aa5a39df 100644 --- a/seq2seq/seq2seq_attn.py +++ b/seq2seq/seq2seq_attn.py @@ -152,7 +152,7 @@ def __init__(self, self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size, num_layers, dropout_prob, init_scale) - def forward(self, src, src_length, trg, trg_length): + def forward(self, src, src_length, trg): # encoder encoder_output, 
encoder_final_state = self.encoder(src, src_length) @@ -174,11 +174,7 @@ def forward(self, src, src_length, trg, trg_length): # decoder with attentioon predict = self.decoder(trg, decoder_initial_states, encoder_output, encoder_padding_mask) - - # for target padding mask - mask = layers.sequence_mask( - trg_length, maxlen=layers.shape(trg)[1], dtype=predict.dtype) - return predict, mask + return predict class AttentionInferModel(AttentionModel): @@ -242,3 +238,90 @@ def forward(self, src, src_length): encoder_output=encoder_output, encoder_padding_mask=encoder_padding_mask) return rs + + +class GreedyEmbeddingHelper(fluid.layers.GreedyEmbeddingHelper): + def __init__(self, embedding_fn, start_tokens, end_token): + if isinstance(start_tokens, int): + self.need_convert_start_tokens = True + self.start_token_value = start_tokens + super(GreedyEmbeddingHelper, self).__init__(embedding_fn, start_tokens, + end_token) + + def initialize(self, batch_ref=None): + if getattr(self, "need_convert_start_tokens", False): + assert batch_ref is not None, ( + "Need to give batch_ref to get batch size " + "to initialize the tensor for start tokens.") + self.start_tokens = fluid.layers.fill_constant_batch_size_like( + input=fluid.layers.utils.flatten(batch_ref)[0], + shape=[-1], + dtype="int64", + value=self.start_token_value, + input_dim_idx=0) + return super(GreedyEmbeddingHelper, self).initialize() + + +class BasicDecoder(fluid.layers.BasicDecoder): + def initialize(self, initial_cell_states): + (initial_inputs, + initial_finished) = self.helper.initialize(initial_cell_states) + return initial_inputs, initial_cell_states, initial_finished + + +class AttentionGreedyInferModel(AttentionModel): + def __init__(self, + src_vocab_size, + trg_vocab_size, + embed_dim, + hidden_size, + num_layers, + dropout_prob=0., + bos_id=0, + eos_id=1, + beam_size=1, + max_out_len=256): + args = dict(locals()) + args.pop("self") + args.pop("__class__", None) # py3 + args.pop("beam_size", None) + 
self.bos_id = args.pop("bos_id") + self.eos_id = args.pop("eos_id") + self.max_out_len = args.pop("max_out_len") + super(AttentionGreedyInferModel, self).__init__(**args) + # dynamic decoder for inference + decoder_helper = GreedyEmbeddingHelper( + start_tokens=bos_id, + end_token=eos_id, + embedding_fn=self.decoder.embedder) + decoder = BasicDecoder( + cell=self.decoder.lstm_attention.cell, + helper=decoder_helper, + output_fn=self.decoder.output_layer) + self.greedy_search_decoder = DynamicDecode( + decoder, max_step_num=max_out_len, is_test=True) + + def forward(self, src, src_length): + # encoding + encoder_output, encoder_final_state = self.encoder(src, src_length) + + # decoder initial states + decoder_initial_states = [ + encoder_final_state, + self.decoder.lstm_attention.cell.get_initial_states( + batch_ref=encoder_output, shape=[self.hidden_size]) + ] + # attention mask to avoid paying attention on padddings + src_mask = layers.sequence_mask( + src_length, + maxlen=layers.shape(src)[1], + dtype=encoder_output.dtype) + encoder_padding_mask = (src_mask - 1.0) * 1e9 + encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1]) + + # dynamic decoding with beam search + rs, _ = self.greedy_search_decoder( + inits=decoder_initial_states, + encoder_output=encoder_output, + encoder_padding_mask=encoder_padding_mask) + return rs.sample_ids diff --git a/seq2seq/seq2seq_base.py b/seq2seq/seq2seq_base.py index b37b871b03f8f7..2cfd8eaa71e681 100644 --- a/seq2seq/seq2seq_base.py +++ b/seq2seq/seq2seq_base.py @@ -27,7 +27,10 @@ def __init__(self): super(CrossEntropyCriterion, self).__init__() def forward(self, outputs, labels): - (predict, mask), label = outputs, labels[0] + predict, (trg_length, label) = outputs[0], labels + # for target padding mask + mask = layers.sequence_mask( + trg_length, maxlen=layers.shape(predict)[1], dtype=predict.dtype) cost = layers.softmax_with_cross_entropy( logits=predict, label=label, soft_label=False) @@ -151,17 +154,13 @@ def 
__init__(self, self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size, num_layers, dropout_prob, init_scale) - def forward(self, src, src_length, trg, trg_length): + def forward(self, src, src_length, trg): # encoder encoder_output, encoder_final_states = self.encoder(src, src_length) # decoder predict = self.decoder(trg, encoder_final_states) - - # for target padding mask - mask = layers.sequence_mask( - trg_length, maxlen=layers.shape(trg)[1], dtype=predict.dtype) - return predict, mask + return predict class BaseInferModel(BaseModel): diff --git a/seq2seq/train.py b/seq2seq/train.py index 9e809b0000052e..a1cd45477c05ee 100644 --- a/seq2seq/train.py +++ b/seq2seq/train.py @@ -24,6 +24,7 @@ from paddle.fluid.io import DataLoader from model import Input, set_device +from metrics import Metric from callbacks import ProgBarLogger from args import parse_args from seq2seq_base import BaseModel, CrossEntropyCriterion @@ -31,6 +32,65 @@ from reader import create_data_loader +class TrainCallback(ProgBarLogger): + def __init__(self, args, ppl, verbose=2): + super(TrainCallback, self).__init__(1, verbose) + # control metric + self.ppl = ppl + self.batch_size = args.batch_size + + def on_train_begin(self, logs=None): + super(TrainCallback, self).on_train_begin(logs) + self.train_metrics += ["ppl"] # remove loss to not print it + self.ppl.reset() + + def on_train_batch_end(self, step, logs=None): + batch_loss = logs["loss"][0] + self.ppl.total_loss += batch_loss * self.batch_size + logs["ppl"] = np.exp(self.ppl.total_loss / self.ppl.word_count) + if step > 0 and step % self.ppl.reset_freq == 0: + self.ppl.reset() + super(TrainCallback, self).on_train_batch_end(step, logs) + + def on_eval_begin(self, logs=None): + super(TrainCallback, self).on_eval_begin(logs) + self.eval_metrics = ["ppl"] + self.ppl.reset() + + def on_eval_batch_end(self, step, logs=None): + batch_loss = logs["loss"][0] + self.ppl.total_loss += batch_loss * self.batch_size + logs["ppl"] = 
np.exp(self.ppl.total_loss / self.ppl.word_count) + super(TrainCallback, self).on_eval_batch_end(step, logs) + + +class PPL(Metric): + def __init__(self, reset_freq=100, name=None): + super(PPL, self).__init__() + self._name = name or "ppl" + self.reset_freq = reset_freq + self.reset() + + def add_metric_op(self, pred, label): + seq_length = label[0] + word_num = fluid.layers.reduce_sum(seq_length) + return word_num + + def update(self, word_num): + self.word_count += word_num + return word_num + + def reset(self): + self.total_loss = 0 + self.word_count = 0 + + def accumulate(self): + return self.word_count + + def name(self): + return self._name + + def do_train(args): device = set_device("gpu" if args.use_gpu else "cpu") fluid.enable_dygraph(device) if args.eager_run else None @@ -47,10 +107,13 @@ def do_train(args): [None], "int64", name="src_length"), Input( [None, None], "int64", name="trg_word"), + ] + labels = [ Input( [None], "int64", name="trg_length"), + Input( + [None, None, 1], "int64", name="label"), ] - labels = [Input([None, None, 1], "int64", name="label"), ] # def dataloader train_loader, eval_loader = create_data_loader(args, device) @@ -63,15 +126,20 @@ def do_train(args): learning_rate=args.learning_rate, parameter_list=model.parameters()) optimizer._grad_clip = fluid.clip.GradientClipByGlobalNorm( clip_norm=args.max_grad_norm) + ppl_metric = PPL() model.prepare( - optimizer, CrossEntropyCriterion(), inputs=inputs, labels=labels) + optimizer, + CrossEntropyCriterion(), + ppl_metric, + inputs=inputs, + labels=labels) model.fit(train_data=train_loader, eval_data=eval_loader, epochs=args.max_epoch, eval_freq=1, save_freq=1, save_dir=args.model_path, - log_freq=1) + callbacks=[TrainCallback(args, ppl_metric)]) if __name__ == "__main__": From 833a0157325125f89918127f07b233179050f061 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 20 Apr 2020 21:39:01 +0800 Subject: [PATCH 08/13] Use create_global_var instead of fill_constant in __init__ to make 
it compatible between dygraph and static-graph. --- seq2seq/predict.py | 2 +- seq2seq/reader.py | 1 + seq2seq/seq2seq_attn.py | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/seq2seq/predict.py b/seq2seq/predict.py index c51eed2d9e0b59..c9120bff126cc5 100644 --- a/seq2seq/predict.py +++ b/seq2seq/predict.py @@ -113,7 +113,7 @@ def do_predict(args): for data in data_loader(): finished_seq = model.test(inputs=flatten(data))[0] finished_seq = finished_seq[:, :, np.newaxis] if len( - finished_seq.shape == 2) else finished_seq + finished_seq.shape) == 2 else finished_seq finished_seq = np.transpose(finished_seq, [0, 2, 1]) for ins in finished_seq: for beam_idx, beam in enumerate(ins): diff --git a/seq2seq/reader.py b/seq2seq/reader.py index a6fa73faf24496..26f5d6a4d1b9c3 100644 --- a/seq2seq/reader.py +++ b/seq2seq/reader.py @@ -168,6 +168,7 @@ class SampleInfo(object): def __init__(self, i, lens): self.i = i self.lens = lens + self.max_len = lens[0] def get_ranges(self, min_length=None, max_length=None, truncate=False): ranges = [] diff --git a/seq2seq/seq2seq_attn.py b/seq2seq/seq2seq_attn.py index 507c72aa5a39df..136b4741d95af9 100644 --- a/seq2seq/seq2seq_attn.py +++ b/seq2seq/seq2seq_attn.py @@ -247,6 +247,8 @@ def __init__(self, embedding_fn, start_tokens, end_token): self.start_token_value = start_tokens super(GreedyEmbeddingHelper, self).__init__(embedding_fn, start_tokens, end_token) + self.end_token = fluid.layers.create_global_var( + shape=[1], dtype="int64", value=end_token, persistable=True) def initialize(self, batch_ref=None): if getattr(self, "need_convert_start_tokens", False): @@ -319,7 +321,7 @@ def forward(self, src, src_length): encoder_padding_mask = (src_mask - 1.0) * 1e9 encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1]) - # dynamic decoding with beam search + # dynamic decoding with greedy search rs, _ = self.greedy_search_decoder( inits=decoder_initial_states, encoder_output=encoder_output, From 
12fb5614944371c7be19901071112dad7f0b83bb Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 21 Apr 2020 20:31:23 +0800 Subject: [PATCH 09/13] Refine PPL and reader for seq2seq. --- seq2seq/README.md | 2 +- seq2seq/args.py | 6 +++ seq2seq/reader.py | 4 +- seq2seq/seq2seq.yaml | 83 ----------------------------------------- seq2seq/seq2seq_base.py | 41 ++++++++++++++++++++ seq2seq/train.py | 71 ++++------------------------------- seq2seq/utility.py | 80 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 138 insertions(+), 149 deletions(-) delete mode 100644 seq2seq/seq2seq.yaml create mode 100644 seq2seq/utility.py diff --git a/seq2seq/README.md b/seq2seq/README.md index ef8bfd17fffc60..99be80a8419189 100644 --- a/seq2seq/README.md +++ b/seq2seq/README.md @@ -151,7 +151,7 @@ python infer.py \ --reload_model attention_models/epoch_10 \ --infer_output_file attention_infer_output/infer_output.txt \ --beam_size 10 \ - --use_gpu True + --use_gpu True \ --eager_run False ``` diff --git a/seq2seq/args.py b/seq2seq/args.py index 9c3911932fd0a6..94b07cd2cfed8a 100644 --- a/seq2seq/args.py +++ b/seq2seq/args.py @@ -88,6 +88,12 @@ def parse_args(): default=5.0, help="max grad norm for global norm clip") + parser.add_argument( + "--log_freq", + type=int, + default=100, + help="The frequency to print training logs") + parser.add_argument( "--model_path", type=str, diff --git a/seq2seq/reader.py b/seq2seq/reader.py index 26f5d6a4d1b9c3..afa88a81058e63 100644 --- a/seq2seq/reader.py +++ b/seq2seq/reader.py @@ -168,7 +168,7 @@ class SampleInfo(object): def __init__(self, i, lens): self.i = i self.lens = lens - self.max_len = lens[0] + self.max_len = lens[0] # to be consitent with the original reader def get_ranges(self, min_length=None, max_length=None, truncate=False): ranges = [] @@ -379,7 +379,7 @@ def __iter__(self): reverse = True for i in range(0, len(infos), self._pool_size): # to avoid placing short next to long sentences - reverse = not reverse + reverse = False # not 
reverse infos[i:i + self._pool_size] = sorted( infos[i:i + self._pool_size], key=lambda x: x.max_len, diff --git a/seq2seq/seq2seq.yaml b/seq2seq/seq2seq.yaml deleted file mode 100644 index 8e0edb72f2faa9..00000000000000 --- a/seq2seq/seq2seq.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# used for continuous evaluation -enable_ce: False - -eager_run: False - -# The frequency to save trained models when training. -save_step: 10000 -# The frequency to fetch and print output when training. -print_step: 100 -# path of the checkpoint, to resume the previous training -init_from_checkpoint: "" -# path of the pretrain model, to better solve the current task -init_from_pretrain_model: "" -# path of trained parameter, to make prediction -init_from_params: "trained_params/step_100000/" -# the directory for saving model -save_model: "trained_models" -# the directory for saving inference model. -inference_model_dir: "infer_model" -# Set seed for CE or debug -random_seed: None -# The pattern to match training data files. -training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de" -# The pattern to match validation data files. -validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de" -# The pattern to match test data files. -predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de" -# The file to output the translation results of predict_file to. -output_file: "predict.txt" -# The path of vocabulary file of source language. -src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000" -# The path of vocabulary file of target language. -trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000" -# The , and tokens in the dictionary. 
-special_token: ["", "", ""] -# max length of sequences -max_length: 256 - -# whether to use cuda -use_cuda: True - -# args for reader, see reader.py for details -token_delimiter: " " -use_token_batch: True -pool_size: 200000 -sort_type: "pool" -shuffle: True -shuffle_batch: True -batch_size: 4096 - -# Hyparams for training: -# the number of epoches for training -epoch: 30 -# the hyper parameters for Adam optimizer. -# This static learning_rate will be multiplied to the LearningRateScheduler -# derived learning rate the to get the final learning rate. -learning_rate: 0.001 - - -# Hyparams for generation: -# the parameters for beam search. -beam_size: 5 -max_out_len: 256 -# the number of decoded sentences to output. -n_best: 1 - -# Hyparams for model: -# These following five vocabularies related configurations will be set -# automatically according to the passed vocabulary path and special tokens. -# size of source word dictionary. -src_vocab_size: 10000 -# size of target word dictionay -trg_vocab_size: 10000 -# index for token -bos_idx: 0 -# index for token -eos_idx: 1 -# index for token -unk_idx: 2 -embed_dim: 512 -hidden_size: 512 -num_layers: 2 -dropout: 0.1 diff --git a/seq2seq/seq2seq_base.py b/seq2seq/seq2seq_base.py index 2cfd8eaa71e681..83fd187d75ce1e 100644 --- a/seq2seq/seq2seq_base.py +++ b/seq2seq/seq2seq_base.py @@ -200,3 +200,44 @@ def forward(self, src, src_length): # dynamic decoding with beam search rs, _ = self.beam_search_decoder(inits=encoder_final_states) return rs + + +class BaseGreedyInferModel(BaseModel): + def __init__(self, + src_vocab_size, + trg_vocab_size, + embed_dim, + hidden_size, + num_layers, + dropout_prob=0., + bos_id=0, + eos_id=1, + beam_size=1, + max_out_len=256): + args = dict(locals()) + args.pop("self") + args.pop("__class__", None) # py3 + args.pop("beam_size", None) + self.bos_id = args.pop("bos_id") + self.eos_id = args.pop("eos_id") + self.max_out_len = args.pop("max_out_len") + super(BaseGreedyInferModel, 
self).__init__(**args) + # dynamic decoder for inference + decoder_helper = GreedyEmbeddingHelper( + start_tokens=bos_id, + end_token=eos_id, + embedding_fn=self.decoder.embedder) + decoder = BasicDecoder( + cell=self.decoder.stack_lstm.cell, + helper=decoder_helper, + output_fn=self.decoder.output_layer) + self.greedy_search_decoder = DynamicDecode( + decoder, max_step_num=max_out_len, is_test=True) + + def forward(self, src, src_length): + # encoding + encoder_output, encoder_final_states = self.encoder(src, src_length) + + # dynamic decoding with greedy search + rs, _ = self.greedy_search_decoder(inits=encoder_final_states) + return rs.sample_ids diff --git a/seq2seq/train.py b/seq2seq/train.py index a1cd45477c05ee..4502628737b1af 100644 --- a/seq2seq/train.py +++ b/seq2seq/train.py @@ -30,65 +30,7 @@ from seq2seq_base import BaseModel, CrossEntropyCriterion from seq2seq_attn import AttentionModel from reader import create_data_loader - - -class TrainCallback(ProgBarLogger): - def __init__(self, args, ppl, verbose=2): - super(TrainCallback, self).__init__(1, verbose) - # control metric - self.ppl = ppl - self.batch_size = args.batch_size - - def on_train_begin(self, logs=None): - super(TrainCallback, self).on_train_begin(logs) - self.train_metrics += ["ppl"] # remove loss to not print it - self.ppl.reset() - - def on_train_batch_end(self, step, logs=None): - batch_loss = logs["loss"][0] - self.ppl.total_loss += batch_loss * self.batch_size - logs["ppl"] = np.exp(self.ppl.total_loss / self.ppl.word_count) - if step > 0 and step % self.ppl.reset_freq == 0: - self.ppl.reset() - super(TrainCallback, self).on_train_batch_end(step, logs) - - def on_eval_begin(self, logs=None): - super(TrainCallback, self).on_eval_begin(logs) - self.eval_metrics = ["ppl"] - self.ppl.reset() - - def on_eval_batch_end(self, step, logs=None): - batch_loss = logs["loss"][0] - self.ppl.total_loss += batch_loss * self.batch_size - logs["ppl"] = np.exp(self.ppl.total_loss / 
self.ppl.word_count) - super(TrainCallback, self).on_eval_batch_end(step, logs) - - -class PPL(Metric): - def __init__(self, reset_freq=100, name=None): - super(PPL, self).__init__() - self._name = name or "ppl" - self.reset_freq = reset_freq - self.reset() - - def add_metric_op(self, pred, label): - seq_length = label[0] - word_num = fluid.layers.reduce_sum(seq_length) - return word_num - - def update(self, word_num): - self.word_count += word_num - return word_num - - def reset(self): - self.total_loss = 0 - self.word_count = 0 - - def accumulate(self): - return self.word_count - - def name(self): - return self._name +from utility import PPL, TrainCallback def do_train(args): @@ -122,10 +64,13 @@ def do_train(args): model = model_maker(args.src_vocab_size, args.tar_vocab_size, args.hidden_size, args.hidden_size, args.num_layers, args.dropout) - optimizer = fluid.optimizer.Adam( - learning_rate=args.learning_rate, parameter_list=model.parameters()) - optimizer._grad_clip = fluid.clip.GradientClipByGlobalNorm( + grad_clip = fluid.clip.GradientClipByGlobalNorm( clip_norm=args.max_grad_norm) + optimizer = fluid.optimizer.Adam( + learning_rate=args.learning_rate, + parameter_list=model.parameters(), + grad_clip=grad_clip) + ppl_metric = PPL() model.prepare( optimizer, @@ -139,7 +84,7 @@ def do_train(args): eval_freq=1, save_freq=1, save_dir=args.model_path, - callbacks=[TrainCallback(args, ppl_metric)]) + callbacks=[TrainCallback(ppl_metric, args.log_freq)]) if __name__ == "__main__": diff --git a/seq2seq/utility.py b/seq2seq/utility.py new file mode 100644 index 00000000000000..eb54e8c70e5a17 --- /dev/null +++ b/seq2seq/utility.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle.fluid as fluid + +from metrics import Metric +from callbacks import ProgBarLogger + + +class TrainCallback(ProgBarLogger): + def __init__(self, ppl, log_freq, verbose=2): + super(TrainCallback, self).__init__(log_freq, verbose) + self.ppl = ppl + + def on_train_begin(self, logs=None): + super(TrainCallback, self).on_train_begin(logs) + self.train_metrics = ["ppl"] # remove loss to not print it + + def on_epoch_begin(self, epoch=None, logs=None): + super(TrainCallback, self).on_epoch_begin(epoch, logs) + self.ppl.reset() + + def on_train_batch_end(self, step, logs=None): + logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"]) + if step > 0 and step % self.ppl.reset_freq == 0: + self.ppl.reset() + super(TrainCallback, self).on_train_batch_end(step, logs) + + def on_eval_begin(self, logs=None): + super(TrainCallback, self).on_eval_begin(logs) + self.eval_metrics = ["ppl"] + self.ppl.reset() + + def on_eval_batch_end(self, step, logs=None): + logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"]) + super(TrainCallback, self).on_eval_batch_end(step, logs) + + +class PPL(Metric): + def __init__(self, reset_freq=100, name=None): + super(PPL, self).__init__() + self._name = name or "ppl" + self.reset_freq = reset_freq + self.reset() + + def add_metric_op(self, pred, label): + seq_length = label[0] + word_num = fluid.layers.reduce_sum(seq_length) + return word_num + + def update(self, word_num): + self.word_count += word_num[0] + return word_num + + def reset(self): + self.total_loss = 0 + 
self.word_count = 0 + + def accumulate(self): + return self.word_count + + def name(self): + return self._name + + def cal_acc_ppl(self, batch_loss, batch_size): + self.total_loss += batch_loss * batch_size + ppl = np.exp(self.total_loss / self.word_count) + return ppl \ No newline at end of file From 0a326f39c1ee1814c2b9bdc6ca5c78034b7efd6e Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 21 Apr 2020 21:34:43 +0800 Subject: [PATCH 10/13] Update seq2seq to adapt to latest code. --- {seq2seq => examples/seq2seq}/README.md | 0 {seq2seq => examples/seq2seq}/args.py | 0 {seq2seq => examples/seq2seq}/download.py | 0 {seq2seq => examples/seq2seq}/predict.py | 9 +- {seq2seq => examples/seq2seq}/reader.py | 0 {seq2seq => examples/seq2seq}/run.sh | 0 {seq2seq => examples/seq2seq}/seq2seq_attn.py | 94 +------------------ {seq2seq => examples/seq2seq}/seq2seq_base.py | 46 +-------- {seq2seq => examples/seq2seq}/train.py | 6 +- {seq2seq => examples/seq2seq}/utility.py | 4 +- hapi/text/text.py | 5 +- 11 files changed, 15 insertions(+), 149 deletions(-) rename {seq2seq => examples/seq2seq}/README.md (100%) rename {seq2seq => examples/seq2seq}/args.py (100%) rename {seq2seq => examples/seq2seq}/download.py (100%) rename {seq2seq => examples/seq2seq}/predict.py (92%) rename {seq2seq => examples/seq2seq}/reader.py (100%) rename {seq2seq => examples/seq2seq}/run.sh (100%) rename {seq2seq => examples/seq2seq}/seq2seq_attn.py (71%) rename {seq2seq => examples/seq2seq}/seq2seq_base.py (82%) rename {seq2seq => examples/seq2seq}/train.py (93%) rename {seq2seq => examples/seq2seq}/utility.py (97%) diff --git a/seq2seq/README.md b/examples/seq2seq/README.md similarity index 100% rename from seq2seq/README.md rename to examples/seq2seq/README.md diff --git a/seq2seq/args.py b/examples/seq2seq/args.py similarity index 100% rename from seq2seq/args.py rename to examples/seq2seq/args.py diff --git a/seq2seq/download.py b/examples/seq2seq/download.py similarity index 100% rename from 
seq2seq/download.py rename to examples/seq2seq/download.py diff --git a/seq2seq/predict.py b/examples/seq2seq/predict.py similarity index 92% rename from seq2seq/predict.py rename to examples/seq2seq/predict.py index c9120bff126cc5..ae8d11be0de3b3 100644 --- a/seq2seq/predict.py +++ b/examples/seq2seq/predict.py @@ -15,8 +15,6 @@ import logging import os import io -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import random from functools import partial @@ -25,10 +23,10 @@ from paddle.fluid.layers.utils import flatten from paddle.fluid.io import DataLoader -from model import Input, set_device +from hapi.model import Input, set_device from args import parse_args from seq2seq_base import BaseInferModel -from seq2seq_attn import AttentionInferModel, AttentionGreedyInferModel +from seq2seq_attn import AttentionInferModel from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_infer_input @@ -87,8 +85,7 @@ def do_predict(args): num_workers=0, return_list=True) - # model_maker = AttentionInferModel if args.attention else BaseInferModel - model_maker = AttentionGreedyInferModel if args.attention else BaseInferModel + model_maker = AttentionInferModel if args.attention else BaseInferModel model = model_maker( args.src_vocab_size, args.tar_vocab_size, diff --git a/seq2seq/reader.py b/examples/seq2seq/reader.py similarity index 100% rename from seq2seq/reader.py rename to examples/seq2seq/reader.py diff --git a/seq2seq/run.sh b/examples/seq2seq/run.sh similarity index 100% rename from seq2seq/run.sh rename to examples/seq2seq/run.sh diff --git a/seq2seq/seq2seq_attn.py b/examples/seq2seq/seq2seq_attn.py similarity index 71% rename from seq2seq/seq2seq_attn.py rename to examples/seq2seq/seq2seq_attn.py index 136b4741d95af9..ce9cc089ca2133 100644 --- a/seq2seq/seq2seq_attn.py +++ b/examples/seq2seq/seq2seq_attn.py @@ -19,8 +19,9 @@ from paddle.fluid.dygraph import Embedding, Linear, Layer from paddle.fluid.layers 
import BeamSearchDecoder -from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell -from model import Model, Loss +from hapi.model import Model, Loss +from hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell + from seq2seq_base import Encoder @@ -238,92 +239,3 @@ def forward(self, src, src_length): encoder_output=encoder_output, encoder_padding_mask=encoder_padding_mask) return rs - - -class GreedyEmbeddingHelper(fluid.layers.GreedyEmbeddingHelper): - def __init__(self, embedding_fn, start_tokens, end_token): - if isinstance(start_tokens, int): - self.need_convert_start_tokens = True - self.start_token_value = start_tokens - super(GreedyEmbeddingHelper, self).__init__(embedding_fn, start_tokens, - end_token) - self.end_token = fluid.layers.create_global_var( - shape=[1], dtype="int64", value=end_token, persistable=True) - - def initialize(self, batch_ref=None): - if getattr(self, "need_convert_start_tokens", False): - assert batch_ref is not None, ( - "Need to give batch_ref to get batch size " - "to initialize the tensor for start tokens.") - self.start_tokens = fluid.layers.fill_constant_batch_size_like( - input=fluid.layers.utils.flatten(batch_ref)[0], - shape=[-1], - dtype="int64", - value=self.start_token_value, - input_dim_idx=0) - return super(GreedyEmbeddingHelper, self).initialize() - - -class BasicDecoder(fluid.layers.BasicDecoder): - def initialize(self, initial_cell_states): - (initial_inputs, - initial_finished) = self.helper.initialize(initial_cell_states) - return initial_inputs, initial_cell_states, initial_finished - - -class AttentionGreedyInferModel(AttentionModel): - def __init__(self, - src_vocab_size, - trg_vocab_size, - embed_dim, - hidden_size, - num_layers, - dropout_prob=0., - bos_id=0, - eos_id=1, - beam_size=1, - max_out_len=256): - args = dict(locals()) - args.pop("self") - args.pop("__class__", None) # py3 - args.pop("beam_size", None) - self.bos_id = args.pop("bos_id") - self.eos_id = args.pop("eos_id") - self.max_out_len = 
args.pop("max_out_len") - super(AttentionGreedyInferModel, self).__init__(**args) - # dynamic decoder for inference - decoder_helper = GreedyEmbeddingHelper( - start_tokens=bos_id, - end_token=eos_id, - embedding_fn=self.decoder.embedder) - decoder = BasicDecoder( - cell=self.decoder.lstm_attention.cell, - helper=decoder_helper, - output_fn=self.decoder.output_layer) - self.greedy_search_decoder = DynamicDecode( - decoder, max_step_num=max_out_len, is_test=True) - - def forward(self, src, src_length): - # encoding - encoder_output, encoder_final_state = self.encoder(src, src_length) - - # decoder initial states - decoder_initial_states = [ - encoder_final_state, - self.decoder.lstm_attention.cell.get_initial_states( - batch_ref=encoder_output, shape=[self.hidden_size]) - ] - # attention mask to avoid paying attention on padddings - src_mask = layers.sequence_mask( - src_length, - maxlen=layers.shape(src)[1], - dtype=encoder_output.dtype) - encoder_padding_mask = (src_mask - 1.0) * 1e9 - encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1]) - - # dynamic decoding with greedy search - rs, _ = self.greedy_search_decoder( - inits=decoder_initial_states, - encoder_output=encoder_output, - encoder_padding_mask=encoder_padding_mask) - return rs.sample_ids diff --git a/seq2seq/seq2seq_base.py b/examples/seq2seq/seq2seq_base.py similarity index 82% rename from seq2seq/seq2seq_base.py rename to examples/seq2seq/seq2seq_base.py index 83fd187d75ce1e..c28e2dc5293552 100644 --- a/seq2seq/seq2seq_base.py +++ b/examples/seq2seq/seq2seq_base.py @@ -18,8 +18,9 @@ from paddle.fluid.initializer import UniformInitializer from paddle.fluid.dygraph import Embedding, Linear, Layer from paddle.fluid.layers import BeamSearchDecoder -from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell -from model import Model, Loss + +from hapi.model import Model, Loss +from hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell class CrossEntropyCriterion(Loss): @@ -200,44 +201,3 
@@ def forward(self, src, src_length): # dynamic decoding with beam search rs, _ = self.beam_search_decoder(inits=encoder_final_states) return rs - - -class BaseGreedyInferModel(BaseModel): - def __init__(self, - src_vocab_size, - trg_vocab_size, - embed_dim, - hidden_size, - num_layers, - dropout_prob=0., - bos_id=0, - eos_id=1, - beam_size=1, - max_out_len=256): - args = dict(locals()) - args.pop("self") - args.pop("__class__", None) # py3 - args.pop("beam_size", None) - self.bos_id = args.pop("bos_id") - self.eos_id = args.pop("eos_id") - self.max_out_len = args.pop("max_out_len") - super(BaseGreedyInferModel, self).__init__(**args) - # dynamic decoder for inference - decoder_helper = GreedyEmbeddingHelper( - start_tokens=bos_id, - end_token=eos_id, - embedding_fn=self.decoder.embedder) - decoder = BasicDecoder( - cell=self.decoder.stack_lstm.cell, - helper=decoder_helper, - output_fn=self.decoder.output_layer) - self.greedy_search_decoder = DynamicDecode( - decoder, max_step_num=max_out_len, is_test=True) - - def forward(self, src, src_length): - # encoding - encoder_output, encoder_final_states = self.encoder(src, src_length) - - # dynamic decoding with greedy search - rs, _ = self.greedy_search_decoder(inits=encoder_final_states) - return rs.sample_ids diff --git a/seq2seq/train.py b/examples/seq2seq/train.py similarity index 93% rename from seq2seq/train.py rename to examples/seq2seq/train.py index 4502628737b1af..8a0190cee1865e 100644 --- a/seq2seq/train.py +++ b/examples/seq2seq/train.py @@ -14,8 +14,6 @@ import logging import os -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import random from functools import partial @@ -23,9 +21,7 @@ import paddle.fluid as fluid from paddle.fluid.io import DataLoader -from model import Input, set_device -from metrics import Metric -from callbacks import ProgBarLogger +from hapi.model import Input, set_device from args import parse_args from seq2seq_base import BaseModel, 
CrossEntropyCriterion from seq2seq_attn import AttentionModel diff --git a/seq2seq/utility.py b/examples/seq2seq/utility.py similarity index 97% rename from seq2seq/utility.py rename to examples/seq2seq/utility.py index eb54e8c70e5a17..a1c1264eb195df 100644 --- a/seq2seq/utility.py +++ b/examples/seq2seq/utility.py @@ -15,8 +15,8 @@ import numpy as np import paddle.fluid as fluid -from metrics import Metric -from callbacks import ProgBarLogger +from hapi.metrics import Metric +from hapi.callbacks import ProgBarLogger class TrainCallback(ProgBarLogger): diff --git a/hapi/text/text.py b/hapi/text/text.py index e5be32bcb531b9..319800d46597f1 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -238,8 +238,9 @@ def __init__(self, self._bias_attr = bias_attr self._gate_activation = gate_activation or layers.sigmoid self._activation = activation or layers.tanh - self._forget_bias = layers.fill_constant( - [1], dtype=dtype, value=forget_bias) + # TODO(guosheng): find better way to resolve constants in __init__ + self._forget_bias = layers.create_global_var( + shape=[1], dtype=dtype, value=forget_bias, persistable=True) self._forget_bias.stop_gradient = False self._dtype = dtype self._input_size = input_size From 79066ac6ca9cab6668bbe27ae43dc1d90bc0a472 Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 21 Apr 2020 23:02:47 +0800 Subject: [PATCH 11/13] Fix sequence_length when None for RNN --- hapi/text/text.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 319800d46597f1..ee74c516437a36 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -818,7 +818,7 @@ def _maybe_copy(state, new_state, step_mask): lambda x: fluid.layers.transpose(x, [1, 0] + list( range(2, len(x.shape)))), inputs) - if sequence_length: + if sequence_length is not None: mask = fluid.layers.sequence_mask( sequence_length, maxlen=time_steps, @@ -829,7 +829,7 @@ def _maybe_copy(state, new_state, step_mask): inputs = 
map_structure( lambda x: fluid.layers.reverse(x, axis=[0]), inputs) mask = fluid.layers.reverse( - mask, axis=[0]) if sequence_length else None + mask, axis=[0]) if sequence_length is not None else None states = initial_states outputs = [] @@ -837,7 +837,7 @@ def _maybe_copy(state, new_state, step_mask): step_inputs = map_structure(lambda x: x[i], inputs) step_outputs, new_states = self.cell(step_inputs, states, **kwargs) - if sequence_length: + if sequence_length is not None: new_states = map_structure( partial( _maybe_copy, step_mask=mask[i]), From ecb6d64c93e4c1a981adad83dddea10f17de7b00 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 22 Apr 2020 01:21:21 +0800 Subject: [PATCH 12/13] Update metric of seq2seq to adapt to latest code. --- examples/seq2seq/README.md | 37 +++++++++++-------------------------- examples/seq2seq/predict.py | 2 +- examples/seq2seq/run.sh | 19 ------------------- examples/seq2seq/train.py | 2 +- examples/seq2seq/utility.py | 10 +++++----- 5 files changed, 18 insertions(+), 52 deletions(-) delete mode 100644 examples/seq2seq/run.sh diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md index 99be80a8419189..808d4516c57593 100644 --- a/examples/seq2seq/README.md +++ b/examples/seq2seq/README.md @@ -11,11 +11,9 @@ ├── reader.py # 数据读入程序 ├── download.py # 数据下载程序 ├── train.py # 训练主程序 -├── infer.py # 预测主程序 -├── run.sh # 默认配置的启动脚本 -├── infer.sh # 默认配置的解码脚本 -├── attention_model.py # 带注意力机制的翻译模型程序 -└── base_model.py # 无注意力机制的翻译模型程序 +├── predict.py # 预测主程序 +├── seq2seq_attn.py # 带注意力机制的翻译模型程序 +└── seq2seq_base.py # 无注意力机制的翻译模型程序 ``` ## 简介 @@ -40,13 +38,7 @@ python download.py ## 模型训练 -`run.sh`包含训练程序的主函数,要使用默认参数开始训练,只需要简单地执行: - -``` -sh run.sh -``` - -默认使用带有注意力机制的RNN模型,可以通过修改 `attention` 参数为False来训练不带注意力机制的RNN模型。 +执行以下命令即可训练带有注意力机制的Seq2Seq机器翻译模型: ```sh export CUDA_VISIBLE_DEVICES=0 @@ -70,8 +62,7 @@ python train.py \ --model_path ./attention_models ``` -训练程序会在每个epoch训练结束之后,save一次模型。 - +可以通过修改 `attention` 
参数为False来训练不带注意力机制的Seq2Seq模型,各参数的具体说明请参阅 `args.py` 。训练程序会在每个epoch训练结束之后,save一次模型。 默认使用动态图模式进行训练,可以通过设置 `eager_run` 参数为False来以静态图模式进行训练,如下: @@ -100,13 +91,7 @@ python train.py \ ## 模型预测 -当模型训练完成之后, 可以利用infer.sh的脚本进行预测,默认使用beam search的方法进行预测,加载第10个epoch的模型进行预测,对test的数据集进行解码 - -``` -sh infer.sh -``` - -如果想预测别的数据文件,只需要将 --infer_file参数进行修改。 +训练完成之后,可以使用保存的模型(由 `--reload_model` 指定)对test的数据集(由 `--infer_file` 指定)进行beam search解码,命令如下: ```sh export CUDA_VISIBLE_DEVICES=0 @@ -124,13 +109,13 @@ python infer.py \ --max_grad_norm 5.0 \ --vocab_prefix data/en-vi/vocab \ --infer_file data/en-vi/tst2013.en \ - --reload_model attention_models/epoch_10 \ - --infer_output_file attention_infer_output/infer_output.txt \ + --reload_model attention_models/10 \ + --infer_output_file infer_output.txt \ --beam_size 10 \ --use_gpu True ``` -和训练类似,预测时同样可以以静态图模式进行,如下: +各参数的具体说明请参阅 `args.py` ,注意预测时所用模型超参数需和训练时一致。和训练类似,预测时同样可以以静态图模式进行,如下: ```sh export CUDA_VISIBLE_DEVICES=0 @@ -148,8 +133,8 @@ python infer.py \ --max_grad_norm 5.0 \ --vocab_prefix data/en-vi/vocab \ --infer_file data/en-vi/tst2013.en \ - --reload_model attention_models/epoch_10 \ - --infer_output_file attention_infer_output/infer_output.txt \ + --reload_model attention_models/10 \ + --infer_output_file infer_output.txt \ --beam_size 10 \ --use_gpu True \ --eager_run False diff --git a/examples/seq2seq/predict.py b/examples/seq2seq/predict.py index ae8d11be0de3b3..d1e3e87fddf05d 100644 --- a/examples/seq2seq/predict.py +++ b/examples/seq2seq/predict.py @@ -108,7 +108,7 @@ def do_predict(args): # TODO(guosheng): use model.predict when support variant length with io.open(args.infer_output_file, 'w', encoding='utf-8') as f: for data in data_loader(): - finished_seq = model.test(inputs=flatten(data))[0] + finished_seq = model.test_batch(inputs=flatten(data))[0] finished_seq = finished_seq[:, :, np.newaxis] if len( finished_seq.shape) == 2 else finished_seq finished_seq = np.transpose(finished_seq, [0, 2, 1]) diff --git 
a/examples/seq2seq/run.sh b/examples/seq2seq/run.sh deleted file mode 100644 index 4872fc996a8a86..00000000000000 --- a/examples/seq2seq/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -export CUDA_VISIBLE_DEVICES=0 - -python train.py \ - --src_lang en --tar_lang vi \ - --attention True \ - --num_layers 2 \ - --hidden_size 512 \ - --src_vocab_size 17191 \ - --tar_vocab_size 7709 \ - --batch_size 128 \ - --dropout 0.2 \ - --init_scale 0.1 \ - --max_grad_norm 5.0 \ - --train_data_prefix data/en-vi/train \ - --eval_data_prefix data/en-vi/tst2012 \ - --test_data_prefix data/en-vi/tst2013 \ - --vocab_prefix data/en-vi/vocab \ - --use_gpu True \ - --model_path attention_models \ No newline at end of file diff --git a/examples/seq2seq/train.py b/examples/seq2seq/train.py index 8a0190cee1865e..b7dc7698e31b1b 100644 --- a/examples/seq2seq/train.py +++ b/examples/seq2seq/train.py @@ -67,7 +67,7 @@ def do_train(args): parameter_list=model.parameters(), grad_clip=grad_clip) - ppl_metric = PPL() + ppl_metric = PPL(reset_freq=100) # ppl for every 100 batches model.prepare( optimizer, CrossEntropyCriterion(), diff --git a/examples/seq2seq/utility.py b/examples/seq2seq/utility.py index a1c1264eb195df..aa0dd4a461d24d 100644 --- a/examples/seq2seq/utility.py +++ b/examples/seq2seq/utility.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np +import math + import paddle.fluid as fluid from hapi.metrics import Metric @@ -55,13 +56,12 @@ def __init__(self, reset_freq=100, name=None): self.reset_freq = reset_freq self.reset() - def add_metric_op(self, pred, label): - seq_length = label[0] + def add_metric_op(self, pred, seq_length, label): word_num = fluid.layers.reduce_sum(seq_length) return word_num def update(self, word_num): - self.word_count += word_num[0] + self.word_count += word_num return word_num def reset(self): @@ -76,5 +76,5 @@ def name(self): def cal_acc_ppl(self, batch_loss, batch_size): self.total_loss += batch_loss * batch_size - ppl = np.exp(self.total_loss / self.word_count) + ppl = math.exp(self.total_loss / self.word_count) return ppl \ No newline at end of file From 666339f51216f2db02ac80d56b42fe9f6faed779 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 23 Apr 2020 01:05:15 +0800 Subject: [PATCH 13/13] Refine transformer batch length and move transformer to examples. Refine len for data_loader in model.py. 
--- .../transformer}/README.md | 2 +- .../transformer}/gen_data.sh | 0 .../images/multi_head_attention.png | Bin .../images/transformer_network.png | Bin .../transformer}/predict.py | 9 ++--- .../transformer}/reader.py | 13 ++++--- .../transformer}/train.py | 33 +++++++++++++---- .../transformer}/transformer.py | 4 +- .../transformer}/transformer.yaml | 0 .../transformer}/utils/__init__.py | 0 .../transformer}/utils/check.py | 0 .../transformer}/utils/configure.py | 16 +++++--- hapi/callbacks.py | 12 +++--- hapi/model.py | 35 +++++++++++------- 14 files changed, 77 insertions(+), 47 deletions(-) rename {transformer => examples/transformer}/README.md (99%) rename {transformer => examples/transformer}/gen_data.sh (100%) rename {transformer => examples/transformer}/images/multi_head_attention.png (100%) rename {transformer => examples/transformer}/images/transformer_network.png (100%) rename {transformer => examples/transformer}/predict.py (94%) rename {transformer => examples/transformer}/reader.py (97%) rename {transformer => examples/transformer}/train.py (83%) rename {transformer => examples/transformer}/transformer.py (99%) rename {transformer => examples/transformer}/transformer.yaml (100%) rename {transformer => examples/transformer}/utils/__init__.py (100%) rename {transformer => examples/transformer}/utils/check.py (100%) rename {transformer => examples/transformer}/utils/configure.py (95%) diff --git a/transformer/README.md b/examples/transformer/README.md similarity index 99% rename from transformer/README.md rename to examples/transformer/README.md index 2c4c22b91788a0..0c785de8a26210 100644 --- a/transformer/README.md +++ b/examples/transformer/README.md @@ -201,7 +201,7 @@ python -u predict.py \ --special_token '' '' '' \ --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ --batch_size 32 \ - --init_from_params base_model_dygraph/step_100000/transformer \ + --init_from_params big_model_dygraph/step_100000/transformer \ 
--beam_size 5 \ --max_out_len 255 \ --output_file predict.txt \ diff --git a/transformer/gen_data.sh b/examples/transformer/gen_data.sh similarity index 100% rename from transformer/gen_data.sh rename to examples/transformer/gen_data.sh diff --git a/transformer/images/multi_head_attention.png b/examples/transformer/images/multi_head_attention.png similarity index 100% rename from transformer/images/multi_head_attention.png rename to examples/transformer/images/multi_head_attention.png diff --git a/transformer/images/transformer_network.png b/examples/transformer/images/transformer_network.png similarity index 100% rename from transformer/images/transformer_network.png rename to examples/transformer/images/transformer_network.png diff --git a/transformer/predict.py b/examples/transformer/predict.py similarity index 94% rename from transformer/predict.py rename to examples/transformer/predict.py index b83d5403486c1e..a6e14314f523d7 100644 --- a/transformer/predict.py +++ b/examples/transformer/predict.py @@ -14,9 +14,6 @@ import logging import os -import six -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from functools import partial import numpy as np @@ -28,9 +25,9 @@ from utils.configure import PDConfig from utils.check import check_gpu, check_version -from model import Input, set_device +from hapi.model import Input, set_device from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler -from transformer import InferTransformer, position_encoding_init +from transformer import InferTransformer def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, @@ -132,7 +129,7 @@ def do_predict(args): # TODO: use model.predict when support variant length f = open(args.output_file, "wb") for data in data_loader(): - finished_seq = transformer.test(inputs=flatten(data))[0] + finished_seq = transformer.test_batch(inputs=flatten(data))[0] finished_seq = np.transpose(finished_seq, [0, 2, 1]) for ins in finished_seq: 
for beam_idx, beam in enumerate(ins): diff --git a/transformer/reader.py b/examples/transformer/reader.py similarity index 97% rename from transformer/reader.py rename to examples/transformer/reader.py index 2e3fc59e0d3a85..f6891df960b66f 100644 --- a/transformer/reader.py +++ b/examples/transformer/reader.py @@ -13,7 +13,7 @@ # limitations under the License. import glob -import six +import sys import os import io import itertools @@ -26,7 +26,7 @@ def create_data_loader(args, device): - data_loaders = [None, None] + data_loaders = [(None, None)] * 2 data_files = [args.training_file, args.validation_file ] if args.validation_file else [args.training_file] for i, data_file in enumerate(data_files): @@ -65,7 +65,7 @@ def create_data_loader(args, device): n_head=args.n_head), num_workers=0, # TODO: use multi-process return_list=True) - data_loaders[i] = data_loader + data_loaders[i] = (data_loader, batch_sampler.__len__) return data_loaders @@ -476,6 +476,7 @@ def __iter__(self): for i in range(self._nranks) ] for batch in batches] batches = list(itertools.chain.from_iterable(batches)) + self.batch_number = (len(batches) + self._nranks - 1) // self._nranks # for multi-device for batch_id, batch in enumerate(batches): @@ -489,11 +490,13 @@ def __iter__(self): yield batch_indices def __len__(self): + if hasattr(self, "batch_number"): # + return self.batch_number if not self._use_token_batch: batch_number = ( len(self._dataset) + self._batch_size * self._nranks - 1) // ( self._batch_size * self._nranks) else: - # TODO(guosheng): fix the uncertain length - batch_number = 1 + # for uncertain batch number, the actual value is self.batch_number + batch_number = sys.maxsize return batch_number diff --git a/transformer/train.py b/examples/transformer/train.py similarity index 83% rename from transformer/train.py rename to examples/transformer/train.py index 04a61f83a0191a..94b52b4423839a 100644 --- a/transformer/train.py +++ b/examples/transformer/train.py @@ -14,9 +14,6 @@ 
import logging import os -import six -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import numpy as np import paddle @@ -26,14 +23,18 @@ from utils.configure import PDConfig from utils.check import check_gpu, check_version -from model import Input, set_device -from callbacks import ProgBarLogger +from hapi.model import Input, set_device +from hapi.callbacks import ProgBarLogger from reader import create_data_loader from transformer import Transformer, CrossEntropyCriterion class TrainCallback(ProgBarLogger): - def __init__(self, args, verbose=2): + def __init__(self, + args, + verbose=2, + train_steps_fn=None, + eval_steps_fn=None): # TODO(guosheng): save according to step super(TrainCallback, self).__init__(args.print_step, verbose) # the best cross-entropy value with label smoothing @@ -42,11 +43,17 @@ def __init__(self, args, verbose=2): (1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) self.loss_normalizer = loss_normalizer + self.train_steps_fn = train_steps_fn + self.eval_steps_fn = eval_steps_fn def on_train_begin(self, logs=None): super(TrainCallback, self).on_train_begin(logs) self.train_metrics += ["normalized loss", "ppl"] + def on_train_batch_begin(self, step, logs=None): + if step == 0 and self.train_steps_fn: + self.train_progbar._num = self.train_steps_fn() + def on_train_batch_end(self, step, logs=None): logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer logs["ppl"] = np.exp(min(logs["loss"][0], 100)) @@ -57,6 +64,10 @@ def on_eval_begin(self, logs=None): self.eval_metrics = list( self.eval_metrics) + ["normalized loss", "ppl"] + def on_eval_batch_begin(self, step, logs=None): + if step == 0 and self.eval_steps_fn: + self.eval_progbar._num = self.eval_steps_fn() + def on_eval_batch_end(self, step, logs=None): logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer logs["ppl"] = np.exp(min(logs["loss"][0], 100)) 
@@ -104,7 +115,8 @@ def do_train(args): ] # def dataloader - train_loader, eval_loader = create_data_loader(args, device) + (train_loader, train_steps_fn), ( + eval_loader, eval_steps_fn) = create_data_loader(args, device) # define model transformer = Transformer( @@ -142,7 +154,12 @@ def do_train(args): eval_freq=1, save_freq=1, save_dir=args.save_model, - callbacks=[TrainCallback(args)]) + callbacks=[ + TrainCallback( + args, + train_steps_fn=train_steps_fn, + eval_steps_fn=eval_steps_fn) + ]) if __name__ == "__main__": diff --git a/transformer/transformer.py b/examples/transformer/transformer.py similarity index 99% rename from transformer/transformer.py rename to examples/transformer/transformer.py index 9caf4b04a1a34c..30bb931d28c3b5 100644 --- a/transformer/transformer.py +++ b/examples/transformer/transformer.py @@ -20,8 +20,8 @@ import paddle.fluid.layers as layers from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay -from model import Model, CrossEntropy, Loss -from text import TransformerBeamSearchDecoder, DynamicDecode +from hapi.model import Model, CrossEntropy, Loss +from hapi.text import TransformerBeamSearchDecoder, DynamicDecode def position_encoding_init(n_position, d_pos_vec): diff --git a/transformer/transformer.yaml b/examples/transformer/transformer.yaml similarity index 100% rename from transformer/transformer.yaml rename to examples/transformer/transformer.yaml diff --git a/transformer/utils/__init__.py b/examples/transformer/utils/__init__.py similarity index 100% rename from transformer/utils/__init__.py rename to examples/transformer/utils/__init__.py diff --git a/transformer/utils/check.py b/examples/transformer/utils/check.py similarity index 100% rename from transformer/utils/check.py rename to examples/transformer/utils/check.py diff --git a/transformer/utils/configure.py b/examples/transformer/utils/configure.py similarity index 95% 
rename from transformer/utils/configure.py rename to examples/transformer/utils/configure.py index 67e601282fee57..17dfaa53d8b44a 100644 --- a/transformer/utils/configure.py +++ b/examples/transformer/utils/configure.py @@ -195,13 +195,19 @@ def __init__(self, json_file="", yaml_file="", fuse_args=True): "Whether to perform predicting.") self.default_g.add_arg("do_eval", bool, False, "Whether to perform evaluating.") - self.default_g.add_arg("do_save_inference_model", bool, False, - "Whether to perform model saving for inference.") + self.default_g.add_arg( + "do_save_inference_model", bool, False, + "Whether to perform model saving for inference.") # NOTE: args for profiler - self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)") - self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)") - self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)") + self.default_g.add_arg( + "is_profiler", int, 0, + "the switch of profiler tools. (used for benchmark)") + self.default_g.add_arg( + "profiler_path", str, './', + "the profiler output file path. 
(used for benchmark)") + self.default_g.add_arg("max_iter", int, 0, + "the max train batch num.(used for benchmark)") self.parser = parser diff --git a/hapi/callbacks.py b/hapi/callbacks.py index f02eec1ac7b20f..62d6402941d0ab 100644 --- a/hapi/callbacks.py +++ b/hapi/callbacks.py @@ -215,13 +215,13 @@ def on_train_batch_end(self, step, logs=None): if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv( ).local_rank == 0: - # if steps is not None, last step will update in on_epoch_end - if self.steps and self.train_step < self.steps: + if self.steps is None or self.train_step < self.steps: self._updates(logs, 'train') def on_epoch_end(self, epoch, logs=None): logs = logs or {} - if self.verbose and ParallelEnv().local_rank == 0: + if self.train_step % self.log_freq != 0 and self.verbose and ParallelEnv( + ).local_rank == 0: self._updates(logs, 'train') def on_eval_begin(self, logs=None): @@ -242,14 +242,14 @@ def on_eval_batch_end(self, step, logs=None): if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv( ).local_rank == 0: - # if steps is not None, last step will update in on_epoch_end - if self.eval_steps and self.eval_step < self.eval_steps: + if self.eval_steps is None or self.eval_step < self.eval_steps: self._updates(logs, 'eval') def on_eval_end(self, logs=None): logs = logs or {} if self.verbose and ParallelEnv().local_rank == 0: - self._updates(logs, 'eval') + if self.eval_step % self.log_freq != 0: + self._updates(logs, 'eval') print('Eval samples: %d' % (self.evaled_samples)) diff --git a/hapi/model.py b/hapi/model.py index 3593f00acaa9f2..fa8d8f7f9fcedb 100644 --- a/hapi/model.py +++ b/hapi/model.py @@ -576,14 +576,15 @@ def train_batch(self, inputs, labels=None): if labels is not None: labels = [to_variable(l) for l in to_list(labels)] if self._nranks > 1: - outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs]) + outputs = self.ddp_model.forward( + * [to_variable(x) for x in inputs]) losses = 
self.model._loss_function(outputs, labels) final_loss = fluid.layers.sum(losses) final_loss = self.ddp_model.scale_loss(final_loss) final_loss.backward() self.ddp_model.apply_collective_grads() else: - outputs = self.model.forward(*[to_variable(x) for x in inputs]) + outputs = self.model.forward(* [to_variable(x) for x in inputs]) losses = self.model._loss_function(outputs, labels) final_loss = fluid.layers.sum(losses) final_loss.backward() @@ -592,9 +593,9 @@ def train_batch(self, inputs, labels=None): self.model.clear_gradients() metrics = [] for metric in self.model._metrics: - metric_outs = metric.add_metric_op( - *(to_list(outputs) + to_list(labels))) - m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) + metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list( + labels))) + m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) return ([to_numpy(l) for l in losses], metrics) \ @@ -606,7 +607,7 @@ def eval_batch(self, inputs, labels=None): inputs = to_list(inputs) if labels is not None: labels = [to_variable(l) for l in to_list(labels)] - outputs = self.model.forward(*[to_variable(x) for x in inputs]) + outputs = self.model.forward(* [to_variable(x) for x in inputs]) if self.model._loss_function: losses = self.model._loss_function(outputs, labels) else: @@ -632,9 +633,9 @@ def eval_batch(self, inputs, labels=None): self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_batch'] = samples - metric_outs = metric.add_metric_op( - *(to_list(outputs) + to_list(labels))) - m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) + metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list( + labels))) + m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) # To be consistent with static graph @@ -1009,7 +1010,7 @@ def fit( do_eval = eval_loader is not None self._test_dataloader = eval_loader metrics_name = self._metrics_name() - steps = 
len(train_loader) if hasattr(train_loader, '__len__') else None + steps = self._len_data_loader(train_loader) cbks = config_callbacks( callbacks, model=self, @@ -1037,8 +1038,7 @@ def fit( if not isinstance(eval_loader, Iterable): loader = eval_loader() - eval_steps = len(loader) if hasattr(loader, - '__len__') else None + eval_steps = self._len_data_loader(loader) cbks.on_begin('eval', { 'steps': eval_steps, 'metrics_name': metrics_name @@ -1114,7 +1114,7 @@ def evaluate( if not isinstance(eval_loader, Iterable): loader = eval_loader() - eval_steps = len(loader) if hasattr(loader, '__len__') else None + eval_steps = self._len_data_loader(loader) cbks.on_begin('eval', {'steps': eval_steps, 'metrics_name': metrics_name}) @@ -1205,7 +1205,7 @@ def _run_one_epoch(self, mode, metrics_name, epoch=None): - size = len(data_loader) if hasattr(data_loader, '__len__') else None + size = self._len_data_loader(data_loader) logs = { 'steps': size, 'metrics_name': metrics_name, @@ -1280,3 +1280,10 @@ def _metrics_name(self): for m in self._metrics: metrics_name.extend(to_list(m.name())) return metrics_name + + def _len_data_loader(self, data_loader): + try: + steps = len(data_loader) + except Exception: + steps = None + return steps