diff --git a/applications/text_summarization/pegasus/README.md b/applications/text_summarization/pegasus/README.md index 16ab3d18f40d..d2fd3754867b 100644 --- a/applications/text_summarization/pegasus/README.md +++ b/applications/text_summarization/pegasus/README.md @@ -1,6 +1,6 @@ # 生成式文本摘要应用 - **目录** + - [生成式文本摘要应用](#生成式文本摘要应用) - [简介](#简介) - [效果展示](#效果展示) @@ -21,53 +21,65 @@ - [模型部署](#模型部署) - [References](#references) - ## 简介 + 文本摘要的目标是自动地将输入文本转换成简短摘要,为用户提供简明扼要的内容描述,是缓解文本信息过载的一个重要手段。 文本摘要也是自然语言生成领域中的一个重要任务,有很多应用场景,如新闻摘要、论文摘要、财报摘要、传记摘要、专利摘要、对话摘要、评论摘要、观点摘要、电影摘要、文章标题生成、商品名生成、自动报告生成、搜索结果预览等。 本项目是基于预训练语言模型PEGASUS的中文文本摘要产业实践,具有以下优势: + - 效果领先。在LCSTS上效果达到SOTA。 - 开箱即用。本项目提供TaskFlow接口,无需训练,仅需几行代码便可预测。 -- 高性能推理。本项目基于[FasterGeneration](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/faster_generation)进行推理加速,能够提供更高性能的推理体验。 +- 高性能推理。本项目基于[FasterGeneration](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/faster_generation) + 进行推理加速,能够提供更高性能的推理体验。 - 训练推理全流程打通。本项目提供了全面的定制训练流程,从数据准备、模型训练预测,到模型推理部署,一应俱全。 ## 效果展示 ## 开箱即用 + PaddleNLP提供开箱即用的产业级NLP预置任务能力,无需训练,一键预测。 + ### 支持单条、批量预测 ```python ->>> from paddlenlp import Taskflow ->>> summarizer = Taskflow("text_summarization") +>>> from paddlenlp import Taskflow +>>> summarizer = Taskflow("text_summarization") # 单条输入 ->>> summarizer('2022年,中国房地产进入转型阵痛期,传统“高杠杆、快周转”的模式难以为继,万科甚至直接喊话,中国房地产进入“黑铁时代”') +>>> summarizer( + '2022年,中国房地产进入转型阵痛期,传统“高杠杆、快周转”的模式难以为继,万科甚至直接喊话,中国房地产进入“黑铁时代”') # 输出:['万科喊话中国房地产进入“黑铁时代”'] # 多条输入 ->>> summarizer([ +>>> summarizer([ '据悉,2022年教育部将围绕“巩固提高、深化落实、创新突破”三个关键词展开工作。要进一步强化学校教育主阵地作用,继续把落实“双减”作为学校工作的重中之重,重点从提高作业设计水平、提高课后服务水平、提高课堂教学水平、提高均衡发展水平四个方面持续巩固提高学校“双减”工作水平。', '党参有降血脂,降血压的作用,可以彻底消除血液中的垃圾,从而对冠心病以及心血管疾病的患者都有一定的稳定预防工作作用,因此平时口服党参能远离三高的危害。另外党参除了益气养血,降低中枢神经作用,调整消化系统功能,健脾补肺的功能。' - ]) -#输出:['教育部:将从四个方面持续巩固提高学校“双减”工作水平', '党参能降低三高的危害'] +]) +# 输出:['教育部:将从四个方面持续巩固提高学校“双减”工作水平', '党参能降低三高的危害'] ``` ### 可配置参数说明 + * `model`:可选模型,默认为`IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese`。 * 
`batch_size`:批处理大小,请结合机器情况进行调整,默认为1。 - ## 训练定制 + ### 文本摘要应用定制训练全流程介绍 + 接下来,我们将按数据准备、训练、预测、推理部署对文本摘要应用的全流程进行介绍。 + 1. **数据准备** + - 如果没有已标注的数据集,我们推荐[doccano](https://github.com/doccano/doccano)数据标注工具。 -如果已有标注好的本地数据集,我们需要根据将数据集整理为文档要求的格式,请参考[从本地文件创建数据集](#从本地文件创建数据集)。 + 如果已有标注好的本地数据集,我们需要根据将数据集整理为文档要求的格式,请参考[从本地文件创建数据集](#从本地文件创建数据集) + 。 2. **模型训练** -- 数据准备完成后,可以开始使用我们的数据集对预训练模型进行微调训练。我们可以根据任务需求,调整可配置参数,选择使用GPU或CPU进行模型训练,脚本默认保存在开发集最佳表现模型。中文任务默认使用"IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"模型,还支持large模型: "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese"。 +- +数据准备完成后,可以开始使用我们的数据集对预训练模型进行微调训练。我们可以根据任务需求,调整可配置参数,选择使用GPU或CPU进行模型训练,脚本默认保存在开发集最佳表现模型。中文任务默认使用" +IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"模型,还支持large模型: "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese"。 3. **模型预测** @@ -90,6 +102,9 @@ PaddleNLP提供开箱即用的产业级NLP预置任务能力,无需训练, ```text text_summarization/ +├── data # 数据 +│ ├── train.json # 训练数据集文件 +│ └── test.json # 可选,待预测数据文件 ├── deploy # 部署 │ ├── paddle_inference # PaddleInference高性能推理部署 │ │ ├── inference_pegasus.py # 推理部署脚本 @@ -100,12 +115,11 @@ text_summarization/ │ ├── pipeline_service.py # 服务器程序 │ ├── export_serving.sh # serving模型导出脚本 │ └── README.md # 说明文档 +├── run_prepare.py # 小数据集获取脚本 ├── export_model.py # 动态图参数导出静态图参数脚本 ├── export_model.sh # 动态图参数导出静态图参数shell脚本 -├── run_summarization.py # 训练评估脚本 -├── run_train.sh # 训练评估shell脚本 -├── run_generate.py # 预测脚本 -├── run_generate.sh # 预测shell脚本 +├── predict.py # 预测脚本 +├── train.py # 训练评估脚本 ├── utils.py # 工具函数脚本 ├── requirements.txt # 依赖包 └── README.md # 说明文档 @@ -114,6 +128,7 @@ text_summarization/ ### 数据准备 #### 数据加载 + #### 从本地文件创建数据集 在许多情况,我们需要使用本地数据集来训练我们的文本摘要模型,本项目支持使用固定格式本地数据集文件进行训练。 @@ -125,8 +140,11 @@ data/ ├── train.json # 训练数据集文件 └── test.json # 可选,待预测数据文件 ``` + 本地数据集文件格式如下: + - train.json/test.json 文件每行格式: + ```text { "title": "任志强抨击政府把土地作为投机品地产业被人为破坏", @@ -134,24 +152,31 @@ data/ } ``` 
-更多数据集读取格式详见[数据集加载](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_load.html#)和[自定义数据集](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html)。 +这里提供小数据集供测试,运行下面命令即可下载: + +```bash +python run_prepare.py +``` +更多数据集读取格式详见[数据集加载](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_load.html#) +和[自定义数据集](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html)。 ### 模型训练 + 运行如下命令即可在样例训练集上进行finetune,并在样例验证集上进行验证。 ```shell # GPU启动,参数`--gpus`指定训练所用的GPU卡号,可以是单卡,也可以多卡 unset CUDA_VISIBLE_DEVICES -python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \ +python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" train.py \ --model_name_or_path=IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese \ - --train_file train.json \ - --eval_file test.json \ - --output_dir pegesus_out \ + --train_file data/train.json \ + --eval_file data/test.json \ + --output_dir pegasus_out \ --max_source_length 128 \ --max_target_length 64 \ - --num_train_epochs 20 \ + --epoch 20 \ --logging_steps 1 \ --save_steps 10000 \ --train_batch_size 128 \ @@ -161,18 +186,20 @@ python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \ --weight_decay=0.01 \ --device=gpu \ ``` -也可以直接使用`run_train.sh`. 
关键参数释义如下: + - `gpus` 指示了训练所用的GPU卡号。 - `train_file` 本地训练数据地址。 - `eval_file` 本地测试数据地址。 -- `model_name_or_path` 指示了finetune使用的具体预训练模型,可以是PaddleNLP提供的预训练模型,或者是本地的预训练模型。如果使用本地的预训练模型,可以配置本地模型的目录地址,例如: ./checkpoints/model_xx/,目录中需包含paddle预训练模型model_state.pdparams。如果使用PaddleNLP提供的预训练模型,可以选择下面其中之一。 +- `model_name_or_path` + 指示了finetune使用的具体预训练模型,可以是PaddleNLP提供的预训练模型,或者是本地的预训练模型。如果使用本地的预训练模型,可以配置本地模型的目录地址,例如: + ./checkpoints/model_xx/,目录中需包含paddle预训练模型model_state.pdparams。如果使用PaddleNLP提供的预训练模型,可以选择下面其中之一。 - | PaddleNLP提供的预训练模型 | - |---------------------------------| - | IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese | - | IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese | + | PaddleNLP提供的预训练模型 | + |---------------------------------| + | IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese | + | IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese | - `output_dir` 表示模型的保存路径。 - `logging_steps` 表示日志打印间隔。 @@ -183,7 +210,9 @@ python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \ - `eval_batch_size` 表示每次验证**每张卡**上的样本数目。 - `learning_rate` 表示基础学习率大小,将于learning rate scheduler产生的值相乘作为当前学习率。 - `weight_decay` 表示AdamW优化器中使用的weight_decay的系数。 -- `warmup_propotion` 表示学习率逐渐升高到基础学习率(即上面配置的learning_rate)所需要的迭代数占总步数的比例,最早的使用可以参考[这篇论文](https://arxiv.org/pdf/1706.02677.pdf)。 +- `warmup_proportion` + 表示学习率逐渐升高到基础学习率(即上面配置的learning_rate)所需要的迭代数占总步数的比例,最早的使用可以参考[这篇论文](https://arxiv.org/pdf/1706.02677.pdf) + 。 - `max_source_length` 模型输入序列的最大长度。 - `max_target_length` 模型训练时标签的最大长度。 - `device` 表示使用的设备,从gpu和cpu中选择。 @@ -192,20 +221,18 @@ python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \ 程序运行时将会自动进行训练和验证,训练过程中会自动保存模型在指定的`output_dir`中。 如: + ```text -./pegeaus_model/ -├── pegeaus_model_10000 -│ ├── model_config.json -│ ├── model_state.pdparams -│ ├── special_tokens_map.json -│ ├── tokenizer_config.json -│ └── vocab.txt -└── ... 
+./pegasus_out/ +├── model_config.json +├── model_state.pdparams +├── special_tokens_map.json +├── tokenizer_config.json +└── vocab.txt ``` **NOTE:** 如需恢复模型训练,`model_name_or_path`配置本地模型的目录地址即可。 - ### 模型预测 运行下方脚本可以使用训练好的模型进行预测。 @@ -213,43 +240,42 @@ python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \ ```shell unset CUDA_VISIBLE_DEVICES -python run_generate.py \ - --model_name_or_path=pegesus_out/pegeaus_model_10000 \ - --prefict_file valid.json \ +python predict.py \ + --init_checkpoint_dir=pegasus_out \ + --prefict_file data/valid.json \ --max_source_length 128 \ --max_target_length 64 \ --batch_size 128 \ - --output_path generate.txt \ --device=gpu \ ``` 程序运行结束后会将预测结果保存在`output_path`中。 - Finetuned baseline的模型在[LCSTS](https://aclanthology.org/D15-1229/)测试集上有如下结果: -| model_name | Rouge-1 | Rouge-2 | Rouge-L | BLEU-4 | +| model_name | Rouge-1 | Rouge-2 | Rouge-L | BLEU-4 | | :-----------------------------: | :---: | :-----------: | :-------------------: |:-------------------: | -| finetuned IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese | 43.30 | 30.08 | 40.12 | 24.50 | -| finetuned IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese | 48.13 | 36.41 | 45.39 | 31.99 | - +| finetuned IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese | 43.30 | 30.08 | 40.12 | 24.50 | +| finetuned IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese | 48.13 | 36.41 | 45.39 | 31.99 | ### 模型推理部署 #### FasterGeneration加速及模型静态图导出 -使用动态图训练结束之后,可以通过[静态图导出脚本](export_model.py)实现基于FasterGeneration的高性能预测加速,并将动态图参数导出成静态图参数,静态图参数保存在`output_path`指定路径中。运行方式: +使用动态图训练结束之后,可以通过[静态图导出脚本](export_model.py) +实现基于FasterGeneration的高性能预测加速,并将动态图参数导出成静态图参数,静态图参数保存在`output_path`指定路径中。运行方式: ```shell python export_model.py \ --model_name_or_path IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese \ --decoding_strategy beam_search \ - --inference_model_dir ./inference_model \ + --export_output_dir ./inference_model \ --max_out_len 30 \ ``` + 关键参数释义如下: * 
`model_name_or_path`:动态图训练保存的参数路径;默认为"IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"。 -* `inference_model_dir`:静态图图保存的参数路径;默认为"./inference_model"。 +* `export_output_dir`:静态图保存的参数路径;默认为"./inference_model"。 * `max_out_len`:最大输出长度。 执行命令后将会自动导出模型到指定的 `inference_model` 中,保存模型文件结构如下所示: @@ -262,10 +288,15 @@ inference_model/ ``` #### 模型部署 + 文本摘要应用已打通多种场景部署方案,点击链接获取具体的使用教程。 + - [Paddle Inference 推理 (Python)](./deploy/paddle_inference/README.md) - [Paddle Serving 服务化部署(Python)](./deploy/paddle_serving/README.md) ## References -- Zhang J, Zhao Y, Saleh M, et al. Pegasus: Pre-training with extracted gap-sentences for abstractive summarization[C]//International Conference on Machine Learning. PMLR, 2020: 11328-11339. -- Wang J, Zhang Y, Zhang L, et al. Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence[J]. arXiv preprint arXiv:2209.02970, 2022. + +- Zhang J, Zhao Y, Saleh M, et al. Pegasus: Pre-training with extracted gap-sentences for abstractive summarization[C] + //International Conference on Machine Learning. PMLR, 2020: 11328-11339. +- Wang J, Zhang Y, Zhang L, et al. Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence[J]. arXiv + preprint arXiv:2209.02970, 2022. diff --git a/applications/text_summarization/pegasus/deploy/paddle_inference/inference_pegasus.py b/applications/text_summarization/pegasus/deploy/paddle_inference/inference_pegasus.py index a8db929a8b7f..533619f87ab6 100644 --- a/applications/text_summarization/pegasus/deploy/paddle_inference/inference_pegasus.py +++ b/applications/text_summarization/pegasus/deploy/paddle_inference/inference_pegasus.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import argparse -import numpy as np +import os from pprint import pprint -import paddle + +import numpy as np from paddle import inference -from paddlenlp.transformers import PegasusChineseTokenizer, PegasusForConditionalGeneration + from paddlenlp.ops.ext_utils import load +from paddlenlp.transformers import PegasusChineseTokenizer def setup_args(): diff --git a/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_client.py b/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_client.py index 44b2c1afa660..dd29c419b16f 100644 --- a/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_client.py +++ b/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_client.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import time + from paddle_serving_server.pipeline import PipelineClient + from paddlenlp.utils.log import logger diff --git a/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_service.py b/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_service.py index 7bdaa67febeb..e8558d501408 100644 --- a/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_service.py +++ b/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_service.py @@ -13,11 +13,10 @@ # limitations under the License. 
import numpy as np -from numpy import array -import paddle_serving_server.pipeline.operator -from paddle_serving_server.web_service import WebService, Op -from paddlenlp.transformers import PegasusChineseTokenizer +from paddle_serving_server.web_service import Op, WebService + from paddlenlp.ops.ext_utils import load +from paddlenlp.transformers import PegasusChineseTokenizer from paddlenlp.utils.log import logger diff --git a/applications/text_summarization/pegasus/export_model.py b/applications/text_summarization/pegasus/export_model.py index 820aa5aa22ce..fad70e14ee88 100644 --- a/applications/text_summarization/pegasus/export_model.py +++ b/applications/text_summarization/pegasus/export_model.py @@ -12,12 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import argparse -import paddle +import os from pprint import pprint -from paddlenlp.transformers import PegasusChineseTokenizer, PegasusForConditionalGeneration + +import paddle + from paddlenlp.ops import FasterPegasus +from paddlenlp.transformers import ( + PegasusChineseTokenizer, + PegasusForConditionalGeneration, +) from paddlenlp.utils.log import logger @@ -30,10 +35,7 @@ def parse_args(): help="The model name to specify the Pegasus to use. ", ) parser.add_argument( - "--inference_model_dir", - default="./inference_model", - type=str, - help="Path to save inference model of Pegasus. ", + "--export_output_dir", default="./inference_model", type=str, help="Path to save inference model of Pegasus. " ) parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. 
") parser.add_argument( @@ -122,8 +124,8 @@ def do_predict(args): ) # Save converted static graph model - paddle.jit.save(pegasus, os.path.join(args.inference_model_dir, "pegasus")) - logger.info("PEGASUS has been saved to {}.".format(args.inference_model_dir)) + paddle.jit.save(pegasus, os.path.join(args.export_output_dir, "pegasus")) + logger.info("PEGASUS has been saved to {}.".format(args.export_output_dir)) if __name__ == "__main__": diff --git a/applications/text_summarization/pegasus/export_model.sh b/applications/text_summarization/pegasus/export_model.sh index 1cc885d3dfeb..95bf1d5acb75 100644 --- a/applications/text_summarization/pegasus/export_model.sh +++ b/applications/text_summarization/pegasus/export_model.sh @@ -15,5 +15,5 @@ python export_model.py \ --model_name_or_path IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese \ --decoding_strategy beam_search \ - --inference_model_dir ./inference_model \ + --export_output_dir ./inference_model \ --max_out_len 30 \ \ No newline at end of file diff --git a/applications/text_summarization/pegasus/run_generate.py b/applications/text_summarization/pegasus/predict.py similarity index 92% rename from applications/text_summarization/pegasus/run_generate.py rename to applications/text_summarization/pegasus/predict.py index 08b704069e21..17c55e175c3a 100644 --- a/applications/text_summarization/pegasus/run_generate.py +++ b/applications/text_summarization/pegasus/predict.py @@ -12,38 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import argparse import random import time from functools import partial from pprint import pprint + import numpy as np -from datasets import load_dataset import paddle +from datasets import load_dataset from paddle.io import BatchSampler, DataLoader -from paddlenlp.transformers import PegasusForConditionalGeneration, PegasusChineseTokenizer -from utils import convert_example, compute_metrics +from utils import compute_metrics, convert_example + from paddlenlp.data import DataCollatorForSeq2Seq +from paddlenlp.transformers import ( + PegasusChineseTokenizer, + PegasusForConditionalGeneration, +) def parse_args(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--model_name_or_path", + "--init_checkpoint_dir", default="IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", type=str, required=True, help="Path to pre-trained model. ", ) - parser.add_argument("--prefict_file", type=str, required=False, default=None, help="Predict data path.") + parser.add_argument( + "--prefict_file", type=str, required=False, default="data/valid.json", help="Predict data path." + ) parser.add_argument( "--output_path", type=str, default="generate.txt", help="The file path where the infer result will be saved." ) parser.add_argument( "--max_source_length", - default=1024, + default=128, type=int, help="The maximum total input sequence length after " "tokenization.Sequences longer than this will be truncated, sequences shorter will be padded.", @@ -56,7 +62,7 @@ def parse_args(): ) parser.add_argument( "--max_target_length", - default=142, + default=64, type=int, help="The maximum total sequence length for target text after " "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded." @@ -94,7 +100,7 @@ def parse_args(): action="store_true", help="Whether to use fp16 when using faster transformer. Only works when using faster transformer. 
", ) - parser.add_argument("--batch_size", default=64, type=int, help="Batch size per GPU/CPU for testing or evaluation.") + parser.add_argument("--batch_size", default=2, type=int, help="Batch size per GPU/CPU for testing or evaluation.") parser.add_argument("--seed", default=42, type=int, help="random seed for initialization") parser.add_argument( "--device", @@ -122,8 +128,8 @@ def set_seed(args): def generate(args): paddle.set_device(args.device) set_seed(args) - tokenizer = PegasusChineseTokenizer.from_pretrained(args.model_name_or_path) - model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path) + tokenizer = PegasusChineseTokenizer.from_pretrained(args.init_checkpoint_dir) + model = PegasusForConditionalGeneration.from_pretrained(args.init_checkpoint_dir) dataset = load_dataset("json", data_files=args.prefict_file, split="train") remove_columns = ["content", "title"] trans_func = partial( diff --git a/applications/text_summarization/pegasus/run_generate.sh b/applications/text_summarization/pegasus/run_prepare.py similarity index 53% rename from applications/text_summarization/pegasus/run_generate.sh rename to applications/text_summarization/pegasus/run_prepare.py index a5093ac8d635..373d0b08d7c6 100644 --- a/applications/text_summarization/pegasus/run_generate.sh +++ b/applications/text_summarization/pegasus/run_prepare.py @@ -1,24 +1,29 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -# GPU启动,参数`--gpus`指定训练所用的GPU卡号,可以是单卡,也可以多卡 -unset CUDA_VISIBLE_DEVICES +import os -python run_generate.py \ - --model_name_or_path=pegesus_out/bart_model_50000.pdparams \ - --prefict_file valid.json \ - --max_source_length 128 \ - --max_target_length 64 \ - --batch_size 128 \ - --device=gpu \ + +def prepare(): + + bos_link_train = "https://paddlenlp.bj.bcebos.com/datasets/tiny_summary_dataset/train.json" + bos_link_valid = "https://paddlenlp.bj.bcebos.com/datasets/tiny_summary_dataset/valid.json" + bos_link_test = "https://paddlenlp.bj.bcebos.com/datasets/tiny_summary_dataset/test.json" + os.system("mkdir data") + os.system("cd data && wget %s " % (bos_link_train)) + os.system("cd data && wget %s " % (bos_link_valid)) + os.system("cd data && wget %s " % (bos_link_test)) + + +prepare() diff --git a/applications/text_summarization/pegasus/run_train.sh b/applications/text_summarization/pegasus/run_train.sh deleted file mode 100644 index 63edf0e646bc..000000000000 --- a/applications/text_summarization/pegasus/run_train.sh +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# GPU启动,参数`--gpus`指定训练所用的GPU卡号,可以是单卡,也可以多卡 -unset CUDA_VISIBLE_DEVICES - -python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \ - --model_name_or_path=IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese \ - --train_file train.json \ - --eval_file test.json \ - --output_dir pegesus_out \ - --max_source_length 128 \ - --max_target_length 64 \ - --num_train_epochs 20 \ - --logging_steps 1 \ - --save_steps 10000 \ - --train_batch_size 128 \ - --eval_batch_size 128 \ - --learning_rate 5e-5 \ - --warmup_proportion 0.02 \ - --weight_decay=0.01 \ - --device=gpu \ diff --git a/applications/text_summarization/pegasus/run_summarization.py b/applications/text_summarization/pegasus/train.py similarity index 88% rename from applications/text_summarization/pegasus/run_summarization.py rename to applications/text_summarization/pegasus/train.py index ddbf316a13b3..23eb944c6abe 100644 --- a/applications/text_summarization/pegasus/run_summarization.py +++ b/applications/text_summarization/pegasus/train.py @@ -12,27 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import json import argparse +import distutils.util +import math +import os import random import time -import distutils.util -from pprint import pprint from functools import partial -from tqdm import tqdm +from pprint import pprint + import numpy as np -import math +import paddle from datasets import load_dataset +from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler +from tqdm import tqdm +from utils import compute_metrics, convert_example, main_process_first -import paddle -import paddle.nn as nn -from paddle.io import BatchSampler, DistributedBatchSampler, DataLoader -from paddlenlp.transformers import PegasusForConditionalGeneration, PegasusChineseTokenizer -from paddlenlp.transformers import LinearDecayWithWarmup -from paddlenlp.utils.log import logger from paddlenlp.data import DataCollatorForSeq2Seq -from utils import convert_example, compute_metrics, main_process_first +from paddlenlp.transformers import ( + LinearDecayWithWarmup, + PegasusChineseTokenizer, + PegasusForConditionalGeneration, +) +from paddlenlp.utils.log import logger def parse_args(): @@ -42,11 +44,10 @@ def parse_args(): "--model_name_or_path", default="IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", type=str, - required=True, help="Path to pre-trained model. 
", ) - parser.add_argument("--train_file", type=str, required=False, default=None, help="Train data path.") - parser.add_argument("--eval_file", type=str, required=False, default=None, help="Eval data path.") + parser.add_argument("--train_file", type=str, required=False, default="data/train.json", help="Train data path.") + parser.add_argument("--eval_file", type=str, required=False, default="data/test.json", help="Eval data path.") parser.add_argument( "--output_dir", default="output", @@ -77,7 +78,7 @@ def parse_args(): ) parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument( - "--num_train_epochs", + "--epoch", default=3, type=int, help="Total number of training epochs to perform.", @@ -86,13 +87,13 @@ def parse_args(): parser.add_argument("--save_steps", type=int, default=100, help="Save checkpoint every X updates steps.") parser.add_argument( "--train_batch_size", - default=20, + default=2, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--eval_batch_size", - default=12, + default=2, type=int, help="Batch size per GPU/CPU for evaluation.", ) @@ -111,7 +112,7 @@ def parse_args(): "--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + help="If > 0: set total number of training steps to perform. 
Override epoch.", ) parser.add_argument("--seed", default=42, type=int, help="random seed for initialization") parser.add_argument( @@ -159,8 +160,9 @@ def evaluate(model, data_loader, tokenizer, min_target_length, max_target_length ) labels = np.where(labels != -100, labels, tokenizer.pad_token_id) all_labels.extend(tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)) - compute_metrics(all_preds, all_labels) + rougel = compute_metrics(all_preds, all_labels) model.train() + return rougel def do_train(args): @@ -207,8 +209,8 @@ def do_train(args): num_training_steps = args.max_steps num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) else: - num_training_steps = len(train_data_loader) * args.num_train_epochs - num_train_epochs = args.num_train_epochs + num_training_steps = len(train_data_loader) * args.epoch + num_train_epochs = args.epoch warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion @@ -230,6 +232,7 @@ def do_train(args): if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) global_step = 0 + best_rougel = 0 tic_train = time.time() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): @@ -262,10 +265,11 @@ def do_train(args): tic_train = time.time() if global_step % args.save_steps == 0 or global_step == num_training_steps: tic_eval = time.time() - evaluate(model, dev_data_loader, tokenizer, args.min_target_length, args.max_target_length) + rougel = evaluate(model, dev_data_loader, tokenizer, args.min_target_length, args.max_target_length) logger.info("eval done total : %s s" % (time.time() - tic_eval)) - if paddle.distributed.get_rank() == 0: - output_dir = os.path.join(args.output_dir, "pegeaus_model_%d" % global_step) + if paddle.distributed.get_rank() == 0 and best_rougel < rougel: + best_rougel = rougel + output_dir = args.output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) # Need 
better way to get inner model of DataParallel @@ -274,14 +278,6 @@ def do_train(args): tokenizer.save_pretrained(output_dir) if global_step >= num_training_steps: return - if paddle.distributed.get_rank() == 0: - output_dir = os.path.join(args.output_dir, "pegeaus_modelfinal_%d" % global_step) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - # Need better way to get inner model of DataParallel - model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model - model_to_save.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) if __name__ == "__main__": diff --git a/applications/text_summarization/pegasus/utils.py b/applications/text_summarization/pegasus/utils.py index 3c457283caa3..0c63f342db8b 100644 --- a/applications/text_summarization/pegasus/utils.py +++ b/applications/text_summarization/pegasus/utils.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import contextlib -from rouge import Rouge -import json +import numpy as np import paddle -from paddlenlp.data import Pad +from rouge import Rouge + from paddlenlp.metrics import BLEU from paddlenlp.utils.log import logger @@ -61,6 +60,7 @@ def compute_metrics(preds, targets): print("rouge-2:", round(rouge2, 4)) print("rouge-L:", round(rougel, 4)) print("BLEU-4:", round(bleu4.score(), 4)) + return rougel @contextlib.contextmanager