PaddlePaddle · gongel · Dec 15, 2022 · Dec 13, 2022 · Dec 13, 2022 · Dec 14, 2022
diff --git a/applications/text_summarization/pegasus/README.md b/applications/text_summarization/pegasus/README.md
@@ -1,4 +1,4 @@
-# 生成式文本摘要应用
+
 
 **目录**
 - [生成式文本摘要应用](#生成式文本摘要应用)
@@ -90,6 +90,9 @@ PaddleNLP提供开箱即用的产业级NLP预置任务能力，无需训练，
 
 ```text
 text_summarization/
+├── data # 数据
+│   ├── train.json # 训练数据集文件
+│   └── test.json # 可选，待预测数据文件
 ├── deploy # 部署
 │   ├── paddle_inference # PaddleInference高性能推理部署
 │   │   ├── inference_pegasus.py # 推理部署脚本
@@ -100,12 +103,13 @@ text_summarization/
 │       ├── pipeline_service.py # 服务器程序
 │       ├── export_serving.sh # serving模型导出脚本
 │       └── README.md # 说明文档
+├── run_prepare.py # 小数据集获取脚本
 ├── export_model.py # 动态图参数导出静态图参数脚本
 ├── export_model.sh # 动态图参数导出静态图参数shell脚本
-├── run_summarization.py # 训练评估脚本
-├── run_train.sh # 训练评估shell脚本
-├── run_generate.py # 预测脚本
-├── run_generate.sh # 预测shell脚本
+├── predict.py # 预测脚本
+├── predict.sh # 预测shell脚本
+├── train.py # 训练评估脚本
+├── train.sh # 训练评估shell脚本
 ├── utils.py # 工具函数脚本
 ├── requirements.txt # 依赖包
 └── README.md # 说明文档
@@ -133,6 +137,10 @@ data/
 "content": "“北京的保障房市场就像一个巨大的赌场，每个人都在期待中奖。”面对中国目前现行的保障性住房政策，华远地产董事长任志强再次语出惊人。（分享自@第一财经-中国房地产金融）"
 }
 ```
+这里提供小数据集供测试，运行下面命令即可下载：
+```bash
+python run_prepare.py
+```
 
 更多数据集读取格式详见[数据集加载](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_load.html#)和[自定义数据集](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html)。
 
@@ -144,14 +152,14 @@ data/
 # GPU启动，参数`--gpus`指定训练所用的GPU卡号，可以是单卡，也可以多卡
 unset CUDA_VISIBLE_DEVICES
 
-python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \
+python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" train.py \
     --model_name_or_path=IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese \
-    --train_file train.json \
-    --eval_file test.json \
+    --train_file data/train.json \
+    --eval_file data/test.json \
     --output_dir pegesus_out \
     --max_source_length 128 \
     --max_target_length 64 \
-    --num_train_epochs 20 \
+    --epoch 20 \
     --logging_steps 1 \
     --save_steps 10000 \
     --train_batch_size 128 \
@@ -161,7 +169,6 @@ python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \
     --weight_decay=0.01 \
     --device=gpu \
 ```
-也可以直接使用`run_train.sh`.
 
 关键参数释义如下：
 - `gpus` 指示了训练所用的GPU卡号。
@@ -193,14 +200,12 @@ python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \
 程序运行时将会自动进行训练和验证，训练过程中会自动保存模型在指定的`output_dir`中。
 如：
 ```text
-./pegeaus_model/
-├── pegeaus_model_10000
-│   ├── model_config.json
-│   ├── model_state.pdparams
-│   ├── special_tokens_map.json
-│   ├── tokenizer_config.json
-│   └── vocab.txt
-└── ...
+./pegesus_out/
+├── model_config.json
+├── model_state.pdparams
+├── special_tokens_map.json
+├── tokenizer_config.json
+└── vocab.txt
 ```
 
 **NOTE:** 如需恢复模型训练，`model_name_or_path`配置本地模型的目录地址即可。
@@ -213,13 +218,12 @@ python -m paddle.distributed.launch --gpus "2,3,4,5,6,7" run_summarization.py \
 ```shell
 unset CUDA_VISIBLE_DEVICES
 
-python run_generate.py \
-    --model_name_or_path=pegesus_out/pegeaus_model_10000 \
-    --prefict_file valid.json \
+python predict.py \
+    --init_checkpoint_dir=pegesus_out \
+    --prefict_file data/valid.json \
     --max_source_length 128 \
     --max_target_length 64 \
     --batch_size 128 \
-    --output_path generate.txt \
     --device=gpu \
 ```
 
@@ -243,13 +247,13 @@ Finetuned baseline的模型在[LCSTS](https://aclanthology.org/D15-1229/)测试
 python export_model.py \
     --model_name_or_path IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese \
     --decoding_strategy beam_search \
-    --inference_model_dir ./inference_model \
+    --export_output_dir ./inference_model \
     --max_out_len 30 \
 ```
 关键参数释义如下：
 
 * `model_name_or_path`：动态图训练保存的参数路径；默认为"IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"。
-* `inference_model_dir`：静态图图保存的参数路径；默认为"./inference_model"。
+* `export_output_dir`：静态图保存的参数路径；默认为"./inference_model"。
 * `max_out_len`：最大输出长度。
 
 执行命令后将会自动导出模型到指定的 `inference_model` 中，保存模型文件结构如下所示：

diff --git a/applications/text_summarization/pegasus/deploy/paddle_inference/inference_pegasus.py b/applications/text_summarization/pegasus/deploy/paddle_inference/inference_pegasus.py
@@ -12,14 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import argparse
-import numpy as np
+import os
 from pprint import pprint
-import paddle
+
+import numpy as np
 from paddle import inference
-from paddlenlp.transformers import PegasusChineseTokenizer, PegasusForConditionalGeneration
+
 from paddlenlp.ops.ext_utils import load
+from paddlenlp.transformers import PegasusChineseTokenizer
 
 
 def setup_args():

diff --git a/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_client.py b/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_client.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import time
+
 from paddle_serving_server.pipeline import PipelineClient
+
 from paddlenlp.utils.log import logger
 
 

diff --git a/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_service.py b/applications/text_summarization/pegasus/deploy/paddle_serving/pipeline_service.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 
 import numpy as np
-from numpy import array
-import paddle_serving_server.pipeline.operator
-from paddle_serving_server.web_service import WebService, Op
-from paddlenlp.transformers import PegasusChineseTokenizer
+from paddle_serving_server.web_service import Op, WebService
+
 from paddlenlp.ops.ext_utils import load
+from paddlenlp.transformers import PegasusChineseTokenizer
 from paddlenlp.utils.log import logger
 
 

diff --git a/applications/text_summarization/pegasus/export_model.py b/applications/text_summarization/pegasus/export_model.py
@@ -12,12 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import argparse
-import paddle
+import os
 from pprint import pprint
-from paddlenlp.transformers import PegasusChineseTokenizer, PegasusForConditionalGeneration
+
+import paddle
+
 from paddlenlp.ops import FasterPegasus
+from paddlenlp.transformers import (
+    PegasusChineseTokenizer,
+    PegasusForConditionalGeneration,
+)
 from paddlenlp.utils.log import logger
 
 
@@ -30,10 +35,7 @@ def parse_args():
         help="The model name to specify the Pegasus to use. ",
     )
     parser.add_argument(
-        "--inference_model_dir",
-        default="./inference_model",
-        type=str,
-        help="Path to save inference model of Pegasus. ",
+        "--export_output_dir", default="./inference_model", type=str, help="Path to save inference model of Pegasus. "
     )
     parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ")
     parser.add_argument(
@@ -122,8 +124,8 @@ def do_predict(args):
     )
 
     # Save converted static graph model
-    paddle.jit.save(pegasus, os.path.join(args.inference_model_dir, "pegasus"))
-    logger.info("PEGASUS has been saved to {}.".format(args.inference_model_dir))
+    paddle.jit.save(pegasus, os.path.join(args.export_output_dir, "pegasus"))
+    logger.info("PEGASUS has been saved to {}.".format(args.export_output_dir))
 
 
 if __name__ == "__main__":

diff --git a/applications/text_summarization/pegasus/export_model.sh b/applications/text_summarization/pegasus/export_model.sh
@@ -15,5 +15,5 @@
 python export_model.py \
     --model_name_or_path IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese \
     --decoding_strategy beam_search \
-    --inference_model_dir ./inference_model \
+    --export_output_dir ./inference_model \
     --max_out_len 30 \
diff --git a/...ext_summarization/pegasus/run_generate.py → ...ons/text_summarization/pegasus/predict.py b/...ext_summarization/pegasus/run_generate.py → ...ons/text_summarization/pegasus/predict.py
@@ -12,38 +12,44 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import argparse
 import random
 import time
 from functools import partial
 from pprint import pprint
+
 import numpy as np
-from datasets import load_dataset
 import paddle
+from datasets import load_dataset
 from paddle.io import BatchSampler, DataLoader
-from paddlenlp.transformers import PegasusForConditionalGeneration, PegasusChineseTokenizer
-from utils import convert_example, compute_metrics
+from utils import compute_metrics, convert_example
+
 from paddlenlp.data import DataCollatorForSeq2Seq
+from paddlenlp.transformers import (
+    PegasusChineseTokenizer,
+    PegasusForConditionalGeneration,
+)
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
     # Required parameters
     parser.add_argument(
-        "--model_name_or_path",
+        "--init_checkpoint_dir",
         default="IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese",
         type=str,
         required=True,
         help="Path to pre-trained model. ",
     )
-    parser.add_argument("--prefict_file", type=str, required=False, default=None, help="Predict data path.")
+    parser.add_argument(
+        "--prefict_file", type=str, required=False, default="data/valid.json", help="Predict data path."
+    )
     parser.add_argument(
         "--output_path", type=str, default="generate.txt", help="The file path where the infer result will be saved."
     )
     parser.add_argument(
         "--max_source_length",
-        default=1024,
+        default=128,
         type=int,
         help="The maximum total input sequence length after "
         "tokenization.Sequences longer than this will be truncated, sequences shorter will be padded.",
@@ -56,7 +62,7 @@ def parse_args():
     )
     parser.add_argument(
         "--max_target_length",
-        default=142,
+        default=64,
         type=int,
         help="The maximum total sequence length for target text after "
         "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded."
@@ -94,7 +100,7 @@ def parse_args():
         action="store_true",
         help="Whether to use fp16 when using faster transformer. Only works when using faster transformer. ",
     )
-    parser.add_argument("--batch_size", default=64, type=int, help="Batch size per GPU/CPU for testing or evaluation.")
+    parser.add_argument("--batch_size", default=2, type=int, help="Batch size per GPU/CPU for testing or evaluation.")
     parser.add_argument("--seed", default=42, type=int, help="random seed for initialization")
     parser.add_argument(
         "--device",
@@ -122,8 +128,8 @@ def set_seed(args):
 def generate(args):
     paddle.set_device(args.device)
     set_seed(args)
-    tokenizer = PegasusChineseTokenizer.from_pretrained(args.model_name_or_path)
-    model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path)
+    tokenizer = PegasusChineseTokenizer.from_pretrained(args.init_checkpoint_dir)
+    model = PegasusForConditionalGeneration.from_pretrained(args.init_checkpoint_dir)
     dataset = load_dataset("json", data_files=args.prefict_file, split="train")
     remove_columns = ["content", "title"]
     trans_func = partial(

diff --git a/...ext_summarization/pegasus/run_generate.sh → ...ons/text_summarization/pegasus/predict.sh b/...ext_summarization/pegasus/run_generate.sh → ...ons/text_summarization/pegasus/predict.sh
@@ -15,9 +15,9 @@
 # GPU启动，参数`--gpus`指定训练所用的GPU卡号，可以是单卡，也可以多卡
 unset CUDA_VISIBLE_DEVICES
 
-python run_generate.py \
-    --model_name_or_path=pegesus_out/bart_model_50000.pdparams \
-    --prefict_file valid.json \
+python predict.py \
+    --init_checkpoint_dir=pegasus_out \
+    --prefict_file data/valid.json \
     --max_source_length 128 \
     --max_target_length 64 \
     --batch_size 128 \

diff --git a/applications/text_summarization/pegasus/run_prepare.py b/applications/text_summarization/pegasus/run_prepare.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+def prepare():
+    """Download the tiny summarization dataset (train/valid/test) into ./data."""
+    base_url = "https://paddlenlp.bj.bcebos.com/datasets/tiny_summary_dataset/"
+    # exist_ok keeps a re-run from failing when data/ is already present.
+    os.makedirs("data", exist_ok=True)
+    for name in ("train.json", "valid.json", "test.json"):
+        # A non-zero wget status used to be silently ignored; report it instead.
+        if os.system("cd data && wget %s " % (base_url + name)) != 0:
+            print("Failed to download %s" % (base_url + name))
+
+
+prepare()