Merge branch 'develop' into electra_inputs_embeds
sijunhe authored Oct 13, 2022
2 parents c18cd26 + c65dbb4 commit 57aadef
Showing 18 changed files with 721 additions and 63 deletions.
18 changes: 16 additions & 2 deletions applications/question_answering/faq_finance/README.md
@@ -399,10 +399,24 @@ python milvus_ann_search.py --data_path data/qa_pair.csv \

#### Paddle Serving Deployment

Paddle Serving installation is covered in the [Paddle Serving installation guide](https://github.com/PaddlePaddle/Serving#installation). The relevant dependencies need to be installed on both the server and the client; once they are installed, you can run the steps below.
Paddle Serving installation is covered in the [Paddle Serving installation guide](https://github.com/PaddlePaddle/Serving#installation). The relevant dependencies need to be installed on both the server and the client. Install the Paddle Serving dependencies with pip as follows:

```
pip install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install paddle-serving-app==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
# For CPU deployment, only the CPU server package needs to be installed
pip install paddle-serving-server==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
First, export the generated static graph model to the Paddle Serving format with the following command:
# For a GPU server, check your environment first to decide which command to run; the CUDA 10.2 package is recommended
# CUDA 10.2 + cuDNN 7 + TensorRT 6 (recommended)
pip install paddle-serving-server-gpu==0.8.3.post102 -i https://pypi.tuna.tsinghua.edu.cn/simple
# CUDA 10.1 + TensorRT 6
pip install paddle-serving-server-gpu==0.8.3.post101 -i https://pypi.tuna.tsinghua.edu.cn/simple
# CUDA 11.2 + TensorRT 8
pip install paddle-serving-server-gpu==0.8.3.post112 -i https://pypi.tuna.tsinghua.edu.cn/simple
```
For more detailed installation instructions, see [this guide](https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md). Once the dependencies are installed, you can run the steps below. First, export the generated static graph model to the Paddle Serving format with the following command:

```
python export_to_serving.py \
    ...
```
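For reference, the export step above typically relies on Paddle Serving's model conversion utility. A minimal sketch under that assumption, with the model directory and filenames (`output`, `inference.pdmodel`, `inference.pdiparams`) chosen for illustration rather than taken from this repository:

```
# Minimal sketch: convert an exported static graph model into the
# serving_server / serving_client directories consumed by Paddle Serving.
# Requires paddle-serving-client (installed with the pip commands above);
# all paths and filenames below are illustrative assumptions.
from paddle_serving_client.io import inference_model_to_serving

inference_model_to_serving(
    dirname="output",                       # directory of the exported static graph model (assumed)
    serving_server="serving_server",        # output directory read by the serving server
    serving_client="serving_client",        # output directory read by the serving client
    model_filename="inference.pdmodel",     # assumed model filename
    params_filename="inference.pdiparams",  # assumed params filename
)
```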
3 changes: 0 additions & 3 deletions applications/question_answering/faq_finance/requirements.txt
@@ -5,7 +5,4 @@ paddlepaddle-gpu>=2.2.3
hnswlib>=0.5.2
numpy>=1.17.2
visualdl>=2.2.2
paddle-serving-app>=0.7.0
paddle-serving-client>=0.7.0
paddle-serving-server-gpu>=0.7.0.post102
pybind11
23 changes: 22 additions & 1 deletion examples/machine_reading_comprehension/SQuAD/args.py
@@ -1,3 +1,17 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse


@@ -78,7 +92,7 @@ def parse_args():
help="random seed for initialization")
parser.add_argument(
'--device',
choices=['cpu', 'gpu'],
choices=['cpu', 'gpu', 'mlu'],
default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument(
@@ -131,5 +145,12 @@ def parse_args():
parser.add_argument("--do_predict",
action='store_true',
help="Whether to predict.")
parser.add_argument("--use_amp",
action='store_true',
help="Whether to use AMP.")
parser.add_argument("--scale_loss",
type=float,
default=2**15,
help="The value of scale_loss for fp16.")
args = parser.parse_args()
return args
37 changes: 28 additions & 9 deletions examples/machine_reading_comprehension/SQuAD/run_squad.py
@@ -288,27 +288,46 @@ def run(args):
apply_decay_param_fun=lambda x: x in decay_params)
criterion = CrossEntropyLossForSQuAD()

if args.use_amp:
scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

global_step = 0
tic_train = time.time()

for epoch in range(num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
logits = model(input_ids=batch['input_ids'],
token_type_ids=batch['token_type_ids'],
attention_mask=batch['attention_mask'])
loss = criterion(
logits, (batch['start_positions'], batch['end_positions']))
if args.use_amp:
with paddle.amp.auto_cast(
args.use_amp,
custom_white_list=["layer_norm", "softmax",
"gelu"]):
logits = model(input_ids=batch['input_ids'],
token_type_ids=batch['token_type_ids'],
attention_mask=batch['attention_mask'])
loss = criterion(
logits,
(batch['start_positions'], batch['end_positions']))
scaler.scale(loss).backward()
scaler.minimize(optimizer, loss)
else:
logits = model(input_ids=batch['input_ids'],
token_type_ids=batch['token_type_ids'],
attention_mask=batch['attention_mask'])
loss = criterion(
logits,
(batch['start_positions'], batch['end_positions']))
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()

if global_step % args.logging_steps == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
% (global_step, epoch + 1, step + 1, loss,
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()

if global_step % args.save_steps == 0 or global_step == num_training_steps:
if rank == 0:
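The change above follows Paddle's standard mixed-precision recipe: run the forward pass and loss under `paddle.amp.auto_cast`, then route the backward pass and optimizer step through a `paddle.amp.GradScaler`. A minimal self-contained sketch of that recipe, using a toy model and random data rather than the SQuAD pipeline:

```
import paddle

# Toy setup; mirrors the AMP pattern added to run_squad.py.
model = paddle.nn.Linear(16, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=model.parameters())
criterion = paddle.nn.CrossEntropyLoss()
scaler = paddle.amp.GradScaler(init_loss_scaling=2**15)  # matches the --scale_loss default

x = paddle.randn([8, 16])
y = paddle.randint(0, 2, [8])

for _ in range(3):
    # Ops in the white list may run in float16; the rest stays in float32.
    # (AMP only takes effect on supported devices; on CPU it falls back to float32.)
    with paddle.amp.auto_cast(custom_white_list=["layer_norm", "softmax", "gelu"]):
        logits = model(x)
        loss = criterion(logits, y)
    scaler.scale(loss).backward()     # scale the loss to avoid fp16 gradient underflow
    scaler.minimize(optimizer, loss)  # unscale gradients and apply the optimizer step
    optimizer.clear_grad()
```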
7 changes: 6 additions & 1 deletion model_zoo/uie/utils.py
@@ -222,6 +222,10 @@ def reader(data_path, max_seq_len=512):
if len(content) <= max_content_len:
yield json_line
else:
if result['end'] - result['start'] > max_content_len:
logger.warn(
"result['end '] - result ['start'] exceeds max_content_len, which will result in no valid instance being returned"
)
result_list = json_line['result_list']
json_lines = []
accumulate = 0
@@ -230,7 +234,8 @@ def reader(data_path, max_seq_len=512):

for result in result_list:
if result['start'] + 1 <= max_content_len < result[
'end']:
'end'] and result['end'] - result[
'start'] <= max_content_len:
max_content_len = result['start']
break

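The added condition keeps an answer span from being cut in half when a long example is split into chunks of at most `max_content_len` characters; if the span itself is longer than a chunk, no split point can preserve it, which is exactly when the new warning fires. A simplified standalone sketch of that decision, illustrative only and not the actual `reader()`:

```
def choose_split_point(result_list, max_content_len):
    """Pick a split point <= max_content_len that does not cut any span,
    mirroring (in simplified form) the boundary check added to reader()."""
    for result in result_list:
        spans_boundary = result['start'] + 1 <= max_content_len < result['end']
        fits_in_chunk = result['end'] - result['start'] <= max_content_len
        if spans_boundary and fits_in_chunk:
            # Shrink the chunk so the whole span lands in the next chunk.
            return result['start']
        if spans_boundary and not fits_in_chunk:
            # The span is longer than a whole chunk and cannot be preserved.
            print("warning: span longer than max_content_len; no valid instance can be built")
    return max_content_len

# A span at positions 450..470 is unaffected by 512-character chunks,
# but with 460-character chunks the split point moves back to 450.
print(choose_split_point([{'start': 450, 'end': 470}], 512))  # -> 512
print(choose_split_point([{'start': 450, 'end': 470}], 460))  # -> 450
```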
48 changes: 39 additions & 9 deletions paddlenlp/transformers/electra/modeling.py
@@ -23,8 +23,8 @@
from paddle.nn.layer.transformer import _convert_attention_mask

from .. import PretrainedModel, register_base_model
from ..model_outputs import (BaseModelOutput, SequenceClassifierOutput,
TokenClassifierOutput,
from ..model_outputs import (BaseModelOutputWithPastAndCrossAttentions,
SequenceClassifierOutput, TokenClassifierOutput,
QuestionAnsweringModelOutput,
MultipleChoiceModelOutput, MaskedLMOutput,
tuple_output)
@@ -153,9 +153,12 @@ def forward(self,
src_mask=src_mask,
output_attentions=output_attentions)
else:
cache_wrapper = cache[i] if isinstance(
cache[i], nn.MultiHeadAttention.Cache
) else nn.MultiHeadAttention.Cache(*cache[i])
output, new_cache = mod(output,
src_mask=src_mask,
cache=cache[i],
cache=cache_wrapper,
output_attentions=output_attentions)
new_caches.append(new_cache)
if output_attentions:
@@ -174,14 +177,13 @@ def forward(self,
if not return_dict:
if output_attentions or output_hidden_states:
output = (output, all_attentions, all_hidden_states)

return output if cache is None else (output, new_caches)

return BaseModelOutput(
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=output,
hidden_states=all_hidden_states,
attentions=all_attentions,
)
past_key_values=new_caches)


class ElectraEmbeddings(nn.Layer):
@@ -203,12 +205,14 @@ def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
inputs_embeds=None):

inputs_embeds=None,
past_key_values_length=None):
if position_ids is None:
ones = paddle.ones_like(input_ids, dtype="int64")
seq_length = paddle.cumsum(ones, axis=-1)
position_ids = seq_length - ones
if past_key_values_length is not None:
position_ids += past_key_values_length
position_ids.stop_gradient = True
position_ids = position_ids.astype("int64")

@@ -559,6 +563,8 @@ def forward(self,
position_ids=None,
attention_mask=None,
inputs_embeds=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=False):
@@ -599,6 +605,17 @@
This is useful for use cases such as P-Tuning, where you want more control over how to convert input_ids indices
into the embedding space.
Its data type should be `float32` and it has a shape of [batch_size, sequence_length, embedding_size].
past_key_values (tuple(tuple(Tensor)), optional):
Precomputed key and value hidden states of the attention blocks of each layer. These can be used to speed up
auto-regressive decoding for generation tasks, or to support use cases such as Prefix-Tuning where vectors are prepended
to each attention layer. The length of the tuple equals the number of layers, and each inner tuple contains 2 tensors of shape
`(batch_size, num_heads, past_key_values_length, embed_size_per_head)`.
If `past_key_values` is used, the user can optionally input only the last `input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, optional):
If set to `True`, the key and value states of the attention blocks are returned and can be passed back in as `past_key_values`.
Defaults to `None`.
output_hidden_states (bool, optional):
Whether to return the hidden states of all layers.
Defaults to `False`.
@@ -627,27 +644,40 @@
output = model(**inputs)
'''
past_key_values_length = None
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]

if attention_mask is None:
attention_mask = paddle.unsqueeze(
(input_ids == self.pad_token_id).astype(
paddle.get_default_dtype()) * -1e4,
axis=[1, 2])
if past_key_values is not None:
batch_size = past_key_values[0][0].shape[0]
past_mask = paddle.zeros(
[batch_size, 1, 1, past_key_values_length],
dtype=attention_mask.dtype)
attention_mask = paddle.concat([past_mask, attention_mask],
axis=-1)
else:
if attention_mask.ndim == 2:
attention_mask = attention_mask.unsqueeze(axis=[1, 2])

embedding_output = self.embeddings(input_ids=input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length)

if hasattr(self, "embeddings_project"):
embedding_output = self.embeddings_project(embedding_output)

self.encoder._use_cache = use_cache # To be consistent with HF
encoder_outputs = self.encoder(
embedding_output,
attention_mask,
cache=past_key_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict)
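With `use_cache` and `past_key_values` threaded through `ElectraModel`, the key/value states from one forward pass can be reused in the next, e.g. for incremental decoding or prefix-style tuning. A hedged sketch of the intended call pattern, assuming the `electra-small` checkpoint name and that `use_cache=True` alone is enough to get `past_key_values` back, as the new docstring describes:

```
import paddle
from paddlenlp.transformers import ElectraModel, ElectraTokenizer

# "electra-small" is an assumed checkpoint; any ELECTRA weights with the same API would do.
tokenizer = ElectraTokenizer.from_pretrained("electra-small")
model = ElectraModel.from_pretrained("electra-small")
model.eval()

encoded = tokenizer("PaddleNLP makes NLP easy")
input_ids = paddle.to_tensor([encoded["input_ids"]])
token_type_ids = paddle.to_tensor([encoded["token_type_ids"]])

with paddle.no_grad():
    # First pass: also return the per-layer key/value cache.
    out = model(input_ids=input_ids,
                token_type_ids=token_type_ids,
                use_cache=True,
                return_dict=True)
    cache = out.past_key_values  # one (key, value) entry per layer

    # Second pass: feed only the new token and reuse the cache; position ids and
    # the attention mask are extended internally by past_key_values_length.
    next_ids = paddle.to_tensor([[tokenizer.sep_token_id]])
    out2 = model(input_ids=next_ids,
                 past_key_values=cache,
                 use_cache=True,
                 return_dict=True)
    print(out2.last_hidden_state.shape)  # [1, 1, hidden_size]
```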
2 changes: 1 addition & 1 deletion paddlenlp/transformers/gpt/modeling.py
@@ -1182,7 +1182,7 @@ def prepare_inputs_for_generation(self,
# only last token for inputs_ids if cache is defined in kwargs
position_ids = kwargs.get("position_ids", None)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None and len(attention_mask.shape) == 4:
if attention_mask is not None and attention_mask.ndim == 4:
attention_mask = attention_mask[:, -1:, -1:, :]
if cache is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)
16 changes: 16 additions & 0 deletions pipelines/examples/semantic-search/semantic_search_example.py
@@ -209,6 +209,22 @@ def semantic_search_tutorial():
})

print_documents(prediction)
# Batch prediction
predictions = pipe.run_batch(queries=["亚马逊河流的介绍", '期货交易手续费指的是什么?'],
params={
"Retriever": {
"top_k": 50
},
"Ranker": {
"top_k": 5
}
})
for i in range(len(predictions['queries'])):
result = {
'documents': predictions['documents'][i],
'query': predictions['queries'][i]
}
print_documents(result)


if __name__ == "__main__":
31 changes: 24 additions & 7 deletions pipelines/pipelines/nodes/base.py
@@ -127,16 +127,33 @@ def _dispatch_run(self, **kwargs) -> Tuple[Dict, str]:
- collate `_debug` information if present
- merge component output with the preceding output and pass it on to the subsequent Component in the Pipeline
"""
return self._dispatch_run_general(self.run, **kwargs)

def _dispatch_run_batch(self, **kwargs):
"""
The Pipelines call this method when run_batch() is executed. This method in turn calls
_dispatch_run_general() with the appropriate run method.
"""
return self._dispatch_run_general(self.run_batch, **kwargs)

def _dispatch_run_general(self, run_method: Callable, **kwargs):
"""
This method takes care of the following:
- inspect run_method's signature to validate that all necessary arguments are available
- pop `debug` and set it on the instance to control debug output
- call run_method with the corresponding arguments and gather output
- collate `_debug` information if present
- merge component output with the preceding output and pass it on to the subsequent Component in the Pipeline
"""
arguments = deepcopy(kwargs)
params = arguments.get("params") or {}

run_signature_args = inspect.signature(self.run).parameters.keys()
run_signature_args = inspect.signature(run_method).parameters.keys()

run_params: Dict[str, Any] = {}
for key, value in params.items():
if key == self.name: # targeted params for this node
if isinstance(value, dict):

# Extract debug attributes
if "debug" in value.keys():
self.debug = value.pop("debug")
@@ -156,19 +173,19 @@ def _dispatch_run(self, **kwargs) -> Tuple[Dict, str]:
if key in run_signature_args:
run_inputs[key] = value

output, stream = self.run(**run_inputs, **run_params)
output, stream = run_method(**run_inputs, **run_params)

# Collect debug information
debug_info = {}
if getattr(self, "debug", None):
# Include input
debug_info["input"] = {**run_inputs, **run_params}
debug_info["input"]["debug"] = self.debug
# Include output
# Include output, exclude _debug to avoid recursion
filtered_output = {
key: value
for key, value in output.items() if key != "_debug"
} # Exclude _debug to avoid recursion
}
debug_info["output"] = filtered_output
# Include custom debug info
custom_debug = output.get("_debug", {})
@@ -182,9 +199,9 @@ def _dispatch_run(self, **kwargs) -> Tuple[Dict, str]:
if all_debug:
output["_debug"] = all_debug

# add "extra" args that were not used by the node
# add "extra" args that were not used by the node, but not the 'inputs' value
for k, v in arguments.items():
if k not in output.keys():
if k not in output.keys() and k != "inputs":
output[k] = v

output["params"] = params
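Stripped of Pipelines-specific details, the dispatch logic above does three things: pick `run` or `run_batch`, keep only the kwargs and node-targeted params that the chosen method accepts, and merge the method's output with any unused arguments before handing it to the next node. A standalone toy sketch of that routing, illustrative only and not the BaseComponent API:

```
import inspect
from typing import Any, Callable, Dict


class ToyNode:
    name = "Retriever"

    def run(self, query: str, top_k: int = 10):
        return {"documents": [f"doc for {query}"][:top_k]}, "output_1"

    def run_batch(self, queries, top_k: int = 10):
        return {"documents": [[f"doc for {q}"][:top_k] for q in queries]}, "output_1"

    def _dispatch(self, run_method: Callable, **kwargs):
        params: Dict[str, Any] = kwargs.pop("params", None) or {}
        # Keep only arguments the chosen run method accepts, plus params targeted at this node.
        accepted = inspect.signature(run_method).parameters.keys()
        node_params = {k: v for k, v in params.get(self.name, {}).items() if k in accepted}
        run_inputs = {k: v for k, v in kwargs.items() if k in accepted}
        output, stream = run_method(**run_inputs, **node_params)
        # Pass unused arguments through so downstream nodes still see them.
        for k, v in kwargs.items():
            output.setdefault(k, v)
        output["params"] = params
        return output, stream


node = ToyNode()
print(node._dispatch(node.run, query="intro to the Amazon river", params={"Retriever": {"top_k": 5}}))
print(node._dispatch(node.run_batch, queries=["q1", "q2"], params={"Retriever": {"top_k": 5}}))
```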