diff --git a/paddlenlp/transformers/generation_utils.py b/paddlenlp/transformers/generation_utils.py
index 195efa6e107b..c9c9f87b25ad 100644
--- a/paddlenlp/transformers/generation_utils.py
+++ b/paddlenlp/transformers/generation_utils.py
@@ -422,7 +422,8 @@ def update_model_kwargs_for_generation(outputs,
         # method.
 
         # update cache
-        if isinstance(outputs, tuple):
+        if isinstance(outputs,
+                      tuple) and not isinstance(outputs[1], paddle.Tensor):
             model_kwargs["cache"] = outputs[1]
 
         # update token_type_ids with last value
diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py
index efeffa66b67e..e054426a0001 100644
--- a/paddlenlp/transformers/t5/modeling.py
+++ b/paddlenlp/transformers/t5/modeling.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
 #
@@ -31,6 +30,12 @@
     'T5ForConditionalGeneration',
 ]
 
+T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "t5-small",
+    "t5-base",
+    "t5-large",
+]
+
 
 def finfo(dtype):
     if dtype == paddle.float32:
@@ -107,6 +112,27 @@ def forward(self, hidden_states):
         return hidden_states
 
 
+class T5DenseGatedSiluDense(nn.Layer):
+    """
+    Construct a dense-gated_silu-dense module.
+    """
+
+    def __init__(self, d_model, d_ff, dropout_rate):
+        super().__init__()
+        self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False)
+        self.wi_1 = nn.Linear(d_model, d_ff, bias_attr=False)
+        self.wo = nn.Linear(d_ff, d_model, bias_attr=False)
+        self.dropout = nn.Dropout(dropout_rate)
+
+    def forward(self, hidden_states):
+        hidden_silu = F.silu(self.wi_0(hidden_states))
+        hidden_linear = self.wi_1(hidden_states)
+        hidden_states = hidden_silu * hidden_linear
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+
 class T5LayerFF(nn.Layer):
 
     def __init__(self, feed_forward_proj, d_model, d_ff, layer_norm_epsilon,
@@ -117,6 +143,9 @@ def __init__(self, feed_forward_proj, d_model, d_ff, layer_norm_epsilon,
         elif feed_forward_proj == "gated-gelu":
             self.DenseReluDense = T5DenseGatedGeluDense(d_model, d_ff,
                                                         dropout_rate)
+        elif feed_forward_proj == "gated-silu":
+            self.DenseReluDense = T5DenseGatedSiluDense(d_model, d_ff,
+                                                        dropout_rate)
         else:
             raise ValueError(
                 f"{feed_forward_proj} is not supported. Choose between `relu` and `gated-gelu`"
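The gated-SiLU block added above follows the same GLU-style pattern as the existing gated-GeLU feed-forward: a SiLU-activated projection gates a parallel linear projection before the output projection. A minimal standalone sketch of that computation (class name and tensor shapes are illustrative only, not part of the patch):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class GatedSiluFFN(nn.Layer):
    """Standalone sketch of a dense-gated_silu-dense block."""

    def __init__(self, d_model, d_ff, dropout_rate):
        super().__init__()
        self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False)  # gate branch
        self.wi_1 = nn.Linear(d_model, d_ff, bias_attr=False)  # linear branch
        self.wo = nn.Linear(d_ff, d_model, bias_attr=False)    # output projection
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # silu(x @ W_0) gates the linear branch x @ W_1
        return self.wo(self.dropout(F.silu(self.wi_0(x)) * self.wi_1(x)))

hidden = paddle.randn([2, 8, 64])  # [batch, seq_len, d_model]
ffn = GatedSiluFFN(d_model=64, d_ff=256, dropout_rate=0.1)
print(ffn(hidden).shape)  # [2, 8, 64]
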
@@ -522,6 +551,7 @@ def forward(
             output_attentions=output_attentions,
         )
         hidden_states, present_key_value_state = self_attention_outputs[:2]
+
         attention_outputs = self_attention_outputs[
             2:]  # Keep self-attention outputs and relative position weights
 
@@ -989,7 +1019,7 @@ def forward(self,
             # layer_outputs is a tuple with:
             # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
-            if use_cache is False:
+            if not use_cache:
                 layer_outputs = layer_outputs[:1] + (None, ) + layer_outputs[1:]
 
             hidden_states, present_key_value_state = layer_outputs[:2]
@@ -1040,8 +1070,6 @@ def get_extended_attention_mask(self, attention_mask, input_shape):
                 causal_mask = paddle.tile(seq_ids.unsqueeze(axis=[0, 1]),
                                           [batch_size, seq_length, 1
                                            ]) <= seq_ids.unsqueeze(axis=[0, 2])
-                # in case cache are used we need to add a prefix ones mask to the causal mask
-                # causal and attention masks must have same type with pytorch version < 1.3
                 causal_mask = causal_mask.astype(attention_mask.dtype)
 
                 if causal_mask.shape[1] < attention_mask.shape[1]:
@@ -1062,6 +1090,35 @@ def get_extended_attention_mask(self, attention_mask, input_shape):
                     1) * attention_mask.unsqueeze([1, 2])
             else:
                 extended_attention_mask = attention_mask.unsqueeze([1, 2])
+        elif attention_mask.ndim == 4:
+            if self.is_decoder:
+                batch_size, seq_length = input_shape
+                seq_ids = paddle.arange(seq_length)
+                causal_mask = paddle.tile(seq_ids.unsqueeze(axis=[0, 1]),
+                                          [batch_size, seq_length, 1
+                                           ]) <= seq_ids.unsqueeze(axis=[0, 2])
+                # in case a cache is used we need to add a prefix ones mask to the causal mask
+                # causal and attention masks must have same type with pytorch version < 1.3
+                causal_mask = causal_mask.astype(attention_mask.dtype)
+
+                if causal_mask.shape[1] < attention_mask.shape[-1]:
+                    prefix_seq_len = attention_mask.shape[
+                        -1] - causal_mask.shape[1]
+                    causal_mask = paddle.concat(
+                        [
+                            paddle.ones(
+                                [batch_size, seq_length, prefix_seq_len],
+                                dtype=causal_mask.dtype,
+                            ),
+                            causal_mask,
+                        ],
+                        axis=-1,
+                    )
+
+                extended_attention_mask = causal_mask.unsqueeze(
+                    1) * attention_mask
+            else:
+                extended_attention_mask = attention_mask
         else:
             raise ValueError(
                 f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
@@ -1072,10 +1129,12 @@ def get_extended_attention_mask(self, attention_mask, input_shape):
         return extended_attention_mask
 
     def invert_attention_mask(self, encoder_attention_mask):
-        if encoder_attention_mask.ndim == 3:
+        if encoder_attention_mask.ndim == 4:
+            encoder_extended_attention_mask = encoder_attention_mask
+        elif encoder_attention_mask.ndim == 3:
             encoder_extended_attention_mask = encoder_attention_mask.unsqueeze(
                 1)
-        if encoder_attention_mask.ndim == 2:
+        elif encoder_attention_mask.ndim == 2:
             encoder_extended_attention_mask = encoder_attention_mask.unsqueeze(
                 [1, 2])
         encoder_extended_attention_mask = encoder_extended_attention_mask.astype(
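The new `ndim == 4` branch above accepts a mask that is already broadcast to `[batch, 1, query_len, key_len]` and only multiplies the causal constraint into it, instead of expanding a 2-D padding mask. A rough standalone sketch of the shape arithmetic (not the model's code path; shapes are illustrative):

import paddle

batch_size, seq_length = 2, 5
seq_ids = paddle.arange(seq_length)

# Lower-triangular causal mask, shape [batch, seq_length, seq_length]
causal_mask = (paddle.tile(seq_ids.unsqueeze(axis=[0, 1]),
                           [batch_size, seq_length, 1])
               <= seq_ids.unsqueeze(axis=[0, 2])).astype("float32")

# A user-supplied 4-D mask, shape [batch, 1, seq_length, seq_length]
attention_mask = paddle.ones([batch_size, 1, seq_length, seq_length])

# Broadcasting keeps both the padding and the causal constraints
extended_attention_mask = causal_mask.unsqueeze(1) * attention_mask
print(extended_attention_mask.shape)  # [2, 1, 5, 5]
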
@@ -1176,6 +1235,13 @@ def __init__(self,
         self.d_model = d_model
         self.initializer_factor = initializer_factor
 
+        if num_decoder_layers is None and num_layers is None:
+            raise ValueError(
+                "You have to specify either num_decoder_layers or num_layers or both."
+            )
+        elif num_decoder_layers is None:
+            num_decoder_layers = num_layers
+
         self.shared = nn.Embedding(vocab_size, d_model)
 
         self.encoder = T5Stack(d_model,
                                num_layers,
@@ -1401,9 +1467,10 @@ def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
     def get_output_embeddings(self):
-        if not self.t5.config["tie_word_embeddings"]:
+        if self.t5.config["tie_word_embeddings"]:
             return self.t5.shared
-        return self.lm_head
+        else:
+            return self.lm_head
 
     def get_encoder(self):
         return self.t5.encoder
@@ -1514,7 +1581,10 @@ def forward(self,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states)
 
-        hidden_states = encoder_output[0]
+        if isinstance(encoder_output, (tuple, list)):
+            hidden_states = encoder_output[0]
+        else:
+            hidden_states = encoder_output
 
         if labels is not None and decoder_input_ids is None:
             # get decoder inputs from shifting lm labels to the right
@@ -1559,6 +1629,9 @@ def forward(self,
             loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]),
                             labels.flatten())
 
+        if not isinstance(encoder_output, (list, tuple)):
+            encoder_output = (encoder_output, )
+
         output = (lm_logits, ) + decoder_outputs[1:] + encoder_output
         return ((loss, ) + output) if loss is not None else output
diff --git a/paddlenlp/transformers/t5/tokenizer.py b/paddlenlp/transformers/t5/tokenizer.py
index 7f78caa80264..549a9bdccf9c 100644
--- a/paddlenlp/transformers/t5/tokenizer.py
+++ b/paddlenlp/transformers/t5/tokenizer.py
@@ -24,6 +24,12 @@
     'T5Tokenizer',
 ]
 
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "t5-small": 512,
+    "t5-base": 512,
+    "t5-large": 512,
+}
+
 
 class T5Tokenizer(AlbertEnglishTokenizer):
     """
@@ -88,6 +94,8 @@ class T5Tokenizer(AlbertEnglishTokenizer):
         },
     }
 
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
     def __init__(self,
                  sentencepiece_model_file,
                  do_lower_case=False,
@@ -98,6 +106,7 @@ def __init__(self,
                  pad_token="<pad>",
                  extra_ids=100,
                  additional_special_tokens=[],
+                 sp_model_kwargs=None,
                  **kwargs):
 
         # Add extra_ids to the special token list
@@ -123,28 +132,54 @@ def __init__(self,
         self.extra_ids = extra_ids
         self.sentencepiece_model_file = sentencepiece_model_file
 
-        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(sentencepiece_model_file)
 
     def __call__(self,
                  text,
                  text_pair=None,
-                 max_seq_len=None,
+                 max_length=None,
                  stride=0,
                  is_split_into_words=False,
-                 pad_to_max_seq_len=False,
-                 truncation_strategy="longest_first",
+                 padding=None,
+                 truncation="longest_first",
                  return_position_ids=False,
                  return_token_type_ids=False,
                  return_attention_mask=True,
                  return_length=False,
                  return_overflowing_tokens=False,
-                 return_special_tokens_mask=False):
+                 return_special_tokens_mask=False,
+                 **kwargs):
+        if "pad_to_max_seq_len" in kwargs and padding is None:
+            pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len")
+            padding = "max_length" if pad_to_max_seq_len else False
+        elif padding is None:
+            padding = False
+
+        if "max_seq_len" in kwargs and max_length is None:
+            max_length = kwargs["max_seq_len"]
+
+        if "truncation_strategy" in kwargs and kwargs[
+                "truncation_strategy"] != "longest_first":
+            truncation = kwargs["truncation_strategy"]
+
         return super(T5Tokenizer, self).__call__(
-            text, text_pair, max_seq_len, stride, is_split_into_words,
-            pad_to_max_seq_len, truncation_strategy, return_position_ids,
-            return_token_type_ids, return_attention_mask, return_length,
-            return_overflowing_tokens, return_special_tokens_mask)
+            text=text,
+            text_pair=text_pair,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            padding=padding,
+            truncation=truncation,
+            return_position_ids=return_position_ids,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_length=return_length,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            **kwargs)
 
     @property
     def vocab_size(self):
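The shim above keeps the legacy `max_seq_len` / `pad_to_max_seq_len` / `truncation_strategy` arguments working while the tokenizer moves to the `max_length` / `padding` / `truncation` names. A usage sketch, assuming the `t5-small` vocabulary can be fetched with `from_pretrained`:

from paddlenlp.transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

# New-style arguments
new_style = tokenizer("translate English to German: hello",
                      max_length=16,
                      padding="max_length")

# Legacy arguments are rewritten onto the new names by the shim
old_style = tokenizer("translate English to German: hello",
                      max_seq_len=16,
                      pad_to_max_seq_len=True)

# Both calls should produce the same 16-token, right-padded sequence
print(new_style["input_ids"])
print(old_style["input_ids"])
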
@@ -254,36 +289,6 @@ def convert_tokens_to_string(self, tokens):
             out_string += self.sp_model.decode_pieces(current_sub_tokens)
         return out_string.strip()
 
-    def decode(self,
-               token_ids,
-               skip_special_tokens=False,
-               clean_up_tokenization_spaces=True):
-        """
-        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
-        tokens and clean up tokenization spaces.
-
-        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
-
-        Args:
-            token_ids (Union[List[int], Tensor]):
-                List of tokenized input ids.
-            skip_special_tokens (bool, optional):
-                Whether or not to remove special tokens in the decoding. Defaults to `False`.
-            clean_up_tokenization_spaces (bool, optional):
-                Whether or not to clean up the tokenization spaces. Defaults to `True`.
-
-        Returns:
-            str: The decoded sentence.
-        """
-        if hasattr(token_ids, "tolist"):
-            token_ids = token_ids.tolist()
-        text = self.convert_tokens_to_string(
-            self.convert_ids_to_tokens(token_ids,
-                                       skip_special_tokens=skip_special_tokens))
-        if clean_up_tokenization_spaces:
-            text = self.clean_up_tokenization(text)
-        return text
-
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         if token.startswith("<extra_id_"):
+        self.assertEqual(vocab_keys[1], "")
+        self.assertEqual(vocab_keys[-1], "")
+        self.assertEqual(len(vocab_keys), 1_101)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.get_tokenizer().vocab_size, 1_100)
+
+    def test_full_tokenizer(self):
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
+
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
+
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [285, 46, 10, 170, 382])
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(
+            tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "9",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "é",
+                ".",
+            ],
+        )
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(ids, [
+            8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72,
+            80, 6, 0, 4
+        ])
+
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(
+            back_tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "<unk>",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "<unk>",
+                ".",
+            ],
+        )
+
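The 1_100-token vocabulary asserted above already includes the 100 `<extra_id_n>` sentinels that the constructor appends to `additional_special_tokens`. A plain-Python sketch of that bookkeeping (an approximation of the behaviour, not the tokenizer's exact code):

extra_ids = 100
additional_special_tokens = []

# Mirror of how sentinel tokens are appended when none were passed in
if extra_ids > 0 and not any("extra_id" in t for t in additional_special_tokens):
    additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]

assert len(additional_special_tokens) == 100
assert additional_special_tokens[0] == "<extra_id_0>"
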
+    def t5_base_tokenizer(self):
+        return T5Tokenizer.from_pretrained("t5-base")
+
+    def get_tokenizer(self, **kwargs) -> T5Tokenizer:
+        return self.tokenizer_class.from_pretrained(self.tmpdirname,
+                                                    pad_token=None,
+                                                    **kwargs)
+
+    def test_eos_treatment(self):
+        tokenizer = self.t5_base_tokenizer()
+        batch_with_eos_added = tokenizer(
+            ["hi</s>", "I went to the gym</s>", "</s>"])
+        batch_without_eos_added = tokenizer(["hi", "I went to the gym", ""])
+        self.assertListEqual(batch_with_eos_added["input_ids"],
+                             batch_without_eos_added["input_ids"])
+
+    def test_prepare_batch(self):
+        tokenizer = self.t5_base_tokenizer()
+        src_text = [
+            "A long paragraph for summarization.",
+            "Another paragraph for summarization."
+        ]
+        expected_src_tokens = [
+            71, 307, 8986, 21, 4505, 1635, 1707, 5, tokenizer.eos_token_id
+        ]
+        batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK)
+        self.assertIsInstance(batch, BatchEncoding)
+
+        result = list(batch["input_ids"].tolist()[0])
+
+        self.assertListEqual(expected_src_tokens, result)
+
+        self.assertEqual([2, 9], batch["input_ids"].shape)
+        self.assertEqual([2, 9], batch.attention_mask.shape)
+
+    def test_empty_target_text(self):
+        tokenizer = self.t5_base_tokenizer()
+        src_text = [
+            "A long paragraph for summarization.",
+            "Another paragraph for summarization."
+        ]
+        batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK)
+        # check if input_ids are returned and no decoder_input_ids
+        self.assertIn("input_ids", batch)
+        self.assertIn("attention_mask", batch)
+        self.assertNotIn("decoder_input_ids", batch)
+        self.assertNotIn("decoder_attention_mask", batch)
+
+    def test_max_length(self):
+        tokenizer = self.t5_base_tokenizer()
+        tgt_text = [
+            "Summary of the text.",
+            "Another summary.",
+        ]
+        targets = tokenizer(text=tgt_text,
+                            max_length=32,
+                            padding="max_length",
+                            truncation=True,
+                            return_tensors=FRAMEWORK)
+        self.assertEqual(32, targets["input_ids"].shape[1])
+
+    def test_outputs_not_longer_than_maxlen(self):
+        tokenizer = self.t5_base_tokenizer()
+
+        batch = tokenizer(["I am a small frog" * 1000, "I am a small frog"],
+                          padding=True,
+                          truncation=True,
+                          return_tensors=FRAMEWORK)
+        self.assertIsInstance(batch, BatchEncoding)
+        # Since T5 does NOT have a max input length,
+        # this test should be changed to the following in Transformers v5:
+        # self.assertEqual(batch["input_ids"].shape, (2, 8001))
+        self.assertEqual(batch["input_ids"].shape, [2, 512])
+
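The `[2, 512]` shape asserted above comes from the `max_model_input_sizes` entry registered in tokenizer.py, which ties each released checkpoint to a 512-token cap. A hypothetical sketch of how such a cap combines with an explicit `max_length` (helper name and logic are assumptions, not PaddleNLP API):

max_model_input_sizes = {"t5-small": 512, "t5-base": 512, "t5-large": 512}

def effective_max_length(checkpoint, requested=None):
    # Fall back to the per-checkpoint cap when no explicit max_length is given
    cap = max_model_input_sizes.get(checkpoint)
    return cap if requested is None else min(requested, cap)

assert effective_max_length("t5-base") == 512
assert effective_max_length("t5-base", requested=32) == 32
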
"] + expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, 1] + expected_tgt_tokens = [20698, 13, 8, 1499, 5, 1] + + batch = tokenizer(src_text, text_target=tgt_text) + + self.assertEqual(expected_src_tokens, batch["input_ids"][0]) + # self.assertEqual(expected_tgt_tokens, batch["labels"][0]) + + def test_token_type_ids(self): + src_text_1 = ["A first paragraph for summarization."] + src_text_2 = ["A second paragraph for summarization."] + + tokenizer = self.t5_base_tokenizer() + + slow_token_type_ids = tokenizer( + src_text_1, + src_text_2, + add_special_tokens=True, + return_token_type_ids=True)["token_type_ids"] + + self.assertEqual(len(slow_token_type_ids[0]), 18) + + def test_special_tokens_initialization_with_non_empty_additional_special_tokens( + self): + tokenizer_list = [] + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), + encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), + encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + added_tokens_extra_ids = [f"" for i in range(100)] + + special_tokens_map[ + "additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + tokenizer_config[ + "additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), + "w", + encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), + "w", + encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. 
+                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
+                # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
+                # "special_tokens_map.json" files
+                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
+                    tmp_dir, )
+                self.assertIn(
+                    "an_additional_special_token",
+                    tokenizer_without_change_in_init.additional_special_tokens)
+                # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
+                self.assertEqual(
+                    ["an_additional_special_token"],
+                    tokenizer_without_change_in_init.convert_ids_to_tokens(
+                        tokenizer_without_change_in_init.convert_tokens_to_ids(
+                            ["an_additional_special_token"])),
+                )
+
+                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
+                new_added_tokens = added_tokens_extra_ids + [
+                    AddedToken("a_new_additional_special_token", lstrip=True)
+                ]
+                tokenizer = tokenizer_class.from_pretrained(
+                    tmp_dir,
+                    additional_special_tokens=new_added_tokens,
+                )
+
+                self.assertIn("a_new_additional_special_token",
+                              tokenizer.additional_special_tokens)
+                self.assertEqual(
+                    ["a_new_additional_special_token"],
+                    tokenizer.convert_ids_to_tokens(
+                        tokenizer.convert_tokens_to_ids(
+                            ["a_new_additional_special_token"])),
+                )
+
+    # overwritten from `test_tokenization_common` since T5 has no max length
+    def test_pretrained_model_lists(self):
+        # We should have at least one default checkpoint for each tokenizer
+        # We should specify the max input length as well (used in some part to list the pretrained checkpoints)
+        self.assertGreaterEqual(
+            len(self.tokenizer_class.pretrained_resource_files_map), 1)
+        self.assertGreaterEqual(
+            len(
+                list(
+                    self.tokenizer_class.pretrained_resource_files_map.values())
+                [0]), 1)
+
+    def test_offsets_mapping(self):
+        pass
diff --git a/tests/transformers/test_generation_utils.py b/tests/transformers/test_generation_utils.py
index 8d80e668290b..c6031f641971 100644
--- a/tests/transformers/test_generation_utils.py
+++ b/tests/transformers/test_generation_utils.py
@@ -83,19 +83,19 @@ def _get_logits_processor_and_kwargs(
         forced_bos_token_id=None,
         forced_eos_token_id=None,
         max_length=None,
-        diversity_penalty=None,
+        diversity_rate=None,
 ):
     process_kwargs = {
         "min_length": 1 if max_length is None else max_length - 1,
         "repetition_penalty": 1.2,
     }
-    if diversity_penalty is not None:
-        process_kwargs["diversity_rate"] = diversity_penalty
+    if diversity_rate is not None:
+        process_kwargs["diversity_rate"] = diversity_rate
     logits_processor = LogitsProcessorList(([
         HammingDiversityLogitsProcessor(
-            diversity_penalty, num_beams=2, num_beam_groups=2),
-    ] if diversity_penalty is not None else []) + ([
+            diversity_rate, num_beams=2, num_beam_groups=2),
+    ] if diversity_rate is not None else []) + ([
         MinLengthLogitsProcessor(process_kwargs["min_length"], eos_token_id
                                  ),
     ] if eos_token_id is not None else []) + ([
@@ -143,7 +143,7 @@ def _get_diverse_beam_scorer_and_kwargs(batch_size,
         "num_beams": 2,
         "num_return_sequences": num_return_sequences,
         "num_beam_groups": 2,  # one beam per group
-        "diversity_penalty": 2.0,
+        "diversity_rate": 2.0,
     }
     beam_scorer = BeamSearchScorer(
         batch_size=batch_size,
@@ -171,6 +171,9 @@ def _get_encoder_outputs(
             input_ids,
             attention_mask=attention_mask,
         )
+        if isinstance(encoder_outputs, (list, tuple)):
+            encoder_outputs = encoder_outputs[0]
+
         encoder_outputs = encoder_outputs.repeat_interleave(num_interleave,
                                                             axis=0)
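`_get_encoder_outputs` now unwraps list/tuple results because encoders such as T5's return a bare tensor when no optional outputs are requested and a tuple otherwise. A minimal, framework-free sketch of that normalization:

def normalize_encoder_output(encoder_outputs):
    # Encoders may return `hidden_states` alone or a tuple such as
    # (hidden_states, all_hidden_states, attentions); keep the first item.
    if isinstance(encoder_outputs, (list, tuple)):
        return encoder_outputs[0]
    return encoder_outputs

assert normalize_encoder_output(("h",)) == "h"
assert normalize_encoder_output("h") == "h"
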
@@ -368,6 +371,7 @@ def _group_beam_search_generate(
         logits_processor,
         logits_process_kwargs,
     ):
+        beam_kwargs.pop("diversity_rate")
         model.eval()
         with paddle.no_grad():
             output_generate = model.generate(
@@ -593,7 +597,7 @@ def test_group_beam_search_generate(self):
             getattr(config, "forced_bos_token_id", None),
             getattr(config, "forced_eos_token_id", None),
             max_length,
-            diversity_penalty=2.0,
+            diversity_rate=2.0,
         )
 
         # check `generate()` and `group_beam_search()` are equal
@@ -790,7 +794,7 @@ def test_diverse_beam_search(self):
             num_beams=4,
             num_return_sequences=3,
             num_beam_groups=4,
-            diversity_penalty=2.0,
+            diversity_rate=2.0,
         )
 
         generated_text = bart_tokenizer.batch_decode(outputs,
diff --git a/tests/transformers/test_tokenizer_common.py b/tests/transformers/test_tokenizer_common.py
index 3316d91df773..2aae52804272 100644
--- a/tests/transformers/test_tokenizer_common.py
+++ b/tests/transformers/test_tokenizer_common.py
@@ -947,7 +947,8 @@ def test_maximum_encoding_length_single_input(self):
 
                 sequence1 = tokenizer(seq_1,
                                       return_token_type_ids=None,
-                                      add_special_tokens=False)
+                                      add_special_tokens=False,
+                                      truncation=False)
                 total_length1 = len(sequence1["input_ids"])
                 self.assertGreater(
                     total_length1, model_max_length,
@@ -1080,12 +1081,14 @@ def test_maximum_encoding_length_pair_input(self):
 
                 sequence1 = tokenizer(seq_1,
                                       return_token_type_ids=None,
-                                      add_special_tokens=False)
+                                      add_special_tokens=False,
+                                      truncation=False)
                 total_length1 = len(sequence1["input_ids"])
                 sequence2 = tokenizer(seq_2,
                                       seq_1,
                                       return_token_type_ids=None,
-                                      add_special_tokens=False)
+                                      add_special_tokens=False,
+                                      truncation=False)
                 total_length2 = len(sequence2["input_ids"])
                 self.assertLess(
                     total_length1, model_max_length - 10,
@@ -1900,25 +1903,46 @@ def test_call(self):
                 ]
 
                 # Test not batched
-                encoded_sequences_1 = tokenizer.encode(sequences[0])
-                encoded_sequences_2 = tokenizer(sequences[0])
+                encoded_sequences_1 = tokenizer.encode(
+                    sequences[0],
+                    return_token_type_ids=False,
+                    return_attention_mask=True)
+                encoded_sequences_2 = tokenizer(sequences[0],
+                                                return_token_type_ids=False,
+                                                return_attention_mask=True)
                 self.assertEqual(encoded_sequences_1, encoded_sequences_2)
 
                 # Test not batched pairs
-                encoded_sequences_1 = tokenizer.encode(sequences[0],
-                                                       sequences[1])
-                encoded_sequences_2 = tokenizer(sequences[0], sequences[1])
+                encoded_sequences_1 = tokenizer.encode(
+                    sequences[0],
+                    sequences[1],
+                    return_token_type_ids=False,
+                    return_attention_mask=True)
+                encoded_sequences_2 = tokenizer(sequences[0],
+                                                sequences[1],
+                                                return_token_type_ids=False,
+                                                return_attention_mask=True)
                 self.assertEqual(encoded_sequences_1, encoded_sequences_2)
 
                 # Test batched
-                encoded_sequences_1 = tokenizer.batch_encode(sequences)
-                encoded_sequences_2 = tokenizer(sequences)
+                encoded_sequences_1 = tokenizer.batch_encode(
+                    sequences,
+                    return_token_type_ids=False,
+                    return_attention_mask=True)
+                encoded_sequences_2 = tokenizer(sequences,
+                                                return_token_type_ids=False,
+                                                return_attention_mask=True)
                 self.assertEqual(encoded_sequences_1, encoded_sequences_2)
 
                 # Test batched pairs
                 encoded_sequences_1 = tokenizer.batch_encode(
-                    list(zip(sequences, sequences)))
-                encoded_sequences_2 = tokenizer(sequences, sequences)
+                    list(zip(sequences, sequences)),
+                    return_token_type_ids=False,
+                    return_attention_mask=True)
+                encoded_sequences_2 = tokenizer(sequences,
+                                                sequences,
+                                                return_token_type_ids=False,
+                                                return_attention_mask=True)
                 self.assertEqual(encoded_sequences_1, encoded_sequences_2)
 
     def test_batch_encode_plus_batch_sequence_length(self):
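Taken together, the cache guard in generation_utils.py and the encoder-output handling in modeling.py are what let `T5ForConditionalGeneration` run `generate()`. A hedged end-to-end sketch (checkpoint name taken from the archive list added above; the `(ids, scores)` return convention of PaddleNLP's `generate()` is assumed):

import paddle
from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.eval()

inputs = tokenizer("translate English to German: The house is wonderful.")
input_ids = paddle.to_tensor([inputs["input_ids"]])

with paddle.no_grad():
    generated_ids, scores = model.generate(input_ids,
                                           max_length=20,
                                           decode_strategy="greedy_search")

print(tokenizer.convert_ids_to_tokens(generated_ids[0].tolist()))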