From 0f8dbc0bd3b28e88bb37263db9813872a5a9a42e Mon Sep 17 00:00:00 2001
From: Towdo
Date: Fri, 6 Oct 2023 10:40:47 +0200
Subject: [PATCH] Fixed inconsistency in several fast tokenizers (#26561)

---
 .../models/bert/tokenization_bert_fast.py      |  2 +-
 .../convbert/tokenization_convbert_fast.py     |  2 +-
 .../retribert/tokenization_retribert_fast.py   |  2 +-
 .../tokenization_distilbert_fast.py            |  2 +-
 .../electra/tokenization_electra_fast.py       |  2 +-
 .../models/funnel/tokenization_funnel_fast.py  |  2 +-
 .../layoutlm/tokenization_layoutlm_fast.py     |  2 +-
 .../models/lxmert/tokenization_lxmert_fast.py  |  2 +-
 .../tokenization_mobilebert_fast.py            |  2 +-
 .../models/realm/tokenization_realm_fast.py    |  2 +-
 .../roformer/tokenization_roformer_fast.py     |  2 +-
 .../tokenization_squeezebert_fast.py           |  2 +-
 tests/test_tokenization_common.py              | 30 ++++++++++++-------
 13 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py
index e55f3f36ad6dd3..80d542367dca33 100644
--- a/src/transformers/models/bert/tokenization_bert_fast.py
+++ b/src/transformers/models/bert/tokenization_bert_fast.py
@@ -265,7 +265,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py
index 07447bb6a17caa..7ccc21b3e058d5 100644
--- a/src/transformers/models/convbert/tokenization_convbert_fast.py
+++ b/src/transformers/models/convbert/tokenization_convbert_fast.py
@@ -159,7 +159,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py b/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
index 30cb69c2b32b3f..07f7964b9f3f8e 100644
--- a/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
+++ b/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
@@ -164,7 +164,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
index dd9dcd165d4109..adb90f857d75fe 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -190,7 +190,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/electra/tokenization_electra_fast.py b/src/transformers/models/electra/tokenization_electra_fast.py
index cf92dd01714f9d..81704317f869a2 100644
--- a/src/transformers/models/electra/tokenization_electra_fast.py
+++ b/src/transformers/models/electra/tokenization_electra_fast.py
@@ -192,7 +192,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py
index 864303eb210153..17946eb74b5839 100644
--- a/src/transformers/models/funnel/tokenization_funnel_fast.py
+++ b/src/transformers/models/funnel/tokenization_funnel_fast.py
@@ -212,7 +212,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
index 7ba06d7fa1107e..afa92abaf87745 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
@@ -166,7 +166,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
index 8e58a3aafac5c1..0584f1fe83c351 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
@@ -152,7 +152,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
index 6bac366d237859..f8d62158b22cef 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
@@ -150,7 +150,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py
index 1cc1a996653530..59b23f45ee0b30 100644
--- a/src/transformers/models/realm/tokenization_realm_fast.py
+++ b/src/transformers/models/realm/tokenization_realm_fast.py
@@ -282,7 +282,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py
index d73e3cdb93ccc6..360b76b843dd7f 100644
--- a/src/transformers/models/roformer/tokenization_roformer_fast.py
+++ b/src/transformers/models/roformer/tokenization_roformer_fast.py
@@ -163,7 +163,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
index bf7659ffd18b4b..23faab71349f78 100644
--- a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
+++ b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
@@ -173,7 +173,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index a2f207c96391c2..523d49bc9d34fd 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3209,19 +3209,27 @@ def test_build_inputs_with_special_tokens(self):
                 # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                 # self.assertEqual(output_p, output_r)
 
-                # Input tokens id
-                input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
-                input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)
+                input_pairs = [
+                    ("", ""),
+                    ("", "This is a sample pair"),
+                    ("This is a sample input", ""),
+                    ("This is a sample input", "This is a sample pair"),
+                ]
 
-                # Generate output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
-                self.assertEqual(output_p, output_r)
+                for sample_input, sample_pair in input_pairs:
+                    # Input tokens id
+                    input_simple = tokenizer_p.encode(sample_input, add_special_tokens=False)
+                    input_pair = tokenizer_p.encode(sample_pair, add_special_tokens=False)
 
-                # Generate pair output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
-                self.assertEqual(output_p, output_r)
+                    # Generate output
+                    output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
+                    output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
+                    self.assertEqual(output_p, output_r)
+
+                    # Generate pair output
+                    output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
+                    output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
+                    self.assertEqual(output_p, output_r)
 
     def test_padding(self, max_length=50):
         if not self.test_slow_tokenizer:
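For context on the one-line change repeated across the tokenizer files above: `if token_ids_1:` is false both when `token_ids_1` is None and when it is an empty list, so encoding an empty second sequence silently dropped its trailing [SEP] token, while `is not None` only skips the branch for a genuinely missing pair. A minimal standalone sketch of the difference (the helper names and the token id values 101/102 for [CLS]/[SEP] are hypothetical placeholders, not code from the repository):

# Sketch of the truthiness bug fixed by this patch; helper names and
# token ids are placeholders, not transformers code.

def build_with_truthiness_check(token_ids_0, token_ids_1=None):
    # Old behavior: `if token_ids_1:` is False for both None and [],
    # so an empty pair never gets its closing [SEP].
    output = [101] + token_ids_0 + [102]
    if token_ids_1:
        output += token_ids_1 + [102]
    return output

def build_with_none_check(token_ids_0, token_ids_1=None):
    # Patched behavior: only a missing pair (None) skips the branch;
    # an empty list still appends the second [SEP].
    output = [101] + token_ids_0 + [102]
    if token_ids_1 is not None:
        output += token_ids_1 + [102]
    return output

assert build_with_truthiness_check([7], []) == [101, 7, 102]   # second [SEP] lost
assert build_with_none_check([7], []) == [101, 7, 102, 102]    # second [SEP] kept
assert build_with_none_check([7]) == [101, 7, 102]             # no pair: unchanged

This edge case is also why the updated test iterates over empty and non-empty (input, pair) combinations, checking that the Rust-backed tokenizer (tokenizer_r) and the Python tokenizer (tokenizer_p) build identical outputs for each.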