Skip to content

Commit

Permalink
fix whitespace LM tokenize issue (#7407)
Browse files Browse the repository at this point in the history
* fix whitespace LM tokenize issue

* Add testcase

* format testcase with black

* Add changelog entry

* Update changelog/7407.bugfix.md

Co-authored-by: Tom Bocklisch <tom@rasa.com>
Co-authored-by: Roberto <43567378+rasabot@users.noreply.github.com>
  • Loading branch information
3 people authored Dec 14, 2020
1 parent f6e12bf commit 3c5102e
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
1 change: 1 addition & 0 deletions changelog/7407.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove a token when its text (for example, whitespace) can't be tokenized by the LM tokenizer (from `LanguageModelFeaturizer`).
6 changes: 6 additions & 0 deletions rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,12 @@ def _tokenize_example(
# use lm specific tokenizer to further tokenize the text
split_token_ids, split_token_strings = self._lm_tokenize(token.text)

if not split_token_ids:
# fix the situation that `token.text` only contains whitespace or other special characters,
# which cause `split_token_ids` and `split_token_strings` be empty,
# finally cause `self._lm_specific_token_cleanup()` to raise an exception
continue

(split_token_ids, split_token_strings) = self._lm_specific_token_cleanup(
split_token_ids, split_token_strings
)
Expand Down
30 changes: 30 additions & 0 deletions tests/nlu/featurizers/test_lm_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,3 +763,33 @@ def test_preserve_sentence_and_sequence_features_old_config():
assert not (message.features[1].features == lm_docs[SENTENCE_FEATURES]).any()
assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()


@pytest.mark.parametrize(
    "text, tokens, expected_feature_tokens",
    [
        (
            "购买 iPhone 12",  # the bare whitespace tokens should be dropped
            [("购买", 0), (" ", 2), ("iPhone", 3), (" ", 9), ("12", 10)],
            [("购买", 0), ("iPhone", 3), ("12", 10)],
        )
    ],
)
def test_lm_featurizer_correctly_handle_whitespace_token(
    text, tokens, expected_feature_tokens
):
    """Tokens whose text the LM tokenizer cannot encode (e.g. whitespace)
    must be removed from the featurizer's token sequence."""
    from rasa.nlu.tokenizers.tokenizer import Token

    featurizer = LanguageModelFeaturizer(
        {"model_name": "bert", "model_weights": "bert-base-chinese"}
    )

    # Build a message carrying the pre-tokenized input.
    msg = Message.build(text=text)
    msg.set(TOKENS_NAMES[TEXT], [Token(word, start) for word, start in tokens])

    feature_tokens, _ = featurizer._tokenize_example(msg, TEXT)

    actual = [(tok.text, tok.start) for tok in feature_tokens]
    assert actual == expected_feature_tokens

0 comments on commit 3c5102e

Please sign in to comment.