Skip to content

Commit

Permalink
fix whitespace LM tokenize issue (#7407)
Browse files Browse the repository at this point in the history
* fix whitespace LM tokenize issue

* Add testcase

* format testcase with black

* Add changelog entry

* Update changelog/7407.bugfix.md

Co-authored-by: Tom Bocklisch <tom@rasa.com>
Co-authored-by: Roberto <43567378+rasabot@users.noreply.github.com>
  • Loading branch information
3 people authored Dec 14, 2020
1 parent f6e12bf commit 3c5102e
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
1 change: 1 addition & 0 deletions changelog/7407.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove a token when its text (for example, whitespace) can't be tokenized by the LM tokenizer (from `LanguageModelFeaturizer`).
6 changes: 6 additions & 0 deletions rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,12 @@ def _tokenize_example(
# use lm specific tokenizer to further tokenize the text
split_token_ids, split_token_strings = self._lm_tokenize(token.text)

if not split_token_ids:
# fix the situation that `token.text` only contains whitespace or other special characters,
# which cause `split_token_ids` and `split_token_strings` be empty,
# finally cause `self._lm_specific_token_cleanup()` to raise an exception
continue

(split_token_ids, split_token_strings) = self._lm_specific_token_cleanup(
split_token_ids, split_token_strings
)
Expand Down
30 changes: 30 additions & 0 deletions tests/nlu/featurizers/test_lm_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,3 +763,33 @@ def test_preserve_sentence_and_sequence_features_old_config():
assert not (message.features[1].features == lm_docs[SENTENCE_FEATURES]).any()
assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()


@pytest.mark.parametrize(
    "text, tokens, expected_feature_tokens",
    [
        (
            "购买 iPhone 12",  # the bare whitespace tokens should be dropped
            [("购买", 0), (" ", 2), ("iPhone", 3), (" ", 9), ("12", 10)],
            [("购买", 0), ("iPhone", 3), ("12", 10)],
        )
    ],
)
def test_lm_featurizer_correctly_handle_whitespace_token(
    text, tokens, expected_feature_tokens
):
    """Tokens whose text the LM tokenizer cannot encode (e.g. whitespace)
    must be removed from the featurizer's token sequence."""
    from rasa.nlu.tokenizers.tokenizer import Token

    featurizer = LanguageModelFeaturizer(
        {"model_name": "bert", "model_weights": "bert-base-chinese"}
    )

    # Build a message carrying the pre-tokenized input.
    msg = Message.build(text=text)
    msg.set(TOKENS_NAMES[TEXT], [Token(word, start) for word, start in tokens])

    feature_tokens, _ = featurizer._tokenize_example(msg, TEXT)

    actual = [(tok.text, tok.start) for tok in feature_tokens]
    assert actual == expected_feature_tokens

0 comments on commit 3c5102e

Please sign in to comment.