diff --git a/changelog/6143.bugfix.rst b/changelog/6143.bugfix.rst new file mode 100644 index 000000000000..8ee8df6f15e4 --- /dev/null +++ b/changelog/6143.bugfix.rst @@ -0,0 +1 @@ +Prevent ``WhitespaceTokenizer`` from outputting an empty list of tokens. \ No newline at end of file diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index eaefcaf2a79f..9d51f0d33701 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -72,13 +72,13 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: text, ).split() + words = [self.remove_emoji(w) for w in words] + words = [w for w in words if w] + # if we removed everything like smiles `:)`, use the whole text as 1 token if not words: words = [text] - words = [self.remove_emoji(w) for w in words] - words = [w for w in words if w] - tokens = self._convert_words_to_tokens(words, text) return self._apply_token_pattern(tokens) diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index fa0744c4825e..24fc2b9a9ee8 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -65,6 +65,7 @@ ), (":)", [":)"], [(0, 2)]), ("Hi :-)", ["Hi"], [(0, 2)]), + ("👍", ["👍"], [(0, 1)]), ], ) def test_whitespace(text, expected_tokens, expected_indices):