From 648a5e7bd5303091728abfe7058210f698484784 Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 6 Jul 2020 13:04:15 +0200 Subject: [PATCH 1/2] fix --- rasa/nlu/tokenizers/whitespace_tokenizer.py | 6 +++--- tests/nlu/tokenizers/test_whitespace_tokenizer.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 4cd223ae71ba..47f614f3d534 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -70,11 +70,11 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: text, ).split() + words = [self.remove_emoji(w) for w in words] + words = [w for w in words if w] + # if we removed everything like smiles `:)`, use the whole text as 1 token if not words: words = [text] - words = [self.remove_emoji(w) for w in words] - words = [w for w in words if w] - return self._convert_words_to_tokens(words, text) diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index fa0744c4825e..24fc2b9a9ee8 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -65,6 +65,7 @@ ), (":)", [":)"], [(0, 2)]), ("Hi :-)", ["Hi"], [(0, 2)]), + ("👍", ["👍"], [(0, 1)]), ], ) def test_whitespace(text, expected_tokens, expected_indices): From 4888a6f4046ca596e96f113ce562653d1506510b Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 6 Jul 2020 13:09:33 +0200 Subject: [PATCH 2/2] add changelog --- changelog/6143.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog/6143.bugfix.rst diff --git a/changelog/6143.bugfix.rst b/changelog/6143.bugfix.rst new file mode 100644 index 000000000000..8ee8df6f15e4 --- /dev/null +++ b/changelog/6143.bugfix.rst @@ -0,0 +1 @@ +Prevent ``WhitespaceTokenizer`` from outputting empty list of tokens. \ No newline at end of file