Skip to content

Commit

Permalink
Merge branch 'master' into token-pattern
Browse files (browse the repository at this point in the history)
  • Loading branch information
tabergma committed Jul 6, 2020
2 parents 7c88a45 + 8f51534 commit 05866fb
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 3 deletions.
1 change: 1 addition & 0 deletions changelog/6143.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Prevent ``WhitespaceTokenizer`` from outputting an empty list of tokens.
6 changes: 3 additions & 3 deletions rasa/nlu/tokenizers/whitespace_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,13 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
text,
).split()

words = [self.remove_emoji(w) for w in words]
words = [w for w in words if w]

# if we removed everything like smiles `:)`, use the whole text as 1 token
if not words:
words = [text]

words = [self.remove_emoji(w) for w in words]
words = [w for w in words if w]

tokens = self._convert_words_to_tokens(words, text)

return self._apply_token_pattern(tokens)
1 change: 1 addition & 0 deletions tests/nlu/tokenizers/test_whitespace_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
),
(":)", [":)"], [(0, 2)]),
("Hi :-)", ["Hi"], [(0, 2)]),
("👍", ["👍"], [(0, 1)]),
],
)
def test_whitespace(text, expected_tokens, expected_indices):
Expand Down

0 comments on commit 05866fb

Please sign in to comment.