migrate to sacremoses and add toktok tokenizer (#361)
keon authored and mttk committed Aug 7, 2018
1 parent dc97900 commit da8bfac
Showing 3 changed files with 21 additions and 7 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -7,6 +7,7 @@ requests
 # Optional NLP tools
 nltk
 spacy
+sacremoses
 git+git://github.com/jekbradbury/revtok.git
 
 # Documentation
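
For context, a minimal smoke test of the new dependency (not part of this commit); it assumes sacremoses has been installed from the updated requirements.txt and only checks that the import used in torchtext/data/utils.py below resolves and tokenizes:

# Minimal sketch, not part of the commit: verify that the sacremoses package
# exposes the MosesTokenizer class the code below now imports.
from sacremoses import MosesTokenizer

mt = MosesTokenizer()           # defaults to English
print(mt.tokenize("abc def."))  # expected ['abc', 'def', '.'] per the updated test below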
9 changes: 8 additions & 1 deletion test/data/test_utils.py
@@ -16,7 +16,7 @@ def test_get_tokenizer(self):
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
-        # Test Moses option. Test strings taken from NLTK doctests.
+        # Test Moses option.
         # Note that internally, MosesTokenizer converts to unicode if applicable
         moses_tokenizer = data.get_tokenizer("moses")
         assert moses_tokenizer(test_str) == [
@@ -26,6 +26,13 @@ def test_get_tokenizer(self):
         # Nonbreaking prefixes should tokenize the final period.
         assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]
 
+        # Test Toktok option. Test strings taken from NLTK doctests.
+        # Note that internally, MosesTokenizer converts to unicode if applicable
+        toktok_tokenizer = data.get_tokenizer("toktok")
+        assert toktok_tokenizer(test_str) == [
+            "A", "string", ",", "particularly", "one", "with", "slightly",
+            "complex", "punctuation", "."]
+
         # Test that errors are raised for invalid input arguments.
         with self.assertRaises(ValueError):
             data.get_tokenizer(1)
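
The tokenizer exercised by the new assertions ships with NLTK itself; a standalone sketch of the same behavior (assuming NLTK is installed, and that test_str is the sentence implied by the asserted tokens):

# Sketch, not part of the commit: what the new "toktok" test drives directly.
# ToktokTokenizer comes with NLTK and needs no extra corpora downloads.
from nltk.tokenize.toktok import ToktokTokenizer

toktok = ToktokTokenizer()
tokens = toktok.tokenize(u"A string, particularly one with slightly complex punctuation.")
# Per the assertion above:
# ['A', 'string', ',', 'particularly', 'one', 'with', 'slightly', 'complex', 'punctuation', '.']
print(tokens)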
18 changes: 12 additions & 6 deletions torchtext/data/utils.py
@@ -21,16 +21,22 @@ def get_tokenizer(tokenizer):
             raise
     elif tokenizer == "moses":
         try:
-            from nltk.tokenize.moses import MosesTokenizer
+            from sacremoses import MosesTokenizer
             moses_tokenizer = MosesTokenizer()
             return moses_tokenizer.tokenize
         except ImportError:
-            print("Please install NLTK. "
-                  "See the docs at http://nltk.org for more information.")
+            print("Please install SacreMoses. "
+                  "See the docs at https://github.com/alvations/sacremoses "
+                  "for more information.")
             raise
-        except LookupError:
-            print("Please install the necessary NLTK corpora. "
-                  "See the docs at http://nltk.org for more information.")
+    elif tokenizer == "toktok":
+        try:
+            from nltk.tokenize.toktok import ToktokTokenizer
+            toktok = ToktokTokenizer()
+            return toktok.tokenize
+        except ImportError:
+            print("Please install NLTK. "
+                  "See the docs at https://nltk.org for more information.")
             raise
     elif tokenizer == 'revtok':
         try:
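
A usage sketch of the resulting API (not part of the diff), assuming torchtext is installed together with sacremoses and NLTK as above; both new branches are reached through the same data.get_tokenizer entry point:

# Usage sketch, not part of the diff: callers pick up both backends unchanged.
from torchtext import data

moses_tokenize = data.get_tokenizer("moses")    # now backed by sacremoses
toktok_tokenize = data.get_tokenizer("toktok")  # NLTK's ToktokTokenizer
print(moses_tokenize(u"abc def."))   # ['abc', 'def', '.'] per the test above
print(toktok_tokenize(u"abc def."))  # toktok output for the same string (not asserted in the tests)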
