Merge pull request #542 from Kaggle/allennlp-ci

add allennlp package
Kaggle · May 1, 2019 · 2cac9bf · 2cac9bf
2 parents c039845 + 54c6812
commit 2cac9bf
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 1 deletion.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_TAG=5.2.0
+ARG BASE_TAG=5.3.0
 
 FROM gcr.io/kaggle-images/python-tensorflow-whl:1.13.1-py36 as tensorflow_whl
 FROM continuumio/anaconda3:${BASE_TAG}
@@ -480,6 +480,7 @@ RUN pip install flashtext && \
     pip install chainercv && \
     pip install plotly_express && \
     pip install albumentations && \
+    pip install allennlp && \
     /tmp/clean-layer.sh
 
 # Tesseract and some associated utility packages

diff --git a/tests/test_allennlp.py b/tests/test_allennlp.py
@@ -0,0 +1,15 @@
+import unittest
+
+from allennlp.data.tokenizers import WordTokenizer
+
+
+class TestAllenNlp(unittest.TestCase):
+    """Smoke test: allennlp's WordTokenizer splits punctuation and adds start/end tokens."""
+    # Adapted from https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/tokenizers/word_tokenizer_test.py
+    def test_passes_through_correctly(self):
+        tokenizer = WordTokenizer(start_tokens=['@@', '%%'], end_tokens=['^^'])
+        sentence = "this (sentence) has 'crazy' \"punctuation\"."
+        tokens = [t.text for t in tokenizer.tokenize(sentence)]
+        expected_tokens = ["@@", "%%", "this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"",
+                           "punctuation", "\"", ".", "^^"]
+        self.assertEqual(tokens, expected_tokens)  # not a bare assert: still runs under -O and reports a list diff