feat(crafter): add Sentencizer

jina-ai · Apr 10, 2020 · 8349c33 · 8349c33
1 parent f69d82b
commit 8349c33
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 0 deletions.
diff --git a/jina/executors/crafters/nlp/__init__.py b/jina/executors/crafters/nlp/__init__.py
diff --git a/jina/executors/crafters/nlp/split.py b/jina/executors/crafters/nlp/split.py
@@ -0,0 +1,34 @@
+import re
+import json
+import string
+from typing import List, Dict
+
+from .. import BaseSegmenter
+
+
+class Sentencizer(BaseSegmenter):
+    def __init__(self,
+                 min_sent_len: int = 1,
+                 max_sent_len: int = -1,
+                 punct_chars: str = None,
+                 *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.min_sent_len = min_sent_len
+        self.max_sent_len = max_sent_len if max_sent_len > 0 else 1e5
+        self.punct_chars = punct_chars
+        if not punct_chars:
+            self.punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '﹖', '﹗',
+                                '！', '．', '？', '｡', '。']
+        self._slit_pat = re.compile('([{0}])+([^{0}])'.format(''.join(self.punct_chars)))
+
+    def craft(self, raw_bytes: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
+        text = raw_bytes.decode('utf8')
+        # split into sentences
+        sents_str = self._slit_pat.sub(r'\1\n\2', text)
+        sents_str = sents_str.rstrip('\n')
+        sents = [s for s in sents_str.split('\n') if self.min_sent_len <= len(s) <= self.max_sent_len]
+        results = []
+        for idx, s in enumerate(sents):
+            results.append(
+                dict(doc_id=doc_id, text=s, offset=idx, weight=1.0, length=len(sents)))
+        return results
diff --git a/tests/executors/crafters/nlp/__init__.py b/tests/executors/crafters/nlp/__init__.py
diff --git a/tests/executors/crafters/nlp/split.py b/tests/executors/crafters/nlp/split.py
@@ -0,0 +1,21 @@
+import unittest
+from tests import JinaTestCase
+from jina.executors.crafters.nlp.split import Sentencizer
+
+
+class MyTestCase(JinaTestCase):
+    def test_sentencier_en(self):
+        sentencizer = Sentencizer()
+        raw_bytes = b'It is a sunny day!!!! When Andy comes back, we are going to the zoo.'
+        crafted_chunk_list = sentencizer.craft(raw_bytes, 0)
+        self.assertEqual(len(crafted_chunk_list), 2)
+
+    def test_sentencier_cn(self):
+        sentencizer = Sentencizer()
+        raw_bytes = '今天是个大晴天！安迪回来以后，我们准备去动物园。'.encode('utf8')
+        crafted_chunk_list = sentencizer.craft(raw_bytes, 0)
+        self.assertEqual(len(crafted_chunk_list), 2)
+
+
+if __name__ == '__main__':  # only when this file is executed as a script
+    unittest.main()  # let unittest collect and run the test cases of this module