feat(crafter): add docs

jina-ai · Apr 10, 2020 · b184c0e · b184c0e
1 parent 8349c33
commit b184c0e
Showing 1 changed file with 16 additions and 1 deletion.
diff --git a/jina/executors/crafters/nlp/split.py b/jina/executors/crafters/nlp/split.py
@@ -7,11 +7,20 @@
 
 
 class Sentencizer(BaseSegmenter):
+    """
+    :class:`Sentencizer` splits the text on the doc-level into sentences on the chunk-level with a rule-based strategy.
+    """
     def __init__(self,
                  min_sent_len: int = 1,
                  max_sent_len: int = -1,
                  punct_chars: str = None,
                  *args, **kwargs):
+        """
+
+        :param min_sent_len: the minimal length of the sentence.
+        :param max_sent_len: the maximal length of the sentence.
+        :param punct_chars: the punctuation characters to split on.
+        """
         super().__init__(*args, **kwargs)
         self.min_sent_len = min_sent_len
         self.max_sent_len = max_sent_len if max_sent_len > 0 else 1e5
@@ -22,8 +31,14 @@ def __init__(self,
         self._slit_pat = re.compile('([{0}])+([^{0}])'.format(''.join(self.punct_chars)))
 
     def craft(self, raw_bytes: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
+        """
+        Split the text into sentences.
+
+        :param raw_bytes: the raw text in the `bytes` format
+        :param doc_id: the doc id
+        :return: a list of chunk dicts with the split sentences
+        """
         text = raw_bytes.decode('utf8')
-        # split into sentences
         sents_str = self._slit_pat.sub(r'\1\n\2', text)
         sents_str = sents_str.rstrip('\n')
         sents = [s for s in sents_str.split('\n') if self.min_sent_len <= len(s) <= self.max_sent_len]