From b184c0e2e82ec3d0ec556032e7d2ee148eec2bbd Mon Sep 17 00:00:00 2001
From: Nan Wang <nan.wang@jina.ai>
Date: Fri, 10 Apr 2020 12:56:42 +0800
Subject: [PATCH] feat(crafter): add docs

---
 jina/executors/crafters/nlp/split.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/jina/executors/crafters/nlp/split.py b/jina/executors/crafters/nlp/split.py
index fc8623170a376..787ad7ab0c476 100644
--- a/jina/executors/crafters/nlp/split.py
+++ b/jina/executors/crafters/nlp/split.py
@@ -7,11 +7,20 @@
 
 
 class Sentencizer(BaseSegmenter):
+    """
+    :class:`Sentencizer` splits the text on the doc-level into sentences on the chunk-level with a rule-based strategy.
+    """
     def __init__(self,
                  min_sent_len: int = 1,
                  max_sent_len: int = -1,
                  punct_chars: str = None,
                  *args, **kwargs):
+        """
+
+        :param min_sent_len: the minimal length of the sentence.
+        :param max_sent_len: the maximal length of the sentence.
+        :param punct_chars: the punctuation characters to split on.
+        """
         super().__init__(*args, **kwargs)
         self.min_sent_len = min_sent_len
         self.max_sent_len = max_sent_len if max_sent_len > 0 else 1e5
@@ -22,8 +31,14 @@ def __init__(self,
         self._slit_pat = re.compile('([{0}])+([^{0}])'.format(''.join(self.punct_chars)))
 
     def craft(self, raw_bytes: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
+        """
+        Split the text into sentences.
+
+        :param raw_bytes: the raw text in the `bytes` format
+        :param doc_id: the doc id
+        :return: a list of chunk dicts with the split sentences
+        """
         text = raw_bytes.decode('utf8')
-        # split into sentences
         sents_str = self._slit_pat.sub(r'\1\n\2', text)
         sents_str = sents_str.rstrip('\n')
         sents = [s for s in sents_str.split('\n') if self.min_sent_len <= len(s) <= self.max_sent_len]