Skip to content

Commit

Permalink
feat(crafter): add docs
Browse files Browse the repository at this point in the history
  • Loading branch information
nan-wang committed Apr 10, 2020
1 parent 8349c33 commit b184c0e
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion jina/executors/crafters/nlp/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,20 @@


class Sentencizer(BaseSegmenter):
"""
:class:`Sentencizer` splits the text on the doc-level into sentences on the chunk-level with a rule-based strategy.
"""
def __init__(self,
min_sent_len: int = 1,
max_sent_len: int = -1,
punct_chars: str = None,
*args, **kwargs):
"""
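:param min_sent_len: the minimal length of the sentence, in characters; shorter sentences are dropped.
:param max_sent_len: the maximal length of the sentence, in characters; longer sentences are dropped. A non-positive value falls back to a limit of 1e5.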
:param punct_chars: the punctuation characters to split on.
"""
super().__init__(*args, **kwargs)
self.min_sent_len = min_sent_len
self.max_sent_len = max_sent_len if max_sent_len > 0 else 1e5
Expand All @@ -22,8 +31,14 @@ def __init__(self,
self._slit_pat = re.compile('([{0}])+([^{0}])'.format(''.join(self.punct_chars)))

def craft(self, raw_bytes: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
"""
Split the text into sentences.
:param raw_bytes: the raw text in the `bytes` format
:param doc_id: the doc id
:return: a list of chunk dicts with the split sentences
"""
text = raw_bytes.decode('utf8')
# split into sentences
sents_str = self._slit_pat.sub(r'\1\n\2', text)
sents_str = sents_str.rstrip('\n')
sents = [s for s in sents_str.split('\n') if self.min_sent_len <= len(s) <= self.max_sent_len]
Expand Down

0 comments on commit b184c0e

Please sign in to comment.