Skip to content

Commit

Permalink
feat(crafter): add Sentencizer
Browse files Browse the repository at this point in the history
  • Loading branch information
nan-wang committed Apr 10, 2020
1 parent f69d82b commit 8349c33
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 0 deletions.
Empty file.
34 changes: 34 additions & 0 deletions jina/executors/crafters/nlp/split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import re
import json
import string
from typing import List, Dict

from .. import BaseSegmenter


class Sentencizer(BaseSegmenter):
    """Split a UTF-8 encoded document into sentence-level chunks.

    A sentence boundary is placed after every run of one or more punctuation
    characters (see ``punct_chars``) that is followed by some other character.
    Line breaks already present in the text also act as boundaries, because a
    newline is used as the internal delimiter while splitting.
    """

    def __init__(self,
                 min_sent_len: int = 1,
                 max_sent_len: int = -1,
                 punct_chars: List[str] = None,
                 *args, **kwargs):
        """
        :param min_sent_len: minimum number of characters a sentence must have
            to be kept.
        :param max_sent_len: maximum number of characters a sentence may have
            to be kept; a non-positive value falls back to a cap of ``1e5``.
        :param punct_chars: characters that terminate a sentence (a list of
            characters, or a plain string of them). When empty or ``None`` a
            built-in multilingual set is used.
        :param args: forwarded to the parent constructor.
        :param kwargs: forwarded to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        self.min_sent_len = min_sent_len
        self.max_sent_len = max_sent_len if max_sent_len > 0 else 1e5
        self.punct_chars = punct_chars
        if not punct_chars:
            self.punct_chars = [
                '!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '﹖', '﹗',
                # Full-width / half-width CJK terminators, written as escapes
                # so they survive any Unicode normalization of this file.
                '\uff01', '\uff0e', '\uff1f', '\uff61', '\u3002',
            ]
        # Escape every character so that regex metacharacters supplied by the
        # caller (']', '^', '\\', '-') cannot corrupt the character class.
        char_class = ''.join(re.escape(c) for c in self.punct_chars)
        # The quantifier sits INSIDE the first group so that the whole run of
        # punctuation (e.g. '!!!!') is captured and written back, instead of
        # only its last character.
        self._slit_pat = re.compile('([{0}]+)([^{0}])'.format(char_class))

    def craft(self, raw_bytes: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
        """Decode ``raw_bytes`` as UTF-8 and split the text into sentences.

        :param raw_bytes: the UTF-8 encoded text of the document.
        :param doc_id: the id of the document, copied into every chunk.
        :return: one dict per kept sentence with the keys ``doc_id``, ``text``
            (the sentence), ``offset`` (index of the sentence among the kept
            ones), ``weight`` (always ``1.0``) and ``length`` (total number of
            kept sentences).
        """
        text = raw_bytes.decode('utf8')
        # Insert a newline between a punctuation run and the character that
        # follows it, then split on newlines.
        marked = self._slit_pat.sub(r'\1\n\2', text).rstrip('\n')
        sents = [s for s in marked.split('\n')
                 if self.min_sent_len <= len(s) <= self.max_sent_len]
        total = len(sents)
        return [dict(doc_id=doc_id, text=s, offset=idx, weight=1.0, length=total)
                for idx, s in enumerate(sents)]
Empty file.
21 changes: 21 additions & 0 deletions tests/executors/crafters/nlp/split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import unittest
from tests import JinaTestCase
from jina.executors.crafters.nlp.split import Sentencizer


class MyTestCase(JinaTestCase):
    """Checks that ``Sentencizer.craft`` yields the expected number of chunks."""

    def test_sentencier_en(self):
        """English input with two sentences is crafted into two chunks."""
        payload = b'It is a sunny day!!!! When Andy comes back, we are going to the zoo.'
        chunks = Sentencizer().craft(payload, 0)
        self.assertEqual(len(chunks), 2)

    def test_sentencier_cn(self):
        """Chinese input with two sentences is crafted into two chunks."""
        payload = '今天是个大晴天!安迪回来以后,我们准备去动物园。'.encode('utf8')
        chunks = Sentencizer().craft(payload, 0)
        self.assertEqual(len(chunks), 2)


# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 8349c33

Please sign in to comment.