Skip to content
This repository has been archived by the owner on Mar 9, 2023. It is now read-only.

Commit

Permalink
Fix Morphemelist split (#134)
Browse files Browse the repository at this point in the history
* Fix lattice node related access due to Cythonization #133

* Add a case for morpheme split
  • Loading branch information
Sorami Hisamoto authored Jun 19, 2020
1 parent 3c8df53 commit 98a112f
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
4 changes: 2 additions & 2 deletions sudachipy/morphemelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def split(self, mode, index, wi):
nodes = []
for wid in word_ids:
n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid)
n.begin = offset
n.set_begin(offset)
offset += n.get_word_info().head_word_length
n.end = offset
n.set_end(offset)
nodes.append(n)

return MorphemeList(self.input_text, self.grammar, self.lexicon, nodes)
Expand Down
11 changes: 11 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,17 @@ def test_tokenizer_with_dots(self):
self.assertEqual(ms[3].surface(), '')
self.assertEqual(ms[3].normalized_form(), '.')

def test_tokenizer_morpheme_split(self):
    """Check that a morpheme tokenized in mode C can be re-split in mode A."""
    from sudachipy import tokenizer
    modes = tokenizer.Tokenizer.SplitMode

    # In mode C the whole input is expected to come back as one morpheme.
    coarse = self.tokenizer_obj.tokenize('東京都', modes.C)
    self.assertEqual(1, coarse.size())
    self.assertEqual(coarse[0].surface(), '東京都')

    # Splitting that single morpheme in mode A is expected to give two parts.
    fine = coarse[0].split(modes.A)
    self.assertEqual(2, fine.size())
    for position, expected_surface in enumerate(('東京', '都')):
        self.assertEqual(fine[position].surface(), expected_surface)


# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
unittest.main()

0 comments on commit 98a112f

Please sign in to comment.