From 98a112fb69232048db601e00f2a21a867271690f Mon Sep 17 00:00:00 2001 From: Sorami Hisamoto Date: Fri, 19 Jun 2020 15:15:02 +0900 Subject: [PATCH] Fix Morphemelist split (#134) * Fix lattice node related access due to Cythonization #133 * Add a case for morpheme split --- sudachipy/morphemelist.py | 4 ++-- tests/test_tokenizer.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sudachipy/morphemelist.py b/sudachipy/morphemelist.py index 59e9240..c60ae53 100644 --- a/sudachipy/morphemelist.py +++ b/sudachipy/morphemelist.py @@ -72,9 +72,9 @@ def split(self, mode, index, wi): nodes = [] for wid in word_ids: n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid) - n.begin = offset + n.set_begin(offset) offset += n.get_word_info().head_word_length - n.end = offset + n.set_end(offset) nodes.append(n) return MorphemeList(self.input_text, self.grammar, self.lexicon, nodes) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 2ee548d..80dc771 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -80,6 +80,17 @@ def test_tokenizer_with_dots(self): self.assertEqual(ms[3].surface(), '') self.assertEqual(ms[3].normalized_form(), '.') + def test_tokenizer_morpheme_split(self): + from sudachipy import tokenizer + ms = self.tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.C) + self.assertEqual(1, ms.size()) + self.assertEqual(ms[0].surface(), '東京都') + + ms_a = ms[0].split(tokenizer.Tokenizer.SplitMode.A) + self.assertEqual(2, ms_a.size()) + self.assertEqual(ms_a[0].surface(), '東京') + self.assertEqual(ms_a[1].surface(), '都') + if __name__ == '__main__': unittest.main()