diff --git a/sudachipy/utf8inputtextbuilder.py b/sudachipy/utf8inputtextbuilder.py index 4def75f..694375b 100644 --- a/sudachipy/utf8inputtextbuilder.py +++ b/sudachipy/utf8inputtextbuilder.py @@ -24,7 +24,7 @@ def __init__(self, text, grammar): self.grammar = grammar self.original_text = text self.modified_text = text - self.text_offsets = list(range(len(self.original_text) + 1)) + self.modified_to_original = list(range(len(self.original_text) + 1)) # 注: サロゲートペア文字は考慮していない def replace(self, begin, end, str_): @@ -42,15 +42,17 @@ def replace(self, begin, end, str_): self.modified_text = str_.join([self.modified_text[:begin], self.modified_text[end:]]) - offset = self.text_offsets[begin] + modified_begin = self.modified_to_original[begin] + modified_end = self.modified_to_original[end] length = len(str_) if end - begin > length: - del self.text_offsets[begin + length:end] - for i in range(length): + del self.modified_to_original[begin + length:end] + self.modified_to_original[begin] = modified_begin + for i in range(1, length): if begin + i < end: - self.text_offsets[begin + i] = offset + self.modified_to_original[begin + i] = modified_end else: - self.text_offsets.insert(begin + i, offset) + self.modified_to_original.insert(begin + i, modified_end) def get_original_text(self): return self.original_text @@ -70,10 +72,10 @@ def build(self): # 注: サロゲートペア文字は考慮していない for _ in range(self.utf8_byte_length(ord(self.modified_text[i]))): byte_indexes[j] = i - offsets[j] = self.text_offsets[i] + offsets[j] = self.modified_to_original[i] j += 1 byte_indexes[length] = len(modified_string_text) - offsets[length] = self.text_offsets[-1] + offsets[length] = self.modified_to_original[-1] char_categories = self.get_char_category_types(modified_string_text) char_category_continuities = self.get_char_category_continuities(modified_string_text, length, char_categories) diff --git a/tests/plugin/test_default_input_text_plugin.py b/tests/plugin/test_default_input_text_plugin.py index ccec51e..471706b 100644 --- a/tests/plugin/test_default_input_text_plugin.py +++ b/tests/plugin/test_default_input_text_plugin.py @@ -60,7 +60,7 @@ def test_before_rewrite(self): self.assertEqual(9, text.get_original_index(24)) self.assertEqual(9, text.get_original_index(26)) - def test_after_write(self): + def test_after_rewrite(self): self.assertEqual(self.original_text, self.builder.get_original_text()) self.assertEqual(self.original_text, self.builder.get_text()) self.plugin.rewrite(self.builder) @@ -76,7 +76,8 @@ def test_after_write(self): self.assertEqual(1, text.get_original_index(2)) self.assertEqual(2, text.get_original_index(3)) self.assertEqual(4, text.get_original_index(7)) - self.assertEqual(4, text.get_original_index(11)) + self.assertEqual(5, text.get_original_index(8)) + self.assertEqual(5, text.get_original_index(11)) self.assertEqual(7, text.get_original_index(15)) self.assertEqual(7, text.get_original_index(17)) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 65b4254..2ee548d 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -70,6 +70,16 @@ def test_tokenize_kanji_alphabet_word(self): self.assertEqual(len(self.tokenizer_obj.tokenize('ab')), 1) self.assertEqual(len(self.tokenizer_obj.tokenize('特ab')), 2) + def test_tokenizer_with_dots(self): + ms = self.tokenizer_obj.tokenize('京都…') + self.assertEqual(4, ms.size()) + self.assertEqual(ms[1].surface(), '…') + self.assertEqual(ms[1].normalized_form(), '.') + self.assertEqual(ms[2].surface(), '') + self.assertEqual(ms[2].normalized_form(), '.') + self.assertEqual(ms[3].surface(), '') + self.assertEqual(ms[3].normalized_form(), '.') + if __name__ == '__main__': unittest.main() diff --git a/tests/test_utf8inputtext.py b/tests/test_utf8inputtext.py index b92b851..c7b0527 100644 --- a/tests/test_utf8inputtext.py +++ b/tests/test_utf8inputtext.py @@ -117,8 +117,8 @@ def test_replace_with_same_length(self): self.assertEqual(input_.get_original_index(12), 7) self.assertEqual(input_.get_original_index(13), 8) self.assertEqual(input_.get_original_index(15), 8) - self.assertEqual(input_.get_original_index(16), 8) - self.assertEqual(input_.get_original_index(18), 8) + self.assertEqual(input_.get_original_index(16), 10) + self.assertEqual(input_.get_original_index(18), 10) self.assertEqual(input_.get_original_index(19), 10) self.assertEqual(input_.get_original_index(22), 10) self.assertEqual(input_.get_original_index(31), 13) @@ -147,13 +147,13 @@ def test_replaceWithInsertion(self): self.assertEqual(input_.get_original_text(), self.TEXT) self.assertEqual(input_.get_text(), "âbC1あ234あああ𡈽アゴ") self.assertEqual(len(input_.get_byte_text()), 35) - self.assertEqual(input_.get_original_index(0), 0) - self.assertEqual(input_.get_original_index(12), 7) - self.assertEqual(input_.get_original_index(13), 8) - self.assertEqual(input_.get_original_index(21), 8) - self.assertEqual(input_.get_original_index(22), 10) - self.assertEqual(input_.get_original_index(25), 10) - self.assertEqual(input_.get_original_index(35), 14) + self.assertEqual(input_.get_original_index(0), 0) # â + self.assertEqual(input_.get_original_index(12), 7) # 4 + self.assertEqual(input_.get_original_index(13), 8) # >あ< ああ + self.assertEqual(input_.get_original_index(21), 10) # ああ >あ< + self.assertEqual(input_.get_original_index(22), 10) # 𡈽 + self.assertEqual(input_.get_original_index(25), 10) # 𡈽 + self.assertEqual(input_.get_original_index(35), 14) # ゙ def test_replaceMultiTimes(self): self.builder.replace(0, 1, "a")