Skip to content
This repository has been archived by the owner on Mar 9, 2023. It is now read-only.

Commit

Permalink
Fix a bug causing "…" to be converted to "", "", "…" (#121)
Browse files Browse the repository at this point in the history
* Change a variable name to modified_to_original to align it with the original Java implementation

* Fix a bug causing "…" to be converted to "", "", "…"

* Fix tests according to the new replace method

* Fix comment format
  • Loading branch information
Sorami Hisamoto authored Jun 2, 2020
1 parent 80cdf94 commit 1a6649b
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 19 deletions.
18 changes: 10 additions & 8 deletions sudachipy/utf8inputtextbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, text, grammar):
self.grammar = grammar
self.original_text = text
self.modified_text = text
self.text_offsets = list(range(len(self.original_text) + 1))
self.modified_to_original = list(range(len(self.original_text) + 1))
# 注: サロゲートペア文字は考慮していない

def replace(self, begin, end, str_):
Expand All @@ -42,15 +42,17 @@ def replace(self, begin, end, str_):

self.modified_text = str_.join([self.modified_text[:begin], self.modified_text[end:]])

offset = self.text_offsets[begin]
modified_begin = self.modified_to_original[begin]
modified_end = self.modified_to_original[end]
length = len(str_)
if end - begin > length:
del self.text_offsets[begin + length:end]
for i in range(length):
del self.modified_to_original[begin + length:end]
self.modified_to_original[begin] = modified_begin
for i in range(1, length):
if begin + i < end:
self.text_offsets[begin + i] = offset
self.modified_to_original[begin + i] = modified_end
else:
self.text_offsets.insert(begin + i, offset)
self.modified_to_original.insert(begin + i, modified_end)

def get_original_text(self):
return self.original_text
Expand All @@ -70,10 +72,10 @@ def build(self):
# 注: サロゲートペア文字は考慮していない
for _ in range(self.utf8_byte_length(ord(self.modified_text[i]))):
byte_indexes[j] = i
offsets[j] = self.text_offsets[i]
offsets[j] = self.modified_to_original[i]
j += 1
byte_indexes[length] = len(modified_string_text)
offsets[length] = self.text_offsets[-1]
offsets[length] = self.modified_to_original[-1]

char_categories = self.get_char_category_types(modified_string_text)
char_category_continuities = self.get_char_category_continuities(modified_string_text, length, char_categories)
Expand Down
5 changes: 3 additions & 2 deletions tests/plugin/test_default_input_text_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_before_rewrite(self):
self.assertEqual(9, text.get_original_index(24))
self.assertEqual(9, text.get_original_index(26))

def test_after_write(self):
def test_after_rewrite(self):
self.assertEqual(self.original_text, self.builder.get_original_text())
self.assertEqual(self.original_text, self.builder.get_text())
self.plugin.rewrite(self.builder)
Expand All @@ -76,7 +76,8 @@ def test_after_write(self):
self.assertEqual(1, text.get_original_index(2))
self.assertEqual(2, text.get_original_index(3))
self.assertEqual(4, text.get_original_index(7))
self.assertEqual(4, text.get_original_index(11))
self.assertEqual(5, text.get_original_index(8))
self.assertEqual(5, text.get_original_index(11))
self.assertEqual(7, text.get_original_index(15))
self.assertEqual(7, text.get_original_index(17))

Expand Down
10 changes: 10 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ def test_tokenize_kanji_alphabet_word(self):
self.assertEqual(len(self.tokenizer_obj.tokenize('ab')), 1)
self.assertEqual(len(self.tokenizer_obj.tokenize('特ab')), 2)

def test_tokenizer_with_dots(self):
ms = self.tokenizer_obj.tokenize('京都…')
self.assertEqual(4, ms.size())
self.assertEqual(ms[1].surface(), '…')
self.assertEqual(ms[1].normalized_form(), '.')
self.assertEqual(ms[2].surface(), '')
self.assertEqual(ms[2].normalized_form(), '.')
self.assertEqual(ms[3].surface(), '')
self.assertEqual(ms[3].normalized_form(), '.')


if __name__ == '__main__':
unittest.main()
18 changes: 9 additions & 9 deletions tests/test_utf8inputtext.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ def test_replace_with_same_length(self):
self.assertEqual(input_.get_original_index(12), 7)
self.assertEqual(input_.get_original_index(13), 8)
self.assertEqual(input_.get_original_index(15), 8)
self.assertEqual(input_.get_original_index(16), 8)
self.assertEqual(input_.get_original_index(18), 8)
self.assertEqual(input_.get_original_index(16), 10)
self.assertEqual(input_.get_original_index(18), 10)
self.assertEqual(input_.get_original_index(19), 10)
self.assertEqual(input_.get_original_index(22), 10)
self.assertEqual(input_.get_original_index(31), 13)
Expand Down Expand Up @@ -147,13 +147,13 @@ def test_replaceWithInsertion(self):
self.assertEqual(input_.get_original_text(), self.TEXT)
self.assertEqual(input_.get_text(), "âbC1あ234あああ𡈽アゴ")
self.assertEqual(len(input_.get_byte_text()), 35)
self.assertEqual(input_.get_original_index(0), 0)
self.assertEqual(input_.get_original_index(12), 7)
self.assertEqual(input_.get_original_index(13), 8)
self.assertEqual(input_.get_original_index(21), 8)
self.assertEqual(input_.get_original_index(22), 10)
self.assertEqual(input_.get_original_index(25), 10)
self.assertEqual(input_.get_original_index(35), 14)
self.assertEqual(input_.get_original_index(0), 0) # â
self.assertEqual(input_.get_original_index(12), 7) # 4
self.assertEqual(input_.get_original_index(13), 8) # >あ< ああ
self.assertEqual(input_.get_original_index(21), 10) # ああ >あ<
self.assertEqual(input_.get_original_index(22), 10) # 𡈽
self.assertEqual(input_.get_original_index(25), 10) # 𡈽
self.assertEqual(input_.get_original_index(35), 14) # ゙

def test_replaceMultiTimes(self):
self.builder.replace(0, 1, "a")
Expand Down

0 comments on commit 1a6649b

Please sign in to comment.