Skip to content

Commit

Permalink
fix(opencc): better handle word skips
Browse files Browse the repository at this point in the history
fix #16
  • Loading branch information
outloudvi committed Jul 23, 2024
1 parent 42b50d9 commit eea5929
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 7 deletions.
13 changes: 6 additions & 7 deletions mw2fcitx/exporters/opencc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import opencc
from ..utils import console

DEFAULT_PLACEHOLDER = "_ERROR_"


def manual_fix(text, table):
if text in table:
Expand All @@ -17,15 +19,14 @@ def export(words, **kwargs):
fixfile = kwargs.get("fixfile")
if fixfile is not None:
table = json.load(open(fixfile, "r", encoding="utf-8"))
HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
count = 0
last_word = None
for line in words:
line = line.rstrip("\n")
if not HANZI_RE.match(line):
pinyins = lazy_pinyin(line, errors=lambda x: DEFAULT_PLACEHOLDER)
if DEFAULT_PLACEHOLDER in pinyins:
# The word is not fully convertible
continue

pinyin = "'".join(lazy_pinyin(line))
pinyin = "'".join(pinyins)
if pinyin == line:
# print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
continue
Expand All @@ -36,8 +37,6 @@ def export(words, **kwargs):
pinyin = fixed_pinyin
console.debug(f"Fixing {line} to {pinyin}")

last_word = line

result += "\t".join((converter.convert(line), pinyin, "0"))
result += "\n"
count += 1
Expand Down
27 changes: 27 additions & 0 deletions tests/lib/test_exporters_opencc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from mw2fcitx.exporters.opencc import export


def test_opencc_exporter():
    """Check the exporter output for exported and skipped words.

    Each case passes a word list to ``export`` and compares the returned
    text against the expected ``word<TAB>pinyin<TAB>0`` lines.
    """
    basic_line = "测试\tce'shi\t0\n"

    # A single plain word yields exactly one output line.
    single_word = export(["测试"])
    assert single_word == basic_line

    # Regression case for outloudvi/mw2fcitx#16: the second word is
    # expected to appear in the output instead of being dropped.
    with_issue_16_word = export(["测试", "琴吹䌷"])
    assert with_issue_16_word == basic_line + "琴吹䌷\tqin'chui'chou\t0\n"

    # An entry containing a colon is expected to be left out entirely.
    with_invalid_entry = export(["测试", "无效:词条"])
    assert with_invalid_entry == basic_line


# Allow running this test module directly as a script, without a test runner.
if __name__ == "__main__":
    test_opencc_exporter()

0 comments on commit eea5929

Please sign in to comment.