diff --git a/mw2fcitx/exporters/opencc.py b/mw2fcitx/exporters/opencc.py index 77e0eec..10ba331 100644 --- a/mw2fcitx/exporters/opencc.py +++ b/mw2fcitx/exporters/opencc.py @@ -4,6 +4,8 @@ import opencc from ..utils import console +DEFAULT_PLACEHOLDER = "_ERROR_" + def manual_fix(text, table): if text in table: @@ -17,15 +19,14 @@ def export(words, **kwargs): fixfile = kwargs.get("fixfile") if fixfile is not None: table = json.load(open(fixfile, "r", encoding="utf-8")) - HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$') count = 0 - last_word = None for line in words: line = line.rstrip("\n") - if not HANZI_RE.match(line): + pinyins = lazy_pinyin(line, errors=lambda x: DEFAULT_PLACEHOLDER) + if DEFAULT_PLACEHOLDER in pinyins: + # The word is not fully convertible continue - - pinyin = "'".join(lazy_pinyin(line)) + pinyin = "'".join(pinyins) if pinyin == line: # print("Failed to convert, ignoring:", pinyin, file=sys.stderr) continue @@ -36,8 +37,6 @@ def export(words, **kwargs): pinyin = fixed_pinyin console.debug(f"Fixing {line} to {pinyin}") - last_word = line - result += "\t".join((converter.convert(line), pinyin, "0")) result += "\n" count += 1 diff --git a/tests/lib/test_exporters_opencc.py b/tests/lib/test_exporters_opencc.py new file mode 100644 index 0000000..f1ad47b --- /dev/null +++ b/tests/lib/test_exporters_opencc.py @@ -0,0 +1,27 @@ +from mw2fcitx.exporters.opencc import export + + +def test_opencc_exporter(): + assert ( + export(["测试"]) == "测试\tce'shi\t0\n" + ) + + assert ( + export([ + "测试", + "琴吹䌷" # outloudvi/mw2fcitx#16 + ]) == "测试\tce'shi\t0\n" + "琴吹䌷\tqin'chui'chou\t0\n" + ) + + assert ( + export([ + "测试", + "无效:词条" + ]) == "测试\tce'shi\t0\n" + + ) + + +if __name__ == "__main__": + test_opencc_exporter()