From 74d74210f83ec7fb54c818c7769830d2d98a604f Mon Sep 17 00:00:00 2001 From: groverlynn Date: Fri, 9 Feb 2024 14:05:06 +0100 Subject: [PATCH] chain conversion (#715) --- src/rime/gear/simplifier.cc | 66 ++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/src/rime/gear/simplifier.cc b/src/rime/gear/simplifier.cc index b257721355..073e3f0e75 100644 --- a/src/rime/gear/simplifier.cc +++ b/src/rime/gear/simplifier.cc @@ -65,11 +65,30 @@ class Opencc { opencc::Optional item = dict->Match(original_word); if (item.IsNull()) { - // Current dictionary doesn't convert the word. We need to keep it for - // other dicts in the chain. e.g. s2t.json expands 里 to 里 and 裏, - // then t2tw.json passes 里 as-is and converts 裏 to 裡. - if (word_set.insert(original_word).second) { - converted_words.push_back(original_word); + // There is no exact match, but still need to convert partially + // matched in a chain conversion. Here apply default (max. seg.) + // match to get the most probable conversion result + std::ostringstream buffer; + for (const char* wstr = original_word.c_str(); *wstr != '\0';) { + opencc::Optional matched = + dict->MatchPrefix(wstr); + size_t matched_length; + if (matched.IsNull()) { + matched_length = opencc::UTF8Util::NextCharLength(wstr); + buffer << opencc::UTF8Util::FromSubstr(wstr, matched_length); + } else { + matched_length = matched.Get()->KeyLength(); + buffer << matched.Get()->GetDefault(); + } + wstr += matched_length; + } + const string& converted_word = buffer.str(); + // Even if current dictionary doesn't convert the word + // (converted_word == original_word), we still need to keep it for + // subsequent dicts in the chain. e.g. s2t.json expands 里 to 里 and + // 裏, then t2tw.json passes 里 as-is and converts 裏 to 裡. + if (word_set.insert(converted_word).second) { + converted_words.push_back(converted_word); } continue; } @@ -94,23 +113,32 @@ class Opencc { bool RandomConvertText(const string& text, string* simplified) { if (dict_ == nullptr) return false; + const list conversions = + converter_->GetConversionChain()->GetConversions(); const char* phrase = text.c_str(); - std::ostringstream buffer; - for (const char* pstr = phrase; *pstr != '\0';) { - opencc::Optional matched = - dict_->MatchPrefix(pstr); - size_t matchedLength; - if (matched.IsNull()) { - matchedLength = opencc::UTF8Util::NextCharLength(pstr); - buffer << opencc::UTF8Util::FromSubstr(pstr, matchedLength); - } else { - matchedLength = matched.Get()->KeyLength(); - size_t i = rand() % (matched.Get()->NumValues()); - buffer << matched.Get()->Values().at(i); + for (auto conversion : conversions) { + opencc::DictPtr dict = conversion->GetDict(); + if (dict == nullptr) { + return false; + } + std::ostringstream buffer; + for (const char* pstr = phrase; *pstr != '\0';) { + opencc::Optional matched = + dict->MatchPrefix(pstr); + size_t matched_length; + if (matched.IsNull()) { + matched_length = opencc::UTF8Util::NextCharLength(pstr); + buffer << opencc::UTF8Util::FromSubstr(pstr, matched_length); + } else { + matched_length = matched.Get()->KeyLength(); + size_t i = rand() % (matched.Get()->NumValues()); + buffer << matched.Get()->Values().at(i); + } + pstr += matched_length; } - pstr += matchedLength; + *simplified = buffer.str(); + phrase = simplified->c_str(); } - *simplified = buffer.str(); return *simplified != text; }