Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chain conversion #715

Merged
merged 2 commits into from
Feb 9, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 47 additions & 19 deletions src/rime/gear/simplifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,30 @@ class Opencc {
opencc::Optional<const opencc::DictEntry*> item =
dict->Match(original_word);
if (item.IsNull()) {
// Current dictionary doesn't convert the word. We need to keep it for
// other dicts in the chain. e.g. s2t.json expands 里 to 里 and 裏,
// then t2tw.json passes 里 as-is and converts 裏 to 裡.
if (word_set.insert(original_word).second) {
converted_words.push_back(original_word);
// There is no exact match, but in a chain conversion we still need to
// convert the partially matched parts. Here we apply the default (max. seg.)
// match to get the most probable conversion result.
std::ostringstream buffer;
for (const char* wstr = original_word.c_str(); *wstr != '\0';) {
opencc::Optional<const opencc::DictEntry*> matched =
dict->MatchPrefix(wstr);
size_t matchedLength;
if (matched.IsNull()) {
matchedLength = opencc::UTF8Util::NextCharLength(wstr);
buffer << opencc::UTF8Util::FromSubstr(wstr, matchedLength);
} else {
matchedLength = matched.Get()->KeyLength();
buffer << matched.Get()->GetDefault();
}
wstr += matchedLength;
}
const string& converted_word = buffer.str();
// Even if current dictionary doesn't convert the word
// (converted_word == original_word), we still need to keep it for
// subsequent dicts in the chain. e.g. s2t.json expands 里 to 里 and
// 裏, then t2tw.json passes 里 as-is and converts 裏 to 裡.
if (word_set.insert(converted_word).second) {
converted_words.push_back(converted_word);
}
continue;
}
Expand All @@ -105,23 +124,32 @@ class Opencc {
bool RandomConvertText(const string& text, string* simplified) {
if (dict_ == nullptr)
return false;
const list<opencc::ConversionPtr> conversions =
converter_->GetConversionChain()->GetConversions();
const char* phrase = text.c_str();
std::ostringstream buffer;
for (const char* pstr = phrase; *pstr != '\0';) {
opencc::Optional<const opencc::DictEntry*> matched =
dict_->MatchPrefix(pstr);
size_t matchedLength;
if (matched.IsNull()) {
matchedLength = opencc::UTF8Util::NextCharLength(pstr);
buffer << opencc::UTF8Util::FromSubstr(pstr, matchedLength);
} else {
matchedLength = matched.Get()->KeyLength();
size_t i = rand() % (matched.Get()->NumValues());
buffer << matched.Get()->Values().at(i);
for (auto conversion : conversions) {
opencc::DictPtr dict = conversion->GetDict();
if (dict == nullptr) {
return false;
}
std::ostringstream buffer;
for (const char* pstr = phrase; *pstr != '\0';) {
opencc::Optional<const opencc::DictEntry*> matched =
dict->MatchPrefix(pstr);
size_t matchedLength;
if (matched.IsNull()) {
matchedLength = opencc::UTF8Util::NextCharLength(pstr);
buffer << opencc::UTF8Util::FromSubstr(pstr, matchedLength);
} else {
matchedLength = matched.Get()->KeyLength();
size_t i = rand() % (matched.Get()->NumValues());
buffer << matched.Get()->Values().at(i);
}
pstr += matchedLength;
}
pstr += matchedLength;
*simplified = buffer.str();
phrase = simplified->c_str();
}
*simplified = buffer.str();
return *simplified != text;
}

Expand Down