From 6ddb1edbf9a55710ee7b078eb8b0677824b97777 Mon Sep 17 00:00:00 2001 From: Weng Xuetian Date: Fri, 22 Mar 2024 23:19:35 -0700 Subject: [PATCH] Add new variant of encoder parser function that accepts profile Also improve encoder test to make it actually check the value. New variant will be able to return the fuzzy flag associated with the parsed pinyin. --- src/libime/pinyin/pinyinencoder.cpp | 231 ++++++++++++++++++---------- src/libime/pinyin/pinyinencoder.h | 25 ++- test/testpinyinencoder.cpp | 82 +++++++--- 3 files changed, 235 insertions(+), 103 deletions(-) diff --git a/src/libime/pinyin/pinyinencoder.cpp b/src/libime/pinyin/pinyinencoder.cpp index be16b30..aa60640 100644 --- a/src/libime/pinyin/pinyinencoder.cpp +++ b/src/libime/pinyin/pinyinencoder.cpp @@ -4,6 +4,7 @@ * SPDX-License-Identifier: LGPL-2.1-or-later */ #include "pinyinencoder.h" +#include "pinyincorrectionprofile.h" #include "pinyindata.h" #include "shuangpinprofile.h" #include @@ -12,8 +13,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -97,7 +98,8 @@ struct LongestMatchResult { }; template -LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags) { +LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags, + const PinyinMap &map) { if (*iter == 'i' || *iter == 'u' || *iter == 'v') { return {false, std::string_view(&*iter, std::distance(iter, end)), false}; @@ -106,7 +108,6 @@ LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags) { end = iter + maxPinyinLength; } auto range = std::string_view(&*iter, std::distance(iter, end)); - const auto &map = getPinyinMapV2(); for (; !range.empty(); range.remove_suffix(1)) { auto iterPair = map.equal_range(range); if (iterPair.first != iterPair.second) { @@ -137,48 +138,61 @@ std::string PinyinSyllable::toString() const { return PinyinEncoder::initialToString(initial_) + PinyinEncoder::finalToString(final_); } - SegmentGraph PinyinEncoder::parseUserPinyin(std::string userPinyin, PinyinFuzzyFlags flags) { + return parseUserPinyin(std::move(userPinyin), nullptr, flags); +} + +SegmentGraph +PinyinEncoder::parseUserPinyin(std::string userPinyin, + const PinyinCorrectionProfile *profile, + PinyinFuzzyFlags flags) { SegmentGraph result{std::move(userPinyin)}; auto pinyin = result.data(); std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(), fcitx::charutils::tolower); const auto end = pinyin.end(); + if (!profile) { + flags = flags.unset(PinyinFuzzyFlag::Correction); + } std::vector flagsToTry = {flags}; - // The parsing should be fast enough to do it twice, so we don't lost - // anything. + if (flags.test(PinyinFuzzyFlag::Correction)) { + flagsToTry.push_back(flags.unset(PinyinFuzzyFlag::Correction)); + } if (flags.test(PinyinFuzzyFlag::AdvancedTypo)) { - flagsToTry.push_back(flags.unset(PinyinFuzzyFlag::AdvancedTypo)); - } - - for (const auto fuzzyFlags : flagsToTry) { - std::priority_queue, std::greater<>> q; - q.push(0); - while (!q.empty()) { - size_t top; - do { - top = q.top(); - q.pop(); - } while (!q.empty() && q.top() == top); - if (top >= pinyin.size()) { - continue; + flagsToTry.push_back(flags.unset(PinyinFuzzyFlag::AdvancedTypo) + .unset(PinyinFuzzyFlag::Correction)); + } + + const auto &pinyinMap = profile ? profile->pinyinMap() : getPinyinMapV2(); + + std::priority_queue, std::greater<>> q; + q.push(0); + while (!q.empty()) { + size_t top; + do { + top = q.top(); + q.pop(); + } while (!q.empty() && q.top() == top); + if (top >= pinyin.size()) { + continue; + } + auto iter = std::next(pinyin.begin(), top); + if (*iter == '\'') { + while (iter != pinyin.end() && *iter == '\'') { + iter++; } - auto iter = std::next(pinyin.begin(), top); - if (*iter == '\'') { - while (iter != pinyin.end() && *iter == '\'') { - iter++; - } - auto next = std::distance(pinyin.begin(), iter); - result.addNext(top, next); - if (static_cast(next) < pinyin.size()) { - q.push(next); - } - continue; + auto next = std::distance(pinyin.begin(), iter); + result.addNext(top, next); + if (static_cast(next) < pinyin.size()) { + q.push(next); } + continue; + } + for (const auto fuzzyFlags : flagsToTry) { auto [valid, str, isCompletePinyin] = - longestMatch(iter, end, fuzzyFlags); + longestMatch(iter, end, fuzzyFlags, pinyinMap); // it's not complete a pinyin, no need to try if (!valid || !isCompletePinyin) { @@ -192,7 +206,6 @@ SegmentGraph PinyinEncoder::parseUserPinyin(std::string userPinyin, // don't consider it also, make sure current pinyin does not end // with a separator, other wise, jin'an may be parsed into ji'n // because, nextMatch is starts with "'". - const auto &map = getPinyinMapV2(); std::array nextSize; size_t nNextSize = 0; // Check if we can do fuzzy segement, e.g. @@ -202,13 +215,15 @@ SegmentGraph PinyinEncoder::parseUserPinyin(std::string userPinyin, (str.back() == 'a' || str.back() == 'e' || str.back() == 'g' || str.back() == 'n' || str.back() == 'o' || str.back() == 'r' || - str.back() == 'h') && - map.find(str.substr(0, str.size() - 1)) != map.end()) { + str.back() == 'h' || + fuzzyFlags.test(PinyinFuzzyFlag::Correction)) && + pinyinMap.find(str.substr(0, str.size() - 1)) != + pinyinMap.end()) { // str[0:-1] is also a full pinyin, check next pinyin - auto nextMatch = - longestMatch(iter + str.size(), end, fuzzyFlags); - auto nextMatchAlt = - longestMatch(iter + str.size() - 1, end, fuzzyFlags); + auto nextMatch = longestMatch(iter + str.size(), end, + fuzzyFlags, pinyinMap); + auto nextMatchAlt = longestMatch(iter + str.size() - 1, end, + fuzzyFlags, pinyinMap); auto matchSize = str.size() + nextMatch.match.size(); auto matchSizeAlt = str.size() - 1 + nextMatchAlt.match.size(); @@ -534,13 +549,18 @@ std::string PinyinEncoder::initialFinalToPinyinString(PinyinInitial initial, return result; } -static void getFuzzy( - std::vector>>> &syls, - PinyinSyllable syl, PinyinFuzzyFlags flags, bool isSp) { +namespace { + +template +void getFuzzy(FuzzyPinyinSyllables &syls, PinyinSyllable syl, + PinyinFuzzyFlags flags, bool isSp, const Adjuster &adjuster) { // ng/gn is already handled by table - boost::container::static_vector initials{syl.initial()}; - boost::container::static_vector finals{syl.final()}; + boost::container::static_vector, + 2> + initials{{syl.initial(), PinyinFuzzyFlag::None}}; + boost::container::static_vector, + 10> + finals{{syl.final(), PinyinFuzzyFlag::None}}; // for full pinyin {s,z,c} we also want them to match {sh,zh,ch} if (syl.final() == PinyinFinal::Invalid && !isSp) { @@ -568,10 +588,11 @@ static void getFuzzy( for (const auto &initialFuzzy : initialFuzzies) { if ((syl.initial() == std::get<0>(initialFuzzy) || syl.initial() == std::get<1>(initialFuzzy)) && - flags & std::get<2>(initialFuzzy)) { - initials.push_back(syl.initial() == std::get<0>(initialFuzzy) - ? std::get<1>(initialFuzzy) - : std::get<0>(initialFuzzy)); + flags.test(std::get<2>(initialFuzzy))) { + initials.emplace_back((syl.initial() == std::get<0>(initialFuzzy) + ? std::get<1>(initialFuzzy) + : std::get<0>(initialFuzzy)), + std::get<2>(initialFuzzy)); break; } } @@ -592,10 +613,11 @@ static void getFuzzy( for (const auto &finalFuzzy : finalFuzzies) { if ((syl.final() == std::get<0>(finalFuzzy) || syl.final() == std::get<1>(finalFuzzy)) && - flags & std::get<2>(finalFuzzy)) { - finals.push_back(syl.final() == std::get<0>(finalFuzzy) - ? std::get<1>(finalFuzzy) - : std::get<0>(finalFuzzy)); + flags.test(std::get<2>(finalFuzzy))) { + finals.emplace_back((syl.final() == std::get<0>(finalFuzzy) + ? std::get<1>(finalFuzzy) + : std::get<0>(finalFuzzy)), + std::get<2>(finalFuzzy)); break; } } @@ -615,19 +637,22 @@ static void getFuzzy( {PinyinFinal::O, PinyinFinal::OU}, {PinyinFinal::O, PinyinFinal::ONG}, }; - if (initials.size() == 1 && initials[0] == PinyinInitial::Zero && + if (initials.size() == 1 && + std::get<0>(initials[0]) == PinyinInitial::Zero && flags.test(PinyinFuzzyFlag::PartialFinal)) { for (const auto &partialFinal : partialFinals) { if (syl.final() == std::get<0>(partialFinal)) { - finals.push_back(std::get<1>(partialFinal)); + finals.emplace_back(std::get<1>(partialFinal), + PinyinFuzzyFlag::PartialFinal); } } } for (size_t i = 0; i < initials.size(); i++) { for (size_t j = 0; j < finals.size(); j++) { - auto initial = initials[i]; - auto final = finals[j]; + auto initial = std::get<0>(initials[i]); + auto final = std::get<0>(finals[j]); + auto flags = std::get<1>(initials[i]) | std::get<1>(finals[j]); if ((i == 0 && j == 0) || final == PinyinFinal::Invalid || PinyinEncoder::isValidInitialFinal(initial, final)) { auto iter = std::find_if( @@ -644,23 +669,21 @@ static void getFuzzy( [final](auto &p) { return p.first == final; }) == finals.end()) { - finals.emplace_back(final, i > 0 || j > 0); + finals.emplace_back(final, adjuster(flags)); } } } } } -MatchedPinyinSyllables -PinyinEncoder::stringToSyllables(std::string_view pinyinView, - PinyinFuzzyFlags flags) { - std::vector< - std::pair>>> - result; +template +FuzzyPinyinSyllables +stringToSyllablesImpl(std::string_view pinyinView, const PinyinMap &map, + PinyinFuzzyFlags flags, const Adjuster &adjuster) { + FuzzyPinyinSyllables result; std::string pinyin(pinyinView); std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(), fcitx::charutils::tolower); - const auto &map = getPinyinMapV2(); // we only want {M,N,R}/Invalid instead of {M,N,R}/Zero, so we could get // match for everything. if (pinyin != "m" && pinyin != "n" && pinyin != "r") { @@ -669,7 +692,10 @@ PinyinEncoder::stringToSyllables(std::string_view pinyinView, boost::make_iterator_range(iterPair.first, iterPair.second)) { if (flags.test(item.flags())) { getFuzzy(result, {item.initial(), item.final()}, flags, - /*isSp=*/false); + /*isSp=*/false, + [&adjuster, &item](PinyinFuzzyFlags flags) { + return adjuster(item.flags() | flags); + }); } } } @@ -677,15 +703,16 @@ PinyinEncoder::stringToSyllables(std::string_view pinyinView, auto iter = initialMap.right.find(pinyin); if (initialMap.right.end() != iter) { getFuzzy(result, {iter->second, PinyinFinal::Invalid}, flags, - /*isSp=*/false); + /*isSp=*/false, adjuster); } if (result.empty()) { result.emplace_back( std::piecewise_construct, std::forward_as_tuple(PinyinInitial::Invalid), - std::forward_as_tuple(1, - std::make_pair(PinyinFinal::Invalid, false))); + std::forward_as_tuple( + 1, std::make_pair(PinyinFinal::Invalid, + adjuster(PinyinFuzzyFlag::None)))); } #if 0 @@ -709,10 +736,35 @@ PinyinEncoder::stringToSyllables(std::string_view pinyinView, return result; } +} // namespace + MatchedPinyinSyllables -PinyinEncoder::shuangpinToSyllables(std::string_view pinyinView, - const ShuangpinProfile &sp, - PinyinFuzzyFlags flags) { +PinyinEncoder::stringToSyllables(std::string_view pinyinView, + PinyinFuzzyFlags flags) { + auto adjuster = [](const PinyinFuzzyFlags &flags) { + return flags != PinyinFuzzyFlag::None; + }; + return stringToSyllablesImpl(pinyinView, getPinyinMapV2(), flags, + adjuster); +} + +MatchedPinyinSyllablesWithFuzzyFlags +PinyinEncoder::stringToSyllablesWithFuzzyFlags( + std::string_view pinyinView, const PinyinCorrectionProfile *profile, + PinyinFuzzyFlags flags) { + auto identity = [](const PinyinFuzzyFlags &flags) { return flags; }; + return stringToSyllablesImpl( + pinyinView, profile ? profile->pinyinMap() : getPinyinMapV2(), flags, + identity); +} + +namespace { + +template +FuzzyPinyinSyllables +shuangpinToSyllablesImpl(std::string_view pinyinView, + const ShuangpinProfile &sp, PinyinFuzzyFlags flags, + const Adjuster &adjuster) { assert(pinyinView.size() <= 2); std::string pinyin(pinyinView); std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(), @@ -726,14 +778,15 @@ PinyinEncoder::shuangpinToSyllables(std::string_view pinyinView, flags = flags.unset(PinyinFuzzyFlag::PartialFinal); } - std::vector< - std::pair>>> - result; + FuzzyPinyinSyllables result; if (iter != table.end()) { for (const auto &p : iter->second) { if (flags.test(p.second)) { getFuzzy(result, {p.first.initial(), p.first.final()}, flags, - /*isSp=*/true); + /*isSp=*/true, + [base = p.second, &adjuster](PinyinFuzzyFlags flags) { + return adjuster(flags | base); + }); } } } @@ -742,13 +795,35 @@ PinyinEncoder::shuangpinToSyllables(std::string_view pinyinView, result.emplace_back( std::piecewise_construct, std::forward_as_tuple(PinyinInitial::Invalid), - std::forward_as_tuple(1, - std::make_pair(PinyinFinal::Invalid, false))); + std::forward_as_tuple( + 1, std::make_pair(PinyinFinal::Invalid, + adjuster(PinyinFuzzyFlag::None)))); } return result; } +} // namespace + +MatchedPinyinSyllables +PinyinEncoder::shuangpinToSyllables(std::string_view pinyinView, + const ShuangpinProfile &sp, + PinyinFuzzyFlags flags) { + auto adjuster = [](const PinyinFuzzyFlags &flags) { + return flags != PinyinFuzzyFlag::None; + }; + return shuangpinToSyllablesImpl(pinyinView, sp, flags, adjuster); +} + +MatchedPinyinSyllablesWithFuzzyFlags +PinyinEncoder::shuangpinToSyllablesWithFuzzyFlags(std::string_view pinyinView, + const ShuangpinProfile &sp, + PinyinFuzzyFlags flags) { + auto identity = [](const PinyinFuzzyFlags &flags) { return flags; }; + return shuangpinToSyllablesImpl(pinyinView, sp, flags, + identity); +} + std::string PinyinEncoder::shuangpinToPinyin(std::string_view pinyinView, const libime::ShuangpinProfile &sp) { diff --git a/src/libime/pinyin/pinyinencoder.h b/src/libime/pinyin/pinyinencoder.h index f85e7c6..2a910ee 100644 --- a/src/libime/pinyin/pinyinencoder.h +++ b/src/libime/pinyin/pinyinencoder.h @@ -17,6 +17,7 @@ namespace libime { class ShuangpinProfile; +class PinyinCorrectionProfile; enum class PinyinFuzzyFlag { None = 0, @@ -203,13 +204,23 @@ LIBIMEPINYIN_EXPORT fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log, PinyinSyllable syl); -using MatchedPinyinSyllables = std::vector< - std::pair>>>; +template +using FuzzyPinyinSyllables = std::vector< + std::pair>>>; + +using MatchedPinyinSyllables = FuzzyPinyinSyllables; + +using MatchedPinyinSyllablesWithFuzzyFlags = + FuzzyPinyinSyllables; class LIBIMEPINYIN_EXPORT PinyinEncoder { public: static SegmentGraph parseUserPinyin(std::string pinyin, PinyinFuzzyFlags flags); + static SegmentGraph parseUserPinyin(std::string pinyin, + const PinyinCorrectionProfile *profile, + PinyinFuzzyFlags flags); + static SegmentGraph parseUserShuangpin(std::string pinyin, const ShuangpinProfile &sp, PinyinFuzzyFlags flags); @@ -270,9 +281,19 @@ class LIBIMEPINYIN_EXPORT PinyinEncoder { static MatchedPinyinSyllables stringToSyllables(std::string_view pinyin, PinyinFuzzyFlags flags); + + static MatchedPinyinSyllablesWithFuzzyFlags + stringToSyllablesWithFuzzyFlags(std::string_view pinyin, + const PinyinCorrectionProfile *profile, + PinyinFuzzyFlags flags); + static MatchedPinyinSyllables shuangpinToSyllables(std::string_view pinyin, const ShuangpinProfile &sp, PinyinFuzzyFlags flags); + static MatchedPinyinSyllablesWithFuzzyFlags + shuangpinToSyllablesWithFuzzyFlags(std::string_view pinyin, + const ShuangpinProfile &sp, + PinyinFuzzyFlags flags); static const char firstInitial = static_cast(PinyinInitial::B); static const char lastInitial = static_cast(PinyinInitial::Zero); diff --git a/test/testpinyinencoder.cpp b/test/testpinyinencoder.cpp index b031aa9..9ac7fc6 100644 --- a/test/testpinyinencoder.cpp +++ b/test/testpinyinencoder.cpp @@ -4,6 +4,7 @@ * SPDX-License-Identifier: LGPL-2.1-or-later */ +#include "libime/pinyin/pinyincorrectionprofile.h" #include "libime/pinyin/pinyinencoder.h" #include #include @@ -72,29 +73,56 @@ int main() { check("qi'e", PinyinFuzzyFlag::Inner, {"qi", "'", "e"}); check("nng", PinyinFuzzyFlag::InnerShort, {"n", "ng"}); - for (const auto &syl : PinyinEncoder::stringToSyllables( - "niagn", - PinyinFuzzyFlags{PinyinFuzzyFlag::L_N, PinyinFuzzyFlag::IAN_IANG, - PinyinFuzzyFlag::CommonTypo})) { - for (auto f : syl.second) { - FCITX_INFO() << PinyinSyllable(syl.first, f.first).toString(); - } - } - for (const auto &syl : PinyinEncoder::stringToSyllables( - "n", - PinyinFuzzyFlags{PinyinFuzzyFlag::L_N, PinyinFuzzyFlag::IAN_IANG, - PinyinFuzzyFlag::CommonTypo})) { - for (auto f : syl.second) { - FCITX_INFO() << PinyinSyllable(syl.first, f.first).toString(); - } - } - for (const auto &syl : PinyinEncoder::stringToSyllables( - "cuagn", {PinyinFuzzyFlag::C_CH, PinyinFuzzyFlag::UAN_UANG, - PinyinFuzzyFlag::CommonTypo})) { - for (auto f : syl.second) { - FCITX_INFO() << PinyinSyllable(syl.first, f.first).toString(); - } - } + FCITX_ASSERT(PinyinEncoder::stringToSyllables( + "niagn", PinyinFuzzyFlags{PinyinFuzzyFlag::L_N, + PinyinFuzzyFlag::IAN_IANG, + PinyinFuzzyFlag::CommonTypo}) == + MatchedPinyinSyllables{ + {PinyinInitial::N, + {{PinyinFinal::IANG, true}, {PinyinFinal::IAN, true}}}, + {PinyinInitial::L, + {{PinyinFinal::IANG, true}, {PinyinFinal::IAN, true}}}}); + FCITX_ASSERT( + PinyinEncoder::stringToSyllablesWithFuzzyFlags( + "niagn", nullptr, + PinyinFuzzyFlags{PinyinFuzzyFlag::L_N, PinyinFuzzyFlag::IAN_IANG, + PinyinFuzzyFlag::CommonTypo}) == + MatchedPinyinSyllablesWithFuzzyFlags{ + {PinyinInitial::N, + {{PinyinFinal::IANG, + PinyinFuzzyFlags{PinyinFuzzyFlag::CommonTypo}}, + {PinyinFinal::IAN, + PinyinFuzzyFlags{PinyinFuzzyFlag::CommonTypo, PinyinFuzzyFlag::IAN_IANG}}}}, + {PinyinInitial::L, + {{PinyinFinal::IANG, + PinyinFuzzyFlags{PinyinFuzzyFlag::CommonTypo, PinyinFuzzyFlag::L_N}}, + {PinyinFinal::IAN, + PinyinFuzzyFlags{PinyinFuzzyFlag::CommonTypo, PinyinFuzzyFlag::IAN_IANG, PinyinFuzzyFlag::L_N}}}}}); + + FCITX_ASSERT(PinyinEncoder::stringToSyllables( + "n", PinyinFuzzyFlags{PinyinFuzzyFlag::L_N, + PinyinFuzzyFlag::IAN_IANG, + PinyinFuzzyFlag::CommonTypo}) == + MatchedPinyinSyllables{ + {PinyinInitial::N, {{PinyinFinal::Invalid, false}}}, + {PinyinInitial::L, {{PinyinFinal::Invalid, true}}}}); + + FCITX_ASSERT(PinyinEncoder::stringToSyllables( + "cuagn", {PinyinFuzzyFlag::C_CH, PinyinFuzzyFlag::UAN_UANG, + PinyinFuzzyFlag::CommonTypo}) == + MatchedPinyinSyllables{ + {PinyinInitial::C, {{PinyinFinal::UAN, true}}}, + {PinyinInitial::CH, + {{PinyinFinal::UAN, true}, {PinyinFinal::UANG, true}}}}); + + FCITX_ASSERT(PinyinEncoder::stringToSyllables( + "e", PinyinFuzzyFlags{PinyinFuzzyFlag::PartialFinal}) == + MatchedPinyinSyllables{{PinyinInitial::Zero, + {{PinyinFinal::E, false}, + {PinyinFinal::EI, true}, + {PinyinFinal::EN, true}, + {PinyinFinal::ENG, true}, + {PinyinFinal::ER, true}}}}); for (const auto &syl : PinyinEncoder::stringToSyllables( "e", PinyinFuzzyFlags{PinyinFuzzyFlag::PartialFinal})) { @@ -203,5 +231,13 @@ int main() { check("zhunipingan", PinyinFuzzyFlag::Inner, {"zhu", "ni", "pin", "gan"}); check("zhuna", PinyinFuzzyFlag::Inner, {"zhu", "na"}); check("zhuna", PinyinFuzzyFlag::Inner, {"zhun", "a"}); + + { + PinyinCorrectionProfile profile(BuiltinPinyinCorrectionProfile::Qwerty); + auto graph = PinyinEncoder::parseUserPinyin( + "zhyi", &profile, PinyinFuzzyFlag::Correction); + dfs(graph, {"zhyi"}); + } + return 0; }