From 2f69ca90275314d4f686bad2fee5cae8a4bc73fd Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 20 Mar 2024 11:29:02 +0100 Subject: [PATCH] ICU-22707 UTC-179-C28 Simplify the UAX14 formulation --- icu4c/source/test/intltest/rbbitst.cpp | 42 ++++++++++++++------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 5111b0119b3a..768863e9c9d4 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -3254,39 +3254,41 @@ int32_t RBBILineMonkey::next(int32_t startPos) { setAppliedRule(pos, "LB 19 × [QU-\\p{Pi}]"); continue; } - // [^\p{ea=F}\p{ea=W}\p{ea=H}] × [\p{Pi}&QU] - if (!feaFWH->contains(prevChar) && fPi->contains(thisChar) && fQU->contains(thisChar)) { - setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] × [\\p{Pi}&QU]"); + + // [QU-\p{Pf}] × + if (fQU->contains(prevChar) && !fPf->contains(prevChar)) { + setAppliedRule(pos, "LB 19 [QU-\\p{Pf}] ×"); + continue; + } + + // LB 19a + // [^\p{ea=F}\p{ea=W}\p{ea=H}] × QU + if (!feaFWH->contains(prevChar) && fQU->contains(thisChar)) { + setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] × QU"); continue; } - // × [\p{Pi}&QU] ( [^\p{ea=F}\p{ea=W}\p{ea=H}] | eot ) - if (fPi->contains(thisChar) && fQU->contains(thisChar)) { + // × QU ( [^\p{ea=F}\p{ea=W}\p{ea=H}] | eot ) + if (fQU->contains(thisChar)) { if (nextPos < fText->length()) { UChar32 nextChar = fText->char32At(nextPos); if (!feaFWH->contains(nextChar)) { - setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); + setAppliedRule(pos, "LB 19a × QU [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); continue; } } else { - setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] eot"); + setAppliedRule(pos, "LB 19 × QU eot"); continue; } } - - // [QU-\p{Pf}] × - if (fQU->contains(prevChar) && !fPf->contains(prevChar)) { - setAppliedRule(pos, "LB 19 [QU-\\p{Pf}] ×"); - continue; - } - // [\p{Pf}&QU] × [^\p{ea=F}\p{ea=W}\p{ea=H}] - if (fPf->contains(prevChar) && fQU->contains(prevChar) && !feaFWH->contains(thisChar)) { - setAppliedRule(pos, "LB 19 [\\p{Pf}&QU] × [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); + // QU × [^\p{ea=F}\p{ea=W}\p{ea=H}] + if (fQU->contains(prevChar) && !feaFWH->contains(thisChar)) { + setAppliedRule(pos, "LB 19a QU × [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); continue; } - // ( sot | [^\p{ea=F}\p{ea=W}\p{ea=H}] ) [\p{Pf}&QU] × - if (fPf->contains(prevChar) && fQU->contains(prevChar)) { + // ( sot | [^\p{ea=F}\p{ea=W}\p{ea=H}] ) QU × + if (fQU->contains(prevChar)) { if (prevPos == 0) { - setAppliedRule(pos, "LB 19 sot [\\p{Pf}&QU] ×"); + setAppliedRule(pos, "LB 19a sot QU ×"); continue; } // prevPosX2 is -1 if there was a break, and prevCharX2 is 0; but the UAX #14 rules can @@ -3308,7 +3310,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) { breakObliviousPrevPosX2 = beforeCM; } if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2))) { - setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] [\\p{Pf}&QU] ×"); + setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] QU ×"); continue; } }