diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 49dd0d17eb05..cb17b3b56b71 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -311,6 +311,18 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # Note: this is not default UAX-14 behaviour. See issue ICU-8151. # ^($HY | $HH) $CM* $ALPlus; +$GL ($HY | $HH) $CM* $ALPlus; +# Non-breaking CB from LB8a: +$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; +# Non-breaking SP from LB14: +$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus; +# Non-breaking SP from LB15a: +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus; +# Non-breaking SP from LB15a following LB15b: +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus; # LB 21 x (BA | HY | NS) # BB x diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 768863e9c9d4..0f94d590b2b6 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -3320,14 +3320,36 @@ int32_t RBBILineMonkey::next(int32_t startPos) { break; } - // Don't break between Hyphens and letters if a break precedes the hyphen. - // Formerly this was a Finnish tailoring. - // Moved to root in ICU 63. This is an ICU customization, not in UAX-14. - // ^($HY | $HH) $AL; - if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) && - prevPosX2 == -1) { - setAppliedRule(pos, "LB 20.09"); - continue; + // Don't break between Hyphens and letters if a break or a space precedes the hyphen. + // Formerly this was a Finnish tailoring. + // (sot | BK | CR | LF | NL | SP | ZW | CB | GL) ( HY | [\u2010] ) × AL + if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar))) { + // sot ( HY | [\u2010] ) × AL. + if (prevPos == 0) { + setAppliedRule(pos, "LB 20a"); + continue; + } + // prevPosX2 is -1 if there was a break; but the UAX #14 rules can + // look through breaks. + int breakObliviousPrevPosX2 = fText->moveIndex32(prevPos, -1); + if (fBK->contains(fText->char32At(breakObliviousPrevPosX2)) || + fCR->contains(fText->char32At(breakObliviousPrevPosX2)) || + fLF->contains(fText->char32At(breakObliviousPrevPosX2)) || + fNL->contains(fText->char32At(breakObliviousPrevPosX2)) || + fSP->contains(fText->char32At(breakObliviousPrevPosX2)) || + fGL->contains(fText->char32At(breakObliviousPrevPosX2)) || + fZW->contains(fText->char32At(breakObliviousPrevPosX2))) { + setAppliedRule(pos, "LB 20a"); + continue; + } + while (breakObliviousPrevPosX2 > 0 && + fCM->contains(fText->char32At(breakObliviousPrevPosX2))) { + breakObliviousPrevPosX2 = fText->moveIndex32(breakObliviousPrevPosX2, -1); + } + if (fCB->contains(fText->char32At(breakObliviousPrevPosX2))) { + setAppliedRule(pos, "LB 20a"); + continue; + } } if (fBA->contains(thisChar) ||