Skip to content

Commit

Permalink
ICU-22707 UTC-179-C32 Upstream and improve the old Finnish tailoring …
Browse files Browse the repository at this point in the history
…LB20a from CLDR-3029 and ICU-8151
  • Loading branch information
eggrobin committed Jun 21, 2024
1 parent c466f45 commit 0e71e57
Show file tree
Hide file tree
Showing 3 changed files with 426 additions and 8 deletions.
12 changes: 12 additions & 0 deletions icu4c/source/data/brkitr/rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,18 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;

# LB 21 x (BA | HY | NS)
# BB x
Expand Down
38 changes: 30 additions & 8 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3320,14 +3320,36 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}

// Don't break between Hyphens and letters if a break precedes the hyphen.
// Formerly this was a Finnish tailoring.
// Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
// ^($HY | $HH) $AL;
if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
prevPosX2 == -1) {
setAppliedRule(pos, "LB 20.09");
continue;
// Don't break between hyphens and letters if a break opportunity, a space, or a GL (glue) character precedes the hyphen.
// Formerly this was a Finnish tailoring.
// (sot | BK | CR | LF | NL | SP | ZW | CB | GL) ( HY | [\u2010] ) × AL
if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar))) {
// sot ( HY | [\u2010] ) × AL.
if (prevPos == 0) {
setAppliedRule(pos, "LB 20a");
continue;
}
// prevPosX2 is -1 if there was a break; but the UAX #14 rules can
// look through breaks, so recompute the preceding position directly from the text.
int breakObliviousPrevPosX2 = fText->moveIndex32(prevPos, -1);
if (fBK->contains(fText->char32At(breakObliviousPrevPosX2)) ||
fCR->contains(fText->char32At(breakObliviousPrevPosX2)) ||
fLF->contains(fText->char32At(breakObliviousPrevPosX2)) ||
fNL->contains(fText->char32At(breakObliviousPrevPosX2)) ||
fSP->contains(fText->char32At(breakObliviousPrevPosX2)) ||
fGL->contains(fText->char32At(breakObliviousPrevPosX2)) ||
fZW->contains(fText->char32At(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 20a");
continue;
}
while (breakObliviousPrevPosX2 > 0 &&
fCM->contains(fText->char32At(breakObliviousPrevPosX2))) {
breakObliviousPrevPosX2 = fText->moveIndex32(breakObliviousPrevPosX2, -1);
}
if (fCB->contains(fText->char32At(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 20a");
continue;
}
}

if (fBA->contains(thisChar) ||
Expand Down
Loading

0 comments on commit 0e71e57

Please sign in to comment.