Skip to content

Commit

Permalink
ICU-22707 UTC-179-C28 LB19 change for Simplified Chinese
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Jun 21, 2024
1 parent 54da599 commit cb17485
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 9 deletions.
25 changes: 22 additions & 3 deletions icu4c/source/data/brkitr/rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ $BK = [:LineBreak = Mandatory_Break:];
$B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CL = [[:LineBreak = Close_Punctuation:]];
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
Expand Down Expand Up @@ -251,6 +251,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.

[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
Expand All @@ -274,19 +275,37 @@ $LB18Breaks = [$LB8Breaks $SP];


# LB 19
# x QU
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;

# QU x
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

$QU $CM* .;
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];

$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [$QU & \p{Pi}] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [$QU & \p{Pi}] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [$QU & \p{Pf}] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [$QU & \p{Pf}] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];

^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

# LB 20
# <break> $CB
# $CB <break>
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];

[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
Expand Down
88 changes: 82 additions & 6 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2706,6 +2706,7 @@ class RBBILineMonkey: public RBBIMonkeyKind {
UnicodeSet *fVI;
UnicodeSet *fPi;
UnicodeSet *fPf;
UnicodeSet *feaFWH;

BreakIterator *fCharBI;
const UnicodeString *fText;
Expand Down Expand Up @@ -2785,6 +2786,8 @@ RBBILineMonkey::RBBILineMonkey() :
fPi = new UnicodeSet(uR"([\p{Pi}])", status);
fPf = new UnicodeSet(uR"([\p{Pf}])", status);

feaFWH = new UnicodeSet(uR"([\p{ea=F}\p{ea=W}\p{ea=H}])", status);

if (U_FAILURE(status)) {
deferredStatus = status;
return;
Expand Down Expand Up @@ -2916,9 +2919,23 @@ void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos
// LB 9 Treat X CM* as if it were x.
// No explicit action required.

// LB 10 Treat any remaining combining mark as AL
// LB 10 Treat any remaining combining mark as AL, but preserve its East
// Asian Width.
if (fCM->contains(*posChar)) {
*posChar = u'A';
switch (u_getIntPropertyValue(*posChar, UCHAR_EAST_ASIAN_WIDTH)) {
case U_EA_WIDE:
*posChar = u'♈';
break;
case U_EA_NEUTRAL:
*posChar = u'ᴀ';
break;
case U_EA_AMBIGUOUS:
*posChar = u'①';
break;
default:
puts("Unexpected ea value for lb=CM");
std::terminate();
}
}

// Push the updated nextPos and nextChar back to our caller.
Expand Down Expand Up @@ -3231,12 +3248,70 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}

// x QU
// QU x
if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
setAppliedRule(pos, "LB 19");
// LB 19
// × [QU-\p{Pi}]
if (fQU->contains(thisChar) && !fPi->contains(thisChar)) {
setAppliedRule(pos, "LB 19 × [QU-\\p{Pi}]");
continue;
}
// [^\p{ea=F}\p{ea=W}\p{ea=H}] × [\p{Pi}&QU]
if (!feaFWH->contains(prevChar) && fPi->contains(thisChar) && fQU->contains(thisChar)) {
setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] × [\\p{Pi}&QU]");
continue;
}
// × [\p{Pi}&QU] ( [^\p{ea=F}\p{ea=W}\p{ea=H}] | eot )
if (fPi->contains(thisChar) && fQU->contains(thisChar)) {
if (nextPos < fText->length()) {
UChar32 nextChar = fText->char32At(nextPos);
if (!feaFWH->contains(nextChar)) {
setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]");
continue;
}
} else {
setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] eot");
continue;
}
}

// [QU-\p{Pf}] ×
if (fQU->contains(prevChar) && !fPf->contains(prevChar)) {
setAppliedRule(pos, "LB 19 [QU-\\p{Pf}] ×");
continue;
}
// [\p{Pf}&QU] × [^\p{ea=F}\p{ea=W}\p{ea=H}]
if (fPf->contains(prevChar) && fQU->contains(prevChar) && !feaFWH->contains(thisChar)) {
setAppliedRule(pos, "LB 19 [\\p{Pf}&QU] × [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]");
continue;
}
// ( sot | [^\p{ea=F}\p{ea=W}\p{ea=H}] ) [\p{Pf}&QU] ×
if (fPf->contains(prevChar) && fQU->contains(prevChar)) {
if (prevPos == 0) {
setAppliedRule(pos, "LB 19 sot [\\p{Pf}&QU] ×");
continue;
}
// prevPosX2 is -1 if there was a break, and prevCharX2 is 0; but the UAX #14 rules can
// look through breaks.
int breakObliviousPrevPosX2 = fText->moveIndex32(prevPos, -1);
while (fCM->contains(fText->char32At(breakObliviousPrevPosX2))) {
if (breakObliviousPrevPosX2 == 0) {
break;
}
int beforeCM = fText->moveIndex32(breakObliviousPrevPosX2, -1);
if (fBK->contains(fText->char32At(beforeCM)) ||
fCR->contains(fText->char32At(beforeCM)) ||
fLF->contains(fText->char32At(beforeCM)) ||
fNL->contains(fText->char32At(beforeCM)) ||
fSP->contains(fText->char32At(beforeCM)) ||
fZW->contains(fText->char32At(beforeCM))) {
break;
}
breakObliviousPrevPosX2 = beforeCM;
}
if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] [\\p{Pf}&QU] ×");
continue;
}
}

if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
setAppliedRule(pos, "LB 20 Break around a CB");
Expand Down Expand Up @@ -3615,6 +3690,7 @@ RBBILineMonkey::~RBBILineMonkey() {
delete fVI;
delete fPi;
delete fPf;
delete feaFWH;

delete fCharBI;
delete fNumberMatcher;
Expand Down

0 comments on commit cb17485

Please sign in to comment.