From 70089cd68383daeb611017393708b54a907f17d0 Mon Sep 17 00:00:00 2001
From: Robin Leroy
Date: Tue, 2 Jul 2024 19:04:39 +0200
Subject: [PATCH] ICU-22707 A completely different approach to LB19, partial implementation.

---
Review notes (commentary only; this text sits between the "---" separator
and the first "diff --git" line and is not part of the commit):

  * Hunk @@ -122: $AL_FOLLOW now lists [$QU - \p{Pi}] where it previously
    listed $QU.
  * Hunk @@ -275: removes the previous LB 19 rule block, including every
    rule written with "/" against the \p{ea=F}\p{ea=W}\p{ea=H} sets.  Adds
    LB 19 rules limited to [$QU - \p{Pi}] on the right-hand side and
    [$QU - \p{Pf}] on the left-hand side, a new "LB 19a" group, and explicit
    chaining rules for the LB15a, LB15b, LB14, LB30a and LB15d contexts named
    in the added comments.  The added text carries its own TODO(egg) marker,
    consistent with the subject line's "partial implementation".
  * Hunk @@ -318: removes the two [$LB20NonBreaks - $HL] ... [$BA & ...]
    rules that followed the $LB20NonBreaks definition.
  * Hunk @@ -332: adds one TODO(egg) comment; no rule is changed.

  NOTE(review): this copy appears to have been recovered from text whose
  line breaks were collapsed.  Only the line breaks were restored here;
  runs of spaces, blank context lines, and possibly some angle-bracketed
  text (the From: line carries no address) could not be recovered, so the
  hunks may not apply as-is -- TODO confirm by regenerating the patch from
  commit 70089cd68383 before applying.

 icu4c/source/data/brkitr/rules/line.txt | 104 +++++++++++++++---------
 1 file changed, 64 insertions(+), 40 deletions(-)

diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt
index 5f03170dabe1..3643d1a7ac00 100644
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -122,7 +122,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
 # AL_FOLLOW set of chars that can unconditionally follow an AL
 # Needed in rules where stand-alone $CM s are treated as AL.
 #
-$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
+$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 [$QU - \p{Pi}] $BA $HY $NS $IN $NU $PR $PO $ALPlus];
 #
@@ -275,42 +275,68 @@ $LB18Breaks = [$LB8Breaks $SP];
 # LB 19
-$LB18NonBreaks $CM* $QU;
-^$CM+ $QU;
-
-# OP and GL are subtracted because of LB14 and LB12 (there is no break after them).
-# BA is subtracted because of LB21a:
-# We must not poke a hole into HL U+3000 × [\p{Pi} & QU] [\p{ea=F}\p{ea=W}\p{ea=H}],
-# where U+3000 is lb=BA and ea=W.
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-
-$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-
-$QU $CM* .;
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-
-$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-
-# TODO(egg): We probably need this rule for everything that has an $AL_FOLLOW, as above, plus
-# for the double-chaining into $SP $IS $CM* $CanFollowIS.
-^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
+$LB18NonBreaks $CM* [$QU - \p{Pi}];
+^$CM+ [$QU - \p{Pi}];
+
+[$QU - \p{Pf}] $CM* .;
+
+# LB 19a
+# × QU ( [^$EastAsian] | eot )
+$LB18NonBreaks $CM* $QU $CM* [[^ \p{ea=F}\p{ea=W}\p{ea=H} $CM] {eof}];
+^$CM+ $QU $CM* [[^ \p{ea=F}\p{ea=W}\p{ea=H} $CM] {eof}];
+
+# QU × [^$EastAsian]
+($QU $CM*)+ [^\p{ea=F}\p{ea=W}\p{ea=H} $CM];
+
+# [^$EastAsian] × QU
+# ( sot | [^$EastAsian] ) QU ×
+[$LB18NonBreaks - [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $QU $CM* .;
+^[$CM - [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $QU $CM* .;
+^ $QU $CM* .;
+
+# LB19a chaining:
+
+# LB19a can be chained into from a large number of rules, including itself;
+# it can also chain into LB15a on QU Pi.
+# In the rules below, the expression
+# ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?)
+# covers chaining on LB15a.
+# TODO(egg): In addition, it can be chained into from LB15a followed by a CM, and in particular an LB19a-LB15a chain is possible.
+
+# Note: all lb=QU are outside [\p{ea=F}\p{ea=W}\p{ea=H}]. This takes
+# care of the pure self-chaining of LB19a.
+$LB18NonBreaks $CM* $QU $CM* ($QU $CM*)+ .;
+^$CM+ $QU $CM* ($QU $CM*)+ .;
+^ $QU $CM* ($QU $CM*)+ .;
+# Chaining into LB15a:
+$LB18NonBreaks $CM* $QU $CM* ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?);
+^$CM+ $QU $CM* ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?);
+^ $QU $CM* ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?);
+
+# Chaining on Pf QU from LB 15b:
+$LB8NonBreaks [\p{Pf} & $QU] $CM* ( ($QU $CM*)+ . | ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+$CAN_CM $CM* [\p{Pf} & $QU] $CM* ( ($QU $CM*)+ . | ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+^$CM+ [\p{Pf} & $QU] $CM* ( ($QU $CM*)+ . | ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+
+# SP is never [$EastAsian], but can be non-breaking, in which case we need to manually chain on SP QU:
+# Non-breaking SP from LB14:
+$OP $CM* $SP+ ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+# Non-breaking SP from LB15a:
+($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+# Non-breaking SP from LB15a following LB15b:
+$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+# Chaining on RI QU from LB30a:
+$RI $CM* $RI $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+# Chaining on IS QU from LB15d:
+$SP $IS $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+
+# Non-breaking SP from LB14 followed by CM:
+$OP $CM* $SP+ [ $CM - [\p{ea=F}\p{ea=W}\p{ea=H}] ] $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+$OP $CM* $SP+ $CM+ $QU $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) );
+
 # LB 20
 # $CB
@@ -318,9 +344,6 @@ $CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=
 #
 $LB20NonBreaks = [$LB18NonBreaks - $CB];
-[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-
 # LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
 # Originally added as a Finnish tailoring, now promoted to default ICU behavior.
 # Note: this is not default UAX-14 behaviour. See issue ICU-8151.
@@ -332,6 +355,7 @@ $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:
 $OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB15a:
+# TODO(egg): needs to be chained into the LB15as on the tail of a LB19.
 ($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
 ^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB15a following LB15b: