Skip to content

Commit

Permalink
Unicode 15.1 linebreaking (#48)
Browse files Browse the repository at this point in the history
* ICU-22039 Line Break on Orthographic Syllable Boundaries

This is an experimental implementation of the line breaking rules proposed in the
Unicode document L2/22-080R. It is not suitable for merging into ICU main.

Limitations:
   - ICU4C only.
   - Root locale only (not implemented for the various LB tailorings).
   - New Line_Break property values are implemented with hard-coded UnicodeSets (unmaintainable).
   - RBBIMonkeyTest not updated. (There are two ICU monkey tests; the other is updated.)

---------

Co-authored-by: Andy Heninger <andy.heninger@gmail.com>
  • Loading branch information
eggrobin and aheninger authored Jul 10, 2023
1 parent 996e1c0 commit f1a9e57
Show file tree
Hide file tree
Showing 42 changed files with 875 additions and 181 deletions.
35 changes: 30 additions & 5 deletions icu4c/source/data/brkitr/rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@
!!quoted_literals_only;

$AI = [:LineBreak = Ambiguous:];
$AK = [:LineBreak = Aksara:];
$AL = [:LineBreak = Alphabetic:];
$AP = [:LineBreak = Aksara_Prebase:];
$AS = [:LineBreak = Aksara_Start:];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
Expand Down Expand Up @@ -64,6 +67,8 @@ $SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$VF = [:LineBreak = Virama_Final:];
$VI = [:LineBreak = Virama:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
Expand Down Expand Up @@ -215,7 +220,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# by rule 8, CM following a SP is stand-alone.


# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
# LB 15a Do not break after an unresolved initial quotation mark ([\p{Pi} & $QU]),
#        even after spaces.
#   Left context is $OP (optionally followed by spaces), $QU or $GL. The trailing "."
#   matches any character, so no break is allowed before whatever follows the quote(s).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
#   Same, but the following character is a stand-alone $CM after a space
#   (by rule 8 a CM following a SP is stand-alone; by rule 10 it behaves as AL).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
#   "^" forms: the same two cases with the quotation mark at the start of the rule.
#   A leading "^" prevents chaining into the rule from a preceding rule.
^([\p{Pi} & $QU] $CM* $SP*)+ .;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;

# LB 15b Do not break before an unresolved final quotation mark ([\p{Pf} & $QU]) that is
#        followed by SP, GL, WJ, CL, QU, CP, EX, IS, SY, BK, CR, LF, NL, ZW or end of text.
#   The three rules differ only in what precedes the quote: $LB8NonBreaks, a $CAN_CM
#   character with its combining marks, or stand-alone combining marks ("^$CM+").
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];

# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
#   Each rule below is one of the three LB 15b left contexts plus the Pf quote,
#   followed directly by the LB 15a Pi-quote sequence and one of its two tails
#   ("." or a stand-alone $CM after a space).
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;


# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
# See issue ICU-20303

Expand All @@ -225,7 +250,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

#
# LB 14b Do not break before numeric separators (IS), even after spaces.
# LB 15d Do not break before numeric separators (IS), even after spaces.

[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
Expand All @@ -235,9 +260,6 @@ $CAN_CM $CM* $IS;
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL


# LB 15
$QU $CM* $SP* $OP;

# LB 16
($CL | $CP) $CM* $SP* $NS;

Expand Down Expand Up @@ -338,6 +360,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL

# LB 28a Do not break inside orthographic syllables.
#   The single rule below matches, in order:
#     - an optional prebase character ($AP);
#     - a base: $AS, $AK, or U+25CC DOTTED CIRCLE;
#     - zero or more conjunct links, each a $VI followed by $AK or DOTTED CIRCLE;
#     - an optional tail: either a trailing $VI, or an optional further base
#       ($AS, $AK or DOTTED CIRCLE) followed by $VF.
#   $CM* is permitted between consecutive elements.
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;

# LB 29
$IS $CM* ($ALPlus | $HL);

Expand Down
35 changes: 30 additions & 5 deletions icu4c/source/data/brkitr/rules/line_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
!!quoted_literals_only;

$AI = [:LineBreak = Ambiguous:];
$AK = [:LineBreak = Aksara:];
$AL = [:LineBreak = Alphabetic:];
$AP = [:LineBreak = Aksara_Prebase:];
$AS = [:LineBreak = Aksara_Start:];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
Expand Down Expand Up @@ -65,6 +68,8 @@ $SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$VF = [:LineBreak = Virama_Final:];
$VI = [:LineBreak = Virama:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
Expand Down Expand Up @@ -216,7 +221,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# by rule 8, CM following a SP is stand-alone.


# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
# LB 15a Do not break after an unresolved initial quotation mark ([\p{Pi} & $QU]),
#        even after spaces.
#   Left context is $OP (optionally followed by spaces), $QU or $GL. The trailing "."
#   matches any character, so no break is allowed before whatever follows the quote(s).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
#   Same, but the following character is a stand-alone $CM after a space
#   (by rule 8 a CM following a SP is stand-alone; by rule 10 it behaves as AL).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
#   "^" forms: the same two cases with the quotation mark at the start of the rule.
#   A leading "^" prevents chaining into the rule from a preceding rule.
^([\p{Pi} & $QU] $CM* $SP*)+ .;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;

# LB 15b Do not break before an unresolved final quotation mark ([\p{Pf} & $QU]) that is
#        followed by SP, GL, WJ, CL, QU, CP, EX, IS, SY, BK, CR, LF, NL, ZW or end of text.
#   The three rules differ only in what precedes the quote: $LB8NonBreaks, a $CAN_CM
#   character with its combining marks, or stand-alone combining marks ("^$CM+").
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];

# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
#   Each rule below is one of the three LB 15b left contexts plus the Pf quote,
#   followed directly by the LB 15a Pi-quote sequence and one of its two tails
#   ("." or a stand-alone $CM after a space).
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;


# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
# See issue ICU-20303

Expand All @@ -226,7 +251,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

#
# LB 14b Do not break before numeric separators (IS), even after spaces.
# LB 15d Do not break before numeric separators (IS), even after spaces.

[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
Expand All @@ -236,9 +261,6 @@ $CAN_CM $CM* $IS;
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL


# LB 15
$QU $CM* $SP* $OP;

# LB 16
($CL | $CP) $CM* $SP* $NS;

Expand Down Expand Up @@ -339,6 +361,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL

# LB 28a Do not break inside orthographic syllables.
#   The single rule below matches, in order:
#     - an optional prebase character ($AP);
#     - a base: $AS, $AK, or U+25CC DOTTED CIRCLE;
#     - zero or more conjunct links, each a $VI followed by $AK or DOTTED CIRCLE;
#     - an optional tail: either a trailing $VI, or an optional further base
#       ($AS, $AK or DOTTED CIRCLE) followed by $VF.
#   $CM* is permitted between consecutive elements.
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;

# LB 29
$IS $CM* ($ALPlus | $HL);

Expand Down
35 changes: 30 additions & 5 deletions icu4c/source/data/brkitr/rules/line_loose.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@
!!quoted_literals_only;

$AI = [:LineBreak = Ambiguous:];
$AK = [:LineBreak = Aksara:];
$AL = [:LineBreak = Alphabetic:];
$AP = [:LineBreak = Aksara_Prebase:];
$AS = [:LineBreak = Aksara_Start:];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
Expand Down Expand Up @@ -71,6 +74,8 @@ $SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$VF = [:LineBreak = Virama_Final:];
$VI = [:LineBreak = Virama:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
Expand Down Expand Up @@ -222,7 +227,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# by rule 8, CM following a SP is stand-alone.


# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
# LB 15a Do not break after an unresolved initial quotation mark ([\p{Pi} & $QU]),
#        even after spaces.
#   Left context is $OP (optionally followed by spaces), $QU or $GL. The trailing "."
#   matches any character, so no break is allowed before whatever follows the quote(s).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
#   Same, but the following character is a stand-alone $CM after a space
#   (by rule 8 a CM following a SP is stand-alone; by rule 10 it behaves as AL).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
#   "^" forms: the same two cases with the quotation mark at the start of the rule.
#   A leading "^" prevents chaining into the rule from a preceding rule.
^([\p{Pi} & $QU] $CM* $SP*)+ .;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;

# LB 15b Do not break before an unresolved final quotation mark ([\p{Pf} & $QU]) that is
#        followed by SP, GL, WJ, CL, QU, CP, EX, IS, SY, BK, CR, LF, NL, ZW or end of text.
#   The three rules differ only in what precedes the quote: $LB8NonBreaks, a $CAN_CM
#   character with its combining marks, or stand-alone combining marks ("^$CM+").
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];

# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
#   Each rule below is one of the three LB 15b left contexts plus the Pf quote,
#   followed directly by the LB 15a Pi-quote sequence and one of its two tails
#   ("." or a stand-alone $CM after a space).
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;


# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
# See issue ICU-20303

Expand All @@ -232,7 +257,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

#
# LB 14b Do not break before numeric separators (IS), even after spaces.
# LB 15d Do not break before numeric separators (IS), even after spaces.

[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
Expand All @@ -242,9 +267,6 @@ $CAN_CM $CM* $IS;
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL


# LB 15
$QU $CM* $SP* $OP;

# LB 16
# Do not break between closing punctuation and $NS, even with intervening spaces
# But DO allow a break between closing punctuation and $NSX, don't include it here
Expand Down Expand Up @@ -349,6 +371,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL

# LB 28a Do not break inside orthographic syllables.
#   The single rule below matches, in order:
#     - an optional prebase character ($AP);
#     - a base: $AS, $AK, or U+25CC DOTTED CIRCLE;
#     - zero or more conjunct links, each a $VI followed by $AK or DOTTED CIRCLE;
#     - an optional tail: either a trailing $VI, or an optional further base
#       ($AS, $AK or DOTTED CIRCLE) followed by $VF.
#   $CM* is permitted between consecutive elements.
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;

# LB 29
$IS $CM* ($ALPlus | $HL);

Expand Down
35 changes: 30 additions & 5 deletions icu4c/source/data/brkitr/rules/line_loose_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@
!!quoted_literals_only;

$AI = [:LineBreak = Ambiguous:];
$AK = [:LineBreak = Aksara:];
$AL = [:LineBreak = Alphabetic:];
$AP = [:LineBreak = Aksara_Prebase:];
$AS = [:LineBreak = Aksara_Start:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
Expand Down Expand Up @@ -83,6 +86,8 @@ $SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$VF = [:LineBreak = Virama_Final:];
$VI = [:LineBreak = Virama:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
Expand Down Expand Up @@ -234,7 +239,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# by rule 8, CM following a SP is stand-alone.


# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
# LB 15a Do not break after an unresolved initial quotation mark ([\p{Pi} & $QU]),
#        even after spaces.
#   Left context is $OP (optionally followed by spaces), $QU or $GL. The trailing "."
#   matches any character, so no break is allowed before whatever follows the quote(s).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
#   Same, but the following character is a stand-alone $CM after a space
#   (by rule 8 a CM following a SP is stand-alone; by rule 10 it behaves as AL).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
#   "^" forms: the same two cases with the quotation mark at the start of the rule.
#   A leading "^" prevents chaining into the rule from a preceding rule.
^([\p{Pi} & $QU] $CM* $SP*)+ .;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;

# LB 15b Do not break before an unresolved final quotation mark ([\p{Pf} & $QU]) that is
#        followed by SP, GL, WJ, CL, QU, CP, EX, IS, SY, BK, CR, LF, NL, ZW or end of text.
#   The three rules differ only in what precedes the quote: $LB8NonBreaks, a $CAN_CM
#   character with its combining marks, or stand-alone combining marks ("^$CM+").
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];

# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
#   Each rule below is one of the three LB 15b left contexts plus the Pf quote,
#   followed directly by the LB 15a Pi-quote sequence and one of its two tails
#   ("." or a stand-alone $CM after a space).
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;


# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
# See issue ICU-20303

Expand All @@ -244,7 +269,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

#
# LB 14b Do not break before numeric separators (IS), even after spaces.
# LB 15d Do not break before numeric separators (IS), even after spaces.

[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
Expand All @@ -254,9 +279,6 @@ $CAN_CM $CM* $IS;
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL


# LB 15
$QU $CM* $SP* $OP;

# LB 16
# Do not break between closing punctuation and $NS, even with intervening spaces
# But DO allow a break between closing punctuation and $NSX, don't include it here
Expand Down Expand Up @@ -367,6 +389,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL

# LB 28a Do not break inside orthographic syllables.
#   The single rule below matches, in order:
#     - an optional prebase character ($AP);
#     - a base: $AS, $AK, or U+25CC DOTTED CIRCLE;
#     - zero or more conjunct links, each a $VI followed by $AK or DOTTED CIRCLE;
#     - an optional tail: either a trailing $VI, or an optional further base
#       ($AS, $AK or DOTTED CIRCLE) followed by $VF.
#   $CM* is permitted between consecutive elements.
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;

# LB 29
$IS $CM* ($ALPlus | $HL);

Expand Down
35 changes: 30 additions & 5 deletions icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@
!!quoted_literals_only;

$AI = [:LineBreak = Ambiguous:];
$AK = [:LineBreak = Aksara:];
$AL = [:LineBreak = Alphabetic:];
$AP = [:LineBreak = Aksara_Prebase:];
$AS = [:LineBreak = Aksara_Start:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
Expand Down Expand Up @@ -85,6 +88,8 @@ $SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$VF = [:LineBreak = Virama_Final:];
$VI = [:LineBreak = Virama:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
Expand Down Expand Up @@ -247,7 +252,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# by rule 8, CM following a SP is stand-alone.


# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
# LB 15a Do not break after an unresolved initial quotation mark ([\p{Pi} & $QU]),
#        even after spaces.
#   Left context is $OP (optionally followed by spaces), $QU or $GL. The trailing "."
#   matches any character, so no break is allowed before whatever follows the quote(s).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
#   Same, but the following character is a stand-alone $CM after a space
#   (by rule 8 a CM following a SP is stand-alone; by rule 10 it behaves as AL).
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
#   "^" forms: the same two cases with the quotation mark at the start of the rule.
#   A leading "^" prevents chaining into the rule from a preceding rule.
^([\p{Pi} & $QU] $CM* $SP*)+ .;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;

# LB 15b Do not break before an unresolved final quotation mark ([\p{Pf} & $QU]) that is
#        followed by SP, GL, WJ, CL, QU, CP, EX, IS, SY, BK, CR, LF, NL, ZW or end of text.
#   The three rules differ only in what precedes the quote: $LB8NonBreaks, a $CAN_CM
#   character with its combining marks, or stand-alone combining marks ("^$CM+").
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];

# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
#   Each rule below is one of the three LB 15b left contexts plus the Pf quote,
#   followed directly by the LB 15a Pi-quote sequence and one of its two tails
#   ("." or a stand-alone $CM after a space).
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;


# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
# See issue ICU-20303

Expand All @@ -257,7 +282,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

#
# LB 14b Do not break before numeric separators (IS), even after spaces.
# LB 15d Do not break before numeric separators (IS), even after spaces.

[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
Expand All @@ -267,9 +292,6 @@ $CAN_CM $CM* $IS;
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL


# LB 15
$QU $CM* $SP* $OP;

# LB 16
# Do not break between closing punctuation and $NS, even with intervening spaces
# But DO allow a break between closing punctuation and $NSX, don't include it here
Expand Down Expand Up @@ -380,6 +402,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL

# LB 28a Do not break inside orthographic syllables.
#   The single rule below matches, in order:
#     - an optional prebase character ($AP);
#     - a base: $AS, $AK, or U+25CC DOTTED CIRCLE;
#     - zero or more conjunct links, each a $VI followed by $AK or DOTTED CIRCLE;
#     - an optional tail: either a trailing $VI, or an optional further base
#       ($AS, $AK or DOTTED CIRCLE) followed by $VF.
#   $CM* is permitted between consecutive elements.
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;

# LB 29
$IS $CM* ($ALPlus | $HL);

Expand Down
Loading

0 comments on commit f1a9e57

Please sign in to comment.