Skip to content

Commit

Permalink
ICU-22707 Copy new monkey rules to ICU4J
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin authored and markusicu committed Jul 18, 2024
1 parent d149089 commit 4acb472
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0,
# with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
# there is a boundary preceding the hyphen. See rule 20.9
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
# It sets characters of class CJ to behave like NS.
Expand Down Expand Up @@ -80,6 +76,12 @@ ZWJ = [:LineBreak = ZWJ:];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];

eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];

PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];

Expand Down Expand Up @@ -116,15 +118,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.

# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;

# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;

# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14,
# while its prefix, "OP CM SP", matches LB7.1.
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;

LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );

Expand Down Expand Up @@ -161,7 +175,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;

Expand All @@ -183,15 +199,16 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;

LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;

# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
Expand Down Expand Up @@ -80,6 +80,12 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];

eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];

PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];

Expand Down Expand Up @@ -116,15 +122,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.

# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;

# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;

# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14,
# while its prefix, "OP CM SP", matches LB7.1.
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;

LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );

Expand Down Expand Up @@ -161,7 +179,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;

Expand All @@ -183,15 +203,16 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;

LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;

# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
Expand Down Expand Up @@ -81,6 +81,12 @@ ZWJ = [:LineBreak = ZWJ:];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];

eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];

PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];

Expand Down Expand Up @@ -117,15 +123,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.

# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;

# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;

# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14,
# while its prefix, "OP CM SP", matches LB7.1.
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;

LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );

Expand Down Expand Up @@ -162,7 +180,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;

Expand All @@ -184,15 +204,16 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;

LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;

# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
Expand Down Expand Up @@ -99,6 +99,12 @@ ZWJ = [:LineBreak = ZWJ:];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];

eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];

PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];

Expand Down Expand Up @@ -136,15 +142,27 @@ LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | I
# Rules LB14 - LB17.

# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;

# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;

# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14,
# while its prefix, "OP CM SP", matches LB7.1.
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;

LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );

Expand Down Expand Up @@ -181,7 +199,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA BAX HY] CM* GL;

Expand All @@ -204,18 +224,19 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;

LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;

# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew.
# HL (HY | [BA - eaFWH]) × [^HL]

LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BAminuseaFWH | BAX) CM* [^CM CB HL];

LB21.1: [^ID] CM* [BA BAX HY NS];
LB21.2: ID CM* [BA HY NS];
Expand Down
Loading

0 comments on commit 4acb472

Please sign in to comment.