diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt index c68a66b95f20..9f85b7917139 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt @@ -6,11 +6,7 @@ # file: line.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 for Unicode 14.0, -# with the following modification: -# -# Boundaries between hyphens and following letters are suppressed when -# there is a boundary preceding the hyphen. See rule 20.9 +# Rules derived from Unicode Standard Annex #14 for Unicode 16.0. # # This corresponds to CSS line-break=strict (BCP47 -u-lb-strict). # It sets characters of class CJ to behave like NS. @@ -80,6 +76,12 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; +eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ]; +eaFWHminusCM = [ eaFWH - CMS ]; +eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ]; +BAminuseaFWH = [BA - eaFWH ]; + PiQU = [\p{Pi}&QU]; PfQU = [\p{Pf}&QU]; @@ -116,15 +118,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* ( # Rules LB14 - LB17. # Moved before LB14, because it matches a supersequence. +LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; +# Moved before LB14. These are really the cases where LB19a does not apply, but +# the old LB19 would. This is to avoid many instances of chaining over two code +# points. +LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM; +LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM; +LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19; +LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 +LB20a.7: OP CM* SP+ (HY | HH) CM* AL; LB14: OP CM* SP* .; +LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.2: ^ (PiQU CM* SP*)+ .; # LB15b/LB15a chaining. +LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); @@ -161,7 +175,9 @@ LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; -LB12: GL CM* [^CM]; +# Needs to apply before LB12, because the new monkeys are not greedy. +LB20a.2: GL (HY | HH) CM* AL; +LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; @@ -183,15 +199,16 @@ LB19.1: QU CM* [^CM]; LB20.1: . CM* ZWJ CB; LB20.2: . CM* ÷ CB; +LB20a.6: CB CM* ZWJ (HY | HH) CM* AL; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; -# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. -LB20.09: ^(HY | HH) CM* AL; +# LB 20a Do not break after a word-initial hyphen. +LB20a.1: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. -LB21a: HL CM* (HY | BA) CM* [^CM CB]; +LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL]; LB21.1: . CM* [BA HY NS]; LB21.2: BB CM* [^CM CB]; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt index bcfc94f05485..7aad76ecf107 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt @@ -6,7 +6,7 @@ # file: line.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 for Unicode 14.0. +# Rules derived from Unicode Standard Annex #14 for Unicode 16.0. # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -80,6 +80,12 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; +eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ]; +eaFWHminusCM = [ eaFWH - CMS ]; +eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ]; +BAminuseaFWH = [BA - eaFWH ]; + PiQU = [\p{Pi}&QU]; PfQU = [\p{Pf}&QU]; @@ -116,15 +122,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* ( # Rules LB14 - LB17. # Moved before LB14, because it matches a supersequence. +LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; +# Moved before LB14. These are really the cases where LB19a does not apply, but +# the old LB19 would. This is to avoid many instances of chaining over two code +# points. +LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM; +LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM; +LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19; +LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 +LB20a.7: OP CM* SP+ (HY | HH) CM* AL; LB14: OP CM* SP* .; +LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.2: ^ (PiQU CM* SP*)+ .; # LB15b/LB15a chaining. +LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); @@ -161,7 +179,9 @@ LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; -LB12: GL CM* [^CM]; +# Needs to apply before LB12, because the new monkeys are not greedy. +LB20a.2: GL (HY | HH) CM* AL; +LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; @@ -183,15 +203,16 @@ LB19.1: QU CM* [^CM]; LB20.1: . CM* ZWJ CB; LB20.2: . CM* ÷ CB; +LB20a.6: CB CM* ZWJ (HY | HH) CM* AL; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; -# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. -LB20.09: ^(HY | HH) CM* AL; +# LB 20a Do not break after a word-initial hyphen. +LB20a.1: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. -LB21a: HL CM* (HY | BA) CM* [^CM CB]; +LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL]; LB21.1: . CM* [BA HY NS]; LB21.2: BB CM* [^CM CB]; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt index 05c2bea74025..72e7563c9274 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt @@ -6,7 +6,7 @@ # file: line_loose.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 for Unicode 14.0. +# Rules derived from Unicode Standard Annex #14 for Unicode 16.0. # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -81,6 +81,12 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; +eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ]; +eaFWHminusCM = [ eaFWH - CMS ]; +eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ]; +BAminuseaFWH = [BA - eaFWH ]; + PiQU = [\p{Pi}&QU]; PfQU = [\p{Pf}&QU]; @@ -117,15 +123,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* ( # Rules LB14 - LB17. # Moved before LB14, because it matches a supersequence. +LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; +# Moved before LB14. These are really the cases where LB19a does not apply, but +# the old LB19 would. This is to avoid many instances of chaining over two code +# points. +LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM; +LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM; +LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19; +LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 +LB20a.7: OP CM* SP+ (HY | HH) CM* AL; LB14: OP CM* SP* .; +LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.2: ^ (PiQU CM* SP*)+ .; # LB15b/LB15a chaining. +LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); @@ -162,7 +180,9 @@ LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; -LB12: GL CM* [^CM]; +# Needs to apply before LB12, because the new monkeys are not greedy. +LB20a.2: GL (HY | HH) CM* AL; +LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; @@ -184,15 +204,16 @@ LB19.1: QU CM* [^CM]; LB20.1: . CM* ZWJ CB; LB20.2: . CM* ÷ CB; +LB20a.6: CB CM* ZWJ (HY | HH) CM* AL; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; -# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. -LB20.09: ^(HY | HH) CM* AL; +# LB 20a Do not break after a word-initial hyphen. +LB20a.1: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. -LB21a: HL CM* (HY | BA) CM* [^CM CB]; +LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL]; LB21.1: . CM* [BA HY NS]; LB21.2: BB CM* [^CM CB]; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 93a06be94310..99d01874d1fb 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -6,7 +6,7 @@ # file: line_loose_cj.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 for Unicode 14.0. +# Rules derived from Unicode Standard Annex #14 for Unicode 16.0. # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -99,6 +99,12 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; +eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ]; +eaFWHminusCM = [ eaFWH - CMS ]; +eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ]; +BAminuseaFWH = [BA - eaFWH ]; + PiQU = [\p{Pi}&QU]; PfQU = [\p{Pf}&QU]; @@ -136,15 +142,27 @@ LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | I # Rules LB14 - LB17. # Moved before LB14, because it matches a supersequence. +LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; +# Moved before LB14. These are really the cases where LB19a does not apply, but +# the old LB19 would. This is to avoid many instances of chaining over two code +# points. +LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM; +LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM; +LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19; +LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 +LB20a.7: OP CM* SP+ (HY | HH) CM* AL; LB14: OP CM* SP* .; +LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.2: ^ (PiQU CM* SP*)+ .; # LB15b/LB15a chaining. +LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); @@ -181,7 +199,9 @@ LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; -LB12: GL CM* [^CM]; +# Needs to apply before LB12, because the new monkeys are not greedy. +LB20a.2: GL (HY | HH) CM* AL; +LB12: GL CM* [^CM]; LB12a: [^SP BA BAX HY] CM* GL; @@ -204,18 +224,19 @@ LB19.1: QU CM* [^CM]; LB20.1: . CM* ZWJ CB; LB20.2: . CM* ÷ CB; +LB20a.6: CB CM* ZWJ (HY | HH) CM* AL; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; -# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. -LB20.09: ^(HY | HH) CM* AL; +# LB 20a Do not break after a word-initial hyphen. +LB20a.1: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x -LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; +LB21a: HL CM* (HY | BAminuseaFWH | BAX) CM* [^CM CB HL]; LB21.1: [^ID] CM* [BA BAX HY NS]; LB21.2: ID CM* [BA HY NS]; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt index 0397ec5a5f2b..211298539797 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt @@ -6,7 +6,7 @@ # file: line_normal.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 for Unicode 14.0. +# Rules derived from Unicode Standard Annex #14 for Unicode 16.0. # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -82,6 +82,12 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; +eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ]; +eaFWHminusCM = [ eaFWH - CMS ]; +eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ]; +BAminuseaFWH = [BA - eaFWH ]; + PiQU = [\p{Pi}&QU]; PfQU = [\p{Pf}&QU]; @@ -118,15 +124,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* ( # Rules LB14 - LB17. # Moved before LB14, because it matches a supersequence. +LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; +# Moved before LB14. These are really the cases where LB19a does not apply, but +# the old LB19 would. This is to avoid many instances of chaining over two code +# points. +LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM; +LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM; +LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19; +LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 +LB20a.7: OP CM* SP+ (HY | HH) CM* AL; LB14: OP CM* SP* .; +LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.2: ^ (PiQU CM* SP*)+ .; # LB15b/LB15a chaining. +LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); @@ -163,7 +181,9 @@ LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; -LB12: GL CM* [^CM]; +# Needs to apply before LB12, because the new monkeys are not greedy. +LB20a.2: GL (HY | HH) CM* AL; +LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; @@ -185,15 +205,16 @@ LB19.1: QU CM* [^CM]; LB20.1: . CM* ZWJ CB; LB20.2: . CM* ÷ CB; +LB20a.6: CB CM* ZWJ (HY | HH) CM* AL; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; -# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. -LB20.09: ^(HY | HH) CM* AL; +# LB 20a Do not break after a word-initial hyphen. +LB20a.1: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. -LB21a: HL CM* (HY | BA) CM* [^CM CB]; +LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL]; LB21.1: . CM* [BA HY NS]; LB21.2: BB CM* [^CM CB]; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index 04889a31ca4e..2061f9170848 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -6,7 +6,7 @@ # file: line_normal_cj.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 for Unicode 14.0. +# Rules derived from Unicode Standard Annex #14 for Unicode 16.0. # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -84,6 +84,12 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; +eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ]; +eaFWHminusCM = [ eaFWH - CMS ]; +eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ]; +BAminuseaFWH = [BA - eaFWH ]; + PiQU = [\p{Pi}&QU]; PfQU = [\p{Pf}&QU]; @@ -120,15 +126,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* ( # Rules LB14 - LB17. # Moved before LB14, because it matches a supersequence. +LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; +# Moved before LB14. These are really the cases where LB19a does not apply, but +# the old LB19 would. This is to avoid many instances of chaining over two code +# points. +LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM; +LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM; +LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19; +LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 +LB20a.7: OP CM* SP+ (HY | HH) CM* AL; LB14: OP CM* SP* .; +LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15a.2: ^ (PiQU CM* SP*)+ .; # LB15b/LB15a chaining. +LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL; LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); @@ -167,7 +185,9 @@ LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; -LB12: GL CM* [^CM]; +# Needs to apply before LB12, because the new monkeys are not greedy. +LB20a.2: GL (HY | HH) CM* AL; +LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; @@ -189,15 +209,16 @@ LB19.1: QU CM* [^CM]; LB20.1: . CM* ZWJ CB; LB20.2: . CM* ÷ CB; +LB20a.6: CB CM* ZWJ (HY | HH) CM* AL; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; -# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. -LB20.09: ^(HY | HH) CM* AL; +# LB 20a Do not break after a word-initial hyphen. +LB20a.1: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. -LB21a: HL CM* (HY | BA) CM* [^CM CB]?; +LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL]; # DO allow breaks here before $NSXcm, so don't include it LB21.1: . CM* [BA HY NS];