Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-21592 Update cj normal/loose linebreak per CSS #1991

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions icu4c/source/data/brkitr/rules/line_loose_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -238,7 +239,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303


$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

Expand Down Expand Up @@ -294,8 +295,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
$LB20NonBreaks $CM* ($BA | $HY | $NS);
# DO allow breaks here before $NSX, so don't include it.
# And DO allow breaks between ID and $BAX, so handle ID in a separate rule that does not include $BAX.
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
$ID $CM* ($BA | $HY | $NS);


^$CM+ ($BA | $HY | $NS);
Expand Down
11 changes: 7 additions & 4 deletions icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -251,7 +252,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303


$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

Expand Down Expand Up @@ -307,8 +308,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
$LB20NonBreaks $CM* ($BA | $HY | $NS);
# DO allow breaks here before $NSX, so don't include it.
# And DO allow breaks between ID and $BAX, so handle ID in a separate rule that does not include $BAX.
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
$ID $CM* ($BA | $HY | $NS);


^$CM+ ($BA | $HY | $NS);
Expand Down
11 changes: 5 additions & 6 deletions icu4c/source/data/brkitr/rules/line_normal_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.

#
Expand All @@ -29,8 +29,7 @@

$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -184,7 +183,7 @@ $GL $CM* .;
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;


Expand Down Expand Up @@ -282,7 +281,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);


Expand All @@ -294,7 +293,7 @@ $BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | $BA) $CM* [^$CB]?;

# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
Expand Down
11 changes: 5 additions & 6 deletions icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
#
# The content is the same as line_normal_cj.txt except the following
Expand All @@ -31,8 +31,7 @@

$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -197,7 +196,7 @@ $GL $CM* .;
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;


Expand Down Expand Up @@ -295,7 +294,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);


Expand All @@ -307,7 +306,7 @@ $BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | $BA) $CM* [^$CB]?;

# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
Expand Down
9 changes: 6 additions & 3 deletions icu4c/source/test/testdata/break_rules/line_loose_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;

LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the long negated set is necessary; the hard breaks should never reach this point, having been handled by earlier rules. Monkey test rules are handled sequentially, unlike the main production rules, which are run in parallel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks Andy for looking at this and for approving!

LB21.2: ID CM* [BA HY NS];
LB21.3: CM+ [BA HY NS];
LB21.4: BB CM* [^CM CB];

LB21b: SY CM* HL;

Expand Down
14 changes: 5 additions & 9 deletions icu4c/source/test/testdata/break_rules/line_normal_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.

type = line;
locale = ja@lb=normal;

AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];

LB12a: [^SP BA BAX HY] CM* GL;
LB12a: [^SP BA HY] CM* GL;

# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
Expand Down Expand Up @@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? That's what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;

# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
# DO allow breaks here before $NSXcm, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

Expand Down
6 changes: 6 additions & 0 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1646,11 +1646,17 @@ Bangkok)•</data>
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>

# •no brk before 2010 •
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>

# •no brk before 2010 except ok after ID •
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
Expand Down
4 changes: 2 additions & 2 deletions icu4j/main/shared/data/icudata.jar
Git LFS file not shown
4 changes: 2 additions & 2 deletions icu4j/main/shared/data/icutzdata.jar
Git LFS file not shown
4 changes: 2 additions & 2 deletions icu4j/main/shared/data/testdata.jar
Git LFS file not shown
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;

LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
LB21.2: ID CM* [BA HY NS];
LB21.3: CM+ [BA HY NS];
LB21.4: BB CM* [^CM CB];

LB21b: SY CM* HL;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.

type = line;
locale = ja@lb=normal;

AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];

LB12a: [^SP BA BAX HY] CM* GL;
LB12a: [^SP BA HY] CM* GL;

# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
Expand Down Expand Up @@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? That's what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;

# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
# DO allow breaks here before $NSXcm, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1646,11 +1646,17 @@ Bangkok)•</data>
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>

# •no brk before 2010 •
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>

# •no brk before 2010 except ok after ID •
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
Expand Down Expand Up @@ -1888,7 +1894,7 @@ Bangkok)•</data>
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
Expand Down