Skip to content

Commit

Permalink
Fix Unicode 15.0 line breaking (#4389)
Browse files Browse the repository at this point in the history
The current implementation was attempting the LB25 tailoring recommended
in Example 7 of [Section
8.2](https://www.unicode.org/reports/tr14/tr14-49.html#Examples) in
UAX14 version 15.0; however, this requires more than one code point of
lookahead* because of `(PR | PO) × ( OP | HY )? NU`, which the current
implementation of the line segmenter cannot do. Instead this pull
request goes back to the untailored LB25 from Unicode 15.0.

The implementation was tested with two million test cases; I last
encountered a failure somewhere in the nine thousands. I should probably
do an overnight run. Only 200 test cases are included here; as usual,
anyone working on the rules should try very long monkey test runs.

This fixes #4146.

—
\* This will be needed for 15.1 line segmentation too. While we have
that capability in the other segmenters, used in the sentence segmenter
(the relevant rules are called intermediate match rules or
interm(ediate) break states in this implementation), straightforwardly
reusing that code would run into issues as we have so many states
in line breaking that we cannot dedicate a whole bit to that property of
the state. This can probably be worked around (as far as I can tell we
use the sign bit for a property of two special states, so we could
probably be a bit more sparing), but will come later.
  • Loading branch information
eggrobin authored Dec 1, 2023
1 parent 615824d commit e080ecd
Show file tree
Hide file tree
Showing 8 changed files with 7,647 additions and 1,600 deletions.
15 changes: 15 additions & 0 deletions components/segmenter/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,21 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
/// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
///
/// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️‍🌈.
/// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
/// let possible_first_lines: Vec<&str> =
/// segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
/// assert_eq!(
/// &possible_first_lines,
/// &[
/// "🏳️",
/// "🏳️➕",
/// "🏳️➕🌈",
/// "🏳️➕🌈🟰",
/// "🏳️➕🌈🟰🏳️‍🌈"
/// ]
/// );
/// ```
///
/// # Examples
Expand Down
46 changes: 41 additions & 5 deletions components/segmenter/tests/spec_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,52 @@ impl Iterator for TestContentIterator {
fn line_break_test(filename: &str) {
let test_iter = TestContentIterator::new(filename);
let segmenter = LineSegmenter::new_dictionary();
for mut test in test_iter {
for (i, mut test) in test_iter.enumerate() {
let s: String = test.utf8_vec.into_iter().collect();
let iter = segmenter.segment_str(&s);
let result: Vec<usize> = iter.collect();
// NOTE: For consistency with ICU4C and other Segmenters, we return a breakpoint at
// index 0, despite UAX #14 suggesting otherwise. See issue #3283.
test.break_result_utf8.insert(0, 0);
assert_eq!(result, test.break_result_utf8, "{}", test.original_line);
if test.break_result_utf8.first() != Some(&0) {
test.break_result_utf8.insert(0, 0);
}
if result != test.break_result_utf8 {
let lb = icu::properties::maps::line_break();
let lb_name = icu::properties::LineBreak::enum_to_long_name_mapper();
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | Line_Break | Literal");
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
if actual_break {
iter.next();
}
println!(
"{}| {} | {} | {:>8} | {:>18} | {}",
if actual_break != expected_break {
"😭"
} else {
" "
},
if actual_break { "÷" } else { "×" },
if expected_break { "÷" } else { "×" },
format!("{:04X}", c as u32),
lb_name
.get(lb.get(c))
.unwrap_or(&format!("{:?}", lb.get(c))),
c
)
}
println!("Test case #{}", i);
panic!()
}

let iter = segmenter.segment_utf16(&test.utf16_vec);
let result: Vec<usize> = iter.collect();
test.break_result_utf16.insert(0, 0);
if test.break_result_utf16.first() != Some(&0) {
test.break_result_utf16.insert(0, 0);
}
assert_eq!(
result, test.break_result_utf16,
"UTF16: {}",
Expand All @@ -127,7 +161,9 @@ fn line_break_test(filename: &str) {
// Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
if let Some(mut break_result_latin1) = test.break_result_latin1 {
let iter = segmenter.segment_latin1(&test.latin1_vec);
break_result_latin1.insert(0, 0);
if break_result_latin1.first() != Some(&0) {
break_result_latin1.insert(0, 0);
}
let result: Vec<usize> = iter.collect();
assert_eq!(
result, break_result_latin1,
Expand Down
208 changes: 208 additions & 0 deletions components/segmenter/tests/testdata/LineBreakExtraTest.txt

Large diffs are not rendered by default.

209 changes: 105 additions & 104 deletions components/segmenter/tests/testdata/LineBreakTest.txt

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Loading

0 comments on commit e080ecd