Fix Unicode 15.0 line breaking (#4389)

The current implementation was attempting the LB25 tailoring recommended in Example 7 of [Section 8.2](https://www.unicode.org/reports/tr14/tr14-49.html#Examples) in UAX14 version 15.0; however, this requires more than one code point of lookahead* because of `(PR | PO) × ( OP | HY )? NU`, which the current implementation of the line segmenter cannot do. Instead this pull request goes back to the untailored LB25 from Unicode 15.0. The implementation was tested with two million test cases; I last encountered a failure somewhere in the nine thousands. I should probably do an overnight run. Only 200 test cases are included here; as usual, anyone working on the rules should try very long monkey test runs. This fixes #4146. — \* This will be needed for 15.1 line segmentation too. While we have that capability in the other segmenters, used in the sentence segmenter (the relevant rules are called intermediate match rules or interm(ediate) break states in this implementation), straightforwardly reusing that code would run into issues as we have so many states in line breaking that we cannot dedicate a whole bit to that property of the state. This can probably be worked around (as far as I can tell we use the sign bit for a property of two special states, so we could probably be a bit more sparing), but will come later.
unicode-org · Dec 1, 2023 · e080ecd · e080ecd
1 parent 615824d
commit e080ecd
Show file tree

Hide file tree

Showing 8 changed files with 7,647 additions and 1,600 deletions.
diff --git a/components/segmenter/src/line.rs b/components/segmenter/src/line.rs
@@ -151,6 +151,21 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
 /// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
 /// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
 /// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
+///
+/// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️‍🌈.
+/// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
+/// let possible_first_lines: Vec<&str> =
+///     segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
+/// assert_eq!(
+///     &possible_first_lines,
+///     &[
+///         "🏳️",
+///         "🏳️➕",
+///         "🏳️➕🌈",
+///         "🏳️➕🌈🟰",
+///         "🏳️➕🌈🟰🏳️‍🌈"
+///     ]
+/// );
 /// ```
 ///
 /// # Examples

diff --git a/components/segmenter/tests/spec_test.rs b/components/segmenter/tests/spec_test.rs
@@ -106,18 +106,52 @@ impl Iterator for TestContentIterator {
 fn line_break_test(filename: &str) {
     let test_iter = TestContentIterator::new(filename);
     let segmenter = LineSegmenter::new_dictionary();
-    for mut test in test_iter {
+    for (i, mut test) in test_iter.enumerate() {
         let s: String = test.utf8_vec.into_iter().collect();
         let iter = segmenter.segment_str(&s);
         let result: Vec<usize> = iter.collect();
         // NOTE: For consistency with ICU4C and other Segmenters, we return a breakpoint at
         // index 0, despite UAX #14 suggesting otherwise. See issue #3283.
-        test.break_result_utf8.insert(0, 0);
-        assert_eq!(result, test.break_result_utf8, "{}", test.original_line);
+        if test.break_result_utf8.first() != Some(&0) {
+            test.break_result_utf8.insert(0, 0);
+        }
+        if result != test.break_result_utf8 {
+            let lb = icu::properties::maps::line_break();
+            let lb_name = icu::properties::LineBreak::enum_to_long_name_mapper();
+            let mut iter = segmenter.segment_str(&s);
+            // TODO(egg): It would be really nice to have Name here.
+            println!("  | A | E | Code pt. | Line_Break     | Literal");
+            for (i, c) in s.char_indices() {
+                let expected_break = test.break_result_utf8.contains(&i);
+                let actual_break = result.contains(&i);
+                if actual_break {
+                    iter.next();
+                }
+                println!(
+                    "{}| {} | {} | {:>8} | {:>18} | {}",
+                    if actual_break != expected_break {
+                        "😭"
+                    } else {
+                        "  "
+                    },
+                    if actual_break { "÷" } else { "×" },
+                    if expected_break { "÷" } else { "×" },
+                    format!("{:04X}", c as u32),
+                    lb_name
+                        .get(lb.get(c))
+                        .unwrap_or(&format!("{:?}", lb.get(c))),
+                    c
+                )
+            }
+            println!("Test case #{}", i);
+            panic!()
+        }
 
         let iter = segmenter.segment_utf16(&test.utf16_vec);
         let result: Vec<usize> = iter.collect();
-        test.break_result_utf16.insert(0, 0);
+        if test.break_result_utf16.first() != Some(&0) {
+            test.break_result_utf16.insert(0, 0);
+        }
         assert_eq!(
             result, test.break_result_utf16,
             "UTF16: {}",
@@ -127,7 +161,9 @@ fn line_break_test(filename: &str) {
         // Test data is Latin-1 character only, it can run for Latin-1 segmenter test.
         if let Some(mut break_result_latin1) = test.break_result_latin1 {
             let iter = segmenter.segment_latin1(&test.latin1_vec);
-            break_result_latin1.insert(0, 0);
+            if break_result_latin1.first() != Some(&0) {
+                break_result_latin1.insert(0, 0);
+            }
             let result: Vec<usize> = iter.collect();
             assert_eq!(
                 result, break_result_latin1,

diff --git a/components/segmenter/tests/testdata/LineBreakExtraTest.txt b/components/segmenter/tests/testdata/LineBreakExtraTest.txt
diff --git a/components/segmenter/tests/testdata/LineBreakTest.txt b/components/segmenter/tests/testdata/LineBreakTest.txt
diff --git a/provider/baked/segmenter/data/macros/segmenter_line_v1.rs.data b/provider/baked/segmenter/data/macros/segmenter_line_v1.rs.data