Add all fixups for retaining explicits

servo · Dec 20, 2022 · 98c05a8 · 98c05a8
1 parent 8c9781b
commit 98c05a8
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 14 deletions.
diff --git a/src/explicit.rs b/src/explicit.rs
@@ -47,13 +47,17 @@ pub fn compute(
             RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
                 let last_level = stack.last().level;
 
+                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+                levels[i] = last_level;
+
                 // X5a-X5c: Isolate initiators get the level of the last entry on the stack.
                 let is_isolate = match original_classes[i] {
                     RLI | LRI | FSI => true,
                     _ => false,
                 };
                 if is_isolate {
-                    levels[i] = last_level;
+                    // Redundant due to "Retaining explicit formatting characters" step
+                    // levels[i] = last_level;
                     match stack.last().status {
                         OverrideStatus::RTL => processing_classes[i] = R,
                         OverrideStatus::LTR => processing_classes[i] = L,
@@ -90,10 +94,17 @@ pub fn compute(
                 } else if overflow_isolate_count == 0 {
                     overflow_embedding_count += 1;
                 }
+
+                if !is_isolate {
+                    // X9 +
+                    // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+                    processing_classes[i] = BN;
+                }
             }
 
             // <http://www.unicode.org/reports/tr9/#X6a>
-            PDI => {
+            // The BN is from <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+            PDI | BN => {
                 if overflow_isolate_count > 0 {
                     overflow_isolate_count -= 1;
                 } else if valid_isolate_count > 0 {
@@ -132,13 +143,14 @@ pub fn compute(
                 if stack.last().status != OverrideStatus::Isolate && stack.vec.len() >= 2 {
                     stack.vec.pop();
                 }
-                // The spec doesn't explicitly mention this step, but it is necessary.
-                // See the reference implementations for comparison.
+                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
                 levels[i] = stack.last().level;
+                processing_classes[i] = BN;
             }
 
             // Nothing
-            B | BN => {}
+            // BN case moved up with PDI, see <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+            B => {}
 
             // <http://www.unicode.org/reports/tr9/#X6>
             _ => {

diff --git a/src/implicit.rs b/src/implicit.rs
@@ -38,6 +38,8 @@ pub fn resolve_weak(
     let mut prev_class = sequence.sos;
     let mut last_strong_is_al = false;
     let mut et_run_indices = Vec::new(); // for W5
+    let mut bn_run_indices = Vec::new(); // for W5 +  <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+
     // Like sequence.runs.iter().flat_map(Clone::clone), but make indices itself clonable.
     fn id(x: LevelRun) -> LevelRun {
         x
@@ -88,6 +90,7 @@ pub fn resolve_weak(
                         .clone()
                         .skip(ch.len_utf8() - 1)
                         .map(|j| processing_classes[j])
+                        // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
                         .find(not_removed_by_x9)
                         .unwrap_or(sequence.eos);
                     if next_class == EN && last_strong_is_al {
@@ -99,6 +102,25 @@ pub fn resolve_weak(
                         (EN, ES, EN) | (EN, CS, EN) => EN,
                         (AN, CS, AN) => AN,
                         (_, _, _) => ON,
+                    };
+
+                    // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+                    // we have to do this before W5 gets its grubby hands on these characters and thinks
+                    // they're part of an ET run
+                    if processing_classes[i] == ON {
+                        for class in processing_classes[..i]
+                            .iter_mut()
+                            .rev()
+                            .take_while(|c| **c == BN)
+                        {
+                            *class = ON;
+                        }
+                        for class in processing_classes[(i + ch.len_utf8() - 1)..]
+                            .iter_mut()
+                            .take_while(|c| **c == BN)
+                        {
+                            *class = ON;
+                        }
                     }
                 } else {
                     // we're in the middle of a character, copy over work done for previous bytes
@@ -110,16 +132,30 @@ pub fn resolve_weak(
             ET => {
                 match prev_class {
                     EN => processing_classes[i] = EN,
-                    _ => et_run_indices.push(i), // In case this is followed by an EN.
+                    _ => {
+                        // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+                        // if there was a BN run before this, that's now a part of this ET run
+                        et_run_indices.extend(&bn_run_indices);
+
+                        // In case this is followed by an EN.
+                        et_run_indices.push(i);
+                    }
                 }
             }
-            class => {
-                if removed_by_x9(class) {
-                    continue;
-                }
+            BN => {
+                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+                // keeps track of bn runs for W5 in case we see an ET
+                bn_run_indices.push(i);
+                // skips over BNs for W1
+                continue;
             }
+            _ => {}
         }
 
+        // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+        // The BN arm of the match above `continue`s before reaching this point, so getting here ends any pending BN run
+        bn_run_indices.clear();
+
         prev_class = processing_classes[i];
         match w2_processing_class {
             L | R => {
@@ -153,6 +189,8 @@ pub fn resolve_weak(
                 R | AL => {
                     last_strong_is_l = false;
                 }
+                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+                // already scanning past BN here
                 _ => {}
             }
         }
@@ -286,22 +324,30 @@ pub fn resolve_neutral<D: BidiDataSource>(
             for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] {
                 *class = class_to_set;
             }
+            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+            for class in processing_classes[..pair.start]
+                .iter_mut()
+                .rev()
+                .take_while(|c| **c == BN)
+            {
+                *class = class_to_set;
+            }
             // > Any number of characters that had original bidirectional character type NSM prior to the application of
             // > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.
 
             // This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
             // about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
             let nsm_start = pair.start + start_len_utf8;
             for (idx, class) in original_classes[nsm_start..].iter().enumerate() {
-                if *class == BidiClass::NSM {
+                if *class == BidiClass::NSM || processing_classes[nsm_start + idx] == BN {
                     processing_classes[nsm_start + idx] = class_to_set;
                 } else {
                     break;
                 }
             }
             let nsm_end = pair.end + end_len_utf8;
             for (idx, class) in original_classes[nsm_end..].iter().enumerate() {
-                if *class == BidiClass::NSM {
+                if *class == BidiClass::NSM || processing_classes[nsm_end + idx] == BN {
                     processing_classes[nsm_end + idx] = class_to_set;
                 } else {
                     break;
@@ -321,7 +367,8 @@ pub fn resolve_neutral<D: BidiDataSource>(
     while let Some(mut i) = indices.next() {
         // Process sequences of NI characters.
         let mut ni_run = Vec::new();
-        if is_NI(processing_classes[i]) {
+        // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+        if is_NI(processing_classes[i]) || processing_classes[i] == BN {
             // Consume a run of consecutive NI characters.
             ni_run.push(i);
             let mut next_class;
@@ -469,6 +516,7 @@ pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> L
             (false, R) | (true, L) | (true, EN) | (true, AN) => {
                 levels[i].raise(1).expect("Level number error")
             }
+            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> handled here
             (_, _) => {}
         }
         max_level = max(max_level, levels[i]);

diff --git a/tests/conformance_tests.rs b/tests/conformance_tests.rs
@@ -138,7 +138,7 @@ fn gen_base_levels_for_base_tests(bitset: u8) -> Vec<Option<Level>> {
 }
 
 #[test]
-#[should_panic(expected = "29 test cases failed! (91678 passed)")]
+#[should_panic(expected = "19 test cases failed! (91688 passed)")]
 fn test_character_conformance() {
     let test_data = include_str!("data/BidiCharacterTest.txt");