[lib] Implement L1 rule

To be able to implement L1, we need access to more information from `BidiInfo`, namely `original_classes` of the `text`, in `visual_runs()`, which would mean it should pass through `reorder_line()`. The fact that information from `BidiInfo` is needed for both steps of the public API (generating `BidiInfo` and consuming it per-paragraph/per-level) made me change the API design and move these methods into `impl BidiInfo`. Then, since we needed access to `text` for every `BidiInfo` consumption, I added a reference to `text` to `BidiInfo`, which also enables more compile-time checks for `BidiInfo` isntance not outliving the text in the user code. NOTE: We are already breaking API in version 0.3.0 and paving for full spec support is a good reason to do so, IMHO. The L1 rule works by one pass on the text of the line. Conformance Test: this implementation reduces the number of failures from 60494 to 23770 (out of total 256747 cases). Fix #2
servo · May 12, 2017 · 6427532 · 6427532
1 parent 19d9858
commit 6427532
Show file tree

Hide file tree

Showing 8 changed files with 462 additions and 357 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -1,3 +1,4 @@
 This software was written by the following people:
 
 Matt Brubeck <mbrubeck@limpet.net>
+Behnam Esfahbod <behnam@zwnj.org>
diff --git a/src/char_data/mod.rs b/src/char_data/mod.rs
@@ -24,6 +24,13 @@ pub fn bidi_class(c: char) -> BidiClass {
     bsearch_range_value_table(c, bidi_class_table)
 }
 
+pub fn is_rtl(bidi_class: BidiClass) -> bool {
+    match bidi_class {
+        RLE | RLO | RLI => true,
+        _ => false,
+    }
+}
+
 fn bsearch_range_value_table(c: char, r: &'static [(char, char, BidiClass)]) -> BidiClass {
     match r.binary_search_by(
         |&(lo, hi, _)| if lo <= c && c <= hi {
@@ -45,7 +52,7 @@ fn bsearch_range_value_table(c: char, r: &'static [(char, char, BidiClass)]) ->
 }
 
 #[cfg(test)]
-mod test {
+mod tests {
     use super::*;
 
     #[test]

diff --git a/src/explicit.rs b/src/explicit.rs
@@ -11,21 +11,21 @@
 //!
 //! http://www.unicode.org/reports/tr9/#Explicit_Levels_and_Directions
 
-use super::char_data::BidiClass;
+use super::char_data::{BidiClass, is_rtl};
 use super::level::Level;
 
 use BidiClass::*;
 
 /// Compute explicit embedding levels for one paragraph of text (X1-X8).
 ///
-/// `classes[i]` must contain the BidiClass of the char at byte index `i`,
+/// `processing_classes[i]` must contain the BidiClass of the char at byte index `i`,
 /// for each char in `text`.
 pub fn compute(
     text: &str,
     para_level: Level,
     initial_classes: &[BidiClass],
     levels: &mut [Level],
-    classes: &mut [BidiClass],
+    processing_classes: &mut [BidiClass],
 ) {
     assert!(text.len() == initial_classes.len());
 
@@ -41,13 +41,8 @@ pub fn compute(
         match initial_classes[i] {
             // Rules X2-X5c
             RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
-                let char_is_rtl = match initial_classes[i] {
-                    RLE | RLO | RLI => true,
-                    _ => false,
-                };
-
                 let last_level = stack.last().level;
-                let new_level = if char_is_rtl {
+                let new_level = if is_rtl(initial_classes[i]) {
                     last_level.new_explicit_next_rtl()
                 } else {
                     last_level.new_explicit_next_ltr()
@@ -58,8 +53,8 @@ pub fn compute(
                 if is_isolate {
                     levels[i] = last_level;
                     match stack.last().status {
-                        OverrideStatus::RTL => classes[i] = R,
-                        OverrideStatus::LTR => classes[i] = L,
+                        OverrideStatus::RTL => processing_classes[i] = R,
+                        OverrideStatus::LTR => processing_classes[i] = L,
                         _ => {}
                     }
                 }
@@ -108,8 +103,8 @@ pub fn compute(
                 let last = stack.last();
                 levels[i] = last.level;
                 match last.status {
-                    OverrideStatus::RTL => classes[i] = R,
-                    OverrideStatus::LTR => classes[i] = L,
+                    OverrideStatus::RTL => processing_classes[i] = R,
+                    OverrideStatus::LTR => processing_classes[i] = L,
                     _ => {}
                 }
             }
@@ -135,16 +130,16 @@ pub fn compute(
                 let last = stack.last();
                 levels[i] = last.level;
                 match last.status {
-                    OverrideStatus::RTL => classes[i] = R,
-                    OverrideStatus::LTR => classes[i] = L,
+                    OverrideStatus::RTL => processing_classes[i] = R,
+                    OverrideStatus::LTR => processing_classes[i] = L,
                     _ => {}
                 }
             }
         }
         // Handle multi-byte characters.
         for j in 1..c.len_utf8() {
             levels[i + j] = levels[i];
-            classes[i + j] = classes[i];
+            processing_classes[i + j] = processing_classes[i];
         }
     }
 }

diff --git a/src/implicit.rs b/src/implicit.rs
@@ -20,7 +20,7 @@ use BidiClass::*;
 /// 3.3.4 Resolving Weak Types
 ///
 /// http://www.unicode.org/reports/tr9/#Resolving_Weak_Types
-pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass]) {
+pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [BidiClass]) {
     // FIXME (#8): This function applies steps W1-W6 in a single pass.  This can produce
     // incorrect results in cases where a "later" rule changes the value of `prev_class` seen
     // by an "earlier" rule.  We should either split this into separate passes, or preserve
@@ -41,38 +41,38 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
         .flat_map(id as fn(LevelRun) -> LevelRun);
 
     while let Some(i) = indices.next() {
-        match classes[i] {
+        match processing_classes[i] {
             // http://www.unicode.org/reports/tr9/#W1
             NSM => {
-                classes[i] = match prev_class {
+                processing_classes[i] = match prev_class {
                     RLI | LRI | FSI | PDI => ON,
                     _ => prev_class,
                 };
             }
             EN => {
                 if last_strong_is_al {
                     // W2. If previous strong char was AL, change EN to AN.
-                    classes[i] = AN;
+                    processing_classes[i] = AN;
                 } else {
                     // W5. If a run of ETs is adjacent to an EN, change the ETs to EN.
                     for j in &et_run_indices {
-                        classes[*j] = EN;
+                        processing_classes[*j] = EN;
                     }
                     et_run_indices.clear();
                 }
             }
             // http://www.unicode.org/reports/tr9/#W3
-            AL => classes[i] = R,
+            AL => processing_classes[i] = R,
 
             // http://www.unicode.org/reports/tr9/#W4
             ES | CS => {
                 let next_class = indices
                     .clone()
-                    .map(|j| classes[j])
+                    .map(|j| processing_classes[j])
                     .filter(not_removed_by_x9)
                     .next()
                     .unwrap_or(sequence.eos);
-                classes[i] = match (prev_class, classes[i], next_class) {
+                processing_classes[i] = match (prev_class, processing_classes[i], next_class) {
                     (EN, ES, EN) | (EN, CS, EN) => EN,
                     (AN, CS, AN) => AN,
                     (_, _, _) => ON,
@@ -81,7 +81,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
             // http://www.unicode.org/reports/tr9/#W5
             ET => {
                 match prev_class {
-                    EN => classes[i] = EN,
+                    EN => processing_classes[i] = EN,
                     _ => et_run_indices.push(i), // In case this is followed by an EN.
                 }
             }
@@ -92,7 +92,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
             }
         }
 
-        prev_class = classes[i];
+        prev_class = processing_classes[i];
         match prev_class {
             L | R => {
                 last_strong_is_al = false;
@@ -105,7 +105,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
         if prev_class != ET {
             // W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
             for j in &et_run_indices {
-                classes[*j] = ON;
+                processing_classes[*j] = ON;
             }
             et_run_indices.clear();
         }
@@ -115,9 +115,9 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
     let mut last_strong_is_l = sequence.sos == L;
     for run in &sequence.runs {
         for i in run.clone() {
-            match classes[i] {
+            match processing_classes[i] {
                 EN if last_strong_is_l => {
-                    classes[i] = L;
+                    processing_classes[i] = L;
                 }
                 L => {
                     last_strong_is_l = true;
@@ -137,7 +137,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
 pub fn resolve_neutral(
     sequence: &IsolatingRunSequence,
     levels: &[Level],
-    classes: &mut [BidiClass],
+    processing_classes: &mut [BidiClass],
 ) {
     let mut indices = sequence.runs.iter().flat_map(Clone::clone);
     let mut prev_class = sequence.sos;
@@ -154,18 +154,18 @@ pub fn resolve_neutral(
 
         // Process sequences of NI characters.
         let mut ni_run = Vec::new();
-        if ni(classes[i]) {
+        if ni(processing_classes[i]) {
             // Consume a run of consecutive NI characters.
             ni_run.push(i);
             let mut next_class;
             loop {
                 match indices.next() {
                     Some(j) => {
                         i = j;
-                        if removed_by_x9(classes[i]) {
+                        if removed_by_x9(processing_classes[i]) {
                             continue;
                         }
-                        next_class = classes[j];
+                        next_class = processing_classes[j];
                         if ni(next_class) {
                             ni_run.push(i);
                         } else {
@@ -187,11 +187,11 @@ pub fn resolve_neutral(
                 (_, _) => levels[i].bidi_class(),
             };
             for j in &ni_run {
-                classes[*j] = new_class;
+                processing_classes[*j] = new_class;
             }
             ni_run.clear();
         }
-        prev_class = classes[i];
+        prev_class = processing_classes[i];
     }
 }
 
@@ -200,12 +200,12 @@ pub fn resolve_neutral(
 /// Returns the maximum embedding level in the paragraph.
 ///
 /// http://www.unicode.org/reports/tr9/#Resolving_Implicit_Levels
-pub fn resolve_levels(classes: &[BidiClass], levels: &mut [Level]) -> Level {
+pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> Level {
     let mut max_level = Level::ltr();
 
-    assert!(classes.len() == levels.len());
+    assert!(original_classes.len() == levels.len());
     for i in 0..levels.len() {
-        match (levels[i].is_rtl(), classes[i]) {
+        match (levels[i].is_rtl(), original_classes[i]) {
             // http://www.unicode.org/reports/tr9/#I1
             (false, R) => levels[i].raise(1).expect("Level number error"),
             (false, AN) | (false, EN) => levels[i].raise(2).expect("Level number error"),

diff --git a/src/level.rs b/src/level.rs
@@ -228,7 +228,7 @@ impl<'a> PartialEq<&'a str> for Level {
 }
 
 #[cfg(test)]
-mod test {
+mod tests {
     use super::*;
 
     #[test]