Merge pull request #85 from Manishearth/brace-yourself

Implement N0 (bracket matching)
servo · Dec 20, 2022 · cb61bd5 · cb61bd5
2 parents cd99a45 + 02f3ddc
commit cb61bd5
Show file tree

Hide file tree

Showing 8 changed files with 368 additions and 14 deletions.
diff --git a/src/char_data/mod.rs b/src/char_data/mod.rs
@@ -19,10 +19,10 @@ use core::cmp::Ordering::{Equal, Greater, Less};
 
 #[cfg(feature = "hardcoded-data")]
 use self::tables::bidi_class_table;
+use crate::data_source::BidiMatchedOpeningBracket;
 use crate::BidiClass::*;
 #[cfg(feature = "hardcoded-data")]
 use crate::BidiDataSource;
-
 /// Hardcoded Bidi data that ships with the unicode-bidi crate.
 ///
 /// This can be enabled with the default `hardcoded-data` Cargo feature.
@@ -42,6 +42,22 @@ pub fn bidi_class(c: char) -> BidiClass {
     bsearch_range_value_table(c, bidi_class_table)
 }
 
+/// Looks up `c` among the bracket pairs listed according to BidiBrackets.txt.
+///
+/// Returns `None` when `c` is neither the opening nor the closing member of any table entry.
+/// Otherwise returns the *normalized* *opening bracket* of the first entry containing `c`
+/// (the entry's explicit normalized form if it has one, else its own opening bracket),
+/// together with whether `c` itself is that entry's opening bracket.
+pub(crate) fn bidi_matched_opening_bracket(c: char) -> Option<BidiMatchedOpeningBracket> {
+    self::tables::bidi_pairs_table
+        .iter()
+        .find(|&&(open, close, _)| open == c || close == c)
+        .map(|&(open, _, normalized)| BidiMatchedOpeningBracket {
+            opening: normalized.unwrap_or(open),
+            is_open: open == c,
+        })
+}
+
 pub fn is_rtl(bidi_class: BidiClass) -> bool {
     match bidi_class {
         RLE | RLO | RLI => true,

diff --git a/src/char_data/tables.rs b/src/char_data/tables.rs
@@ -508,3 +508,28 @@ pub const bidi_class_table: &'static [(char, char, BidiClass)] = &[
     ('\u{f0000}', '\u{ffffd}', L), ('\u{100000}', '\u{10fffd}', L)
 ];
 
+/// Table of paired brackets, one `(opening, closing, normalized opening)` tuple per pair.
+///
+/// The third element is `Some` only when the opening bracket has a distinct normalized form;
+/// when it is `None`, the opening bracket itself serves as the normalized form.
+pub const bidi_pairs_table: &'static [(char, char, Option<char>)] = &[
+    ('\u{28}', '\u{29}', None), ('\u{5b}', '\u{5d}', None), ('\u{7b}', '\u{7d}', None), ('\u{f3a}',
+    '\u{f3b}', None), ('\u{f3c}', '\u{f3d}', None), ('\u{169b}', '\u{169c}', None), ('\u{2045}',
+    '\u{2046}', None), ('\u{207d}', '\u{207e}', None), ('\u{208d}', '\u{208e}', None), ('\u{2308}',
+    '\u{2309}', None), ('\u{230a}', '\u{230b}', None), ('\u{2329}', '\u{232a}', Some('\u{3008}')),
+    ('\u{2768}', '\u{2769}', None), ('\u{276a}', '\u{276b}', None), ('\u{276c}', '\u{276d}', None),
+    ('\u{276e}', '\u{276f}', None), ('\u{2770}', '\u{2771}', None), ('\u{2772}', '\u{2773}', None),
+    ('\u{2774}', '\u{2775}', None), ('\u{27c5}', '\u{27c6}', None), ('\u{27e6}', '\u{27e7}', None),
+    ('\u{27e8}', '\u{27e9}', None), ('\u{27ea}', '\u{27eb}', None), ('\u{27ec}', '\u{27ed}', None),
+    ('\u{27ee}', '\u{27ef}', None), ('\u{2983}', '\u{2984}', None), ('\u{2985}', '\u{2986}', None),
+    ('\u{2987}', '\u{2988}', None), ('\u{2989}', '\u{298a}', None), ('\u{298b}', '\u{298c}', None),
+    ('\u{298d}', '\u{2990}', None), ('\u{298f}', '\u{298e}', None), ('\u{2991}', '\u{2992}', None),
+    ('\u{2993}', '\u{2994}', None), ('\u{2995}', '\u{2996}', None), ('\u{2997}', '\u{2998}', None),
+    ('\u{29d8}', '\u{29d9}', None), ('\u{29da}', '\u{29db}', None), ('\u{29fc}', '\u{29fd}', None),
+    ('\u{2e22}', '\u{2e23}', None), ('\u{2e24}', '\u{2e25}', None), ('\u{2e26}', '\u{2e27}', None),
+    ('\u{2e28}', '\u{2e29}', None), ('\u{2e55}', '\u{2e56}', None), ('\u{2e57}', '\u{2e58}', None),
+    ('\u{2e59}', '\u{2e5a}', None), ('\u{2e5b}', '\u{2e5c}', None), ('\u{3008}', '\u{3009}', None),
+    ('\u{300a}', '\u{300b}', None), ('\u{300c}', '\u{300d}', None), ('\u{300e}', '\u{300f}', None),
+    ('\u{3010}', '\u{3011}', None), ('\u{3014}', '\u{3015}', None), ('\u{3016}', '\u{3017}', None),
+    ('\u{3018}', '\u{3019}', None), ('\u{301a}', '\u{301b}', None), ('\u{fe59}', '\u{fe5a}', None),
+    ('\u{fe5b}', '\u{fe5c}', None), ('\u{fe5d}', '\u{fe5e}', None), ('\u{ff08}', '\u{ff09}', None),
+    ('\u{ff3b}', '\u{ff3d}', None), ('\u{ff5b}', '\u{ff5d}', None), ('\u{ff5f}', '\u{ff60}', None),
+    ('\u{ff62}', '\u{ff63}', None)
+];
+
diff --git a/src/data_source.rs b/src/data_source.rs
@@ -9,8 +9,37 @@
 
 use crate::BidiClass;
 
+/// This is the return value of [`BidiDataSource::bidi_matched_opening_bracket()`].
+///
+/// It represents the matching *normalized* opening bracket for a given bracket in a bracket pair,
+/// and whether or not that bracket is opening.
+///
+/// The type is plain data (a `char` and a `bool`), so it is cheap to copy and can be compared
+/// and debug-printed directly.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct BidiMatchedOpeningBracket {
+    /// The corresponding opening bracket in this bracket pair, normalized
+    ///
+    /// In case of opening brackets, this will be the bracket itself, except for when the bracket
+    /// is not normalized, in which case it will be the normalized form.
+    pub opening: char,
+    /// Whether or not the requested bracket was an opening bracket. True for opening
+    pub is_open: bool,
+}
+
 /// This trait abstracts over a data source that is able to produce the Unicode Bidi class for a given
 /// character
 pub trait BidiDataSource {
+    /// Returns the Bidi class of the character `c`.
+    ///
+    /// This is the only required method of the trait.
     fn bidi_class(&self, c: char) -> BidiClass;
+    /// If this character is a bracket according to BidiBrackets.txt,
+    /// return the corresponding *normalized* *opening bracket* of the pair,
+    /// and whether or not it itself is an opening bracket.
+    ///
+    /// This effectively buckets brackets into equivalence classes keyed on the
+    /// normalized opening bracket.
+    ///
+    /// The default implementation will pull in a small amount of hardcoded data,
+    /// regardless of the `hardcoded-data` feature. This is in part for convenience
+    /// (since this data is small and changes less often), and in part so that this method can be
+    /// added without needing a breaking version bump.
+    /// Override this method in your custom data source to prevent the use of hardcoded data.
+    fn bidi_matched_opening_bracket(&self, c: char) -> Option<BidiMatchedOpeningBracket> {
+        // Delegate to the crate's built-in lookup; `self` is not consulted.
+        crate::char_data::bidi_matched_opening_bracket(c)
+    }
 }
diff --git a/src/implicit.rs b/src/implicit.rs
@@ -11,10 +11,12 @@
 
 use alloc::vec::Vec;
 use core::cmp::max;
+use core::ops::Range;
 
 use super::char_data::BidiClass::{self, *};
 use super::level::Level;
 use super::prepare::{not_removed_by_x9, removed_by_x9, IsolatingRunSequence, LevelRun};
+use super::BidiDataSource;
 
 /// 3.3.4 Resolving Weak Types
 ///
@@ -137,19 +139,162 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [B
 ///
 /// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
 #[cfg_attr(feature = "flame_it", flamer::flame)]
-pub fn resolve_neutral(
+pub fn resolve_neutral<D: BidiDataSource>(
+    text: &str,
+    data_source: &D,
     sequence: &IsolatingRunSequence,
     levels: &[Level],
+    original_classes: &[BidiClass],
     processing_classes: &mut [BidiClass],
 ) {
+    // e = embedding direction
     let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
+    let not_e = if e == BidiClass::L {
+        BidiClass::R
+    } else {
+        BidiClass::L
+    };
+    // N0. Process bracket pairs.
+
+    // > Identify the bracket pairs in the current isolating run sequence according to BD16.
+    let bracket_pairs = identify_bracket_pairs(text, data_source, sequence, original_classes);
+
+    // > For each bracket-pair element in the list of pairs of text positions
+    //
+    // Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly
+    // for indexing as it will include the opening bracket pair but not the closing one
+    for pair in bracket_pairs {
+        #[cfg(feature = "std")]
+        debug_assert!(
+            pair.start < processing_classes.len(),
+            "identify_bracket_pairs returned a range that is out of bounds!"
+        );
+        #[cfg(feature = "std")]
+        debug_assert!(
+            pair.end < processing_classes.len(),
+            "identify_bracket_pairs returned a range that is out of bounds!"
+        );
+        let mut found_e = false;
+        let mut found_not_e = false;
+        let mut class_to_set = None;
+
+        let start_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
+        // get the range of characters enclosed
+        let enclosed = (pair.start + start_len_utf8)..pair.end;
+        // > Inspect the bidirectional types of the characters enclosed within the bracket pair.
+        //
+        // `enclosed` starts just past the opening bracket and stops before the closing one, so neither bracket is inspected.
+        //
+        // Note: Given that processing_classes has been modified in the previous runs, and resolve_weak
+        // modifies processing_classes inconsistently at non-character-boundaries,
+        // this and the later iteration will end up iterating over some obsolete classes.
+        // This is fine since all we care about is looking for strong
+        // classes, and strong_classes do not change in resolve_weak. The alternative is calling `.char_indices()`
+        // on the text (or checking `text.get(idx).is_some()`), which would be a way to avoid hitting these
+        // processing_classes of bytes not on character boundaries. Iterating over processing_classes directly is
+        // both cleaner and likely to be faster (this is worth benchmarking, though!), so we'll stick with that approach.
+        for &class in &processing_classes[enclosed] {
+            if class == e {
+                found_e = true;
+            } else if class == not_e {
+                found_not_e = true;
+            } else if class == BidiClass::EN || class == BidiClass::AN {
+                // > Within this scope, bidirectional types EN and AN are treated as R.
+                if e == BidiClass::L {
+                    found_not_e = true;
+                } else {
+                    found_e = true;
+                }
+            }
+
+            // if we have found a character with the class of the embedding direction
+            // we can bail early
+            if found_e {
+                break;
+            }
+        }
+        // > If any strong type (either L or R) matching the embedding direction is found
+        if found_e {
+            // > .. set the type for both brackets in the pair to match the embedding direction
+            class_to_set = Some(e);
+        // > Otherwise, if there is a strong type it must be opposite the embedding direction
+        } else if found_not_e {
+            // Therefore, test for an established context with a preceding strong type by
+            // checking backwards before the opening paired bracket
+            // until the first strong type (L, R, or sos) is found.
+            // (see note above about processing_classes and character boundaries)
+            let mut previous_strong = processing_classes[..pair.start]
+                .iter()
+                .copied()
+                .rev()
+                .find(|class| {
+                    *class == BidiClass::L
+                        || *class == BidiClass::R
+                        || *class == BidiClass::EN
+                        || *class == BidiClass::AN
+                })
+                .unwrap_or(sequence.sos);
+
+            // > Within this scope, bidirectional types EN and AN are treated as R.
+            if previous_strong == BidiClass::EN || previous_strong == BidiClass::AN {
+                previous_strong = BidiClass::R;
+            }
+
+            // > If the preceding strong type is also opposite the embedding direction,
+            // > context is established,
+            // > so set the type for both brackets in the pair to that direction.
+            // AND
+            // > Otherwise set the type for both brackets in the pair to the embedding direction.
+            // > Either way it gets set to previous_strong
+            //
+            // XXXManishearth perhaps the reason the spec writes these as two separate lines is
+            // because sos is supposed to be handled differently?
+            class_to_set = Some(previous_strong);
+        }
+
+        if let Some(class_to_set) = class_to_set {
+            // update all processing classes corresponding to the start and end elements, as requested.
+            // We should include all bytes of the character, not just the first one.
+            //
+            // The length of the closing bracket must be measured at `pair.end`: the two brackets of a
+            // pair are different characters and need not have the same encoded length (a custom
+            // `BidiDataSource` decides which characters pair up).
+            let end_len_utf8 = text[pair.end..].chars().next().unwrap().len_utf8();
+            for class in &mut processing_classes[pair.start..pair.start + start_len_utf8] {
+                *class = class_to_set;
+            }
+            for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] {
+                *class = class_to_set;
+            }
+            // > Any number of characters that had original bidirectional character type NSM prior to the application of
+            // > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.
+
+            // This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
+            // about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
+            let nsm_start = pair.start + start_len_utf8;
+            for (idx, class) in original_classes[nsm_start..].iter().enumerate() {
+                if *class == BidiClass::NSM {
+                    processing_classes[nsm_start + idx] = class_to_set;
+                } else {
+                    break;
+                }
+            }
+            let nsm_end = pair.end + end_len_utf8;
+            for (idx, class) in original_classes[nsm_end..].iter().enumerate() {
+                if *class == BidiClass::NSM {
+                    processing_classes[nsm_end + idx] = class_to_set;
+                } else {
+                    break;
+                }
+            }
+        }
+        // > Otherwise, there are no strong types within the bracket pair
+        // > Therefore, do not set the type for that bracket pair
+    }
+
+    // N1 and N2
+    // indices of every byte in this isolating run sequence
+    // XXXManishearth Note for later: is it okay to iterate over every index here, since
+    // that includes char boundaries?
     let mut indices = sequence.runs.iter().flat_map(Clone::clone);
     let mut prev_class = sequence.sos;
-
     while let Some(mut i) = indices.next() {
-        // N0. Process bracket pairs.
-        // TODO
-
         // Process sequences of NI characters.
         let mut ni_run = Vec::new();
         if is_NI(processing_classes[i]) {
@@ -203,6 +348,88 @@ pub fn resolve_neutral(
     }
 }
 
+/// 3.1.3 Identifying Bracket Pairs
+///
+/// Returns all paired brackets in the given isolating run sequence. Each returned range runs
+/// from the text position of the opening bracket to the text position of the closing bracket;
+/// both positions are byte indices into `text` (the same indexing used for `original_classes`).
+/// The list is sorted by the position of the opening bracket.
+///
+/// Only characters that belong to one of the level runs of `run_sequence` are considered.
+/// If the indices of the sequence do not describe a valid slice of `text`, no pairs are returned.
+///
+/// <https://www.unicode.org/reports/tr9/#BD16>
+fn identify_bracket_pairs<D: BidiDataSource>(
+    text: &str,
+    data_source: &D,
+    run_sequence: &IsolatingRunSequence,
+    original_classes: &[BidiClass],
+) -> Vec<Range<usize>> {
+    let mut ret = vec![];
+    let mut stack = vec![];
+
+    let index_range = run_sequence.text_range();
+    if text.get(index_range.clone()).is_none() {
+        #[cfg(feature = "std")]
+        std::debug_assert!(
+            false,
+            "Found broken indices in isolating run sequence: found indices {}..{} for string {:?}",
+            index_range.start,
+            index_range.end,
+            text
+        );
+        return ret;
+    }
+
+    // XXXManishearth perhaps try and coalesce this into one of the earlier
+    // full-string iterator runs, perhaps explicit::compute()
+    //
+    // Walk the level runs one by one instead of the whole text range: text lying between two
+    // runs is not part of this isolating run sequence, so brackets found there must not take
+    // part in the matching.
+    'runs: for run in &run_sequence.runs {
+        let run_text = if let Some(run_text) = text.get(run.clone()) {
+            run_text
+        } else {
+            #[cfg(feature = "std")]
+            std::debug_assert!(
+                false,
+                "Found broken level run in isolating run sequence: found indices {}..{} for string {:?}",
+                run.start,
+                run.end,
+                text
+            );
+            return Vec::new();
+        };
+
+        for (offset, ch) in run_text.char_indices() {
+            // `char_indices()` counts from the start of `run_text`; shift by the start of the run
+            // to get a position that is valid for `text` and `original_classes`.
+            let i = run.start + offset;
+
+            // all paren characters are ON
+            // From BidiBrackets.txt:
+            // > The Unicode property value stability policy guarantees that characters
+            // > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
+            if original_classes[i] != BidiClass::ON {
+                continue;
+            }
+
+            if let Some(matched) = data_source.bidi_matched_opening_bracket(ch) {
+                if matched.is_open {
+                    // If an opening paired bracket is found ...
+
+                    // ... and there is no room in the stack,
+                    // stop processing BD16 for the remainder of the isolating run sequence.
+                    if stack.len() >= 63 {
+                        break 'runs;
+                    }
+                    // ... push its Bidi_Paired_Bracket property value and its text position onto the stack
+                    stack.push((matched.opening, i));
+                } else if let Some(stack_index) = stack
+                    .iter()
+                    .rposition(|element| element.0 == matched.opening)
+                {
+                    // If a closing paired bracket is found, search the stack from the top
+                    // element downwards for an entry whose bracket matches the closing
+                    // bracket (or its canonical equivalent).
+
+                    // The two characters form a bracket pair: append the text position in that
+                    // stack element together with the text position of the closing paired
+                    // bracket to the list.
+                    ret.push(stack[stack_index].1..i);
+
+                    // Pop the stack through the matching stack element inclusively.
+                    stack.truncate(stack_index);
+                }
+            }
+        }
+    }
+    // Sort the list of pairs of text positions in ascending order based on
+    // the text position of the opening paired bracket.
+    ret.sort_by_key(|r| r.start);
+    ret
+}
+
 /// 3.3.6 Resolving Implicit Levels
 ///
 /// Returns the maximum embedding level in the paragraph.

diff --git a/src/lib.rs b/src/lib.rs
@@ -355,7 +355,14 @@ impl<'text> BidiInfo<'text> {
             let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
             for sequence in &sequences {
                 implicit::resolve_weak(sequence, processing_classes);
-                implicit::resolve_neutral(sequence, levels, processing_classes);
+                implicit::resolve_neutral(
+                    text,
+                    data_source,
+                    sequence,
+                    levels,
+                    original_classes,
+                    processing_classes,
+                );
             }
             implicit::resolve_levels(processing_classes, levels);
 
@@ -939,7 +946,7 @@ mod tests {
         assert_eq!(reorder_paras("א(ב)ג."), vec![".ג)ב(א"]);
 
         // With mirrorable characters on level boundary
-        assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["ef].)gh&[דג(בא"]);
+        assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["gh).]ef&[דג(בא"]);
     }
 
     fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {