Skip to content

Commit

Permalink
Merge pull request #85 from Manishearth/brace-yourself
Browse files Browse the repository at this point in the history
Implement N0 (bracket matching)
  • Loading branch information
Manishearth authored Dec 20, 2022
2 parents cd99a45 + 02f3ddc commit cb61bd5
Show file tree
Hide file tree
Showing 8 changed files with 368 additions and 14 deletions.
18 changes: 17 additions & 1 deletion src/char_data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ use core::cmp::Ordering::{Equal, Greater, Less};

#[cfg(feature = "hardcoded-data")]
use self::tables::bidi_class_table;
use crate::data_source::BidiMatchedOpeningBracket;
use crate::BidiClass::*;
#[cfg(feature = "hardcoded-data")]
use crate::BidiDataSource;

/// Hardcoded Bidi data that ships with the unicode-bidi crate.
///
/// This can be enabled with the default `hardcoded-data` Cargo feature.
Expand All @@ -42,6 +42,22 @@ pub fn bidi_class(c: char) -> BidiClass {
bsearch_range_value_table(c, bidi_class_table)
}

/// If this character is a bracket according to BidiBrackets.txt,
/// return the corresponding *normalized* *opening bracket* of the pair,
/// and whether or not it itself is an opening bracket.
pub(crate) fn bidi_matched_opening_bracket(c: char) -> Option<BidiMatchedOpeningBracket> {
for pair in self::tables::bidi_pairs_table {
if pair.0 == c || pair.1 == c {
let skeleton = pair.2.unwrap_or(pair.0);
return Some(BidiMatchedOpeningBracket {
opening: skeleton,
is_open: pair.0 == c,
});
}
}
None
}

pub fn is_rtl(bidi_class: BidiClass) -> bool {
match bidi_class {
RLE | RLO | RLI => true,
Expand Down
25 changes: 25 additions & 0 deletions src/char_data/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,3 +508,28 @@ pub const bidi_class_table: &'static [(char, char, BidiClass)] = &[
('\u{f0000}', '\u{ffffd}', L), ('\u{100000}', '\u{10fffd}', L)
];

pub const bidi_pairs_table: &'static [(char, char, Option<char>)] = &[
('\u{28}', '\u{29}', None), ('\u{5b}', '\u{5d}', None), ('\u{7b}', '\u{7d}', None), ('\u{f3a}',
'\u{f3b}', None), ('\u{f3c}', '\u{f3d}', None), ('\u{169b}', '\u{169c}', None), ('\u{2045}',
'\u{2046}', None), ('\u{207d}', '\u{207e}', None), ('\u{208d}', '\u{208e}', None), ('\u{2308}',
'\u{2309}', None), ('\u{230a}', '\u{230b}', None), ('\u{2329}', '\u{232a}', Some('\u{3008}')),
('\u{2768}', '\u{2769}', None), ('\u{276a}', '\u{276b}', None), ('\u{276c}', '\u{276d}', None),
('\u{276e}', '\u{276f}', None), ('\u{2770}', '\u{2771}', None), ('\u{2772}', '\u{2773}', None),
('\u{2774}', '\u{2775}', None), ('\u{27c5}', '\u{27c6}', None), ('\u{27e6}', '\u{27e7}', None),
('\u{27e8}', '\u{27e9}', None), ('\u{27ea}', '\u{27eb}', None), ('\u{27ec}', '\u{27ed}', None),
('\u{27ee}', '\u{27ef}', None), ('\u{2983}', '\u{2984}', None), ('\u{2985}', '\u{2986}', None),
('\u{2987}', '\u{2988}', None), ('\u{2989}', '\u{298a}', None), ('\u{298b}', '\u{298c}', None),
('\u{298d}', '\u{2990}', None), ('\u{298f}', '\u{298e}', None), ('\u{2991}', '\u{2992}', None),
('\u{2993}', '\u{2994}', None), ('\u{2995}', '\u{2996}', None), ('\u{2997}', '\u{2998}', None),
('\u{29d8}', '\u{29d9}', None), ('\u{29da}', '\u{29db}', None), ('\u{29fc}', '\u{29fd}', None),
('\u{2e22}', '\u{2e23}', None), ('\u{2e24}', '\u{2e25}', None), ('\u{2e26}', '\u{2e27}', None),
('\u{2e28}', '\u{2e29}', None), ('\u{2e55}', '\u{2e56}', None), ('\u{2e57}', '\u{2e58}', None),
('\u{2e59}', '\u{2e5a}', None), ('\u{2e5b}', '\u{2e5c}', None), ('\u{3008}', '\u{3009}', None),
('\u{300a}', '\u{300b}', None), ('\u{300c}', '\u{300d}', None), ('\u{300e}', '\u{300f}', None),
('\u{3010}', '\u{3011}', None), ('\u{3014}', '\u{3015}', None), ('\u{3016}', '\u{3017}', None),
('\u{3018}', '\u{3019}', None), ('\u{301a}', '\u{301b}', None), ('\u{fe59}', '\u{fe5a}', None),
('\u{fe5b}', '\u{fe5c}', None), ('\u{fe5d}', '\u{fe5e}', None), ('\u{ff08}', '\u{ff09}', None),
('\u{ff3b}', '\u{ff3d}', None), ('\u{ff5b}', '\u{ff5d}', None), ('\u{ff5f}', '\u{ff60}', None),
('\u{ff62}', '\u{ff63}', None)
];

29 changes: 29 additions & 0 deletions src/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,37 @@

use crate::BidiClass;

/// This is the return value of [`BidiDataSource::bidi_matched_opening_bracket()`].
///
/// It represents the matching *normalized* opening bracket for a given bracket in a bracket pair,
/// and whether or not that bracket is opening.
pub struct BidiMatchedOpeningBracket {
/// The corresponding opening bracket in this bracket pair, normalized
///
/// In case of opening brackets, this will be the bracket itself, except for when the bracket
/// is not normalized, in which case it will be the normalized form.
pub opening: char,
/// Whether or not the requested bracket was an opening bracket. True for opening
pub is_open: bool,
}

/// This trait abstracts over a data source that is able to produce the Unicode Bidi class for a given
/// character
pub trait BidiDataSource {
fn bidi_class(&self, c: char) -> BidiClass;
/// If this character is a bracket according to BidiBrackets.txt,
/// return the corresponding *normalized* *opening bracket* of the pair,
/// and whether or not it itself is an opening bracket.
///
/// This effectively buckets brackets into equivalence classes keyed on the
/// normalized opening bracket.
///
/// The default implementation will pull in a small amount of hardcoded data,
/// regardless of the `hardcoded-data` feature. This is in part for convenience
/// (since this data is small and changes less often), and in part so that this method can be
/// added without needing a breaking version bump.
/// Override this method in your custom data source to prevent the use of hardcoded data.
fn bidi_matched_opening_bracket(&self, c: char) -> Option<BidiMatchedOpeningBracket> {
crate::char_data::bidi_matched_opening_bracket(c)
}
}
237 changes: 232 additions & 5 deletions src/implicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@

use alloc::vec::Vec;
use core::cmp::max;
use core::ops::Range;

use super::char_data::BidiClass::{self, *};
use super::level::Level;
use super::prepare::{not_removed_by_x9, removed_by_x9, IsolatingRunSequence, LevelRun};
use super::BidiDataSource;

/// 3.3.4 Resolving Weak Types
///
Expand Down Expand Up @@ -137,19 +139,162 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [B
///
/// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn resolve_neutral(
pub fn resolve_neutral<D: BidiDataSource>(
text: &str,
data_source: &D,
sequence: &IsolatingRunSequence,
levels: &[Level],
original_classes: &[BidiClass],
processing_classes: &mut [BidiClass],
) {
// e = embedding direction
let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
let not_e = if e == BidiClass::L {
BidiClass::R
} else {
BidiClass::L
};
// N0. Process bracket pairs.

// > Identify the bracket pairs in the current isolating run sequence according to BD16.
let bracket_pairs = identify_bracket_pairs(text, data_source, sequence, original_classes);

// > For each bracket-pair element in the list of pairs of text positions
//
// Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly
// for indexing as it will include the opening bracket pair but not the closing one
for pair in bracket_pairs {
#[cfg(feature = "std")]
debug_assert!(
pair.start < processing_classes.len(),
"identify_bracket_pairs returned a range that is out of bounds!"
);
#[cfg(feature = "std")]
debug_assert!(
pair.end < processing_classes.len(),
"identify_bracket_pairs returned a range that is out of bounds!"
);
let mut found_e = false;
let mut found_not_e = false;
let mut class_to_set = None;

let start_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
// get the range of characters enclosed
let enclosed = (pair.start + start_len_utf8)..pair.end;
// > Inspect the bidirectional types of the characters enclosed within the bracket pair.
//
// `pair` is [start, end) so we will end up processing the opening character but not the closing one.
//
// Note: Given that processing_classes has been modified in the previous runs, and resolve_weak
// modifies processing_classes inconsistently at non-character-boundaries,
// this and the later iteration will end up iterating over some obsolete classes.
// This is fine since all we care about is looking for strong
// classes, and strong_classes do not change in resolve_weak. The alternative is calling `.char_indices()`
// on the text (or checking `text.get(idx).is_some()`), which would be a way to avoid hitting these
// processing_classes of bytes not on character boundaries. This is both cleaner and likely to be faster
// (this is worth benchmarking, though!) so we'll stick with the current approach of iterating over processing_classes.
for &class in &processing_classes[enclosed] {
if class == e {
found_e = true;
} else if class == not_e {
found_not_e = true;
} else if class == BidiClass::EN || class == BidiClass::AN {
// > Within this scope, bidirectional types EN and AN are treated as R.
if e == BidiClass::L {
found_not_e = true;
} else {
found_e = true;
}
}

// if we have found a character with the class of the embedding direction
// we can bail early
if found_e {
break;
}
}
// > If any strong type (either L or R) matching the embedding direction is found
if found_e {
// > .. set the type for both brackets in the pair to match the embedding direction
class_to_set = Some(e);
// > Otherwise, if there is a strong type it must be opposite the embedding direction
} else if found_not_e {
// Therefore, test for an established context with a preceding strong type by
// checking backwards before the opening paired bracket
// until the first strong type (L, R, or sos) is found.
// (see note above about processing_classes and character boundaries)
let mut previous_strong = processing_classes[..pair.start]
.iter()
.copied()
.rev()
.find(|class| {
*class == BidiClass::L
|| *class == BidiClass::R
|| *class == BidiClass::EN
|| *class == BidiClass::AN
})
.unwrap_or(sequence.sos);

// > Within this scope, bidirectional types EN and AN are treated as R.
if previous_strong == BidiClass::EN || previous_strong == BidiClass::AN {
previous_strong = BidiClass::R;
}

// > If the preceding strong type is also opposite the embedding direction,
// > context is established,
// > so set the type for both brackets in the pair to that direction.
// AND
// > Otherwise set the type for both brackets in the pair to the embedding direction.
// > Either way it gets set to previous_strong
//
// XXXManishearth perhaps the reason the spec writes these as two separate lines is
// because sos is supposed to be handled differently?
class_to_set = Some(previous_strong);
}

if let Some(class_to_set) = class_to_set {
// update all processing classes corresponding to the start and end elements, as requested.
// We should include all bytes of the character, not the first one.
let end_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
for class in &mut processing_classes[pair.start..pair.start + start_len_utf8] {
*class = class_to_set;
}
for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] {
*class = class_to_set;
}
// > Any number of characters that had original bidirectional character type NSM prior to the application of
// > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.

// This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
// about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
let nsm_start = pair.start + start_len_utf8;
for (idx, class) in original_classes[nsm_start..].iter().enumerate() {
if *class == BidiClass::NSM {
processing_classes[nsm_start + idx] = class_to_set;
} else {
break;
}
}
let nsm_end = pair.end + end_len_utf8;
for (idx, class) in original_classes[nsm_end..].iter().enumerate() {
if *class == BidiClass::NSM {
processing_classes[nsm_end + idx] = class_to_set;
} else {
break;
}
}
}
// > Otherwise, there are no strong types within the bracket pair
// > Therefore, do not set the type for that bracket pair
}

// N1 and N2
// indices of every byte in this isolating run sequence
// XXXManishearth Note for later: is it okay to iterate over every index here, since
// that includes char boundaries?
let mut indices = sequence.runs.iter().flat_map(Clone::clone);
let mut prev_class = sequence.sos;

while let Some(mut i) = indices.next() {
// N0. Process bracket pairs.
// TODO

// Process sequences of NI characters.
let mut ni_run = Vec::new();
if is_NI(processing_classes[i]) {
Expand Down Expand Up @@ -203,6 +348,88 @@ pub fn resolve_neutral(
}
}

/// 3.1.3 Identifying Bracket Pairs
///
/// Returns all paired brackets in the source
///
/// <https://www.unicode.org/reports/tr9/#BD16>
fn identify_bracket_pairs<D: BidiDataSource>(
text: &str,
data_source: &D,
run_sequence: &IsolatingRunSequence,
original_classes: &[BidiClass],
) -> Vec<Range<usize>> {
let mut ret = vec![];
let mut stack = vec![];

let index_range = run_sequence.text_range();
let slice = if let Some(slice) = text.get(index_range.clone()) {
slice
} else {
#[cfg(feature = "std")]
std::debug_assert!(
false,
"Found broken indices in isolating run sequence: found indices {}..{} for string {:?}",
index_range.start,
index_range.end,
text
);
return ret;
};

// XXXManishearth perhaps try and coalesce this into one of the earlier
// full-string iterator runs, perhaps explicit::compute()
for (i, ch) in slice.char_indices() {
// all paren characters are ON
// From BidiBrackets.txt:
// > The Unicode property value stability policy guarantees that characters
// > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
if original_classes[i] != BidiClass::ON {
continue;
}

if let Some(matched) = data_source.bidi_matched_opening_bracket(ch) {
if matched.is_open {
// If an opening paired bracket is found ...

// ... and there is no room in the stack,
// stop processing BD16 for the remainder of the isolating run sequence.
if stack.len() >= 63 {
break;
}
// ... push its Bidi_Paired_Bracket property value and its text position onto the stack
stack.push((matched.opening, i))
} else {
// If a closing paired bracket is found, do the following

// Declare a variable that holds a reference to the current stack element
// and initialize it with the top element of the stack.
// AND
// Else, if the current stack element is not at the bottom of the stack
for (stack_index, element) in stack.iter().enumerate().rev() {
// Compare the closing paired bracket being inspected or its canonical
// equivalent to the bracket in the current stack element.
if element.0 == matched.opening {
// If the values match, meaning the two characters form a bracket pair, then

// Append the text position in the current stack element together with the
// text position of the closing paired bracket to the list.
ret.push(element.1..i);

// Pop the stack through the current stack element inclusively.
stack.truncate(stack_index);
break;
}
}
}
}
}
// Sort the list of pairs of text positions in ascending order based on
// the text position of the opening paired bracket.
ret.sort_by_key(|r| r.start);
ret
}

/// 3.3.6 Resolving Implicit Levels
///
/// Returns the maximum embedding level in the paragraph.
Expand Down
11 changes: 9 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,14 @@ impl<'text> BidiInfo<'text> {
let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
for sequence in &sequences {
implicit::resolve_weak(sequence, processing_classes);
implicit::resolve_neutral(sequence, levels, processing_classes);
implicit::resolve_neutral(
text,
data_source,
sequence,
levels,
original_classes,
processing_classes,
);
}
implicit::resolve_levels(processing_classes, levels);

Expand Down Expand Up @@ -939,7 +946,7 @@ mod tests {
assert_eq!(reorder_paras("א(ב)ג."), vec![".ג)ב(א"]);

// With mirrorable characters on level boundry
assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["ef].)gh&[דג(בא"]);
assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["gh).]ef&[דג(בא"]);
}

fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {
Expand Down
Loading

0 comments on commit cb61bd5

Please sign in to comment.