Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement N0 (bracket matching) #85

Merged
merged 16 commits into from
Dec 20, 2022
Merged
18 changes: 17 additions & 1 deletion src/char_data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ use core::cmp::Ordering::{Equal, Greater, Less};

#[cfg(feature = "hardcoded-data")]
use self::tables::bidi_class_table;
use crate::data_source::BidiMatchedOpeningBracket;
use crate::BidiClass::*;
#[cfg(feature = "hardcoded-data")]
use crate::BidiDataSource;

/// Hardcoded Bidi data that ships with the unicode-bidi crate.
///
/// This can be enabled with the default `hardcoded-data` Cargo feature.
Expand All @@ -42,6 +42,22 @@ pub fn bidi_class(c: char) -> BidiClass {
bsearch_range_value_table(c, bidi_class_table)
}

/// If this character is a bracket according to BidiBrackets.txt,
/// return the corresponding *normalized* *opening bracket* of the pair,
/// and whether or not it itself is an opening bracket.
pub(crate) fn bidi_matched_opening_bracket(c: char) -> Option<BidiMatchedOpeningBracket> {
for pair in self::tables::bidi_pairs_table {
if pair.0 == c || pair.1 == c {
let skeleton = pair.2.unwrap_or(pair.0);
return Some(BidiMatchedOpeningBracket {
opening: skeleton,
is_open: pair.0 == c,
});
}
}
None
}

pub fn is_rtl(bidi_class: BidiClass) -> bool {
match bidi_class {
RLE | RLO | RLI => true,
Expand Down
25 changes: 25 additions & 0 deletions src/char_data/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,3 +508,28 @@ pub const bidi_class_table: &'static [(char, char, BidiClass)] = &[
('\u{f0000}', '\u{ffffd}', L), ('\u{100000}', '\u{10fffd}', L)
];

pub const bidi_pairs_table: &'static [(char, char, Option<char>)] = &[
('\u{28}', '\u{29}', None), ('\u{5b}', '\u{5d}', None), ('\u{7b}', '\u{7d}', None), ('\u{f3a}',
'\u{f3b}', None), ('\u{f3c}', '\u{f3d}', None), ('\u{169b}', '\u{169c}', None), ('\u{2045}',
'\u{2046}', None), ('\u{207d}', '\u{207e}', None), ('\u{208d}', '\u{208e}', None), ('\u{2308}',
'\u{2309}', None), ('\u{230a}', '\u{230b}', None), ('\u{2329}', '\u{232a}', Some('\u{3008}')),
('\u{2768}', '\u{2769}', None), ('\u{276a}', '\u{276b}', None), ('\u{276c}', '\u{276d}', None),
('\u{276e}', '\u{276f}', None), ('\u{2770}', '\u{2771}', None), ('\u{2772}', '\u{2773}', None),
('\u{2774}', '\u{2775}', None), ('\u{27c5}', '\u{27c6}', None), ('\u{27e6}', '\u{27e7}', None),
('\u{27e8}', '\u{27e9}', None), ('\u{27ea}', '\u{27eb}', None), ('\u{27ec}', '\u{27ed}', None),
('\u{27ee}', '\u{27ef}', None), ('\u{2983}', '\u{2984}', None), ('\u{2985}', '\u{2986}', None),
('\u{2987}', '\u{2988}', None), ('\u{2989}', '\u{298a}', None), ('\u{298b}', '\u{298c}', None),
('\u{298d}', '\u{2990}', None), ('\u{298f}', '\u{298e}', None), ('\u{2991}', '\u{2992}', None),
('\u{2993}', '\u{2994}', None), ('\u{2995}', '\u{2996}', None), ('\u{2997}', '\u{2998}', None),
('\u{29d8}', '\u{29d9}', None), ('\u{29da}', '\u{29db}', None), ('\u{29fc}', '\u{29fd}', None),
('\u{2e22}', '\u{2e23}', None), ('\u{2e24}', '\u{2e25}', None), ('\u{2e26}', '\u{2e27}', None),
('\u{2e28}', '\u{2e29}', None), ('\u{2e55}', '\u{2e56}', None), ('\u{2e57}', '\u{2e58}', None),
('\u{2e59}', '\u{2e5a}', None), ('\u{2e5b}', '\u{2e5c}', None), ('\u{3008}', '\u{3009}', None),
('\u{300a}', '\u{300b}', None), ('\u{300c}', '\u{300d}', None), ('\u{300e}', '\u{300f}', None),
('\u{3010}', '\u{3011}', None), ('\u{3014}', '\u{3015}', None), ('\u{3016}', '\u{3017}', None),
('\u{3018}', '\u{3019}', None), ('\u{301a}', '\u{301b}', None), ('\u{fe59}', '\u{fe5a}', None),
('\u{fe5b}', '\u{fe5c}', None), ('\u{fe5d}', '\u{fe5e}', None), ('\u{ff08}', '\u{ff09}', None),
('\u{ff3b}', '\u{ff3d}', None), ('\u{ff5b}', '\u{ff5d}', None), ('\u{ff5f}', '\u{ff60}', None),
('\u{ff62}', '\u{ff63}', None)
];

29 changes: 29 additions & 0 deletions src/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,37 @@

use crate::BidiClass;

/// This is the return value of [`BidiDataSource::bidi_matched_opening_bracket()`].
///
/// It represents the matching *normalized* opening bracket for a given bracket in a bracket pair,
/// and whether or not that bracket is opening.
pub struct BidiMatchedOpeningBracket {
/// The corresponding opening bracket in this bracket pair, normalized
///
/// In case of opening brackets, this will be the bracket itself, except for when the bracket
/// is not normalized, in which case it will be the normalized form.
pub opening: char,
/// Whether or not the requested bracket was an opening bracket. True for opening
pub is_open: bool,
}

/// This trait abstracts over a data source that is able to produce the Unicode Bidi class for a given
/// character
pub trait BidiDataSource {
fn bidi_class(&self, c: char) -> BidiClass;
/// If this character is a bracket according to BidiBrackets.txt,
/// return the corresponding *normalized* *opening bracket* of the pair,
/// and whether or not it itself is an opening bracket.
///
/// This effectively buckets brackets into equivalence classes keyed on the
/// normalized opening bracket.
///
/// The default implementation will pull in a small amount of hardcoded data,
/// regardless of the `hardcoded-data` feature. This is in part for convenience
/// (since this data is small and changes less often), and in part so that this method can be
/// added without needing a breaking version bump.
/// Override this method in your custom data source to prevent the use of hardcoded data.
fn bidi_matched_opening_bracket(&self, c: char) -> Option<BidiMatchedOpeningBracket> {
crate::char_data::bidi_matched_opening_bracket(c)
}
}
237 changes: 232 additions & 5 deletions src/implicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@

use alloc::vec::Vec;
use core::cmp::max;
use core::ops::Range;

use super::char_data::BidiClass::{self, *};
use super::level::Level;
use super::prepare::{not_removed_by_x9, removed_by_x9, IsolatingRunSequence, LevelRun};
use super::BidiDataSource;

/// 3.3.4 Resolving Weak Types
///
Expand Down Expand Up @@ -137,19 +139,162 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [B
///
/// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn resolve_neutral(
pub fn resolve_neutral<D: BidiDataSource>(
text: &str,
data_source: &D,
sequence: &IsolatingRunSequence,
levels: &[Level],
original_classes: &[BidiClass],
processing_classes: &mut [BidiClass],
) {
// e = embedding direction
let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
let not_e = if e == BidiClass::L {
BidiClass::R
} else {
BidiClass::L
};
// N0. Process bracket pairs.

// > Identify the bracket pairs in the current isolating run sequence according to BD16.
let bracket_pairs = identify_bracket_pairs(text, data_source, sequence, original_classes);

// > For each bracket-pair element in the list of pairs of text positions
//
// Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly
Manishearth marked this conversation as resolved.
Show resolved Hide resolved
// for indexing as it will include the opening bracket pair but not the closing one
for pair in bracket_pairs {
#[cfg(feature = "std")]
debug_assert!(
pair.start < processing_classes.len(),
"identify_bracket_pairs returned a range that is out of bounds!"
);
#[cfg(feature = "std")]
debug_assert!(
pair.end < processing_classes.len(),
"identify_bracket_pairs returned a range that is out of bounds!"
);
let mut found_e = false;
let mut found_not_e = false;
let mut class_to_set = None;

let start_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
// get the range of characters enclosed
let enclosed = (pair.start + start_len_utf8)..pair.end;
// > Inspect the bidirectional types of the characters enclosed within the bracket pair.
//
// `pair` is [start, end) so we will end up processing the opening character but not the closing one.
//
// Note: Given that processing_classes has been modified in the previous runs, and resolve_weak
// modifies processing_classes inconsistently at non-character-boundaries,
// this and the later iteration will end up iterating over some obsolete classes.
// This is fine since all we care about is looking for strong
// classes, and strong_classes do not change in resolve_weak. The alternative is calling `.char_indices()`
// on the text (or checking `text.get(idx).is_some()`), which would be a way to avoid hitting these
// processing_classes of bytes not on character boundaries. This is both cleaner and likely to be faster
// (this is worth benchmarking, though!) so we'll stick with the current approach of iterating over processing_classes.
for &class in &processing_classes[enclosed] {
if class == e {
found_e = true;
} else if class == not_e {
found_not_e = true;
} else if class == BidiClass::EN || class == BidiClass::AN {
// > Within this scope, bidirectional types EN and AN are treated as R.
if e == BidiClass::L {
found_not_e = true;
} else {
found_e = true;
}
}

// if we have found a character with the class of the embedding direction
// we can bail early
if found_e {
break;
}
}
// > If any strong type (either L or R) matching the embedding direction is found
if found_e {
// > .. set the type for both brackets in the pair to match the embedding direction
class_to_set = Some(e);
// > Otherwise, if there is a strong type it must be opposite the embedding direction
} else if found_not_e {
// Therefore, test for an established context with a preceding strong type by
// checking backwards before the opening paired bracket
// until the first strong type (L, R, or sos) is found.
// (see note above about processing_classes and character boundaries)
let mut previous_strong = processing_classes[..pair.start]
.iter()
.copied()
.rev()
.find(|class| {
*class == BidiClass::L
|| *class == BidiClass::R
|| *class == BidiClass::EN
|| *class == BidiClass::AN
})
.unwrap_or(sequence.sos);

// > Within this scope, bidirectional types EN and AN are treated as R.
if previous_strong == BidiClass::EN || previous_strong == BidiClass::AN {
previous_strong = BidiClass::R;
}

// > If the preceding strong type is also opposite the embedding direction,
// > context is established,
// > so set the type for both brackets in the pair to that direction.
// AND
// > Otherwise set the type for both brackets in the pair to the embedding direction.
// > Either way it gets set to previous_strong
//
// XXXManishearth perhaps the reason the spec writes these as two separate lines is
// because sos is supposed to be handled differently?
class_to_set = Some(previous_strong);
}

if let Some(class_to_set) = class_to_set {
// update all processing classes corresponding to the start and end elements, as requested.
// We should include all bytes of the character, not the first one.
let end_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
for class in &mut processing_classes[pair.start..pair.start + start_len_utf8] {
*class = class_to_set;
}
for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] {
*class = class_to_set;
}
// > Any number of characters that had original bidirectional character type NSM prior to the application of
// > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.

// This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
// about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
let nsm_start = pair.start + start_len_utf8;
for (idx, class) in original_classes[nsm_start..].iter().enumerate() {
if *class == BidiClass::NSM {
processing_classes[nsm_start + idx] = class_to_set;
} else {
break;
}
}
let nsm_end = pair.end + end_len_utf8;
for (idx, class) in original_classes[nsm_end..].iter().enumerate() {
if *class == BidiClass::NSM {
processing_classes[nsm_end + idx] = class_to_set;
} else {
break;
}
}
}
// > Otherwise, there are no strong types within the bracket pair
// > Therefore, do not set the type for that bracket pair
}

// N1 and N2
// indices of every byte in this isolating run sequence
// XXXManishearth Note for later: is it okay to iterate over every index here, since
// that includes char boundaries?
let mut indices = sequence.runs.iter().flat_map(Clone::clone);
let mut prev_class = sequence.sos;

while let Some(mut i) = indices.next() {
// N0. Process bracket pairs.
// TODO

// Process sequences of NI characters.
let mut ni_run = Vec::new();
if is_NI(processing_classes[i]) {
Expand Down Expand Up @@ -203,6 +348,88 @@ pub fn resolve_neutral(
}
}

/// 3.1.3 Identifying Bracket Pairs
///
/// Returns all paired brackets in the source
///
/// <https://www.unicode.org/reports/tr9/#BD16>
fn identify_bracket_pairs<D: BidiDataSource>(
text: &str,
data_source: &D,
run_sequence: &IsolatingRunSequence,
original_classes: &[BidiClass],
) -> Vec<Range<usize>> {
let mut ret = vec![];
let mut stack = vec![];

let index_range = run_sequence.text_range();
let slice = if let Some(slice) = text.get(index_range.clone()) {
slice
} else {
#[cfg(feature = "std")]
std::debug_assert!(
false,
"Found broken indices in isolating run sequence: found indices {}..{} for string {:?}",
index_range.start,
index_range.end,
text
);
return ret;
};

// XXXManishearth perhaps try and coalesce this into one of the earlier
// full-string iterator runs, perhaps explicit::compute()
for (i, ch) in slice.char_indices() {
// all paren characters are ON
// From BidiBrackets.txt:
// > The Unicode property value stability policy guarantees that characters
// > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
if original_classes[i] != BidiClass::ON {
continue;
}

if let Some(matched) = data_source.bidi_matched_opening_bracket(ch) {
if matched.is_open {
// If an opening paired bracket is found ...

// ... and there is no room in the stack,
// stop processing BD16 for the remainder of the isolating run sequence.
if stack.len() >= 63 {
break;
}
// ... push its Bidi_Paired_Bracket property value and its text position onto the stack
stack.push((matched.opening, i))
} else {
// If a closing paired bracket is found, do the following

// Declare a variable that holds a reference to the current stack element
// and initialize it with the top element of the stack.
// AND
// Else, if the current stack element is not at the bottom of the stack
for (stack_index, element) in stack.iter().enumerate().rev() {
// Compare the closing paired bracket being inspected or its canonical
// equivalent to the bracket in the current stack element.
if element.0 == matched.opening {
// If the values match, meaning the two characters form a bracket pair, then

// Append the text position in the current stack element together with the
// text position of the closing paired bracket to the list.
ret.push(element.1..i);

// Pop the stack through the current stack element inclusively.
stack.truncate(stack_index);
break;
}
}
}
}
}
// Sort the list of pairs of text positions in ascending order based on
// the text position of the opening paired bracket.
ret.sort_by_key(|r| r.start);
ret
}

/// 3.3.6 Resolving Implicit Levels
///
/// Returns the maximum embedding level in the paragraph.
Expand Down
11 changes: 9 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,14 @@ impl<'text> BidiInfo<'text> {
let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
for sequence in &sequences {
implicit::resolve_weak(sequence, processing_classes);
implicit::resolve_neutral(sequence, levels, processing_classes);
implicit::resolve_neutral(
text,
data_source,
sequence,
levels,
original_classes,
processing_classes,
);
}
implicit::resolve_levels(processing_classes, levels);

Expand Down Expand Up @@ -939,7 +946,7 @@ mod tests {
assert_eq!(reorder_paras("א(ב)ג."), vec![".ג)ב(א"]);

// With mirrorable characters on level boundry
assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["ef].)gh&[דג(בא"]);
assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["gh).]ef&[דג(בא"]);
}

fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {
Expand Down
Loading