Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement N0 (bracket matching) #85

Merged
merged 16 commits into from
Dec 20, 2022
Merged
13 changes: 13 additions & 0 deletions src/char_data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ pub fn bidi_class(c: char) -> BidiClass {
bsearch_range_value_table(c, bidi_class_table)
}

/// If this character is an opening bracket according to BidiBrackets.txt,
/// return its corresponding closing bracket.
pub(crate) fn bidi_matched_bracket(c: char) -> Option<(char, bool)> {
for pair in self::tables::bidi_pairs_table {
if pair.0 == c {
return Some((pair.1, true));
} else if pair.1 == c {
return Some((pair.0, false));
}
}
None
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it's worth making this a hashmap or at least binary search?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that ICU4X will eventually not be using this data.

There's no static hashmap in Rust, and we could use a perfect hashing crate but that would be an additional dependency. We could also implement our own perfect hashing which would probably be more efficient.

The table has very few entries so it wasn't clear to me that the complexity of binary search was worth it. We can benchmark if desired.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@robertbastian had some thoughts on this


pub fn is_rtl(bidi_class: BidiClass) -> bool {
match bidi_class {
RLE | RLO | RLI => true,
Expand Down
22 changes: 22 additions & 0 deletions src/char_data/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,3 +508,25 @@ pub const bidi_class_table: &'static [(char, char, BidiClass)] = &[
('\u{f0000}', '\u{ffffd}', L), ('\u{100000}', '\u{10fffd}', L)
];

pub const bidi_pairs_table: &'static [(char, char)] = &[
('\u{28}', '\u{29}'), ('\u{5b}', '\u{5d}'), ('\u{7b}', '\u{7d}'), ('\u{f3a}', '\u{f3b}'),
('\u{f3c}', '\u{f3d}'), ('\u{169b}', '\u{169c}'), ('\u{2045}', '\u{2046}'), ('\u{207d}',
'\u{207e}'), ('\u{208d}', '\u{208e}'), ('\u{2308}', '\u{2309}'), ('\u{230a}', '\u{230b}'),
('\u{2329}', '\u{232a}'), ('\u{2768}', '\u{2769}'), ('\u{276a}', '\u{276b}'), ('\u{276c}',
'\u{276d}'), ('\u{276e}', '\u{276f}'), ('\u{2770}', '\u{2771}'), ('\u{2772}', '\u{2773}'),
('\u{2774}', '\u{2775}'), ('\u{27c5}', '\u{27c6}'), ('\u{27e6}', '\u{27e7}'), ('\u{27e8}',
'\u{27e9}'), ('\u{27ea}', '\u{27eb}'), ('\u{27ec}', '\u{27ed}'), ('\u{27ee}', '\u{27ef}'),
('\u{2983}', '\u{2984}'), ('\u{2985}', '\u{2986}'), ('\u{2987}', '\u{2988}'), ('\u{2989}',
'\u{298a}'), ('\u{298b}', '\u{298c}'), ('\u{298d}', '\u{2990}'), ('\u{298f}', '\u{298e}'),
('\u{2991}', '\u{2992}'), ('\u{2993}', '\u{2994}'), ('\u{2995}', '\u{2996}'), ('\u{2997}',
'\u{2998}'), ('\u{29d8}', '\u{29d9}'), ('\u{29da}', '\u{29db}'), ('\u{29fc}', '\u{29fd}'),
('\u{2e22}', '\u{2e23}'), ('\u{2e24}', '\u{2e25}'), ('\u{2e26}', '\u{2e27}'), ('\u{2e28}',
'\u{2e29}'), ('\u{2e55}', '\u{2e56}'), ('\u{2e57}', '\u{2e58}'), ('\u{2e59}', '\u{2e5a}'),
('\u{2e5b}', '\u{2e5c}'), ('\u{3008}', '\u{3009}'), ('\u{300a}', '\u{300b}'), ('\u{300c}',
'\u{300d}'), ('\u{300e}', '\u{300f}'), ('\u{3010}', '\u{3011}'), ('\u{3014}', '\u{3015}'),
('\u{3016}', '\u{3017}'), ('\u{3018}', '\u{3019}'), ('\u{301a}', '\u{301b}'), ('\u{fe59}',
'\u{fe5a}'), ('\u{fe5b}', '\u{fe5c}'), ('\u{fe5d}', '\u{fe5e}'), ('\u{ff08}', '\u{ff09}'),
('\u{ff3b}', '\u{ff3d}'), ('\u{ff5b}', '\u{ff5d}'), ('\u{ff5f}', '\u{ff60}'), ('\u{ff62}',
'\u{ff63}')
];

12 changes: 12 additions & 0 deletions src/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,16 @@ use crate::BidiClass;
/// character
pub trait BidiDataSource {
fn bidi_class(&self, c: char) -> BidiClass;
/// If this character is a bracket according to BidiBrackets.txt,
/// return its corresponding matched bracket, and whether or not it is an
/// opening bracket
///
/// The default implementation will pull in a small amount of hardcoded data,
/// regardless of the `hardcoded-data` feature. This is in part for convenience
/// (since this data is small and changes less often), and in part so that this method can be
/// added without needing a breaking version bump
Manishearth marked this conversation as resolved.
Show resolved Hide resolved
/// Override this method in your custom data source to prevent the use of hardcoded data.
fn bidi_matched_bracket(&self, c: char) -> Option<(char, bool)> {
crate::char_data::bidi_matched_bracket(c)
}
}
215 changes: 210 additions & 5 deletions src/implicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@

use alloc::vec::Vec;
use core::cmp::max;
use core::ops::Range;

use super::char_data::BidiClass::{self, *};
use super::level::Level;
use super::prepare::{not_removed_by_x9, removed_by_x9, IsolatingRunSequence, LevelRun};
use super::BidiDataSource;

/// 3.3.4 Resolving Weak Types
///
Expand Down Expand Up @@ -137,19 +139,140 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [B
///
/// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn resolve_neutral(
pub fn resolve_neutral<D: BidiDataSource>(
text: &str,
data_source: &D,
sequence: &IsolatingRunSequence,
levels: &[Level],
original_classes: &[BidiClass],
processing_classes: &mut [BidiClass],
) {
// e = embedding direction
let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
let e_is_l = e == BidiClass::L;
// N0. Process bracket pairs.

// Identify the bracket pairs in the current isolating run sequence according to BD16.
let bracket_pairs = identify_bracket_pairs(text, data_source, sequence, original_classes);

// For each bracket-pair element in the list of pairs of text positions
Manishearth marked this conversation as resolved.
Show resolved Hide resolved
// Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly
Manishearth marked this conversation as resolved.
Show resolved Hide resolved
// for indexing as it will include the opening bracket pair but not the closing one
for pair in bracket_pairs {
#[cfg(feature = "std")]
debug_assert!(
pair.start < processing_classes.len(),
"identify_bracket_pairs returned a range that is out of bounds!"
);
#[cfg(feature = "std")]
debug_assert!(
pair.end < processing_classes.len(),
"identify_bracket_pairs returned a range that is out of bounds!"
);
let mut found_l = false;
let mut found_r = false;
let mut class_to_set = None;
// Inspect the bidirectional types of the characters enclosed within the bracket pair.
// Note: the algorithm wants us to inspect the types of the *enclosed* characters,
// not the brackets themselves, however since the brackets will never be L or R, we can
// just scan them as well and not worry about trying to skip them in the array (they may take
// up multiple indices in processing_classes if they're multibyte!).
//
// `pair` is [start, end) so we will end up processing the opening character but not the closing one.
//
// Note: Given that processing_classes has been modified in the previous runs, and resolve_weak
// modifies processing_classes inconsistently at non-character-boundaries,
// this and the later iteration will end up iterating over some obsolete classes.
// This is fine since all we care about is looking for strong
// classes, and strong_classes do not change in resolve_weak. The alternative is calling `.char_indices()`
// on the text (or checking `text.get(idx).is_some()`), which would be a way to avoid hitting these
// processing_classes of bytes not on character boundaries. This is both cleaner and likely to be faster
// (this is worth benchmarking, though!) so we'll stick with the current approach of iterating over processing_classes.
for &class in &processing_classes[pair.clone()] {
if class == BidiClass::R {
found_r = true;
} else if class == BidiClass::L {
found_l = true;
}

// if we have found a character with the class of the embedding direction
// we can bail early
if found_l && e_is_l || found_r {
break;
}
}
// If any strong type (either L or R) matching the embedding direction is found
if found_l && e_is_l || found_r {
Manishearth marked this conversation as resolved.
Show resolved Hide resolved
// set the type for both brackets in the pair to match the embedding direction
class_to_set = Some(e);
// Otherwise, if there is a strong type it must be opposite the embedding direction
} else if found_l || found_r {
// Therefore, test for an established context with a preceding strong type by
// checking backwards before the opening paired bracket
// until the first strong type (L, R, or sos) is found.
// (see note above about processing_classes and character boundaries)
let previous_strong = processing_classes[..pair.start]
.iter()
.copied()
.rev()
.find(|class| *class == BidiClass::L || *class == BidiClass::R)
.unwrap_or(sequence.sos);

// If the preceding strong type is also opposite the embedding direction,
// context is established,
// so set the type for both brackets in the pair to that direction.
// AND
// Otherwise set the type for both brackets in the pair to the embedding direction.
// Either way it gets set to previous_strong
// XXXManishearth perhaps the reason the spec writes these as two separate lines is
// because sos is supposed to be handled differently?
class_to_set = Some(previous_strong);
}

if let Some(class_to_set) = class_to_set {
// update all processing classes corresponding to the start and end elements, as requested.
// We should include all bytes of the character, not the first one.
let start_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
let end_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
for class in &mut processing_classes[pair.start..pair.start + start_len_utf8] {
*class = class_to_set;
}
for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] {
*class = class_to_set;
}
// Any number of characters that had original bidirectional character type NSM prior to the application of
// W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.

// This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
// about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
let nsm_start = pair.start + start_len_utf8;
for (idx, class) in original_classes[nsm_start..].iter().enumerate() {
if *class == BidiClass::NSM {
processing_classes[nsm_start + idx] = class_to_set;
} else {
break;
}
}
let nsm_end = pair.end + end_len_utf8;
for (idx, class) in original_classes[nsm_end..].iter().enumerate() {
if *class == BidiClass::NSM {
processing_classes[nsm_end + idx] = class_to_set;
} else {
break;
}
}
}
// Otherwise, there are no strong types within the bracket pair
// Therefore, do not set the type for that bracket pair
}

// N1 and N2
// indices of every byte in this isolating run sequence
// XXXManishearth Note for later: is it okay to iterate over every index here, since
// that includes char boundaries?
let mut indices = sequence.runs.iter().flat_map(Clone::clone);
let mut prev_class = sequence.sos;

while let Some(mut i) = indices.next() {
// N0. Process bracket pairs.
// TODO

// Process sequences of NI characters.
let mut ni_run = Vec::new();
if is_NI(processing_classes[i]) {
Expand Down Expand Up @@ -203,6 +326,88 @@ pub fn resolve_neutral(
}
}

/// 3.1.3 Identifying Bracket Pairs
///
/// Returns all paired brackets in the source
///
/// <https://www.unicode.org/reports/tr9/#BD16>
fn identify_bracket_pairs<D: BidiDataSource>(
text: &str,
data_source: &D,
run_sequence: &IsolatingRunSequence,
original_classes: &[BidiClass],
) -> Vec<Range<usize>> {
let mut ret = vec![];
let mut stack = vec![];

let index_range = run_sequence.text_range();
let slice = if let Some(slice) = text.get(index_range.clone()) {
slice
} else {
#[cfg(feature = "std")]
std::debug_assert!(
false,
"Found broken indices in isolating run sequence: found indices {}..{} for string {:?}",
index_range.start,
index_range.end,
text
);
return ret;
};

// XXXManishearth perhaps try and coalesce this into one of the earlier
// full-string iterator runs, perhaps explicit::compute()
for (i, ch) in slice.char_indices() {
// all paren characters are ON
// From BidiBrackets.txt:
// > The Unicode property value stability policy guarantees that characters
// > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
if original_classes[i] != BidiClass::ON {
continue;
}

if let Some((matched, is_open)) = data_source.bidi_matched_bracket(ch) {
if is_open {
// If an opening paired bracket is found ...

// ... and there is no room in the stack,
// stop processing BD16 for the remainder of the isolating run sequence.
if stack.len() >= 63 {
break;
}
// ... push its Bidi_Paired_Bracket property value and its text position onto the stack
stack.push((matched, i))
} else {
// If a closing paired bracket is found, do the following

// Declare a variable that holds a reference to the current stack element
// and initialize it with the top element of the stack.
// AND
// Else, if the current stack element is not at the bottom of the stack
for (stack_index, element) in stack.iter().enumerate().rev() {
// Compare the closing paired bracket being inspected or its canonical
// equivalent to the bracket in the current stack element.
if element.0 == ch {
// If the values match, meaning the two characters form a bracket pair, then

// Append the text position in the current stack element together with the
// text position of the closing paired bracket to the list.
ret.push(element.1..i);

// Pop the stack through the current stack element inclusively.
stack.truncate(stack_index);
break;
}
}
}
}
}
// Sort the list of pairs of text positions in ascending order based on
// the text position of the opening paired bracket.
ret.sort_by_key(|r| r.start);
ret
}

/// 3.3.6 Resolving Implicit Levels
///
/// Returns the maximum embedding level in the paragraph.
Expand Down
11 changes: 9 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,14 @@ impl<'text> BidiInfo<'text> {
let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
for sequence in &sequences {
implicit::resolve_weak(sequence, processing_classes);
implicit::resolve_neutral(sequence, levels, processing_classes);
implicit::resolve_neutral(
text,
data_source,
sequence,
levels,
original_classes,
processing_classes,
);
}
implicit::resolve_levels(processing_classes, levels);

Expand Down Expand Up @@ -939,7 +946,7 @@ mod tests {
assert_eq!(reorder_paras("א(ב)ג."), vec![".ג)ב(א"]);

// With mirrorable characters on level boundry
assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["ef].)gh&[דג(בא"]);
assert_eq!(reorder_paras("אב(גד[&ef].)gh"), vec!["gh).]ef&[דג(בא"]);
}

fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {
Expand Down
11 changes: 11 additions & 0 deletions src/prepare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,17 @@ pub fn isolating_run_sequences(
.collect()
}

impl IsolatingRunSequence {
/// Returns the full range of text represented by this isolating run sequence
pub(crate) fn text_range(&self) -> Range<usize> {
if let (Some(start), Some(end)) = (self.runs.first(), self.runs.last()) {
start.start..end.end
} else {
return 0..0;
}
}
}

/// Finds the level runs in a paragraph.
///
/// <http://www.unicode.org/reports/tr9/#BD7>
Expand Down
2 changes: 1 addition & 1 deletion tests/conformance_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ fn gen_base_levels_for_base_tests(bitset: u8) -> Vec<Option<Level>> {
}

#[test]
#[should_panic(expected = "14562 test cases failed! (77145 passed)")]
#[should_panic(expected = "3514 test cases failed! (88193 passed)")]
Manishearth marked this conversation as resolved.
Show resolved Hide resolved
fn test_character_conformance() {
let test_data = include_str!("data/BidiCharacterTest.txt");

Expand Down
Loading