Skip to content

Commit

Permalink
Handle canonical equivalence
Browse files Browse the repository at this point in the history
  • Loading branch information
Manishearth committed Dec 19, 2022
1 parent 1069c3b commit ea43d44
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 42 deletions.
14 changes: 7 additions & 7 deletions src/char_data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@ pub fn bidi_class(c: char) -> BidiClass {
bsearch_range_value_table(c, bidi_class_table)
}

/// If this character is an opening bracket according to BidiBrackets.txt,
/// return its corresponding closing bracket.
pub(crate) fn bidi_matched_bracket(c: char) -> Option<(char, bool)> {
/// If this character is a bracket according to BidiBrackets.txt,
/// return the corresponding *normalized* *opening bracket* of the pair,
/// and whether or not it itself is an opening bracket.
pub(crate) fn bidi_matched_opening_bracket(c: char) -> Option<(char, bool)> {
for pair in self::tables::bidi_pairs_table {
if pair.0 == c {
return Some((pair.1, true));
} else if pair.1 == c {
return Some((pair.0, false));
if pair.0 == c || pair.1 == c {
let skeleton = pair.2.unwrap_or(pair.0);
return Some((skeleton, pair.0 == c));
}
}
None
Expand Down
43 changes: 23 additions & 20 deletions src/char_data/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,25 +508,28 @@ pub const bidi_class_table: &'static [(char, char, BidiClass)] = &[
('\u{f0000}', '\u{ffffd}', L), ('\u{100000}', '\u{10fffd}', L)
];

pub const bidi_pairs_table: &'static [(char, char)] = &[
('\u{28}', '\u{29}'), ('\u{5b}', '\u{5d}'), ('\u{7b}', '\u{7d}'), ('\u{f3a}', '\u{f3b}'),
('\u{f3c}', '\u{f3d}'), ('\u{169b}', '\u{169c}'), ('\u{2045}', '\u{2046}'), ('\u{207d}',
'\u{207e}'), ('\u{208d}', '\u{208e}'), ('\u{2308}', '\u{2309}'), ('\u{230a}', '\u{230b}'),
('\u{2329}', '\u{232a}'), ('\u{2768}', '\u{2769}'), ('\u{276a}', '\u{276b}'), ('\u{276c}',
'\u{276d}'), ('\u{276e}', '\u{276f}'), ('\u{2770}', '\u{2771}'), ('\u{2772}', '\u{2773}'),
('\u{2774}', '\u{2775}'), ('\u{27c5}', '\u{27c6}'), ('\u{27e6}', '\u{27e7}'), ('\u{27e8}',
'\u{27e9}'), ('\u{27ea}', '\u{27eb}'), ('\u{27ec}', '\u{27ed}'), ('\u{27ee}', '\u{27ef}'),
('\u{2983}', '\u{2984}'), ('\u{2985}', '\u{2986}'), ('\u{2987}', '\u{2988}'), ('\u{2989}',
'\u{298a}'), ('\u{298b}', '\u{298c}'), ('\u{298d}', '\u{2990}'), ('\u{298f}', '\u{298e}'),
('\u{2991}', '\u{2992}'), ('\u{2993}', '\u{2994}'), ('\u{2995}', '\u{2996}'), ('\u{2997}',
'\u{2998}'), ('\u{29d8}', '\u{29d9}'), ('\u{29da}', '\u{29db}'), ('\u{29fc}', '\u{29fd}'),
('\u{2e22}', '\u{2e23}'), ('\u{2e24}', '\u{2e25}'), ('\u{2e26}', '\u{2e27}'), ('\u{2e28}',
'\u{2e29}'), ('\u{2e55}', '\u{2e56}'), ('\u{2e57}', '\u{2e58}'), ('\u{2e59}', '\u{2e5a}'),
('\u{2e5b}', '\u{2e5c}'), ('\u{3008}', '\u{3009}'), ('\u{300a}', '\u{300b}'), ('\u{300c}',
'\u{300d}'), ('\u{300e}', '\u{300f}'), ('\u{3010}', '\u{3011}'), ('\u{3014}', '\u{3015}'),
('\u{3016}', '\u{3017}'), ('\u{3018}', '\u{3019}'), ('\u{301a}', '\u{301b}'), ('\u{fe59}',
'\u{fe5a}'), ('\u{fe5b}', '\u{fe5c}'), ('\u{fe5d}', '\u{fe5e}'), ('\u{ff08}', '\u{ff09}'),
('\u{ff3b}', '\u{ff3d}'), ('\u{ff5b}', '\u{ff5d}'), ('\u{ff5f}', '\u{ff60}'), ('\u{ff62}',
'\u{ff63}')
/// Paired-bracket table, emitted by `tools/generate.py` from BidiBrackets.txt.
///
/// Each entry is `(opening, closing, normalized_opening)`:
///
/// * `opening` and `closing` are the two brackets of one pair.
/// * `normalized_opening` is `Some(c)` when the generator recorded a
///   single-code-point decomposition `c` for the opening bracket (for example
///   U+2329 carries `Some(U+3008)`), and `None` otherwise. Lookups fall back to
///   `opening` itself when it is `None`, so brackets whose opening characters
///   share a decomposition end up keyed on the same character.
pub const bidi_pairs_table: &'static [(char, char, Option<char>)] = &[
('\u{28}', '\u{29}', None), ('\u{5b}', '\u{5d}', None), ('\u{7b}', '\u{7d}', None), ('\u{f3a}',
'\u{f3b}', None), ('\u{f3c}', '\u{f3d}', None), ('\u{169b}', '\u{169c}', None), ('\u{2045}',
'\u{2046}', None), ('\u{207d}', '\u{207e}', None), ('\u{208d}', '\u{208e}', None), ('\u{2308}',
'\u{2309}', None), ('\u{230a}', '\u{230b}', None), ('\u{2329}', '\u{232a}', Some('\u{3008}')),
('\u{2768}', '\u{2769}', None), ('\u{276a}', '\u{276b}', None), ('\u{276c}', '\u{276d}', None),
('\u{276e}', '\u{276f}', None), ('\u{2770}', '\u{2771}', None), ('\u{2772}', '\u{2773}', None),
('\u{2774}', '\u{2775}', None), ('\u{27c5}', '\u{27c6}', None), ('\u{27e6}', '\u{27e7}', None),
('\u{27e8}', '\u{27e9}', None), ('\u{27ea}', '\u{27eb}', None), ('\u{27ec}', '\u{27ed}', None),
('\u{27ee}', '\u{27ef}', None), ('\u{2983}', '\u{2984}', None), ('\u{2985}', '\u{2986}', None),
('\u{2987}', '\u{2988}', None), ('\u{2989}', '\u{298a}', None), ('\u{298b}', '\u{298c}', None),
('\u{298d}', '\u{2990}', None), ('\u{298f}', '\u{298e}', None), ('\u{2991}', '\u{2992}', None),
('\u{2993}', '\u{2994}', None), ('\u{2995}', '\u{2996}', None), ('\u{2997}', '\u{2998}', None),
('\u{29d8}', '\u{29d9}', None), ('\u{29da}', '\u{29db}', None), ('\u{29fc}', '\u{29fd}', None),
('\u{2e22}', '\u{2e23}', None), ('\u{2e24}', '\u{2e25}', None), ('\u{2e26}', '\u{2e27}', None),
('\u{2e28}', '\u{2e29}', None), ('\u{2e55}', '\u{2e56}', None), ('\u{2e57}', '\u{2e58}', None),
('\u{2e59}', '\u{2e5a}', None), ('\u{2e5b}', '\u{2e5c}', None), ('\u{3008}', '\u{3009}', None),
('\u{300a}', '\u{300b}', None), ('\u{300c}', '\u{300d}', None), ('\u{300e}', '\u{300f}', None),
('\u{3010}', '\u{3011}', None), ('\u{3014}', '\u{3015}', None), ('\u{3016}', '\u{3017}', None),
('\u{3018}', '\u{3019}', None), ('\u{301a}', '\u{301b}', None), ('\u{fe59}', '\u{fe5a}', None),
('\u{fe5b}', '\u{fe5c}', None), ('\u{fe5d}', '\u{fe5e}', None), ('\u{ff08}', '\u{ff09}', None),
('\u{ff3b}', '\u{ff3d}', None), ('\u{ff5b}', '\u{ff5d}', None), ('\u{ff5f}', '\u{ff60}', None),
('\u{ff62}', '\u{ff63}', None)
];

11 changes: 7 additions & 4 deletions src/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,18 @@ use crate::BidiClass;
pub trait BidiDataSource {
fn bidi_class(&self, c: char) -> BidiClass;
/// If this character is a bracket according to BidiBrackets.txt,
/// return its corresponding matched bracket, and whether or not it is an
/// opening bracket
/// return the corresponding *normalized* *opening bracket* of the pair,
/// and whether or not it itself is an opening bracket.
///
/// This effectively buckets brackets into equivalence classes keyed on the
/// normalized opening bracket.
///
/// The default implementation will pull in a small amount of hardcoded data,
/// regardless of the `hardcoded-data` feature. This is in part for convenience
/// (since this data is small and changes less often), and in part so that this method can be
/// added without needing a breaking version bump.
/// Override this method in your custom data source to prevent the use of hardcoded data.
fn bidi_matched_bracket(&self, c: char) -> Option<(char, bool)> {
crate::char_data::bidi_matched_bracket(c)
fn bidi_matched_opening_bracket(&self, c: char) -> Option<(char, bool)> {
crate::char_data::bidi_matched_opening_bracket(c)
}
}
6 changes: 3 additions & 3 deletions src/implicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ fn identify_bracket_pairs<D: BidiDataSource>(
continue;
}

if let Some((matched, is_open)) = data_source.bidi_matched_bracket(ch) {
if let Some((opening, is_open)) = data_source.bidi_matched_opening_bracket(ch) {
if is_open {
// If an opening paired bracket is found ...

Expand All @@ -381,7 +381,7 @@ fn identify_bracket_pairs<D: BidiDataSource>(
break;
}
// ... push its Bidi_Paired_Bracket property value and its text position onto the stack
stack.push((matched, i))
stack.push((opening, i))
} else {
// If a closing paired bracket is found, do the following

Expand All @@ -392,7 +392,7 @@ fn identify_bracket_pairs<D: BidiDataSource>(
for (stack_index, element) in stack.iter().enumerate().rev() {
// Compare the closing paired bracket being inspected or its canonical
// equivalent to the bracket in the current stack element.
if element.0 == ch {
if element.0 == opening {
// If the values match, meaning the two characters form a bracket pair, then

// Append the text position in the current stack element together with the
Expand Down
2 changes: 1 addition & 1 deletion tests/conformance_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ fn gen_base_levels_for_base_tests(bitset: u8) -> Vec<Option<Level>> {
}

#[test]
#[should_panic(expected = "69 test cases failed! (91638 passed)")]
#[should_panic(expected = "65 test cases failed! (91642 passed)")]
fn test_character_conformance() {
let test_data = include_str!("data/BidiCharacterTest.txt");

Expand Down
25 changes: 18 additions & 7 deletions tools/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def open_data(name):
def is_surrogate(n):
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]

def load_bidi_pairs():
def load_bidi_pairs(on_decomps):
fetch_data(BIDI_BRACKETS_NAME)
arr = []
for line in fileinput.input(os.path.join(DATA_DIR, BIDI_BRACKETS_NAME)):
Expand All @@ -69,12 +69,21 @@ def load_bidi_pairs():
continue
cp1 = int(data[0], 16);
cp2 = int(data[1], 16);
arr += [(cp1, cp2)]
decomp = None
if cp1 in on_decomps:
decomp = int(on_decomps[cp1], 16)
arr += [(cp1, cp2, decomp)]
return arr

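# Returns (group_categories, on_decomps), where:
#  - group_categories is the result of group_categories() on bidi properties;
#  - on_decomps is a map containing canonical equivalents for ON characters
#    only: code point -> decomposition, stored as the hex string read from the
#    data file. Only single-code-point decompositions are kept; any
#    decomposition containing a space (a multi-code-point sequence or a tagged
#    compatibility mapping) is skipped.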
def load_unicode_data():
fetch_data(UNICODE_DATA_NAME)
udict = {};
# Decompositions of all ON characters that have them
on_decomps = {}

range_start = -1;
for line in fileinput.input(os.path.join(DATA_DIR, UNICODE_DATA_NAME)):
Expand Down Expand Up @@ -103,6 +112,8 @@ def load_unicode_data():

if bidi not in bidi_class:
bidi_class[bidi] = []
if len(decomp) != 0 and " " not in decomp:
on_decomps[code] = decomp
bidi_class[bidi].append(code)

# Default Bidi_Class for unassigned codepoints.
Expand All @@ -124,7 +135,7 @@ def load_unicode_data():
if not code in udict:
bidi_class[default].append(code)

return group_categories(bidi_class)
return (group_categories(bidi_class), on_decomps)

def group_categories(cats):
cats_out = []
Expand Down Expand Up @@ -223,8 +234,8 @@ def emit_bidi_module(file_, bidi_class_table, cats, bidi_pairs_table):
file_,
"bidi_pairs_table",
bidi_pairs_table,
"&'static [(char, char)]",
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])),
"&'static [(char, char, Option<char>)]",
pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), "Some(%s)" % escape_char(x[2]) if x[2] else "None"),
)

def get_unicode_version():
Expand All @@ -249,8 +260,8 @@ def get_unicode_version():
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)

(bidi_categories, bidi_class_table) = load_unicode_data()
bidi_pairs_table = load_bidi_pairs()
((bidi_categories, bidi_class_table), on_decomps) = load_unicode_data()
bidi_pairs_table = load_bidi_pairs(on_decomps)
emit_bidi_module(file_, bidi_class_table, bidi_categories, bidi_pairs_table)

# Fetch test data files
Expand Down

0 comments on commit ea43d44

Please sign in to comment.