Skip to content

Commit

Permalink
[base-dir] Add a function to get the base direction of a text
Browse files Browse the repository at this point in the history
This is a lightweight function equivalent to ICU's ubidi_getBaseDirection,
which just determines the base direction of a piece of text. This is intended
for use when the caller just wants the base directionality, and no further
bidi processing will be required.

Returns Direction::Ltr or Direction::Rtl according to the first strong character
(bidi type L, R, or AL) found in the text (outside of isolate embeddings).

If no strong character is found, returns Direction::Mixed. In this context Mixed
means "neutral": I didn't want to break API compatibility by adding a Neutral
value to the Direction enum, and creating a new BaseDirection enum just for this
function seemed like overkill.
  • Loading branch information
jfkthame committed Dec 6, 2023
1 parent e699bf1 commit b90ddb8
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1000,6 +1000,49 @@ fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], leve
}
}

/// Determine the base direction of `text` using the crate's hardcoded bidi data.
///
/// This follows rules P2 and P3 of the Unicode Bidirectional Algorithm: the text is scanned for
/// the first character whose bidi class is L, R, or AL, skipping anything inside an isolate
/// (between an LRI/RLI/FSI and its matching PDI).
///
/// * First strong character has class L: returns `Direction::Ltr`.
/// * First strong character has class R or AL: returns `Direction::Rtl`.
/// * No strong character found outside of isolates: returns `Direction::Mixed`. In this case
///   the value should be read as "neutral" or "unknown"; it does not indicate that the text
///   actually mixes directions.
///
/// Only the first paragraph is examined; scanning stops at the first paragraph separator.
///
/// This is a lightweight alternative for callers that only want the base direction and will
/// not run any further bidi processing on the text.
#[cfg(feature = "hardcoded-data")]
#[inline]
pub fn get_base_direction<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
    let hardcoded = HardcodedBidiData;
    get_base_direction_with_data_source(&hardcoded, text)
}

/// Determine the base direction of `text`, looking up bidi classes through `data_source`.
///
/// The characters of `text` are scanned in order. Characters inside an isolate (after an LRI,
/// RLI, or FSI and before the matching PDI) are ignored. The first character outside of any
/// isolate with class L yields `Direction::Ltr`; the first with class R or AL yields
/// `Direction::Rtl`. Scanning stops at the first paragraph separator (class B), so only the
/// first paragraph is considered.
///
/// If no such strong character is found, `Direction::Mixed` is returned, which here stands for
/// "neutral" rather than genuinely mixed directions.
pub fn get_base_direction_with_data_source<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
    data_source: &D,
    text: &'a T,
) -> Direction {
    // Count of isolate initiators seen so far that have not yet been closed by a PDI.
    let mut open_isolates = 0;
    for ch in text.chars() {
        match data_source.bidi_class(ch) {
            // A paragraph separator ends the scan regardless of isolate depth.
            B => break,
            LRI | RLI | FSI => open_isolates += 1,
            // An unmatched PDI (no open isolate) is simply ignored.
            PDI => {
                if open_isolates > 0 {
                    open_isolates -= 1;
                }
            }
            // Strong characters only count when they are not nested inside an isolate.
            L if open_isolates == 0 => return Direction::Ltr,
            R | AL if open_isolates == 0 => return Direction::Rtl,
            _ => {}
        }
    }
    // No strong character was found. Callers will normally treat this as Ltr (rule P3), but
    // Mixed is returned instead of Ltr so that a caller wanting to apply its own heuristics to
    // an all-neutral paragraph can distinguish the two cases.
    Direction::Mixed
}

/// Implementation of TextSource for UTF-8 text (a string slice).
impl<'text> TextSource<'text> for str {
type CharIter = core::str::Chars<'text>;
Expand Down Expand Up @@ -1736,6 +1779,25 @@ mod tests {
assert_eq!(p_mixed.para.range.start, 21);
assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL);
}

#[test]
fn test_get_base_direction() {
    // Each entry pairs an input text with the base direction expected for it.
    let cases = vec![
        // No strong character at all: Mixed is returned.
        ("", Direction::Mixed),
        ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed),
        // Only the first paragraph is considered.
        ("3.14\npi", Direction::Mixed),
        ("[123 'abc']", Direction::Ltr),
        ("[123 '\u{0628}' abc", Direction::Rtl),
        // Content of an embedded isolate is ignored.
        ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl),
        ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed),
    ];

    for (input, expected) in cases {
        // UTF-8 input.
        assert_eq!(get_base_direction(input), expected);
        // The same text as UTF-16 must give the same answer.
        let utf16 = to_utf16(input);
        assert_eq!(get_base_direction(utf16.as_slice()), expected);
    }
}
}

#[cfg(all(feature = "serde", feature = "hardcoded-data", test))]
Expand Down

0 comments on commit b90ddb8

Please sign in to comment.