From b90ddb8cd0545e448b442f8cf629617c26898457 Mon Sep 17 00:00:00 2001 From: Jonathan Kew Date: Wed, 6 Dec 2023 17:18:29 +0000 Subject: [PATCH] [base-dir] Add a function to get the base direction of a text This is a lightweight function equivalent to ICU's ubidi_getBaseDirection, which just determines the base direction of a piece of text. This is intended for use when the caller just wants the base directionality, and no further bidi processing will be required. Returns Direction::Ltr or Direction::Rtl according to the first strong character (bidi type L, R, or AL) found in the text (outside of isolate embeddings). If no strong character is found, returns Direction::Mixed (meaning Neutral in this context, but to avoid breaking API compatibility I didn't want to add a Neutral value to the Direction enum, and creating a new BaseDirection enum here seemed like overkill). --- src/lib.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 00122df..2b8792f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1000,6 +1000,49 @@ fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], leve } } +/// Get the base direction of the text provided according to the Unicode Bidirectional Algorithm. +/// +/// See rules P2 and P3. +/// +/// The base direction is derived from the first character in the string with bidi character type +/// L, R, or AL. If the first such character has type L, Direction::Ltr is returned. If the first +/// such character has type R or AL, Direction::Rtl is returned. +/// +/// If the string does not contain any character of these types (outside of embedded isolate runs), +/// then Direction::Mixed is returned (but should be considered as meaning "neutral" or "unknown", +/// not in fact mixed directions). +/// +/// This is a lightweight function for use when only the base direction is needed and no further +/// bidi processing of the text is needed. 
+///
+/// Only the first paragraph is considered if the text contains paragraph separators.
+#[cfg(feature = "hardcoded-data")]
+#[inline]
+pub fn get_base_direction<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
+    get_base_direction_with_data_source(&HardcodedBidiData, text)
+}
+
+/// Same as `get_base_direction`, but reads bidi classes from the caller-supplied `data_source`.
+pub fn get_base_direction_with_data_source<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
+    data_source: &D,
+    text: &'a T,
+) -> Direction {
+    let mut depth = 0;
+    for ch in text.chars() {
+        match data_source.bidi_class(ch) {
+            LRI | RLI | FSI => depth += 1,
+            PDI if depth > 0 => depth -= 1,
+            L if depth == 0 => return Direction::Ltr,
+            R | AL if depth == 0 => return Direction::Rtl,
+            B => break,
+            _ => {}
+        }
+    }
+    // No strong character was found. Callers normally treat this as Ltr (see rule P3), but Mixed
+    // is returned so that a caller can apply other heuristics to an all-neutral paragraph.
+    Direction::Mixed
+}
+
 /// Implementation of TextSource for UTF-8 text (a string slice).
impl<'text> TextSource<'text> for str { type CharIter = core::str::Chars<'text>; @@ -1736,6 +1779,25 @@ mod tests { assert_eq!(p_mixed.para.range.start, 21); assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL); } + + #[test] + fn test_get_base_direction() { + let tests = vec![ + ("", Direction::Mixed), // return Mixed if no strong character found + ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed), + ("3.14\npi", Direction::Mixed), // only first paragraph is considered + ("[123 'abc']", Direction::Ltr), + ("[123 '\u{0628}' abc", Direction::Rtl), + ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl), // embedded isolate is ignored + ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed), + ]; + + for t in tests { + assert_eq!(get_base_direction(t.0), t.1); + let text = &to_utf16(t.0); + assert_eq!(get_base_direction(text.as_slice()), t.1); + } + } } #[cfg(all(feature = "serde", feature = "hardcoded-data", test))]