Skip to content

Commit

Permalink
Add icu_properties feature to icu_normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
Manishearth committed Sep 17, 2024
1 parent d434507 commit a901761
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 49 deletions.
3 changes: 2 additions & 1 deletion components/normalizer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ all-features = true
[dependencies]
displaydoc = { workspace = true }
icu_collections = { workspace = true }
icu_properties = { workspace = true }
icu_properties = { workspace = true, optional = true, default-features = false }
icu_provider = { workspace = true, features = ["macros"] }
smallvec = { workspace = true }
utf16_iter = { workspace = true }
Expand Down Expand Up @@ -53,6 +53,7 @@ serde = ["dep:serde", "icu_collections/serde", "zerovec/serde", "icu_properties/
datagen = ["serde", "dep:databake", "icu_collections/databake", "zerovec/databake", "icu_properties/datagen"]
experimental = []
compiled_data = ["dep:icu_normalizer_data", "icu_properties/compiled_data"]
icu_properties = ["dep:icu_properties"]

[[bench]]
name = "bench"
Expand Down
81 changes: 42 additions & 39 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,18 @@

extern crate alloc;

// `icu_properties` is an optional dependency (kept optional to minimize deps),
// so canonical combining class values are written as plain numbers here.
// `ccc!(Name, value)` evaluates to `value`; in debug builds with the
// `icu_properties` feature enabled it additionally panics if `value` differs
// from `icu_properties::CanonicalCombiningClass::Name`.
macro_rules! ccc {
    ($variant:ident, $value:expr) => {{
        #[cfg(all(debug_assertions, feature = "icu_properties"))]
        assert!(
            icu_properties::CanonicalCombiningClass::$variant.0 == $value,
            "icu_normalizer has incorrect ccc values"
        );
        $value
    }};
}

pub mod properties;
pub mod provider;
pub mod uts46;
Expand All @@ -84,7 +96,6 @@ use icu_collections::char16trie::Char16Trie;
use icu_collections::char16trie::Char16TrieIterator;
use icu_collections::char16trie::TrieResult;
use icu_collections::codepointtrie::CodePointTrie;
use icu_properties::CanonicalCombiningClass;
use icu_provider::prelude::*;
use provider::CanonicalCompositionsV1Marker;
use provider::CanonicalDecompositionTablesV1Marker;
Expand All @@ -97,6 +108,9 @@ use utf8_iter::Utf8CharsEx;
use write16::Write16;
use zerovec::{zeroslice, ZeroSlice};

const CCC_NOT_REORDERED: u8 = ccc!(NotReordered, 0);
const CCC_ABOVE: u8 = ccc!(Above, 230);

#[derive(Debug)]
enum SupplementPayloadHolder {
Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
Expand Down Expand Up @@ -180,12 +194,12 @@ fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
///
/// The trie value must not be one that signifies a special non-starter
/// decomposition. (Debug-only)
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
fn ccc_from_trie_value(trie_value: u32) -> u8 {
if trie_value_has_ccc(trie_value) {
CanonicalCombiningClass(trie_value as u8)
trie_value as u8
} else {
debug_assert_ne!(trie_value, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER);
CanonicalCombiningClass::NotReordered
0
}
}

Expand Down Expand Up @@ -456,8 +470,8 @@ impl CharacterAndTrieValue {
struct CharacterAndClass(u32);

impl CharacterAndClass {
pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
CharacterAndClass(u32::from(c) | (u32::from(ccc.0) << 24))
pub fn new(c: char, ccc: u8) -> Self {
CharacterAndClass(u32::from(c) | (u32::from(ccc) << 24))
}
pub fn new_with_placeholder(c: char) -> Self {
CharacterAndClass(u32::from(c) | ((0xFF) << 24))
Expand All @@ -473,18 +487,18 @@ impl CharacterAndClass {
// originally.
unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
}
pub fn ccc(&self) -> CanonicalCombiningClass {
CanonicalCombiningClass((self.0 >> 24) as u8)
pub fn ccc(&self) -> u8 {
(self.0 >> 24) as u8
}
pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
pub fn character_and_ccc(&self) -> (char, u8) {
(self.character(), self.ccc())
}
pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) {
if self.0 >> 24 != 0xFF {
return;
}
let scalar = self.0 & 0xFFFFFF;
self.0 = ((ccc_from_trie_value(trie.get32_u32(scalar)).0 as u32) << 24) | scalar;
self.0 = ((ccc_from_trie_value(trie.get32_u32(scalar)) as u32) << 24) | scalar;
}
}

Expand Down Expand Up @@ -727,7 +741,7 @@ where
} else {
'\u{309A}'
},
0xD800 | u32::from(CanonicalCombiningClass::KanaVoicing.0),
0xD800 | ccc!(KanaVoicing, 8),
));
}
let trie_value = supplementary.get32(u32::from(c));
Expand Down Expand Up @@ -920,47 +934,36 @@ where
let mapped = match ch_and_trie_val.character {
'\u{0340}' => {
// COMBINING GRAVE TONE MARK
CharacterAndClass::new('\u{0300}', CanonicalCombiningClass::Above)
CharacterAndClass::new('\u{0300}', CCC_ABOVE)
}
'\u{0341}' => {
// COMBINING ACUTE TONE MARK
CharacterAndClass::new('\u{0301}', CanonicalCombiningClass::Above)
CharacterAndClass::new('\u{0301}', CCC_ABOVE)
}
'\u{0343}' => {
// COMBINING GREEK KORONIS
CharacterAndClass::new('\u{0313}', CanonicalCombiningClass::Above)
CharacterAndClass::new('\u{0313}', CCC_ABOVE)
}
'\u{0344}' => {
// COMBINING GREEK DIALYTIKA TONOS
self.buffer.push(CharacterAndClass::new(
'\u{0308}',
CanonicalCombiningClass::Above,
));
CharacterAndClass::new('\u{0301}', CanonicalCombiningClass::Above)
self.buffer
.push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
CharacterAndClass::new('\u{0301}', CCC_ABOVE)
}
'\u{0F73}' => {
// TIBETAN VOWEL SIGN II
self.buffer.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
CharacterAndClass::new('\u{0F72}', CanonicalCombiningClass::CCC130)
self.buffer.push(CharacterAndClass::new('\u{0F71}', 129));
CharacterAndClass::new('\u{0F72}', 130)
}
'\u{0F75}' => {
// TIBETAN VOWEL SIGN UU
self.buffer.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
CharacterAndClass::new('\u{0F74}', CanonicalCombiningClass::CCC132)
self.buffer.push(CharacterAndClass::new('\u{0F71}', 129));
CharacterAndClass::new('\u{0F74}', 132)
}
'\u{0F81}' => {
// TIBETAN VOWEL SIGN REVERSED II
self.buffer.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
CharacterAndClass::new('\u{0F80}', CanonicalCombiningClass::CCC130)
self.buffer.push(CharacterAndClass::new('\u{0F71}', 129));
CharacterAndClass::new('\u{0F80}', 130)
}
_ => {
// GIGO case
Expand Down Expand Up @@ -1085,7 +1088,7 @@ where
self.decomposition.buffer.clear();
self.decomposition.buffer_pos = 0;
}
if ccc == CanonicalCombiningClass::NotReordered {
if ccc == CCC_NOT_REORDERED {
// Previous decomposition contains a starter. This must
// now become the `unprocessed_starter` for it to have
// a chance to compose with the upcoming characters.
Expand Down Expand Up @@ -1177,7 +1180,7 @@ where
.drain(0..self.decomposition.buffer_pos);
}
self.decomposition.buffer_pos = 0;
if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
if most_recent_skipped_ccc == CCC_NOT_REORDERED {
// We failed to compose a starter. Discontiguous match not allowed.
// We leave the starter in `buffer` for `next()` to find.
return Some(starter);
Expand All @@ -1189,7 +1192,7 @@ where
.get(i)
.map(|c| c.character_and_ccc())
{
if ccc == CanonicalCombiningClass::NotReordered {
if ccc == CCC_NOT_REORDERED {
// Discontiguous match not allowed.
return Some(starter);
}
Expand Down Expand Up @@ -1333,7 +1336,7 @@ macro_rules! composing_normalize_to {
continue;
}
let mut most_recent_skipped_ccc = ccc;
if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
if most_recent_skipped_ccc == CCC_NOT_REORDERED {
// We failed to compose a starter. Discontiguous match not allowed.
// Write the current `starter` we've been composing, make the unmatched
// starter in the buffer the new `starter` (we know it's been decomposed)
Expand All @@ -1358,7 +1361,7 @@ macro_rules! composing_normalize_to {
.get(i)
.map(|c| c.character_and_ccc())
{
if ccc == CanonicalCombiningClass::NotReordered {
if ccc == CCC_NOT_REORDERED {
// Discontiguous match not allowed.
$sink.write_char(starter)?;
for cc in $composition.decomposition.buffer.drain(..i) {
Expand Down
46 changes: 37 additions & 9 deletions components/normalizer/src/properties.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ use crate::NON_ROUND_TRIP_MARKER;
use crate::SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16;
/// want access to the underlying properties e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[cfg(feature = "icu_properties")]
use icu_properties::CanonicalCombiningClass;
use icu_provider::prelude::*;

Expand Down Expand Up @@ -563,29 +564,56 @@ impl<'a> Default for CanonicalCombiningClassMapBorrowed<'a> {
}

impl<'a> CanonicalCombiningClassMapBorrowed<'a> {
/// Look up the canonical combining class for a scalar value
/// Look up the canonical combining class for a scalar value.
///
/// The return value is a `u8` representing the canonical combining class.
/// Enable the `"icu_properties"` feature if you would like to use a typed
/// `CanonicalCombiningClass` instead.
#[inline(always)]
pub fn get(&self, c: char) -> CanonicalCombiningClass {
self.get32(u32::from(c))
pub fn get_u8(&self, c: char) -> u8 {
self.get32_u8(u32::from(c))
}

/// Look up the canonical combining class for a scalar value
/// represented as `u32`. If the argument is outside the scalar
/// value range, `CanonicalCombiningClass::NotReordered` is returned.
pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
/// value range, `Not_Reordered` is returned.
///
/// The return value is a `u8` representing the canonical combining class.
/// Enable the `"icu_properties"` feature if you would like to use a typed
/// `CanonicalCombiningClass` instead.
pub fn get32_u8(&self, c: u32) -> u8 {
let trie_value = self.decompositions.trie.get32(c);
if trie_value_has_ccc(trie_value) {
CanonicalCombiningClass(trie_value as u8)
trie_value as u8
} else if trie_value_indicates_special_non_starter_decomposition(trie_value) {
match c {
0x0340 | 0x0341 | 0x0343 | 0x0344 => CanonicalCombiningClass::Above,
_ => CanonicalCombiningClass::NotReordered,
0x0340 | 0x0341 | 0x0343 | 0x0344 => ccc!(Above, 230),
_ => ccc!(NotReordered, 0),
}
} else {
CanonicalCombiningClass::NotReordered
ccc!(NotReordered, 0)
}
}

/// Look up the canonical combining class for a scalar value
///
/// ✨ *Enabled with the `icu_properties` Cargo feature.*
#[inline(always)]
#[cfg(feature = "icu_properties")]
pub fn get(&self, c: char) -> CanonicalCombiningClass {
CanonicalCombiningClass(self.get_u8(c))
}

/// Look up the canonical combining class for a scalar value
/// represented as `u32`. If the argument is outside the scalar
/// value range, `CanonicalCombiningClass::NotReordered` is returned.
///
/// ✨ *Enabled with the `icu_properties` Cargo feature.*
#[cfg(feature = "icu_properties")]
pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
CanonicalCombiningClass(self.get32_u8(c))
}

/// Construct from compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
Expand Down

0 comments on commit a901761

Please sign in to comment.