Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make the normalizer work with new Unicode 16 normalization behaviors #4860

Merged
merged 9 commits into from
Oct 28, 2024
21 changes: 17 additions & 4 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,19 @@ const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16: u16 = 2;

/// Marker that a complex decomposition isn't round-trippable
/// under re-composition.
const NON_ROUND_TRIP_MARKER: u16 = 1;
///
/// TODO: When taking a data format break, swap this and
/// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER` around
/// to make backward-combiningness use the same bit in all
/// cases.
const NON_ROUND_TRIP_MARKER: u16 = 0b1;

/// Marker that a complex decomposition starts with a starter
/// that can combine backwards.
const BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER: u16 = 0b10;

/// Values above this are treated as a BMP character.
const HIGHEST_MARKER: u16 = NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER;

/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
Expand Down Expand Up @@ -398,6 +410,7 @@ impl CharacterAndTrieValue {
/// Returns `true` if `self.trie_val` matches any of the four
/// backward-combining cases tested below.
///
/// The cases are joined with short-circuiting `||`, so later checks
/// are only evaluated when all earlier ones are `false`.
pub fn can_combine_backwards(&self) -> bool {
// Case 1: the helper reports that the decomposition starts with a non-starter.
decomposition_starts_with_non_starter(self.trie_val)
// Case 2: the trie value is exactly `BACKWARD_COMBINING_STARTER_MARKER`.
|| self.trie_val == BACKWARD_COMBINING_STARTER_MARKER
// Case 3: the low 16 bits, with bit 0 masked off by `& !1`, equal
// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER`, and the high 16 bits
// are non-zero. Masking bit 0 (the value of `NON_ROUND_TRIP_MARKER`) means
// the match succeeds whether or not that bit is also set.
|| (((self.trie_val as u16) & !1) == BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER && (self.trie_val >> 16) != 0) // Combine with the previous condition when taking a data format break
// Case 4: the trie value lies in the inclusive range 0x1161..=0x11C2.
// NOTE(review): this range coincides with the conjoining Hangul jamo
// vowel and trailing-consonant code points (U+1161..=U+11C2); presumably
// the trie value of those characters is in this range — confirm against
// the data format.
|| in_inclusive_range32(self.trie_val, 0x1161, 0x11C2)
}
#[inline(always)]
Expand Down Expand Up @@ -426,7 +439,7 @@ impl CharacterAndTrieValue {
if lead == 0 {
return true;
}
if lead == NON_ROUND_TRIP_MARKER {
if lead <= HIGHEST_MARKER {
return false;
}
if (trail_or_complex & 0x7F) == 0x3C
Expand Down Expand Up @@ -830,14 +843,14 @@ where
} else {
let trail_or_complex = (decomposition >> 16) as u16;
let lead = decomposition as u16;
if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
if lead > HIGHEST_MARKER && trail_or_complex != 0 {
// Decomposition into two BMP characters: starter and non-starter
let starter = char_from_u16(lead);
let combining = char_from_u16(trail_or_complex);
self.buffer
.push(CharacterAndClass::new_with_placeholder(combining));
(starter, 0)
} else if lead > NON_ROUND_TRIP_MARKER {
} else if trail_or_complex == 0 {
if lead != FDFA_MARKER {
debug_assert_ne!(
lead, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16,
Expand Down
12 changes: 1 addition & 11 deletions components/normalizer/src/properties.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,17 +396,7 @@ impl CanonicalDecompositionBorrowed<'_> {
let offset24 = offset - tables.scalars16.len();
if let Some(first_c) = tables.scalars24.get(offset24) {
if len == 1 {
if c != first_c {
return Decomposed::Singleton(first_c);
} else {
// Singleton representation used to avoid
// NFC passthrough of characters that combine
// with starters that can occur as the first
// character of an expansion decomposition.
// See section 5 of
// https://www.unicode.org/L2/L2024/24009-utc178-properties-recs.pdf
return Decomposed::Default;
}
return Decomposed::Singleton(first_c);
}
if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
return Decomposed::Expansion(first_c, second_c);
Expand Down