From 768235d3f16d163c593fba71c89eed6c7f1b61d5 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 18 Jan 2024 16:12:41 +0200 Subject: [PATCH] Fix NFKD for accented digraph followed by accent (#4530) Fixes https://github.com/unicode-org/icu4x/discussions/4527 --- CHANGELOG.md | 6 +++++- components/normalizer/src/lib.rs | 4 ++-- components/normalizer/tests/tests.rs | 22 ++++++++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e21bcc364b..9ba270ae6e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,15 @@ - [Remove icu_datagen's dep on `fractional`](https://github.com/unicode-org/icu4x/pull/4472) - `icu_datagen@1.4.1` + - Fix normalization of a character whose decomposition contains more than one starter and ends with a non-starter, when that character is followed by a non-starter + with a lower Canonical Combining Class than the last character of the decomposition. (https://github.com/unicode-org/icu4x/pull/4530) + - `icu_normalizer@1.4.1` + ## icu4x 1.4 (Nov 16, 2023) - General - MSRV is now 1.67 - + - Components - Compiled data updated to CLDR 44 and ICU 74 (https://github.com/unicode-org/icu4x/pull/4245) - `icu_calendar` diff --git a/components/normalizer/src/lib.rs b/components/normalizer/src/lib.rs index 067b15d933f..9d71e72287a 100644 --- a/components/normalizer/src/lib.rs +++ b/components/normalizer/src/lib.rs @@ -637,7 +637,7 @@ where i += 1; // Half-width kana and iota subscript don't occur in the tails // of these multicharacter decompositions. - if decomposition_starts_with_non_starter(trie_value) { + if !decomposition_starts_with_non_starter(trie_value) { combining_start = i; } } @@ -676,7 +676,7 @@ where i += 1; // Half-width kana and iota subscript don't occur in the tails // of these multicharacter decompositions. 
- if decomposition_starts_with_non_starter(trie_value) { + if !decomposition_starts_with_non_starter(trie_value) { combining_start = i; } } diff --git a/components/normalizer/tests/tests.rs b/components/normalizer/tests/tests.rs index d95486e5e13..f6d1a1e49c5 100644 --- a/components/normalizer/tests/tests.rs +++ b/components/normalizer/tests/tests.rs @@ -1308,6 +1308,28 @@ fn test_utf16_basic() { ); } +#[test] +fn test_accented_digraph() { + let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd(); + assert_eq!( + normalizer.normalize("\u{01C4}\u{0323}"), + "DZ\u{0323}\u{030C}" + ); + assert_eq!( + normalizer.normalize("DZ\u{030C}\u{0323}"), + "DZ\u{0323}\u{030C}" + ); +} + +#[test] +fn test_ddd() { + let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd(); + assert_eq!( + normalizer.normalize("\u{0DDD}\u{0334}"), + "\u{0DD9}\u{0DCF}\u{0334}\u{0DCA}" + ); +} + #[test] fn test_is_normalized() { let nfd: DecomposingNormalizer = DecomposingNormalizer::new_nfd();