Fix NFKD for accented digraph followed by accent (unicode-org#4530)

Fixes unicode-org#4527
hsivonen · Jan 23, 2024 · 768235d
1 parent 40b418c
commit 768235d
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,11 +4,15 @@
  - [Remove icu_datagen's dep on `fractional`](https://github.com/unicode-org/icu4x/pull/4472)
    - `icu_datagen@1.4.1`
 
+ - Fix normalization of a character whose decomposition contains more than one starter and ends with a non-starter, when that character is followed by a non-starter
+   with a lower Canonical Combining Class than the last character of the decomposition. (https://github.com/unicode-org/icu4x/pull/4530)
+   - `icu_normalizer@1.4.1`
+
 ## icu4x 1.4 (Nov 16, 2023)
 
 - General
   - MSRV is now 1.67
- 
+
 - Components
     - Compiled data updated to CLDR 44 and ICU 74 (https://github.com/unicode-org/icu4x/pull/4245)
     - `icu_calendar`

diff --git a/components/normalizer/src/lib.rs b/components/normalizer/src/lib.rs
@@ -637,7 +637,7 @@ where
                 i += 1;
                 // Half-width kana and iota subscript don't occur in the tails
                 // of these multicharacter decompositions.
-                if decomposition_starts_with_non_starter(trie_value) {
+                if !decomposition_starts_with_non_starter(trie_value) {
                     combining_start = i;
                 }
             }
@@ -676,7 +676,7 @@ where
                 i += 1;
                 // Half-width kana and iota subscript don't occur in the tails
                 // of these multicharacter decompositions.
-                if decomposition_starts_with_non_starter(trie_value) {
+                if !decomposition_starts_with_non_starter(trie_value) {
                     combining_start = i;
                 }
             }

diff --git a/components/normalizer/tests/tests.rs b/components/normalizer/tests/tests.rs
@@ -1308,6 +1308,28 @@ fn test_utf16_basic() {
     );
 }
 
+#[test]
+fn test_accented_digraph() { // Regression test for unicode-org#4527, exercised through NFKD.
+    let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
+    assert_eq!(
+        normalizer.normalize("\u{01C4}\u{0323}"), // precomposed digraph followed by one combining mark
+        "DZ\u{0323}\u{030C}" // the trailing U+0323 must sort before the decomposition's final U+030C
+    );
+    assert_eq!(
+        normalizer.normalize("DZ\u{030C}\u{0323}"), // same text spelled out, marks not yet in canonical order
+        "DZ\u{0323}\u{030C}" // both spellings must normalize to the same string
+    );
+}
+
+#[test]
+fn test_ddd() { // NFD check: a mark following U+0DDD must be ordered against the tail of its expansion.
+    let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
+    assert_eq!(
+        normalizer.normalize("\u{0DDD}\u{0334}"), // U+0DDD followed by the combining mark U+0334
+        "\u{0DD9}\u{0DCF}\u{0334}\u{0DCA}" // U+0334 is expected before the trailing U+0DCA, not after it
+    );
+}
+
 #[test]
 fn test_is_normalized() {
     let nfd: DecomposingNormalizer = DecomposingNormalizer::new_nfd();