implement dutch titlecasing

unicode-org · Jun 28, 2023 · f6ade85 · f6ade85
1 parent eb2577c
commit f6ade85
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 0 deletions.
diff --git a/experimental/casemapping/src/casemapping.rs b/experimental/casemapping/src/casemapping.rs
@@ -281,6 +281,9 @@ impl CaseMapping {
     ///
     /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root), "Եւ երևանի");
     /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy")), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
+    ///
+    /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root), "Ijkdijk");
+    /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl")), "IJkdijk"); // Dutch IJ digraph
     /// ```
     pub fn titlecase_segment_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String {
         self.data

diff --git a/experimental/casemapping/src/internals.rs b/experimental/casemapping/src/internals.rs
@@ -14,6 +14,8 @@ use core::fmt;
 use icu_locid::LanguageIdentifier;
 use writeable::Writeable;
 
+const ACUTE: char = '\u{301}'; // U+0301 COMBINING ACUTE ACCENT
+
 // Used to control the behavior of CaseMapping::fold.
 // Currently only used to decide whether to use Turkic (T) mappings for dotted/dotless i.
 #[derive(Default)]
@@ -143,6 +145,18 @@ impl<'data> CaseMappingV1<'data> {
             !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
         );
 
+        // ICU4C's non-standard extension for Dutch IJ titlecasing.
+        // It is handled here instead of in full_lower_special_case because J does not have
+        // conditional special casemapping.
+        if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
+            // When titlecasing, a J found immediately after an I at the beginning of the segment
+            // should also uppercase. They are both allowed to have an acute accent but it must
+            // be present on both letters or neither. They may not have any other combining marks.
+            if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
+                return FullMappingResult::CodePoint('J');
+            }
+        }
+
         let data = self.lookup_data(c);
         if !data.has_exception() {
             if data.is_relevant_to(kind) {
@@ -648,4 +662,56 @@ impl<'a> ContextIterator<'a> {
         }
         false
     }
+
+    /// Checks the context around a `j`/`J` and returns true if it is immediately preceded by an
+    /// `i`/`I` (optionally with an acute accent) that is the very first thing in `self.before`.
+    /// The acute accent must be present on both letters or on neither for this to return true;
+    /// any other combining mark on either letter (see `mapping.dot_type`) makes it return false.
+    fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMappingV1) -> bool {
+        let mut before = self.before.chars().rev(); // scan `self.before` from its end backwards
+        let mut i_has_acute = false;
+        loop {
+            match before.next() {
+                Some('i') | Some('I') => break,
+                Some('í') | Some('Í') => { // single-code-point i with acute
+                    i_has_acute = true;
+                    break;
+                }
+                Some(ACUTE) => i_has_acute = true, // combining acute: keep scanning for the i
+                _ => return false, // any other character, or `self.before` is exhausted
+            }
+        }
+
+        if before.next().is_some() {
+            // Something precedes the i, so the pair is not at the beginning; no special casing.
+            return false;
+        }
+        let mut j_has_acute = false;
+        for c in self.after.chars() {
+            if c == ACUTE {
+                j_has_acute = true;
+                continue; // keep looking: a further, different combining mark must still bail
+            }
+            // We are supposed to check that `j` has no other combining marks aside
+            // from potentially an acute accent. Once we hit the first non-combining mark
+            // we are done.
+            //
+            // ICU4C checks for `gc=Mn` to determine if something is a combining mark,
+            // however this requires extra data (and is the *only* point in the casemapping algorithm
+            // where there is a direct dependency on properties data not mediated by the casemapping data trie).
+            //
+            // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
+            //
+            // See https://unicode-org.atlassian.net/browse/ICU-22429
+            match mapping.dot_type(c) {
+                // Not a combining character; ccc = 0
+                DotType::NoDot | DotType::SoftDotted => break,
+                // found combining character, bail
+                _ => return false,
+            }
+        }
+
+        // either both should have an acute accent, or none. this is an XNOR operation
+        !(j_has_acute ^ i_has_acute)
+    }
 }
diff --git a/experimental/casemapping/tests/conversions.rs b/experimental/casemapping/tests/conversions.rs
@@ -214,3 +214,55 @@ fn test_armenian() {
     assert_eq!(cm.titlecase_segment_to_string(ew, &west), "Եւ");
     assert_eq!(cm.titlecase_segment_to_string(yerevan, &west), "Երևանի");
 }
+
+#[test]
+fn test_dutch() { // Dutch (`nl`) titlecasing of a segment-initial IJ digraph
+    let cm = CaseMapping::new();
+    let nl = langid!("nl");
+
+    assert_eq!(cm.titlecase_segment_to_string("ijssel", &nl), "IJssel"); // leading ij: both upper
+    assert_eq!(cm.titlecase_segment_to_string("igloo", &nl), "Igloo"); // i without j: only i upper
+    assert_eq!(cm.titlecase_segment_to_string("IJMUIDEN", &nl), "IJmuiden"); // J kept, rest lower
+
+    assert_eq!(cm.titlecase_segment_to_string("ij", &nl), "IJ"); // the bare pair, no trailing text
+    assert_eq!(cm.titlecase_segment_to_string("IJ", &nl), "IJ");
+    assert_eq!(cm.titlecase_segment_to_string("íj́", &nl), "ÍJ́"); // acute on both letters: a pair
+    assert_eq!(cm.titlecase_segment_to_string("ÍJ́", &nl), "ÍJ́");
+    assert_eq!(cm.titlecase_segment_to_string("íJ́", &nl), "ÍJ́");
+    assert_eq!(cm.titlecase_segment_to_string("Ij́", &nl), "Ij́"); // acute only on the j: no pair
+    assert_eq!(cm.titlecase_segment_to_string("ij́", &nl), "Ij́");
+    assert_eq!(cm.titlecase_segment_to_string("ïj́", &nl), "Ïj́"); // i with diaeresis: no pair
+    assert_eq!(
+        cm.titlecase_segment_to_string("íj\u{0308}", &nl), // non-acute mark on the j: no pair
+        "Íj\u{0308}"
+    );
+    assert_eq!(
+        cm.titlecase_segment_to_string("íj́\u{1D16E}", &nl), // extra mark after the acute: no pair
+        "Íj́\u{1D16E}"
+    );
+    assert_eq!(
+        cm.titlecase_segment_to_string("íj\u{1ABE}", &nl), // j followed by U+1ABE: no pair
+        "Íj\u{1ABE}"
+    );
+
+    assert_eq!(cm.titlecase_segment_to_string("ijabc", &nl), "IJabc"); // same cases, longer word
+    assert_eq!(cm.titlecase_segment_to_string("IJabc", &nl), "IJabc");
+    assert_eq!(cm.titlecase_segment_to_string("íj́abc", &nl), "ÍJ́abc");
+    assert_eq!(cm.titlecase_segment_to_string("ÍJ́abc", &nl), "ÍJ́abc");
+    assert_eq!(cm.titlecase_segment_to_string("íJ́abc", &nl), "ÍJ́abc");
+    assert_eq!(cm.titlecase_segment_to_string("Ij́abc", &nl), "Ij́abc");
+    assert_eq!(cm.titlecase_segment_to_string("ij́abc", &nl), "Ij́abc");
+    assert_eq!(cm.titlecase_segment_to_string("ïj́abc", &nl), "Ïj́abc");
+    assert_eq!(
+        cm.titlecase_segment_to_string("íjabc\u{0308}", &nl), // acute on i only: no pair
+        "Íjabc\u{0308}"
+    );
+    assert_eq!(
+        cm.titlecase_segment_to_string("íj́abc\u{1D16E}", &nl), // mark after c, not j: still a pair
+        "ÍJ́abc\u{1D16E}"
+    );
+    assert_eq!(
+        cm.titlecase_segment_to_string("íjabc\u{1ABE}", &nl), // acute on i only: no pair
+        "Íjabc\u{1ABE}"
+    );
+}