Skip to content

Commit

Permalink
implement dutch titlecasing
Browse files Browse the repository at this point in the history
  • Loading branch information
Manishearth committed Jun 28, 2023
1 parent eb2577c commit f6ade85
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 0 deletions.
3 changes: 3 additions & 0 deletions experimental/casemapping/src/casemapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,9 @@ impl CaseMapping {
///
/// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root), "Եւ երևանի");
/// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy")), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
///
/// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root), "Ijkdijk");
/// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl")), "IJkdijk"); // Dutch IJ digraph
/// ```
pub fn titlecase_segment_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String {
self.data
Expand Down
66 changes: 66 additions & 0 deletions experimental/casemapping/src/internals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ use core::fmt;
use icu_locid::LanguageIdentifier;
use writeable::Writeable;

/// U+0301 COMBINING ACUTE ACCENT, the only combining mark the Dutch IJ check accepts.
const ACUTE: char = '\u{0301}';

// Used to control the behavior of CaseMapping::fold.
// Currently only used to decide whether to use Turkic (T) mappings for dotted/dotless i.
#[derive(Default)]
Expand Down Expand Up @@ -143,6 +145,18 @@ impl<'data> CaseMappingV1<'data> {
!IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
);

// ICU4C's non-standard extension for Dutch IJ titlecasing.
// It is handled here instead of in full_lower_special_case because J does not have
// conditional special casemapping.
if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
// When titlecasing, a J found immediately after an I at the beginning of the segment
// should also uppercase. They are both allowed to have an acute accent but it must
// be present on both letters or neither. They may not have any other combining marks.
if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
return FullMappingResult::CodePoint('J');
}
}

let data = self.lookup_data(c);
if !data.has_exception() {
if data.is_relevant_to(kind) {
Expand Down Expand Up @@ -648,4 +662,56 @@ impl<'a> ContextIterator<'a> {
}
false
}

/// Decides whether a `j`/`J` is the second half of a Dutch "IJ" pair that opens the string.
///
/// `self.before` must consist solely of an `i`/`I`, optionally carrying an acute accent
/// (either precomposed or as trailing U+0301 marks). `self.after` is then scanned for
/// marks attached to the `j`: U+0301 is accepted, any other combining character rejects
/// the pair. The result is true only when the acute accent is present on both letters
/// or on neither. No other accents are handled.
fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMappingV1) -> bool {
    // Walk backwards from the `j`, skipping combining acutes, until the `i` is reached.
    let mut preceding = self.before.chars().rev();
    let mut saw_combining_acute = false;
    let i_has_acute = loop {
        match preceding.next() {
            Some(ACUTE) => saw_combining_acute = true,
            Some('i') | Some('I') => break saw_combining_acute,
            // Precomposed U+00ED (i with acute) / U+00CD (I with acute).
            Some('\u{ED}') | Some('\u{CD}') => break true,
            // Anything else (including running out of text) means there is no `i` here.
            _ => return false,
        }
    };

    // Anything still in front of the `i` means the pair is not at the beginning.
    if preceding.next().is_some() {
        return false;
    }

    // The `j` may carry an acute accent but no other combining mark; the scan stops at
    // the first character that is not a combining mark.
    //
    // ICU4C uses `gc=Mn` to decide what counts as a combining mark, but that needs extra
    // properties data (it would be the *only* place in the casemapping algorithm with a
    // direct dependency on properties data not mediated by the casemapping data trie).
    // Instead, ccc is checked through dot_type, the same way the rest of the algorithm does.
    //
    // See https://unicode-org.atlassian.net/browse/ICU-22429
    let mut j_has_acute = false;
    for following in self.after.chars() {
        if following == ACUTE {
            j_has_acute = true;
        } else if matches!(
            mapping.dot_type(following),
            DotType::NoDot | DotType::SoftDotted
        ) {
            // Not a combining character (ccc = 0): the `j` and its marks are complete.
            break;
        } else {
            // Some other combining character is attached to the `j`; bail.
            return false;
        }
    }

    // Both letters must agree on whether they carry the acute accent.
    j_has_acute == i_has_acute
}
}
52 changes: 52 additions & 0 deletions experimental/casemapping/tests/conversions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,55 @@ fn test_armenian() {
assert_eq!(cm.titlecase_segment_to_string(ew, &west), "Եւ");
assert_eq!(cm.titlecase_segment_to_string(yerevan, &west), "Երևանի");
}

#[test]
fn test_dutch() {
    let cm = CaseMapping::new();
    let nl = langid!("nl");

    // Each entry is (input, expected result of titlecasing the segment with the `nl` locale).
    let cases: &[(&str, &str)] = &[
        // Ordinary words: only a leading "ij" gets both letters uppercased.
        ("ijssel", "IJssel"),
        ("igloo", "Igloo"),
        ("IJMUIDEN", "IJmuiden"),
        // The pair alone, with and without acute accents. The J is uppercased only when
        // the acute accent is on both letters or on neither.
        ("ij", "IJ"),
        ("IJ", "IJ"),
        ("íj́", "ÍJ́"),
        ("ÍJ́", "ÍJ́"),
        ("íJ́", "ÍJ́"),
        ("Ij́", "Ij́"),
        ("ij́", "Ij́"),
        ("ïj́", "Ïj́"),
        // Other combining marks directly on the j prevent it from being uppercased.
        ("íj\u{0308}", "Íj\u{0308}"),
        ("íj́\u{1D16E}", "Íj́\u{1D16E}"),
        ("íj\u{1ABE}", "Íj\u{1ABE}"),
        // The same pairs followed by more letters.
        ("ijabc", "IJabc"),
        ("IJabc", "IJabc"),
        ("íj́abc", "ÍJ́abc"),
        ("ÍJ́abc", "ÍJ́abc"),
        ("íJ́abc", "ÍJ́abc"),
        ("Ij́abc", "Ij́abc"),
        ("ij́abc", "Ij́abc"),
        ("ïj́abc", "Ïj́abc"),
        // Combining marks at the end of the word, away from the j.
        ("íjabc\u{0308}", "Íjabc\u{0308}"),
        ("íj́abc\u{1D16E}", "ÍJ́abc\u{1D16E}"),
        ("íjabc\u{1ABE}", "Íjabc\u{1ABE}"),
    ];

    for &(input, expected) in cases {
        assert_eq!(cm.titlecase_segment_to_string(input, &nl), expected);
    }
}

0 comments on commit f6ade85

Please sign in to comment.