From 5e5e16c6fddac511abba62ddda3481fc1f911254 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 28 Jun 2023 13:20:34 +0200 Subject: [PATCH 1/6] Add IS_TITLE_CONTEXT --- experimental/casemapping/src/casemapping.rs | 30 ++++++++++++------ experimental/casemapping/src/internals.rs | 34 ++++++++++++--------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/experimental/casemapping/src/casemapping.rs b/experimental/casemapping/src/casemapping.rs index c5ae97995a2..7b000a744a0 100644 --- a/experimental/casemapping/src/casemapping.rs +++ b/experimental/casemapping/src/casemapping.rs @@ -87,7 +87,7 @@ impl CaseMapping { src: &'a str, langid: &LanguageIdentifier, ) -> impl Writeable + 'a { - self.data.get().full_helper_writeable( + self.data.get().full_helper_writeable::( src, CaseMapLocale::from_langid(langid), MappingKind::Lower, @@ -106,7 +106,7 @@ impl CaseMapping { src: &'a str, langid: &LanguageIdentifier, ) -> impl Writeable + 'a { - self.data.get().full_helper_writeable( + self.data.get().full_helper_writeable::( src, CaseMapLocale::from_langid(langid), MappingKind::Upper, @@ -123,7 +123,7 @@ impl CaseMapping { pub fn fold<'a>(&'a self, src: &'a str) -> impl Writeable + 'a { self.data .get() - .full_helper_writeable(src, CaseMapLocale::Root, MappingKind::Fold) + .full_helper_writeable::(src, CaseMapLocale::Root, MappingKind::Fold) } /// Case-folds the characters in the given string as a [`Writeable`], @@ -135,9 +135,11 @@ impl CaseMapping { /// See [`Self::fold_turkic_string()`] for the equivalent convenience function that returns a String, /// as well as for an example. pub fn fold_turkic<'a>(&'a self, src: &'a str) -> impl Writeable + 'a { - self.data - .get() - .full_helper_writeable(src, CaseMapLocale::Turkish, MappingKind::Fold) + self.data.get().full_helper_writeable::( + src, + CaseMapLocale::Turkish, + MappingKind::Fold, + ) } /// Returns the full lowercase mapping of the given string as a String. 
@@ -169,7 +171,11 @@ impl CaseMapping { pub fn lowercase_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String { self.data .get() - .full_helper_writeable(src, CaseMapLocale::from_langid(langid), MappingKind::Lower) + .full_helper_writeable::( + src, + CaseMapLocale::from_langid(langid), + MappingKind::Lower, + ) .write_to_string() .into_owned() } @@ -206,7 +212,11 @@ impl CaseMapping { pub fn uppercase_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String { self.data .get() - .full_helper_writeable(src, CaseMapLocale::from_langid(langid), MappingKind::Upper) + .full_helper_writeable::( + src, + CaseMapLocale::from_langid(langid), + MappingKind::Upper, + ) .write_to_string() .into_owned() } @@ -236,7 +246,7 @@ impl CaseMapping { pub fn fold_string(&self, src: &str) -> String { self.data .get() - .full_helper_writeable(src, CaseMapLocale::Root, MappingKind::Fold) + .full_helper_writeable::(src, CaseMapLocale::Root, MappingKind::Fold) .write_to_string() .into_owned() } @@ -270,7 +280,7 @@ impl CaseMapping { pub fn fold_turkic_string(&self, src: &str) -> String { self.data .get() - .full_helper_writeable(src, CaseMapLocale::Turkish, MappingKind::Fold) + .full_helper_writeable::(src, CaseMapLocale::Turkish, MappingKind::Fold) .write_to_string() .into_owned() } diff --git a/experimental/casemapping/src/internals.rs b/experimental/casemapping/src/internals.rs index 7586d53553d..27f36681270 100644 --- a/experimental/casemapping/src/internals.rs +++ b/experimental/casemapping/src/internals.rs @@ -126,13 +126,18 @@ impl<'data> CaseMappingV1<'data> { } #[inline(always)] - fn full_helper( + // IS_TITLE_CONTEXT must be true if kind is MappingKind::Title + fn full_helper( &self, c: char, context: ContextIterator, locale: CaseMapLocale, kind: MappingKind, ) -> FullMappingResult { + // IS_TITLE_CONTEXT exists to avoid perf impacts on the other, more common modes + // Ensure that they are either both true or both false, i.e. 
an XNOR operation + debug_assert!(!(IS_TITLE_CONTEXT ^ (kind == MappingKind::Title))); + let data = self.lookup_data(c); if !data.has_exception() { if data.is_relevant_to(kind) { @@ -151,12 +156,7 @@ impl<'data> CaseMappingV1<'data> { MappingKind::Lower => self.full_lower_special_case(c, context, locale), MappingKind::Fold => self.full_fold_special_case(c, context, locale), MappingKind::Upper | MappingKind::Title => self - .full_upper_or_title_special_case( - c, - context, - locale, - kind == MappingKind::Title, - ), + .full_upper_or_title_special_case::(c, context, locale), } { return special; } @@ -276,12 +276,11 @@ impl<'data> CaseMappingV1<'data> { None } - fn full_upper_or_title_special_case( + fn full_upper_or_title_special_case( &self, c: char, context: ContextIterator, locale: CaseMapLocale, - is_title: bool, ) -> Option { if locale == CaseMapLocale::Turkish && c == 'i' { // In Turkic languages, i turns into a dotted capital I. @@ -297,7 +296,7 @@ impl<'data> CaseMappingV1<'data> { } // ICU4C's non-standard extension for Armenian ligature ech-yiwn. 
if c == '\u{587}' { - return match (locale, is_title) { + return match (locale, IS_TITLE_CONTEXT) { (CaseMapLocale::Armenian, false) => Some(FullMappingResult::String("ԵՎ")), (CaseMapLocale::Armenian, true) => Some(FullMappingResult::String("Եվ")), (_, false) => Some(FullMappingResult::String("ԵՒ")), @@ -325,20 +324,20 @@ impl<'data> CaseMappingV1<'data> { (_, _) => None, } } - pub(crate) fn full_helper_writeable<'a: 'data>( + pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>( &'a self, src: &'a str, locale: CaseMapLocale, mapping: MappingKind, ) -> impl Writeable + 'a { - struct FullCaseWriteable<'a> { + struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> { data: &'a CaseMappingV1<'a>, src: &'a str, locale: CaseMapLocale, mapping: MappingKind, } - impl<'a> Writeable for FullCaseWriteable<'a> { + impl<'a, const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'a, IS_TITLE_CONTEXT> { #[allow(clippy::indexing_slicing)] // last_uncopied_index and i are known to be in bounds fn write_to(&self, sink: &mut W) -> fmt::Result { // To speed up the copying of long runs where nothing changes, we keep track @@ -348,7 +347,12 @@ impl<'data> CaseMappingV1<'data> { let src = self.src; for (i, c) in src.char_indices() { let context = ContextIterator::new(&src[..i], &src[i..]); - match self.data.full_helper(c, context, self.locale, self.mapping) { + match self.data.full_helper::( + c, + context, + self.locale, + self.mapping, + ) { FullMappingResult::CodePoint(c2) => { if c == c2 { continue; @@ -378,7 +382,7 @@ impl<'data> CaseMappingV1<'data> { } } - FullCaseWriteable { + FullCaseWriteable:: { data: self, src, locale, From 210853dd24962e2df23ae14692f258169cb4b783 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Tue, 27 Jun 2023 15:27:24 +0200 Subject: [PATCH 2/6] Add titlecasing functions --- experimental/casemapping/src/casemapping.rs | 115 ++++++++++++++++---- 1 file changed, 91 insertions(+), 24 deletions(-) diff --git 
a/experimental/casemapping/src/casemapping.rs b/experimental/casemapping/src/casemapping.rs index 7b000a744a0..b2a60f0cb1f 100644 --- a/experimental/casemapping/src/casemapping.rs +++ b/experimental/casemapping/src/casemapping.rs @@ -113,6 +113,31 @@ impl CaseMapping { ) } + /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating + /// the string as a single segment (and thus only titlecasing the beginning of it). + /// + /// This should typically be used as a lower-level helper to construct the titlecasing operation desired + /// by the application, for example one can titlecase on a per-word basis by mixing this with + /// a `WordSegmenter`. + /// + /// This function is context and language sensitive. Callers should pass the text's language + /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or + /// `Default::default()` for the root locale. + /// + /// See [`Self::titlecase_segment_to_string()`] for the equivalent convenience function that returns a String, + /// as well as for an example. + pub fn titlecase_segment<'a>( + &'a self, + src: &'a str, + langid: &LanguageIdentifier, + ) -> impl Writeable + 'a { + self.data.get().full_helper_writeable::( + src, + CaseMapLocale::from_langid(langid), + MappingKind::Title, + ) + } + /// Case-folds the characters in the given string as a [`Writeable`]. /// This function is locale-independent and context-insensitive. /// @@ -221,6 +246,54 @@ impl CaseMapping { .into_owned() } + /// Returns the full titlecase mapping of the given string as a String, treating + /// the string as a single segment (and thus only titlecasing the beginning of it). + /// + /// This should typically be used as a lower-level helper to construct the titlecasing operation desired + /// by the application, for example one can titlecase on a per-word basis by mixing this with + /// a `WordSegmenter`. + /// + /// This function is context and language sensitive. 
Callers should pass the text's language + /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or + /// `Default::default()` for the root locale. + /// + /// See [`Self::titlecase_segment()`] for the equivalent lower-level function that returns a [`Writeable`] + /// + /// # Example + /// + /// ```rust,ignore + /// use icu_casemapping::CaseMapping; + /// use icu_locid::langid; + /// + /// let cm = CaseMapping::new(); + /// let root = langid!("und"); + /// + /// // note that the subsequent words are not titlecased, this function assumes + /// // that the entire string is a single segment and only titlecases at the beginning. + /// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root), "Hello world"); + /// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root), "Γειά σου κόσμε"); + /// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया"); + /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root), "Привет мир"); + /// + /// // Some behavior is language-sensitive + /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root), "Istanbuk"); + /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr")), "İstanbul"); // Turkish dotted i + /// + /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root), "Եւ Երևանի"); + /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy")), "Եվ Երևանի"); // Eastern Armenian ech-yiwn ligature + /// ``` + pub fn titlecase_segment_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String { + self.data + .get() + .full_helper_writeable::( + src, + CaseMapLocale::from_langid(langid), + MappingKind::Title, + ) + .write_to_string() + .into_owned() + } + /// Case-folds the characters in the given string as a String. /// This function is locale-independent and context-insensitive. 
/// @@ -499,18 +572,6 @@ mod tests { use super::*; use icu_locid::langid; - impl CaseMapping { - /// Only for testing titlecase special-cases, does NOT - /// segment input string - fn titlecase_to_string_test(&self, src: &str, langid: &LanguageIdentifier) -> String { - self.data - .get() - .full_helper_writeable(src, CaseMapLocale::from_langid(langid), MappingKind::Title) - .write_to_string() - .into_owned() - } - } - #[test] /// Tests for SpecialCasing.txt. Some of the special cases are data-driven, some are code-driven fn test_special_cases() { @@ -543,22 +604,22 @@ mod tests { ); // U+1F80 GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI - assert_eq!(cm.titlecase_to_string_test("ᾀ", &root), "ᾈ"); + assert_eq!(cm.titlecase_segment_to_string("ᾀ", &root), "ᾈ"); assert_eq!(cm.uppercase_to_string("ᾀ", &root), "ἈΙ"); // U+1FFC GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI assert_eq!(cm.lowercase_to_string("ῼ", &root), "ῳ"); - assert_eq!(cm.titlecase_to_string_test("ῼ", &root), "ῼ"); + assert_eq!(cm.titlecase_segment_to_string("ῼ", &root), "ῼ"); assert_eq!(cm.uppercase_to_string("ῼ", &root), "ΩΙ"); // U+1F98 GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI assert_eq!(cm.lowercase_to_string("ᾘ", &root), "ᾐ"); - assert_eq!(cm.titlecase_to_string_test("ᾘ", &root), "ᾘ"); + assert_eq!(cm.titlecase_segment_to_string("ᾘ", &root), "ᾘ"); assert_eq!(cm.uppercase_to_string("ᾘ", &root), "ἨΙ"); // U+1FB2 GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI assert_eq!(cm.lowercase_to_string("ᾲ", &root), "ᾲ"); - assert_eq!(cm.titlecase_to_string_test("ᾲ", &root), "Ὰ\u{345}"); + assert_eq!(cm.titlecase_segment_to_string("ᾲ", &root), "Ὰ\u{345}"); assert_eq!(cm.uppercase_to_string("ᾲ", &root), "ᾺΙ"); // Final sigma test @@ -571,32 +632,38 @@ mod tests { // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE assert_eq!(cm.lowercase_to_string("İ", &tr), "i"); assert_eq!(cm.lowercase_to_string("İ", &az), "i"); - assert_eq!(cm.titlecase_to_string_test("İ", &tr), "İ"); - 
assert_eq!(cm.titlecase_to_string_test("İ", &az), "İ"); + assert_eq!(cm.titlecase_segment_to_string("İ", &tr), "İ"); + assert_eq!(cm.titlecase_segment_to_string("İ", &az), "İ"); assert_eq!(cm.uppercase_to_string("İ", &tr), "İ"); assert_eq!(cm.uppercase_to_string("İ", &az), "İ"); // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE assert_eq!(cm.lowercase_to_string("I\u{0307}", &tr), "i"); assert_eq!(cm.lowercase_to_string("I\u{0307}", &az), "i"); - assert_eq!(cm.titlecase_to_string_test("I\u{0307}", &tr), "I\u{0307}"); - assert_eq!(cm.titlecase_to_string_test("I\u{0307}", &az), "I\u{0307}"); + assert_eq!( + cm.titlecase_segment_to_string("I\u{0307}", &tr), + "I\u{0307}" + ); + assert_eq!( + cm.titlecase_segment_to_string("I\u{0307}", &az), + "I\u{0307}" + ); assert_eq!(cm.uppercase_to_string("I\u{0307}", &tr), "I\u{0307}"); assert_eq!(cm.uppercase_to_string("I\u{0307}", &az), "I\u{0307}"); // U+0049 LATIN CAPITAL LETTER I assert_eq!(cm.lowercase_to_string("I", &tr), "ı"); assert_eq!(cm.lowercase_to_string("I", &az), "ı"); - assert_eq!(cm.titlecase_to_string_test("I", &tr), "I"); - assert_eq!(cm.titlecase_to_string_test("I", &az), "I"); + assert_eq!(cm.titlecase_segment_to_string("I", &tr), "I"); + assert_eq!(cm.titlecase_segment_to_string("I", &az), "I"); assert_eq!(cm.uppercase_to_string("I", &tr), "I"); assert_eq!(cm.uppercase_to_string("I", &az), "I"); // U+0069 LATIN SMALL LETTER I assert_eq!(cm.lowercase_to_string("i", &tr), "i"); assert_eq!(cm.lowercase_to_string("i", &az), "i"); - assert_eq!(cm.titlecase_to_string_test("i", &tr), "İ"); - assert_eq!(cm.titlecase_to_string_test("i", &az), "İ"); + assert_eq!(cm.titlecase_segment_to_string("i", &tr), "İ"); + assert_eq!(cm.titlecase_segment_to_string("i", &az), "İ"); assert_eq!(cm.uppercase_to_string("i", &tr), "İ"); assert_eq!(cm.uppercase_to_string("i", &az), "İ"); } From 282d43aa870232ba373c664dfbd7479aeadd5b81 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 28 Jun 2023 14:00:01 +0200 
Subject: [PATCH 3/6] Handle basic titlecasing --- experimental/casemapping/src/casemapping.rs | 35 ++++++++++--------- experimental/casemapping/src/internals.rs | 25 ++++++++++--- experimental/casemapping/tests/conversions.rs | 12 ++++--- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/experimental/casemapping/src/casemapping.rs b/experimental/casemapping/src/casemapping.rs index b2a60f0cb1f..e69944975ce 100644 --- a/experimental/casemapping/src/casemapping.rs +++ b/experimental/casemapping/src/casemapping.rs @@ -261,7 +261,7 @@ impl CaseMapping { /// /// # Example /// - /// ```rust,ignore + /// ```rust /// use icu_casemapping::CaseMapping; /// use icu_locid::langid; /// @@ -276,11 +276,11 @@ impl CaseMapping { /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root), "Привет мир"); /// /// // Some behavior is language-sensitive - /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root), "Istanbuk"); + /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root), "Istanbul"); /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr")), "İstanbul"); // Turkish dotted i /// - /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root), "Եւ Երևանի"); - /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy")), "Եվ Երևանի"); // Eastern Armenian ech-yiwn ligature + /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root), "Եւ երևանի"); + /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy")), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature /// ``` pub fn titlecase_segment_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String { self.data @@ -300,7 +300,7 @@ impl CaseMapping { /// Can be used to test if two strings are case-insensitively equivalent. 
/// /// See [`Self::fold()`] for the equivalent lower-level function that returns a [`Writeable`] - /// + /// /// # Example /// /// ```rust @@ -638,18 +638,19 @@ mod tests { assert_eq!(cm.uppercase_to_string("İ", &az), "İ"); // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE - assert_eq!(cm.lowercase_to_string("I\u{0307}", &tr), "i"); - assert_eq!(cm.lowercase_to_string("I\u{0307}", &az), "i"); - assert_eq!( - cm.titlecase_segment_to_string("I\u{0307}", &tr), - "I\u{0307}" - ); - assert_eq!( - cm.titlecase_segment_to_string("I\u{0307}", &az), - "I\u{0307}" - ); - assert_eq!(cm.uppercase_to_string("I\u{0307}", &tr), "I\u{0307}"); - assert_eq!(cm.uppercase_to_string("I\u{0307}", &az), "I\u{0307}"); + // TODO + // assert_eq!(cm.lowercase_to_string("I\u{0307}", &tr), "i"); + // assert_eq!(cm.lowercase_to_string("I\u{0307}", &az), "i"); + // assert_eq!( + // cm.titlecase_segment_to_string("I\u{0307}", &tr), + // "I\u{0307}" + // ); + // assert_eq!( + // cm.titlecase_segment_to_string("I\u{0307}", &az), + // "I\u{0307}" + // ); + // assert_eq!(cm.uppercase_to_string("I\u{0307}", &tr), "I\u{0307}"); + // assert_eq!(cm.uppercase_to_string("I\u{0307}", &az), "I\u{0307}"); // U+0049 LATIN CAPITAL LETTER I assert_eq!(cm.lowercase_to_string("I", &tr), "ı"); diff --git a/experimental/casemapping/src/internals.rs b/experimental/casemapping/src/internals.rs index 27f36681270..35fd1e4e591 100644 --- a/experimental/casemapping/src/internals.rs +++ b/experimental/casemapping/src/internals.rs @@ -127,6 +127,8 @@ impl<'data> CaseMappingV1<'data> { #[inline(always)] // IS_TITLE_CONTEXT must be true if kind is MappingKind::Title + // The kind may be a different kind with IS_TITLE_CONTEXT still true because + // titlecasing a segment involves switching to lowercase later fn full_helper( &self, c: char, @@ -134,9 +136,12 @@ impl<'data> CaseMappingV1<'data> { locale: CaseMapLocale, kind: MappingKind, ) -> FullMappingResult { - // IS_TITLE_CONTEXT exists to avoid perf 
impacts on the other, more common modes - // Ensure that they are either both true or both false, i.e. an XNOR operation - debug_assert!(!(IS_TITLE_CONTEXT ^ (kind == MappingKind::Title))); + // If using a title mapping IS_TITLE_CONTEXT must be true + debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT); + // In a title context, kind MUST be Title or Lower + debug_assert!( + !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower + ); let data = self.lookup_data(c); if !data.has_exception() { @@ -324,12 +329,17 @@ impl<'data> CaseMappingV1<'data> { (_, _) => None, } } + /// IS_TITLE_CONTEXT is true iff the mapping is MappingKind::Title, primarily exists + /// to avoid perf impacts on other more common modes of operation pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>( &'a self, src: &'a str, locale: CaseMapLocale, mapping: MappingKind, ) -> impl Writeable + 'a { + // Ensure that they are either both true or both false, i.e. an XNOR operation + debug_assert!(!(IS_TITLE_CONTEXT ^ (mapping == MappingKind::Title))); + struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> { data: &'a CaseMappingV1<'a>, src: &'a str, @@ -345,16 +355,20 @@ impl<'data> CaseMappingV1<'data> { let mut last_uncopied_idx = 0; let src = self.src; + let mut mapping = self.mapping; for (i, c) in src.char_indices() { let context = ContextIterator::new(&src[..i], &src[i..]); match self.data.full_helper::( c, context, self.locale, - self.mapping, + mapping, ) { FullMappingResult::CodePoint(c2) => { if c == c2 { + if IS_TITLE_CONTEXT { + mapping = MappingKind::Lower; + } continue; } sink.write_str(&src[last_uncopied_idx..i])?; @@ -371,6 +385,9 @@ impl<'data> CaseMappingV1<'data> { last_uncopied_idx = i + c.len_utf8(); } } + if IS_TITLE_CONTEXT { + mapping = MappingKind::Lower; + } } if last_uncopied_idx < src.len() { sink.write_str(&src[last_uncopied_idx..])?; diff --git a/experimental/casemapping/tests/conversions.rs 
b/experimental/casemapping/tests/conversions.rs index 08bfaf39bb8..3517a945759 100644 --- a/experimental/casemapping/tests/conversions.rs +++ b/experimental/casemapping/tests/conversions.rs @@ -205,8 +205,12 @@ fn test_armenian() { assert_eq!(cm.uppercase_to_string(s, &east), "ԵՎ ԵՐԵՎԱՆԻ"); assert_eq!(cm.uppercase_to_string(s, &west), "ԵՒ ԵՐԵՒԱՆԻ"); - // Titlecase doesn't work yet - // assert_eq!(cm.titlecase_to_string(s, &root), "Եւ Երևանի"); - // assert_eq!(cm.uppercase_to_string(s, &east), "Եվ Երևանի"); - // assert_eq!(cm.uppercase_to_string(s, &west), "Եւ Երևանի"); + let ew = "և"; + let yerevan = "Երևանի"; + assert_eq!(cm.titlecase_segment_to_string(ew, &root), "Եւ"); + assert_eq!(cm.titlecase_segment_to_string(yerevan, &root), "Երևանի"); + assert_eq!(cm.titlecase_segment_to_string(ew, &east), "Եվ"); + assert_eq!(cm.titlecase_segment_to_string(yerevan, &east), "Երևանի"); + assert_eq!(cm.titlecase_segment_to_string(ew, &west), "Եւ"); + assert_eq!(cm.titlecase_segment_to_string(yerevan, &west), "Երևանի"); } From 9babda136aae10ef76a111648a7f926709807141 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 28 Jun 2023 14:36:34 +0200 Subject: [PATCH 4/6] Handle turkish special case --- experimental/casemapping/src/casemapping.rs | 25 +++++++++-------- experimental/casemapping/src/internals.rs | 30 ++++++++++++++++----- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/experimental/casemapping/src/casemapping.rs b/experimental/casemapping/src/casemapping.rs index e69944975ce..e0ebfb89d0d 100644 --- a/experimental/casemapping/src/casemapping.rs +++ b/experimental/casemapping/src/casemapping.rs @@ -638,19 +638,18 @@ mod tests { assert_eq!(cm.uppercase_to_string("İ", &az), "İ"); // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE - // TODO - // assert_eq!(cm.lowercase_to_string("I\u{0307}", &tr), "i"); - // assert_eq!(cm.lowercase_to_string("I\u{0307}", &az), "i"); - // assert_eq!( - // cm.titlecase_segment_to_string("I\u{0307}", &tr), - 
// "I\u{0307}" - // ); - // assert_eq!( - // cm.titlecase_segment_to_string("I\u{0307}", &az), - // "I\u{0307}" - // ); - // assert_eq!(cm.uppercase_to_string("I\u{0307}", &tr), "I\u{0307}"); - // assert_eq!(cm.uppercase_to_string("I\u{0307}", &az), "I\u{0307}"); + assert_eq!(cm.lowercase_to_string("I\u{0307}", &tr), "i"); + assert_eq!(cm.lowercase_to_string("I\u{0307}", &az), "i"); + assert_eq!( + cm.titlecase_segment_to_string("I\u{0307}", &tr), + "I\u{0307}" + ); + assert_eq!( + cm.titlecase_segment_to_string("I\u{0307}", &az), + "I\u{0307}" + ); + assert_eq!(cm.uppercase_to_string("I\u{0307}", &tr), "I\u{0307}"); + assert_eq!(cm.uppercase_to_string("I\u{0307}", &az), "I\u{0307}"); // U+0049 LATIN CAPITAL LETTER I assert_eq!(cm.lowercase_to_string("I", &tr), "ı"); diff --git a/experimental/casemapping/src/internals.rs b/experimental/casemapping/src/internals.rs index 35fd1e4e591..971e66ea37d 100644 --- a/experimental/casemapping/src/internals.rs +++ b/experimental/casemapping/src/internals.rs @@ -158,7 +158,9 @@ impl<'data> CaseMappingV1<'data> { let exception = self.exceptions.get(idx); if exception.bits.has_conditional_special() { if let Some(special) = match kind { - MappingKind::Lower => self.full_lower_special_case(c, context, locale), + MappingKind::Lower => { + self.full_lower_special_case::(c, context, locale) + } MappingKind::Fold => self.full_fold_special_case(c, context, locale), MappingKind::Upper | MappingKind::Title => self .full_upper_or_title_special_case::(c, context, locale), @@ -216,7 +218,7 @@ impl<'data> CaseMappingV1<'data> { } } - fn full_lower_special_case( + fn full_lower_special_case( &self, c: char, context: ContextIterator, @@ -252,10 +254,15 @@ impl<'data> CaseMappingV1<'data> { if c == '\u{130}' { // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri return Some(FullMappingResult::CodePoint('i')); - } else if c == '\u{307}' && context.preceded_by_capital_i(self) { + } else if c == '\u{307}' && 
context.preceded_by_capital_i::(self) { // When lowercasing, remove dot_above in the sequence I + dot_above, // which will turn into i. This matches the behaviour of the // canonically equivalent I-dot_above. + // + // In a titlecase context, we do not want to apply this behavior to cases where the I + // was at the beginning of the string, as that I and its marks should be handled by the + // uppercasing rules (which ignore it, see below) + return Some(FullMappingResult::Remove); } else if c == 'I' && !context.followed_by_dot_above(self) { // When lowercasing, unless an I is before a dot_above, it turns @@ -580,10 +587,21 @@ impl<'a> ContextIterator<'a> { } false } - fn preceded_by_capital_i(&self, mapping: &CaseMappingV1) -> bool { - for c in self.before.chars().rev() { + /// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between. + /// + /// If I_MUST_NOT_START_STRING is true, additionally will require that the capital I does not start the string + fn preceded_by_capital_i( + &self, + mapping: &CaseMappingV1, + ) -> bool { + let mut iter = self.before.chars().rev(); + while let Some(c) = iter.next() { if c == 'I' { - return true; + if I_MUST_NOT_START_STRING { + return iter.next().is_some(); + } else { + return true; + } } if mapping.dot_type(c) != DotType::OtherAccent { break; From eb2577c7d59be237c5419f4dc1d24e28e045f1b6 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 28 Jun 2023 14:39:48 +0200 Subject: [PATCH 5/6] add another ypogegrammeni case --- experimental/casemapping/src/casemapping.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/experimental/casemapping/src/casemapping.rs b/experimental/casemapping/src/casemapping.rs index e0ebfb89d0d..4731d6dc67b 100644 --- a/experimental/casemapping/src/casemapping.rs +++ b/experimental/casemapping/src/casemapping.rs @@ -602,6 +602,11 @@ mod tests { cm.uppercase_to_string("α\u{0313}\u{0345}", &root), "Α\u{0313}Ι" ); + // but the YPOGEGRAMMENI 
should not titlecase + assert_eq!( + cm.titlecase_segment_to_string("α\u{0313}\u{0345}", &root), + "Α\u{0313}\u{0345}" + ); // U+1F80 GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI assert_eq!(cm.titlecase_segment_to_string("ᾀ", &root), "ᾈ"); From f6ade85c0b221c2d79cee17b1259e55e2048d621 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 28 Jun 2023 15:42:29 +0200 Subject: [PATCH 6/6] implement dutch titlecasing --- experimental/casemapping/src/casemapping.rs | 3 + experimental/casemapping/src/internals.rs | 66 +++++++++++++++++++ experimental/casemapping/tests/conversions.rs | 52 +++++++++++++++ 3 files changed, 121 insertions(+) diff --git a/experimental/casemapping/src/casemapping.rs b/experimental/casemapping/src/casemapping.rs index 4731d6dc67b..e284e138baf 100644 --- a/experimental/casemapping/src/casemapping.rs +++ b/experimental/casemapping/src/casemapping.rs @@ -281,6 +281,9 @@ impl CaseMapping { /// /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root), "Եւ երևանի"); /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy")), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature + /// + /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root), "Ijkdijk"); + /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl")), "IJkdijk"); // Dutch IJ digraph /// ``` pub fn titlecase_segment_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String { self.data diff --git a/experimental/casemapping/src/internals.rs b/experimental/casemapping/src/internals.rs index 971e66ea37d..54afce90998 100644 --- a/experimental/casemapping/src/internals.rs +++ b/experimental/casemapping/src/internals.rs @@ -14,6 +14,8 @@ use core::fmt; use icu_locid::LanguageIdentifier; use writeable::Writeable; +const ACUTE: char = '\u{301}'; + // Used to control the behavior of CaseMapping::fold. // Currently only used to decide whether to use Turkic (T) mappings for dotted/dotless i. 
#[derive(Default)] @@ -143,6 +145,18 @@ impl<'data> CaseMappingV1<'data> { !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower ); + // ICU4C's non-standard extension for Dutch IJ titlecasing + // handled here instead of in full_lower_special_case because J does not have conditional + // special casemapping. + if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower { + // When titlecasing, a J found immediately after an I at the beginning of the segment + // should also uppercase. They are both allowed to have an acute accent but it must + // be present on both letters or neither. They may not have any other combining marks. + if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) { + return FullMappingResult::CodePoint('J'); + } + } + let data = self.lookup_data(c); if !data.has_exception() { if data.is_relevant_to(kind) { @@ -648,4 +662,56 @@ impl<'a> ContextIterator<'a> { } false } + + /// Checks the preceding and surrounding context of a j or J + /// and returns true if it is preceded by an i or I at the start of the string. + /// If one has an acute accent, + /// both must have the accent for this to return true. No other accents are handled. + fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMappingV1) -> bool { + let mut before = self.before.chars().rev(); + let mut i_has_acute = false; + loop { + match before.next() { + Some('i') | Some('I') => break, + Some('í') | Some('Í') => { + i_has_acute = true; + break; + } + Some(ACUTE) => i_has_acute = true, + _ => return false, + } + } + + if before.next().is_some() { + // not at the beginning of a string, doesn't matter + return false; + } + let mut j_has_acute = false; + for c in self.after.chars() { + if c == ACUTE { + j_has_acute = true; + continue; + } + // We are supposed to check that `j` has no other combining marks aside + // from potentially an acute accent. Once we hit the first non-combining mark + // we are done. 
+ // + // ICU4C checks for `gc=Mn` to determine if something is a combining mark, + // however this requires extra data (and is the *only* point in the casemapping algorithm + // where there is a direct dependency on properties data not mediated by the casemapping data trie). + // + // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does. + // + // See https://unicode-org.atlassian.net/browse/ICU-22429 + match mapping.dot_type(c) { + // Not a combining character; ccc = 0 + DotType::NoDot | DotType::SoftDotted => break, + // found combining character, bail + _ => return false, + } + } + + // either both should have an acute accent, or none. this is an XNOR operation + !(j_has_acute ^ i_has_acute) + } } diff --git a/experimental/casemapping/tests/conversions.rs b/experimental/casemapping/tests/conversions.rs index 3517a945759..d461bc76195 100644 --- a/experimental/casemapping/tests/conversions.rs +++ b/experimental/casemapping/tests/conversions.rs @@ -214,3 +214,55 @@ fn test_armenian() { assert_eq!(cm.titlecase_segment_to_string(ew, &west), "Եւ"); assert_eq!(cm.titlecase_segment_to_string(yerevan, &west), "Երևանի"); } + +#[test] +fn test_dutch() { + let cm = CaseMapping::new(); + let nl = langid!("nl"); + + assert_eq!(cm.titlecase_segment_to_string("ijssel", &nl), "IJssel"); + assert_eq!(cm.titlecase_segment_to_string("igloo", &nl), "Igloo"); + assert_eq!(cm.titlecase_segment_to_string("IJMUIDEN", &nl), "IJmuiden"); + + assert_eq!(cm.titlecase_segment_to_string("ij", &nl), "IJ"); + assert_eq!(cm.titlecase_segment_to_string("IJ", &nl), "IJ"); + assert_eq!(cm.titlecase_segment_to_string("íj́", &nl), "ÍJ́"); + assert_eq!(cm.titlecase_segment_to_string("ÍJ́", &nl), "ÍJ́"); + assert_eq!(cm.titlecase_segment_to_string("íJ́", &nl), "ÍJ́"); + assert_eq!(cm.titlecase_segment_to_string("Ij́", &nl), "Ij́"); + assert_eq!(cm.titlecase_segment_to_string("ij́", &nl), "Ij́"); + assert_eq!(cm.titlecase_segment_to_string("ïj́", &nl), "Ïj́"); + 
assert_eq!( + cm.titlecase_segment_to_string("íj\u{0308}", &nl), + "Íj\u{0308}" + ); + assert_eq!( + cm.titlecase_segment_to_string("íj́\u{1D16E}", &nl), + "Íj́\u{1D16E}" + ); + assert_eq!( + cm.titlecase_segment_to_string("íj\u{1ABE}", &nl), + "Íj\u{1ABE}" + ); + + assert_eq!(cm.titlecase_segment_to_string("ijabc", &nl), "IJabc"); + assert_eq!(cm.titlecase_segment_to_string("IJabc", &nl), "IJabc"); + assert_eq!(cm.titlecase_segment_to_string("íj́abc", &nl), "ÍJ́abc"); + assert_eq!(cm.titlecase_segment_to_string("ÍJ́abc", &nl), "ÍJ́abc"); + assert_eq!(cm.titlecase_segment_to_string("íJ́abc", &nl), "ÍJ́abc"); + assert_eq!(cm.titlecase_segment_to_string("Ij́abc", &nl), "Ij́abc"); + assert_eq!(cm.titlecase_segment_to_string("ij́abc", &nl), "Ij́abc"); + assert_eq!(cm.titlecase_segment_to_string("ïj́abc", &nl), "Ïj́abc"); + assert_eq!( + cm.titlecase_segment_to_string("íjabc\u{0308}", &nl), + "Íjabc\u{0308}" + ); + assert_eq!( + cm.titlecase_segment_to_string("íj́abc\u{1D16E}", &nl), + "ÍJ́abc\u{1D16E}" + ); + assert_eq!( + cm.titlecase_segment_to_string("íjabc\u{1ABE}", &nl), + "Íjabc\u{1ABE}" + ); +}