From c90040cb0a1f8c3282d08e5c14d7ebde7653475f Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 31 Mar 2022 10:41:32 +0300 Subject: [PATCH] Map -u-co- to provider variant --- experimental/collator/src/lib.rs | 85 +++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/experimental/collator/src/lib.rs b/experimental/collator/src/lib.rs index 175b08c238b..ee004cdc3da 100644 --- a/experimental/collator/src/lib.rs +++ b/experimental/collator/src/lib.rs @@ -70,6 +70,7 @@ pub mod error; pub mod provider; extern crate alloc; +use crate::alloc::string::ToString; use crate::provider::CollationDataV1Marker; use crate::provider::CollationDiacriticsV1Marker; use crate::provider::CollationJamoV1Marker; @@ -2217,14 +2218,63 @@ impl<'data> Collator { > + ?Sized, { let locale = locale.into(); + + // TODO: Alias mapping here. + let langid: icu_locid::LanguageIdentifier = locale.clone().into(); + // TODO: Is there a compile-time macro for this? + let key = "co".parse::().unwrap(); + + // In the provider, the variant called "standard" in CLDR is represented + // as `variant: None`. It is the default for all but two languages: + // sv and zh. Since there are only two special cases, hard-coding them + // here for now instead of making the defaulting fancy and data driven. + // The Swedish naming seems ad hoc from + // https://unicode-org.atlassian.net/browse/CLDR-679 . + let variant = if let Some(extension) = locale.get_unicode_extension(&key) { + // Surely there should be a non-allocating way to get a + // &str out of a single-component extension. + let extension_string = extension.to_string(); + match extension_string.as_str() { + "trad" => Some("traditional".into()), + "phonebk" => Some("phonebook".into()), + "dict" => Some("dictionary".into()), + "gb2312" => Some("gb2312han".into()), + "standard" => None, + _ => { + // XXX Test that the provider ignores non-matching + // variants. 
+ + // `Locale` should enforce that it's impossible + // to get these values that are longer than + // 8 ASCII characters: + debug_assert_ne!(extension_string, "traditional"); + debug_assert_ne!(extension_string, "phonebook"); + debug_assert_ne!(extension_string, "dictionary"); + debug_assert_ne!(extension_string, "gb2312han"); + Some(extension_string.into()) + } + } + } else { + if &langid.language == &"zh" { + Some("pinyin".into()) + } else if &langid.language == &"sv" { + Some("reformed".into()) + } else { + None + } + }; + + // TODO: Handle POSIX, which is part of the language instead + // of the -u-co- extension. + let metadata_payload: DataPayload = data_provider .load_resource(&DataRequest { options: ResourceOptions { langid: Some(langid.clone()), - variant: None, + variant: variant.clone(), }, metadata: Default::default(), })? @@ -2239,7 +2289,7 @@ impl<'data> Collator { .load_resource(&DataRequest { options: ResourceOptions { langid: Some(langid.clone()), - variant: None, + variant: variant, }, metadata: Default::default(), })? @@ -2267,6 +2317,8 @@ impl<'data> Collator { } else { Some(LanguageIdentifier::und()) }, + // XXX: Check if this should go inside the condition + // above. variant: None, }, metadata: Default::default(), @@ -2280,11 +2332,13 @@ impl<'data> Collator { let jamo: DataPayload = data_provider .load_resource(&DataRequest { options: ResourceOptions { - langid: if metadata.tailored_jamo() { - Some(langid.clone()) - } else { - Some(LanguageIdentifier::und()) - }, + // TODO: load other jamo tables. + // langid: if metadata.tailored_jamo() { + // Some(langid.clone()) + // } else { + // Some(LanguageIdentifier::und()) + // }, + langid: Some(LanguageIdentifier::und()), variant: None, }, metadata: Default::default(), @@ -3580,6 +3634,23 @@ mod tests { } } + // TODO: This test should eventually test fallback + // TODO: Test Swedish and Chinese, also, since they have unusual + // variant defaults. (But are currently not part of the test data.) 
+ #[test] + fn test_region_fallback() { + // There's no explicit fi-FI data. + let locale: Locale = "fi-u-co-standard".parse().unwrap(); + + // let locale: Locale = langid!("fi-FI").into(); + + let data_provider = icu_testdata::get_provider(); + + let collator: Collator = + Collator::try_new(locale, &data_provider, CollatorOptions::new()).unwrap(); + assert_eq!(collator.compare("ä", "z"), Ordering::Greater); + } + // TODO: frcoll requires support for fr-CA // TODO: Write a test for Bangla