diff --git a/provider/baked/collator/data/macros/collator_meta_v1.rs.data b/provider/baked/collator/data/macros/collator_meta_v1.rs.data index 821f1c5774d..20abe13104e 100644 --- a/provider/baked/collator/data/macros/collator_meta_v1.rs.data +++ b/provider/baked/collator/data/macros/collator_meta_v1.rs.data @@ -20,8 +20,8 @@ macro_rules! __impl_collator_meta_v1 { static AR: ::Yokeable = icu::collator::provider::CollationMetadataV1 { bits: 41u32 }; static LT: ::Yokeable = icu::collator::provider::CollationMetadataV1 { bits: 73u32 }; static AF: ::Yokeable = icu::collator::provider::CollationMetadataV1 { bits: 9u32 }; - static VALUES: [&::Yokeable; 87usize] = [&AF, &AM, &AR, &AR, &AR, &AR, &AM, &AR, &AF, &AR, &AF, &AM, &AF, &AF, &DA, &AF, &AF, &AM, &AF, &AF, &AF, &AF, &AR, &AF, &AF, &AF, &AF, &FR_CA, &AF, &AF, &AR, &AF, &AM, &AR, &AR, &AF, &AF, &AR, &AF, &AF, &AR, &AM, &AR, &AR, &AR, &AR, &AR, &AR, &AR, &AM, <, &AF, &AR, &AR, &AM, &AR, &DA, &AR, &AM, &AF, &AR, &AR, &AF, &AR, &AF, &AM, &AR, &AF, &AF, &AF, &AR, &AF, &AR, &AR, &TH, &AF, &AF, &AF, &AR, &AR, &UND, &AR, &AF, &VI, &AF, &AF, &AR]; - static KEYS: [&str; 87usize] = ["af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ceb", "chr", "cs", "cy", "da", "de-u-co-phonebk", "dsb", "el", "en-US-posix", "eo", "es", "et", "fa", "ff-Adlm", "fi", "fil", "fo", "fr-CA", "fy", "gl", "gu", "ha", "he", "hi", "hr", "hsb", "hu", "hy", "ig", "is", "ja", "ka", "kk", "km", "kn", "ko", "kok", "ku", "ky", "lo", "lt", "lv", "mk", "ml", "mn", "mr", "mt", "my", "ne", "no", "or", "pa", "pl", "ps", "ro", "ru", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "th", "tk", "to", "tr", "ug", "uk", "und", "ur", "uz", "vi", "wo", "yo", "zh"]; + static VALUES: [&::Yokeable; 88usize] = [&AF, &AM, &AR, &AR, &AR, &AR, &AM, &AR, &AF, &AR, &AF, &AM, &AF, &AF, &DA, &AF, &AF, &AF, &AM, &AF, &AF, &AF, &AF, &AR, &AF, &AF, &AF, &AF, &FR_CA, &AF, &AF, &AR, &AF, &AM, &AR, &AR, &AF, &AF, &AR, &AF, &AF, &AR, &AM, &AR, &AR, &AR, &AR, &AR, &AR, &AR, &AM, <, &AF, &AR, &AR, &AM, &AR, &DA, &AR, &AM, &AF, &AR, &AR, &AF, &AR, &AF, &AM, &AR, &AF, &AF, &AF, &AR, &AF, &AR, &AR, &TH, &AF, &AF, &AF, &AR, &AR, &UND, &AR, &AF, &VI, &AF, &AF, &AR]; + static KEYS: [&str; 88usize] = ["af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ceb", "chr", "cs", "cy", "da", "de-AT-u-co-phonebk", "de-u-co-phonebk", "dsb", "el", "en-US-posix", "eo", "es", "et", "fa", "ff-Adlm", "fi", "fil", "fo", "fr-CA", "fy", "gl", "gu", "ha", "he", "hi", "hr", "hsb", "hu", "hy", "ig", "is", "ja", "ka", "kk", "km", "kn", "ko", "kok", "ku", "ky", "lo", "lt", "lv", "mk", "ml", "mn", "mr", "mt", "my", "ne", "no", "or", "pa", "pl", "ps", "ro", "ru", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "th", "tk", "to", "tr", "ug", "uk", "und", "ur", "uz", "vi", "wo", "yo", "zh"]; let mut metadata = icu_provider::DataResponseMetadata::default(); let payload = if let Ok(payload) = KEYS.binary_search_by(|k| req.locale.strict_cmp(k.as_bytes()).reverse()).map(|i| *unsafe { VALUES.get_unchecked(i) }) { payload diff --git a/provider/blob/src/export/mod.rs b/provider/blob/src/export/mod.rs index ad4886617bc..4292b0c50b7 100644 --- a/provider/blob/src/export/mod.rs +++ b/provider/blob/src/export/mod.rs @@ -23,7 +23,7 @@ //! DatagenDriver::new() //! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) //! .with_all_locales() -//! .export(&DatagenProvider::new_latest_tested(), exporter) +//! .export(&icu_provider::hello_world::HelloWorldProvider, exporter) //! .unwrap(); //! //! // communicate the blob to the client application (network, disk, etc.) diff --git a/provider/blob/tests/test_versions.rs b/provider/blob/tests/test_versions.rs index f254638b74b..427c7e00c41 100644 --- a/provider/blob/tests/test_versions.rs +++ b/provider/blob/tests/test_versions.rs @@ -16,7 +16,7 @@ fn run_driver(exporter: BlobExporter) -> Result<(), DataError> { DatagenDriver::new() .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) .with_locales_and_fallback([LocaleFamily::full()], Default::default()) - .export(&DatagenProvider::new_custom(), exporter) + .export(&icu_provider::hello_world::HelloWorldProvider, exporter) } fn check_hello_world(blob_provider: impl DataProvider) { diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 9c36ced0c86..2c7517e4ca7 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -16,6 +16,7 @@ use once_cell::sync::Lazy; use std::collections::HashMap; use std::collections::HashSet; use std::fmt; +use std::hash::Hash; use std::str::FromStr; use std::time::Duration; use std::time::Instant; @@ -75,12 +76,52 @@ pub enum DeduplicationStrategy { None, } +/// Inner fields of a [`LocaleFamily`]. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub(crate) struct LocaleFamilyAnnotations { + include_ancestors: bool, + include_descendants: bool, +} + +impl LocaleFamilyAnnotations { + #[inline] + pub(crate) const fn with_descendants() -> Self { + Self { + include_ancestors: true, + include_descendants: true, + } + } + + #[inline] + pub(crate) const fn without_descendants() -> Self { + Self { + include_ancestors: true, + include_descendants: false, + } + } + + #[inline] + pub(crate) const fn without_ancestors() -> Self { + Self { + include_ancestors: false, + include_descendants: true, + } + } + + #[inline] + pub(crate) const fn single() -> Self { + Self { + include_ancestors: false, + include_descendants: false, + } + } +} + /// A family of locales to export. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct LocaleFamily { langid: Option, - include_ancestors: bool, - include_descendants: bool, + annotations: LocaleFamilyAnnotations, } impl LocaleFamily { @@ -95,11 +136,10 @@ impl LocaleFamily { /// - Descendants: "en-GB", "en-ZA", ... /// /// Stylized on the CLI as: "en-US" - pub fn with_descendants(langid: LanguageIdentifier) -> Self { + pub const fn with_descendants(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), - include_ancestors: true, - include_descendants: true, + annotations: LocaleFamilyAnnotations::with_descendants(), } } @@ -114,11 +154,29 @@ impl LocaleFamily { /// - Ancestors: "und", "en" /// /// Stylized on the CLI as: "^en-US" - pub fn without_descendants(langid: LanguageIdentifier) -> Self { + pub const fn without_descendants(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), - include_ancestors: true, - include_descendants: false, + annotations: LocaleFamilyAnnotations::without_descendants(), + } + } + + /// The family containing all descendants of the selected locale. + /// + /// This family may be useful if the root locale is not desired. + /// + /// For example, the family `::without_ancestors("en-001")` contains: + /// + /// - Self: "en-001" + /// - Descendants: "en-GB", "en-ZA", ... + /// + /// but it does _not_ contain the ancestors "en" and "und". + /// + /// Stylized on the CLI as: "%en-US" + pub const fn without_ancestors(langid: LanguageIdentifier) -> Self { + Self { + langid: Some(langid), + annotations: LocaleFamilyAnnotations::without_ancestors(), } } @@ -127,43 +185,89 @@ impl LocaleFamily { /// For example, the family `::single("en-001")` contains only "en-001". /// /// Stylized on the CLI as: "@en-US" - pub fn single(langid: LanguageIdentifier) -> Self { + pub const fn single(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), - include_ancestors: false, - include_descendants: false, + annotations: LocaleFamilyAnnotations::single(), } } /// The family containing all locales. /// /// Stylized on the CLI as: "full" - pub fn full() -> Self { + pub const fn full() -> Self { Self { langid: None, - include_ancestors: false, - include_descendants: true, + annotations: LocaleFamilyAnnotations { + include_ancestors: false, + include_descendants: true, + }, + } + } + + pub(crate) fn into_parts(self) -> (Option, LocaleFamilyAnnotations) { + (self.langid, self.annotations) + } + + pub(crate) fn as_borrowed(&self) -> LocaleFamilyBorrowed { + LocaleFamilyBorrowed { + langid: self.langid.as_ref(), + annotations: self.annotations, } } } impl Writeable for LocaleFamily { + #[inline] + fn write_to(&self, sink: &mut W) -> core::fmt::Result { + self.as_borrowed().write_to(sink) + } + + #[inline] + fn writeable_length_hint(&self) -> writeable::LengthHint { + self.as_borrowed().writeable_length_hint() + } +} + +writeable::impl_display_with_writeable!(LocaleFamily); + +/// A [`LocaleFamily`] that does not own its [`LanguageIdentifier`]. +pub(crate) struct LocaleFamilyBorrowed<'a> { + langid: Option<&'a LanguageIdentifier>, + annotations: LocaleFamilyAnnotations, +} + +impl<'a> LocaleFamilyBorrowed<'a> { + pub(crate) fn from_parts( + inner: (&'a Option, &LocaleFamilyAnnotations), + ) -> Self { + Self { + langid: inner.0.as_ref(), + annotations: *inner.1, + } + } +} + +impl Writeable for LocaleFamilyBorrowed<'_> { fn write_to(&self, sink: &mut W) -> core::fmt::Result { match ( &self.langid, - self.include_ancestors, - self.include_descendants, + self.annotations.include_ancestors, + self.annotations.include_descendants, ) { (Some(langid), true, true) => langid.write_to(sink), (Some(langid), true, false) => { sink.write_char('^')?; langid.write_to(sink) } + (Some(langid), false, true) => { + sink.write_char('%')?; + langid.write_to(sink) + } (Some(langid), false, false) => { sink.write_char('@')?; langid.write_to(sink) } - (Some(_), false, true) => unreachable!(), (None, _, _) => sink.write_str("full"), } } @@ -171,20 +275,18 @@ impl Writeable for LocaleFamily { fn writeable_length_hint(&self) -> writeable::LengthHint { match ( &self.langid, - self.include_ancestors, - self.include_descendants, + self.annotations.include_ancestors, + self.annotations.include_descendants, ) { (Some(langid), true, true) => langid.writeable_length_hint(), (Some(langid), true, false) => langid.writeable_length_hint() + 1, + (Some(langid), false, true) => langid.writeable_length_hint() + 1, (Some(langid), false, false) => langid.writeable_length_hint() + 1, - (Some(_), false, true) => unreachable!(), (None, _, _) => writeable::LengthHint::exact(4), } } } -writeable::impl_display_with_writeable!(LocaleFamily); - /// An error while parsing a [`LocaleFamily`]. #[derive(Debug, Copy, Clone, PartialEq, Display)] #[non_exhaustive] @@ -218,18 +320,19 @@ impl FromStr for LocaleFamily { match first { b'^' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - include_ancestors: true, - include_descendants: false, + annotations: LocaleFamilyAnnotations::without_descendants(), + }), + b'%' => Ok(Self { + langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), + annotations: LocaleFamilyAnnotations::without_ancestors(), }), b'@' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - include_ancestors: false, - include_descendants: false, + annotations: LocaleFamilyAnnotations::single(), }), b if b.is_ascii_alphanumeric() => Ok(Self { langid: Some(s.parse()?), - include_ancestors: true, - include_descendants: true, + annotations: LocaleFamilyAnnotations::with_descendants(), }), _ => Err(LocaleFamilyParseError::InvalidFamily), } @@ -238,7 +341,7 @@ impl FromStr for LocaleFamily { #[test] fn test_locale_family_parsing() { - let valid_families = ["und", "de-CH", "^es", "@pt-BR", "full"]; + let valid_families = ["und", "de-CH", "^es", "@pt-BR", "%en-001", "full"]; let invalid_families = ["invalid", "@invalid", "-foo", "@full", "full-001"]; for family_str in valid_families { let family = family_str.parse::().unwrap(); @@ -269,11 +372,11 @@ pub struct FallbackOptions { #[derive(Debug, Clone)] enum LocalesWithOrWithoutFallback { WithFallback { - locales: HashSet, + families: HashMap, LocaleFamilyAnnotations>, options: FallbackOptions, }, WithoutFallback { - locales: HashSet, + langids: HashSet, }, } @@ -381,7 +484,7 @@ impl DatagenDriver { ) -> Self { Self { locales_fallback: Some(LocalesWithOrWithoutFallback::WithoutFallback { - locales: locales.into_iter().collect(), + langids: locales.into_iter().collect(), }), ..self } @@ -392,6 +495,9 @@ impl DatagenDriver { /// Use the [`langid!`] macro from the prelude to create an /// explicit list, or [`DatagenProvider::locales_for_coverage_levels`] for CLDR coverage levels. /// + /// If there are multiple [`LocaleFamily`]s for the same [`LanguageIdentifier`], the last entry + /// in the iterator takes precedence. + /// /// [`langid!`]: crate::prelude::langid /// [`DatagenProvider::locales_for_coverage_levels`]: crate::DatagenProvider::locales_for_coverage_levels pub fn with_locales_and_fallback( @@ -401,7 +507,7 @@ impl DatagenDriver { ) -> Self { Self { locales_fallback: Some(LocalesWithOrWithoutFallback::WithFallback { - locales: locales.into_iter().collect(), + families: locales.into_iter().map(LocaleFamily::into_parts).collect(), options, }), ..self @@ -526,8 +632,15 @@ impl DatagenDriver { let map_legacy_locales_to_locales_with_expansion = |legacy_locales: Option>| match legacy_locales { - Some(v) => v.into_iter().map(LocaleFamily::with_descendants).collect(), - None => [LocaleFamily::full()].into_iter().collect(), + Some(v) => v + .into_iter() + .map(LocaleFamily::with_descendants) + .map(LocaleFamily::into_parts) + .collect(), + None => [LocaleFamily::full()] + .into_iter() + .map(LocaleFamily::into_parts) + .collect(), }; let locales_fallback = match (locales_fallback, legacy_locales, legacy_fallback_mode) { @@ -536,7 +649,7 @@ impl DatagenDriver { // 1.4 API (_, Some(legacy_locales), FallbackMode::PreferredForExporter) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: None, deduplication_strategy: None, @@ -545,7 +658,7 @@ impl DatagenDriver { } (_, Some(legacy_locales), FallbackMode::Runtime) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: Some(RuntimeFallbackLocation::Internal), deduplication_strategy: Some(DeduplicationStrategy::Maximal), @@ -554,7 +667,7 @@ impl DatagenDriver { } (_, Some(legacy_locales), FallbackMode::RuntimeManual) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: Some(RuntimeFallbackLocation::External), deduplication_strategy: Some(DeduplicationStrategy::Maximal), @@ -563,7 +676,7 @@ impl DatagenDriver { } (_, Some(Some(locales)), FallbackMode::Preresolved) => { LocalesWithOrWithoutFallback::WithoutFallback { - locales: locales.into_iter().collect(), + langids: locales.into_iter().collect(), } } (_, Some(None), FallbackMode::Preresolved) => { @@ -573,7 +686,7 @@ impl DatagenDriver { } (_, Some(legacy_locales), FallbackMode::Hybrid) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: Some(RuntimeFallbackLocation::External), deduplication_strategy: Some(DeduplicationStrategy::None), @@ -593,17 +706,19 @@ impl DatagenDriver { } let (uses_internal_fallback, deduplication_strategy) = match &locales_fallback { - LocalesWithOrWithoutFallback::WithoutFallback { locales } => { - let mut sorted_locales = - locales.iter().map(ToString::to_string).collect::>(); - sorted_locales.sort(); + LocalesWithOrWithoutFallback::WithoutFallback { langids } => { + let mut sorted_locales = langids + .iter() + .map(|x| x.write_to_string()) + .collect::>(); + sorted_locales.sort_unstable(); log::info!( "Datagen configured without fallback with these locales: {:?}", sorted_locales ); (false, DeduplicationStrategy::None) } - LocalesWithOrWithoutFallback::WithFallback { options, locales } => { + LocalesWithOrWithoutFallback::WithFallback { options, families } => { let uses_internal_fallback = match options.runtime_fallback_location { None => sink.supports_built_in_fallback(), Some(RuntimeFallbackLocation::Internal) => true, @@ -620,15 +735,18 @@ impl DatagenDriver { } Some(x) => x, }; - let mut sorted_locales = - locales.iter().map(ToString::to_string).collect::>(); - sorted_locales.sort(); + let mut sorted_locales = families + .iter() + .map(LocaleFamilyBorrowed::from_parts) + .map(|family| family.write_to_string().into_owned()) + .collect::>(); + sorted_locales.sort_unstable(); log::info!( "Datagen configured with {}, {}, and these locales: {:?}", if uses_internal_fallback { "internal fallback" } else { - "external fallback " + "external fallback" }, match deduplication_strategy { DeduplicationStrategy::Maximal => "maximal deduplication", @@ -828,65 +946,6 @@ impl DatagenDriver { } } -struct ExplicitImplicitLocaleSets { - explicit: HashSet, - implicit: HashSet, -} - -/// Resolves the set of explicit langids and the supported locales into two sets of locales: -/// -/// - `explicit` contains the explicit langids but with aux keys and extension keywords included. -/// For example, if `ar-SA` is requested (explicit langid), and `ar` and `ar-u-nu-latn` are supported, -/// then `ar-SA` and `ar-SA-u-nu-latn` will be returned as `explicit`. -/// - `implcit` contains all supported locales reachable by fallback from an `explicit` locale. -/// These locales can be included without increasing data payload size. -fn make_explicit_implicit_sets( - key: DataKey, - explicit_langids: &mut dyn Iterator, - supported_map: &HashMap>, - fallbacker: &Lazy< - Result, - impl FnOnce() -> Result, - >, -) -> Result { - let mut implicit = HashSet::new(); - let mut explicit: HashSet = Default::default(); - for (explicit_langid, include_ancestors) in explicit_langids { - explicit.insert(explicit_langid.into()); - if let Some(locales) = supported_map.get(explicit_langid) { - explicit.extend(locales.iter().cloned()); // adds ar-EG-u-nu-latn - } - if explicit_langid == &LanguageIdentifier::UND { - continue; - } - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); - let mut iter = fallbacker_with_config.fallback_for(explicit_langid.into()); - loop { - if include_ancestors { - implicit.insert(iter.get().clone()); - } - if iter.get().is_und() { - break; - } - // Inherit aux keys and extension keywords from parent locales - let iter_langid = iter.get().get_langid(); - if let Some(locales) = supported_map.get(&iter_langid) { - if include_ancestors { - implicit.extend(locales.iter().cloned()); // adds ar-u-nu-latn - } - for locale in locales { - let mut morphed_locale = locale.clone(); - morphed_locale.set_langid(explicit_langid.clone()); - explicit.insert(morphed_locale); // adds ar-SA-u-nu-latn - } - } - iter.step(); - } - } - Ok(ExplicitImplicitLocaleSets { explicit, implicit }) -} - /// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen /// provider's options bag. The locales may be later optionally deduplicated for fallback. fn select_locales_for_key( @@ -900,18 +959,17 @@ fn select_locales_for_key( impl FnOnce() -> Result, >, ) -> Result, DataError> { - // A map from langid to data locales. Keys that have aux keys or extension keywords - // may have multiple data locales per langid. - let mut supported_map: HashMap> = Default::default(); + // Map from all supported LanguageIdentifiers to their + // corresponding supported DataLocales. + let mut supported_map = HashMap::>::new(); for locale in provider .supported_locales_for_key(key) .map_err(|e| e.with_key(key))? { - use std::collections::hash_map::Entry; - match supported_map.entry(locale.get_langid()) { - Entry::Occupied(mut entry) => entry.get_mut().insert(locale), - Entry::Vacant(entry) => entry.insert(Default::default()).insert(locale), - }; + supported_map + .entry(locale.get_langid()) + .or_default() + .insert(locale); } if key.path().get().starts_with("segmenter/dictionary/") { @@ -954,84 +1012,125 @@ fn select_locales_for_key( }); } - let locale_families = match locales_fallback { - // `FallbackMode::Preresolved` exports all supported locales whose langid matches - // one of the explicit locales. This ensures extensions are included. In addition, any - // explicit locales are added to the list, even if they themselves don't contain data; - // fallback should be performed upon exporting. - LocalesWithOrWithoutFallback::WithoutFallback { locales, .. } => { - let mut it = locales.iter().map(|langid| (langid, false)); - let ExplicitImplicitLocaleSets { explicit, .. } = - make_explicit_implicit_sets(key, &mut it, &supported_map, fallbacker)?; - return Ok(explicit); - } - // All other modes resolve to fallback-aware inclusion. - LocalesWithOrWithoutFallback::WithFallback { locales, .. } => locales, - }; + // The explicitly requested families, except for the `full` family. + let mut include_full = false; + let requested_families: HashMap = + match locales_fallback { + LocalesWithOrWithoutFallback::WithFallback { families, .. } if families.is_empty() => { + // If no locales are selected but fallback is enabled, select the root locale + [(LanguageIdentifier::UND, LocaleFamilyAnnotations::single())] + .into_iter() + .collect() + } + LocalesWithOrWithoutFallback::WithFallback { families, .. } => families + .iter() + .filter_map(|(langid, annotations)| { + if let Some(langid) = langid.as_ref() { + if *langid == LanguageIdentifier::UND { + // Root locale: do not include descendants (use `full` for that) + Some((LanguageIdentifier::UND, LocaleFamilyAnnotations::single())) + } else { + // All other locales: copy the requested annotations + Some((langid.clone(), *annotations)) + } + } else { + // Full locale family: set the bit instead of adding to the set + debug_assert_eq!(annotations, &LocaleFamily::full().annotations); + include_full = true; + None + } + }) + .collect(), + LocalesWithOrWithoutFallback::WithoutFallback { langids } => langids + .iter() + // Map langids without fallback to the `single` family + .map(|langid| (langid.clone(), LocaleFamilyAnnotations::single())) + .collect(), + }; - let mut it = locale_families.iter().filter_map(|x| { - x.langid - .as_ref() - .map(|langid| (langid, x.include_ancestors)) - }); - let ExplicitImplicitLocaleSets { explicit, implicit } = - make_explicit_implicit_sets(key, &mut it, &supported_map, fallbacker)?; - - let supported_and_explicit = supported_map - .into_values() - .flatten() - .chain(explicit.iter().cloned()); - - // Need to check this ahead of time because we can avoid loading the fallbacker. - if locale_families.contains(&LocaleFamily::full()) { - return Ok(supported_and_explicit.collect()); + if include_full && requested_families.is_empty() { + // Special case: return now so we don't need the fallbacker (and its requisite CLDR data) + let selected_locales = supported_map.into_values().flatten().collect(); + return Ok(selected_locales); } + // Need the fallbacker now. let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); - // TODO(#4629): Allow the exclusion of und - let include_und = true; + // The "candidate" langids that could be exported is the union of requested and supported. + let all_candidate_langids = supported_map + .keys() + .chain(requested_families.keys()) + .collect::>(); - let result = supported_and_explicit - .filter(|locale_orig| { - let mut locale = locale_orig.clone(); - locale.remove_aux(); - if implicit.contains(&locale) { - return true; - } - if explicit.contains(&locale) { - return true; - } - if locale.is_langid_und() && include_und { - return true; - } - if locale.language().is_empty() - && matches!( - key.fallback_config().priority, - icu_provider::FallbackPriority::Region - ) - { - return true; + // Compute a map from LanguageIdentifiers to DataLocales, including inherited auxiliary keys + // and extensions. Also resolve the ancestors and descendants while building this map. + let mut selected_langids = requested_families.keys().cloned().collect::>(); + let expansion_map: HashMap<&LanguageIdentifier, HashSet> = all_candidate_langids + .into_iter() + .map(|current_langid| { + let mut expansion = supported_map + .get(current_langid) + .cloned() + .unwrap_or_default(); + if include_full && !selected_langids.contains(current_langid) { + log::trace!("Including {current_langid}: full locale family: {key}"); + selected_langids.insert(current_langid.clone()); } - // Special case: skeletons *require* the -u-ca keyword, so don't export locales that don't have it - // This would get caught later on, but it makes datagen faster and quieter to catch it here - if key.path().get() == "datetime/skeletons@1" && !locale.has_unicode_ext() { - return false; + if current_langid.language.is_empty() && current_langid != &LanguageIdentifier::UND { + log::trace!("Including {current_langid}: und variant: {key}"); + selected_langids.insert(current_langid.clone()); } - let mut iter = fallbacker_with_config.fallback_for(locale); - while !iter.get().is_und() { - if explicit.contains(iter.get()) { - return true; + let include_ancestors = requested_families + .get(current_langid) + .map(|family| family.include_ancestors) + // default to `false` if the langid was not requested + .unwrap_or(false); + let mut iter = fallbacker_with_config.fallback_for(current_langid.into()); + loop { + // Inherit aux keys and extension keywords from parent locales + let parent_langid: LanguageIdentifier = iter.get().get_langid(); + let maybe_parent_locales = supported_map.get(&parent_langid); + let include_descendants = requested_families + .get(&parent_langid) + .map(|family| family.include_descendants) + // default to `false` if the langid was not requested + .unwrap_or(false); + if include_descendants && !selected_langids.contains(current_langid) { + log::trace!("Including {current_langid}: descendant of {parent_langid}: {key}"); + selected_langids.insert(current_langid.clone()); + } + if include_ancestors && !selected_langids.contains(&parent_langid) { + log::trace!("Including {parent_langid}: ancestor of {current_langid}: {key}"); + selected_langids.insert(parent_langid); + } + if let Some(parent_locales) = maybe_parent_locales { + for morphed_locale in parent_locales.iter() { + // Special case: don't pull extensions or aux keys up from the root. + if morphed_locale.is_langid_und() && !morphed_locale.is_empty() { + continue; + } + let mut morphed_locale = morphed_locale.clone(); + morphed_locale.set_langid(current_langid.clone()); + expansion.insert(morphed_locale); + } + } + if iter.get().is_und() { + break; } iter.step(); } - log::trace!("Filtered out: {key}/{locale_orig}"); // this will print aux keys too but it avoids a clone - false + (current_langid, expansion) }) .collect(); - Ok(result) + let selected_locales = expansion_map + .into_iter() + .filter(|(langid, _)| selected_langids.contains(langid)) + .flat_map(|(_, data_locales)| data_locales) + .collect(); + Ok(selected_locales) } fn deduplicate_payloads( @@ -1196,13 +1295,18 @@ fn test_collation_filtering() { language: langid!("ko"), expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], }, + TestCase { + include_collations: &[], + language: langid!("und"), + expected: &["und", "und-u-co-emoji", "und-u-co-eor"], + }, ]; for cas in cases { let resolved_locales = select_locales_for_key( &crate::provider::DatagenProvider::new_testing(), icu_collator::provider::CollationDataV1Marker::KEY, &LocalesWithOrWithoutFallback::WithoutFallback { - locales: [cas.language.clone()].into_iter().collect(), + langids: [cas.language.clone()].into_iter().collect(), }, &HashSet::from_iter(cas.include_collations.iter().copied().map(String::from)), &[], @@ -1221,3 +1325,33 @@ fn test_collation_filtering() { assert_eq!(resolved_locales, expected_locales, "{cas:?}"); } } + +/// Test that the last option with multiple conflicting families wins. +#[test] +fn test_family_precedence() { + let driver = DatagenDriver::new().with_locales_and_fallback( + [ + "en".parse().unwrap(), + "%en".parse().unwrap(), + "@en".parse().unwrap(), + "%zh-TW".parse().unwrap(), + "^zh-TW".parse().unwrap(), + ], + Default::default(), + ); + + let Some(LocalesWithOrWithoutFallback::WithFallback { families, .. }) = driver.locales_fallback + else { + panic!("expected locales with fallback") + }; + + assert_eq!( + families, + [ + "@en".parse::().unwrap().into_parts(), + "^zh-TW".parse::().unwrap().into_parts() + ] + .into_iter() + .collect::>() + ); +} diff --git a/provider/datagen/tests/test-options.rs b/provider/datagen/tests/test-options.rs index fd711dc648b..ba5c7db895b 100644 --- a/provider/datagen/tests/test-options.rs +++ b/provider/datagen/tests/test-options.rs @@ -653,6 +653,141 @@ fn explicit_preresolved() { assert_eq!(exported.keys().collect::>(), locales); } +#[test] +fn explicit_hybrid_without_descendants() { + const SELECTED_LOCALES: [LocaleFamily; 7] = [ + LocaleFamily::without_descendants(langid!("arc")), // Aramaic, not in supported list + LocaleFamily::without_descendants(langid!("ar-EG")), + LocaleFamily::without_descendants(langid!("ar-SA")), + LocaleFamily::without_descendants(langid!("en-GB")), + LocaleFamily::without_descendants(langid!("es")), + LocaleFamily::without_descendants(langid!("sr-ME")), + LocaleFamily::without_descendants(langid!("ru-Cyrl-RU")), + ]; + let exported = export_to_map_1_5( + DatagenDriver::new() + .with_keys([HelloWorldV1Marker::KEY]) + .with_locales_and_fallback(SELECTED_LOCALES, Default::default()), + &TestingProvider::with_decimal_symbol_like_data(), + ); + + // Explicit locales are "arc", "ar-EG", "ar-SA", "en-GB", "es", "sr-ME", "ru-Cyrl-RU" + let locales = [ + "ar", // ancestor of ar-EG + "ar-EG", // explicit locale + "ar-EG-u-nu-latn", // explicit with extensions + "ar-SA", // explicit locale, inheriting from ar + "ar-SA-u-nu-latn", // extensions should be included (#4533) + "ar-u-nu-latn", // extensions should be included (#4533) + "arc", // Aramaic, inheriting from und + "en", // ancestor of en-GB + "en-001", // ancestor of en-GB + "en-GB", // explicit locale not in supported locales + // "en-ZA", // not reachable + "es", // explicit and supported + // "es-AR", // excluded: descendant of es + "ru", // ancestor of ru-Cyrl-RU + "ru-Cyrl-RU", // explicit locale, even though it is not normalized + // "sr", // not reachable from sr-ME + "sr-Latn", // ancestor of sr-ME + "sr-ME", // explicit locale not in supported locales + "und", // ancestor of everything + ]; + + // Should return the exact explicit locales set. + assert_eq!(exported.keys().collect::>(), locales); +} + +#[test] +fn explicit_hybrid_without_ancestors() { + const SELECTED_LOCALES: [LocaleFamily; 7] = [ + LocaleFamily::without_ancestors(langid!("arc")), // Aramaic, not in supported list + LocaleFamily::without_ancestors(langid!("ar-EG")), + LocaleFamily::without_ancestors(langid!("ar-SA")), + LocaleFamily::without_ancestors(langid!("en-GB")), + LocaleFamily::without_ancestors(langid!("es")), + LocaleFamily::without_ancestors(langid!("sr-ME")), + LocaleFamily::without_ancestors(langid!("ru-Cyrl-RU")), + ]; + let exported = export_to_map_1_5( + DatagenDriver::new() + .with_keys([HelloWorldV1Marker::KEY]) + .with_locales_and_fallback(SELECTED_LOCALES, Default::default()), + &TestingProvider::with_decimal_symbol_like_data(), + ); + + // Explicit locales are "arc", "ar-EG", "ar-SA", "en-GB", "es", "sr-ME", "ru-Cyrl-RU" + let locales = [ + // "ar", // excluded: ancestor of ar-EG + "ar-EG", // explicit locale + "ar-EG-u-nu-latn", // explicit with extensions + "ar-SA", // explicit locale, inheriting from ar + "ar-SA-u-nu-latn", // extensions should be included (#4533) + // "ar-u-nu-latn", // excluded: ancestor of ar-EG + "arc", // Aramaic, inheriting from und + // "en", // excluded: ancestor of en-GB + // "en-001", // excluded: ancestor of en-GB + "en-GB", // explicit locale not in supported locales + // "en-ZA", // not reachable + "es", // explicit and supported + "es-AR", // descendant of es + // "ru", // excluded: ancestor of ru-Cyrl-RU + "ru-Cyrl-RU", // explicit locale, even though it is not normalized + // "sr", // not reachable from sr-ME + // "sr-Latn", // excluded: ancestor of sr-ME + "sr-ME", // explicit locale not in supported locales + // "und", // excluded: ancestor of everything + ]; + + // Should return the exact explicit locales set. + assert_eq!(exported.keys().collect::>(), locales); +} + +#[test] +fn explicit_hybrid_mixed_families() { + const SELECTED_LOCALES: [LocaleFamily; 8] = [ + LocaleFamily::without_ancestors(langid!("arc")), // Aramaic, not in supported list + LocaleFamily::with_descendants(langid!("ar-EG")), + LocaleFamily::without_ancestors(langid!("ar-EG")), // duplicate entry for ar-EG + LocaleFamily::with_descendants(langid!("en")), + LocaleFamily::single(langid!("en")), // duplicate entry for en + LocaleFamily::without_ancestors(langid!("en-GB")), + LocaleFamily::without_descendants(langid!("es")), + LocaleFamily::with_descendants(langid!("es")), // duplicate entry for es + ]; + let exported = export_to_map_1_5( + DatagenDriver::new() + .with_keys([HelloWorldV1Marker::KEY]) + .with_locales_and_fallback(SELECTED_LOCALES, Default::default()), + &TestingProvider::with_decimal_symbol_like_data(), + ); + + let locales = [ + // "ar", // excluded: ancestor of ar-EG + "ar-EG", // explicit locale + "ar-EG-u-nu-latn", // explicit with extensions + // "ar-SA", // explicit locale, inheriting from ar + // "ar-SA-u-nu-latn", // not reachable + // "ar-u-nu-latn", // not reachable + "arc", // Aramaic, inheriting from und + "en", // included as a singleton + // "en-001", // excluded: ancestor of en-GB + "en-GB", // included without ancestors + // "en-ZA", // not reachable + "es", // explicit and supported + "es-AR", // descendant of es + // "ru", // not requested + // "ru-Cyrl-RU", // not requested + // "sr", // not requested + // "sr-Latn", // not requested + // "sr-ME", // not requested + "und", + ]; + + // Should return the exact explicit locales set. + assert_eq!(exported.keys().collect::>(), locales); +} + #[test] fn explicit_runtime_und() { let exported = export_to_map(