Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix datagen to propagate extension keywords and aux keys to child locales #4533

Merged
merged 9 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
- Datagen shows elapsed time for keys that are slow to generate (https://github.com/unicode-org/icu4x/pull/4469)
- Datagen performance improvement by caching supported locales (https://github.com/unicode-org/icu4x/pull/4470)
- Never use fallback for baked segmentation data (https://github.com/unicode-org/icu4x/pull/4510)
- Propagate extension keywords and auxiliary keys to explicit locales (https://github.com/unicode-org/icu4x/pull/4533)
- `icu_provider`
- (Small breakage) `DataPayload::new_owned()` is no longer `const`, this was a mistake (https://github.com/unicode-org/icu4x/pull/4456)
- `icu_provider_blob`
Expand Down
161 changes: 112 additions & 49 deletions provider/datagen/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -468,97 +468,160 @@ impl DatagenDriver {
}
}

/// The pair of locale sets returned by `make_explicit_implicit_sets`.
struct ExplicitImplicitLocaleSets {
/// The explicit langids, with aux keys and extension keywords included.
explicit: HashSet<DataLocale>,
/// Locales reachable by fallback from an `explicit` locale.
implicit: HashSet<DataLocale>,
}

/// Resolves the set of explicit langids and the supported locales into two sets of locales:
///
/// - `explicit` contains the explicit langids but with aux keys and extension keywords included
/// - `implicit` contains any locale reachable by fallback from an `explicit` locale
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I'd like more comments here to explain what this handles, with examples referencing the locales mentioned in comments below

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added more docs

/// Builds the `explicit` and `implicit` locale sets for `key`.
///
/// - `explicit` receives every requested langid, each supported locale that shares a
///   requested langid, and each supported locale of a fallback ancestor re-tagged with
///   the requested langid.
/// - `implicit` receives the default locale, each step of a requested langid's fallback
///   chain up to (not including) the first `und` step, and the supported locales of
///   those steps.
///
/// # Errors
///
/// Returns a copy of the fallbacker's stored [`DataError`] if the fallbacker failed to
/// initialize and at least one requested langid other than `und` needs a fallback chain.
fn make_explicit_implicit_sets(
    key: DataKey,
    explicit_langids: &HashSet<LanguageIdentifier>,
    supported_map: &HashMap<LanguageIdentifier, HashSet<DataLocale>>,
    fallbacker: &Lazy<
        Result<LocaleFallbacker, DataError>,
        impl FnOnce() -> Result<LocaleFallbacker, DataError>,
    >,
) -> Result<ExplicitImplicitLocaleSets, DataError> {
    let mut explicit = HashSet::<DataLocale>::new();
    let mut implicit = HashSet::<DataLocale>::new();
    // TODO: Make including the default locale configurable
    implicit.insert(DataLocale::default());

    for requested in explicit_langids {
        // The bare langid is always explicit, whether or not it has data of its own.
        explicit.insert(DataLocale::from(requested));
        if let Some(same_langid) = supported_map.get(requested) {
            for locale in same_langid {
                explicit.insert(locale.clone()); // adds ar-EG-u-nu-latn
            }
        }

        // `und` has no fallback chain to walk, so the fallbacker is not touched for it.
        if *requested == LanguageIdentifier::UND {
            continue;
        }

        let resolved = fallbacker.as_ref().map_err(|e| *e)?;
        let with_config = resolved.for_config(key.fallback_config());
        let mut chain = with_config.fallback_for(DataLocale::from(requested));
        loop {
            let ancestor = chain.get();
            if ancestor.is_und() {
                break;
            }
            implicit.insert(ancestor.clone());

            // Inherit aux keys and extension keywords from parent locales
            if let Some(inherited) = supported_map.get(&ancestor.get_langid()) {
                for locale in inherited {
                    implicit.insert(locale.clone()); // adds ar-u-nu-latn

                    // Same extensions/aux key, but tagged with the requested langid.
                    let mut retagged = locale.clone();
                    retagged.set_langid(requested.clone());
                    explicit.insert(retagged); // adds ar-SA-u-nu-latn
                }
            }
            chain.step();
        }
    }

    Ok(ExplicitImplicitLocaleSets { explicit, implicit })
}

/// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen
/// provider's options bag. The locales may be later optionally deduplicated for fallback.
fn select_locales_for_key(
provider: &dyn ExportableProvider,
key: DataKey,
fallback: FallbackMode,
locales: Option<&HashSet<LanguageIdentifier>>,
explicit_langids: Option<&HashSet<LanguageIdentifier>>,
additional_collations: &HashSet<String>,
segmenter_models: &[String],
fallbacker: &Lazy<
Result<LocaleFallbacker, DataError>,
impl FnOnce() -> Result<LocaleFallbacker, DataError>,
>,
) -> Result<HashSet<icu_provider::DataLocale>, DataError> {
let mut result = provider
// A map from langid to data locales. Keys that have aux keys or extension keywords
// may have multiple data locales per langid.
let mut supported_map: HashMap<LanguageIdentifier, HashSet<DataLocale>> = Default::default();
for locale in provider
.supported_locales_for_key(key)
.map_err(|e| e.with_key(key))?
.into_iter()
.collect::<HashSet<DataLocale>>();
{
use std::collections::hash_map::Entry;
match supported_map.entry(locale.get_langid()) {
Entry::Occupied(mut entry) => entry.get_mut().insert(locale),
Entry::Vacant(entry) => entry.insert(Default::default()).insert(locale),
};
}

if key == icu_segmenter::provider::DictionaryForWordOnlyAutoV1Marker::KEY
|| key == icu_segmenter::provider::DictionaryForWordLineExtendedV1Marker::KEY
{
result.retain(|locale| {
let model = crate::transform::segmenter::dictionary::data_locale_to_model_name(locale);
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
supported_map.retain(|_, locales| {
locales.retain(|locale| {
let model =
crate::transform::segmenter::dictionary::data_locale_to_model_name(locale);
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
});
!locales.is_empty()
});
// Don't perform additional locale filtering
return Ok(result);
return Ok(supported_map.into_values().flatten().collect());
} else if key == icu_segmenter::provider::LstmForWordLineAutoV1Marker::KEY {
result.retain(|locale| {
let model = crate::transform::segmenter::lstm::data_locale_to_model_name(locale);
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
supported_map.retain(|_, locales| {
locales.retain(|locale| {
let model = crate::transform::segmenter::lstm::data_locale_to_model_name(locale);
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
});
!locales.is_empty()
});
// Don't perform additional locale filtering
return Ok(result);
return Ok(supported_map.into_values().flatten().collect());
} else if key == icu_collator::provider::CollationDataV1Marker::KEY
|| key == icu_collator::provider::CollationDiacriticsV1Marker::KEY
|| key == icu_collator::provider::CollationJamoV1Marker::KEY
|| key == icu_collator::provider::CollationMetadataV1Marker::KEY
|| key == icu_collator::provider::CollationReorderingV1Marker::KEY
|| key == icu_collator::provider::CollationSpecialPrimariesV1Marker::KEY
{
result.retain(|locale| {
let Some(collation) = locale
.get_unicode_ext(&key!("co"))
.and_then(|co| co.as_single_subtag().copied())
else {
return true;
};
additional_collations.contains(collation.as_str())
|| if collation.starts_with("search") {
additional_collations.contains("search*")
} else {
!["big5han", "gb2312"].contains(&collation.as_str())
}
supported_map.retain(|_, locales| {
locales.retain(|locale| {
let Some(collation) = locale
.get_unicode_ext(&key!("co"))
.and_then(|co| co.as_single_subtag().copied())
else {
return true;
};
additional_collations.contains(collation.as_str())
|| if collation.starts_with("search") {
additional_collations.contains("search*")
} else {
!["big5han", "gb2312"].contains(&collation.as_str())
}
});
!locales.is_empty()
});
}

result = match (locales, fallback) {
let result = match (explicit_langids, fallback) {
// Case 1: `None` simply exports all supported locales for this key.
(None, _) => result,
(None, _) => supported_map.into_values().flatten().collect(),
// Case 2: `FallbackMode::Preresolved` exports all supported locales whose langid matches
// one of the explicit locales. This ensures extensions are included. In addition, any
// explicit locales are added to the list, even if they themselves don't contain data;
// fallback should be performed upon exporting.
(Some(explicit), FallbackMode::Preresolved) => result
.into_iter()
.chain(explicit.iter().map(|langid| langid.into()))
.filter(|locale| explicit.contains(&locale.get_langid()))
.collect(),
(Some(explicit_langids), FallbackMode::Preresolved) => {
let ExplicitImplicitLocaleSets { explicit, .. } =
make_explicit_implicit_sets(key, explicit_langids, &supported_map, fallbacker)?;
explicit
}
// Case 3: All other modes resolve to the "ancestors and descendants" strategy.
(Some(explicit), _) => {
let include_und = explicit.contains(&LanguageIdentifier::UND);
let explicit: HashSet<DataLocale> = explicit.iter().map(DataLocale::from).collect();
let mut implicit = HashSet::new();
// TODO: Make including the default locale configurable
implicit.insert(DataLocale::default());
(Some(explicit_langids), _) => {
let include_und = explicit_langids.contains(&LanguageIdentifier::UND);

let ExplicitImplicitLocaleSets { explicit, implicit } =
make_explicit_implicit_sets(key, explicit_langids, &supported_map, fallbacker)?;

let fallbacker = fallbacker.as_ref().map_err(|e| *e)?;
let fallbacker_with_config = fallbacker.for_config(key.fallback_config());

for locale in explicit.iter() {
let mut iter = fallbacker_with_config.fallback_for(locale.clone());
while !iter.get().is_und() {
implicit.insert(iter.get().clone());
iter.step();
}
}

result
.into_iter()
supported_map
.into_values()
.flatten()
.chain(explicit.iter().cloned())
.filter(|locale_orig| {
let mut locale = locale_orig.clone();
Expand Down Expand Up @@ -705,7 +768,7 @@ fn test_collation_filtering() {
Some(&HashSet::from_iter([cas.language.clone()])),
&HashSet::from_iter(cas.include_collations.iter().copied().map(String::from)),
&[],
&once_cell::sync::Lazy::new(|| unreachable!()),
&once_cell::sync::Lazy::new(|| Ok(LocaleFallbacker::new_without_data())),
)
.unwrap()
.into_iter()
Expand Down
Loading