From 860986ce1a6b7d51bc6188d8148b3cac29ea895e Mon Sep 17 00:00:00 2001 From: Robert Bastian Date: Mon, 14 Aug 2023 16:35:44 +0200 Subject: [PATCH] driver --- components/calendar/data/config.json | 5 +- components/casemap/data/config.json | 5 +- components/collator/data/config.json | 6 +- components/datetime/data/config.json | 5 +- components/decimal/data/config.json | 5 +- components/list/data/config.json | 6 +- components/locid_transform/data/config.json | 5 +- components/normalizer/data/config.json | 5 +- components/plurals/data/config.json | 5 +- components/properties/data/config.json | 5 +- components/segmenter/data/config.json | 5 +- components/timezone/data/config.json | 5 +- experimental/compactdecimal/data/config.json | 6 +- experimental/displaynames/data/config.json | 5 +- experimental/relativetime/data/config.json | 5 +- .../single_number_formatter/data/config.json | 5 +- provider/adapters/tests/data/config.json | 5 +- provider/adapters/tests/data/langtest/de.json | 4 +- provider/adapters/tests/data/langtest/ro.json | 4 +- provider/blob/src/export/mod.rs | 12 +- provider/blob/tests/data/config.json | 4 +- provider/datagen/README.md | 12 +- provider/datagen/src/baked_exporter.rs | 12 +- provider/datagen/src/bin/datagen/args.rs | 2 +- provider/datagen/src/bin/datagen/config.rs | 57 +- provider/datagen/src/bin/datagen/mod.rs | 94 ++- provider/datagen/src/driver.rs | 548 ++++++++++++++ provider/datagen/src/lib.rs | 695 +++++------------- provider/datagen/src/options.rs | 189 ----- provider/datagen/src/source.rs | 104 ++- .../src/transform/cldr/characters/mod.rs | 2 +- .../src/transform/cldr/cldr_serde/mod.rs | 3 + .../src/transform/cldr/currency/mod.rs | 2 +- .../src/transform/cldr/datetime/mod.rs | 10 +- .../src/transform/cldr/datetime/week_data.rs | 2 +- .../src/transform/cldr/decimal/compact.rs | 4 +- .../src/transform/cldr/decimal/symbols.rs | 2 +- .../transform/cldr/displaynames/language.rs | 10 +- .../src/transform/cldr/displaynames/region.rs | 4 +- 
.../src/transform/cldr/displaynames/script.rs | 4 +- .../transform/cldr/displaynames/variant.rs | 2 +- .../src/transform/cldr/fallback/mod.rs | 2 +- .../cldr/locale_canonicalizer/aliases.rs | 2 +- .../locale_canonicalizer/directionality.rs | 2 +- .../locale_canonicalizer/likely_subtags.rs | 2 +- .../datagen/src/transform/cldr/plurals/mod.rs | 2 +- .../src/transform/cldr/relativetime/mod.rs | 4 +- provider/datagen/src/transform/cldr/source.rs | 2 + .../src/transform/cldr/time_zones/mod.rs | 2 +- .../src/transform/icuexport/collator/mod.rs | 145 +--- .../src/transform/icuexport/normalizer/mod.rs | 3 +- .../src/transform/icuexport/ucase/mod.rs | 4 +- .../transform/icuexport/uprops/bidi_data.rs | 5 +- .../transform/icuexport/uprops/bin_cp_set.rs | 5 +- .../transform/icuexport/uprops/bin_uniset.rs | 5 +- .../icuexport/uprops/enum_codepointtrie.rs | 9 +- .../src/transform/icuexport/uprops/script.rs | 10 +- provider/datagen/src/transform/mod.rs | 2 +- .../src/transform/segmenter/dictionary.rs | 18 - .../datagen/src/transform/segmenter/lstm.rs | 20 +- .../datagen/src/transform/segmenter/mod.rs | 4 +- provider/datagen/tests/make-testdata.rs | 23 +- provider/datagen/tests/test-options.rs | 67 +- provider/fs/src/export/mod.rs | 12 +- provider/fs/tests/data/bincode.json | 3 +- provider/fs/tests/data/json.json | 3 +- provider/fs/tests/data/postcard.json | 3 +- .../src/bin/make-testdata-legacy.rs | 2 +- 68 files changed, 1059 insertions(+), 1172 deletions(-) create mode 100644 provider/datagen/src/driver.rs delete mode 100644 provider/datagen/src/options.rs diff --git a/components/calendar/data/config.json b/components/calendar/data/config.json index 1b57ebf877a..66140f3dcc0 100644 --- a/components/calendar/data/config.json +++ b/components/calendar/data/config.json @@ -6,16 +6,13 @@ "datetime/week_data@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", 
"pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/casemap/data/config.json b/components/casemap/data/config.json index 6adc1006ee3..8042fc9cd49 100644 --- a/components/casemap/data/config.json +++ b/components/casemap/data/config.json @@ -5,16 +5,13 @@ "props/casemap_unfold@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/collator/data/config.json b/components/collator/data/config.json index c5443df8daf..03df36dc204 100644 --- a/components/collator/data/config.json +++ b/components/collator/data/config.json @@ -7,19 +7,15 @@ "collator/meta@1", "collator/prim@1", "collator/reord@1" - ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/datetime/data/config.json b/components/datetime/data/config.json index de357ec0953..66bdfe4acd2 100644 --- a/components/datetime/data/config.json +++ b/components/datetime/data/config.json @@ -45,16 +45,13 @@ ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/decimal/data/config.json b/components/decimal/data/config.json index dfb8e26cc03..b14ee8c629a 100644 --- a/components/decimal/data/config.json +++ b/components/decimal/data/config.json @@ -4,16 +4,13 @@ "decimal/symbols@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": 
"Runtime", "overwrite": true } diff --git a/components/list/data/config.json b/components/list/data/config.json index aacf4ad76a7..2ea77a9c061 100644 --- a/components/list/data/config.json +++ b/components/list/data/config.json @@ -4,19 +4,15 @@ "list/and@1", "list/or@1", "list/unit@1" - ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/locid_transform/data/config.json b/components/locid_transform/data/config.json index cb7e223ebbf..fb5f307aabf 100644 --- a/components/locid_transform/data/config.json +++ b/components/locid_transform/data/config.json @@ -11,16 +11,13 @@ "locid_transform/script_dir@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/normalizer/data/config.json b/components/normalizer/data/config.json index 1821929c297..e92693e33a2 100644 --- a/components/normalizer/data/config.json +++ b/components/normalizer/data/config.json @@ -10,16 +10,13 @@ "normalizer/uts46d@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/plurals/data/config.json b/components/plurals/data/config.json index 30fab3cd116..f3d88628bbd 100644 --- a/components/plurals/data/config.json +++ b/components/plurals/data/config.json @@ -5,16 +5,13 @@ "plurals/cardinal@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - 
"fallback": "Runtime", "overwrite": true } diff --git a/components/properties/data/config.json b/components/properties/data/config.json index 81c68003bf4..7e1d064a2cf 100644 --- a/components/properties/data/config.json +++ b/components/properties/data/config.json @@ -113,16 +113,13 @@ "props/XIDS@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/components/segmenter/data/config.json b/components/segmenter/data/config.json index d6bf5a71ef2..f57e5cd32e9 100644 --- a/components/segmenter/data/config.json +++ b/components/segmenter/data/config.json @@ -10,16 +10,13 @@ "segmenter/word@1" ] }, + "fallback": "Hybrid", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Hybrid", "overwrite": true } diff --git a/components/timezone/data/config.json b/components/timezone/data/config.json index e574f0d31ff..fd416bfa6f8 100644 --- a/components/timezone/data/config.json +++ b/components/timezone/data/config.json @@ -4,16 +4,13 @@ "time_zone/metazone_period@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/experimental/compactdecimal/data/config.json b/experimental/compactdecimal/data/config.json index f2f88cb58c3..defa3ee091f 100644 --- a/experimental/compactdecimal/data/config.json +++ b/experimental/compactdecimal/data/config.json @@ -3,19 +3,15 @@ "Explicit": [ "compactdecimal/long@1", "compactdecimal/short@1" - ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { 
"Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/experimental/displaynames/data/config.json b/experimental/displaynames/data/config.json index 0987bab1e4d..9bc624e24d7 100644 --- a/experimental/displaynames/data/config.json +++ b/experimental/displaynames/data/config.json @@ -8,16 +8,13 @@ "displaynames/variants@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } \ No newline at end of file diff --git a/experimental/relativetime/data/config.json b/experimental/relativetime/data/config.json index f854c55d8bb..31f154e0f1c 100644 --- a/experimental/relativetime/data/config.json +++ b/experimental/relativetime/data/config.json @@ -27,16 +27,13 @@ "relativetime/short/year@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/experimental/single_number_formatter/data/config.json b/experimental/single_number_formatter/data/config.json index 5c3033a0e71..03a6457458f 100644 --- a/experimental/single_number_formatter/data/config.json +++ b/experimental/single_number_formatter/data/config.json @@ -4,16 +4,13 @@ "currency/essentials@1" ] }, + "fallback": "Runtime", "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", "export": { "Baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/provider/adapters/tests/data/config.json b/provider/adapters/tests/data/config.json index da1bbfad88e..14cf64732c8 100644 --- a/provider/adapters/tests/data/config.json +++ b/provider/adapters/tests/data/config.json @@ -7,15 +7,12 @@ "core/helloworld@1" ] }, + 
"fallback": "RuntimeManual", "locales": "All", - "cldr": "Latest", - "icu_export": "None", - "segmenter_lstm": "None", "export": { "Blob": { "path": "blob.postcard" } }, - "fallback": "RuntimeManual", "overwrite": true } \ No newline at end of file diff --git a/provider/adapters/tests/data/langtest/de.json b/provider/adapters/tests/data/langtest/de.json index 7fd1fbac05f..5ed76c95eb2 100644 --- a/provider/adapters/tests/data/langtest/de.json +++ b/provider/adapters/tests/data/langtest/de.json @@ -4,6 +4,7 @@ "core/helloworld@1" ] }, + "fallback": "Preresolved", "locales": { "Explicit": [ "de" @@ -18,6 +19,5 @@ "syntax": "Json" } }, - "fallback": "Preresolved", "overwrite": true - } \ No newline at end of file +} \ No newline at end of file diff --git a/provider/adapters/tests/data/langtest/ro.json b/provider/adapters/tests/data/langtest/ro.json index bed32e9e24e..8fdf86a191b 100644 --- a/provider/adapters/tests/data/langtest/ro.json +++ b/provider/adapters/tests/data/langtest/ro.json @@ -4,6 +4,7 @@ "core/helloworld@1" ] }, + "fallback": "Preresolved", "locales": { "Explicit": [ "ro" @@ -18,6 +19,5 @@ "syntax": "Json" } }, - "fallback": "Preresolved", "overwrite": true - } \ No newline at end of file +} \ No newline at end of file diff --git a/provider/blob/src/export/mod.rs b/provider/blob/src/export/mod.rs index 7686af33d9c..5610cfbcf69 100644 --- a/provider/blob/src/export/mod.rs +++ b/provider/blob/src/export/mod.rs @@ -18,14 +18,10 @@ //! let mut exporter = BlobExporter::new_with_sink(Box::new(&mut blob)); //! //! // Export something -//! DatagenProvider::default() -//! .export({ -//! let mut options = options::Options::default(); -//! options.keys = [icu_provider::hello_world::HelloWorldV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! exporter -//! ).unwrap(); +//! DataExportDriver::default() +//! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) +//! .export(&DatagenProvider::default(), exporter) +//! .unwrap(); //! //! 
// communicate the blob to the client application (network, disk, etc.) //! ``` diff --git a/provider/blob/tests/data/config.json b/provider/blob/tests/data/config.json index 00bedf6627a..da8fd8fc8df 100644 --- a/provider/blob/tests/data/config.json +++ b/provider/blob/tests/data/config.json @@ -4,10 +4,8 @@ "core/helloworld@1" ] }, + "fallback": "Hybrid", "locales": "All", - "cldr": "None", - "icu_export": "None", - "segmenter_lstm": "None", "export": { "Blob": { "path": "hello_world.postcard" diff --git a/provider/datagen/README.md b/provider/datagen/README.md index 6360418f58e..1aa65af8d68 100644 --- a/provider/datagen/README.md +++ b/provider/datagen/README.md @@ -18,15 +18,9 @@ use icu_provider_blob::export::*; use std::fs::File; fn main() { - DatagenProvider::default() - .export( - { - let mut options = options::Options::default(); - options.keys = [icu::list::provider::AndListV1Marker::KEY].into_iter().collect(); - options - }, - BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap())), - ) + DataExportDriver::default() + .with_keys([icu::list::provider::AndListV1Marker::KEY]) + .export(&DatagenProvider::latest_tested(), BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap()))) .unwrap(); } ``` diff --git a/provider/datagen/src/baked_exporter.rs b/provider/datagen/src/baked_exporter.rs index 7064f3dcdbf..458ffbfa7db 100644 --- a/provider/datagen/src/baked_exporter.rs +++ b/provider/datagen/src/baked_exporter.rs @@ -19,14 +19,10 @@ //! let mut exporter = BakedExporter::new(demo_path.clone(), Default::default()).unwrap(); //! //! // Export something -//! DatagenProvider::default() -//! .export({ -//! let mut options = options::Options::default(); -//! options.keys = [icu_provider::hello_world::HelloWorldV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! exporter -//! ).unwrap(); +//! DataExportDriver::default() +//! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) +//! 
.export(&DatagenProvider::latest_tested(), exporter) +//! .unwrap(); //! # //! # let _ = std::fs::remove_dir_all(&demo_path); //! ``` diff --git a/provider/datagen/src/bin/datagen/args.rs b/provider/datagen/src/bin/datagen/args.rs index befdb1b5f67..850cfe7a423 100644 --- a/provider/datagen/src/bin/datagen/args.rs +++ b/provider/datagen/src/bin/datagen/args.rs @@ -418,7 +418,7 @@ impl Cli { }) } - fn make_segmenter_models(&self) -> eyre::Result { + fn make_segmenter_models(&self) -> eyre::Result { Ok(if self.segmenter_models.as_slice() == ["none"] { config::SegmenterModelInclude::None } else if self.segmenter_models.as_slice() == ["recommended"] { diff --git a/provider/datagen/src/bin/datagen/config.rs b/provider/datagen/src/bin/datagen/config.rs index 3c65bca2e2c..05b056b318b 100644 --- a/provider/datagen/src/bin/datagen/config.rs +++ b/provider/datagen/src/bin/datagen/config.rs @@ -2,34 +2,35 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
-pub use icu_datagen::options::*; -pub use icu_datagen::{CollationHanDatabase, TrieType}; - +pub use icu_datagen::{CollationHanDatabase, CoverageLevel, FallbackMode, TrieType}; +pub use icu_locid::LanguageIdentifier; use icu_provider::prelude::*; -use std::collections::HashSet; +use std::collections::{BTreeSet, HashSet}; use std::path::{Path, PathBuf}; #[derive(Debug, serde::Deserialize, serde::Serialize)] pub struct Config { - #[serde(default, skip_serializing_if = "is_default")] pub keys: KeyInclude, - #[serde(default, skip_serializing_if = "is_default")] + pub fallback: FallbackMode, pub locales: LocaleInclude, + #[serde(default, skip_serializing_if = "is_default")] + pub collations: HashSet, + #[serde(default, skip_serializing_if = "is_default")] + pub segmenter_models: SegmenterModelInclude, + + #[serde(default, skip_serializing_if = "is_default")] pub cldr: PathOrTag, + #[serde(default, skip_serializing_if = "is_default")] pub icu_export: PathOrTag, + #[serde(default, skip_serializing_if = "is_default")] pub segmenter_lstm: PathOrTag, #[serde(default, skip_serializing_if = "is_default")] pub trie_type: TrieType, #[serde(default, skip_serializing_if = "is_default")] pub collation_han_database: CollationHanDatabase, - #[serde(default, skip_serializing_if = "is_default")] - pub collations: HashSet, - #[serde(default, skip_serializing_if = "is_default")] - pub segmenter_models: SegmenterModelInclude, + pub export: Export, #[serde(default, skip_serializing_if = "is_default")] - pub fallback: FallbackMode, - #[serde(default, skip_serializing_if = "is_default")] pub overwrite: bool, } @@ -46,12 +47,6 @@ pub enum KeyInclude { ForBinary(PathBuf), } -impl Default for KeyInclude { - fn default() -> Self { - Self::All - } -} - mod data_key_as_str { use super::*; use serde::{de::*, ser::*}; @@ -61,7 +56,7 @@ mod data_key_as_str { selff .iter() .map(|k| k.path().get()) - .collect::>() + .collect::>() .serialize(ser) } @@ -74,10 +69,32 @@ mod data_key_as_str { } } 
-#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +pub enum LocaleInclude { + Recommended, + All, + None, + Explicit(HashSet), + CldrSet(HashSet), +} + +#[non_exhaustive] +#[derive(Debug, PartialEq, Clone, serde::Serialize, serde::Deserialize, Default)] +pub enum SegmenterModelInclude { + #[default] + /// Set this data driver to generate the recommended set of segmenter models. This will cover + /// all languages supported by ICU4X: Thai, Burmese, Khmer, Lao, Chinese, and Japanese. + /// Both dictionary and LSTM models will be included, to the extent required by the chosen data keys. + Recommended, + None, + Explicit(Vec), +} + +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Default)] pub enum PathOrTag { Path(PathBuf), Tag(String), + #[default] Latest, None, } diff --git a/provider/datagen/src/bin/datagen/mod.rs b/provider/datagen/src/bin/datagen/mod.rs index e9d95b9fcf5..7d96bafea38 100644 --- a/provider/datagen/src/bin/datagen/mod.rs +++ b/provider/datagen/src/bin/datagen/mod.rs @@ -35,65 +35,87 @@ fn main() -> eyre::Result<()> { let config = matches.as_config()?; - let mut options = options::Options::default(); - options.keys = match config.keys { - config::KeyInclude::None => Default::default(), - config::KeyInclude::All => icu_datagen::all_keys().into_iter().collect(), - config::KeyInclude::Explicit(set) => set, - config::KeyInclude::ForBinary(path) => { - icu_datagen::keys_from_bin(path)?.into_iter().collect() - } - }; - options.locales = config.locales; - options.collations = config.collations; - options.segmenter_models = config.segmenter_models; - options.fallback = config.fallback; - - let mut source_data = SourceData::offline(); - source_data = source_data.with_collation_han_database(config.collation_han_database); + let mut source = SourceData::default(); + source = source.with_collation_han_database(config.collation_han_database); if 
config.trie_type == crate::config::TrieType::Fast { - source_data = source_data.with_fast_tries(); + source = source.with_fast_tries(); } - source_data = match config.cldr { - config::PathOrTag::Path(path) => source_data.with_cldr(path, Default::default())?, + source = match config.cldr { + config::PathOrTag::Path(path) => source.with_cldr(path, Default::default())?, #[cfg(feature = "networking")] config::PathOrTag::Latest => { - source_data.with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG, Default::default())? + source.with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG, Default::default())? } #[cfg(feature = "networking")] - config::PathOrTag::Tag(tag) => source_data.with_cldr_for_tag(&tag, Default::default())?, - config::PathOrTag::None => source_data, + config::PathOrTag::Tag(tag) => source.with_cldr_for_tag(&tag, Default::default())?, + config::PathOrTag::None => source, #[cfg(not(feature = "networking"))] _ => eyre::bail!("Download data from tags requires the `networking` Cargo feature"), }; - source_data = match config.icu_export { - config::PathOrTag::Path(path) => source_data.with_icuexport(path)?, + source = match config.icu_export { + config::PathOrTag::Path(path) => source.with_icuexport(path)?, #[cfg(feature = "networking")] config::PathOrTag::Latest => { - source_data.with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)? + source.with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)? 
} #[cfg(feature = "networking")] - config::PathOrTag::Tag(tag) => source_data.with_icuexport_for_tag(&tag)?, - config::PathOrTag::None => source_data, + config::PathOrTag::Tag(tag) => source.with_icuexport_for_tag(&tag)?, + config::PathOrTag::None => source, #[cfg(not(feature = "networking"))] _ => eyre::bail!("Download data from tags requires the `networking` Cargo feature"), }; - source_data = match config.segmenter_lstm { - config::PathOrTag::Path(path) => source_data.with_icuexport(path)?, + source = match config.segmenter_lstm { + config::PathOrTag::Path(path) => source.with_icuexport(path)?, #[cfg(feature = "networking")] config::PathOrTag::Latest => { - source_data.with_segmenter_lstm_for_tag(SourceData::LATEST_TESTED_SEGMENTER_LSTM_TAG)? + source.with_segmenter_lstm_for_tag(SourceData::LATEST_TESTED_SEGMENTER_LSTM_TAG)? } #[cfg(feature = "networking")] - config::PathOrTag::Tag(tag) => source_data.with_segmenter_lstm_for_tag(&tag)?, - config::PathOrTag::None => source_data, + config::PathOrTag::Tag(tag) => source.with_segmenter_lstm_for_tag(&tag)?, + config::PathOrTag::None => source, #[cfg(not(feature = "networking"))] _ => eyre::bail!("Download data from tags requires the `networking` Cargo feature"), }; - let provider = DatagenProvider::new(source_data); + let mut driver = DataExportDriver::default(); + driver = match config.keys { + config::KeyInclude::None => driver.with_keys([]), + config::KeyInclude::All => driver.with_keys(icu_datagen::all_keys()), + config::KeyInclude::Explicit(set) => driver.with_keys(set), + config::KeyInclude::ForBinary(path) => driver.with_keys(icu_datagen::keys_from_bin(path)?), + }; + driver = driver.with_fallback_mode(config.fallback); + driver = driver.with_collations(config.collations); + driver = match config.locales { + config::LocaleInclude::All => driver.with_all_locales(), + config::LocaleInclude::None => driver.with_locales([]), + config::LocaleInclude::Explicit(set) => driver.with_locales(set), + 
config::LocaleInclude::CldrSet(levels) => { + driver.with_locales(source.locales(&levels.iter().copied().collect::>())?) + } + config::LocaleInclude::Recommended => driver.with_locales(source.locales(&[ + CoverageLevel::Modern, + CoverageLevel::Moderate, + CoverageLevel::Basic, + ])?), + }; + driver = match config.segmenter_models { + config::SegmenterModelInclude::None => driver.with_segmenter_models([]), + config::SegmenterModelInclude::Recommended => driver.with_segmenter_models([ + "Burmese_codepoints_exclusive_model4_heavy".into(), + "burmesedict".into(), + "cjdict".into(), + "Khmer_codepoints_exclusive_model4_heavy".into(), + "khmerdict".into(), + "Lao_codepoints_exclusive_model4_heavy".into(), + "laodict".into(), + "Thai_codepoints_exclusive_model4_heavy".into(), + "thaidict".into(), + ]), + config::SegmenterModelInclude::Explicit(models) => driver.with_segmenter_models(models), + }; match config.export { config::Export::Fs { @@ -126,7 +148,7 @@ fn main() -> eyre::Result<()> { options }, )?; - Ok(provider.export(options, exporter)?) + Ok(driver.export(&DatagenProvider { source }, exporter)?) } } config::Export::Blob { ref path } => { @@ -146,7 +168,7 @@ fn main() -> eyre::Result<()> { ) }, ); - Ok(provider.export(options, exporter)?) + Ok(driver.export(&DatagenProvider { source }, exporter)?) } } config::Export::Baked { @@ -172,7 +194,7 @@ fn main() -> eyre::Result<()> { options })?; - Ok(provider.export(options, exporter)?) + Ok(driver.export(&DatagenProvider { source }, exporter)?) } } } diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs new file mode 100644 index 00000000000..b36b90751a1 --- /dev/null +++ b/provider/datagen/src/driver.rs @@ -0,0 +1,548 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::rayon_prelude::*; +use crate::FallbackMode; +use icu_locid::extensions::unicode::key; +use icu_locid::LanguageIdentifier; +use icu_locid_transform::fallback::LocaleFallbackConfig; +use icu_locid_transform::fallback::LocaleFallbackIterator; +use icu_locid_transform::fallback::LocaleFallbacker; +use icu_locid_transform::provider::{ + CollationFallbackSupplementV1Marker, LocaleFallbackLikelySubtagsV1Marker, + LocaleFallbackParentsV1Marker, +}; +use icu_provider::datagen::*; +use icu_provider::prelude::*; +use once_cell::sync::Lazy; +use std::borrow::Cow; +use std::collections::HashMap; +use std::collections::HashSet; +use writeable::Writeable; + +/// Configuration for a data export operation. +/// +/// # Examples +/// +/// ``` +/// use icu_datagen::prelude::*; +/// use icu_datagen::blob_exporter::*; +/// +/// DataExportDriver::default() +/// .with_keys([icu::list::provider::AndListV1Marker::KEY]) +/// .export(&DatagenProvider::latest_tested(), BlobExporter::new_with_sink(Box::new(&mut Vec::new()))) +/// .unwrap(); +/// ``` +#[derive(Debug, Clone, Default)] +pub struct DataExportDriver { + keys: HashSet, + // `None` means all + locales: Option>, + fallback: FallbackMode, + collations: HashSet, + segmenter_models: Vec, +} + +impl DataExportDriver { + /// Sets this driver to generate the given keys. See [`icu_datagen::keys`], + /// [`icu_datagen::all_keys`], [`icu_datagen::key`] and [`icu_datagen::keys_from_bin`]. + /// + /// [`icu_datagen::keys`]: crate::keys + /// [`icu_datagen::all_keys`]: crate::all_keys + /// [`icu_datagen::key`]: crate::key + /// [`icu_datagen::keys_from_bin`]: crate::keys_from_bin + pub fn with_keys(self, keys: impl IntoIterator) -> Self { + Self { + keys: keys.into_iter().collect(), + ..self + } + } + + /// Sets the fallback type that the data should be generated for. If locale fallback is + /// used at runtime, smaller data can be generated. 
+ pub fn with_fallback_mode(self, fallback: FallbackMode) -> Self { + Self { fallback, ..self } + } + + /// Sets the locales to generate. + pub fn with_locales(self, locales: impl IntoIterator) -> Self { + Self { + locales: Some(locales.into_iter().collect()), + ..self + } + } + + /// Sets this driver to generate all available locales. + pub fn with_all_locales(self) -> Self { + Self { + locales: None, + ..self + } + } + + /// By default, the collations `big5han`, `gb2312`, and those starting with `search` + /// are excluded. This method can be used to re-enable them. + /// + /// The special string `"search*"` causes all search collation tables to be included. + pub fn with_collations(self, collations: impl IntoIterator) -> Self { + Self { + collations: collations.into_iter().collect(), + ..self + } + } + + /// Sets this driver to generate the given segmentation models, to the extent required by the + /// chosen data keys. + /// + /// The currently supported dictionary models are + /// * `cjdict` + /// * `burmesedict` + /// * `khmerdict` + /// * `laodict` + /// * `thaidict` + /// + /// The currently supported LSTM models are + /// * `Burmese_codepoints_exclusive_model4_heavy` + /// * `Khmer_codepoints_exclusive_model4_heavy` + /// * `Lao_codepoints_exclusive_model4_heavy` + /// * `Thai_codepoints_exclusive_model4_heavy` + /// + /// If a model is not included, the resulting line or word segmenter will apply rule-based + /// segmentation when encountering text in a script that requires the model, which will be + /// incorrect. + /// + /// If multiple models for the same language and segmentation type (dictionary/LSTM) are + /// listed, the first one will be used. + pub fn with_segmenter_models(self, models: impl IntoIterator) -> Self { + Self { + segmenter_models: models.into_iter().collect(), + ..self + } + } + + /// Exports data from the given provider to the given exporter. 
+ /// + /// See + /// [`BlobExporter`](icu_provider_blob::export), + /// [`FileSystemExporter`](icu_provider_fs::export), + /// and [`BakedExporter`](crate::baked_exporter). + pub fn export( + &self, + provider: &(impl IterableDynamicDataProvider + + DataProvider + + DataProvider + + DataProvider + + Sync + + Send), + mut sink: impl DataExporter, + ) -> Result<(), DataError> { + self.export_dyn(provider, &mut sink) + } + + // Avoids multiple monomorphizations + fn export_dyn( + &self, + provider: &(impl IterableDynamicDataProvider + + DataProvider + + DataProvider + + DataProvider + + Sync + + Send), + sink: &mut dyn DataExporter, + ) -> Result<(), DataError> { + if self.keys.is_empty() { + log::warn!("No keys selected"); + } + + if matches!(self.fallback, FallbackMode::Preresolved) && self.locales.is_none() { + return Err(DataError::custom( + "FallbackMode::Preresolved requires a locale set to be set", + )); + } + + let fallback = match self.fallback { + FallbackMode::PreferredForExporter => { + if sink.supports_built_in_fallback() { + FallbackMode::Runtime + } else { + FallbackMode::Hybrid + } + } + f => f, + }; + + log::info!( + "Datagen configured with fallback mode {:?} and these locales: {}", + fallback, + match self.locales { + None => "ALL".to_string(), + Some(ref set) => { + let mut list: Vec> = + set.iter().map(Writeable::write_to_string).collect(); + list.sort(); + format!("{:?}", list) + } + } + ); + + let fallbacker = + once_cell::sync::Lazy::new(|| LocaleFallbacker::try_new_unstable(provider)); + + let load_with_fallback = |key, locale: &_| { + log::trace!("Generating key/locale: {key}/{locale:}"); + let mut metadata = DataRequestMetadata::default(); + metadata.silent = true; + // Lazy-compute the fallback iterator so that we don't always require CLDR data + let mut locale_iter: Option = None; + loop { + let req = DataRequest { + locale: locale_iter.as_ref().map(|i| i.get()).unwrap_or(locale), + metadata, + }; + match provider.load_data(key, req) { + 
Ok(data_response) => { + if let Some(iter) = locale_iter.as_ref() { + if iter.get().is_empty() && !locale.is_empty() { + log::debug!("Falling back to und: {key}/{locale}"); + } + } + return Some(data_response.take_payload()); + } + Err(DataError { + kind: DataErrorKind::MissingLocale, + .. + }) => { + if let Some(iter) = locale_iter.as_mut() { + if iter.get().is_empty() { + log::debug!("Could not find data for: {key}/{locale}"); + return None; + } + iter.step(); + } else { + match fallbacker.as_ref() { + Ok(fallbacker) => { + locale_iter = Some( + fallbacker + .for_config(LocaleFallbackConfig::from_key(key)) + .fallback_for(locale.clone()), + ) + } + Err(e) => return Some(Err(*e)), + } + } + } + Err(e) => return Some(Err(e.with_req(key, req))), + } + } + }; + + self.keys.clone().into_par_iter().try_for_each(|key| { + log::info!("Generating key {key}"); + + if key.metadata().singleton { + let payload = provider + .load_data(key, Default::default()) + .and_then(DataResponse::take_payload) + .map_err(|e| e.with_req(key, Default::default()))?; + + return sink + .flush_singleton(key, &payload) + .map_err(|e| e.with_req(key, Default::default())); + } + + let locales_to_export = + self.select_locales_for_key(provider, key, fallback, &fallbacker)?; + + match fallback { + FallbackMode::Runtime | FallbackMode::RuntimeManual => { + let payloads = locales_to_export + .into_par_iter() + .filter_map(|locale| { + load_with_fallback(key, &locale) + .map(|r| r.map(|payload| (locale, payload))) + }) + .collect::, _>>()?; + let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; + let fallbacker_with_config = + fallbacker.for_config(LocaleFallbackConfig::from_key(key)); + 'outer: for (locale, payload) in payloads.iter() { + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + iter.step(); + if let Some(parent_payload) = payloads.get(iter.get()) { + if parent_payload == payload && locale != iter.get() { + // Found a match: don't need to 
write anything + log::trace!( + "Deduplicating {key}/{locale} (inherits from {})", + iter.get() + ); + continue 'outer; + } + } + } + // Did not find a match: export this payload + sink.put_payload(key, locale, payload)?; + } + } + FallbackMode::Hybrid | FallbackMode::Preresolved => { + locales_to_export.into_par_iter().try_for_each(|locale| { + if let Some(payload) = load_with_fallback(key, &locale) { + sink.put_payload(key, &locale, &payload?)?; + } + Ok::<(), DataError>(()) + })?; + } + FallbackMode::PreferredForExporter => unreachable!("resolved"), + }; + + match fallback { + FallbackMode::Runtime => { + sink.flush_with_built_in_fallback(key, BuiltInFallbackMode::Standard) + } + _ => sink.flush(key), + } + .map_err(|e| e.with_key(key)) + })?; + + sink.close() + } + + /// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen + /// provider's options bag. The locales may be later optionally deduplicated for fallback. + fn select_locales_for_key( + &self, + provider: &(impl IterableDynamicDataProvider + Sync + Send), + key: DataKey, + fallback: FallbackMode, + fallbacker: &Lazy< + Result, + impl FnOnce() -> Result, + >, + ) -> Result, DataError> { + let mut locales = provider + .supported_locales_for_key(key) + .map_err(|e| e.with_key(key))? 
+ .into_iter() + .collect::>(); + + if key == icu_segmenter::provider::DictionaryForWordOnlyAutoV1Marker::KEY + || key == icu_segmenter::provider::DictionaryForWordLineExtendedV1Marker::KEY + { + locales.retain(|locale| { + let model = + crate::transform::segmenter::dictionary::data_locale_to_model_name(locale); + self.segmenter_models + .iter() + .any(|m| Some(m.as_ref()) == model) + }); + // Don't perform additional locale filtering + return Ok(locales); + } else if key == icu_segmenter::provider::LstmForWordLineAutoV1Marker::KEY { + locales.retain(|locale| { + let model = crate::transform::segmenter::lstm::data_locale_to_model_name(locale); + self.segmenter_models + .iter() + .any(|m| Some(m.as_ref()) == model) + }); + // Don't perform additional locale filtering + return Ok(locales); + } else if key == icu_collator::provider::CollationDataV1Marker::KEY + || key == icu_collator::provider::CollationDiacriticsV1Marker::KEY + || key == icu_collator::provider::CollationJamoV1Marker::KEY + || key == icu_collator::provider::CollationMetadataV1Marker::KEY + || key == icu_collator::provider::CollationReorderingV1Marker::KEY + || key == icu_collator::provider::CollationSpecialPrimariesV1Marker::KEY + { + locales.retain(|locale| { + let Some(collation) = locale + .get_unicode_ext(&key!("co")) + .and_then(|co| co.as_single_subtag().copied()) + else { return true }; + self.collations.contains(collation.as_str()) + || if collation.starts_with("search") { + self.collations.contains("search*") + } else { + !["big5han", "gb2312"].contains(&collation.as_str()) + } + }); + } + + locales = match (&self.locales, fallback) { + // Case 1: `None` simply exports all supported locales for this key. + (None, _) => locales, + // Case 2: `FallbackMode::Preresolved` exports all supported locales whose langid matches + // one of the explicit locales. This ensures extensions are included. 
In addition, any + // explicit locales are added to the list, even if they themselves don't contain data; + // fallback should be performed upon exporting. + (Some(explicit), FallbackMode::Preresolved) => locales + .into_iter() + .chain(explicit.iter().map(|langid| langid.into())) + .filter(|locale| explicit.contains(&locale.get_langid())) + .collect(), + // Case 3: All other modes resolve to the "ancestors and descendants" strategy. + (Some(explicit), _) => { + let include_und = explicit.contains(&LanguageIdentifier::UND); + let explicit: HashSet = explicit.iter().map(DataLocale::from).collect(); + let mut implicit = HashSet::new(); + // TODO: Make including the default locale configurable + implicit.insert(DataLocale::default()); + let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; + let fallbacker_with_config = + fallbacker.for_config(LocaleFallbackConfig::from_key(key)); + + for locale in explicit.iter() { + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + implicit.insert(iter.get().clone()); + iter.step(); + } + } + + locales + .into_iter() + .chain(explicit.iter().cloned()) + .filter(|locale| { + if implicit.contains(locale) { + return true; + } + if explicit.contains(locale) { + return true; + } + if locale.is_langid_und() && include_und { + return true; + } + if locale.language().is_empty() + && matches!( + key.metadata().fallback_priority, + icu_provider::FallbackPriority::Region + ) + { + return true; + } + // Special case: skeletons *require* the -u-ca keyword, so don't export locales that don't have it + // This would get caught later on, but it makes datagen faster and quieter to catch it here + if key + == icu_datetime::provider::calendar::DateSkeletonPatternsV1Marker::KEY + && !locale.has_unicode_ext() + { + return false; + } + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + if explicit.contains(iter.get()) { + return true; + } + 
iter.step(); + } + log::trace!("Filtered out: {key}/{locale}"); + false + }) + .collect() + } + }; + + Ok(locales) + } +} + +#[test] +fn test_collation_filtering() { + use icu_locid::langid; + use std::collections::BTreeSet; + + #[derive(Debug)] + struct TestCase<'a> { + include_collations: &'a [&'a str], + language: LanguageIdentifier, + expected: &'a [&'a str], + } + let cases = [ + TestCase { + include_collations: &[], + language: langid!("zh"), + expected: &["zh", "zh-u-co-stroke", "zh-u-co-unihan", "zh-u-co-zhuyin"], + }, + TestCase { + include_collations: &["gb2312"], + language: langid!("zh"), + expected: &[ + "zh", + "zh-u-co-gb2312", + "zh-u-co-stroke", + "zh-u-co-unihan", + "zh-u-co-zhuyin", + ], + }, + TestCase { + include_collations: &["big5han"], + language: langid!("zh"), + expected: &[ + "zh", + "zh-u-co-big5han", + "zh-u-co-stroke", + "zh-u-co-unihan", + "zh-u-co-zhuyin", + ], + }, + TestCase { + include_collations: &["gb2312", "search*"], + language: langid!("zh"), + expected: &[ + "zh", + "zh-u-co-gb2312", + "zh-u-co-stroke", + "zh-u-co-unihan", + "zh-u-co-zhuyin", + ], + }, + TestCase { + include_collations: &[], + language: langid!("ko"), + expected: &["ko", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["search"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-search", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["searchjl"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-searchjl", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["search", "searchjl"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["search*", "big5han"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], + }, + ]; + for cas in cases { + let resolved_locales = DataExportDriver::default() + .with_collations(cas.include_collations.iter().copied().map(String::from)) + 
.with_locales([cas.language.clone()]) + .with_fallback_mode(FallbackMode::Preresolved) + .select_locales_for_key( + &crate::DatagenProvider::latest_tested(), + icu_collator::provider::CollationDataV1Marker::KEY, + FallbackMode::Preresolved, + &once_cell::sync::Lazy::new(|| unreachable!()), + ) + .unwrap() + .into_iter() + .map(|l| l.to_string()) + .collect::>(); + let expected_locales = cas + .expected + .iter() + .copied() + .map(String::from) + .collect::>(); + assert_eq!(resolved_locales, expected_locales, "{cas:?}"); + } +} diff --git a/provider/datagen/src/lib.rs b/provider/datagen/src/lib.rs index 2a4f11e841a..a9fbcc69e3d 100644 --- a/provider/datagen/src/lib.rs +++ b/provider/datagen/src/lib.rs @@ -17,21 +17,13 @@ //! //! ```no_run //! use icu_datagen::prelude::*; -//! use icu_provider_blob::export::*; +//! use icu_datagen::blob_exporter::*; //! use std::fs::File; //! -//! fn main() { -//! DatagenProvider::default() -//! .export( -//! { -//! let mut options = options::Options::default(); -//! options.keys = [icu::list::provider::AndListV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap())), -//! ) -//! .unwrap(); -//! } +//! DataExportDriver::default() +//! .with_keys([icu::list::provider::AndListV1Marker::KEY]) +//! .export(&DatagenProvider::latest_tested(), BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap()))) +//! .unwrap(); //! ``` //! //! 
## Command line @@ -68,18 +60,19 @@ )] #![warn(missing_docs)] +mod driver; mod error; mod registry; mod source; mod transform; +pub use driver::DataExportDriver; pub use error::{is_missing_cldr_error, is_missing_icuexport_error}; #[allow(deprecated)] // ugh pub use registry::{all_keys, all_keys_with_experimental, deserialize_and_measure, key}; -pub use source::CollationHanDatabase; -pub use source::SourceData; #[doc(hidden)] // for CLI serde pub use source::TrieType; +pub use source::{CollationHanDatabase, CoverageLevel, SourceData}; #[cfg(feature = "provider_baked")] pub mod baked_exporter; @@ -88,12 +81,13 @@ pub use icu_provider_blob::export as blob_exporter; #[cfg(feature = "provider_fs")] pub use icu_provider_fs::export as fs_exporter; -pub mod options; - /// A prelude for using the datagen API pub mod prelude { #[doc(no_inline)] - pub use crate::{options, DatagenProvider, SourceData}; + pub use crate::{ + CollationHanDatabase, CoverageLevel, DataExportDriver, DatagenProvider, FallbackMode, + SourceData, + }; #[doc(no_inline)] pub use icu_locid::{langid, LanguageIdentifier}; #[doc(no_inline)] @@ -101,31 +95,13 @@ pub mod prelude { // SEMVER GRAVEYARD #[cfg(feature = "legacy_api")] - #[doc(hidden)] - pub use crate::options::CoverageLevel; - #[cfg(feature = "legacy_api")] - #[doc(hidden)] - pub use crate::source::CollationHanDatabase; - #[cfg(feature = "legacy_api")] #[allow(deprecated)] #[doc(hidden)] pub use crate::{syntax, BakedOptions, CldrLocaleSubset, Out}; } -use icu_locid::LanguageIdentifier; -use icu_locid_transform::fallback::LocaleFallbackConfig; -use icu_locid_transform::fallback::LocaleFallbackIterator; -use icu_locid_transform::fallback::LocaleFallbacker; -use icu_provider::datagen::*; use icu_provider::prelude::*; -use memchr::memmem; -use once_cell::sync::Lazy; -use options::{FallbackMode, LocaleInclude}; -use std::borrow::Cow; -use std::collections::HashMap; -use std::collections::HashSet; use std::path::Path; -use writeable::Writeable; 
#[cfg(feature = "rayon")] pub(crate) use rayon::prelude as rayon_prelude; @@ -140,399 +116,118 @@ pub(crate) mod rayon_prelude { impl IntoParallelIterator for T {} } +/// Defines how fallback will apply to the generated data. +/// +/// If in doubt, use [`FallbackMode::PreferredForExporter`], which selects the best mode for your +/// chosen data provider. +/// +/// # Fallback Mode Comparison +/// +/// The modes differ primarily in their approaches to runtime fallback and data size. +/// +/// | Mode | Runtime Fallback | Data Size | +/// |---|---|---| +/// | [`Runtime`] | Yes, Automatic | Smallest | +/// | [`RuntimeManual`] | Yes, Manual | Smallest | +/// | [`Preresolved`] | No | Small | +/// | [`Hybrid`] | Optional | Medium | +/// +/// If you are not 100% certain of the closed set of locales you need at runtime, you should +/// use a provider with runtime fallback enabled. +/// +/// [`Runtime`]: FallbackMode::Runtime +/// [`RuntimeManual`]: FallbackMode::RuntimeManual +/// [`Preresolved`]: FallbackMode::Preresolved +/// [`Hybrid`]: FallbackMode::Hybrid +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] +#[non_exhaustive] +pub enum FallbackMode { + /// Selects the fallback mode based on [`DataExporter::supports_built_in_fallback()`]( + /// icu_provider::datagen::DataExporter::supports_built_in_fallback()), resolving to either + /// [`Runtime`] or [`Hybrid`]. + /// + /// [`Runtime`]: Self::Runtime + /// [`Hybrid`]: Self::Hybrid + #[default] + PreferredForExporter, + /// This mode generates the minimal set of locales that cover the requested locales when + /// fallback is used at runtime. For example, if "en" and "en-US" are both requested but + /// they contain the same value, only "en" will be included, since "en-US" falls back to + /// "en" at runtime. + /// + /// If [`LocaleInclude::Explicit`] is used, this mode includes all ancestors and descendants + /// (usually regional variants) of the explicitly listed locales. 
For example, if "pt-PT" is + /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. Note that the + /// children of "pt-PT" usually inherit from it and therefore don't take up a significant + /// amount of space in the data file. + /// + /// This mode is only supported with the baked data provider, and it builds fallback logic + /// into the generated code. To use this mode with other providers that don't bundle fallback + /// logic, use [`FallbackMode::RuntimeManual`] or [`FallbackMode::Hybrid`]. + /// + /// This is the default fallback mode for the baked provider. + Runtime, + /// Same as [`FallbackMode::Runtime`] except that the fallback logic is not included in the + /// generated code. It must be enabled manually with a [`LocaleFallbackProvider`]. + /// + /// This mode is supported on all data provider implementations. + /// + /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider + RuntimeManual, + /// This mode generates data for exactly the supplied locales. If data doesn't exist for a + /// locale, fallback will be performed and the fallback value will be exported. + /// + /// Requires using [`LocaleInclude::Explicit`]. + /// + /// Note: in data exporters that deduplicate values (such as `BakedExporter` and + /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] + /// is limited to the pointers in the explicitly listed locales. + /// + /// Data generated in this mode can be used without runtime fallback and guarantees that all + /// locales are present. If you wish to also support locales that were not explicitly listed + /// with runtime fallback, see [`FallbackMode::Hybrid`]. + Preresolved, + /// This mode passes through CLDR data without performing locale deduplication. + /// + /// If [`LocaleInclude::Explicit`] is used, this mode includes all ancestors and descendants + /// (usually regional variants) of the explicitly listed locales. 
For example, if "pt-PT" is + /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. + /// + /// Note: in data exporters that deduplicate values (such as `BakedExporter` and + /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] + /// is limited to the pointers in the explicitly listed locales. + /// + /// Data generated in this mode is suitable for use with or without runtime fallback. To + /// enable runtime fallback, use a [`LocaleFallbackProvider`]. + /// + /// This is the default fallback mode for the blob and filesystem providers. + /// + /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider + Hybrid, +} + /// [`DataProvider`] backed by [`SourceData`] /// /// If `source` does not contain a specific data source, `DataProvider::load` will /// error ([`is_missing_cldr_error`](crate::is_missing_cldr_error) / /// [`is_missing_icuexport_error`](crate::is_missing_icuexport_error)) if the data is /// required for that key. +#[allow(clippy::exhaustive_structs)] // any information will be added to SourceData #[derive(Debug, Clone)] -#[cfg_attr(feature = "networking", derive(Default))] -#[cfg_attr(not(doc), allow(clippy::exhaustive_structs))] -#[cfg_attr(doc, non_exhaustive)] pub struct DatagenProvider { - #[doc(hidden)] + /// The underlying raw data pub source: SourceData, } impl DatagenProvider { - /// Creates a new data provider with the given `source`. - pub fn new(source: SourceData) -> Self { - Self { source } - } - - #[cfg(test)] - pub fn for_test() -> Self { - use once_cell::sync::OnceCell; - - static TEST_PROVIDER: OnceCell = OnceCell::new(); - // Singleton so that all instantiations share the same cache. - TEST_PROVIDER - .get_or_init(|| { - let data_root = - std::path::Path::new(core::env!("CARGO_MANIFEST_DIR")).join("tests/data"); - DatagenProvider { - // This is equivalent to `latest_tested` for the files defined in - // `tools/testdata-scripts/globs.rs.data`. 
- source: SourceData::offline() - .with_cldr(data_root.join("cldr"), Default::default()) - .unwrap() - .with_icuexport(data_root.join("icuexport")) - .unwrap(), - } - }) - .clone() - } - - /// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen - /// provider's options bag. The locales may be later optionally deduplicated for fallback. - pub(crate) fn select_locales_for_key( - &self, - key: DataKey, - options: &options::Options, - fallbacker: &Lazy< - Result, - impl FnOnce() -> Result, - >, - ) -> Result, DataError> { - let mut locales = self - .supported_locales_for_key(key) - .map_err(|e| e.with_key(key))? - .into_iter() - .collect::>(); - - if key == icu_segmenter::provider::DictionaryForWordOnlyAutoV1Marker::KEY - || key == icu_segmenter::provider::DictionaryForWordLineExtendedV1Marker::KEY - { - // Segmenter: filter only by segmenter_models - return Ok(transform::segmenter::dictionary::filter_data_locales( - locales, - &options.segmenter_models, - )); - } else if key == icu_segmenter::provider::LstmForWordLineAutoV1Marker::KEY { - // Segmenter: filter only by segmenter_models - return Ok(transform::segmenter::lstm::filter_data_locales( - locales, - &options.segmenter_models, - )); - } else if key == icu_collator::provider::CollationDataV1Marker::KEY - || key == icu_collator::provider::CollationDiacriticsV1Marker::KEY - || key == icu_collator::provider::CollationJamoV1Marker::KEY - || key == icu_collator::provider::CollationMetadataV1Marker::KEY - || key == icu_collator::provider::CollationReorderingV1Marker::KEY - || key == icu_collator::provider::CollationSpecialPrimariesV1Marker::KEY - { - // Collator: filter by collations, but also by locales/fallback - locales = - transform::icuexport::collator::filter_data_locales(locales, &options.collations); - } - - locales = match (&options.locales, options.fallback) { - // Case 1: `LocaleInclude::All` simply exports all supported locales for this key. 
- (LocaleInclude::All, _) => locales, - // Case 2: `FallbackMode::Preresolved` exports all supported locales whose langid matches - // one of the explicit locales. This ensures extensions are included. In addition, any - // explicit locales are added to the list, even if they themselves don't contain data; - // fallback should be performed upon exporting. - (LocaleInclude::Explicit(explicit), FallbackMode::Preresolved) => locales - .into_iter() - .chain(explicit.iter().map(|langid| langid.into())) - .filter(|locale| explicit.contains(&locale.get_langid())) - .collect(), - // Case 3: All other modes resolve to the "ancestors and descendants" strategy. - (LocaleInclude::Explicit(explicit), _) => { - let include_und = explicit.contains(&LanguageIdentifier::UND); - let explicit: HashSet = explicit.iter().map(DataLocale::from).collect(); - let mut implicit = HashSet::new(); - // TODO: Make including the default locale configurable - implicit.insert(DataLocale::default()); - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let fallbacker_with_config = - fallbacker.for_config(LocaleFallbackConfig::from_key(key)); - - for locale in explicit.iter() { - let mut iter = fallbacker_with_config.fallback_for(locale.clone()); - while !iter.get().is_empty() { - implicit.insert(iter.get().clone()); - iter.step(); - } - } - - locales - .into_iter() - .chain(explicit.iter().cloned()) - .filter(|locale| { - if implicit.contains(locale) { - return true; - } - if explicit.contains(locale) { - return true; - } - if locale.is_langid_und() && include_und { - return true; - } - if locale.language().is_empty() - && matches!( - key.metadata().fallback_priority, - icu_provider::FallbackPriority::Region - ) - { - return true; - } - // Special case: skeletons *require* the -u-ca keyword, so don't export locales that don't have it - // This would get caught later on, but it makes datagen faster and quieter to catch it here - if key - == 
icu_datetime::provider::calendar::DateSkeletonPatternsV1Marker::KEY - && !locale.has_unicode_ext() - { - return false; - } - let mut iter = fallbacker_with_config.fallback_for(locale.clone()); - while !iter.get().is_empty() { - if explicit.contains(iter.get()) { - return true; - } - iter.step(); - } - log::trace!("Filtered out: {key}/{locale}"); - false - }) - .collect() - } - _ => unreachable!("Pre-processed LocaleInclude has only 2 variants"), - }; - - Ok(locales) - } - - /// Loads a `DataPayload` with locale fallback enabled. - fn load_with_fallback( - &self, - key: DataKey, - locale: &DataLocale, - fallbacker: &Lazy< - Result, - impl FnOnce() -> Result, - >, - ) -> Result>, DataError> { - log::trace!("Generating key/locale: {key}/{locale:}"); - let mut metadata = DataRequestMetadata::default(); - metadata.silent = true; - // Lazy-compute the fallback iterator so that we don't always require CLDR data - let mut option_iter: Option = None; - loop { - let req = DataRequest { - locale: match option_iter.as_ref() { - Some(iter) => iter.get(), - None => locale, - }, - metadata, - }; - let result = self.load_data(key, req); - match result { - Ok(data_response) => { - if let Some(iter) = option_iter.as_ref() { - if iter.get().is_empty() && !locale.is_empty() { - log::debug!("Falling back to und: {key}/{locale}"); - } - } - return Ok(Some(data_response.take_payload()?)); - } - Err(DataError { - kind: DataErrorKind::MissingLocale, - .. - }) => { - if let Some(iter) = option_iter.as_mut() { - if iter.get().is_empty() { - log::debug!("Could not find data for: {key}/{locale}"); - return Ok(None); - } - iter.step(); - } else { - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let config = LocaleFallbackConfig::from_key(key); - let iter = fallbacker.for_config(config).fallback_for(locale.clone()); - option_iter.replace(iter); - } - } - Err(e) => return Err(e.with_req(key, req)), - } - } - } - - /// Exports data for the given options to the given exporter. 
+ /// Returns a `DatagenProvider` backed by [`SourceData::latest_tested`]. /// - /// See - /// [`BlobExporter`](icu_provider_blob::export), - /// [`FileSystemExporter`](icu_provider_fs::export), - /// and [`BakedExporter`](crate::baked_exporter). - pub fn export( - &self, - mut options: options::Options, - mut exporter: impl DataExporter, - ) -> Result<(), DataError> { - if options.keys.is_empty() { - log::warn!("No keys selected"); + /// ✨ *Enabled with the `networking` Cargo feature.* + #[cfg(any(feature = "networking", test))] + pub fn latest_tested() -> Self { + Self { + source: SourceData::latest_tested(), } - - if !self.source.collations.is_empty() - && options.collations - != self - .source - .collations - .iter() - .cloned() - .collect::>() - { - log::warn!("SourceData::with_collations was used and differs from Options#collations (which will be used).") - } - - if matches!(options.fallback, options::FallbackMode::Preresolved) - && !matches!(options.locales, options::LocaleInclude::Explicit(_)) - { - return Err(DataError::custom( - "FallbackMode::Preresolved requires LocaleInclude::Explicit", - )); - } - - options.locales = match core::mem::take(&mut options.locales) { - options::LocaleInclude::None => options::LocaleInclude::Explicit(Default::default()), - options::LocaleInclude::CldrSet(levels) => options::LocaleInclude::Explicit( - self.source - .locales(levels.iter().copied().collect::>().as_slice())? - .into_iter() - .chain(core::iter::once(LanguageIdentifier::UND)) - .collect(), - ), - options::LocaleInclude::Explicit(set) => options::LocaleInclude::Explicit(set), - options::LocaleInclude::All => options::LocaleInclude::All, - options::LocaleInclude::Recommended => options::LocaleInclude::Explicit( - self.source - .locales(&[ - options::CoverageLevel::Modern, - options::CoverageLevel::Moderate, - options::CoverageLevel::Basic, - ])? 
- .into_iter() - .chain(core::iter::once(LanguageIdentifier::UND)) - .collect(), - ), - }; - - options.fallback = match options.fallback { - options::FallbackMode::PreferredForExporter => { - if exporter.supports_built_in_fallback() { - options::FallbackMode::Runtime - } else { - options::FallbackMode::Hybrid - } - } - f => f, - }; - - log::info!( - "Datagen configured with fallback mode {:?} and these locales: {}", - options.fallback, - match options.locales { - options::LocaleInclude::All => "ALL".to_string(), - options::LocaleInclude::Explicit(ref set) => { - let mut list: Vec> = - set.iter().map(Writeable::write_to_string).collect(); - list.sort(); - format!("{:?}", list) - } - _ => unreachable!(), - } - ); - - // Avoid multiple monomorphizations - fn internal( - provider: &DatagenProvider, - mut options: options::Options, - exporter: &mut dyn DataExporter, - ) -> Result<(), DataError> { - use rayon_prelude::*; - - let fallbacker = - once_cell::sync::Lazy::new(|| LocaleFallbacker::try_new_unstable(provider)); - - core::mem::take(&mut options.keys) - .into_par_iter() - .try_for_each(|key| { - log::info!("Generating key {key}"); - - if key.metadata().singleton { - let payload = provider - .load_data(key, Default::default()) - .and_then(DataResponse::take_payload) - .map_err(|e| e.with_req(key, Default::default()))?; - - return exporter - .flush_singleton(key, &payload) - .map_err(|e| e.with_req(key, Default::default())); - } - - let locales_to_export = - provider.select_locales_for_key(key, &options, &fallbacker)?; - - match options.fallback { - options::FallbackMode::Runtime | options::FallbackMode::RuntimeManual => { - let payloads = locales_to_export - .into_par_iter() - .flat_map(|locale| { - match provider.load_with_fallback(key, &locale, &fallbacker) { - Ok(Some(payload)) => Some(Ok((locale, Box::new(payload)))), - Ok(None) => None, - Err(e) => Some(Err(e)), - } - }) - .collect::, _>>()?; - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let 
fallbacker_with_config = - fallbacker.for_config(LocaleFallbackConfig::from_key(key)); - 'outer: for (locale, payload) in payloads.iter() { - let mut iter = fallbacker_with_config.fallback_for(locale.clone()); - while !iter.get().is_empty() { - iter.step(); - if let Some(parent_payload) = payloads.get(iter.get()) { - if parent_payload == payload && locale != iter.get() { - // Found a match: don't need to write anything - log::trace!( - "Deduplicating {key}/{locale} (inherits from {})", - iter.get() - ); - continue 'outer; - } - } - } - // Did not find a match: export this payload - exporter.put_payload(key, locale, payload)?; - } - } - options::FallbackMode::Hybrid | options::FallbackMode::Preresolved => { - locales_to_export.into_par_iter().try_for_each(|locale| { - let payload = - provider.load_with_fallback(key, &locale, &fallbacker)?; - if let Some(payload) = payload { - exporter.put_payload(key, &locale, &payload)?; - } - Ok::<(), DataError>(()) - })?; - } - options::FallbackMode::PreferredForExporter => unreachable!("resolved"), - }; - - match options.fallback { - options::FallbackMode::Runtime => exporter - .flush_with_built_in_fallback(key, BuiltInFallbackMode::Standard), - _ => exporter.flush(key), - } - .map_err(|e| e.with_key(key)) - })?; - - exporter.close() - } - internal(self, options, &mut exporter) } } @@ -613,15 +308,17 @@ pub fn keys_from_file>(path: P) -> std::io::Result> /// # } /// ``` pub fn keys_from_bin>(path: P) -> std::io::Result> { + use memchr::memmem::*; + let file = std::fs::read(path.as_ref())?; let file = file.as_slice(); const LEADING_TAG: &[u8] = icu_provider::leading_tag!().as_bytes(); const TRAILING_TAG: &[u8] = icu_provider::trailing_tag!().as_bytes(); - let trailing_tag = memmem::Finder::new(TRAILING_TAG); + let trailing_tag = Finder::new(TRAILING_TAG); - let mut result: Vec = memmem::find_iter(file, LEADING_TAG) + let mut result: Vec = find_iter(file, LEADING_TAG) .map(|tag_position| tag_position + LEADING_TAG.len()) 
.map(|key_start| &file[key_start..]) .filter_map(move |key_fragment| { @@ -643,10 +340,7 @@ pub fn keys_from_bin>(path: P) -> std::io::Result> { /// Requires `legacy_api` Cargo feature /// /// The output format. -#[deprecated( - since = "1.3.0", - note = "use `DatagenProvider::export` with self-constructed `DataExporter`s" -)] +#[deprecated(since = "1.3.0", note = "use `DataExportDriver`")] #[non_exhaustive] #[cfg(feature = "legacy_api")] pub enum Out { @@ -724,7 +418,7 @@ impl core::fmt::Debug for Out { } } -#[deprecated(since = "1.3.0", note = "use `DatagenProvider::export`")] +#[deprecated(since = "1.3.0", note = "use `DataExportDriver`")] #[cfg(feature = "legacy_api")] #[allow(deprecated)] /// Requires `legacy_api` Cargo feature @@ -744,90 +438,89 @@ pub fn datagen( source: &SourceData, outs: Vec, ) -> Result<(), DataError> { - use options::*; - - DatagenProvider::new(source.clone()).export( - Options { - keys: keys.iter().cloned().collect(), - locales: locales - .map(|ls| { - LocaleInclude::Explicit( - ls.iter() - .cloned() - .chain([icu_locid::LanguageIdentifier::UND]) - .collect(), - ) - }) - .unwrap_or(options::LocaleInclude::All), - segmenter_models: match locales { - None => options::SegmenterModelInclude::Recommended, - Some(list) => options::SegmenterModelInclude::Explicit({ - let mut models = vec![]; - for locale in list { - let locale = locale.into(); - if let Some(model) = - transform::segmenter::lstm::data_locale_to_model_name(&locale) - { - models.push(model.into()); - } - if let Some(model) = - transform::segmenter::dictionary::data_locale_to_model_name(&locale) - { - models.push(model.into()); - } + let exporter = DataExportDriver::default() + .with_keys(keys.iter().cloned()) + .with_fallback_mode(FallbackMode::Hybrid) + .with_collations(source.collations().to_vec()); + match locales { + Some(locales) => exporter + .with_locales( + locales + .iter() + .cloned() + .chain([icu_locid::LanguageIdentifier::UND]), + ) + .with_segmenter_models({ + let 
mut models = vec![]; + for locale in locales { + let locale = locale.into(); + if let Some(model) = + transform::segmenter::lstm::data_locale_to_model_name(&locale) + { + models.push(model.into()); + } + if let Some(model) = + transform::segmenter::dictionary::data_locale_to_model_name(&locale) + { + models.push(model.into()); } - models - }), - }, - collations: source.collations.iter().cloned().collect(), - fallback: FallbackMode::Hybrid, + } + models + }), + _ => exporter.with_all_locales(), + } + .export( + &DatagenProvider { + source: source.clone(), }, - MultiExporter::new( + icu_provider::datagen::MultiExporter::new( outs.into_iter() - .map(|out| -> Result, DataError> { - use baked_exporter::*; - use icu_provider_blob::export::*; - use icu_provider_fs::export::*; - - Ok(match out { - Out::Fs { - output_path, - serializer, - overwrite, - fingerprint, - } => { - let mut options = ExporterOptions::default(); - options.root = output_path; - if overwrite { - options.overwrite = OverwriteOption::RemoveAndReplace + .map( + |out| -> Result, DataError> { + use baked_exporter::*; + use icu_provider_blob::export::*; + use icu_provider_fs::export::*; + + Ok(match out { + Out::Fs { + output_path, + serializer, + overwrite, + fingerprint, + } => { + let mut options = ExporterOptions::default(); + options.root = output_path; + if overwrite { + options.overwrite = OverwriteOption::RemoveAndReplace + } + options.fingerprint = fingerprint; + Box::new(FilesystemExporter::try_new(serializer, options)?) } - options.fingerprint = fingerprint; - Box::new(FilesystemExporter::try_new(serializer, options)?) 
- } - Out::Blob(write) => Box::new(BlobExporter::new_with_sink(write)), - Out::Baked { - mod_directory, - options, - } => Box::new(BakedExporter::new(mod_directory, options)?), - #[allow(deprecated)] - Out::Module { - mod_directory, - pretty, - insert_feature_gates, - use_separate_crates, - } => Box::new(BakedExporter::new( - mod_directory, - Options { + Out::Blob(write) => Box::new(BlobExporter::new_with_sink(write)), + Out::Baked { + mod_directory, + options, + } => Box::new(BakedExporter::new(mod_directory, options)?), + #[allow(deprecated)] + Out::Module { + mod_directory, pretty, insert_feature_gates, use_separate_crates, - // Note: overwrite behavior was `true` in 1.0 but `false` in 1.1; - // 1.1.2 made it an option in Options. - overwrite: false, - }, - )?), - }) - }) + } => Box::new(BakedExporter::new( + mod_directory, + Options { + pretty, + insert_feature_gates, + use_separate_crates, + // Note: overwrite behavior was `true` in 1.0 but `false` in 1.1; + // 1.1.2 made it an option in Options. + overwrite: false, + }, + )?), + }) + }, + ) .collect::>()?, ), ) @@ -893,10 +586,6 @@ fn test_keys_from_bin() { // SEMVER GRAVEYARD -#[cfg(feature = "legacy_api")] -#[doc(hidden)] -pub use source::CoverageLevel; - #[cfg(feature = "legacy_api")] #[doc(hidden)] pub use baked_exporter::Options as BakedOptions; diff --git a/provider/datagen/src/options.rs b/provider/datagen/src/options.rs deleted file mode 100644 index b8f0556ae32..00000000000 --- a/provider/datagen/src/options.rs +++ /dev/null @@ -1,189 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! Options bag for [`DatagenProvider`](crate::DatagenProvider). - -pub use crate::transform::cldr::source::CoverageLevel; - -use icu_locid::LanguageIdentifier; -use std::collections::HashSet; - -/// Defines how fallback will apply to the generated data. 
If in doubt, use -/// [`FallbackMode::PreferredForExporter`], which selects the best mode for your -/// chosen data provider. -/// -/// # Fallback Mode Comparison -/// -/// The modes differ primarily in their approaches to runtime fallback and data size. -/// -/// | Mode | Runtime Fallback | Data Size | -/// |---|---|---| -/// | [`Runtime`] | Yes, Automatic | Smallest | -/// | [`RuntimeManual`] | Yes, Manual | Smallest | -/// | [`Preresolved`] | No | Small | -/// | [`Hybrid`] | Optional | Medium | -/// -/// If you are not 100% certain of the closed set of locales you need at runtime, you should -/// use a provider with runtime fallback enabled. -/// -/// [`Runtime`]: FallbackMode::Runtime -/// [`RuntimeManual`]: FallbackMode::RuntimeManual -/// [`Preresolved`]: FallbackMode::Preresolved -/// [`Hybrid`]: FallbackMode::Hybrid -#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] -#[non_exhaustive] -pub enum FallbackMode { - /// Selects the fallback mode based on [`DataExporter::supports_built_in_fallback()`]( - /// icu_provider::datagen::DataExporter::supports_built_in_fallback()), resolving to either - /// [`Runtime`] or [`Hybrid`]. - /// - /// [`Runtime`]: Self::Runtime - /// [`Hybrid`]: Self::Hybrid - #[default] - PreferredForExporter, - /// This mode generates the minimal set of locales that cover the requested locales when - /// fallback is used at runtime. For example, if "en" and "en-US" are both requested but - /// they contain the same value, only "en" will be included, since "en-US" falls back to - /// "en" at runtime. - /// - /// If [`LocaleInclude::Explicit`] is used, this mode includes all ancestors and descendants - /// (usually regional variants) of the explicitly listed locales. For example, if "pt-PT" is - /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. 
Note that the - /// children of "pt-PT" usually inherit from it and therefore don't take up a significant - /// amount of space in the data file. - /// - /// This mode is only supported with the baked data provider, and it builds fallback logic - /// into the generated code. To use this mode with other providers that don't bundle fallback - /// logic, use [`FallbackMode::RuntimeManual`] or [`FallbackMode::Hybrid`]. - /// - /// This is the default fallback mode for the baked provider. - Runtime, - /// Same as [`FallbackMode::Runtime`] except that the fallback logic is not included in the - /// generated code. It must be enabled manually with a [`LocaleFallbackProvider`]. - /// - /// This mode is supported on all data provider implementations. - /// - /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider - RuntimeManual, - /// This mode generates data for exactly the supplied locales. If data doesn't exist for a - /// locale, fallback will be performed and the fallback value will be exported. - /// - /// Requires using [`LocaleInclude::Explicit`]. - /// - /// Note: in data exporters that deduplicate values (such as `BakedExporter` and - /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] - /// is limited to the pointers in the explicitly listed locales. - /// - /// Data generated in this mode can be used without runtime fallback and guarantees that all - /// locales are present. If you wish to also support locales that were not explicitly listed - /// with runtime fallback, see [`FallbackMode::Hybrid`]. - Preresolved, - /// This mode passes through CLDR data without performing locale deduplication. - /// - /// If [`LocaleInclude::Explicit`] is used, this mode includes all ancestors and descendants - /// (usually regional variants) of the explicitly listed locales. For example, if "pt-PT" is - /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. 
- /// - /// Note: in data exporters that deduplicate values (such as `BakedExporter` and - /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] - /// is limited to the pointers in the explicitly listed locales. - /// - /// Data generated in this mode is suitable for use with or without runtime fallback. To - /// enable runtime fallback, use a [`LocaleFallbackProvider`]. - /// - /// This is the default fallback mode for the blob and filesystem providers. - /// - /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider - Hybrid, -} - -/// Options bag for [`DatagenProvider`](crate::DatagenProvider). -#[non_exhaustive] -#[derive(Debug, Clone, PartialEq, Default)] -pub struct Options { - /// The set of keys to generate. See [`icu_datagen::keys`], - /// [`icu_datagen::all_keys`], [`icu_datagen::key`] and [`icu_datagen::keys_from_bin`]. - /// - /// [`icu_datagen::keys`]: crate::keys - /// [`icu_datagen::all_keys`]: crate::all_keys - /// [`icu_datagen::key`]: crate::key - /// [`icu_datagen::keys_from_bin`]: crate::keys_from_bin - pub keys: HashSet, - /// Defines the locales to include - pub locales: LocaleInclude, - /// The collation types to include. - /// - /// The special string `"search*"` causes all search collation tables to be included. - pub collations: HashSet, - /// The type of fallback that the data should be generated for. If locale fallback is - /// used at runtime, smaller data can be generated. - pub fallback: FallbackMode, - /// The segmentation models to include - pub segmenter_models: SegmenterModelInclude, -} - -/// Defines the locales to include -#[non_exhaustive] -#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] -pub enum LocaleInclude { - /// All locales - All, - /// No locales - None, - /// An explicit set of locales - Explicit(HashSet), - /// All locales with the given CLDR coverage levels - CldrSet(HashSet), - /// A recommended set of locales. 
- /// - /// This currently resolves to `CldrSet({Modern, Moderate, Basic})` but - /// might change in future releases. - Recommended, -} - -impl Default for LocaleInclude { - fn default() -> Self { - Self::All - } -} - -#[non_exhaustive] -#[derive(Debug, PartialEq, Clone, serde::Serialize, serde::Deserialize)] -/// The segmentation models to include -pub enum SegmenterModelInclude { - /// Include the recommended set of models. This will cover all languages supported - /// by ICU4X: Thai, Burmese, Khmer, Lao, Chinese, and Japanese. Both dictionary - /// and LSTM models will be included, to the extent required by the chosen data keys. - Recommended, - /// Include no dictionary or LSTM models. This will make line and word segmenters - /// behave like simple rule-based segmenters, which will be incorrect when handling text - /// that contains Thai, Burmese, Khmer, Lao, Chinese, or Japanese. - None, - /// Include an explicit list of LSTM or dictionary models, to the extent required by the - /// chosen data keys. - /// - /// The currently supported dictionary models are - /// * `cjdict` - /// * `burmesedict` - /// * `khmerdict` - /// * `laodict` - /// * `thaidict` - /// - /// The currently supported LSTM models are - /// * `Burmese_codepoints_exclusive_model4_heavy` - /// * `Khmer_codepoints_exclusive_model4_heavy` - /// * `Lao_codepoints_exclusive_model4_heavy` - /// * `Thai_codepoints_exclusive_model4_heavy` - /// - /// If a model is not included, the resulting line or word segmenter will apply rule-based - /// segmentation when encountering text in a script that requires the model, which will be - /// incorrect. 
- Explicit(Vec), -} - -impl Default for SegmenterModelInclude { - fn default() -> Self { - Self::Recommended - } -} diff --git a/provider/datagen/src/source.rs b/provider/datagen/src/source.rs index 68b5b71f316..1bf014f7058 100644 --- a/provider/datagen/src/source.rs +++ b/provider/datagen/src/source.rs @@ -29,24 +29,26 @@ pub struct SourceData { icuexport_paths: Option>, icuexport_fallback_paths: Arc, segmenter_lstm_paths: Arc, - pub(crate) trie_type: TrieType, - pub(crate) collation_han_database: CollationHanDatabase, - pub(crate) collations: Vec, + trie_type: TrieType, + collation_han_database: CollationHanDatabase, + #[cfg(feature = "legacy_api")] + collations: Vec, } -#[cfg(feature = "networking")] -/// The default [`SourceData`] downloads the latest supported data. -/// -/// Requires `networking` Cargo feature. impl Default for SourceData { fn default() -> Self { - Self::offline() - .with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) - .unwrap() - .with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) - .unwrap() - .with_segmenter_lstm_for_tag(Self::LATEST_TESTED_SEGMENTER_LSTM_TAG) - .unwrap() + Self { + cldr_paths: None, + icuexport_paths: None, + icuexport_fallback_paths: Arc::new(SerdeCache::new( + AbstractFs::new_icuexport_fallback(), + )), + segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new_lstm_fallback())), + trie_type: Default::default(), + collation_han_database: Default::default(), + #[cfg(feature = "legacy_api")] + collations: Default::default(), + } } } @@ -60,26 +62,42 @@ impl SourceData { /// The latest segmentation LSTM model tag that has been verified to work with this version of `icu_datagen`. pub const LATEST_TESTED_SEGMENTER_LSTM_TAG: &'static str = "v0.1.0"; - #[doc(hidden)] - #[cfg(feature = "networking")] - #[deprecated(since = "1.3.0", note = "use SourceData::default()")] + /// The latest `SourceData` that has been verified to work with this version of `icu_datagen`. 
+ /// + /// See [`SourceData::LATEST_TESTED_CLDR_TAG`], [`SourceData::LATEST_TESTED_ICUEXPORT_TAG`], [`SourceData::LATEST_TESTED_SEGMENTER_LSTM_TAG`]. + /// + /// ✨ *Enabled with the `networking` Cargo feature.* + #[cfg(any(feature = "networking", test))] pub fn latest_tested() -> Self { - Self::default() - } - - /// Creates a `SourceData` that does not have CLDR or ICU export sources set. - pub fn offline() -> Self { - Self { - cldr_paths: None, - icuexport_paths: None, - icuexport_fallback_paths: Arc::new(SerdeCache::new( - AbstractFs::new_icuexport_fallback(), - )), - segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new_lstm_fallback())), - trie_type: Default::default(), - collation_han_database: Default::default(), - collations: Default::default(), - } + use once_cell::sync::OnceCell; + + // Singleton so that all instantiations share the same cache. + static SINGLETON: OnceCell = OnceCell::new(); + SINGLETON + .get_or_init(|| { + #[cfg(not(test))] + { + Self::default() + .with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) + .unwrap() + .with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) + .unwrap() + .with_segmenter_lstm_for_tag(Self::LATEST_TESTED_SEGMENTER_LSTM_TAG) + .unwrap() + } + #[cfg(test)] + { + // This is equivalent for the files defined in `tools/testdata-scripts/globs.rs.data`. + let data_root = + std::path::Path::new(core::env!("CARGO_MANIFEST_DIR")).join("tests/data"); + SourceData::default() + .with_cldr(data_root.join("cldr"), Default::default()) + .unwrap() + .with_icuexport(data_root.join("icuexport")) + .unwrap() + } + }) + .clone() } /// Adds CLDR data to this `SourceData`. The root should point to a local @@ -122,7 +140,7 @@ impl SourceData { /// /// Also see: [`LATEST_TESTED_CLDR_TAG`](Self::LATEST_TESTED_CLDR_TAG) /// - /// Requires `networking` Cargo feature. 
+ /// ✨ *Enabled with the `networking` Cargo feature.* #[cfg(feature = "networking")] pub fn with_cldr_for_tag( self, @@ -143,7 +161,7 @@ impl SourceData { /// /// Also see: [`LATEST_TESTED_ICUEXPORT_TAG`](Self::LATEST_TESTED_ICUEXPORT_TAG) /// - /// Requires `networking` Cargo feature. + /// ✨ *Enabled with the `networking` Cargo feature.* #[cfg(feature = "networking")] pub fn with_icuexport_for_tag(self, mut tag: &str) -> Result { if tag == "release-71-1" { @@ -165,7 +183,7 @@ impl SourceData { /// /// Also see: [`LATEST_TESTED_SEGMENTER_LSTM_TAG`](Self::LATEST_TESTED_SEGMENTER_LSTM_TAG) /// - /// Requires `networking` Cargo feature. + /// ✨ *Enabled with the `networking` Cargo feature.* #[cfg(feature = "networking")] pub fn with_segmenter_lstm_for_tag(self, tag: &str) -> Result { Ok(Self { @@ -217,6 +235,7 @@ impl SourceData { #[deprecated(note = "use crate::Options", since = "1.3.0")] #[doc(hidden)] + #[cfg(feature = "legacy_api")] pub fn with_collations(self, collations: Vec) -> Self { Self { collations, ..self } } @@ -241,6 +260,19 @@ impl SourceData { Ok(&self.segmenter_lstm_paths) } + pub(crate) fn trie_type(&self) -> TrieType { + self.trie_type + } + + pub(crate) fn collation_han_database(&self) -> CollationHanDatabase { + self.collation_han_database + } + + #[cfg(feature = "legacy_api")] + pub(crate) fn collations(&self) -> &[String] { + &self.collations + } + /// List the locales for the given CLDR coverage levels pub fn locales( &self, diff --git a/provider/datagen/src/transform/cldr/characters/mod.rs b/provider/datagen/src/transform/cldr/characters/mod.rs index a9dfbed9f61..97e12181420 100644 --- a/provider/datagen/src/transform/cldr/characters/mod.rs +++ b/provider/datagen/src/transform/cldr/characters/mod.rs @@ -467,7 +467,7 @@ mod tests { #[test] fn test_basic() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { diff --git 
a/provider/datagen/src/transform/cldr/cldr_serde/mod.rs b/provider/datagen/src/transform/cldr/cldr_serde/mod.rs index 76a27ac0324..8a2bb1a6d52 100644 --- a/provider/datagen/src/transform/cldr/cldr_serde/mod.rs +++ b/provider/datagen/src/transform/cldr/cldr_serde/mod.rs @@ -10,10 +10,13 @@ pub mod aliases; pub mod ca; pub mod coverage_levels; +#[cfg(feature = "icu_singlenumberformatter")] pub mod currencies; pub mod currency_data; +#[cfg(feature = "icu_relativetime")] pub mod date_fields; pub mod directionality; +#[cfg(feature = "icu_displaynames")] pub mod displaynames; pub mod exemplar_chars; pub mod japanese; diff --git a/provider/datagen/src/transform/cldr/currency/mod.rs b/provider/datagen/src/transform/cldr/currency/mod.rs index bb458ea9f79..c070724e515 100644 --- a/provider/datagen/src/transform/cldr/currency/mod.rs +++ b/provider/datagen/src/transform/cldr/currency/mod.rs @@ -291,7 +291,7 @@ fn test_basic() { use icu_locid::locale; use icu_singlenumberformatter::provider::*; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let en: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/datetime/mod.rs b/provider/datagen/src/transform/cldr/datetime/mod.rs index 779d3d99068..d5caa3601e2 100644 --- a/provider/datagen/src/transform/cldr/datetime/mod.rs +++ b/provider/datagen/src/transform/cldr/datetime/mod.rs @@ -473,7 +473,7 @@ mod test { #[test] fn test_basic_patterns() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let locale: Locale = locale!("cs"); let cs_dates: DataPayload = provider @@ -490,7 +490,7 @@ mod test { #[test] fn test_with_numbering_system() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let locale: Locale = locale!("haw"); let cs_dates: DataPayload = provider @@ -513,7 +513,7 @@ mod test { use 
icu_plurals::PluralCategory; use std::convert::TryFrom; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let locale: Locale = "fil-u-ca-gregory".parse().unwrap(); let skeletons: DataPayload = provider @@ -557,7 +557,7 @@ mod test { fn test_basic_symbols() { use icu_calendar::types::MonthCode; use tinystr::tinystr; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let locale: Locale = locale!("cs"); let cs_dates: DataPayload = provider @@ -588,7 +588,7 @@ mod test { #[test] fn unalias_contexts() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let locale: Locale = locale!("cs"); let cs_dates: DataPayload = provider diff --git a/provider/datagen/src/transform/cldr/datetime/week_data.rs b/provider/datagen/src/transform/cldr/datetime/week_data.rs index 33c1b11217a..db0ee4841a4 100644 --- a/provider/datagen/src/transform/cldr/datetime/week_data.rs +++ b/provider/datagen/src/transform/cldr/datetime/week_data.rs @@ -82,7 +82,7 @@ fn basic_cldr_week_data() { use icu_calendar::types::IsoWeekday; use icu_locid::langid; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let default_week_data: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/cldr/decimal/compact.rs b/provider/datagen/src/transform/cldr/decimal/compact.rs index 369c9c8f7d6..3779f59d7ed 100644 --- a/provider/datagen/src/transform/cldr/decimal/compact.rs +++ b/provider/datagen/src/transform/cldr/decimal/compact.rs @@ -131,7 +131,7 @@ mod tests { #[test] fn test_compact_long() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let fr_compact_long: DataPayload = provider .load(DataRequest { @@ -197,7 +197,7 @@ mod tests { #[test] fn test_compact_short() { - let 
provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let ja_compact_short: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/decimal/symbols.rs b/provider/datagen/src/transform/cldr/decimal/symbols.rs index 9ed0b891c37..7eea2434c57 100644 --- a/provider/datagen/src/transform/cldr/decimal/symbols.rs +++ b/provider/datagen/src/transform/cldr/decimal/symbols.rs @@ -96,7 +96,7 @@ impl TryFrom> for DecimalSymbolsV1<'static> { fn test_basic() { use icu_locid::locale; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let ar_decimal: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/language.rs b/provider/datagen/src/transform/cldr/displaynames/language.rs index 9058902dd01..8897bd77670 100644 --- a/provider/datagen/src/transform/cldr/displaynames/language.rs +++ b/provider/datagen/src/transform/cldr/displaynames/language.rs @@ -218,7 +218,7 @@ mod tests { #[test] fn test_basic_lang_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { @@ -240,7 +240,7 @@ mod tests { #[test] fn test_basic_lang_short_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { @@ -262,7 +262,7 @@ mod tests { #[test] fn test_basic_lang_long_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { @@ -284,7 +284,7 @@ mod tests { #[test] fn test_basic_lang_menu_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider 
.load(DataRequest { @@ -306,7 +306,7 @@ mod tests { #[test] fn test_basic_locale_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/region.rs b/provider/datagen/src/transform/cldr/displaynames/region.rs index 993195de186..18f812387cb 100644 --- a/provider/datagen/src/transform/cldr/displaynames/region.rs +++ b/provider/datagen/src/transform/cldr/displaynames/region.rs @@ -97,7 +97,7 @@ mod tests { #[test] fn test_basic() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { @@ -119,7 +119,7 @@ mod tests { #[test] fn test_basic_short_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/script.rs b/provider/datagen/src/transform/cldr/displaynames/script.rs index e222244035b..8f3c8f760d7 100644 --- a/provider/datagen/src/transform/cldr/displaynames/script.rs +++ b/provider/datagen/src/transform/cldr/displaynames/script.rs @@ -99,7 +99,7 @@ mod tests { #[test] fn test_basic_script_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { @@ -121,7 +121,7 @@ mod tests { #[test] fn test_basic_script_short_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/variant.rs b/provider/datagen/src/transform/cldr/displaynames/variant.rs index fc493e13cb8..142b8210d3b 100644 --- 
a/provider/datagen/src/transform/cldr/displaynames/variant.rs +++ b/provider/datagen/src/transform/cldr/displaynames/variant.rs @@ -89,7 +89,7 @@ mod tests { #[test] fn test_basic_variant_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/fallback/mod.rs b/provider/datagen/src/transform/cldr/fallback/mod.rs index 2f0ba1bae99..0be478268f0 100644 --- a/provider/datagen/src/transform/cldr/fallback/mod.rs +++ b/provider/datagen/src/transform/cldr/fallback/mod.rs @@ -205,7 +205,7 @@ fn test_basic() { subtags::{language, region, script}, }; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let likely_subtags: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs b/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs index 7670faf953a..6a16bf0bf82 100644 --- a/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs +++ b/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs @@ -271,7 +271,7 @@ fn test_appendix_c_cmp() { fn test_basic() { use icu_locid::subtags::{language, region, script}; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs b/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs index 5dbaef1e814..443dc87c31a 100644 --- a/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs +++ b/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs @@ -54,7 +54,7 @@ impl From<&cldr_serde::directionality::Resource> for 
ScriptDirectionV1<'_> { fn test_basic() { use icu_locid::subtags::script; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs b/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs index 4b6b71adc74..d2db48ffb62 100644 --- a/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs +++ b/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs @@ -278,7 +278,7 @@ pub(crate) fn transform<'x>( fn test_basic() { use icu_locid::subtags::{language, region, script}; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let result_common: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/plurals/mod.rs b/provider/datagen/src/transform/cldr/plurals/mod.rs index d03cdd61643..89f493678a3 100644 --- a/provider/datagen/src/transform/cldr/plurals/mod.rs +++ b/provider/datagen/src/transform/cldr/plurals/mod.rs @@ -89,7 +89,7 @@ impl From<&cldr_serde::plurals::LocalePluralRules> for PluralRulesV1<'static> { fn test_basic() { use icu_locid::langid; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); // Spot-check locale 'cs' since it has some interesting entries let cs_rules: DataPayload = provider diff --git a/provider/datagen/src/transform/cldr/relativetime/mod.rs b/provider/datagen/src/transform/cldr/relativetime/mod.rs index c0d33452d28..34a43649e23 100644 --- a/provider/datagen/src/transform/cldr/relativetime/mod.rs +++ b/provider/datagen/src/transform/cldr/relativetime/mod.rs @@ -197,7 +197,7 @@ mod tests { #[test] fn test_basic() { - let provider = crate::DatagenProvider::for_test(); + let provider = 
crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { locale: &locale!("en").into(), @@ -217,7 +217,7 @@ mod tests { #[test] fn test_singular_sub_pattern() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let data: DataPayload = provider .load(DataRequest { locale: &locale!("ar").into(), diff --git a/provider/datagen/src/transform/cldr/source.rs b/provider/datagen/src/transform/cldr/source.rs index 4c90a0e1def..2ebd03d6224 100644 --- a/provider/datagen/src/transform/cldr/source.rs +++ b/provider/datagen/src/transform/cldr/source.rs @@ -99,6 +99,8 @@ impl CldrCache { .iter() .filter_map(|(locale, c)| levels.contains(c).then_some(locale)) .cloned() + // `und` needs to be part of every set + .chain([Default::default()]) .collect()) } diff --git a/provider/datagen/src/transform/cldr/time_zones/mod.rs b/provider/datagen/src/transform/cldr/time_zones/mod.rs index 9692fa91f32..09965407b02 100644 --- a/provider/datagen/src/transform/cldr/time_zones/mod.rs +++ b/provider/datagen/src/transform/cldr/time_zones/mod.rs @@ -119,7 +119,7 @@ mod tests { fn basic_cldr_time_zones() { use icu_locid::langid; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let time_zone_formats: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/icuexport/collator/mod.rs b/provider/datagen/src/transform/icuexport/collator/mod.rs index 1fb44a4ed2a..4f6495e90f3 100644 --- a/provider/datagen/src/transform/icuexport/collator/mod.rs +++ b/provider/datagen/src/transform/icuexport/collator/mod.rs @@ -14,7 +14,6 @@ use icu_locid::LanguageIdentifier; use icu_locid::Locale; use icu_provider::datagen::IterableDataProvider; use icu_provider::prelude::*; -use std::collections::HashSet; use std::convert::TryFrom; use std::str::FromStr; use writeable::Writeable; @@ -22,9 +21,6 @@ use zerovec::ZeroVec; mod 
collator_serde; -// Collations removed by default from ICU4X data, plus all starting with "search". -static DEFAULT_REMOVED_COLLATIONS: &[&str] = &["big5han", "gb2312"]; - /// Backward compatibility for https://unicode-org.atlassian.net/browse/CLDR-15603 fn has_legacy_swedish_variants(source: &crate::SourceData) -> bool { source @@ -32,7 +28,7 @@ fn has_legacy_swedish_variants(source: &crate::SourceData) -> bool { .and_then(|i| { i.file_exists(&format!( "collation/{}/sv_reformed_meta.toml", - source.collation_han_database, + source.collation_han_database(), )) }) .unwrap_or(false) @@ -108,31 +104,6 @@ fn file_name_to_locale(file_name: &str, has_legacy_swedish_variants: bool) -> Op Some(locale) } -pub(crate) fn filter_data_locales( - locales: HashSet, - collations: &HashSet, -) -> HashSet { - locales - .into_iter() - .filter(|locale| { - locale - .get_unicode_ext(&key!("co")) - .and_then(|co| co.as_single_subtag().copied()) - .map(|collation| { - if collations.contains(collation.as_str()) { - true - } else if collation.starts_with("search") { - // Note: literal "search" and "searchjl" are handled above - collations.contains("search*") - } else { - !DEFAULT_REMOVED_COLLATIONS.contains(&collation.as_str()) - } - }) - .unwrap_or(true) - }) - .collect() -} - macro_rules! collation_provider { ($(($marker:ident, $serde_struct:ident, $suffix:literal, $conversion:expr)),+, $toml_data:ident) => { $( @@ -144,7 +115,7 @@ macro_rules! collation_provider { .icuexport()? .read_and_parse_toml(&format!( "collation/{}/{}{}.toml", - self.source.collation_han_database, + self.source.collation_han_database(), locale_to_file_name(&req.locale, has_legacy_swedish_variants(&self.source)), $suffix )) @@ -176,7 +147,7 @@ macro_rules! collation_provider { .icuexport()? .list(&format!( "collation/{}", - self.source.collation_han_database + self.source.collation_han_database() ))? 
.filter_map(|mut file_name| { file_name.truncate(file_name.len() - ".toml".len()); @@ -252,113 +223,3 @@ collation_provider!( ), toml_data ); - -#[test] -fn test_collation_filtering() { - use crate::options; - use icu_locid::langid; - use std::collections::BTreeSet; - - #[derive(Debug)] - struct TestCase<'a> { - include_collations: &'a [&'a str], - language: LanguageIdentifier, - expected: &'a [&'a str], - } - let cases = [ - TestCase { - include_collations: &[], - language: langid!("zh"), - expected: &["zh", "zh-u-co-stroke", "zh-u-co-unihan", "zh-u-co-zhuyin"], - }, - TestCase { - include_collations: &["gb2312"], - language: langid!("zh"), - expected: &[ - "zh", - "zh-u-co-gb2312", - "zh-u-co-stroke", - "zh-u-co-unihan", - "zh-u-co-zhuyin", - ], - }, - TestCase { - include_collations: &["big5han"], - language: langid!("zh"), - expected: &[ - "zh", - "zh-u-co-big5han", - "zh-u-co-stroke", - "zh-u-co-unihan", - "zh-u-co-zhuyin", - ], - }, - TestCase { - include_collations: &["gb2312", "search*"], - language: langid!("zh"), - expected: &[ - "zh", - "zh-u-co-gb2312", - "zh-u-co-stroke", - "zh-u-co-unihan", - "zh-u-co-zhuyin", - ], - }, - TestCase { - include_collations: &[], - language: langid!("ko"), - expected: &["ko", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["search"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-search", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["searchjl"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-searchjl", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["search", "searchjl"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["search*", "big5han"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], - }, - ]; - for cas in cases { - let provider = crate::DatagenProvider::for_test(); - let mut options = 
options::Options::default(); - options.collations = cas - .include_collations - .iter() - .copied() - .map(String::from) - .collect(); - options.locales = - crate::options::LocaleInclude::Explicit([cas.language.clone()].into_iter().collect()); - options.fallback = crate::options::FallbackMode::Preresolved; - - let resolved_locales = provider - .select_locales_for_key( - CollationDataV1Marker::KEY, - &options, - &once_cell::sync::Lazy::new(|| unreachable!()), - ) - .unwrap() - .into_iter() - .map(|l| l.to_string()) - .collect::>(); - let expected_locales = cas - .expected - .iter() - .copied() - .map(String::from) - .collect::>(); - assert_eq!(resolved_locales, expected_locales, "{cas:?}"); - } -} diff --git a/provider/datagen/src/transform/icuexport/normalizer/mod.rs b/provider/datagen/src/transform/icuexport/normalizer/mod.rs index dd964e64d3a..477631c14b4 100644 --- a/provider/datagen/src/transform/icuexport/normalizer/mod.rs +++ b/provider/datagen/src/transform/icuexport/normalizer/mod.rs @@ -25,7 +25,8 @@ macro_rules! normalization_provider { let $toml_data: &normalizer_serde::$serde_struct = self.source.icuexport()?.read_and_parse_toml(&format!( "norm/{}/{}.toml", - self.source.trie_type, $file_name + self.source.trie_type(), + $file_name ))?; $conversion diff --git a/provider/datagen/src/transform/icuexport/ucase/mod.rs b/provider/datagen/src/transform/icuexport/ucase/mod.rs index d3018d93d92..ef414a5617f 100644 --- a/provider/datagen/src/transform/icuexport/ucase/mod.rs +++ b/provider/datagen/src/transform/icuexport/ucase/mod.rs @@ -21,7 +21,7 @@ impl DataProvider for crate::DatagenProvider { .icuexport()? .read_and_parse_toml::(&format!( "ucase/{}/ucase.toml", - self.source.trie_type + self.source.trie_type() ))? .ucase; @@ -61,7 +61,7 @@ impl DataProvider for crate::DatagenProvider { .icuexport()? .read_and_parse_toml::(&format!( "ucase/{}/ucase.toml", - self.source.trie_type + self.source.trie_type() ))? 
.ucase; diff --git a/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs b/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs index e7f14b39681..b5efc428252 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs @@ -18,7 +18,8 @@ fn get_code_point_prop_map<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type, key + source.trie_type(), + key ))? .enum_property .get(0) @@ -125,7 +126,7 @@ mod tests { #[test] fn test_bidi_data_provider() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs b/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs index b393b278e04..b52b04434a2 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs @@ -17,7 +17,8 @@ pub(crate) fn get_binary_prop_for_code_point_set<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type, key + source.trie_type(), + key ))? .binary_property .get(0) @@ -137,7 +138,7 @@ fn test_basic() { use icu_properties::provider::PropertyCodePointSetV1; use icu_properties::provider::WhiteSpaceV1Marker; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs b/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs index 93d50ae2855..505edfdb92e 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs @@ -18,7 +18,8 @@ fn get_binary_prop_for_unicodeset<'a>( .icuexport()? 
.read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type, key + source.trie_type(), + key ))? .binary_property .get(0) @@ -78,7 +79,7 @@ fn test_basic() { use icu_properties::provider::BasicEmojiV1Marker; use icu_properties::provider::PropertyUnicodeSetV1; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs b/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs index 30bdb277272..d2e604927c1 100644 --- a/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs +++ b/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs @@ -20,7 +20,8 @@ pub(crate) fn get_enumerated_prop<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type, key + source.trie_type(), + key ))? .enum_property .get(0) @@ -379,7 +380,7 @@ fn get_mask_prop<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type, + source.trie_type(), key ))? .mask_property @@ -523,7 +524,7 @@ mod tests { // the ICU CodePointTrie that ICU4X is reading from. 
#[test] fn test_general_category() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) @@ -541,7 +542,7 @@ mod tests { #[test] fn test_script() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/script.rs b/provider/datagen/src/transform/icuexport/uprops/script.rs index 5f281e26fa3..8b91b2279f4 100644 --- a/provider/datagen/src/transform/icuexport/uprops/script.rs +++ b/provider/datagen/src/transform/icuexport/uprops/script.rs @@ -25,7 +25,7 @@ impl DataProvider for crate::DatagenProvid .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/scx.toml", - self.source.trie_type, + self.source.trie_type(), ))? .script_extensions .get(0) @@ -73,7 +73,7 @@ mod tests { #[test] fn test_script_val_from_script_extensions() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) @@ -93,7 +93,7 @@ mod tests { #[test] fn test_scx_array_from_script_extensions() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) @@ -167,7 +167,7 @@ mod tests { #[test] fn test_has_script() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) @@ -247,7 +247,7 @@ mod tests { #[test] fn test_get_script_extensions_set() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider .load(Default::default()) diff --git 
a/provider/datagen/src/transform/mod.rs b/provider/datagen/src/transform/mod.rs index cda63311103..abdb2c5a656 100644 --- a/provider/datagen/src/transform/mod.rs +++ b/provider/datagen/src/transform/mod.rs @@ -43,7 +43,7 @@ impl DatagenProvider { #[test] fn test_missing_locale() { use icu_locid::langid; - let provider = DatagenProvider::for_test(); + let provider = DatagenProvider::latest_tested(); assert!(DataProvider::::load( &provider, DataRequest { diff --git a/provider/datagen/src/transform/segmenter/dictionary.rs b/provider/datagen/src/transform/segmenter/dictionary.rs index 0d784c97cd4..962f91367d3 100644 --- a/provider/datagen/src/transform/segmenter/dictionary.rs +++ b/provider/datagen/src/transform/segmenter/dictionary.rs @@ -2,7 +2,6 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::options; use icu_locid::{langid, locale}; use icu_provider::datagen::IterableDataProvider; use icu_provider::prelude::*; @@ -38,23 +37,6 @@ pub(crate) fn data_locale_to_model_name(locale: &DataLocale) -> Option<&'static } } -pub(crate) fn filter_data_locales( - locales: HashSet, - segmenter_models: &options::SegmenterModelInclude, -) -> HashSet { - match segmenter_models { - options::SegmenterModelInclude::Recommended => locales, - options::SegmenterModelInclude::None => Default::default(), - options::SegmenterModelInclude::Explicit(list) => locales - .into_iter() - .filter(|locale| { - list.iter() - .any(|x| Some(x.as_str()) == data_locale_to_model_name(locale)) - }) - .collect(), - } -} - impl crate::DatagenProvider { fn load_dictionary_data( &self, diff --git a/provider/datagen/src/transform/segmenter/lstm.rs b/provider/datagen/src/transform/segmenter/lstm.rs index 38a9ee3b6d3..00f140d8b35 100644 --- a/provider/datagen/src/transform/segmenter/lstm.rs +++ b/provider/datagen/src/transform/segmenter/lstm.rs @@ -4,7 +4,6 @@ //! 
This module contains provider implementations backed by LSTM segmentation data. -use crate::options; use icu_locid::langid; use icu_provider::datagen::IterableDataProvider; use icu_provider::prelude::*; @@ -203,23 +202,6 @@ pub(crate) fn data_locale_to_model_name(locale: &DataLocale) -> Option<&'static } } -pub(crate) fn filter_data_locales( - locales: HashSet, - segmenter_models: &options::SegmenterModelInclude, -) -> HashSet { - match &segmenter_models { - options::SegmenterModelInclude::Recommended => locales, - options::SegmenterModelInclude::None => Default::default(), - options::SegmenterModelInclude::Explicit(list) => locales - .into_iter() - .filter(|locale| { - list.iter() - .any(|x| Some(x.as_str()) == data_locale_to_model_name(locale)) - }) - .collect(), - } -} - impl DataProvider for crate::DatagenProvider { fn load( &self, @@ -268,7 +250,7 @@ mod tests { #[test] fn thai_word_break_with_grapheme_model() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let raw_data = provider .source .segmenter_lstm() diff --git a/provider/datagen/src/transform/segmenter/mod.rs b/provider/datagen/src/transform/segmenter/mod.rs index b7702712dc6..63c1c22b6db 100644 --- a/provider/datagen/src/transform/segmenter/mod.rs +++ b/provider/datagen/src/transform/segmenter/mod.rs @@ -585,7 +585,7 @@ impl crate::DatagenProvider { data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map), default_value: 0, error_value: 0, - trie_type: match self.source.trie_type { + trie_type: match self.source.trie_type() { crate::source::TrieType::Fast => icu_collections::codepointtrie::TrieType::Fast, crate::source::TrieType::Small => icu_collections::codepointtrie::TrieType::Small, }, @@ -679,7 +679,7 @@ mod tests { #[test] fn load_grapheme_cluster_data() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested(); let payload: DataPayload = provider 
.load(Default::default()) .expect("Loading should succeed!") diff --git a/provider/datagen/tests/make-testdata.rs b/provider/datagen/tests/make-testdata.rs index 4a1704d4a4f..8a57679bfdc 100644 --- a/provider/datagen/tests/make-testdata.rs +++ b/provider/datagen/tests/make-testdata.rs @@ -28,7 +28,7 @@ fn generate_json_and_verify_postcard() { let data_root = Path::new(concat!(core::env!("CARGO_MANIFEST_DIR"), "/tests/data/")); - let source = SourceData::offline() + let source = SourceData::default() .with_cldr(data_root.join("cldr"), Default::default()) .unwrap() .with_icuexport(data_root.join("icuexport")) @@ -54,16 +54,17 @@ fn generate_json_and_verify_postcard() { ), }); - let mut options = options::Options::default(); - options.keys = icu_datagen::all_keys().into_iter().collect(); - options.locales = options::LocaleInclude::Explicit(LOCALES.iter().cloned().collect()); - options.segmenter_models = options::SegmenterModelInclude::Explicit(vec![ - "thaidict".into(), - "Thai_codepoints_exclusive_model4_heavy".into(), - ]); - - DatagenProvider::new(source) - .export(options, MultiExporter::new(vec![json_out, postcard_out])) + DataExportDriver::default() + .with_keys(icu_datagen::all_keys()) + .with_locales(LOCALES.iter().cloned()) + .with_segmenter_models(vec![ + "thaidict".into(), + "Thai_codepoints_exclusive_model4_heavy".into(), + ]) + .export( + &DatagenProvider { source }, + MultiExporter::new(vec![json_out, postcard_out]), + ) .unwrap(); } diff --git a/provider/datagen/tests/test-options.rs b/provider/datagen/tests/test-options.rs index f828e4de79f..78a9d49b286 100644 --- a/provider/datagen/tests/test-options.rs +++ b/provider/datagen/tests/test-options.rs @@ -6,11 +6,9 @@ use std::collections::{BTreeMap, HashSet}; use std::path::Path; use elsa::sync::FrozenMap; -use icu_datagen::options::{FallbackMode, LocaleInclude, Options}; -use icu_datagen::{DatagenProvider, SourceData}; +use icu_datagen::prelude::*; use icu_decimal::provider::DecimalSymbolsV1Marker; -use 
icu_locid::{langid, LanguageIdentifier}; -use icu_provider::datagen::{DataExporter, ExportMarker}; +use icu_provider::datagen::ExportMarker; use icu_provider::prelude::*; use postcard::ser_flavors::{AllocVec, Flavor}; use writeable::Writeable; @@ -64,18 +62,17 @@ fn test_fallback_options() { let data_root = Path::new(concat!(core::env!("CARGO_MANIFEST_DIR"), "/tests/data/")); - let source = SourceData::offline() - .with_cldr(data_root.join("cldr"), Default::default()) - .unwrap() - .with_icuexport(data_root.join("icuexport")) - .unwrap(); - - let decimal_symbols_key: HashSet = [DecimalSymbolsV1Marker::KEY].into_iter().collect(); + let provider = DatagenProvider { + source: SourceData::default() + .with_cldr(data_root.join("cldr"), Default::default()) + .unwrap() + .with_icuexport(data_root.join("icuexport")) + .unwrap(), + }; let mut testing_exporter = TestingExporter::default(); - let mut options = Options::default(); - options.keys = decimal_symbols_key.clone(); + let driver = DataExportDriver::default().with_keys([DecimalSymbolsV1Marker::KEY]); let explicit_locales: HashSet = [ langid!("arc"), // Aramaic, not in CLDR @@ -91,10 +88,11 @@ fn test_fallback_options() { // // All+Hybrid // - options.locales = LocaleInclude::All; - options.fallback = FallbackMode::Hybrid; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_all_locales() + .with_fallback_mode(FallbackMode::Hybrid) + .export(&provider, &mut testing_exporter) .unwrap(); let data_all_hybrid = testing_exporter.take_map_and_reset(); @@ -133,9 +131,11 @@ fn test_fallback_options() { // All+Runtime // - options.fallback = FallbackMode::RuntimeManual; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_all_locales() + .with_fallback_mode(FallbackMode::RuntimeManual) + .export(&provider, &mut testing_exporter) .unwrap(); let data_all_runtime = testing_exporter.take_map_and_reset(); 
@@ -207,10 +207,11 @@ fn test_fallback_options() { // Explicit+Hybrid // - options.locales = LocaleInclude::Explicit(explicit_locales.clone()); - options.fallback = FallbackMode::Hybrid; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_locales(explicit_locales.clone()) + .with_fallback_mode(FallbackMode::Hybrid) + .export(&provider, &mut testing_exporter) .unwrap(); let data_explicit_hybrid = testing_exporter.take_map_and_reset(); @@ -242,10 +243,11 @@ fn test_fallback_options() { // Explicit+Runtime // - options.locales = LocaleInclude::Explicit(explicit_locales.clone()); - options.fallback = FallbackMode::RuntimeManual; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_locales(explicit_locales.clone()) + .with_fallback_mode(FallbackMode::RuntimeManual) + .export(&provider, &mut testing_exporter) .unwrap(); let data_explicit_runtime = testing_exporter.take_map_and_reset(); @@ -278,10 +280,11 @@ fn test_fallback_options() { // Explicit+Preresolved // - options.locales = LocaleInclude::Explicit(explicit_locales.clone()); - options.fallback = FallbackMode::Preresolved; - DatagenProvider { source } - .export(options, &mut testing_exporter) + driver + .clone() + .with_locales(explicit_locales.clone()) + .with_fallback_mode(FallbackMode::Preresolved) + .export(&provider, &mut testing_exporter) .unwrap(); let data_explicit_preresolved = testing_exporter.take_map_and_reset(); diff --git a/provider/fs/src/export/mod.rs b/provider/fs/src/export/mod.rs index b79a7407116..8fdca9674d2 100644 --- a/provider/fs/src/export/mod.rs +++ b/provider/fs/src/export/mod.rs @@ -23,14 +23,10 @@ //! .expect("Should successfully initialize data output directory"); //! //! // Export something -//! DatagenProvider::default() -//! .export({ -//! let mut options = options::Options::default(); -//! 
options.keys = [icu_provider::hello_world::HelloWorldV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! exporter -//! ).unwrap(); +//! DataExportDriver::default() +//! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) +//! .export(&DatagenProvider::default(), exporter) +//! .unwrap(); //! # //! # let _ = std::fs::remove_dir_all(&demo_path); //! ``` diff --git a/provider/fs/tests/data/bincode.json b/provider/fs/tests/data/bincode.json index 20cec59afb1..d8835be82db 100644 --- a/provider/fs/tests/data/bincode.json +++ b/provider/fs/tests/data/bincode.json @@ -4,6 +4,7 @@ "core/helloworld@1" ] }, + "fallback": "Hybrid", "locales": "All", "cldr": "None", "icu_export": "None", @@ -15,4 +16,4 @@ } }, "overwrite": true - } \ No newline at end of file +} \ No newline at end of file diff --git a/provider/fs/tests/data/json.json b/provider/fs/tests/data/json.json index 6711841894f..7db1650e4fb 100644 --- a/provider/fs/tests/data/json.json +++ b/provider/fs/tests/data/json.json @@ -4,6 +4,7 @@ "core/helloworld@1" ] }, + "fallback": "Hybrid", "locales": "All", "cldr": "None", "icu_export": "None", @@ -15,4 +16,4 @@ } }, "overwrite": true - } \ No newline at end of file +} \ No newline at end of file diff --git a/provider/fs/tests/data/postcard.json b/provider/fs/tests/data/postcard.json index 3e118721f8d..e9dc3768511 100644 --- a/provider/fs/tests/data/postcard.json +++ b/provider/fs/tests/data/postcard.json @@ -4,6 +4,7 @@ "core/helloworld@1" ] }, + "fallback": "Hybrid", "locales": "All", "cldr": "None", "icu_export": "None", @@ -15,4 +16,4 @@ } }, "overwrite": true - } \ No newline at end of file +} \ No newline at end of file diff --git a/tools/testdata-scripts/src/bin/make-testdata-legacy.rs b/tools/testdata-scripts/src/bin/make-testdata-legacy.rs index 1735cf91d97..cff79d9629e 100644 --- a/tools/testdata-scripts/src/bin/make-testdata-legacy.rs +++ b/tools/testdata-scripts/src/bin/make-testdata-legacy.rs @@ -25,7 +25,7 @@ fn main() { 
std::fs::create_dir_all(data_root).unwrap(); - let source = SourceData::offline() + let source = SourceData::default() .with_cldr_latest(Default::default()) .unwrap() .with_icuexport_latest()