diff --git a/Cargo.lock b/Cargo.lock index 955a928ecce..c9732d9c864 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1759,8 +1759,6 @@ dependencies = [ "lazy_static", "log", "ndarray", - "proc-macro2", - "quote", "rayon", "repodata", "serde", diff --git a/provider/blob/src/export/blob_exporter.rs b/provider/blob/src/export/blob_exporter.rs index bab96320b26..d4b0c1f77ba 100644 --- a/provider/blob/src/export/blob_exporter.rs +++ b/provider/blob/src/export/blob_exporter.rs @@ -57,7 +57,6 @@ impl DataExporter for BlobExporter<'_> { locale: &DataLocale, payload: &DataPayload<ExportMarker>, ) -> Result<(), DataError> { - log::trace!("Adding: {}/{}", key, locale); let mut serializer = postcard::Serializer { output: AllocVec::new(), }; diff --git a/provider/blob/src/export/mod.rs b/provider/blob/src/export/mod.rs index a9d1ac8345b..d831f651ee7 100644 --- a/provider/blob/src/export/mod.rs +++ b/provider/blob/src/export/mod.rs @@ -2,60 +2,41 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -//! Data generation for [`BlobDataProvider`](crate::BlobDataProvider) data. See the `icu_datagen` crate. +//! Data exporter for [`BlobDataProvider`](crate::BlobDataProvider). //! -//! # Examples +//! This module can be used as a target for the `icu_datagen` crate. //! -//! ``` -//! use icu_provider::datagen::DataExporter; -//! use icu_provider::dynutil::*; +//! # Examples +//! +//! ``` +//! use icu_datagen::prelude::*; //! use icu_provider::hello_world::*; -//! use icu_provider::prelude::*; -//! use icu_provider_blob::export::BlobExporter; +//! use icu_provider_blob::export::*; //! use icu_provider_blob::BlobDataProvider; -//! use std::borrow::Cow; -//! use std::io::Read; -//! use std::rc::Rc; //! //! let mut buffer: Vec<u8> = Vec::new(); -//! -//! let payload = DataPayload::<HelloWorldV1Marker>::from_owned(HelloWorldV1 { -//! message: Cow::Borrowed("Hi"), -//! }); +//! +//! // Set up the exporter +//! let mut exporter = BlobExporter::new_with_sink(Box::new(&mut buffer)); //! //! // Export something -//! { -//! let mut exporter = BlobExporter::new_with_sink(Box::new(&mut buffer)); -//! exporter -//! .put_payload( -//! HelloWorldV1Marker::KEY, -//! &Default::default(), -//! &UpcastDataPayload::upcast(payload.clone()), -//! ) -//! .expect("Should successfully export"); -//! exporter -//! .close() -//! .expect("Should successfully dump to buffer"); -//! } +//! DatagenProvider::default() +//! .export( +//! [HelloWorldV1Marker::KEY].into_iter().collect(), +//! exporter +//! ).unwrap(); //! -//! // Create a blob provider reading from the buffer +//! // Create a blob provider reading from the blob //! let provider = //! BlobDataProvider::try_new_from_blob(buffer.into_boxed_slice()) -//! .expect("Should successfully read from buffer") +//! .expect("Should successfully read from buffer") +//! .as_deserializing(); //! -//! // Read the key from the filesystem and ensure it is as expected -//! let req = DataRequest { -//! locale: Default::default(), -//! metadata: Default::default(), -//! }; +//! // Read the key from the blob //! let response: DataPayload<HelloWorldV1Marker> = provider -//! .as_deserializing() -//! .load(req) +//! .load(Default::default()) //! .unwrap() //! .take_payload() //! .unwrap(); -//! -//! assert_eq!(response.get(), payload.get(),); //! 
``` mod blob_exporter; diff --git a/provider/datagen/Cargo.toml b/provider/datagen/Cargo.toml index effa9249b45..e0a1cb711d2 100644 --- a/provider/datagen/Cargo.toml +++ b/provider/datagen/Cargo.toml @@ -48,15 +48,18 @@ icu_segmenter = { version = "1.2.0", path = "../../components/segmenter", featur icu_timezone = { version = "1.2.0", path = "../../components/timezone", features = ["datagen"] } # ICU provider infrastructure -icu_provider = { version = "1.2.0", path = "../core", features = ["std", "log_error_context", "datagen"]} +icu_provider = { version = "1.2.0", path = "../core", features = ["std", "log_error_context", "datagen", "deserialize_json"]} icu_provider_adapters = { version = "1.2.0", path = "../adapters", features = ["datagen"] } -icu_provider_blob = { version = "1.2.0", path = "../blob", features = ["export"] } -icu_provider_fs = { version = "1.2.0", path = "../fs", features = ["export"] } + +# Exporters +icu_provider_blob = { version = "1.2.0", path = "../blob", features = ["export"], optional = true } +icu_provider_fs = { version = "1.2.0", path = "../fs", features = ["export"], optional = true } +crlify = { version = "1.0.1", path = "../../utils/crlify", optional = true } +databake = { version = "0.1.3", path = "../../utils/databake", optional = true} +syn = {version = "1.0", features = ["parsing"], optional = true } # Other cached-path = { version = ">=0.5, <0.7", optional = true } -crlify = { version = "1.0.1", path = "../../utils/crlify"} -databake = { version = "0.1.3", path = "../../utils/databake"} displaydoc = { version = "0.2.3", default-features = false } elsa = "1.7" icu_codepointtrie_builder = { version = "0.3.4", path = "../../components/collections/codepointtrie_builder", default-features = false } @@ -66,13 +69,10 @@ itertools = "0.10" lazy_static = "1" log = "0.4" ndarray = { version = "0.15.5", default-features = false } -proc-macro2 = "1.0" -quote = "1.0.9" rayon = "1.5" serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] } serde_json = { version = "1.0", default-features = false, features = ["alloc"] } serde-aux = { version = "4.1.2", default-features = false } -syn = {version = "1.0", features = ["parsing"] } tinystr = { version = "0.7.1", path = "../../utils/tinystr", features = ["alloc", "serde", "zerovec"], default-features = false } toml = "0.5" writeable = { version = "0.5.1", path = "../../utils/writeable" } @@ -95,7 +95,11 @@ repodata = { path = "../../provider/repodata" } dhat = "0.3.0" [features] -default = ["bin", "use_wasm", "networking"] +default = ["bin", "use_wasm", "networking", "legacy_api"] +provider_baked = ["dep:crlify", "dep:databake", "dep:syn"] +provider_blob = ["dep:icu_provider_blob"] +provider_fs = ["dep:icu_provider_fs"] +legacy_api = ["provider_fs", "provider_blob", "provider_baked"] bin = ["dep:clap", "dep:eyre", "dep:simple_logger"] # Use wasm for building codepointtries use_wasm = ["icu_codepointtrie_builder/wasm"] @@ -108,7 +112,7 @@ networking = ["dep:cached-path"] [[bin]] name = "icu4x-datagen" -path = "src/bin/datagen.rs" +path = "src/bin/datagen/mod.rs" required-features = ["bin"] [[test]] @@ -117,4 +121,4 @@ path = "tests/verify-zero-copy.rs" [package.metadata.cargo-all-features] # Disable check-all-features, as the bin feature is purely additive. 
-skip_feature_sets = [[]] +skip_feature_sets = [["use_icu4c"], ["use_wasm"]] diff --git a/provider/datagen/src/databake.rs b/provider/datagen/src/baked_exporter.rs similarity index 86% rename from provider/datagen/src/databake.rs rename to provider/datagen/src/baked_exporter.rs index 811087ac5d3..b3e34075d56 100644 --- a/provider/datagen/src/databake.rs +++ b/provider/datagen/src/baked_exporter.rs @@ -2,20 +2,61 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use databake::{quote, CrateEnv, TokenStream}; +//! A data exporter that bakes the data into Rust code. +//! +//! This module can be used as a target for the `icu_datagen` crate. +//! +//! # Examples +//! +//! ``` +//! use icu_datagen::prelude::*; +//! use icu_provider::hello_world::*; +//! use icu_datagen::baked_exporter::*; +//! +//! let demo_path = std::env::temp_dir().join("icu4x_baked_demo"); +//! +//! // Set up the exporter +//! let mut exporter = BakedExporter::new(demo_path.clone(), Default::default()).unwrap(); +//! +//! // Export something +//! DatagenProvider::default() +//! .export( +//! [HelloWorldV1Marker::KEY].into_iter().collect(), +//! exporter +//! ).unwrap(); +//! # +//! # std::fs::remove_dir_all(&demo_path) +//! # .expect("Should clean up test directory"); +//! ``` +//! +//! The resulting module structure can now be used like this: +//! +//! ```compile_fail +//! use icu_provider::prelude::*; +//! +//! struct MyDataProvider; +//! +//! mod baked { +//! include!("/path/to/mod/") +//! impl_data_provider!(MyDataProvider); +//! } +//! +//! let response: DataPayload = provider +//! .load(Default::default()) +//! .unwrap() +//! .take_payload() +//! .unwrap(); +//! ``` + +use databake::*; use icu_provider::datagen::*; use icu_provider::prelude::*; -use rayon::prelude::*; -use std::collections::BTreeMap; -use std::collections::BTreeSet; -use std::collections::HashMap; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::fs::File; use std::io::Write; use std::path::PathBuf; use std::sync::Mutex; -use crate::BakedOptions; - macro_rules! move_out { ($field:expr) => {{ let mut tmp = Default::default(); @@ -27,8 +68,37 @@ macro_rules! move_out { // TokenStream isn't Send/Sync type SyncTokenStream = String; +/// Options for configuring the output of [`BakedExporter`]. +#[non_exhaustive] +#[derive(Debug)] +pub struct Options { + /// Whether to run `rustfmt` on the generated files. + pub pretty: bool, + /// Whether to gate each key on its crate name. This allows using the module + /// even if some keys are not required and their dependencies are not included. + /// Requires use_separate_crates. + pub insert_feature_gates: bool, + /// Whether to use separate crates to name types instead of the `icu` metacrate + pub use_separate_crates: bool, + /// Whether to overwrite existing data. By default, errors if it is present. + pub overwrite: bool, +} + +#[allow(clippy::derivable_impls)] // want to be explicit about bool defaults +impl Default for Options { + fn default() -> Self { + Self { + pretty: false, + insert_feature_gates: false, + use_separate_crates: false, + overwrite: false, + } + } +} + #[allow(clippy::type_complexity)] -pub(crate) struct BakedDataExporter { +/// See the module-level documentation for details. 
+pub struct BakedExporter { // Input arguments mod_directory: PathBuf, pretty: bool, @@ -54,9 +124,22 @@ struct ImplData { feature: SyncTokenStream, } -impl BakedDataExporter { - pub fn new(mod_directory: PathBuf, options: BakedOptions) -> Result { - let BakedOptions { +impl std::fmt::Debug for BakedExporter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BakedExporter") + .field("mod_directory", &self.mod_directory) + .field("pretty", &self.pretty) + .field("insert_feature_gates", &self.insert_feature_gates) + .field("use_separate_crates", &self.use_separate_crates) + // skip formatting intermediate data + .finish() + } +} + +impl BakedExporter { + /// Constructs a new [`BakedExporter`] with the given output directory and options. + pub fn new(mod_directory: PathBuf, options: Options) -> Result { + let Options { pretty, insert_feature_gates, use_separate_crates, @@ -179,6 +262,7 @@ impl BakedDataExporter { } fn write_intermediate_mod_files(&mut self) -> Result<(), DataError> { + use rayon::prelude::*; move_out!(self.mod_files) .into_inner() .expect("poison") @@ -199,7 +283,7 @@ impl BakedDataExporter { } } -impl DataExporter for BakedDataExporter { +impl DataExporter for BakedExporter { fn put_payload( &self, key: DataKey, @@ -377,6 +461,8 @@ impl DataExporter for BakedDataExporter { } fn close(&mut self) -> Result<(), DataError> { + log::info!("Writing module structure..."); + // These are BTreeMaps keyed on the marker to keep the output sorted and stable let mut data_impls = BTreeMap::new(); let mut any_consts = BTreeMap::new(); diff --git a/provider/datagen/src/bin/datagen.rs b/provider/datagen/src/bin/datagen.rs deleted file mode 100644 index e57a275e6d7..00000000000 --- a/provider/datagen/src/bin/datagen.rs +++ /dev/null @@ -1,386 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
- -use clap::{ArgGroup, Parser}; -use eyre::WrapErr; -use icu_datagen::prelude::*; -use simple_logger::SimpleLogger; -use std::path::PathBuf; - -mod cli { - use clap::ValueEnum; - - #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] - pub(crate) enum Format { - Dir, - Blob, - Mod, - DeprecatedDefault, - } - - #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] - pub(crate) enum Syntax { - Json, - Bincode, - Postcard, - } - - #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] - pub(crate) enum TrieType { - Small, - Fast, - } - #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] - pub(crate) enum CollationHanDatabase { - Unihan, - Implicit, - } - - #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] - pub(crate) enum CollationTable { - Gb2312, - Big5han, - Search, - Searchji, - #[value(alias = "search*")] // for backwards compatability - SearchAll, - } - - impl CollationTable { - pub(crate) fn to_datagen_value(self) -> &'static str { - match self { - Self::Gb2312 => "gb2312", - Self::Big5han => "big5han", - Self::Search => "search", - Self::Searchji => "searchji", - Self::SearchAll => "search*", - } - } - } -} -#[derive(Parser)] -#[command(name = "icu4x-datagen")] -#[command(author = "The ICU4X Project Developers", version = option_env!("CARGO_PKG_VERSION"))] -#[command(about = format!("Learn more at: https://docs.rs/icu_datagen/{}", option_env!("CARGO_PKG_VERSION").unwrap_or("")), long_about = None)] -#[command(group( - ArgGroup::new("key_mode") - .required(true) - .args(["keys", "key_file", "keys_for_bin", "all_keys"]), - ))] -struct Cli { - #[arg(short, long)] - #[arg(help = "Requests verbose output")] - verbose: bool, - - #[arg(long, value_enum, default_value_t = cli::Format::DeprecatedDefault, hide_default_value = true)] - #[arg( - help = "Select the output format: a directory tree of files, a single blob, or a Rust module." - )] - format: cli::Format, - - #[arg(short = 'W', long)] - #[arg(help = "Delete the output before writing data.")] - overwrite: bool, - - #[arg(short, long, value_enum, default_value_t = cli::Syntax::Json)] - #[arg(help = "--format=dir only: serde serialization format.")] - syntax: cli::Syntax, - - #[arg(short, long)] - #[arg(help = "--format=mod, --format=dir only: pretty-print the Rust or JSON output files.")] - pretty: bool, - - #[arg(long)] - #[arg(help = "--format=dir only: whether to add a fingerprints file to the output.")] - fingerprint: bool, - - #[arg(short = 't', long, value_name = "TAG", default_value = "latest")] - #[arg( - help = "Download CLDR JSON data from this GitHub tag (https://github.com/unicode-org/cldr-json/tags)\n\ - Use 'latest' for the latest version verified to work with this version of the binary.\n\ - Ignored if '--cldr-root' is present. Requires binary to be built with `networking` feature (enabled by default).\n\ - Note that some keys do not support versions before 41.0.0." - )] - #[cfg_attr(not(feature = "networking"), arg(hide = true))] - cldr_tag: String, - - #[arg(long, value_name = "PATH")] - #[arg( - help = "Path to a local cldr-{version}-json-full.zip directory (see https://github.com/unicode-org/cldr-json/releases).\n\ - Note that some keys do not support versions before 41.0.0." 
- )] - cldr_root: Option, - - #[arg(long, value_name = "TAG", default_value = "latest")] - #[arg( - help = "Download Unicode Properties data from this GitHub tag (https://github.com/unicode-org/icu/tags)\n\ - Use 'latest' for the latest version verified to work with this version of the binary.\n\ - Ignored if '--icuexport-root' is present. Requires binary to be built with `networking` feature (enabled by default).\n\ - Note that some keys do not support versions before release-71-1." - )] - #[cfg_attr(not(feature = "networking"), arg(hide = true))] - icuexport_tag: String, - - #[arg(long, value_name = "PATH")] - #[arg( - help = "Path to a local icuexportdata_uprops_full directory (see https://github.com/unicode-org/icu/releases).\n\ - Note that some keys do not support versions before release-71-1." - )] - icuexport_root: Option, - - #[arg(long, value_enum, default_value_t = cli::TrieType::Small)] - #[arg( - help = "Whether to optimize CodePointTrie data structures for size (\"small\") or speed (\"fast\").\n\ - Using \"fast\" mode increases performance of CJK text processing and segmentation. For more\n\ - information, see the TrieType enum." - )] - trie_type: cli::TrieType, - - #[arg(long, value_enum, default_value_t = cli::CollationHanDatabase::Implicit)] - #[arg(help = "Which collation han database to use.")] - collation_han_database: cli::CollationHanDatabase, - - #[arg(long, value_enum, num_args = 1..)] - #[arg( - help = "Which less-common collation tables to include. 'search-all' includes all search tables." - )] - include_collations: Vec, - - #[arg(long, hide = true)] - #[arg(help = "Deprecated, use --locales full or --locales modern")] - cldr_locale_subset: bool, - - #[arg(long, short, num_args = 1..)] - #[arg( - help = "Include these resource keys in the output. Accepts multiple arguments.\n\ - Set to 'all' for all keys, 'experimental-all' to include experimental keys,\n\ - or 'none' for no keys." - )] - keys: Vec, - - #[arg(long, value_name = "KEY_FILE")] - #[arg( - help = "Path to text file with resource keys to include, one per line. Empty lines \ - and lines starting with '#' are ignored." - )] - key_file: Option, - - #[arg(long, value_name = "BINARY")] - #[arg(help = "Analyzes the binary and only includes keys that are used by the binary.")] - keys_for_bin: Option, - - #[arg(long, hide = true)] - #[arg(help = "Deprecated: alias for --keys all")] - all_keys: bool, - - #[arg(long, short, required_unless_present = "all_locales", num_args = 0..)] - #[arg( - help = "Include this locale in the output. Accepts multiple arguments. \ - Set to 'full' or 'modern' for the respective CLDR locale sets, or 'none' for no locales." - )] - locales: Vec, - - #[arg(long, hide = true)] - #[arg(help = "Deprecated: alias for --locales full")] - all_locales: bool, - - #[arg(long = "out", short, value_name = "PATH")] - #[arg( - help = "Path to output directory or file. Must be empty or non-existent, unless \ - --overwrite is present, in which case the directory is deleted first. \ - For --format=blob, omit this option to dump to stdout. \ - For --format={dir,mod} defaults to 'icu4x_data'." - )] - output: Option, - - #[arg(long)] - #[arg( - help = "--format=mod only: insert feature gates for individual `icu_*` crates. Requires --use-separate-crates" - )] - insert_feature_gates: bool, - - #[arg(long)] - #[arg( - help = "--format=mod only: use types from individual `icu_*` crates instead of the `icu` meta-crate." 
- )] - use_separate_crates: bool, -} - -fn main() -> eyre::Result<()> { - let matches = Cli::parse(); - - if matches.verbose { - SimpleLogger::new() - .with_level(log::LevelFilter::Trace) - .init() - .unwrap() - } else { - SimpleLogger::new() - .env() - .with_level(log::LevelFilter::Info) - .init() - .unwrap() - } - - let selected_keys = if matches.all_keys { - icu_datagen::all_keys() - } else if !matches.keys.is_empty() { - match matches.keys.as_slice() { - [x] if x == "none" => vec![], - [x] if x == "all" => icu_datagen::all_keys(), - [x] if x == "experimental-all" => icu_datagen::all_keys_with_experimental(), - keys => icu_datagen::keys(keys), - } - } else if let Some(ref key_file_path) = matches.key_file { - icu_datagen::keys_from_file(key_file_path) - .with_context(|| key_file_path.to_string_lossy().into_owned())? - } else if let Some(ref bin_path) = matches.keys_for_bin { - icu_datagen::keys_from_bin(bin_path) - .with_context(|| bin_path.to_string_lossy().into_owned())? - } else { - unreachable!("required group") - }; - - if selected_keys.is_empty() { - log::warn!("No keys selected"); - } - - let mut source_data = SourceData::default(); - if let Some(path) = matches.cldr_root { - source_data = source_data.with_cldr(path, Default::default())?; - } else { - #[cfg(feature = "networking")] - { - let tag = match &*matches.cldr_tag { - "latest" => SourceData::LATEST_TESTED_CLDR_TAG, - other => other, - }; - source_data = source_data.with_cldr_for_tag(tag, Default::default())? - } - #[cfg(not(feature = "networking"))] - { - eyre::bail!("--cldr-root flag is mandatory unless datagen is built with the `\"networking\"` feature"); - } - } - - if let Some(path) = matches.icuexport_root { - source_data = source_data.with_icuexport(path)?; - } else { - #[cfg(feature = "networking")] - { - let tag = match &*matches.icuexport_tag { - "latest" => SourceData::LATEST_TESTED_ICUEXPORT_TAG, - other => other, - }; - source_data = source_data.with_icuexport_for_tag(tag)?; - } - #[cfg(not(feature = "networking"))] - { - eyre::bail!("--icuexport-root flag is mandatory unless datagen is built with the `\"networking\"` feature"); - } - } - - if matches.trie_type == cli::TrieType::Fast { - source_data = source_data.with_fast_tries(); - } - - source_data = source_data.with_collation_han_database(match matches.collation_han_database { - cli::CollationHanDatabase::Unihan => CollationHanDatabase::Unihan, - cli::CollationHanDatabase::Implicit => CollationHanDatabase::Implicit, - }); - - if !matches.include_collations.is_empty() { - source_data = source_data.with_collations( - matches - .include_collations - .iter() - .map(|c| c.to_datagen_value().to_owned()) - .collect(), - ); - } - - let raw_locales = &matches.locales; - - let selected_locales = if raw_locales == &["none"] || selected_keys.is_empty() { - Some(vec![]) - } else if raw_locales == &["full"] || matches.all_locales { - None - } else if let Some(locale_subsets) = raw_locales - .iter() - .map(|s| match &**s { - "basic" => Some(CoverageLevel::Basic), - "moderate" => Some(CoverageLevel::Moderate), - "modern" => Some(CoverageLevel::Modern), - _ => None, - }) - .collect::>>() - { - Some(source_data.locales(&locale_subsets)?) - } else { - Some( - raw_locales - .iter() - .map(|s| { - s.parse::() - .with_context(|| s.to_string()) - }) - .collect::, eyre::Error>>()?, - ) - }; - - let out = match matches.format { - v @ (cli::Format::Dir | cli::Format::DeprecatedDefault) => { - if v == cli::Format::DeprecatedDefault { - log::warn!("Defaulting to --format=dir. 
This will become a required parameter in the future."); - } - icu_datagen::Out::Fs { - output_path: matches - .output - .unwrap_or_else(|| PathBuf::from("icu4x_data")), - serializer: match matches.syntax { - cli::Syntax::Bincode => Box::::default(), - cli::Syntax::Postcard => Box::::default(), - cli::Syntax::Json if matches.pretty => Box::new(syntax::Json::pretty()), - cli::Syntax::Json => Box::::default(), - }, - overwrite: matches.overwrite, - fingerprint: matches.fingerprint, - } - } - cli::Format::Blob => icu_datagen::Out::Blob(if let Some(path) = matches.output { - if !matches.overwrite && path.exists() { - eyre::bail!("Output path is present: {:?}", path); - } - Box::new( - std::fs::File::create(&path).with_context(|| path.to_string_lossy().to_string())?, - ) - } else { - Box::new(std::io::stdout()) - }), - cli::Format::Mod => { - let mod_directory = matches - .output - .unwrap_or_else(|| PathBuf::from("icu4x_data")); - - let mut options = BakedOptions::default(); - options.pretty = matches.pretty; - options.insert_feature_gates = matches.insert_feature_gates; - options.use_separate_crates = matches.use_separate_crates; - options.overwrite = matches.overwrite; - - icu_datagen::Out::Baked { - mod_directory, - options, - } - } - }; - - icu_datagen::datagen( - selected_locales.as_deref(), - &selected_keys, - &source_data, - vec![out], - ) - .map_err(eyre::ErrReport::from) -} diff --git a/provider/datagen/src/bin/datagen/args.rs b/provider/datagen/src/bin/datagen/args.rs new file mode 100644 index 00000000000..36543f72115 --- /dev/null +++ b/provider/datagen/src/bin/datagen/args.rs @@ -0,0 +1,372 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use super::config; +use clap::ValueEnum; +use clap::{ArgGroup, Parser}; +use eyre::WrapErr; +use icu_datagen::prelude::*; +use std::path::PathBuf; + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] +enum Format { + Dir, + Blob, + Mod, + DeprecatedDefault, +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] +enum Syntax { + Json, + Bincode, + Postcard, +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] +enum TrieType { + Small, + Fast, +} +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] +enum CollationHanDatabase { + Unihan, + Implicit, +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] +enum CollationTable { + Gb2312, + Big5han, + Search, + Searchji, + #[value(alias = "search*")] // for backwards compatibility + SearchAll, +} + +impl CollationTable { + fn to_datagen_value(self) -> &'static str { + match self { + Self::Gb2312 => "gb2312", + Self::Big5han => "big5han", + Self::Search => "search", + Self::Searchji => "searchji", + Self::SearchAll => "search*", + } + } +} + +#[derive(Parser)] +#[command(name = "icu4x-datagen")] +#[command(author = "The ICU4X Project Developers", version = option_env!("CARGO_PKG_VERSION"))] +#[command(about = format!("Learn more at: https://docs.rs/icu_datagen/{}", option_env!("CARGO_PKG_VERSION").unwrap_or("")), long_about = None)] +#[command(group( + ArgGroup::new("key_mode") + // .required(true) + .args(["keys", "key_file", "keys_for_bin", "all_keys"]), + ))] +pub struct Cli { + #[arg(short, long)] + #[arg(help = "Requests verbose output")] + pub verbose: bool, + + #[arg(long, value_enum, default_value_t = Format::DeprecatedDefault, hide_default_value = true)] + #[arg( + help = "Select the output format: a directory tree of files, a single blob, or a Rust module." + )] + format: Format, + + #[arg(short = 'W', long)] + #[arg(help = "Delete the output before writing data.")] + overwrite: bool, + + #[arg(short, long, value_enum, default_value_t = Syntax::Json)] + #[arg(help = "--format=dir only: serde serialization format.")] + syntax: Syntax, + + #[arg(short, long)] + #[arg(help = "--format=mod, --format=dir only: pretty-print the Rust or JSON output files.")] + pretty: bool, + + #[arg(long)] + #[arg(help = "--format=dir only: whether to add a fingerprints file to the output.")] + fingerprint: bool, + + #[arg(short = 't', long, value_name = "TAG", default_value = "latest")] + #[arg( + help = "Download CLDR JSON data from this GitHub tag (https://github.com/unicode-org/cldr-json/tags)\n\ + Use 'latest' for the latest version verified to work with this version of the binary.\n\ + Ignored if '--cldr-root' is present. Requires binary to be built with `networking` feature (enabled by default).\n\ + Note that some keys do not support versions before 41.0.0." + )] + #[cfg_attr(not(feature = "networking"), arg(hide = true))] + cldr_tag: String, + + #[arg(long, value_name = "PATH")] + #[arg( + help = "Path to a local cldr-{version}-json-full.zip directory (see https://github.com/unicode-org/cldr-json/releases).\n\ + Note that some keys do not support versions before 41.0.0." + )] + cldr_root: Option<PathBuf>, + + #[arg(long, value_name = "TAG", default_value = "latest")] + #[arg( + help = "Download Unicode Properties data from this GitHub tag (https://github.com/unicode-org/icu/tags)\n\ + Use 'latest' for the latest version verified to work with this version of the binary.\n\ + Ignored if '--icuexport-root' is present. 
+ Requires binary to be built with `networking` feature (enabled by default).\n\ + Note that some keys do not support versions before release-71-1." + )] + #[cfg_attr(not(feature = "networking"), arg(hide = true))] + icuexport_tag: String, + + #[arg(long, value_name = "PATH")] + #[arg( + help = "Path to a local icuexportdata_uprops_full directory (see https://github.com/unicode-org/icu/releases).\n\ + Note that some keys do not support versions before release-71-1." + )] + icuexport_root: Option<PathBuf>, + + #[arg(long, value_enum, default_value_t = TrieType::Small)] + #[arg( + help = "Whether to optimize CodePointTrie data structures for size (\"small\") or speed (\"fast\").\n\ + Using \"fast\" mode increases performance of CJK text processing and segmentation. For more\n\ + information, see the TrieType enum." + )] + trie_type: TrieType, + + #[arg(long, value_enum, default_value_t = CollationHanDatabase::Implicit)] + #[arg(help = "Which collation han database to use.")] + collation_han_database: CollationHanDatabase, + + #[arg(long, value_enum, num_args = 1..)] + #[arg( + help = "Which less-common collation tables to include. 'search-all' includes all search tables." + )] + include_collations: Vec<CollationTable>, + + #[arg(long, hide = true)] + #[arg(help = "Deprecated, use --locales full or --locales modern")] + cldr_locale_subset: bool, + + #[arg(long, short, num_args = 1..)] + #[arg( + help = "Include these resource keys in the output. Accepts multiple arguments.\n\ + Set to 'all' for all keys, 'experimental-all' to include experimental keys,\n\ + or 'none' for no keys." + )] + keys: Vec<String>, + + #[arg(long, value_name = "KEY_FILE")] + #[arg( + help = "Path to text file with resource keys to include, one per line. Empty lines \ + and lines starting with '#' are ignored." + )] + key_file: Option<PathBuf>, + + #[arg(long, value_name = "BINARY")] + #[arg(help = "Analyzes the binary and only includes keys that are used by the binary.")] + keys_for_bin: Option<PathBuf>, + + #[arg(long, hide = true)] + #[arg(help = "Deprecated: alias for --keys all")] + all_keys: bool, + + #[arg(long, short, num_args = 0..)] + #[arg( + help = "Include this locale in the output. Accepts multiple arguments. \ + Set to 'full' or 'modern' for the respective CLDR locale sets, or 'none' for no locales." + )] + locales: Vec<String>, + + #[arg(long, hide = true)] + #[arg(help = "Deprecated: alias for --locales full")] + all_locales: bool, + + #[arg(long = "out", short, value_name = "PATH")] + #[arg( + help = "Path to output directory or file. Must be empty or non-existent, unless \ + --overwrite is present, in which case the directory is deleted first. \ + For --format=blob, omit this option to dump to stdout. \ + For --format={dir,mod} defaults to 'icu4x_data'." + )] + output: Option<PathBuf>, + + #[arg(long)] + #[arg( + help = "--format=mod only: insert feature gates for individual `icu_*` crates. Requires --use-separate-crates" + )] + insert_feature_gates: bool, + + #[arg(long)] + #[arg( + help = "--format=mod only: use types from individual `icu_*` crates instead of the `icu` meta-crate." 
+ )] + use_separate_crates: bool, + + #[arg(long)] + #[arg(help = "Load a TOML config")] + pub config: Option, +} + +impl Cli { + pub fn as_config(&self) -> eyre::Result { + Ok(config::Config { + keys: self.make_keys()?, + locales: self.make_locales()?, + cldr: self.make_cldr()?, + icu_export: self.make_icu_export()?, + trie_type: match self.trie_type { + TrieType::Fast => config::TrieType::Fast, + TrieType::Small => config::TrieType::Small, + }, + collation_han_database: match self.collation_han_database { + CollationHanDatabase::Unihan => config::CollationHanDatabase::Unihan, + CollationHanDatabase::Implicit => config::CollationHanDatabase::Implicit, + }, + collations: self + .include_collations + .iter() + .map(|c| c.to_datagen_value().to_owned()) + .collect(), + export: self.make_exporter()?, + overwrite: self.overwrite, + }) + } + + fn make_keys(&self) -> eyre::Result { + Ok(if self.all_keys { + config::KeyInclude::All + } else if !self.keys.is_empty() { + match self.keys.as_slice() { + [x] if x == "none" => config::KeyInclude::None, + [x] if x == "all" => config::KeyInclude::All, + [x] if x == "experimental-all" => config::KeyInclude::AllWithExperimental, + keys => config::KeyInclude::Explicit( + keys.iter() + .map(|k| icu_datagen::key(k).ok_or(eyre::eyre!(k.to_string()))) + .collect::>()?, + ), + } + } else if let Some(key_file_path) = &self.key_file { + log::warn!("The --key-file argument is deprecated. Use --options with a JSON file."); + #[allow(deprecated)] + config::KeyInclude::Explicit( + icu_datagen::keys_from_file(key_file_path) + .with_context(|| key_file_path.to_string_lossy().into_owned())? + .into_iter() + .collect(), + ) + } else if let Some(bin_path) = &self.keys_for_bin { + config::KeyInclude::ForBinary(bin_path.clone()) + } else { + unreachable!("Argument group"); + }) + } + + fn make_locales(&self) -> eyre::Result { + Ok(if self.locales.as_slice() == ["none"] { + config::LocaleInclude::None + } else if self.locales.as_slice() == ["full"] || self.all_locales { + config::LocaleInclude::All + } else if let Some(locale_subsets) = self + .locales + .iter() + .map(|s| match &**s { + "basic" => Some(config::CoverageLevel::Basic), + "moderate" => Some(config::CoverageLevel::Moderate), + "modern" => Some(config::CoverageLevel::Modern), + _ => None, + }) + .collect::>>() + { + config::LocaleInclude::CldrSet(locale_subsets.into_iter().collect()) + } else { + config::LocaleInclude::Explicit( + self.locales + .iter() + .map(|s| { + s.parse::() + .with_context(|| s.to_string()) + }) + .collect::>()?, + ) + }) + } + + fn make_cldr(&self) -> eyre::Result { + Ok(match (&self.cldr_root, self.cldr_tag.as_str()) { + (Some(path), _) => config::PathOrTag::Path(path.clone()), + #[cfg(feature = "networking")] + (_, "latest") => config::PathOrTag::Latest, + #[cfg(feature = "networking")] + (_, tag) => config::PathOrTag::Tag(String::from(tag)), + #[cfg(not(feature = "networking"))] + _ => eyre::bail!("--cldr-root flag is mandatory unless datagen is built with the `\"networking\"` feature"), + }) + } + + fn make_icu_export(&self) -> eyre::Result { + Ok(match (&self.icuexport_root, self.icuexport_tag.as_str()) { + (Some(path), _) => config::PathOrTag::Path(path.clone()), + #[cfg(feature = "networking")] + (_, "latest") => config::PathOrTag::Latest, + #[cfg(feature = "networking")] + (_, tag) => config::PathOrTag::Tag(String::from(tag)), + #[cfg(not(feature = "networking"))] + _ => eyre::bail!("--icuexport-root flag is mandatory unless datagen is built with the `\"networking\"` feature"), + }) 
+ } + + fn make_exporter(&self) -> eyre::Result { + match self.format { + v @ (Format::Dir | Format::DeprecatedDefault) => { + if v == Format::DeprecatedDefault { + log::warn!("Defaulting to --format=dir. This will become a required parameter in the future."); + } + #[cfg(not(feature = "provider_fs"))] + eyre::bail!("FsDataProvider export requires the provider_fs Cargo feature."); + #[cfg(feature = "provider_fs")] + Ok(config::Export::Fs { + output_path: if let Some(root) = self.output.as_ref() { + root.clone() + } else { + PathBuf::from("icu4x_data") + }, + syntax: match self.syntax { + Syntax::Bincode => config::FsSyntax::Bincode, + Syntax::Postcard => config::FsSyntax::Postcard, + Syntax::Json if self.pretty => config::FsSyntax::JsonPretty, + Syntax::Json => config::FsSyntax::Json, + }, + fingerprint: self.fingerprint, + }) + } + Format::Blob => { + #[cfg(not(feature = "provider_blob"))] + eyre::bail!("BlobDataProvider export requires the provider_blob Cargo feature."); + #[cfg(feature = "provider_blob")] + Ok(config::Export::Blob(if let Some(path) = &self.output { + path.clone() + } else { + PathBuf::from("/stdout") + })) + } + Format::Mod => { + #[cfg(not(feature = "provider_baked"))] + eyre::bail!("Baked data export requires the provider_baked Cargo feature."); + #[cfg(feature = "provider_baked")] + Ok(config::Export::Baked { + output_path: if let Some(mod_directory) = self.output.as_ref() { + mod_directory.clone() + } else { + PathBuf::from("icu4x_data") + }, + pretty: self.pretty, + insert_feature_gates: self.insert_feature_gates, + use_separate_crates: self.use_separate_crates, + }) + } + } + } +} diff --git a/provider/datagen/src/bin/datagen/config.rs b/provider/datagen/src/bin/datagen/config.rs new file mode 100644 index 00000000000..1c8ea7036cf --- /dev/null +++ b/provider/datagen/src/bin/datagen/config.rs @@ -0,0 +1,111 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +pub use icu_datagen::options::*; + +use icu_provider::prelude::*; +use std::collections::HashSet; +use std::path::PathBuf; + +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct Config { + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + pub keys: KeyInclude, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + pub locales: LocaleInclude, + pub cldr: PathOrTag, + pub icu_export: PathOrTag, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + pub trie_type: TrieType, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + pub collation_han_database: CollationHanDatabase, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + pub collations: HashSet, + pub export: Export, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + pub overwrite: bool, +} + +fn is_default(value: &T) -> bool { + value == &T::default() +} + +#[non_exhaustive] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +pub enum KeyInclude { + None, + All, + AllWithExperimental, + Explicit(#[serde(with = "data_key_as_str")] HashSet), + ForBinary(PathBuf), +} + +impl Default for KeyInclude { + fn default() -> Self { + Self::All + } +} + +mod data_key_as_str { + use super::*; + use serde::{de::*, ser::*}; + use std::borrow::Cow; + + pub fn serialize(selff: &HashSet, ser: S) -> Result { + selff + .iter() + .map(|k| k.path().get()) + .collect::>() + .serialize(ser) + } + + pub fn deserialize<'de, D: Deserializer<'de>>(de: D) -> Result, D::Error> { + HashSet::>::deserialize(de)? + .into_iter() + .map(|s| icu_datagen::key(&s).ok_or(s)) + .collect::>() + .map_err(|s| D::Error::custom(format!("Unknown key {s}"))) + } +} + +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +pub enum PathOrTag { + Path(PathBuf), + #[cfg(feature = "networking")] + Tag(String), + #[cfg(feature = "networking")] + Latest, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +pub enum Export { + #[cfg(feature = "provider_fs")] + Fs { + output_path: PathBuf, + syntax: FsSyntax, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + fingerprint: bool, + }, + #[cfg(feature = "provider_blob")] + Blob(PathBuf), + #[cfg(feature = "provider_baked")] + Baked { + output_path: PathBuf, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + pretty: bool, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + insert_feature_gates: bool, + #[serde(default = "Default::default", skip_serializing_if = "is_default")] + use_separate_crates: bool, + }, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +#[cfg(feature = "provider_fs")] +pub enum FsSyntax { + Postcard, + Json, + Bincode, + JsonPretty, +} diff --git a/provider/datagen/src/bin/datagen/mod.rs b/provider/datagen/src/bin/datagen/mod.rs new file mode 100644 index 00000000000..a235d8c38b1 --- /dev/null +++ b/provider/datagen/src/bin/datagen/mod.rs @@ -0,0 +1,145 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +// If no exporter feature is enabled this all doesn't make sense +#![allow(unused_imports)] +#![allow(unused_variables)] +#![allow(unused_assignments)] + +use clap::Parser; +use eyre::WrapErr; +use icu_datagen::prelude::*; +use simple_logger::SimpleLogger; + +mod args; +pub mod config; + +fn main() -> eyre::Result<()> { + let matches = args::Cli::parse(); + + if matches.verbose { + SimpleLogger::new() + .with_level(log::LevelFilter::Trace) + .init() + .unwrap() + } else { + SimpleLogger::new() + .env() + .with_level(log::LevelFilter::Info) + .init() + .unwrap() + } + + let config = if let Some(ref path) = matches.config { + serde_json::from_str(&std::fs::read_to_string(path)?)? + } else { + matches.as_config()? + }; + + let mut options = options::Options::default(); + options.locales = config.locales; + options.trie_type = config.trie_type; + options.collation_han_database = config.collation_han_database; + options.collations = config.collations; + + let mut source_data = SourceData::offline(); + source_data = match config.cldr { + config::PathOrTag::Path(path) => source_data.with_cldr(path, Default::default())?, + #[cfg(feature = "networking")] + config::PathOrTag::Latest => { + source_data.with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG, Default::default())? + } + #[cfg(feature = "networking")] + config::PathOrTag::Tag(tag) => source_data.with_cldr_for_tag(&tag, Default::default())?, + }; + + source_data = match config.icu_export { + config::PathOrTag::Path(path) => source_data.with_icuexport(path)?, + #[cfg(feature = "networking")] + config::PathOrTag::Latest => { + source_data.with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)? + } + #[cfg(feature = "networking")] + config::PathOrTag::Tag(tag) => source_data.with_icuexport_for_tag(&tag)?, + }; + + let provider = DatagenProvider::try_new(options, source_data)?; + + let keys = match config.keys { + config::KeyInclude::None => Default::default(), + config::KeyInclude::All => icu_datagen::all_keys().into_iter().collect(), + config::KeyInclude::AllWithExperimental => icu_datagen::all_keys_with_experimental() + .into_iter() + .collect(), + config::KeyInclude::Explicit(set) => set, + config::KeyInclude::ForBinary(path) => { + icu_datagen::keys_from_bin(path)?.into_iter().collect() + } + }; + + match config.export { + #[cfg(feature = "provider_fs")] + config::Export::Fs { + output_path, + syntax, + fingerprint, + } => { + use icu_provider_fs::export::{serializers::*, *}; + let exporter = FilesystemExporter::try_new( + match syntax { + config::FsSyntax::Bincode => Box::::default(), + config::FsSyntax::Postcard => Box::::default(), + config::FsSyntax::JsonPretty => Box::new(json::Serializer::pretty()), + config::FsSyntax::Json => Box::::default(), + }, + { + let mut options = ExporterOptions::default(); + options.root = output_path; + if config.overwrite { + options.overwrite = OverwriteOption::RemoveAndReplace + } + options.fingerprint = fingerprint; + options + }, + )?; + Ok(provider.export(keys, exporter)?) + } + #[cfg(feature = "provider_blob")] + config::Export::Blob(ref path) => { + let exporter = icu_provider_blob::export::BlobExporter::new_with_sink( + if path == std::path::Path::new("/stdout") { + Box::new(std::io::stdout()) + } else if !config.overwrite && path.exists() { + eyre::bail!("Output path is present: {:?}", path); + } else { + Box::new( + std::fs::File::create(path) + .with_context(|| path.to_string_lossy().to_string())?, + ) + }, + ); + Ok(provider.export(keys, exporter)?) 
+ } + #[cfg(feature = "provider_baked")] + config::Export::Baked { + output_path, + pretty, + insert_feature_gates, + use_separate_crates, + } => { + use icu_datagen::baked_exporter::*; + + let exporter = BakedExporter::new(output_path, { + let mut options = Options::default(); + options.pretty = pretty; + options.insert_feature_gates = insert_feature_gates; + options.use_separate_crates = use_separate_crates; + options.overwrite = config.overwrite; + options + })?; + + Ok(provider.export(keys, exporter)?) + } + } +} diff --git a/provider/datagen/src/lib.rs b/provider/datagen/src/lib.rs index d4b736b7486..94447ae4251 100644 --- a/provider/datagen/src/lib.rs +++ b/provider/datagen/src/lib.rs @@ -17,16 +17,16 @@ //! //! ```no_run //! use icu_datagen::prelude::*; +//! use icu_provider_blob::export::*; //! use std::fs::File; //! //! fn main() { -//! icu_datagen::datagen( -//! Some(&[langid!("de"), langid!("en-AU")]), -//! &[icu::list::provider::AndListV1Marker::KEY], -//! &SourceData::default(), -//! vec![Out::Blob(Box::new(File::create("data.postcard").unwrap()))], -//! ) -//! .unwrap(); +//! DatagenProvider::default() +//! .export( +//! [icu::list::provider::AndListV1Marker::KEY].into_iter().collect(), +//! BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap())), +//! ) +//! .unwrap(); //! } //! ``` //! @@ -64,7 +64,6 @@ )] #![warn(missing_docs)] -mod databake; mod error; mod registry; mod source; @@ -73,86 +72,137 @@ mod testutil; mod transform; pub use error::{is_missing_cldr_error, is_missing_icuexport_error}; -pub use registry::*; -pub use source::{CollationHanDatabase, CoverageLevel, SourceData}; +pub use registry::{all_keys, all_keys_with_experimental, deserialize_and_discard}; +pub use source::SourceData; -#[allow(clippy::exhaustive_enums)] // exists for backwards compatibility -#[doc(hidden)] -#[derive(Debug)] -pub enum CldrLocaleSubset { - Ignored, -} - -impl Default for CldrLocaleSubset { - fn default() -> Self { - Self::Ignored - } -} - -impl CldrLocaleSubset { - #[allow(non_upper_case_globals)] - pub const Full: Self = Self::Ignored; - #[allow(non_upper_case_globals)] - pub const Modern: Self = Self::Ignored; -} - -/// [Out::Fs] serialization formats. 
-pub mod syntax { - #[doc(no_inline)] - pub use icu_provider_fs::export::serializers::bincode::Serializer as Bincode; - #[doc(no_inline)] - pub use icu_provider_fs::export::serializers::json::Serializer as Json; - #[doc(no_inline)] - pub use icu_provider_fs::export::serializers::postcard::Serializer as Postcard; -} +#[cfg(feature = "provider_baked")] +pub mod baked_exporter; +pub mod options; /// A prelude for using the datagen API pub mod prelude { - #[doc(hidden)] - pub use crate::CldrLocaleSubset; #[doc(no_inline)] - pub use crate::{syntax, BakedOptions, CollationHanDatabase, CoverageLevel, Out, SourceData}; + pub use crate::{options, DatagenProvider, SourceData}; #[doc(no_inline)] pub use icu_locid::{langid, LanguageIdentifier}; #[doc(no_inline)] - pub use icu_provider::KeyedDataMarker; + pub use icu_provider::{datagen::DataExporter, DataKey, KeyedDataMarker}; + + // SEMVER GRAVEYARD + #[cfg(feature = "legacy_api")] + #[doc(hidden)] + pub use crate::options::{CollationHanDatabase, CoverageLevel}; + #[cfg(feature = "legacy_api")] + #[allow(deprecated)] + #[doc(hidden)] + pub use crate::{syntax, BakedOptions, CldrLocaleSubset, Out}; } use icu_provider::datagen::*; use icu_provider::prelude::*; -use icu_provider_adapters::empty::EmptyDataProvider; -use icu_provider_adapters::filter::Filterable; -use icu_provider_fs::export::serializers::AbstractSerializer; -use prelude::*; -use rayon::prelude::*; use std::collections::HashSet; -use std::io::{BufRead, BufReader}; -use std::path::{Path, PathBuf}; +use std::path::Path; /// [`DataProvider`] backed by [`SourceData`] -#[allow(clippy::exhaustive_structs)] // any information will be added to SourceData #[derive(Debug, Clone)] +#[cfg_attr(feature = "networking", derive(Default))] +#[cfg_attr(not(doc), allow(clippy::exhaustive_structs))] +#[cfg_attr(doc, non_exhaustive)] pub struct DatagenProvider { - /// The underlying raw data + #[doc(hidden)] pub source: SourceData, } -#[cfg(test)] impl DatagenProvider { - /// Create a `DatagenProvider` that uses test data. + /// Creates a new data provider with the given `source` and `options`. + /// + /// Fails if `options` is using CLDR locale sets and `source` does not contain CLDR data. + pub fn try_new(options: options::Options, mut source: SourceData) -> Result { + if source.options != Default::default() { + log::warn!("Trie type, collation database, or collations set on SourceData. These will be ignored in favor of options."); + } + + source.options = options; + + source.options.locales = match core::mem::take(&mut source.options.locales) { + options::LocaleInclude::None => options::LocaleInclude::Explicit(Default::default()), + options::LocaleInclude::CldrSet(levels) => options::LocaleInclude::Explicit( + source + .locales(levels.iter().copied().collect::>().as_slice())? + .into_iter() + .collect(), + ), + s => s, + }; + + Ok(Self { source }) + } + + #[cfg(test)] pub fn for_test() -> Self { + // Singleton so that all instantiations share the same cache. lazy_static::lazy_static! { static ref TEST_PROVIDER: DatagenProvider = DatagenProvider { - source: SourceData::repo(), + // This is equivalent to `latest_tested` for the files defined in + // `tools/testdata-scripts/globs.rs.data`. 
+ source: SourceData::offline() + .with_cldr(repodata::paths::cldr(), Default::default()).unwrap() + .with_icuexport(repodata::paths::icuexport()).unwrap(), }; } TEST_PROVIDER.clone() } -} -impl AnyProvider for DatagenProvider { - fn load_any(&self, key: DataKey, req: DataRequest) -> Result { - self.as_any_provider().load_any(key, req) + /// Exports data for the set of keys to the given exporter. + /// + /// See + /// [`BlobExporter`](icu_provider_blob::export), + /// [`FileSystemExporter`](icu_provider_fs::export), + /// and [`BakedExporter`](crate::baked_exporter). + pub fn export( + &self, + keys: HashSet, + mut exporter: impl DataExporter, + ) -> Result<(), DataError> { + if keys.is_empty() { + log::warn!("No keys selected"); + } + + // Avoid multiple monomorphizations + fn internal( + provider: &DatagenProvider, + keys: HashSet, + exporter: &mut dyn DataExporter, + ) -> Result<(), DataError> { + use rayon::prelude::*; + + keys.into_par_iter().try_for_each(|key| { + provider + .supported_locales_for_key(key) + .map_err(|e| e.with_key(key))? + .into_par_iter() + .try_for_each(|locale| { + let req = DataRequest { + locale: &locale, + metadata: Default::default(), + }; + let payload = provider + .load_data(key, req) + .and_then(DataResponse::take_payload) + .map_err(|e| e.with_req(key, req))?; + log::trace!("Writing payload: {key}/{locale}"); + exporter + .put_payload(key, &locale, &payload) + .map_err(|e| e.with_req(key, req)) + })?; + + log::info!("Writing key: {key}"); + exporter.flush(key).map_err(|e| e.with_key(key)) + })?; + + exporter.close() + } + internal(self, keys, &mut exporter) } } @@ -227,6 +277,7 @@ pub fn keys>(strings: &[S]) -> Vec { /// # } /// ``` pub fn keys_from_file>(path: P) -> std::io::Result> { + use std::io::{BufRead, BufReader}; BufReader::new(std::fs::File::open(path.as_ref())?) .lines() .filter_map(|k| k.map(crate::key).transpose()) @@ -283,43 +334,22 @@ pub fn keys_from_bin>(path: P) -> std::io::Result> { Ok(result) } -/// Options for configuring the output of databake. -#[non_exhaustive] -#[derive(Debug)] -pub struct BakedOptions { - /// Whether to run `rustfmt` on the generated files. - pub pretty: bool, - /// Whether to gate each key on its crate name. This allows using the module - /// even if some keys are not required and their dependencies are not included. - /// Requires use_separate_crates. - pub insert_feature_gates: bool, - /// Whether to use separate crates to name types instead of the `icu` metacrate - pub use_separate_crates: bool, - /// Whether to overwrite existing data. By default, errors if it is present. - pub overwrite: bool, -} - -#[allow(clippy::derivable_impls)] // want to be explicit about bool defaults -impl Default for BakedOptions { - fn default() -> Self { - Self { - pretty: false, - insert_feature_gates: false, - use_separate_crates: false, - overwrite: false, - } - } -} - +/// Requires `legacy_api` Cargo feature +/// /// The output format. +#[deprecated( + since = "1.3.0", + note = "use `DatagenProvider::export` with self-constructed `DataExporter`s" +)] #[non_exhaustive] +#[cfg(feature = "legacy_api")] pub enum Out { /// Output to a file system tree Fs { /// The root path. - output_path: PathBuf, + output_path: std::path::PathBuf, /// The serialization format. See [syntax]. - serializer: Box, + serializer: Box, /// Whether to overwrite existing data. overwrite: bool, /// Whether to create a fingerprint file with SHA2 hashes @@ -330,7 +360,7 @@ pub enum Out { /// Output a module with baked data at the given location. 
Baked { /// The directory of the generated module. - mod_directory: PathBuf, + mod_directory: std::path::PathBuf, /// Additional options to configure the generated module. options: BakedOptions, }, @@ -338,13 +368,15 @@ pub enum Out { #[doc(hidden)] #[deprecated(since = "1.1.2", note = "please use `Out::Baked` instead")] Module { - mod_directory: PathBuf, + mod_directory: std::path::PathBuf, pretty: bool, insert_feature_gates: bool, use_separate_crates: bool, }, } +#[allow(deprecated)] +#[cfg(feature = "legacy_api")] impl core::fmt::Debug for Out { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -386,7 +418,10 @@ impl core::fmt::Debug for Out { } } -/// Runs ICU4X datagen. +#[deprecated(since = "1.3.0", note = "use `DatagenProvider::export`")] +#[cfg(feature = "legacy_api")] +#[allow(deprecated)] +/// Requires `legacy_api` Cargo feature /// /// The argument are used as follows: /// * `locales`: If this is present, only locales that are either `und` or @@ -398,108 +433,107 @@ impl core::fmt::Debug for Out { /// or [`is_missing_icuexport_error`] will be returned. /// * `out`: The output format and location. See the documentation on [`Out`] pub fn datagen( - locales: Option<&[LanguageIdentifier]>, + locales: Option<&[icu_locid::LanguageIdentifier]>, keys: &[DataKey], source: &SourceData, outs: Vec, ) -> Result<(), DataError> { - let exporters = outs - .into_iter() - .map(|out| -> Result, DataError> { - Ok(match out { - Out::Fs { - output_path, - serializer, - overwrite, - fingerprint, - } => { - let mut options = icu_provider_fs::export::ExporterOptions::default(); - options.root = output_path; - if overwrite { - options.overwrite = - icu_provider_fs::export::OverwriteOption::RemoveAndReplace - } - options.fingerprint = fingerprint; - Box::new(icu_provider_fs::export::FilesystemExporter::try_new( - serializer, options, - )?) - } - Out::Blob(write) => Box::new( - icu_provider_blob::export::BlobExporter::new_with_sink(write), - ), - Out::Baked { - mod_directory, - options, - } => Box::new(databake::BakedDataExporter::new(mod_directory, options)?), - #[allow(deprecated)] - Out::Module { - mod_directory, - pretty, - insert_feature_gates, - use_separate_crates, - } => Box::new(databake::BakedDataExporter::new( - mod_directory, - BakedOptions { - pretty, - insert_feature_gates, - use_separate_crates, - // Note: overwrite behavior was `true` in 1.0 but `false` in 1.1; - // 1.1.2 made it an option in Out::Baked. 
- overwrite: false, - }, - )?), - }) - }) - .collect::, DataError>>()?; - - let provider: Box = match locales { - Some(&[]) => Box::::default(), - Some(locales) => Box::new( - DatagenProvider { - source: source.clone(), - } - .filterable("icu4x-datagen locales") - .filter_by_langid(move |lid| lid.language.is_empty() || locales.contains(lid)), - ), - None => Box::new(DatagenProvider { - source: source.clone(), - }), - }; - - let keys: HashSet<_> = keys.iter().collect(); - - keys.into_par_iter().try_for_each(|&key| { - let locales = provider - .supported_locales_for_key(key) - .map_err(|e| e.with_key(key))?; - let res = locales.into_par_iter().try_for_each(|locale| { - let req = DataRequest { - locale: &locale, - metadata: Default::default(), - }; - let payload = provider - .load_data(key, req) - .and_then(DataResponse::take_payload) - .map_err(|e| e.with_req(key, req))?; - exporters.par_iter().try_for_each(|e| { - e.put_payload(key, &locale, &payload) - .map_err(|e| e.with_req(key, req)) - }) - }); - - log::info!("Writing key: {}", key); - for e in &exporters { - e.flush(key).map_err(|e| e.with_key(key))?; + use options::*; + let provider = DatagenProvider::try_new( + Options { + locales: locales + .map(|ls| { + LocaleInclude::Explicit( + ls.iter() + .cloned() + .chain(core::iter::once(icu_locid::LanguageIdentifier::UND)) + .collect(), + ) + }) + .unwrap_or(options::LocaleInclude::All), + ..source.options.clone() + }, + { + let mut source = source.clone(); + source.options = Default::default(); + source + }, + )?; + + struct MultiExporter(Vec>); + + impl DataExporter for MultiExporter { + fn put_payload( + &self, + key: DataKey, + locale: &DataLocale, + payload: &DataPayload, + ) -> Result<(), DataError> { + self.0 + .iter() + .try_for_each(|e| e.put_payload(key, locale, payload)) } - res - })?; + fn flush(&self, key: DataKey) -> Result<(), DataError> { + self.0.iter().try_for_each(|e| e.flush(key)) + } - for mut e in exporters { - e.close()?; + fn close(&mut self) -> Result<(), DataError> { + self.0.iter_mut().try_for_each(|e| e.close()) + } } - Ok(()) + provider.export( + keys.iter().cloned().collect(), + MultiExporter( + outs.into_iter() + .map(|out| -> Result, DataError> { + use baked_exporter::*; + use icu_provider_blob::export::*; + use icu_provider_fs::export::*; + + Ok(match out { + Out::Fs { + output_path, + serializer, + overwrite, + fingerprint, + } => { + let mut options = ExporterOptions::default(); + options.root = output_path; + if overwrite { + options.overwrite = OverwriteOption::RemoveAndReplace + } + options.fingerprint = fingerprint; + Box::new(FilesystemExporter::try_new(serializer, options)?) + } + Out::Blob(write) => Box::new(BlobExporter::new_with_sink(write)), + Out::Baked { + mod_directory, + options, + } => Box::new(BakedExporter::new(mod_directory, options)?), + #[allow(deprecated)] + Out::Module { + mod_directory, + pretty, + insert_feature_gates, + use_separate_crates, + } => Box::new(BakedExporter::new( + mod_directory, + Options { + pretty, + insert_feature_gates, + use_separate_crates, + // Note: overwrite behavior was `true` in 1.0 but `false` in 1.1; + // 1.1.2 made it an option in Options. 
+ overwrite: false, + }, + )?), + }) + }) + .collect::>()?, + ), + ) } #[test] @@ -522,10 +556,7 @@ fn test_keys() { #[test] fn test_keys_from_file() { assert_eq!( - keys_from_file( - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/data/work_log+keys.txt") - ) - .unwrap(), + keys_from_file(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/work_log+keys.txt")).unwrap(), vec![ icu_datetime::provider::calendar::GregorianDateLengthsV1Marker::KEY, icu_datetime::provider::calendar::GregorianDateSymbolsV1Marker::KEY, @@ -543,8 +574,7 @@ fn test_keys_from_bin() { // and running `cargo +nightly-2022-04-18 wasm-build-release --examples -p icu_datetime --features serde \ // && cp target/wasm32-unknown-unknown/release-opt-size/examples/work_log.wasm provider/datagen/tests/data/` assert_eq!( - keys_from_bin(PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/data/work_log.wasm")) - .unwrap(), + keys_from_bin(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/work_log.wasm")).unwrap(), vec![ icu_datetime::provider::calendar::GregorianDateLengthsV1Marker::KEY, icu_datetime::provider::calendar::GregorianDateSymbolsV1Marker::KEY, @@ -556,3 +586,47 @@ fn test_keys_from_bin() { ] ); } + +// SEMVER GRAVEYARD + +#[cfg(feature = "legacy_api")] +#[doc(hidden)] +pub use source::{CollationHanDatabase, CoverageLevel}; + +#[cfg(feature = "legacy_api")] +#[doc(hidden)] +pub use baked_exporter::Options as BakedOptions; + +#[allow(clippy::exhaustive_enums)] // exists for backwards compatibility +#[doc(hidden)] +#[derive(Debug)] +pub enum CldrLocaleSubset { + Ignored, +} + +impl Default for CldrLocaleSubset { + fn default() -> Self { + Self::Ignored + } +} + +impl CldrLocaleSubset { + #[allow(non_upper_case_globals)] + pub const Full: Self = Self::Ignored; + #[allow(non_upper_case_globals)] + pub const Modern: Self = Self::Ignored; +} + +#[cfg(feature = "legacy_api")] +#[doc(hidden)] +pub mod syntax { + pub use icu_provider_fs::export::serializers::bincode::Serializer as Bincode; + pub use icu_provider_fs::export::serializers::json::Serializer as Json; + pub use icu_provider_fs::export::serializers::postcard::Serializer as Postcard; +} + +impl AnyProvider for DatagenProvider { + fn load_any(&self, key: DataKey, req: DataRequest) -> Result { + self.as_any_provider().load_any(key, req) + } +} diff --git a/provider/datagen/src/options.rs b/provider/datagen/src/options.rs new file mode 100644 index 00000000000..e98d52e931a --- /dev/null +++ b/provider/datagen/src/options.rs @@ -0,0 +1,120 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Options bag for [`DatagenProvider`](crate::DatagenProvider). + +pub use crate::transform::cldr::source::CoverageLevel; + +use icu_locid::LanguageIdentifier; +use std::collections::HashSet; + +/// Options bag for [`DatagenProvider`](crate::DatagenProvider). +#[non_exhaustive] +#[derive(Debug, Clone, PartialEq, Default)] +pub struct Options { + /// Defines the locales to include + pub locales: LocaleInclude, + /// Whether to optimize tries for speed or size + pub trie_type: TrieType, + /// Which Han collation to use + pub collation_han_database: CollationHanDatabase, + /// The collation types to include. + /// + /// The special string `"search*"` causes all search collation tables to be included. 
+ pub collations: HashSet, +} + +/// Defines the locaes to include +#[non_exhaustive] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +pub enum LocaleInclude { + /// All locales + All, + /// No locales + None, + /// An explicit set of locales + Explicit(HashSet), + /// All locales with the given CLDR coverage levels + CldrSet(HashSet), +} + +impl Default for LocaleInclude { + fn default() -> Self { + Self::All + } +} + +impl LocaleInclude { + // TODO: Strict langid equality might not be what we want. + pub(crate) fn filter_by_langid_equality( + &self, + supported: Vec, + ) -> Vec { + match self { + LocaleInclude::All => supported, + LocaleInclude::Explicit(set) => supported + .into_iter() + .filter(|l| set.contains(&l.get_langid())) + .collect(), + _ => unreachable!("resolved"), + } + } +} + +/// Specifies the collation Han database to use. +/// +/// Unihan is more precise but significantly increases data size. See +/// +#[derive(Clone, Copy, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +#[non_exhaustive] +pub enum CollationHanDatabase { + /// Implicit + #[serde(rename = "implicit")] + Implicit, + /// Unihan + #[serde(rename = "unihan")] + Unihan, +} + +impl std::fmt::Display for CollationHanDatabase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + CollationHanDatabase::Implicit => write!(f, "implicithan"), + CollationHanDatabase::Unihan => write!(f, "unihan"), + } + } +} + +impl Default for CollationHanDatabase { + fn default() -> Self { + Self::Implicit + } +} + +/// Specifies the trie type to use. +#[derive(Clone, Copy, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +#[non_exhaustive] +pub enum TrieType { + /// Fast tries are optimized for speed + #[serde(rename = "fast")] + Fast, + /// Small tries are optimized for size + #[serde(rename = "small")] + Small, +} + +impl std::fmt::Display for TrieType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + TrieType::Fast => write!(f, "fast"), + TrieType::Small => write!(f, "small"), + } + } +} + +impl Default for TrieType { + fn default() -> Self { + Self::Small + } +} diff --git a/provider/datagen/src/registry.rs b/provider/datagen/src/registry.rs index 678ec012254..4967867e254 100644 --- a/provider/datagen/src/registry.rs +++ b/provider/datagen/src/registry.rs @@ -63,6 +63,7 @@ macro_rules! registry { ] ); + #[cfg(feature = "provider_baked")] pub(crate) fn key_to_marker_bake(key: DataKey, env: &databake::CrateEnv) -> databake::TokenStream { use databake::Bake; // This is a bit naughty, we need the marker's type, but we're actually diff --git a/provider/datagen/src/source.rs b/provider/datagen/src/source.rs index 3556322dc4a..056ef6cd863 100644 --- a/provider/datagen/src/source.rs +++ b/provider/datagen/src/source.rs @@ -2,6 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+use crate::options::Options; use crate::transform::cldr::source::CldrCache; pub use crate::transform::cldr::source::CoverageLevel; use elsa::sync::FrozenMap; @@ -25,22 +26,21 @@ pub struct SourceData { icuexport_paths: Option>, segmenter_paths: Arc, segmenter_lstm_paths: Arc, - trie_type: IcuTrieType, - collation_han_database: CollationHanDatabase, - collations: Vec, + // TODO: move this out when we decide we can break the exhaustiveness of DatagenProvider + pub(crate) options: Options, } +#[cfg(feature = "networking")] +/// The default [`SourceData`] downloads the latest supported data. +/// +/// Requires `networking` Cargo feature. impl Default for SourceData { fn default() -> Self { - Self { - cldr_paths: None, - icuexport_paths: None, - segmenter_paths: Arc::new(SerdeCache::new(AbstractFs::new_segmenter())), - segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new_lstm())), - trie_type: IcuTrieType::Small, - collation_han_database: CollationHanDatabase::Implicit, - collations: vec![], - } + Self::offline() + .with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) + .unwrap() + .with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) + .unwrap() } } @@ -51,35 +51,32 @@ impl SourceData { /// The latest ICU export tag that has been verified to work with this version of `icu_datagen`. pub const LATEST_TESTED_ICUEXPORT_TAG: &'static str = "release-73-1"; - /// The latest `SourceData` that has been verified to work with this version of `icu_datagen`. - /// - /// See [`SourceData::LATEST_TESTED_CLDR_TAG`] and [`SourceData::LATEST_TESTED_ICUEXPORT_TAG`]. - /// - /// Requires `networking` Cargo feature. + #[doc(hidden)] #[cfg(feature = "networking")] + #[deprecated(since = "1.3.0", note = "use SourceData::default()")] pub fn latest_tested() -> Self { Self::default() - .with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) - .unwrap() - .with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) - .unwrap() } - #[cfg(test)] - // This is equivalent to `latest_tested` for the files defined in `tools/testdata-scripts/globs.rs.data`. - pub fn repo() -> Self { - Self::default() - .with_cldr(repodata::paths::cldr(), Default::default()) - .unwrap() - .with_icuexport(repodata::paths::icuexport()) - .unwrap() + /// Creates a `SourceData` that does not have CLDR or ICU export sources set. + /// + /// Using this to generate keys that require the data will result in errors + /// ([`is_missing_cldr_error`](crate::is_missing_cldr_error) / + /// [`is_missing_icuexport_error`](crate::is_missing_icuexport_error)). Make sure to set + /// local data sources using [`SourceData::with_cldr`] / [`SourceData::with_icuexport`]. + pub fn offline() -> Self { + Self { + cldr_paths: None, + icuexport_paths: None, + segmenter_paths: Arc::new(SerdeCache::new(AbstractFs::new_segmenter())), + segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new_lstm())), + options: Default::default(), + } } - /// Adds CLDR data to this `DataSource`. The root should point to a local + /// Adds CLDR data to this `SourceData`. The root should point to a local /// `cldr-{version}-json-{full, modern}.zip` directory or ZIP file (see /// [GitHub releases](https://github.com/unicode-org/cldr-json/releases)). - /// - /// The `_locale_subset` variable is ignored. pub fn with_cldr( self, root: PathBuf, @@ -92,7 +89,7 @@ impl SourceData { }) } - /// Adds ICU export data to this `DataSource`. The path should point to a local + /// Adds ICU export data to this `SourceData`. 
The path should point to a local /// `icuexportdata_uprops_full.zip` directory or ZIP file (see [GitHub releases]( /// https://github.com/unicode-org/icu/releases)). pub fn with_icuexport(self, root: PathBuf) -> Result { @@ -102,7 +99,7 @@ impl SourceData { }) } - /// Adds CLDR data to this `DataSource`. The data will be downloaded from GitHub + /// Adds CLDR data to this `SourceData`. The data will be downloaded from GitHub /// using the given tag (see [GitHub releases](https://github.com/unicode-org/cldr-json/releases)). /// /// Also see: [`LATEST_TESTED_CLDR_TAG`](Self::LATEST_TESTED_CLDR_TAG) @@ -123,7 +120,7 @@ impl SourceData { }) } - /// Adds ICU export data to this `DataSource`. The data will be downloaded from GitHub + /// Adds ICU export data to this `SourceData`. The data will be downloaded from GitHub /// using the given tag. (see [GitHub releases](https://github.com/unicode-org/icu/releases)). /// /// Also see: [`LATEST_TESTED_ICUEXPORT_TAG`](Self::LATEST_TESTED_ICUEXPORT_TAG) @@ -150,7 +147,7 @@ impl SourceData { note = "Use `with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG)`" )] #[cfg(feature = "networking")] - /// Deprecated + #[doc(hidden)] pub fn with_cldr_latest( self, _use_default_here: crate::CldrLocaleSubset, @@ -163,34 +160,45 @@ impl SourceData { note = "Use `with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)`" )] #[cfg(feature = "networking")] - /// Deprecated + #[doc(hidden)] pub fn with_icuexport_latest(self) -> Result { self.with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) } - /// Set this to use tries optimized for speed instead of data size + #[deprecated(note = "use crate::Options", since = "1.3.0")] + #[doc(hidden)] pub fn with_fast_tries(self) -> Self { Self { - trie_type: IcuTrieType::Fast, + options: Options { + trie_type: crate::options::TrieType::Fast, + ..self.options + }, ..self } } - /// Set the [`CollationHanDatabase`] version. + #[deprecated(note = "use crate::Options", since = "1.3.0")] + #[doc(hidden)] pub fn with_collation_han_database(self, collation_han_database: CollationHanDatabase) -> Self { Self { - collation_han_database, + options: Options { + collation_han_database, + ..self.options + }, ..self } } - /// Set the list of BCP-47 collation IDs to include beyond the default set. - /// - /// If a list was already set, this function overwrites the previous list. - /// - /// The special string `"search*"` causes all search collation tables to be included. + #[deprecated(note = "use crate::Options", since = "1.3.0")] + #[doc(hidden)] pub fn with_collations(self, collations: Vec) -> Self { - Self { collations, ..self } + Self { + options: Options { + collations: collations.into_iter().collect(), + ..self.options + }, + ..self + } } /// Paths to CLDR source data. 
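To summarise the `SourceData` changes above: the trie, collation, and locale knobs move onto the new `Options` bag, and `SourceData::offline()` replaces the old field-by-field default. Below is a minimal caller-side migration sketch using only constructors introduced in this diff; the specific paths, trie type, and collation choices are illustrative, not prescriptive.

```rust
// Hypothetical migration sketch; not part of this diff.
use icu_datagen::options::{LocaleInclude, Options, TrieType};
use icu_datagen::{DatagenProvider, SourceData};
use std::path::PathBuf;

fn build_provider(
    cldr: PathBuf,
    icuexport: PathBuf,
) -> Result<DatagenProvider, icu_provider::DataError> {
    // `Options` is #[non_exhaustive], so start from Default and set fields.
    // These replace the deprecated with_fast_tries() / with_collations().
    let mut options = Options::default();
    options.trie_type = TrieType::Fast;
    options.collations = ["search*".to_string()].into_iter().collect();
    options.locales = LocaleInclude::All;

    // offline() has no CLDR/icuexport sources until they are added explicitly;
    // keys that need missing data fail with is_missing_cldr_error /
    // is_missing_icuexport_error.
    let source = SourceData::offline()
        .with_cldr(cldr, Default::default())?
        .with_icuexport(icuexport)?;

    DatagenProvider::try_new(options, source)
}
```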
@@ -216,18 +224,6 @@ impl SourceData { Ok(&self.segmenter_lstm_paths) } - pub(crate) fn trie_type(&self) -> IcuTrieType { - self.trie_type - } - - pub(crate) fn collation_han_database(&self) -> CollationHanDatabase { - self.collation_han_database - } - - pub(crate) fn collations(&self) -> &[String] { - &self.collations - } - /// List the locales for the given CLDR coverage levels pub fn locales( &self, @@ -237,42 +233,8 @@ impl SourceData { } } -#[derive(Clone, Copy, Debug)] -pub(crate) enum IcuTrieType { - Fast, - Small, -} - -impl std::fmt::Display for IcuTrieType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - IcuTrieType::Fast => write!(f, "fast"), - IcuTrieType::Small => write!(f, "small"), - } - } -} - -/// Specifies the collation Han database to use. -/// -/// Unihan is more precise but significantly increases data size. See -/// -#[derive(Clone, Copy, Debug)] -#[non_exhaustive] -pub enum CollationHanDatabase { - /// Implicit - Implicit, - /// Unihan - Unihan, -} - -impl std::fmt::Display for CollationHanDatabase { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - CollationHanDatabase::Implicit => write!(f, "implicithan"), - CollationHanDatabase::Unihan => write!(f, "unihan"), - } - } -} +#[doc(hidden)] +pub use crate::options::CollationHanDatabase; pub(crate) struct SerdeCache { root: AbstractFs, @@ -474,12 +436,12 @@ impl AbstractFs { self.init()?; match self { Self::Fs(root) => { - log::trace!("Reading: {}/{}", root.display(), path); + log::debug!("Reading: {}/{}", root.display(), path); std::fs::read(root.join(path)) .map_err(|e| DataError::from(e).with_path_context(&root.join(path))) } Self::Zip(zip) => { - log::trace!("Reading: /{}", path); + log::debug!("Reading: /{}", path); let mut buf = Vec::new(); zip.write() .expect("poison") diff --git a/provider/datagen/src/transform/cldr/characters/mod.rs b/provider/datagen/src/transform/cldr/characters/mod.rs index 182cb773e6b..eba849130f4 100644 --- a/provider/datagen/src/transform/cldr/characters/mod.rs +++ b/provider/datagen/src/transform/cldr/characters/mod.rs @@ -47,13 +47,14 @@ macro_rules! exemplar_chars_impls { impl IterableDataProvider<$data_marker_name> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self - .source - .cldr()? - .misc() - .list_langs()? - .map(DataLocale::from) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.source + .cldr()? + .misc() + .list_langs()? + .map(DataLocale::from) + .collect(), + )) } } diff --git a/provider/datagen/src/transform/cldr/datetime/mod.rs b/provider/datagen/src/transform/cldr/datetime/mod.rs index da76f77cf19..35e9c9590cb 100644 --- a/provider/datagen/src/transform/cldr/datetime/mod.rs +++ b/provider/datagen/src/transform/cldr/datetime/mod.rs @@ -273,7 +273,7 @@ macro_rules! impl_data_provider { r.retain(|l| l.get_langid() != icu_locid::langid!("byn") && l.get_langid() != icu_locid::langid!("ssy")); } - Ok(r) + Ok(self.source.options.locales.filter_by_langid_equality(r)) } } }; diff --git a/provider/datagen/src/transform/cldr/decimal/mod.rs b/provider/datagen/src/transform/cldr/decimal/mod.rs index 3de932a0e13..9899ae91a7f 100644 --- a/provider/datagen/src/transform/cldr/decimal/mod.rs +++ b/provider/datagen/src/transform/cldr/decimal/mod.rs @@ -78,27 +78,28 @@ impl crate::DatagenProvider { } fn supported_locales(&self) -> Result, DataError> { - Ok(self - .source - .cldr()? - .numbers() - .list_langs()? 
- .flat_map(|langid| { - let last = DataLocale::from(&langid); - self.get_supported_numsys_for_langid_without_default(&langid) - .expect("All languages from list_langs should be present") - .into_iter() - .map(move |nsname| { - let mut data_locale = DataLocale::from(&langid); - data_locale.set_unicode_ext( - key!("nu"), - Value::try_from_single_subtag(nsname.as_bytes()) - .expect("CLDR should have valid numbering system names"), - ); - data_locale - }) - .chain(core::iter::once(last)) - }) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.source + .cldr()? + .numbers() + .list_langs()? + .flat_map(|langid| { + let last = DataLocale::from(&langid); + self.get_supported_numsys_for_langid_without_default(&langid) + .expect("All languages from list_langs should be present") + .into_iter() + .map(move |nsname| { + let mut data_locale = DataLocale::from(&langid); + data_locale.set_unicode_ext( + key!("nu"), + Value::try_from_single_subtag(nsname.as_bytes()) + .expect("CLDR should have valid numbering system names"), + ); + data_locale + }) + .chain(core::iter::once(last)) + }) + .collect(), + )) } } diff --git a/provider/datagen/src/transform/cldr/displaynames/language.rs b/provider/datagen/src/transform/cldr/displaynames/language.rs index 999adae5d67..805f212d277 100644 --- a/provider/datagen/src/transform/cldr/displaynames/language.rs +++ b/provider/datagen/src/transform/cldr/displaynames/language.rs @@ -61,43 +61,45 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self - .source - .cldr()? - .displaynames() - .list_langs()? - .filter(|langid| { - // The directory might exist without languages.json - self.source - .cldr() - .unwrap() - .displaynames() - .file_exists(langid, "languages.json") - .unwrap_or_default() - }) - .map(DataLocale::from) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.source + .cldr()? + .displaynames() + .list_langs()? + .filter(|langid| { + // The directory might exist without languages.json + self.source + .cldr() + .unwrap() + .displaynames() + .file_exists(langid, "languages.json") + .unwrap_or_default() + }) + .map(DataLocale::from) + .collect(), + )) } } impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self - .source - .cldr()? - .displaynames() - .list_langs()? - .filter(|langid| { - // The directory might exist without languages.json - self.source - .cldr() - .unwrap() - .displaynames() - .file_exists(langid, "languages.json") - .unwrap_or_default() - }) - .map(DataLocale::from) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.source + .cldr()? + .displaynames() + .list_langs()? 
+ .filter(|langid| { + // The directory might exist without languages.json + self.source + .cldr() + .unwrap() + .displaynames() + .file_exists(langid, "languages.json") + .unwrap_or_default() + }) + .map(DataLocale::from) + .collect(), + )) } } diff --git a/provider/datagen/src/transform/cldr/displaynames/region.rs b/provider/datagen/src/transform/cldr/displaynames/region.rs index 180536c1b47..4ca01e406af 100644 --- a/provider/datagen/src/transform/cldr/displaynames/region.rs +++ b/provider/datagen/src/transform/cldr/displaynames/region.rs @@ -37,22 +37,23 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self - .source - .cldr()? - .displaynames() - .list_langs()? - .filter(|langid| { - // The directory might exist without territories.json - self.source - .cldr() - .unwrap() - .displaynames() - .file_exists(langid, "territories.json") - .unwrap_or_default() - }) - .map(DataLocale::from) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.source + .cldr()? + .displaynames() + .list_langs()? + .filter(|langid| { + // The directory might exist without territories.json + self.source + .cldr() + .unwrap() + .displaynames() + .file_exists(langid, "territories.json") + .unwrap_or_default() + }) + .map(DataLocale::from) + .collect(), + )) } } diff --git a/provider/datagen/src/transform/cldr/displaynames/script.rs b/provider/datagen/src/transform/cldr/displaynames/script.rs index 09a2cc74356..3ef1d24f79a 100644 --- a/provider/datagen/src/transform/cldr/displaynames/script.rs +++ b/provider/datagen/src/transform/cldr/displaynames/script.rs @@ -36,22 +36,23 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self - .source - .cldr()? - .displaynames() - .list_langs()? - .filter(|langid| { - // The directory might exist without scripts.json - self.source - .cldr() - .unwrap() - .displaynames() - .file_exists(langid, "scripts.json") - .unwrap_or_default() - }) - .map(DataLocale::from) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.source + .cldr()? + .displaynames() + .list_langs()? + .filter(|langid| { + // The directory might exist without scripts.json + self.source + .cldr() + .unwrap() + .displaynames() + .file_exists(langid, "scripts.json") + .unwrap_or_default() + }) + .map(DataLocale::from) + .collect(), + )) } } diff --git a/provider/datagen/src/transform/cldr/list/mod.rs b/provider/datagen/src/transform/cldr/list/mod.rs index a6d2386bdd2..e0c3801da04 100644 --- a/provider/datagen/src/transform/cldr/list/mod.rs +++ b/provider/datagen/src/transform/cldr/list/mod.rs @@ -128,13 +128,14 @@ macro_rules! implement { impl IterableDataProvider<$marker> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self - .source - .cldr()? - .misc() - .list_langs()? - .map(DataLocale::from) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.source + .cldr()? + .misc() + .list_langs()? 
+ .map(DataLocale::from) + .collect(), + )) } } }; diff --git a/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs b/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs index f64b89b6a28..26c88e61c06 100644 --- a/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs +++ b/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs @@ -2,8 +2,8 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +use crate::transform::cldr::{cldr_serde, source::CoverageLevel}; use crate::SourceData; -use crate::{transform::cldr::cldr_serde, CoverageLevel}; use icu_locid::subtags::Language; use icu_locid::LanguageIdentifier; use icu_locid_transform::provider::*; diff --git a/provider/datagen/src/transform/cldr/plurals/mod.rs b/provider/datagen/src/transform/cldr/plurals/mod.rs index 1c5dbe0f3e6..53c54628f67 100644 --- a/provider/datagen/src/transform/cldr/plurals/mod.rs +++ b/provider/datagen/src/transform/cldr/plurals/mod.rs @@ -51,14 +51,15 @@ macro_rules! implement { impl IterableDataProvider<$marker> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self - .get_rules_for(<$marker>::KEY)? - .0 - .keys() - // TODO(#568): Avoid the clone - .cloned() - .map(DataLocale::from) - .collect()) + Ok(self.source.options.locales.filter_by_langid_equality( + self.get_rules_for(<$marker>::KEY)? + .0 + .keys() + // TODO(#568): Avoid the clone + .cloned() + .map(DataLocale::from) + .collect(), + )) } } }; diff --git a/provider/datagen/src/transform/cldr/relativetime/mod.rs b/provider/datagen/src/transform/cldr/relativetime/mod.rs index 6573cb414a5..a57011dcf10 100644 --- a/provider/datagen/src/transform/cldr/relativetime/mod.rs +++ b/provider/datagen/src/transform/cldr/relativetime/mod.rs @@ -104,13 +104,13 @@ macro_rules! make_data_provider { impl IterableDataProvider<$marker> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self + Ok(self.source.options.locales.filter_by_langid_equality(self .source .cldr()? .dates("gregorian") .list_langs()? .map(DataLocale::from) - .collect()) + .collect())) } } diff --git a/provider/datagen/src/transform/cldr/source.rs b/provider/datagen/src/transform/cldr/source.rs index c38735b783a..28028f5f5d8 100644 --- a/provider/datagen/src/transform/cldr/source.rs +++ b/provider/datagen/src/transform/cldr/source.rs @@ -8,17 +8,23 @@ use icu_provider::DataError; use std::fmt::Debug; use std::str::FromStr; -/// Specifies a variant of CLDR JSON -#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize)] +/// A language's CLDR coverage level. +#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] #[non_exhaustive] pub enum CoverageLevel { /// Locales listed as modern coverage targets by the CLDR subcomittee. + /// + /// This is the highest level of coverage. #[serde(rename = "modern")] Modern, /// Locales listed as moderate coverage targets by the CLDR subcomittee. + /// + /// This is a medium level of coverage. #[serde(rename = "moderate")] Moderate, /// Locales listed as basic coverage targets by the CLDR subcomittee. + /// + /// This is the lowest level of coverage. 
#[serde(rename = "basic")] Basic, } diff --git a/provider/datagen/src/transform/cldr/time_zones/mod.rs b/provider/datagen/src/transform/cldr/time_zones/mod.rs index 5c16929c6b9..bcbfbc2d248 100644 --- a/provider/datagen/src/transform/cldr/time_zones/mod.rs +++ b/provider/datagen/src/transform/cldr/time_zones/mod.rs @@ -84,13 +84,13 @@ macro_rules! impl_data_provider { Ok(vec![Default::default()]) } else { - Ok(self + Ok(self.source.options.locales.filter_by_langid_equality(self .source .cldr()? .dates("gregorian") .list_langs()? .map(DataLocale::from) - .collect()) + .collect())) } } } diff --git a/provider/datagen/src/transform/icuexport/collator/mod.rs b/provider/datagen/src/transform/icuexport/collator/mod.rs index d05a04b2ab3..259be495ce8 100644 --- a/provider/datagen/src/transform/icuexport/collator/mod.rs +++ b/provider/datagen/src/transform/icuexport/collator/mod.rs @@ -31,8 +31,12 @@ mod test; fn has_legacy_swedish_variants(source: &crate::SourceData) -> bool { source .icuexport() - .and_then(|i| i.list(&format!("collation/{}", source.collation_han_database()))) - .map(|mut iter| iter.any(|s| s == "sv_reformed_meta.toml")) + .and_then(|i| { + i.file_exists(&format!( + "collation/{}/sv_reformed_meta.toml", + source.options.collation_han_database, + )) + }) .unwrap_or(false) } @@ -110,19 +114,14 @@ impl crate::DatagenProvider { /// Whether to include the given collation value based on /// the default excludes and explicit includes. fn should_include_collation(&self, collation: &Value) -> bool { - let collation_str = collation.write_to_string(); - if self - .source - .collations() - .iter() - .any(|s| s == &*collation_str) - { + let collation_str = &*collation.write_to_string(); + if self.source.options.collations.contains(collation_str) { true } else if collation_str.starts_with("search") { // Note: literal "search" and "searchjl" are handled above - self.source.collations().iter().any(|s| s == "search*") + self.source.options.collations.contains("search*") } else { - !DEFAULT_REMOVED_COLLATIONS.contains(&&*collation_str) + !DEFAULT_REMOVED_COLLATIONS.contains(&collation_str) } } } @@ -137,7 +136,7 @@ macro_rules! collation_provider { .icuexport()? .read_and_parse_toml(&format!( "collation/{}/{}{}.toml", - self.source.collation_han_database(), + self.source.options.collation_han_database, locale_to_file_name(&req.locale, has_legacy_swedish_variants(&self.source)), $suffix )) @@ -161,12 +160,12 @@ macro_rules! collation_provider { impl IterableDataProvider<$marker> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(self + Ok(self.source.options.locales.filter_by_langid_equality(self .source .icuexport()? .list(&format!( "collation/{}", - self.source.collation_han_database() + self.source.options.collation_han_database ))? .filter_map(|mut file_name| { file_name.truncate(file_name.len() - ".toml".len()); @@ -186,7 +185,7 @@ macro_rules! 
collation_provider { .unwrap_or(true) }) .map(DataLocale::from) - .collect()) + .collect())) } } )+ diff --git a/provider/datagen/src/transform/icuexport/collator/test.rs b/provider/datagen/src/transform/icuexport/collator/test.rs index 843d9fe8f8c..5bc86368330 100644 --- a/provider/datagen/src/transform/icuexport/collator/test.rs +++ b/provider/datagen/src/transform/icuexport/collator/test.rs @@ -284,15 +284,13 @@ fn test_collation_filtering() { }, ]; for cas in cases { - let provider = DatagenProvider { - source: SourceData::repo().with_collations( - cas.include_collations - .iter() - .copied() - .map(String::from) - .collect(), - ), - }; + let mut provider = DatagenProvider::for_test(); + provider.source.options.collations = cas + .include_collations + .iter() + .copied() + .map(String::from) + .collect(); let mut resolved_locales: Vec = IterableDataProvider::::supported_locales(&provider) .unwrap() diff --git a/provider/datagen/src/transform/icuexport/normalizer/mod.rs b/provider/datagen/src/transform/icuexport/normalizer/mod.rs index 2cd7f7de247..868b12e4de8 100644 --- a/provider/datagen/src/transform/icuexport/normalizer/mod.rs +++ b/provider/datagen/src/transform/icuexport/normalizer/mod.rs @@ -24,8 +24,7 @@ macro_rules! normalization_provider { let $toml_data: &normalizer_serde::$serde_struct = self.source.icuexport()?.read_and_parse_toml(&format!( "norm/{}/{}.toml", - self.source.trie_type(), - $file_name + self.source.options.trie_type, $file_name ))?; $conversion diff --git a/provider/datagen/src/transform/icuexport/ucase/mod.rs b/provider/datagen/src/transform/icuexport/ucase/mod.rs index 1cf87c86078..46c4fdf5233 100644 --- a/provider/datagen/src/transform/icuexport/ucase/mod.rs +++ b/provider/datagen/src/transform/icuexport/ucase/mod.rs @@ -21,7 +21,7 @@ impl DataProvider for crate::DatagenProvider { .icuexport()? .read_and_parse_toml::(&format!( "ucase/{}/ucase.toml", - self.source.trie_type() + self.source.options.trie_type ))? .ucase; diff --git a/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs b/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs index 9568c2994d7..853e0022659 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs @@ -18,8 +18,7 @@ fn get_code_point_prop_map<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type(), - key + source.options.trie_type, key ))? .enum_property .get(0) diff --git a/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs b/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs index b69953538ba..945ec3b8981 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs @@ -17,8 +17,7 @@ pub(crate) fn get_binary_prop_for_code_point_set<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type(), - key + source.options.trie_type, key ))? .binary_property .get(0) diff --git a/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs b/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs index e5c528cf947..a1513eedebb 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs @@ -18,8 +18,7 @@ fn get_binary_prop_for_unicodeset<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type(), - key + source.options.trie_type, key ))? 
.binary_property .get(0) diff --git a/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs b/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs index 9b40d89f90a..91c8a60c8b0 100644 --- a/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs +++ b/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs @@ -20,8 +20,7 @@ pub(crate) fn get_enumerated_prop<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type(), - key + source.options.trie_type, key ))? .enum_property .get(0) @@ -374,7 +373,7 @@ fn get_mask_prop<'a>( .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/{}.toml", - source.trie_type(), + source.options.trie_type, key ))? .mask_property diff --git a/provider/datagen/src/transform/icuexport/uprops/script.rs b/provider/datagen/src/transform/icuexport/uprops/script.rs index 2751063516b..0b07f181b1c 100644 --- a/provider/datagen/src/transform/icuexport/uprops/script.rs +++ b/provider/datagen/src/transform/icuexport/uprops/script.rs @@ -24,7 +24,7 @@ impl DataProvider for crate::DatagenProvid .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/scx.toml", - self.source.trie_type(), + self.source.options.trie_type, ))? .script_extensions .get(0) diff --git a/provider/datagen/src/transform/mod.rs b/provider/datagen/src/transform/mod.rs index 2783b066b5a..2c0d3325671 100644 --- a/provider/datagen/src/transform/mod.rs +++ b/provider/datagen/src/transform/mod.rs @@ -18,6 +18,10 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - HelloWorldProvider.supported_locales() + Ok(self + .source + .options + .locales + .filter_by_langid_equality(HelloWorldProvider.supported_locales()?)) } } diff --git a/provider/datagen/src/transform/segmenter/lstm.rs b/provider/datagen/src/transform/segmenter/lstm.rs index 856746f86d3..0d2129630d6 100644 --- a/provider/datagen/src/transform/segmenter/lstm.rs +++ b/provider/datagen/src/transform/segmenter/lstm.rs @@ -203,12 +203,12 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(vec![ + Ok(self.source.options.locales.filter_by_langid_equality(vec![ locale!("km").into(), locale!("lo").into(), locale!("my").into(), locale!("th").into(), - ]) + ])) } } diff --git a/provider/datagen/src/transform/segmenter/mod.rs b/provider/datagen/src/transform/segmenter/mod.rs index 54523378834..1235f96c016 100644 --- a/provider/datagen/src/transform/segmenter/mod.rs +++ b/provider/datagen/src/transform/segmenter/mod.rs @@ -10,7 +10,8 @@ use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData}; use icu_collections::codepointtrie::CodePointTrie; -use icu_locid::{langid, locale}; +use icu_locid::langid; +use icu_locid::locale; use icu_properties::{ maps, sets, EastAsianWidth, GeneralCategory, GraphemeClusterBreak, LineBreak, Script, SentenceBreak, WordBreak, @@ -592,11 +593,9 @@ impl crate::DatagenProvider { data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map), default_value: 0, error_value: 0, - trie_type: match self.source.trie_type() { - crate::source::IcuTrieType::Fast => icu_collections::codepointtrie::TrieType::Fast, - crate::source::IcuTrieType::Small => { - icu_collections::codepointtrie::TrieType::Small - } + trie_type: match self.source.options.trie_type { + crate::options::TrieType::Fast => 
icu_collections::codepointtrie::TrieType::Fast, + crate::options::TrieType::Small => icu_collections::codepointtrie::TrieType::Small, }, } .build(); @@ -801,8 +800,14 @@ impl DataProvider for crate::DatagenProvider } impl IterableDataProvider for crate::DatagenProvider { + // TODO: Do we actually want to filter these by the user-selected locales? The keys + // are more like script selectors... fn supported_locales(&self) -> Result, DataError> { - Ok(vec![locale!("ja").into()]) + Ok(self + .source + .options + .locales + .filter_by_langid_equality(vec![locale!("ja").into()])) } } @@ -821,12 +826,14 @@ impl DataProvider for crate::DatagenProvi impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - Ok(vec![ + // TODO: Do we actually want to filter these by the user-selected locales? The keys + // are more like script selectors... + Ok(self.source.options.locales.filter_by_langid_equality(vec![ locale!("th").into(), locale!("km").into(), locale!("lo").into(), locale!("my").into(), - ]) + ])) } } diff --git a/provider/datagen/tests/verify-zero-copy.rs b/provider/datagen/tests/verify-zero-copy.rs index 007c94364b2..6daaedc4c65 100644 --- a/provider/datagen/tests/verify-zero-copy.rs +++ b/provider/datagen/tests/verify-zero-copy.rs @@ -9,15 +9,11 @@ static ALLOC: dhat::Alloc = dhat::Alloc; // Something is broken wrt Windows and this test on CI. Disable for now. #[cfg(not(target_os = "windows"))] pub mod test { + use icu_datagen::{all_keys_with_experimental, DatagenProvider, SourceData}; use icu_provider::datagen::IterableDynamicDataProvider; - use icu_provider_adapters::filter::Filterable; - use icu_provider::prelude::*; - - use icu_datagen::{all_keys_with_experimental, DatagenProvider, SourceData}; use std::cmp; use std::collections::BTreeSet; - use std::mem::ManuallyDrop; // Types in this list cannot be zero-copy deserialized. // @@ -43,21 +39,25 @@ pub mod test { #[test] fn test_zero_copy() { - // manually drop to avoid dhat from printing stats at the end - let _profiler = ManuallyDrop::new(dhat::Profiler::new_heap()); - - let selected_locales = icu_testdata::locales(); + // don't drop to avoid dhat from printing stats at the end + core::mem::forget(dhat::Profiler::new_heap()); // Actual data is only needed to determine included locales. 
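Many of the `supported_locales` implementations touched above now route their candidate list through `LocaleInclude::filter_by_langid_equality`. Since that helper is crate-internal, here is a self-contained sketch of the rule it applies; `filter_explicit` is a hypothetical stand-in for the internal method, and the strict language-identifier equality it shows is the behaviour flagged by the TODO in options.rs.

```rust
// Stand-alone illustration only; `filter_explicit` is not part of this diff.
use std::collections::HashSet;

use icu_locid::{langid, LanguageIdentifier};
use icu_provider::DataLocale;

fn filter_explicit(
    set: &HashSet<LanguageIdentifier>,
    supported: Vec<DataLocale>,
) -> Vec<DataLocale> {
    // Strict langid equality: no fallback from "en-GB" to "en".
    supported
        .into_iter()
        .filter(|l| set.contains(&l.get_langid()))
        .collect()
}

fn main() {
    let set: HashSet<_> = [langid!("en"), langid!("und")].into_iter().collect();
    let supported: Vec<DataLocale> = vec![
        langid!("en").into(),    // kept: exact match
        langid!("en-GB").into(), // dropped: not an exact match for "en"
        langid!("und").into(),   // kept: the legacy datagen() path always adds `und`
    ];
    assert_eq!(filter_explicit(&set, supported).len(), 2);
}
```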
- let locale_provider = DatagenProvider { - source: SourceData::default() + let locale_provider = DatagenProvider::try_new( + { + use icu_datagen::options::*; + let mut options = Options::default(); + options.locales = + LocaleInclude::Explicit(icu_testdata::locales().into_iter().collect()); + options + }, + SourceData::offline() .with_cldr(repodata::paths::cldr(), Default::default()) .unwrap() .with_icuexport(repodata::paths::icuexport()) .unwrap(), - } - .filterable("icu4x-datagen locales") - .filter_by_langid_allowlist_strict(&selected_locales); + ) + .unwrap(); let postcard_provider = icu_testdata::buffer_no_fallback(); @@ -70,24 +70,26 @@ pub mod test { let mut max_total_violation = 0; let mut max_net_violation = 0; - for locale in - IterableDynamicDataProvider::::supported_locales_for_key( - &locale_provider, key, - ).unwrap() - { - let payload = postcard_provider.load_buffer( - key, - DataRequest { - locale: &locale, - metadata: Default::default(), - }, - ).unwrap().take_payload().unwrap(); + for locale in locale_provider.supported_locales_for_key(key).unwrap() { + let payload = postcard_provider + .load_buffer( + key, + DataRequest { + locale: &locale, + metadata: Default::default(), + }, + ) + .unwrap() + .take_payload() + .unwrap(); let stats_before = dhat::HeapStats::get(); // We need to generate the stats before the deserialized struct gets dropped, in order // to distinguish between a temporary and permanent allocation. - let stats_after = icu_datagen::deserialize_and_discard(key, payload, dhat::HeapStats::get).unwrap(); + let stats_after = + icu_datagen::deserialize_and_discard(key, payload, dhat::HeapStats::get) + .unwrap(); let vio_total = stats_after.total_bytes - stats_before.total_bytes; let vio_net = stats_after.curr_bytes - stats_before.curr_bytes; diff --git a/provider/fs/src/export/fs_exporter.rs b/provider/fs/src/export/fs_exporter.rs index c9c7400b691..3b6eaef71e9 100644 --- a/provider/fs/src/export/fs_exporter.rs +++ b/provider/fs/src/export/fs_exporter.rs @@ -119,8 +119,6 @@ impl DataExporter for FilesystemExporter { locale: &DataLocale, obj: &DataPayload, ) -> Result<(), DataError> { - log::trace!("Writing: {}/{}", key, locale); - let mut path_buf = self.root.clone(); path_buf.push(&*key.write_to_string()); path_buf.push(&*locale.write_to_string()); diff --git a/provider/fs/src/export/mod.rs b/provider/fs/src/export/mod.rs index 69a067b08e6..d586011292f 100644 --- a/provider/fs/src/export/mod.rs +++ b/provider/fs/src/export/mod.rs @@ -2,70 +2,48 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -//! Utilities for dumping data to an ICU4X filesystem tree. +//! Data exporter for [`FsDataProvider`](crate::FsDataProvider). //! -//! The `export` Cargo feature enables you to pull all data from some other data provider and -//! persist it on the filesystem to be read by an FsDataProvider at runtime. -//! -//! For a command-line user interface, see the `icu_datagen` crate. +//! This module can be used as a target for the `icu_datagen` crate. //! //! # Examples //! //! ``` -//! use icu_provider::datagen::DataExporter; -//! use icu_provider::dynutil::*; +//! use icu_datagen::prelude::*; //! use icu_provider::hello_world::*; -//! use icu_provider::prelude::*; -//! use icu_provider_fs::export::serializers; -//! use icu_provider_fs::export::ExporterOptions; -//! use icu_provider_fs::export::FilesystemExporter; +//! use icu_provider_fs::export::*; //! 
use icu_provider_fs::FsDataProvider; -//! use std::borrow::Cow; -//! use std::path::PathBuf; //! //! let demo_path = std::env::temp_dir().join("icu4x_json_demo"); -//! +//! //! // Set up the exporter -//! let mut options = serializers::json::Options::default(); -//! let serializer = Box::new(serializers::json::Serializer::new(options)); //! let mut options = ExporterOptions::default(); //! options.root = demo_path.clone(); +//! let serializer = Box::new(serializers::json::Serializer::default()); //! let mut exporter = FilesystemExporter::try_new(serializer, options) //! .expect("Should successfully initialize data output directory"); //! //! // Export something -//! let payload = DataPayload::::from_owned(HelloWorldV1 { -//! message: Cow::Borrowed("Hi"), -//! }); -//! exporter -//! .put_payload( -//! HelloWorldV1Marker::KEY, -//! &Default::default(), -//! &UpcastDataPayload::upcast(payload.clone()), -//! ) -//! .expect("Should successfully export"); +//! DatagenProvider::default() +//! .export( +//! [HelloWorldV1Marker::KEY].into_iter().collect(), +//! exporter +//! ).unwrap(); //! //! // Create a filesystem provider reading from the demo directory //! let provider = FsDataProvider::try_new(demo_path.clone()) -//! .expect("Should successfully read from filesystem"); +//! .expect("Should successfully read from filesystem") +//! .as_deserializing(); //! -//! // Read the key from the filesystem and ensure it is as expected -//! let req = DataRequest { -//! locale: Default::default(), -//! metadata: Default::default(), -//! }; +//! // Read the key from the filesystem //! let response: DataPayload = provider -//! .as_deserializing() -//! .load(req) +//! .load(Default::default()) //! .unwrap() //! .take_payload() //! .unwrap(); -//! -//! assert_eq!(response.get(), payload.get(),); -//! -//! // Clean up from demo -//! std::fs::remove_dir_all(&demo_path) -//! .expect("Should clean up test directory"); +//! # +//! # std::fs::remove_dir_all(&demo_path) +//! # .expect("Should clean up test directory"); //! ``` #![allow( diff --git a/tools/testdata-scripts/src/bin/make-testdata.rs b/tools/testdata-scripts/src/bin/make-testdata.rs index bb9044c4e76..6901029d364 100644 --- a/tools/testdata-scripts/src/bin/make-testdata.rs +++ b/tools/testdata-scripts/src/bin/make-testdata.rs @@ -12,6 +12,8 @@ use std::path::Path; include!("../../locales.rs.data"); fn main() { + #![allow(deprecated)] // want to keep old datagen code path covered + simple_logger::SimpleLogger::new() .env() .with_level(log::LevelFilter::Info)
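Since `datagen()` is now deprecated in favour of `DatagenProvider::export`, a short end-to-end sketch of the replacement call path may help reviewers; the output file name, key selection, and error handling here are illustrative only. Unlike the legacy path, the exporter is passed directly instead of being wrapped in `Out`, and multi-target export is kept only as the internal `MultiExporter` shim for backwards compatibility.

```rust
// Hypothetical sketch of the post-deprecation call path; not part of this diff.
use icu_datagen::options::{LocaleInclude, Options};
use icu_datagen::{DatagenProvider, SourceData};
use icu_locid::langid;
use icu_provider_blob::export::BlobExporter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut options = Options::default();
    options.locales = LocaleInclude::Explicit([langid!("en")].into_iter().collect());

    // With the `networking` feature, SourceData::default() downloads the
    // latest tested CLDR and icuexport releases.
    let provider = DatagenProvider::try_new(options, SourceData::default())?;

    // Export all keys for the selected locales into a postcard blob on disk.
    let sink = Box::new(std::fs::File::create("en.postcard")?);
    provider.export(
        icu_datagen::all_keys().into_iter().collect(),
        BlobExporter::new_with_sink(sink),
    )?;
    Ok(())
}
```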