diff --git a/.github/workflows/artifacts-build.yml b/.github/workflows/artifacts-build.yml index fe99582ea4e..b6006272122 100644 --- a/.github/workflows/artifacts-build.yml +++ b/.github/workflows/artifacts-build.yml @@ -214,6 +214,7 @@ jobs: - components/segmenter - experimental/transliterate - experimental/zerotrie + - provider/blob - utils/fixed_decimal - utils/litemap - utils/tinystr diff --git a/Cargo.lock b/Cargo.lock index 5180c1a4537..5d12b60bc73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1924,6 +1924,7 @@ dependencies = [ name = "icu_provider_blob" version = "1.3.2" dependencies = [ + "criterion", "icu_datagen", "icu_locid", "icu_provider", @@ -1931,6 +1932,7 @@ dependencies = [ "postcard", "serde", "writeable", + "zerotrie", "zerovec", ] diff --git a/provider/blob/Cargo.toml b/provider/blob/Cargo.toml index db43f4f2d67..c74542ace8a 100644 --- a/provider/blob/Cargo.toml +++ b/provider/blob/Cargo.toml @@ -25,6 +25,7 @@ postcard = { version = "1.0.0", default-features = false, features = ["alloc"] } serde = { version = "1.0", default-features = false, features = ["alloc"] } writeable = {workspace = true } zerovec = { workspace = true, features = ["serde", "yoke"] } +zerotrie = { workspace = true, features = ["serde", "zerovec"] } log = { version = "0.4", optional = true } @@ -32,6 +33,9 @@ log = { version = "0.4", optional = true } icu_locid = { path = "../../components/locid", features = ["serde"] } icu_datagen = { path = "../../provider/datagen", features = ["networking"] } +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] +criterion = "0.4" + [features] std = ["icu_provider/std"] export = [ @@ -40,4 +44,10 @@ export = [ "postcard/alloc", "std", "zerovec/serde", + "zerotrie/alloc", + "zerotrie/litemap", ] + +[[bench]] +name = "blob_version_bench" +harness = false diff --git a/provider/blob/benches/blob_version_bench.rs b/provider/blob/benches/blob_version_bench.rs new file mode 100644 index 00000000000..c0050b5b152 --- /dev/null +++ 
b/provider/blob/benches/blob_version_bench.rs @@ -0,0 +1,62 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +extern crate alloc; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use icu_provider::datagen::IterableDataProvider; +use icu_provider::hello_world::*; +use icu_provider::prelude::*; +use icu_provider_blob::BlobDataProvider; + +const BLOB_V1: &[u8] = include_bytes!("../tests/data/v1.postcard"); +const BLOB_V2: &[u8] = include_bytes!("../tests/data/v2.postcard"); + +fn blob_version_bench(c: &mut Criterion) { + c.bench_function("provider/construct/v1", |b| { + b.iter(|| BlobDataProvider::try_new_from_static_blob(black_box(BLOB_V1)).unwrap()); + }); + c.bench_function("provider/construct/v2", |b| { + b.iter(|| BlobDataProvider::try_new_from_static_blob(black_box(BLOB_V2)).unwrap()); + }); + + let hello_world_provider = HelloWorldProvider; + let locales = hello_world_provider.supported_locales().unwrap(); + + c.bench_function("provider/read/v1", |b| { + let provider = BlobDataProvider::try_new_from_static_blob(black_box(BLOB_V1)).unwrap(); + b.iter(|| { + for locale in black_box(&locales).iter() { + black_box(&provider) + .load_buffer( + HelloWorldV1Marker::KEY, + DataRequest { + locale, + metadata: Default::default(), + }, + ) + .unwrap(); + } + }); + }); + c.bench_function("provider/read/v2", |b| { + let provider = BlobDataProvider::try_new_from_static_blob(black_box(BLOB_V2)).unwrap(); + b.iter(|| { + for locale in black_box(&locales).iter() { + black_box(&provider) + .load_buffer( + HelloWorldV1Marker::KEY, + DataRequest { + locale, + metadata: Default::default(), + }, + ) + .unwrap(); + } + }); + }); +} + +criterion_group!(benches, blob_version_bench,); +criterion_main!(benches); diff --git a/provider/blob/src/blob_data_provider.rs 
b/provider/blob/src/blob_data_provider.rs index e791a4b0d88..55aa85b05ae 100644 --- a/provider/blob/src/blob_data_provider.rs +++ b/provider/blob/src/blob_data_provider.rs @@ -2,7 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::blob_schema::{BlobSchema, BlobSchemaV1}; +use crate::blob_schema::BlobSchema; use alloc::boxed::Box; use icu_provider::buf::BufferFormat; use icu_provider::prelude::*; @@ -38,7 +38,7 @@ use yoke::*; /// // Read an ICU4X data blob dynamically: /// let blob = std::fs::read(concat!( /// env!("CARGO_MANIFEST_DIR"), -/// "/tests/data/hello_world.postcard", +/// "/tests/data/v2.postcard", /// )) /// .expect("Reading pre-computed postcard buffer"); /// @@ -69,7 +69,7 @@ use yoke::*; /// // Read an ICU4X data blob statically: /// const HELLO_WORLD_BLOB: &[u8] = include_bytes!(concat!( /// env!("CARGO_MANIFEST_DIR"), -/// "/tests/data/hello_world.postcard" +/// "/tests/data/v2.postcard" /// )); /// /// // Create a DataProvider from it: @@ -87,7 +87,7 @@ use yoke::*; /// ``` #[derive(Clone)] pub struct BlobDataProvider { - data: Yoke, Option>, + data: Yoke, Option>, } impl core::fmt::Debug for BlobDataProvider { @@ -103,7 +103,7 @@ impl BlobDataProvider { pub fn try_new_from_blob(blob: Box<[u8]>) -> Result { Ok(Self { data: Cart::try_make_yoke(blob, |bytes| { - BlobSchema::deserialize_v1(&mut postcard::Deserializer::from_bytes(bytes)) + BlobSchema::deserialize_and_check(&mut postcard::Deserializer::from_bytes(bytes)) })?, }) } @@ -112,7 +112,7 @@ impl BlobDataProvider { /// [`try_new_from_blob`](BlobDataProvider::try_new_from_blob) and is allocation-free. 
pub fn try_new_from_static_blob(blob: &'static [u8]) -> Result { Ok(Self { - data: Yoke::new_owned(BlobSchema::deserialize_v1( + data: Yoke::new_owned(BlobSchema::deserialize_and_check( &mut postcard::Deserializer::from_bytes(blob), )?), }) @@ -150,61 +150,82 @@ mod test { #[test] fn test_empty() { - let mut blob: Vec = Vec::new(); - - { - let mut exporter = BlobExporter::new_with_sink(Box::new(&mut blob)); - - exporter.flush(HelloWorldV1Marker::KEY).unwrap(); - - exporter.close().unwrap(); + for version in [1, 2] { + let mut blob: Vec = Vec::new(); + + { + let mut exporter = if version == 1 { + BlobExporter::new_with_sink(Box::new(&mut blob)) + } else { + BlobExporter::new_v2_with_sink(Box::new(&mut blob)) + }; + + exporter.flush(HelloWorldV1Marker::KEY).unwrap(); + + exporter.close().unwrap(); + } + + let provider = BlobDataProvider::try_new_from_blob(blob.into()).unwrap(); + + assert!( + matches!( + provider.load_buffer(HelloWorldV1Marker::KEY, Default::default()), + Err(DataError { + kind: DataErrorKind::MissingLocale, + .. + }) + ), + "(version: {version})" + ); } - - let provider = BlobDataProvider::try_new_from_blob(blob.into()).unwrap(); - - assert!(matches!( - provider.load_buffer(HelloWorldV1Marker::KEY, Default::default()), - Err(DataError { - kind: DataErrorKind::MissingLocale, - .. 
- }) - )); } #[test] fn test_singleton() { - let mut blob: Vec = Vec::new(); - - { - let mut exporter = BlobExporter::new_with_sink(Box::new(&mut blob)); - - exporter.flush(HelloSingletonV1Marker::KEY).unwrap(); - - exporter.close().unwrap(); + for version in [1, 2] { + let mut blob: Vec = Vec::new(); + + { + let mut exporter = if version == 1 { + BlobExporter::new_with_sink(Box::new(&mut blob)) + } else { + BlobExporter::new_v2_with_sink(Box::new(&mut blob)) + }; + + exporter.flush(HelloSingletonV1Marker::KEY).unwrap(); + + exporter.close().unwrap(); + } + + let provider = BlobDataProvider::try_new_from_blob(blob.into()).unwrap(); + + assert!( + matches!( + provider.load_buffer( + HelloSingletonV1Marker::KEY, + DataRequest { + locale: &icu_locid::locale!("de").into(), + metadata: Default::default() + } + ), + Err(DataError { + kind: DataErrorKind::ExtraneousLocale, + .. + }) + ), + "(version: {version})" + ); + + assert!( + matches!( + provider.load_buffer(HelloSingletonV1Marker::KEY, Default::default()), + Err(DataError { + kind: DataErrorKind::MissingLocale, + .. + }) + ), + "(version: {version})" + ); } - - let provider = BlobDataProvider::try_new_from_blob(blob.into()).unwrap(); - - assert!(matches!( - provider.load_buffer( - HelloSingletonV1Marker::KEY, - DataRequest { - locale: &icu_locid::locale!("de").into(), - metadata: Default::default() - } - ), - Err(DataError { - kind: DataErrorKind::ExtraneousLocale, - .. - }) - )); - - assert!(matches!( - provider.load_buffer(HelloSingletonV1Marker::KEY, Default::default()), - Err(DataError { - kind: DataErrorKind::MissingLocale, - .. 
- }) - )); } } diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index 66a951c7c27..023b9abff40 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -5,27 +5,47 @@ use alloc::boxed::Box; use icu_provider::prelude::*; use serde::Deserialize; +use writeable::Writeable; +use zerotrie::ZeroTrieSimpleAscii; use zerovec::maps::{ZeroMap2dBorrowed, ZeroMapKV}; -use zerovec::vecs::{Index32, VarZeroSlice, VarZeroVec}; +use zerovec::vecs::{Index32, VarZeroSlice, VarZeroVec, ZeroSlice}; /// A versioned Serde schema for ICU4X data blobs. -#[derive(serde::Deserialize)] +#[derive(serde::Deserialize, yoke::Yokeable)] +#[yoke(prove_covariance_manually)] #[cfg_attr(feature = "export", derive(serde::Serialize))] -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) enum BlobSchema<'data> { #[serde(borrow)] V001(BlobSchemaV1<'data>), + #[serde(borrow)] + V002(BlobSchemaV2<'data>), } impl<'data> BlobSchema<'data> { - pub fn deserialize_v1>( + pub fn deserialize_and_check>( de: D, - ) -> Result, D::Error> { - let BlobSchema::V001(blob) = Self::deserialize(de)?; + ) -> Result, D::Error> { + let blob = Self::deserialize(de)?; #[cfg(debug_assertions)] blob.check_invariants(); Ok(blob) } + + pub fn load(&self, key: DataKey, req: DataRequest) -> Result<&'data [u8], DataError> { + match self { + BlobSchema::V001(s) => s.load(key, req), + BlobSchema::V002(s) => s.load(key, req), + } + } + + #[cfg(debug_assertions)] + fn check_invariants(&self) { + match self { + BlobSchema::V001(s) => s.check_invariants(), + BlobSchema::V002(s) => s.check_invariants(), + } + } } /// Version 1 of the ICU4X data blob schema. @@ -98,6 +118,90 @@ impl<'data> BlobSchemaV1<'data> { } } +/// Version 2 of the ICU4X data blob schema. 
+#[derive(Clone, Copy, Debug, serde::Deserialize, yoke::Yokeable)] +#[yoke(prove_covariance_manually)] +#[cfg_attr(feature = "export", derive(serde::Serialize))] +pub(crate) struct BlobSchemaV2<'data> { + /// Map from key hash to locale trie. + /// Weak invariant: should be sorted. + #[serde(borrow)] + pub keys: &'data ZeroSlice, + /// Map from locale to buffer index. + /// Weak invariant: the `usize` values are valid indices into `self.buffers` + /// Weak invariant: there is at least one value for every integer in 0..self.buffers.len() + /// Weak invariant: keys and locales are the same length + // TODO: Make ZeroTrieSimpleAscii<[u8]> work when in this position. + #[serde(borrow)] + pub locales: &'data VarZeroSlice<[u8]>, + /// Vector of buffers + #[serde(borrow)] + pub buffers: &'data VarZeroSlice<[u8], Index32>, +} + +impl Default for BlobSchemaV2<'_> { + fn default() -> Self { + Self { + keys: ZeroSlice::new_empty(), + locales: VarZeroSlice::new_empty(), + buffers: VarZeroSlice::new_empty(), + } + } +} + +impl<'data> BlobSchemaV2<'data> { + pub fn load(&self, key: DataKey, req: DataRequest) -> Result<&'data [u8], DataError> { + let key_index = self + .keys + .binary_search(&key.hashed()) + .ok() + .ok_or_else(|| DataErrorKind::MissingDataKey.with_req(key, req))?; + if key.metadata().singleton && !req.locale.is_empty() { + return Err(DataErrorKind::ExtraneousLocale.with_req(key, req)); + } + let zerotrie = self + .locales + .get(key_index) + .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(key, req))?; + // TODO(#4249): Add a lookup function to zerotrie so we don't need to stringify + let locale_str = req.locale.write_to_string(); + let blob_index = ZeroTrieSimpleAscii::from_store(zerotrie) + .get(locale_str.as_bytes()) + .ok_or_else(|| DataErrorKind::MissingLocale.with_req(key, req))?; + let buffer = self + .buffers + .get(blob_index) + .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(key, req))?; + Ok(buffer) + } + + /// Verifies 
the weak invariants using debug assertions + #[cfg(debug_assertions)] + fn check_invariants(&self) { + if self.keys.is_empty() && self.locales.is_empty() && self.buffers.is_empty() { + return; + } + debug_assert_eq!(self.keys.len(), self.locales.len()); + // Note: We could check that every index occurs at least once, but that's a more expensive + // operation, so we will just check for the min and max index. + let mut seen_min = self.buffers.is_empty(); + let mut seen_max = self.buffers.is_empty(); + for zerotrie in self.locales.iter() { + for (_locale, idx) in ZeroTrieSimpleAscii::from_store(zerotrie).iter() { + debug_assert!(idx < self.buffers.len()); + if idx == 0 { + seen_min = true; + } + if idx + 1 == self.buffers.len() { + seen_max = true; + } + } + } + debug_assert!(seen_min); + debug_assert!(seen_max); + } +} + /// This type lets us use a u32-index-format VarZeroVec with the ZeroMap2dBorrowed. /// /// Eventually we will have a FormatSelector type that lets us do `ZeroMap, V>` diff --git a/provider/blob/src/export/blob_exporter.rs b/provider/blob/src/export/blob_exporter.rs index f9137fc59cd..56a60f97c68 100644 --- a/provider/blob/src/export/blob_exporter.rs +++ b/provider/blob/src/export/blob_exporter.rs @@ -8,27 +8,35 @@ use crate::blob_schema::*; use icu_provider::datagen::*; use icu_provider::prelude::*; -use std::collections::HashMap; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Mutex; use writeable::Writeable; +use zerotrie::ZeroTrieSimpleAscii; use zerovec::ule::VarULE; use zerovec::vecs::Index32; use zerovec::VarZeroVec; use zerovec::ZeroMap2d; +use zerovec::ZeroVec; use postcard::ser_flavors::{AllocVec, Flavor}; +enum VersionConfig { + V001, + V002, +} + /// A data exporter that writes data to a single-file blob. /// See the module-level docs for an example. 
pub struct BlobExporter<'w> { - /// List of (key hash, locale byte string, blob ID) + /// Map of key hash -> locale byte string -> blob ID #[allow(clippy::type_complexity)] - resources: Mutex, usize)>>, + resources: Mutex, usize>>>, // All seen keys - all_keys: Mutex>, + all_keys: Mutex>, /// Map from blob to blob ID unique_resources: Mutex, usize>>, sink: Box, + version: VersionConfig, } impl core::fmt::Debug for BlobExporter<'_> { @@ -43,13 +51,32 @@ impl core::fmt::Debug for BlobExporter<'_> { } impl<'w> BlobExporter<'w> { - /// Create a [`BlobExporter`] that writes to the given I/O stream. + /// Creates a version 1 [`BlobExporter`] that writes to the given I/O stream. + /// + /// Version 1 is needed if the blob may be consumed by ICU4X versions 1.0 through 1.3. If + /// targeting only ICU4X 1.4 and above, see [BlobExporter::new_v2_with_sink()]. pub fn new_with_sink(sink: Box) -> Self { Self { - resources: Mutex::new(Vec::new()), - unique_resources: Mutex::new(HashMap::new()), - all_keys: Mutex::new(Vec::new()), + resources: Default::default(), + unique_resources: Default::default(), + all_keys: Default::default(), + sink, + version: VersionConfig::V001, + } + } + + /// Creates a version 2 [`BlobExporter`] that writes to the given I/O stream. + /// + /// Version 2 produces a smaller postcard file than version 1 without sacrificing performance. + /// It is compatible with ICU4X 1.4 and above. If you need to support older version of ICU4X, + /// see [BlobExporter::new_with_sink()]. 
+ pub fn new_v2_with_sink(sink: Box) -> Self { + Self { + resources: Default::default(), + unique_resources: Default::default(), + all_keys: Default::default(), sink, + version: VersionConfig::V002, } } } @@ -75,20 +102,38 @@ impl DataExporter for BlobExporter<'_> { *unique_resources.entry(output).or_insert(len) }; #[allow(clippy::expect_used)] - self.resources.lock().expect("poison").push(( - key.hashed(), - locale.write_to_string().into_owned().into_bytes(), - idx, - )); + self.resources + .lock() + .expect("poison") + .entry(key.hashed()) + .or_default() + .entry(locale.write_to_string().into_owned().into_bytes()) + .or_insert(idx); Ok(()) } fn flush(&self, key: DataKey) -> Result<(), DataError> { - self.all_keys.lock().expect("poison").push(key.hashed()); + self.all_keys.lock().expect("poison").insert(key.hashed()); Ok(()) } fn close(&mut self) -> Result<(), DataError> { + match self.version { + VersionConfig::V001 => self.close_v1(), + VersionConfig::V002 => self.close_v2(), + } + } +} + +struct FinalizedBuffers { + /// Sorted list of blob to old ID; the index in the vec is the new ID + vzv: VarZeroVec<'static, [u8], Index32>, + /// Map from old ID to new ID + remap: HashMap, +} + +impl BlobExporter<'_> { + fn finalize_buffers(&mut self) -> FinalizedBuffers { // The blob IDs are unstable due to the parallel nature of datagen. // In order to make a canonical form, we sort them lexicographically now. 
@@ -107,12 +152,29 @@ impl DataExporter for BlobExporter<'_> { .map(|(new_id, (_, old_id))| (*old_id, new_id)) .collect(); + // Convert the sorted list to a VarZeroVec + let vzv: VarZeroVec<[u8], Index32> = { + let buffers: Vec> = sorted.into_iter().map(|(blob, _)| blob).collect(); + buffers.as_slice().into() + }; + + FinalizedBuffers { vzv, remap } + } + + fn close_v1(&mut self) -> Result<(), DataError> { + let FinalizedBuffers { vzv, remap } = self.finalize_buffers(); + // Now build up the ZeroMap2d, changing old ID to new ID let mut zm = self .resources .get_mut() .expect("poison") .iter() + .flat_map(|(hash, sub_map)| { + sub_map + .iter() + .map(|(locale, old_id)| (*hash, locale, old_id)) + }) .map(|(hash, locale, old_id)| { ( hash, @@ -125,16 +187,10 @@ impl DataExporter for BlobExporter<'_> { for key in self.all_keys.lock().expect("poison").iter() { if zm.get0(key).is_none() { - zm.insert(key, Index32U8::SENTINEL, &sorted.len()); + zm.insert(key, Index32U8::SENTINEL, &vzv.len()); } } - // Convert the sorted list to a VarZeroVec - let vzv: VarZeroVec<[u8], Index32> = { - let buffers: Vec> = sorted.into_iter().map(|(blob, _)| blob).collect(); - buffers.as_slice().into() - }; - if !zm.is_empty() { let blob = BlobSchema::V001(BlobSchemaV1 { keys: zm.as_borrowed(), @@ -147,4 +203,46 @@ impl DataExporter for BlobExporter<'_> { } Ok(()) } + + fn close_v2(&mut self) -> Result<(), DataError> { + let FinalizedBuffers { vzv, remap } = self.finalize_buffers(); + + let all_keys = self.all_keys.lock().expect("poison"); + let resources = self.resources.lock().expect("poison"); + + let keys: ZeroVec = all_keys.iter().copied().collect(); + + let locales_vec: Vec> = all_keys + .iter() + .map(|data_key_hash| resources.get(data_key_hash)) + .map(|option_sub_map| { + if let Some(sub_map) = option_sub_map { + let mut sub_map = sub_map.clone(); + sub_map + .iter_mut() + .for_each(|(_, id)| *id = *remap.get(id).expect("in-bound index")); + let zerotrie = 
ZeroTrieSimpleAscii::try_from(&sub_map).expect("in-bounds"); + zerotrie.take_store() + } else { + // Key with no locales: insert an empty ZeroTrie + ZeroTrieSimpleAscii::default().take_store() + } + }) + .collect(); + + let locales_vzv: VarZeroVec<[u8]> = locales_vec.as_slice().into(); + + if !keys.is_empty() { + let blob = BlobSchema::V002(BlobSchemaV2 { + keys: &keys, + locales: &locales_vzv, + buffers: &vzv, + }); + log::info!("Serializing blob to output stream..."); + + let output = postcard::to_allocvec(&blob)?; + self.sink.write_all(&output)?; + } + Ok(()) + } } diff --git a/provider/blob/src/export/mod.rs b/provider/blob/src/export/mod.rs index bb448f703a0..546ee4bb6f0 100644 --- a/provider/blob/src/export/mod.rs +++ b/provider/blob/src/export/mod.rs @@ -38,7 +38,7 @@ //! use icu_provider_blob::BlobDataProvider; //! //! // obtain the data blob -//! # let blob = std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/hello_world.postcard")).unwrap(); +//! # let blob = std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/v2.postcard")).unwrap(); //! //! // Create a provider reading from the blob //! let provider = diff --git a/provider/blob/tests/data/hello_world.postcard b/provider/blob/tests/data/v1.postcard similarity index 100% rename from provider/blob/tests/data/hello_world.postcard rename to provider/blob/tests/data/v1.postcard diff --git a/provider/blob/tests/data/v2.postcard b/provider/blob/tests/data/v2.postcard new file mode 100644 index 00000000000..db45fa9ca7e Binary files /dev/null and b/provider/blob/tests/data/v2.postcard differ diff --git a/provider/blob/tests/test_versions.rs b/provider/blob/tests/test_versions.rs new file mode 100644 index 00000000000..50643cc2f95 --- /dev/null +++ b/provider/blob/tests/test_versions.rs @@ -0,0 +1,65 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use icu_datagen::prelude::*; +use icu_provider::datagen::IterableDataProvider; +use icu_provider::hello_world::*; +use icu_provider::prelude::*; +use icu_provider_blob::export::*; +use icu_provider_blob::BlobDataProvider; + +const BLOB_V1: &[u8] = include_bytes!("data/v1.postcard"); +const BLOB_V2: &[u8] = include_bytes!("data/v2.postcard"); + +fn run_driver(exporter: BlobExporter) -> Result<(), DataError> { + DatagenDriver::new() + .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) + .with_all_locales() + .export(&DatagenProvider::new_custom(), exporter) +} + +fn check_hello_world(blob_provider: impl DataProvider) { + let hello_world_provider = HelloWorldProvider; + for locale in hello_world_provider.supported_locales().unwrap() { + let blob_result = blob_provider + .load(DataRequest { + locale: &locale, + metadata: Default::default(), + }) + .unwrap() + .take_payload() + .unwrap(); + let expected_result = hello_world_provider + .load(DataRequest { + locale: &locale, + metadata: Default::default(), + }) + .unwrap() + .take_payload() + .unwrap(); + assert_eq!(blob_result, expected_result, "{locale:?}"); + } +} + +#[test] +fn test_v1() { + let mut blob: Vec = Vec::new(); + let exporter = BlobExporter::new_with_sink(Box::new(&mut blob)); + run_driver(exporter).unwrap(); + assert_eq!(BLOB_V1, blob.as_slice()); + + let blob_provider = BlobDataProvider::try_new_from_blob(blob.into_boxed_slice()).unwrap(); + check_hello_world(blob_provider.as_deserializing()); +} + +#[test] +fn test_v2() { + let mut blob: Vec = Vec::new(); + let exporter = BlobExporter::new_v2_with_sink(Box::new(&mut blob)); + run_driver(exporter).unwrap(); + assert_eq!(BLOB_V2, blob.as_slice()); + + let blob_provider = BlobDataProvider::try_new_from_blob(blob.into_boxed_slice()).unwrap(); + 
check_hello_world(blob_provider.as_deserializing()); +} diff --git a/provider/datagen/src/bin/datagen/args.rs b/provider/datagen/src/bin/datagen/args.rs index 14acbbf3c29..350f47d6987 100644 --- a/provider/datagen/src/bin/datagen/args.rs +++ b/provider/datagen/src/bin/datagen/args.rs @@ -13,6 +13,7 @@ use std::path::PathBuf; enum Format { Dir, Blob, + Blob2, Mod, DeprecatedDefault, } @@ -212,7 +213,7 @@ pub struct Cli { #[arg( help = "Path to output directory or file. Must be empty or non-existent, unless \ --overwrite is present, in which case the directory is deleted first. \ - For --format=blob, omit this option to dump to stdout. \ + For --format={blob,blob2}, omit this option to dump to stdout. \ For --format={dir,mod} defaults to 'icu4x_data'." )] output: Option, @@ -410,6 +411,13 @@ impl Cli { PathBuf::from("/stdout") }, }), + Format::Blob2 => Ok(config::Export::Blob2 { + path: if let Some(path) = &self.output { + path.clone() + } else { + PathBuf::from("/stdout") + }, + }), Format::Mod => Ok(config::Export::Baked { path: if let Some(mod_directory) = self.output.as_ref() { mod_directory.clone() diff --git a/provider/datagen/src/bin/datagen/config.rs b/provider/datagen/src/bin/datagen/config.rs index 006d89072d3..a6ee9ab642b 100644 --- a/provider/datagen/src/bin/datagen/config.rs +++ b/provider/datagen/src/bin/datagen/config.rs @@ -130,6 +130,9 @@ pub enum Export { Blob { path: PathBuf, }, + Blob2 { + path: PathBuf, + }, Baked { path: PathBuf, #[serde(default, skip_serializing_if = "is_default")] diff --git a/provider/datagen/src/bin/datagen/mod.rs b/provider/datagen/src/bin/datagen/mod.rs index 0c9ee26fab9..69600905f6b 100644 --- a/provider/datagen/src/bin/datagen/mod.rs +++ b/provider/datagen/src/bin/datagen/mod.rs @@ -153,12 +153,12 @@ fn main() -> eyre::Result<()> { Ok(driver.export(&provider, exporter)?) 
} } - config::Export::Blob { ref path } => { + config::Export::Blob { ref path } | config::Export::Blob2 { ref path } => { #[cfg(not(feature = "blob_exporter"))] eyre::bail!("Exporting to a BlobProvider requires the `blob_exporter` Cargo feature"); #[cfg(feature = "blob_exporter")] { - let exporter = icu_provider_blob::export::BlobExporter::new_with_sink( + let sink: Box = if path == std::path::Path::new("/stdout") { Box::new(std::io::stdout()) } else if !config.overwrite && path.exists() { @@ -168,8 +168,12 @@ fn main() -> eyre::Result<()> { std::fs::File::create(path) .with_context(|| path.to_string_lossy().to_string())?, ) - }, - ); + }; + let exporter = if matches!(config.export, config::Export::Blob { .. }) { + icu_provider_blob::export::BlobExporter::new_with_sink(sink) + } else { + icu_provider_blob::export::BlobExporter::new_v2_with_sink(sink) + }; Ok(driver.export(&provider, exporter)?) } } diff --git a/tools/make/data.toml b/tools/make/data.toml index 084a720b07b..a1d9d7cae97 100644 --- a/tools/make/data.toml +++ b/tools/make/data.toml @@ -43,7 +43,8 @@ exec --fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" --loc exec --fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" --locales full --format dir --syntax bincode --out provider/fs/tests/data/bincode --overwrite exec --fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" --locales full --format dir --syntax postcard --out provider/fs/tests/data/postcard --overwrite -exec --fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" --locales full --format blob --overwrite --out provider/blob/tests/data/hello_world.postcard +exec --fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" --locales full --format blob --overwrite --out provider/blob/tests/data/v1.postcard +exec --fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" --locales full --format blob2 --overwrite --out provider/blob/tests/data/v2.postcard exec 
--fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" "fallback/likelysubtags@1" "fallback/parents@1" "fallback/supplement/co@1" --locales full --format blob --out provider/adapters/tests/data/blob.postcard --overwrite exec --fail-on-error target/debug/icu4x-datagen --keys "core/helloworld@1" --fallback preresolved --locales de --format dir --syntax json --out provider/adapters/tests/data/langtest/de --overwrite