Skip to content

Commit

Permalink
Remove large generated postcard file from repo (#3666)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian authored Jul 13, 2023
1 parent 6bf559c commit 36b17d4
Show file tree
Hide file tree
Showing 23 changed files with 226 additions and 324 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/artifacts-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -544,9 +544,13 @@ jobs:
run: |
mkdir -p benchmarks/datasize
- name: Measure size of selected data package provider/datagen/tests/data/testdata.postcard
- name: Generate testdata
run: |
cargo run --package icu_benchmark_binsize -- provider/datagen/tests/data/testdata.postcard file | tee benchmarks/datasize/output.txt
cargo run --bin make-testdata-legacy
- name: Measure size of selected data package provider/testdata/data/testdata.postcard
run: |
cargo run --package icu_benchmark_binsize -- provider/testdata/data/testdata.postcard file | tee benchmarks/datasize/output.txt
- name: Download previous benchmark data
run: |
Expand Down
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ exclude = [
"ffi/gn",
# Testdata will still be published in the 1.x stream, but is deprecated
# and we don't use it anymore. As we don't want to keep the actual data
# in the repo it doesn't build without running `cargo make testdata` first.
# in the repo it doesn't build without running `cargo make testdata-legacy`
# first.
"provider/testdata",
# Tutorials are tested in their own cargo workspace against released and
# local crates
Expand Down
1 change: 1 addition & 0 deletions docs/process/release.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Once the release is complete, the assigned release driver will:
* This will only update crates that have changed, and will ask you which version number to bump for each crate
* You can use commands like `git log icu@1.0.0..@ -- components/plurals/src/` and `cargo public-api -p icu_list diff 1.0.0` to figure out whether to do a major, minor, or patch release
* Get this reviewed and checked in before continuing
* `cargo make testdata-legacy` to generate data for `icu_testdata` (which is gitignored)
* Use `cargo workspaces publish --from-git` to automatically publish the crates in the correct order
* Add `icu4x-release` group as owners to each new component you're publishing
* `cargo owner -a github:unicode-org:icu4x-release`
Expand Down
2 changes: 1 addition & 1 deletion docs/tutorials/writing_a_new_data_struct.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ When finished, run from the top level:
$ cargo make testdata
```

If everything is hooked together properly, JSON files for your new data struct should appear under *provider/testdata/data/json*, and the file *provider/testdata/data/testdata.postcard* should have changed.
If everything is hooked together properly, JSON files for your new data struct should appear under *provider/datagen/tests/data/json*.

## Example

Expand Down
11 changes: 5 additions & 6 deletions provider/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,9 @@ simple_logger = { version = "4.1.0", default-features = false, optional = true }

[dev-dependencies]
icu = { path = "../../components/icu" }
icu_provider_blob = { path = "../blob" }

[target.'cfg(not(target_os = "windows"))'.dev-dependencies]
# The verify-zero-copy test is causing problems on Windows
glob = "0.3.1"
dhat = "0.3.0"
simple_logger = { version = "4.1.0", default-features = false }

[features]
default = ["bin", "use_wasm", "networking", "legacy_api", "rayon"]
Expand All @@ -121,8 +119,9 @@ path = "src/bin/datagen/mod.rs"
required-features = ["bin"]

[[test]]
name = "icu4x-verify-zero-copy"
path = "tests/verify-zero-copy.rs"
name = "make-testdata"
path = "tests/make-testdata.rs"
required-features = ["provider_fs", "use_wasm"]

[package.metadata.cargo-all-features]
# We don't need working CPT builders for check
Expand Down
8 changes: 6 additions & 2 deletions provider/datagen/src/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,15 @@ macro_rules! registry {
}
)+
)+
unreachable!("unregistered marker")
unreachable!("unregistered key {key:?}")
}

#[doc(hidden)]
pub fn deserialize_and_discard<R>(key: DataKey, buf: DataPayload<BufferMarker>, r: impl Fn() -> R) -> Result<R, DataError> {
if key.path() == icu_provider::hello_world::HelloWorldV1Marker::KEY.path() {
let _reified_data: DataPayload<icu_provider::hello_world::HelloWorldV1Marker> = buf.into_deserialized(icu_provider::buf::BufferFormat::Postcard1)?;
return Ok(r());
}
$(
$(
#[cfg($feature)]
Expand All @@ -123,7 +127,7 @@ macro_rules! registry {
}
)+
)+
unreachable!("unregistered marker")
unreachable!("unregistered key {key:?}")
}
}
}
Expand Down
3 changes: 0 additions & 3 deletions provider/datagen/tests/data/json/core/helloworld@1/bn.json

This file was deleted.

3 changes: 0 additions & 3 deletions provider/datagen/tests/data/json/core/helloworld@1/en.json

This file was deleted.

3 changes: 0 additions & 3 deletions provider/datagen/tests/data/json/core/helloworld@1/ja.json

This file was deleted.

3 changes: 0 additions & 3 deletions provider/datagen/tests/data/json/core/helloworld@1/ru.json

This file was deleted.

4 changes: 0 additions & 4 deletions provider/datagen/tests/data/json/fingerprints.csv

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

File renamed without changes.
4 changes: 0 additions & 4 deletions provider/datagen/tests/data/postcard/fingerprints.csv
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,6 @@ compactdecimal/short@1, th, 52B, f0a39a85493a674d
compactdecimal/short@1, th-u-nu-thai, 52B, f0a39a85493a674d
compactdecimal/short@1, tr, 63B, 38574745ff1e12e3
compactdecimal/short@1, und, 52B, c10b79e54779e6bd
core/helloworld@1, bn, 26B, 219e744e649d8150
core/helloworld@1, en, 12B, 6a847ba13c479232
core/helloworld@1, ja, 22B, 9c8d10466cdc04cd
core/helloworld@1, ru, 21B, d84d0724633c97c6
datetime/buddhist/datelengths@1, ar, 165B, c768a2600c7063f2
datetime/buddhist/datelengths@1, ar-EG, 165B, c768a2600c7063f2
datetime/buddhist/datelengths@1, bn, 150B, bc5c367f3d0719cd
Expand Down
Binary file removed provider/datagen/tests/data/testdata.postcard
Binary file not shown.
135 changes: 135 additions & 0 deletions provider/datagen/tests/make-testdata.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu_datagen::fs_exporter::serializers::{Json, Postcard};
use icu_datagen::fs_exporter::*;
use icu_datagen::prelude::*;
use icu_provider::datagen::*;
use icu_provider::prelude::*;
use std::collections::BTreeSet;
use std::path::Path;

#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;

include!("data/locales.rs.data");

#[test]
fn generate_fs_and_verify_zero_copy() {
simple_logger::SimpleLogger::new()
.env()
.with_level(log::LevelFilter::Info)
.init()
.unwrap();

let data_root = Path::new(concat!(core::env!("CARGO_MANIFEST_DIR"), "/tests/data/"));

let source = SourceData::offline()
.with_cldr(data_root.join("cldr"), Default::default())
.unwrap()
.with_icuexport(data_root.join("icuexport"))
.unwrap();

let json_out = Box::new(
FilesystemExporter::try_new(Box::new(Json::pretty()), {
let mut options = ExporterOptions::default();
options.root = data_root.join("json");
options.overwrite = OverwriteOption::RemoveAndReplace;
options.fingerprint = true;
options
})
.unwrap(),
);

let postcard_out = Box::new(
FilesystemExporter::try_new(Box::<Postcard>::default(), {
let mut options = ExporterOptions::default();
options.root = data_root.join("postcard");
options.overwrite = OverwriteOption::RemoveAndReplace;
options.fingerprint = true;
options
})
.unwrap(),
);

let mut options = options::Options::default();
options.locales = options::LocaleInclude::Explicit(LOCALES.iter().cloned().collect());

DatagenProvider::try_new(options, source)
.unwrap()
.export(
icu_datagen::all_keys().into_iter().collect(),
MultiExporter::new(vec![json_out, postcard_out]),
)
.unwrap();

// don't drop to avoid dhat from printing stats at the end
core::mem::forget(dhat::Profiler::new_heap());

// violations for net_bytes_allocated
let mut net_violations = BTreeSet::new();
// violations for total_bytes_allocated (but not net_bytes_allocated)
let mut total_violations = BTreeSet::new();

for key in icu_datagen::all_keys() {
for entry in glob::glob(
&data_root
.join("postcard")
.join(key.path().get())
.join("**/*.postcard")
.display()
.to_string(),
)
.unwrap()
{
let payload = DataPayload::from_owned_buffer(
std::fs::read(&entry.unwrap()).unwrap().into_boxed_slice(),
);

let stats_before = dhat::HeapStats::get();

// We need to generate the stats before the deserialized struct gets dropped, in order
// to distinguish between a temporary and permanent allocation.
let stats_after =
icu_datagen::deserialize_and_discard(key, payload, dhat::HeapStats::get).unwrap();

if stats_after.total_bytes != stats_before.total_bytes {
if stats_after.curr_bytes != stats_before.curr_bytes {
net_violations.insert(key.path().get());
} else {
total_violations.insert(key.path().get());
}
}
}
}

// Types in this list cannot be zero-copy deserialized.
//
// Such types contain some data that was allocated during deserializations
//
// Every entry in this list is a bug that needs to be addressed before ICU4X 1.0.
const EXPECTED_NET_VIOLATIONS: &[&str] = &[
// https://github.com/unicode-org/icu4x/issues/1678
"datetime/skeletons@1",
];

// Types in this list can be zero-copy deserialized (and do not contain allocated data),
// however there is some allocation that occurs during deserialization for validation.
//
// Entries in this list represent a less-than-ideal state of things, however ICU4X is shippable with violations
// in this list since it does not affect databake.
const EXPECTED_TOTAL_VIOLATIONS: &[&str] = &[
// Regex DFAs need to be validated, which involved creating a BTreeMap
"list/and@1",
"list/or@1",
"list/unit@1",
];

assert!(total_violations.iter().eq(EXPECTED_TOTAL_VIOLATIONS.iter()) && net_violations.iter().eq(EXPECTED_NET_VIOLATIONS.iter()),
"Expected violations list does not match found violations!\n\
If the new list is smaller, please update EXPECTED_VIOLATIONS in make-testdata.rs\n\
If it is bigger and that was unexpected, please make sure the key remains zero-copy, or ask ICU4X team members if it is okay\
to temporarily allow for this key to be allowlisted.\n\
Expected (net):\n{EXPECTED_NET_VIOLATIONS:?}\nFound (net):\n{net_violations:?}\nExpected (total):\n{EXPECTED_TOTAL_VIOLATIONS:?}\nFound (total):\n{total_violations:?}");
}
Loading

0 comments on commit 36b17d4

Please sign in to comment.