From c55d58d68381a59360d9a53723f7299c86630c62 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 24 Aug 2023 05:03:47 -0400 Subject: [PATCH] validate the compatibility of output zips produced across benchmark strategies --- Cargo.lock | 81 ++++++++++++++++++++++++++ lib/Cargo.toml | 2 + lib/benches/my_benchmark.rs | 111 +++++++++++++++++++++++++----------- 3 files changed, 162 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 999ec8f..832e7e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -125,6 +125,15 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" version = "3.13.0" @@ -241,6 +250,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "cpufeatures" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.3.2" @@ -331,6 +349,26 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "displaydoc" version = "0.2.4" @@ -493,6 +531,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "gimli" version = "0.27.3" @@ -601,6 +649,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "keccak" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f6d5ed8676d904364de097082f4e7d240b571b67989ced0240f08b7f966f940" +dependencies = [ + "cpufeatures", +] + [[package]] name = "libc" version = "0.2.147" @@ -616,10 +673,12 @@ dependencies = [ "criterion", "displaydoc", "futures", + "generic-array", "once_cell", "parking_lot 0.12.1", "rayon", "regex", + "sha3", "static_init", "tempfile", "thiserror", @@ -1126,6 +1185,16 @@ dependencies = [ "serde", ] +[[package]] +name = "sha3" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" +dependencies = [ + "digest", + "keccak", +] + [[package]] name = "slab" version = "0.4.8" @@ -1328,6 +1397,12 @@ dependencies = [ "winnow", ] +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + [[package]] name = "unicode-ident" version = "1.0.11" @@ -1346,6 +1421,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "walkdir" version = "2.3.3" diff --git a/lib/Cargo.toml b/lib/Cargo.toml index b6fd3e7..920485f 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -31,6 +31,8 @@ zip.workspace = true [dev-dependencies] criterion = { version = "0.5", features = ["async_tokio"] } +generic-array = "0.14.7" +sha3 = "0.10.8" tokio = { workspace = true, features = ["rt-multi-thread"] } walkdir = "2" diff --git a/lib/benches/my_benchmark.rs b/lib/benches/my_benchmark.rs index 3923535..2ce2f8d 100644 --- a/lib/benches/my_benchmark.rs +++ b/lib/benches/my_benchmark.rs @@ -16,7 +16,9 @@ mod parallel_merge { use libmedusa_zip as lib; + use generic_array::{typenum::U32, GenericArray}; use rayon::prelude::*; + use sha3::{Digest, Sha3_256}; use tempfile; use tokio::runtime::Runtime; use walkdir::WalkDir; @@ -24,6 +26,20 @@ mod parallel_merge { use std::{env, fs, io, path::Path, time::Duration}; + fn hash_file_bytes(f: &mut fs::File) -> Result, io::Error> { + use io::{Read, Seek}; + + f.rewind()?; + + let mut hasher = Sha3_256::new(); + let mut buf: Vec = Vec::new(); + /* TODO: how to hash in chunks at a time? */ + f.read_to_end(&mut buf)?; + hasher.update(buf); + + Ok(hasher.finalize()) + } + fn extract_example_zip( target: &Path, ) -> Result<(Vec, tempfile::TempDir), ZipError> { @@ -189,27 +205,27 @@ mod parallel_merge { (0.05, (0.07, 0.2)), SamplingMode::Auto, ), - ( - /* This file is 9.7M. */ - "Babel-2.12.1-py3-none-any.whl", - (1000, (80, 10)), - ( - Duration::from_secs(3), - (Duration::from_secs(35), Duration::from_secs(35)), - ), - /* 50% variation is within noise given our low sample size for the slow sync tests. */ - (0.1, (0.2, 0.5)), - SamplingMode::Flat, - ), - ( - /* This file is 461M, or about half a gigabyte, with multiple individual very - * large binary files. */ - "tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl", - (100, (1, 1)), - (Duration::from_secs(10), (Duration::ZERO, Duration::ZERO)), - (0.1, (0.0, 0.0)), - SamplingMode::Flat, - ), + /* ( */ + /* /\* This file is 9.7M. *\/ */ + /* "Babel-2.12.1-py3-none-any.whl", */ + /* (1000, (80, 10)), */ + /* ( */ + /* Duration::from_secs(3), */ + /* (Duration::from_secs(35), Duration::from_secs(35)), */ + /* ), */ + /* /\* 50% variation is within noise given our low sample size for the slow sync tests. *\/ */ + /* (0.1, (0.2, 0.5)), */ + /* SamplingMode::Flat, */ + /* ), */ + /* ( */ + /* /\* This file is 461M, or about half a gigabyte, with multiple individual very */ + /* * large binary files. *\/ */ + /* "tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl", */ + /* (100, (1, 1)), */ + /* (Duration::from_secs(10), (Duration::ZERO, Duration::ZERO)), */ + /* (0.1, (0.0, 0.0)), */ + /* SamplingMode::Flat, */ + /* ), */ ] .iter() { @@ -224,13 +240,22 @@ mod parallel_merge { zip_len ); - group.throughput(Throughput::Bytes(zip_len as u64)); + group + .sampling_mode(*mode) + .throughput(Throughput::Bytes(zip_len as u64)); /* FIXME: assigning `_` to the second arg of this tuple will destroy the * extract dir, which is only a silent error producing an empty file!!! * AWFUL UX!!! */ let (input_files, extracted_dir) = extract_example_zip(&target).unwrap(); + /* Compare the outputs of the two types of crawls. */ + let medusa_crawl_result = rt + .block_on(execute_medusa_crawl(extracted_dir.path())) + .unwrap(); + let sync_crawl_result = execute_basic_crawl(extracted_dir.path()).unwrap(); + assert_eq!(medusa_crawl_result, sync_crawl_result); + /* Run the parallel filesystem crawl. */ group .sample_size(*n_crawl) @@ -248,48 +273,70 @@ mod parallel_merge { BenchmarkId::new(&id, ""), |b| b.iter(|| execute_basic_crawl(extracted_dir.path())), ); - /* Compare the outputs of the two crawls. */ - let medusa_crawl_result = rt - .block_on(execute_medusa_crawl(extracted_dir.path())) - .unwrap(); - let sync_crawl_result = execute_basic_crawl(extracted_dir.path()).unwrap(); - assert_eq!(medusa_crawl_result, sync_crawl_result); if env::var_os("ONLY_CRAWL").is_some() { continue; } - group.sampling_mode(*mode); - /* Run the parallel implementation. */ - let parallelism = lib::zip::Parallelism::ParallelMerge; group .sample_size(*n_p) .measurement_time(*t_p) .noise_threshold(*noise_p); + let parallelism = lib::zip::Parallelism::ParallelMerge; group.bench_with_input(BenchmarkId::new(&id, parallelism), ¶llelism, |b, p| { b.to_async(&rt) .iter(|| execute_medusa_zip(input_files.clone(), *p)); }); + let mut canonical_parallel_output = rt.block_on( + execute_medusa_zip(input_files.clone(), parallelism) + ).unwrap().into_inner(); + let canonical_parallel_output = hash_file_bytes(&mut canonical_parallel_output).unwrap(); /* Run the sync implementation. */ if env::var_os("NO_SYNC").is_none() { - let parallelism = lib::zip::Parallelism::Synchronous; group .sample_size(*n_sync) .measurement_time(*t_sync) .noise_threshold(*noise_sync); + /* FIXME: this takes >3x as long as sync zip! */ + /* Run the async version, but without any fancy queueing. */ + let parallelism = lib::zip::Parallelism::Synchronous; group.bench_with_input(BenchmarkId::new(&id, parallelism), ¶llelism, |b, p| { b.to_async(&rt) .iter(|| execute_medusa_zip(input_files.clone(), *p)); }); + let canonical_sync = rt.block_on( + execute_medusa_zip(input_files.clone(), parallelism) + ).unwrap(); + let mut canonical_sync_filenames: Vec<_> = canonical_sync.file_names() + .map(|s| s.to_string()).collect(); + canonical_sync_filenames.par_sort_unstable(); + let mut canonical_sync = canonical_sync.into_inner(); + let canonical_sync = hash_file_bytes(&mut canonical_sync).unwrap(); + assert_eq!(canonical_parallel_output, canonical_sync); + /* Run the implementation based only off of the zip crate. We reuse the same * sampling presets under the assumption it will have a very similar * runtime. */ group.bench_function(BenchmarkId::new(&id, ""), |b| { b.iter(|| execute_basic_zip(input_files.clone())); }); + + let canonical_basic = execute_basic_zip(input_files.clone()).unwrap(); + /* We can't match our medusa zip file byte-for-byte against the zip crate version, but we + * can at least check that they have the same filenames. */ + let mut canonical_basic_filenames: Vec<_> = canonical_basic.file_names() + .map(|s| s.to_string()).collect(); + canonical_basic_filenames.par_sort_unstable(); + /* NB: the zip crate basic impl does not introduce directory entries, so we have to remove + * them here from the medusa zip to check equality. */ + let canonical_sync_filenames: Vec<_> = canonical_sync_filenames + .into_par_iter() + .filter(|name| !name.ends_with('/')) + .collect(); + assert_eq!(canonical_sync_filenames, canonical_basic_filenames); } }