
pipelined extraction #236

Open · wants to merge 10 commits into base: master
11 changes: 10 additions & 1 deletion Cargo.toml
@@ -52,6 +52,9 @@ lzma-rs = { version = "0.3", default-features = false, optional = true }
[target.'cfg(any(all(target_arch = "arm", target_pointer_width = "32"), target_arch = "mips", target_arch = "powerpc"))'.dependencies]
crossbeam-utils = "0.8.20"

[target.'cfg(unix)'.dependencies]
libc = { version = "0.2.155", optional = true }

[target.'cfg(fuzzing)'.dependencies]
arbitrary = { version = "1.3.2", features = ["derive"] }

@@ -62,7 +65,8 @@ walkdir = "2.5"
time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1.0.60"
clap = { version = "=4.4.18", features = ["derive"] }
tempfile = "3.8"
tempdir = "0.3.7"
tempfile = "3.10.1"

[features]
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
@@ -79,6 +83,7 @@ deflate-zopfli = ["zopfli", "_deflate-any"]
lzma = ["lzma-rs/stream"]
unreserved = []
xz = ["lzma-rs/raw_decoder"]
parallelism = ["libc"]
default = [
"aes-crypto",
"bzip2",
@@ -101,3 +106,7 @@ harness = false
[[bench]]
name = "merge_archive"
harness = false

[[bench]]
name = "extract"
harness = false
86 changes: 86 additions & 0 deletions benches/extract.rs
@@ -0,0 +1,86 @@
use bencher::{benchmark_group, benchmark_main};

use bencher::Bencher;
use tempdir::TempDir;

use std::fs;
use std::path::Path;

use zip::result::ZipResult;
use zip::ZipArchive;

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
use zip::read::{split_extract, ExtractionParameters};

/* This archive contains a set of entries repeated 20x:
 * - 200K of random data, stored uncompressed (CompressionMethod::Stored)
 * - 246K of text data (the Project Gutenberg HTML edition of King Lear)
 *   (CompressionMethod::Bzip2, compression level 1; Project Gutenberg ebooks are public domain)
 *
 * The full archive file is 5.3MB.
 */
fn get_test_archive() -> ZipResult<ZipArchive<fs::File>> {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/data/stored-and-compressed-text.zip");
let file = fs::File::open(path)?;
ZipArchive::new(file)
}

fn extract_basic(bench: &mut Bencher) {
let mut readable_archive = get_test_archive().unwrap();
let total_size: u64 = readable_archive
.decompressed_size()
.unwrap()
.try_into()
.unwrap();

let parent = TempDir::new("zip-extract").unwrap();

bench.bytes = total_size;
bench.bench_n(1, |bench| {
bench.iter(move || {
let outdir = TempDir::new_in(parent.path(), "bench-subdir")
.unwrap()
.into_path();
readable_archive.extract(outdir).unwrap();
});
});
}

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
const DECOMPRESSION_THREADS: usize = 8;

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
fn extract_split(bench: &mut Bencher) {
let readable_archive = get_test_archive().unwrap();
let total_size: u64 = readable_archive
.decompressed_size()
.unwrap()
.try_into()
.unwrap();

let params = ExtractionParameters {
decompression_threads: DECOMPRESSION_THREADS,
..Default::default()
};

let parent = TempDir::new("zip-extract").unwrap();

bench.bytes = total_size;
bench.bench_n(1, |bench| {
bench.iter(move || {
let outdir = TempDir::new_in(parent.path(), "bench-subdir")
.unwrap()
.into_path();
split_extract(&readable_archive, &outdir, params.clone()).unwrap();
});
});
}

#[cfg(not(all(feature = "parallelism", feature = "bzip2", unix)))]
benchmark_group!(benches, extract_basic);

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
benchmark_group!(benches, extract_basic, extract_split);

benchmark_main!(benches);
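For orientation (not part of the diff itself): a minimal sketch of how the new `split_extract` entry point might be called outside the benchmark harness. It assumes the `parallelism` feature on a Unix target; the helper function name and the thread count are illustrative, and only the `decompression_threads` field is shown because that is the only field the benchmark above sets.

```rust
// Illustrative sketch only (hypothetical helper); mirrors the calls made in
// benches/extract.rs above and assumes the `parallelism` feature on Unix.
#[cfg(all(unix, feature = "parallelism"))]
fn extract_with_pipelining(archive_path: &std::path::Path, out_dir: &std::path::Path) {
    use std::fs;
    use zip::read::{split_extract, ExtractionParameters};
    use zip::ZipArchive;

    let archive = ZipArchive::new(fs::File::open(archive_path).unwrap()).unwrap();

    // ExtractionParameters implements Default; only the decompression thread
    // count is overridden here, exactly as the benchmark does.
    let params = ExtractionParameters {
        decompression_threads: 4,
        ..Default::default()
    };

    // split_extract takes the archive by shared reference, so the same handle
    // remains available after extraction.
    split_extract(&archive, out_dir, params).unwrap();
}
```

With the `[[bench]]` entry added to Cargo.toml, the comparison presumably runs with something like `cargo bench --bench extract --features parallelism` on a Unix host (`bzip2` is already in the default feature set); without those features only `extract_basic` is registered.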
7 changes: 7 additions & 0 deletions src/read.rs
@@ -40,6 +40,13 @@

pub(crate) mod magic_finder;

#[cfg(feature = "parallelism")]
pub(crate) mod pipelining;
#[cfg(all(unix, feature = "parallelism"))]
pub use pipelining::split_extraction::{split_extract, ExtractionParameters, SplitExtractionError};
#[cfg(feature = "parallelism")]
pub(crate) mod split;

// Put the struct declaration in a private module to convince rustdoc to display ZipArchive nicely
pub(crate) mod zip_archive {
use indexmap::IndexMap;
@@ -1380,7 +1387,7 @@
/// `foo/../bar` as `foo/bar` (instead of `bar`). Because of this,
/// [`ZipFile::enclosed_name`] is the better option in most scenarios.
///
/// [`ParentDir`]: `PathBuf::Component::ParentDir`

Check warning on line 1390 in src/read.rs (GitHub Actions / style_and_docs, with default, --no-default-features, and --all-features): unresolved link to `PathBuf::Component::ParentDir`
pub fn mangled_name(&self) -> PathBuf {
self.get_metadata().file_name_sanitized()
}