
pipelined extraction #236

Open · wants to merge 10 commits into base: master
11 changes: 10 additions & 1 deletion Cargo.toml
@@ -52,6 +52,9 @@ lzma-rs = { version = "0.3", default-features = false, optional = true }
[target.'cfg(any(all(target_arch = "arm", target_pointer_width = "32"), target_arch = "mips", target_arch = "powerpc"))'.dependencies]
crossbeam-utils = "0.8.20"

[target.'cfg(unix)'.dependencies]
libc = { version = "0.2.155", optional = true }

[target.'cfg(fuzzing)'.dependencies]
arbitrary = { version = "1.3.2", features = ["derive"] }

@@ -62,7 +65,8 @@ walkdir = "2.5"
time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1.0.60"
clap = { version = "=4.4.18", features = ["derive"] }
tempfile = "3.8"
tempdir = "0.3.7"
tempfile = "3.10.1"

[features]
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
@@ -79,6 +83,7 @@ deflate-zopfli = ["zopfli", "_deflate-any"]
lzma = ["lzma-rs/stream"]
unreserved = []
xz = ["lzma-rs/raw_decoder"]
parallelism = ["libc"]
default = [
"aes-crypto",
"bzip2",
@@ -101,3 +106,7 @@ harness = false
[[bench]]
name = "merge_archive"
harness = false

[[bench]]
name = "extract"
harness = false
86 changes: 86 additions & 0 deletions benches/extract.rs
@@ -0,0 +1,86 @@
use bencher::{benchmark_group, benchmark_main};

use bencher::Bencher;
use tempdir::TempDir;

use std::fs;
use std::path::Path;

use zip::result::ZipResult;
use zip::ZipArchive;

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
use zip::read::{split_extract, ExtractionParameters};

/* This archive contains a set of entries repeated 20x:
 * - 200K of random data, stored uncompressed (CompressionMethod::Stored)
 * - 246K of text data (the Project Gutenberg HTML edition of King Lear)
 *   (CompressionMethod::Bzip2, compression level 1; Project Gutenberg ebooks are public domain)
 *
 * The full archive file is 5.3MB.
 */
fn get_test_archive() -> ZipResult<ZipArchive<fs::File>> {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/data/stored-and-compressed-text.zip");
let file = fs::File::open(path)?;
ZipArchive::new(file)
}

fn extract_basic(bench: &mut Bencher) {
let mut readable_archive = get_test_archive().unwrap();
let total_size: u64 = readable_archive
.decompressed_size()
.unwrap()
.try_into()
.unwrap();

let parent = TempDir::new("zip-extract").unwrap();

bench.bytes = total_size;
bench.bench_n(1, |bench| {
bench.iter(move || {
let outdir = TempDir::new_in(parent.path(), "bench-subdir")
.unwrap()
.into_path();
readable_archive.extract(outdir).unwrap();
});
});
}

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
const DECOMPRESSION_THREADS: usize = 8;

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
fn extract_split(bench: &mut Bencher) {
let readable_archive = get_test_archive().unwrap();
let total_size: u64 = readable_archive
.decompressed_size()
.unwrap()
.try_into()
.unwrap();

let params = ExtractionParameters {
decompression_threads: DECOMPRESSION_THREADS,
..Default::default()
};

let parent = TempDir::new("zip-extract").unwrap();

bench.bytes = total_size;
bench.bench_n(1, |bench| {
bench.iter(move || {
let outdir = TempDir::new_in(parent.path(), "bench-subdir")
.unwrap()
.into_path();
split_extract(&readable_archive, &outdir, params.clone()).unwrap();
});
});
}

#[cfg(not(all(feature = "parallelism", feature = "bzip2", unix)))]
benchmark_group!(benches, extract_basic);

#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
benchmark_group!(benches, extract_basic, extract_split);

benchmark_main!(benches);
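For orientation (not part of the diff itself): a minimal sketch of how the new `split_extract` entry point might be called outside the benchmark harness. It assumes the `parallelism` feature on a Unix target; the helper function name and the thread count are illustrative, and only the `decompression_threads` field is shown because that is the only field the benchmark above sets.

```rust
// Illustrative sketch only (hypothetical helper); mirrors the calls made in
// benches/extract.rs above and assumes the `parallelism` feature on Unix.
#[cfg(all(unix, feature = "parallelism"))]
fn extract_with_pipelining(archive_path: &std::path::Path, out_dir: &std::path::Path) {
    use std::fs;
    use zip::read::{split_extract, ExtractionParameters};
    use zip::ZipArchive;

    let archive = ZipArchive::new(fs::File::open(archive_path).unwrap()).unwrap();

    // ExtractionParameters implements Default; only the decompression thread
    // count is overridden here, exactly as the benchmark does.
    let params = ExtractionParameters {
        decompression_threads: 4,
        ..Default::default()
    };

    // split_extract takes the archive by shared reference, so the same handle
    // remains available after extraction.
    split_extract(&archive, out_dir, params).unwrap();
}
```

With the `[[bench]]` entry added to Cargo.toml, the comparison presumably runs with something like `cargo bench --bench extract --features parallelism` on a Unix host (`bzip2` is already in the default feature set); without those features only `extract_basic` is registered.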
7 changes: 7 additions & 0 deletions src/read.rs
@@ -40,6 +40,13 @@

pub(crate) mod magic_finder;

#[cfg(feature = "parallelism")]
pub(crate) mod pipelining;
#[cfg(all(unix, feature = "parallelism"))]
pub use pipelining::split_extraction::{split_extract, ExtractionParameters, SplitExtractionError};
#[cfg(feature = "parallelism")]
pub(crate) mod split;

// Put the struct declaration in a private module to convince rustdoc to display ZipArchive nicely
pub(crate) mod zip_archive {
use indexmap::IndexMap;
@@ -1380,7 +1387,7 @@
/// `foo/../bar` as `foo/bar` (instead of `bar`). Because of this,
/// [`ZipFile::enclosed_name`] is the better option in most scenarios.
///
/// [`ParentDir`]: `PathBuf::Component::ParentDir`

Check warning on line 1390 in src/read.rs (GitHub Actions / style_and_docs, with default, --no-default-features, and --all-features): unresolved link to `PathBuf::Component::ParentDir`
pub fn mangled_name(&self) -> PathBuf {
self.get_metadata().file_name_sanitized()
}