parallel/pipelined extraction #208

Closed · wants to merge 3 commits
9 changes: 9 additions & 0 deletions Cargo.toml
@@ -52,6 +52,9 @@ lzma-rs = { version = "0.3.0", default-features = false, optional = true }
[target.'cfg(any(all(target_arch = "arm", target_pointer_width = "32"), target_arch = "mips", target_arch = "powerpc"))'.dependencies]
crossbeam-utils = "0.8.20"

[target.'cfg(unix)'.dependencies]
libc = { version = "0.2.155", optional = true }

[target.'cfg(fuzzing)'.dependencies]
arbitrary = { version = "1.3.2", features = ["derive"] }

@@ -63,6 +66,7 @@ time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1"
clap = { version = "=4.4.18", features = ["derive"] }
tempdir = "0.3.7"
tempfile = "3.10.1"

[features]
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
@@ -79,6 +83,7 @@ deflate-zopfli = ["zopfli", "_deflate-any"]
lzma = ["lzma-rs/stream"]
unreserved = []
xz = ["lzma-rs/raw_decoder"]
parallelism = ["libc"]
default = [
"aes-crypto",
"bzip2",
@@ -101,3 +106,7 @@ harness = false
[[bench]]
name = "merge_archive"
harness = false

[[bench]]
name = "extract"
harness = false
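
For reference, a downstream crate would opt into the new feature through its own Cargo.toml. A minimal sketch: the parallelism feature name comes from the diff above, while the version requirement is illustrative.

    [dependencies]
    # "parallelism" enables pipelined extraction and pulls in libc on unix targets.
    zip = { version = "2", features = ["parallelism"] }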
86 changes: 86 additions & 0 deletions benches/extract.rs
@@ -0,0 +1,86 @@
use bencher::{benchmark_group, benchmark_main};

use bencher::Bencher;
use tempdir::TempDir;

use std::fs;
use std::path::Path;

use zip::result::ZipResult;
use zip::ZipArchive;

#[cfg(all(feature = "parallelism", unix))]
use zip::read::{split_extract, ExtractionParameters};

/* This archive has a set of entries repeated 20x:
 * - 200K random data, stored uncompressed (CompressionMethod::Stored)
 * - 246K text data (the Project Gutenberg HTML version of King Lear)
 *   (CompressionMethod::Bzip2, compression level 1) (Project Gutenberg ebooks are public domain)
 *
 * The full archive file is 5.3MB.
 */
fn get_test_archive() -> ZipResult<ZipArchive<fs::File>> {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/data/stored-and-compressed-text.zip");
let file = fs::File::open(path)?;
ZipArchive::new(file)
}

fn extract_basic(bench: &mut Bencher) {
let mut readable_archive = get_test_archive().unwrap();
let total_size: u64 = readable_archive
.decompressed_size()
.unwrap()
.try_into()
.unwrap();

let parent = TempDir::new("zip-extract").unwrap();

bench.bytes = total_size;
bench.bench_n(1, |bench| {
bench.iter(move || {
let outdir = TempDir::new_in(parent.path(), "bench-subdir")
.unwrap()
.into_path();
readable_archive.extract(outdir).unwrap();
});
});
}

#[cfg(all(feature = "parallelism", unix))]
const DECOMPRESSION_THREADS: usize = 8;

#[cfg(all(feature = "parallelism", unix))]
fn extract_split(bench: &mut Bencher) {
let readable_archive = get_test_archive().unwrap();
let total_size: u64 = readable_archive
.decompressed_size()
.unwrap()
.try_into()
.unwrap();

let params = ExtractionParameters {
decompression_threads: DECOMPRESSION_THREADS,
..Default::default()
};

let parent = TempDir::new("zip-extract").unwrap();

bench.bytes = total_size;
bench.bench_n(1, |bench| {
bench.iter(move || {
let outdir = TempDir::new_in(parent.path(), "bench-subdir")
.unwrap()
.into_path();
split_extract(&readable_archive, &outdir, params.clone()).unwrap();
});
});
}

#[cfg(not(all(feature = "parallelism", unix)))]
benchmark_group!(benches, extract_basic);

#[cfg(all(feature = "parallelism", unix))]
benchmark_group!(benches, extract_basic, extract_split);

benchmark_main!(benches);
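
Outside the bench harness, a direct call to the new API might look like the sketch below. The names, the ExtractionParameters field, and the three-argument call shape are taken from the bench above; the archive path and thread count are illustrative, and like the bench itself this only applies under cfg(all(feature = "parallelism", unix)).

    use std::fs;
    use std::path::Path;

    use zip::read::{split_extract, ExtractionParameters};
    use zip::ZipArchive;

    fn main() {
        let file = fs::File::open("archive.zip").expect("open archive");
        let archive = ZipArchive::new(file).expect("read central directory");

        // Tune only the decompression thread count; other knobs keep their defaults.
        let params = ExtractionParameters {
            decompression_threads: 8,
            ..Default::default()
        };

        // Unlike ZipArchive::extract(), split_extract() takes the archive by shared
        // reference, so the same handle can be reused after extraction.
        split_extract(&archive, Path::new("out"), params).expect("extraction failed");
    }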
10 changes: 10 additions & 0 deletions src/read.rs
@@ -51,6 +51,13 @@
#[cfg(feature = "xz")]
pub(crate) mod xz;

#[cfg(feature = "parallelism")]
pub(crate) mod pipelining;
#[cfg(all(unix, feature = "parallelism"))]
pub use pipelining::split_extraction::{split_extract, ExtractionParameters, SplitExtractionError};
#[cfg(feature = "parallelism")]
pub(crate) mod split;

// Put the struct declaration in a private module to convince rustdoc to display ZipArchive nicely
pub(crate) mod zip_archive {
use indexmap::IndexMap;
@@ -1076,6 +1083,9 @@

fn make_writable_dir_all<T: AsRef<Path>>(outpath: T) -> Result<(), ZipError> {
create_dir_all(outpath.as_ref())?;
/* TODO: do we want to automatically make the directory writable? Wouldn't we prefer to
 * respect the write permissions of the extraction dir? Pipelined extraction does not
 * mutate permissions like this. */
Comment on lines +1086 to +1088

Member:

Suggested change (delete the TODO comment above):
    /* TODO: do we want to automatically make the directory writable? Wouldn't we prefer to
     * respect the write permissions of the extraction dir? Pipelined extraction does not
     * mutate permissions like this. */

We need to make the directory temporarily writable, in case it contains files that we need to extract. https://github.com/zip-rs/zip2/blob/master/tests/repro_old423.rs would break otherwise, because it contains a non-empty and non-writable folder.

Contributor Author:

Ok, that makes sense. I was being lazy here 😅. The permissions mechanism perms_todo totally works to solve this (patterned after the existing .extract() code), but I was hoping to avoid handling perms like that. However, since this explicitly supports only #[cfg(unix)] targets for now (I'm not sure how to achieve something like pread() on Windows), it might not be as hard as I thought. This should be easy to integrate, thanks.

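For readers unfamiliar with it, pread() is a positioned read: it takes an explicit file offset and leaves the descriptor's cursor untouched, so multiple threads can read disjoint regions of one open archive without coordinating seeks. A minimal sketch using the libc crate, not code from this PR:

    use std::{fs::File, io, os::unix::io::AsRawFd};

    /// Positioned read: fetch bytes at `offset` without moving the fd's cursor.
    /// Sketch only; a real implementation would retry on short reads and EINTR.
    fn read_at(file: &File, buf: &mut [u8], offset: u64) -> io::Result<usize> {
        let n = unsafe {
            libc::pread(
                file.as_raw_fd(),
                buf.as_mut_ptr() as *mut libc::c_void,
                buf.len(),
                offset as libc::off_t,
            )
        };
        if n < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(n as usize)
        }
    }

The standard library's std::os::unix::fs::FileExt::read_at is a safe wrapper over the same call.
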
Contributor Author:

Ok, so I just added a pipelined version of that test, and it seems to work: 4bbc351. In both the current .extract() method and pipelined extraction, we only apply perms after all the files and directories are created and written. I'm leaning more towards not trying to circumvent the permissions of existing directories on disk, since I think it's very surprising for a non-writable directory to become writable just because we (e.g. accidentally) extracted a zip file into it.

Member:

Makes sense; I agree that only new directories should be temporarily writable.

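The approach agreed on above (create new directories writable, extract everything, then apply the archive's recorded permissions in a final pass) might be sketched as follows. This is an illustration of the idea only, not the PR's actual perms_todo mechanism:

    use std::{fs, io, os::unix::fs::PermissionsExt, path::PathBuf};

    /// Sketch: directories are created writable first; the archive's real modes
    /// are only applied after every regular file has been written.
    fn apply_dir_perms_last(mut dirs: Vec<(PathBuf, u32)>) -> io::Result<()> {
        for (path, _) in &dirs {
            fs::create_dir_all(path)?; // new dirs start writable (modulo umask)
        }

        // ... extract all regular files into the now-writable tree here ...

        // Deepest paths first, so making a parent read-only can never block
        // setting the permissions of a child underneath it.
        dirs.sort_by_key(|(path, _)| std::cmp::Reverse(path.components().count()));
        for (path, mode) in dirs {
            fs::set_permissions(&path, fs::Permissions::from_mode(mode))?;
        }
        Ok(())
    }
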
#[cfg(unix)]
{
// Dirs must be writable until all normal files are extracted
@@ -1604,7 +1614,7 @@
/// `foo/../bar` as `foo/bar` (instead of `bar`). Because of this,
/// [`ZipFile::enclosed_name`] is the better option in most scenarios.
///
/// [`ParentDir`]: `Component::ParentDir`

Check warning on line 1617 in src/read.rs (GitHub Actions / style_and_docs, with default settings, --no-default-features, and --all-features): unresolved link to `Component::ParentDir`

pub fn mangled_name(&self) -> PathBuf {
self.data.file_name_sanitized()
}
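
To make the doc comment concrete, a hypothetical check: the foo/../bar behavior is stated in the doc above, and enclosed_name() is documented to reject names containing a .. component.

    use std::path::PathBuf;

    // Hypothetical: the first entry's raw name in this archive is "foo/../bar".
    fn name_handling(archive: &mut zip::ZipArchive<std::fs::File>) -> zip::result::ZipResult<()> {
        let entry = archive.by_index(0)?;
        // mangled_name() drops the ".." component instead of resolving it, so the
        // raw name comes back as "foo/bar" rather than "bar".
        assert_eq!(entry.mangled_name(), PathBuf::from("foo/bar"));
        // enclosed_name() refuses names containing "..", returning None, which is
        // why the doc comment recommends it in most scenarios.
        assert!(entry.enclosed_name().is_none());
        Ok(())
    }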