From 58b088c4bdeb920b28cdb78d103499bbd96c85c3 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 11 Aug 2024 15:46:58 -0400 Subject: [PATCH 01/68] add new zip-cli crate in workspace - implement absurd arg parsing - add help text - add long help text - first iteration of compression - add --stdout - add large file support - make the compress command mostly work - make compress flags work better - verbose output works! - reduce size of zip-cli from 2.9M->1.3M - make a new subcrate clite for different features/opt flags - remove clap - set --large-file automatically - clarify info and extract are a TODO for now - move OutputHandle to lib.rs - factor out main method into lib.rs - clarify the behavior of -f and -r around symlinks - add --append option - rename CompressError -> CommandError - make much more subcommand logic generic through traits - wrap compress help to 80 chars - begin extract help text --- Cargo.toml | 5 +- cli/Cargo.toml | 63 + cli/clite/Cargo.toml | 35 + cli/clite/src/main.rs | 3 + cli/src/args.rs | 1966 +++++++++++++++++++++++++++++++ cli/src/compress.rs | 494 ++++++++ cli/src/extract.rs | 192 +++ cli/src/lib.rs | 167 +++ cli/src/main.rs | 3 + fuzz/fuzz_targets/fuzz_write.rs | 162 ++- src/write.rs | 2 + 11 files changed, 3048 insertions(+), 44 deletions(-) create mode 100644 cli/Cargo.toml create mode 100644 cli/clite/Cargo.toml create mode 100644 cli/clite/src/main.rs create mode 100644 cli/src/args.rs create mode 100644 cli/src/compress.rs create mode 100644 cli/src/extract.rs create mode 100644 cli/src/lib.rs create mode 100644 cli/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index c5405c9d5..e7a599015 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ authors = [ license = "MIT" repository = "https://github.com/zip-rs/zip2.git" keywords = ["zip", "archive", "compression"] +categories = ["compression", "filesystem", "parser-implementations"] rust-version = "1.73.0" description 
= """ Library to support the reading and writing of zip files. @@ -23,7 +24,9 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [workspace.dependencies] +arbitrary = { version = "1.3.2", features = ["derive"] } time = { version = "0.3.36", default-features = false } +zip = { path = ".", default-features = false } [dependencies] aes = { version = "0.8.4", optional = true } @@ -53,7 +56,7 @@ lzma-rs = { version = "0.3.0", default-features = false, optional = true } crossbeam-utils = "0.8.20" [target.'cfg(fuzzing)'.dependencies] -arbitrary = { version = "1.3.2", features = ["derive"] } +arbitrary.workspace = true [dev-dependencies] bencher = "0.1.5" diff --git a/cli/Cargo.toml b/cli/Cargo.toml new file mode 100644 index 000000000..d787a5048 --- /dev/null +++ b/cli/Cargo.toml @@ -0,0 +1,63 @@ +[package] +name = "zip-cli" +version = "0.0.1" +authors = [ + "Danny McClanahan ", +] +license = "MIT" +repository = "https://github.com/zip-rs/zip2.git" +keywords = ["zip", "archive", "compression", "cli"] +categories = ["command-line-utilities", "compression", "filesystem", "development-tools::build-utils"] +rust-version = "1.74.0" +description = """ +Binary for creation and manipulation of zip files. +""" +edition = "2021" + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[lib] + +[[bin]] +name = "zip-cli" + +[dependencies] + +[dependencies.zip] +path = ".." 
+default-features = false + +[features] +aes-crypto = ["zip/aes-crypto"] +bzip2 = ["zip/bzip2"] +chrono = ["zip/chrono"] +deflate64 = ["zip/deflate64"] +deflate = ["zip/deflate"] +deflate-flate2 = ["zip/deflate-flate2"] +deflate-zlib = ["zip/deflate-zlib"] +deflate-zlib-ng = ["zip/deflate-zlib-ng"] +deflate-zopfli = ["zip/deflate-zopfli"] +lzma = ["zip/lzma"] +time = ["zip/time"] +xz = ["zip/xz"] +zstd = ["zip/zstd"] + +default = [ + "aes-crypto", + "bzip2", + "deflate64", + "deflate", + "lzma", + "time", + "xz", + "zstd", +] + + +[profile.release] +strip = true +lto = true +opt-level = 3 +codegen-units = 1 diff --git a/cli/clite/Cargo.toml b/cli/clite/Cargo.toml new file mode 100644 index 000000000..607bf3314 --- /dev/null +++ b/cli/clite/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "zip-clite" +version = "0.0.1" +authors = [ + "Danny McClanahan ", +] +license = "MIT" +repository = "https://github.com/zip-rs/zip2.git" +keywords = ["zip", "archive", "compression", "cli"] +categories = ["command-line-utilities", "compression", "filesystem", "development-tools::build-utils"] +rust-version = "1.74.0" +description = """ +Binary for creation and manipulation of zip files. +""" +edition = "2021" + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "zip-clite" + +[dependencies] + +[dependencies.zip-cli] +path = ".." 
+default-features = false +features = ["deflate-flate2", "deflate-zlib"] + +[profile.release] +strip = true +lto = true +opt-level = "s" +codegen-units = 1 diff --git a/cli/clite/src/main.rs b/cli/clite/src/main.rs new file mode 100644 index 000000000..95fae2ac9 --- /dev/null +++ b/cli/clite/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + zip_cli::driver::main(); +} diff --git a/cli/src/args.rs b/cli/src/args.rs new file mode 100644 index 000000000..91070ecdf --- /dev/null +++ b/cli/src/args.rs @@ -0,0 +1,1966 @@ +use std::{collections::VecDeque, ffi::OsString, sync::OnceLock}; + +#[derive(Debug)] +pub enum ArgParseError { + StdoutMessage(String), + StderrMessage(String), +} + +#[derive(Debug)] +pub struct ZipCli { + pub verbose: bool, + pub command: ZipCommand, +} + +#[derive(Debug)] +enum SubcommandName { + Compress, + Info, + Extract, +} + +static PARSED_EXE_NAME: OnceLock = OnceLock::new(); + +impl ZipCli { + const VERSION: &'static str = env!("CARGO_PKG_VERSION"); + const DESCRIPTION: &'static str = env!("CARGO_PKG_DESCRIPTION"); + + pub const INTERNAL_ERROR_EXIT_CODE: i32 = 3; + pub const ARGV_PARSE_FAILED_EXIT_CODE: i32 = 2; + pub const NON_FAILURE_EXIT_CODE: i32 = 0; + + pub fn binary_name() -> &'static str { + PARSED_EXE_NAME.get().expect("binary name was not set yet") + } + + fn generate_version_text() -> String { + format!("{} {}\n", Self::binary_name(), Self::VERSION) + } + + fn generate_usage_line() -> String { + format!("Usage: {} [OPTIONS] ", Self::binary_name()) + } + + fn generate_full_help_text() -> String { + format!( + "\ +{} + +{} + +Commands: + {}{}{} + {} {} + {}{}{} + +Options: + -v, --verbose Write information logs to stderr + -h, --help Print help + -V, --version Print version +", + Self::DESCRIPTION, + Self::generate_usage_line(), + compress::Compress::COMMAND_NAME, + compress::Compress::COMMAND_TABS, + compress::Compress::COMMAND_DESCRIPTION, + info::Info::COMMAND_NAME, + info::Info::COMMAND_DESCRIPTION, + extract::Extract::COMMAND_NAME, + 
extract::Extract::COMMAND_TABS, + extract::Extract::COMMAND_DESCRIPTION, + ) + } + + fn generate_brief_help_text(context: &str) -> String { + format!( + "\ +error: {context} + +{} + +For more information, try '--help'. +", + Self::generate_usage_line() + ) + } + + fn parse_up_to_subcommand_name( + argv: &mut VecDeque, + ) -> Result<(bool, SubcommandName), ArgParseError> { + let mut verbose: bool = false; + let mut subcommand_name: Option = None; + while subcommand_name.is_none() { + match argv.pop_front() { + None => { + let help_text = Self::generate_full_help_text(); + return Err(ArgParseError::StderrMessage(help_text)); + } + Some(arg) => match arg.as_encoded_bytes() { + b"-v" | b"--verbose" => verbose = true, + b"-V" | b"--version" => { + let version_text = Self::generate_version_text(); + return Err(ArgParseError::StdoutMessage(version_text)); + } + b"-h" | b"--help" => { + let help_text = Self::generate_full_help_text(); + return Err(ArgParseError::StdoutMessage(help_text)); + } + b"compress" => subcommand_name = Some(SubcommandName::Compress), + b"info" => subcommand_name = Some(SubcommandName::Info), + b"extract" => subcommand_name = Some(SubcommandName::Extract), + arg_bytes => { + let context = if arg_bytes.starts_with(b"-") { + format!("unrecognized global flag {arg:?}") + } else { + format!("unrecognized subcommand name {arg:?}") + }; + let help_text = Self::generate_brief_help_text(&context); + return Err(ArgParseError::StderrMessage(help_text)); + } + }, + } + } + Ok((verbose, subcommand_name.unwrap())) + } + + pub fn parse_argv(argv: impl IntoIterator) -> Result { + let mut argv: VecDeque = argv.into_iter().collect(); + let exe_name: String = argv + .pop_front() + .expect("exe name not on command line") + .into_string() + .expect("exe name not valid unicode"); + PARSED_EXE_NAME + .set(exe_name) + .expect("exe name already written"); + let (verbose, subcommand_name) = Self::parse_up_to_subcommand_name(&mut argv)?; + let command = match subcommand_name 
{ + SubcommandName::Info => ZipCommand::Info, + SubcommandName::Extract => ZipCommand::Extract(extract::Extract::parse_argv(argv)?), + SubcommandName::Compress => ZipCommand::Compress(compress::Compress::parse_argv(argv)?), + }; + Ok(Self { verbose, command }) + } +} + +#[derive(Debug)] +pub enum ZipCommand { + Compress(compress::Compress), + Info, + Extract(extract::Extract), +} + +pub trait CommandFormat { + const COMMAND_NAME: &'static str; + const COMMAND_TABS: &'static str; + const COMMAND_DESCRIPTION: &'static str; + + const USAGE_LINE: &'static str; + + fn generate_usage_line() -> String { + format!( + "Usage: {} {} {}", + ZipCli::binary_name(), + Self::COMMAND_NAME, + Self::USAGE_LINE, + ) + } + + fn generate_help() -> String; + + fn generate_full_help_text() -> String { + format!( + "\ +{} + +{} +{}", + Self::COMMAND_DESCRIPTION, + Self::generate_usage_line(), + Self::generate_help(), + ) + } + + fn generate_brief_help_text(context: &str) -> String { + format!( + "\ +error: {context} + +{} +", + Self::generate_usage_line() + ) + } + + fn exit_arg_invalid(context: &str) -> ArgParseError { + let message = Self::generate_brief_help_text(context); + ArgParseError::StderrMessage(message) + } + + fn parse_argv(argv: VecDeque) -> Result + where + Self: Sized; +} + +pub mod compress { + use super::{ArgParseError, CommandFormat}; + + use std::{collections::VecDeque, ffi::OsString, num::ParseIntError, path::PathBuf}; + + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] + pub enum CompressionMethodArg { + Stored, + Deflate, /* requires having zip/_deflate-any set to compile */ + #[cfg(feature = "deflate64")] + Deflate64, + #[cfg(feature = "bzip2")] + Bzip2, + #[cfg(feature = "zstd")] + Zstd, + } + + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] + pub struct CompressionLevel(pub i64); + + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] + pub struct UnixPermissions(pub u32); + + impl UnixPermissions { + pub fn parse(s: &str) 
-> Result { + Ok(Self(u32::from_str_radix(s, 8)?)) + } + } + + #[derive(Debug)] + pub enum CompressionArg { + CompressionMethod(CompressionMethodArg), + Level(CompressionLevel), + UnixPermissions(UnixPermissions), + LargeFile(bool), + Name(String), + Dir, + Symlink, + Immediate(OsString), + FilePath(PathBuf), + RecursiveDirPath(PathBuf), + } + + #[derive(Debug)] + pub enum OutputType { + Stdout { allow_tty: bool }, + File { path: PathBuf, append: bool }, + } + + #[derive(Debug)] + pub struct Compress { + pub output: OutputType, + pub args: Vec, + pub positional_paths: Vec, + } + + impl Compress { + #[cfg(feature = "deflate64")] + const DEFLATE64_HELP_LINE: &'static str = " - deflate64:\twith deflate64\n"; + #[cfg(not(feature = "deflate64"))] + const DEFLATE64_HELP_LINE: &'static str = ""; + + #[cfg(feature = "bzip2")] + const BZIP2_HELP_LINE: &'static str = " - bzip2:\twith bzip2\n"; + #[cfg(not(feature = "bzip2"))] + const BZIP2_HELP_LINE: &'static str = ""; + + #[cfg(feature = "zstd")] + const ZSTD_HELP_LINE: &'static str = " - zstd:\twith zstd\n"; + #[cfg(not(feature = "zstd"))] + const ZSTD_HELP_LINE: &'static str = ""; + } + + impl CommandFormat for Compress { + const COMMAND_NAME: &'static str = "compress"; + const COMMAND_TABS: &'static str = "\t"; + const COMMAND_DESCRIPTION: &'static str = "Generate a zip archive from files, directories, and symlinks provided as arguments or read from filesystem paths."; + + const USAGE_LINE: &'static str = "[-h|--help] [OUTPUT-FLAGS] [ENTRY]... [--] [PATH]..."; + + fn generate_help() -> String { + format!( + r#" + -h, --help Print help + +Output flags: +Where and how to write the generated zip archive. + + -o, --output-file + Output zip file path to write. + The output file is truncated if it already exists, unless --append is + provided. If not provided, output is written to stdout. + + --append + If an output path is provided with -o, open it as an existing zip + archive and append to it. 
If the output path does not already exist, + no error is produced, and a new zip file is created at the given path. + + --stdout + Allow writing output to stdout even if stdout is a tty. + +Entries: +After output flags are provided, the rest of the command line is +attributes and entry data. Attributes modify later entries. + +Sticky attributes: +These flags apply to everything that comes after them until reset by another +instance of the same attribute. Sticky attributes continue to apply to +positional arguments received after processing all flags. + + -c, --compression-method + Which compression technique to use. + Defaults to deflate if not specified. + + Possible values: + - stored: uncompressed + - deflate: with deflate (default) +{}{}{} + -l, --compression-level + How much compression to perform, from 0..=24. + The accepted range of values differs for each technique. + + -m, --mode + Unix permissions to apply to the file, in octal (like chmod). + + --large-file [true|false] + Whether to enable large file support. + This may take up more space for records, but allows files over 32 bits + in length to be written, up to 64 bit sizes. + File arguments over 32 bits in length (either provided explicitly or + encountered when traversing a recursive directory) will have this flag + set automatically, without affecting the sticky value for + later options. + Therefore, this option likely never has to be set explicitly by + the user. + +Non-sticky attributes: +These flags only apply to the next entry after them, and may not be repeated. + + -n, --name + The name to apply to the entry. This must be UTF-8 encoded. + + -s, --symlink + Make the next entry into a symlink entry. + A symlink entry may be immediate with -i, or it may copy the target + from an existing symlink with -f. + +Entry data: +Each of these flags creates an entry in the output zip archive. + + -d, --dir + Create a directory entry. + A name must be provided beforehand with -n. 
+ + -i, --immediate + Write an entry containing the data in the argument, which need not be + UTF-8 encoded but will exit early upon encountering any null bytes. + A name must be provided beforehand with -n. + + -f, --file + Write an entry with the contents of this file path. + A name may be provided beforehand with -n, otherwise the name will be + inferred from relativizing the given path to the working directory. + Note that sockets are currently not supported and will produce an + error. Providing a path to a directory will produce an error. + + If -s was specified beforehand, the path will be read as a symlink, + which will produce an error if the path does not point to a symbolic + link. If -s was not specified beforehand and a symlink path was + provided, then the symbolic link will be interpreted as if it was + a file with the contents of the symlink target, but with its name + corresponding to the symlink path (unless overridden with -n). + + -r, --recursive-dir + Write all the recursive contents of this directory path. + A name may be provided beforehand with -n, which will be used as the + prefix for all recursive contents of this directory. Otherwise, the + name will be inferred from relativizing the given path to the + working directory. + + -s is not allowed before this argument. If a path to a symbolic link + is provided, it will be treated as if it pointed to a directory with + the recursive contents of the target directory, but with its name + corresponding to the symlink path (unless overridden with -n). + Providing a symlink path which points to a file will produce an error. + +Positional entries: + [PATH]... + Write the file or recursive directory contents, relativizing the path. + If the given path points to a file, then a single file entry will + be written. + If the given path is a symlink, then a single symlink entry will + be written. 
+ If the given path refers to a directory, then the recursive contents + will be written, reproducing files and symlinks. + Socket paths will produce an error. +"#, + Self::DEFLATE64_HELP_LINE, + Self::BZIP2_HELP_LINE, + Self::ZSTD_HELP_LINE, + ) + } + + fn parse_argv(mut argv: VecDeque) -> Result { + let mut allow_stdout: bool = false; + let mut append_to_output_path: bool = false; + let mut output_path: Option = None; + let mut args: Vec = Vec::new(); + let mut positional_paths: Vec = Vec::new(); + + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + b"-h" | b"--help" => { + let help_text = Self::generate_full_help_text(); + return Err(ArgParseError::StdoutMessage(help_text)); + } + + /* Output flags */ + b"--stdout" => { + if let Some(output_path) = output_path.take() { + return Err(Self::exit_arg_invalid(&format!( + "--stdout provided along with output file {output_path:?}" + ))); + } else if append_to_output_path { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --append", + )); + } else if !args.is_empty() || !positional_paths.is_empty() { + return Err(Self::exit_arg_invalid("--stdout provided after entries")); + } else if allow_stdout { + return Err(Self::exit_arg_invalid("--stdout provided twice")); + } else { + allow_stdout = true; + } + } + b"--append" => { + if append_to_output_path { + return Err(Self::exit_arg_invalid("--append provided twice")); + } else if !args.is_empty() || !positional_paths.is_empty() { + return Err(Self::exit_arg_invalid("--append provided after entries")); + } else if allow_stdout { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --append", + )); + } else { + append_to_output_path = true; + } + } + b"-o" | b"--output-file" => { + let new_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for -o/--output-file") + })?; + if let Some(prev_path) = output_path.take() { + return Err(Self::exit_arg_invalid(&format!( 
+ "--output-file provided twice: {prev_path:?} and {new_path:?}" + ))); + } else if allow_stdout { + return Err(Self::exit_arg_invalid( + "--stdout provided along with output file", + )); + } else if !args.is_empty() || !positional_paths.is_empty() { + return Err(Self::exit_arg_invalid( + "-o/--output-file provided after entries", + )); + } else { + output_path = Some(new_path); + } + } + + /* Attributes */ + b"-c" | b"--compression-method" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -c/--compression-method", + )) + } + Some(name) => match name.as_encoded_bytes() { + b"stored" => args.push(CompressionArg::CompressionMethod( + CompressionMethodArg::Stored, + )), + b"deflate" => args.push(CompressionArg::CompressionMethod( + CompressionMethodArg::Deflate, + )), + #[cfg(feature = "deflate64")] + b"deflate64" => args.push(CompressionArg::CompressionMethod( + CompressionMethodArg::Deflate64, + )), + #[cfg(feature = "bzip2")] + b"bzip2" => args.push(CompressionArg::CompressionMethod( + CompressionMethodArg::Bzip2, + )), + #[cfg(feature = "zstd")] + b"zstd" => args.push(CompressionArg::CompressionMethod( + CompressionMethodArg::Zstd, + )), + _ => { + return Err(Self::exit_arg_invalid( + "unrecognized compression method {name:?}", + )); + } + }, + }, + b"-l" | b"--compression-level" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -l/--compression-level", + )); + } + Some(level) => match level.into_string() { + Err(level) => { + return Err(Self::exit_arg_invalid(&format!( + "invalid unicode provided for compression level: {level:?}" + ))); + } + Ok(level) => match level.parse::() { + Err(e) => { + return Err(Self::exit_arg_invalid(&format!( + "failed to parse integer for compression level: {e}" + ))); + } + Ok(level) => { + if (0..=24).contains(&level) { + args.push(CompressionArg::Level(CompressionLevel(level))) + } else { + return 
Err(Self::exit_arg_invalid(&format!( + "compression level {level} was not between 0 and 24" + ))); + } + } + }, + }, + }, + b"-m" | b"--mode" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -m/--mode", + )); + } + Some(mode) => match mode.into_string() { + Err(mode) => { + return Err(Self::exit_arg_invalid(&format!( + "invalid unicode provided for mode: {mode:?}" + ))); + } + Ok(mode) => match UnixPermissions::parse(&mode) { + Err(e) => { + return Err(Self::exit_arg_invalid(&format!( + "failed to parse integer for mode: {e}" + ))); + } + Ok(mode) => args.push(CompressionArg::UnixPermissions(mode)), + }, + }, + }, + b"--large-file" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for --large-file", + )); + } + Some(large_file) => match large_file.as_encoded_bytes() { + b"true" => args.push(CompressionArg::LargeFile(true)), + b"false" => args.push(CompressionArg::LargeFile(false)), + _ => { + return Err(Self::exit_arg_invalid(&format!( + "unrecognized value for --large-file: {large_file:?}" + ))); + } + }, + }, + + /* Data */ + b"-n" | b"--name" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -n/--name", + )) + } + Some(name) => match name.into_string() { + Err(name) => { + return Err(Self::exit_arg_invalid(&format!( + "invalid unicode provided for name: {name:?}" + ))); + } + Ok(name) => args.push(CompressionArg::Name(name)), + }, + }, + b"-s" | b"--symlink" => args.push(CompressionArg::Symlink), + b"-d" | b"--dir" => args.push(CompressionArg::Dir), + b"-i" | b"--immediate" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -i/--immediate", + )); + } + Some(data) => args.push(CompressionArg::Immediate(data)), + }, + b"-f" | b"--file" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -f/--file", + 
)); + } + Some(file) => args.push(CompressionArg::FilePath(file.into())), + }, + b"-r" | b"--recursive-dir" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -r/--recursive-dir", + )); + } + Some(dir) => args.push(CompressionArg::RecursiveDirPath(dir.into())), + }, + + /* Transition to positional args */ + b"--" => break, + arg_bytes => { + if arg_bytes.starts_with(b"-") { + return Err(Self::exit_arg_invalid(&format!( + "unrecognized flag {arg:?}" + ))); + } else { + argv.push_front(arg); + break; + } + } + } + } + + positional_paths.extend(argv.into_iter().map(|arg| arg.into())); + + let output = if let Some(path) = output_path { + OutputType::File { + path, + append: append_to_output_path, + } + } else { + OutputType::Stdout { + allow_tty: allow_stdout, + } + }; + + Ok(Self { + output, + args, + positional_paths, + }) + } + } + + impl crate::driver::ExecuteCommand for Compress { + fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { + crate::compress::execute_compress(err, self) + } + } +} + +pub mod info { + #[derive(Debug)] + pub struct Info {} + + impl Info { + pub const COMMAND_NAME: &'static str = "info"; + pub const COMMAND_DESCRIPTION: &'static str = + "(TODO) Print info about archive contents and individual entries."; + pub const COMMAND_TABS: &'static str = "\t\t"; + } +} + +pub mod extract { + use super::{ArgParseError, CommandFormat}; + + use std::{collections::VecDeque, ffi::OsString, mem, path::PathBuf}; + + #[derive(Debug)] + pub enum ContentTransform { + Extract, + /* FIXME: not yet supported */ + Raw, + LogToStderr, + } + + #[derive(Debug, Default, PartialEq, Eq)] + pub enum ComponentSelector { + #[default] + Path, + Basename, + Dirname, + FileExtension, + } + + impl ComponentSelector { + pub fn parse(s: &[u8]) -> Option { + match s { + b"path" => Some(Self::Path), + b"basename" => Some(Self::Basename), + b"dirname" => Some(Self::Dirname), + b"ext" => 
Some(Self::FileExtension), + _ => None, + } + } + } + + #[derive(Debug, Default, PartialEq, Eq)] + pub enum PatternSelectorType { + #[default] + Glob, + Literal, + Regexp, + } + + impl PatternSelectorType { + pub fn parse(s: &[u8]) -> Option { + match s { + b"glob" => Some(Self::Glob), + b"lit" => Some(Self::Literal), + b"rx" => Some(Self::Regexp), + _ => None, + } + } + } + + #[derive(Debug)] + pub enum PatternSelectorModifier { + CaseInsensitive, + } + + impl PatternSelectorModifier { + pub fn parse(s: &[u8]) -> Option { + match s { + b"i" => Some(Self::CaseInsensitive), + _ => None, + } + } + } + + #[derive(Debug, Default)] + pub struct PatternSelector { + pub pat_sel: PatternSelectorType, + pub modifiers: Vec, + } + + impl PatternSelector { + pub fn parse(s: &[u8]) -> Option { + match s.iter().position(|c| *c == b':') { + Some(modifiers_ind) => { + let pat_sel_str = &s[..modifiers_ind]; + let modifiers_str = &s[(modifiers_ind + 1)..]; + + let pat_sel = PatternSelectorType::parse(pat_sel_str)?; + let modifiers = modifiers_str + .split(|c| *c == b':') + .map(PatternSelectorModifier::parse) + .collect::>>()?; + Some(Self { pat_sel, modifiers }) + } + None => { + let pat_sel = PatternSelectorType::parse(s)?; + Some(Self { + pat_sel, + modifiers: Vec::new(), + }) + } + } + } + } + + pub fn parse_only_pat_sel(s: &[u8]) -> Option { + match s.iter().position(|c| *c == b':') { + Some(pat_sel_ind) => { + let pat_sel_str = &s[(pat_sel_ind + 1)..]; + + let pat_sel = PatternSelector::parse(pat_sel_str)?; + Some(pat_sel) + } + None => Some(PatternSelector::default()), + } + } + + pub fn parse_comp_and_pat_sel(s: &[u8]) -> Option<(ComponentSelector, PatternSelector)> { + match ( + s.iter().position(|c| *c == b'='), + s.iter().position(|c| *c == b':'), + ) { + (Some(comp_sel_ind), Some(pat_sel_ind)) => { + if comp_sel_ind >= pat_sel_ind { + return None; + } + let comp_sel_str = &s[(comp_sel_ind + 1)..pat_sel_ind]; + let pat_sel_str = &s[(pat_sel_ind + 1)..]; + + let comp_sel = 
ComponentSelector::parse(comp_sel_str)?; + let pat_sel = PatternSelector::parse(pat_sel_str)?; + Some((comp_sel, pat_sel)) + } + (Some(comp_sel_ind), None) => { + let comp_sel_str = &s[(comp_sel_ind + 1)..]; + + let comp_sel = ComponentSelector::parse(comp_sel_str)?; + let pat_sel = PatternSelector::default(); + Some((comp_sel, pat_sel)) + } + (None, Some(pat_sel_ind)) => { + let pat_sel_str = &s[(pat_sel_ind + 1)..]; + + let pat_sel = PatternSelector::parse(pat_sel_str)?; + let comp_sel = ComponentSelector::default(); + Some((comp_sel, pat_sel)) + } + (None, None) => { + let comp_sel = ComponentSelector::default(); + let pat_sel = PatternSelector::default(); + Some((comp_sel, pat_sel)) + } + } + } + + #[derive(Debug)] + pub enum EntryType { + /// file + File, + /// dir + Dir, + /// symlink + Symlink, + } + + impl EntryType { + pub fn parse(s: &[u8]) -> Option { + match s { + b"file" => Some(Self::File), + b"dir" => Some(Self::Dir), + b"symlink" => Some(Self::Symlink), + _ => None, + } + } + } + + #[derive(Debug, PartialEq, Eq)] + pub enum NonSpecificCompressionMethodArg { + /// any + Any, + /// known + Known, + } + + #[derive(Debug, PartialEq, Eq)] + pub enum SpecificCompressionMethodArg { + Stored, + Deflated, + #[cfg(feature = "deflate64")] + Deflate64, + #[cfg(feature = "bzip2")] + Bzip2, + #[cfg(feature = "zstd")] + Zstd, + #[cfg(feature = "lzma")] + Lzma, + #[cfg(feature = "xz")] + Xz, + } + + #[derive(Debug, PartialEq, Eq)] + pub enum CompressionMethodArg { + NonSpecific(NonSpecificCompressionMethodArg), + Specific(SpecificCompressionMethodArg), + } + + impl CompressionMethodArg { + pub fn parse(s: &[u8]) -> Option { + match s { + b"any" => Some(Self::NonSpecific(NonSpecificCompressionMethodArg::Any)), + b"known" => Some(Self::NonSpecific(NonSpecificCompressionMethodArg::Known)), + b"stored" => Some(Self::Specific(SpecificCompressionMethodArg::Stored)), + b"deflated" => Some(Self::Specific(SpecificCompressionMethodArg::Deflated)), + #[cfg(feature = 
"deflate64")] + b"deflate64" => Some(Self::Specific(SpecificCompressionMethodArg::Deflate64)), + #[cfg(feature = "bzip2")] + b"bzip2" => Some(Self::Specific(SpecificCompressionMethodArg::Bzip2)), + #[cfg(feature = "zstd")] + b"zstd" => Some(Self::Specific(SpecificCompressionMethodArg::Zstd)), + #[cfg(feature = "lzma")] + b"lzma" => Some(Self::Specific(SpecificCompressionMethodArg::Lzma)), + #[cfg(feature = "xz")] + b"xz" => Some(Self::Specific(SpecificCompressionMethodArg::Xz)), + _ => None, + } + } + } + + #[derive(Debug)] + pub enum DepthLimitArg { + Max(u8), + Min(u8), + } + + #[derive(Debug)] + pub struct MatchArg { + pub comp_sel: ComponentSelector, + pub pat_sel: PatternSelector, + pub pattern: String, + } + + #[derive(Debug)] + pub enum TrivialPredicate { + True, + False, + } + + #[derive(Debug)] + pub enum Predicate { + Trivial(TrivialPredicate), + EntryType(EntryType), + CompressionMethod(CompressionMethodArg), + DepthLimit(DepthLimitArg), + Match(MatchArg), + } + + #[derive(Debug)] + enum ExprOp { + Negation, + And, + Or, + } + + #[derive(Debug)] + enum ExprArg { + PrimitivePredicate(Predicate), + Op(ExprOp), + Subgroup(MatchExpression), + } + + #[derive(Debug, Default)] + struct SingleExprLevel { + expr_args: Vec, + } + + impl SingleExprLevel { + pub fn push_arg(&mut self, arg: ExprArg) { + self.expr_args.push(arg); + } + + fn get_negation( + expr_args: &mut VecDeque, + ) -> Result { + let negated_expr: MatchExpression = match expr_args.pop_front().ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "negation was only expression in list inside match expr (rest: {expr_args:?})" + )) + })? { + ExprArg::Subgroup(match_expr) => { + /* We have a valid match expression, so just negate it without + * wrapping. */ + MatchExpression::Negated(Box::new(match_expr)) + } + ExprArg::PrimitivePredicate(predicate) => { + /* We got a primitive predicate, so just negate it! 
*/ + MatchExpression::Negated(Box::new(MatchExpression::PrimitivePredicate( + predicate, + ))) + } + ExprArg::Op(op) => { + /* Negation before any other operator is invalid. */ + return Err(Extract::exit_arg_invalid(&format!( + "negation before operator {op:?} inside match expr is invalid (rest: {expr_args:?})" + ))); + } + }; + Ok(negated_expr) + } + + fn get_non_operator( + expr_args: &mut VecDeque, + ) -> Result { + let next_expr: MatchExpression = match expr_args.pop_front().ok_or_else(|| { + /* We can't fold an empty list. */ + Extract::exit_arg_invalid(&format!( + "empty expression list inside match expr (rest: {expr_args:?})" + )) + })? { + /* This is already an evaluated match expression, so just start with that. */ + ExprArg::Subgroup(match_expr) => match_expr, + ExprArg::PrimitivePredicate(predicate) => { + /* Success! We start with a simple predicate. */ + MatchExpression::PrimitivePredicate(predicate) + } + ExprArg::Op(op) => match op { + /* We started with negation, which means we need to get the next arg to resolve + * it. */ + ExprOp::Negation => Self::get_negation(expr_args)?, + /* Starting with a binary operator is invalid. */ + op @ (ExprOp::And | ExprOp::Or) => { + return Err(Extract::exit_arg_invalid(&format!( + "expression list cannot begin with binary operator {op:?} (rest: {expr_args:?})" + ))); + } + }, + }; + Ok(next_expr) + } + + pub fn fold(self) -> Result { + let Self { expr_args } = self; + let mut expr_args: VecDeque<_> = expr_args.into(); + + /* Get a valid match expression to start our fold with. */ + let mut cur_expr: MatchExpression = Self::get_non_operator(&mut expr_args)?; + + /* Now fold the expression rightwards! */ + while let Some(next_arg) = expr_args.pop_front() { + match next_arg { + /* Implicit AND, wrapping the primitive result into a match. 
*/ + ExprArg::PrimitivePredicate(predicate) => { + let next_expr = MatchExpression::PrimitivePredicate(predicate); + cur_expr = MatchExpression::And { + explicit: false, + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + /* Implicit AND, without needing to wrap the result. */ + ExprArg::Subgroup(match_expr) => { + cur_expr = MatchExpression::And { + explicit: false, + left: Box::new(cur_expr), + right: Box::new(match_expr), + }; + } + /* Evaluate the operator according to association. */ + ExprArg::Op(op) => match op { + /* Negation applies to the next element, so retrieve it! */ + ExprOp::Negation => { + let next_expr = Self::get_negation(&mut expr_args)?; + cur_expr = MatchExpression::And { + explicit: false, + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + /* Explicit AND requires the next element. */ + ExprOp::And => { + let next_expr = Self::get_non_operator(&mut expr_args)?; + cur_expr = MatchExpression::And { + explicit: true, + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + /* OR requires the next element. */ + ExprOp::Or => { + let next_expr = Self::get_non_operator(&mut expr_args)?; + cur_expr = MatchExpression::Or { + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + }, + } + } + + assert!(expr_args.is_empty()); + Ok(cur_expr) + } + } + + #[derive(Debug)] + pub enum MatchExpression { + PrimitivePredicate(Predicate), + Negated(Box), + And { + explicit: bool, + left: Box, + right: Box, + }, + Or { + left: Box, + right: Box, + }, + Grouped(Box), + } + + impl MatchExpression { + pub fn parse_argv(argv: &mut VecDeque) -> Result { + let mut expr_stack: Vec = Vec::new(); + let mut top_exprs = SingleExprLevel::default(); + + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + /* Parse primitive predicates. 
*/ + b"-true" => { + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Trivial( + TrivialPredicate::True, + ))); + } + b"-false" => { + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Trivial( + TrivialPredicate::False, + ))); + } + b"-t" | b"--type" => { + let type_arg = argv.pop_front().ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for -t/--type") + })?; + let entry_type = + EntryType::parse(type_arg.as_encoded_bytes()).ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "invalid --type argument: {type_arg:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::EntryType( + entry_type, + ))); + } + b"--compression-method" => { + let method_arg = argv.pop_front().ok_or_else(|| { + Extract::exit_arg_invalid( + "no argument provided for --compression-method", + ) + })?; + let method = CompressionMethodArg::parse(method_arg.as_encoded_bytes()) + .ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "invalid --compression-method argument: {method_arg:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate( + Predicate::CompressionMethod(method), + )); + } + b"--max-depth" => { + let max_depth: u8 = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --max-depth") + })? + .into_string() + .map_err(|depth_arg| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --max-depth: {depth_arg:?}" + )) + })? + .parse::() + .map_err(|e| { + Extract::exit_arg_invalid(&format!( + "failed to parse --max-depth arg {e:?} as u8" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( + DepthLimitArg::Max(max_depth), + ))); + } + b"--min-depth" => { + let min_depth: u8 = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --min-depth") + })? 
+ .into_string() + .map_err(|depth_arg| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --min-depth: {depth_arg:?}" + )) + })? + .parse::() + .map_err(|e| { + Extract::exit_arg_invalid(&format!( + "failed to parse --min-depth arg {e:?} as u8" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( + DepthLimitArg::Min(min_depth), + ))); + } + b"-m" => { + let pattern: String = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for -m") + })? + .into_string() + .map_err(|pattern| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for -m: {pattern:?}" + )) + })?; + let comp_sel = ComponentSelector::default(); + let pat_sel = PatternSelector::default(); + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Match( + MatchArg { + comp_sel, + pat_sel, + pattern, + }, + ))); + } + arg_bytes if arg_bytes.starts_with(b"--match") => { + let (comp_sel, pat_sel) = + parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "invalid --match argument modifiers: {arg:?}" + )) + })?; + let pattern: String = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --match") + })? + .into_string() + .map_err(|pattern| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --match: {pattern:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Match( + MatchArg { + comp_sel, + pat_sel, + pattern, + }, + ))); + } + + /* Parse operators. */ + b"!" | b"-not" => { + top_exprs.push_arg(ExprArg::Op(ExprOp::Negation)); + } + b"&" | b"-and" => { + top_exprs.push_arg(ExprArg::Op(ExprOp::And)); + } + b"|" | b"-or" => { + top_exprs.push_arg(ExprArg::Op(ExprOp::Or)); + } + + /* Process groups with stack logic! */ + b"(" | b"-open" => { + expr_stack.push(mem::take(&mut top_exprs)); + } + b")" | b"-close" => { + /* Get the unevaluated exprs from the previous nesting level. 
*/ + let prev_level = expr_stack.pop().ok_or_else(|| { + Extract::exit_arg_invalid("too many close parens inside match expr") + })?; + /* Move the previous nesting level into current, and evaluate the current + * nesting level. */ + let group_expr = mem::replace(&mut top_exprs, prev_level).fold()?; + /* Wrap the completed group in a Grouped. */ + let group_expr = MatchExpression::Grouped(Box::new(group_expr)); + /* Push the completed and evaluated group into the current nesting level. */ + top_exprs.push_arg(ExprArg::Subgroup(group_expr)); + } + + /* Conclude the match expr processing. */ + b"--expr" => { + break; + } + _ => { + return Err(Extract::exit_arg_invalid(&format!( + "unrecognized match expression component {arg:?}: all match expressions must start and end with a --expr flag" + ))); + } + } + } + + if !expr_stack.is_empty() { + return Err(Extract::exit_arg_invalid( + "not enough close parens inside match expr", + )); + } + top_exprs.fold() + } + } + + #[derive(Debug)] + pub enum TrivialTransform { + Identity, + } + + #[derive(Debug)] + pub enum BasicTransform { + StripComponents(u8), + AddPrefix(PathBuf), + } + + #[derive(Debug)] + pub struct TransformArg { + pub comp_sel: ComponentSelector, + pub pat_sel: PatternSelector, + pub pattern: String, + pub replacement_spec: String, + } + + #[derive(Debug)] + pub struct RemovePrefixArg { + pub pat_sel: PatternSelector, + pub pattern: String, + } + + #[derive(Debug)] + pub enum ComplexTransform { + Transform(TransformArg), + RemovePrefix(RemovePrefixArg), + } + + #[derive(Debug)] + pub enum NameTransform { + Trivial(TrivialTransform), + Basic(BasicTransform), + Complex(ComplexTransform), + } + + #[derive(Debug)] + enum ExtractArg { + Match(MatchExpression), + NameTransform(NameTransform), + ContentTransform(ContentTransform), + } + + #[derive(Debug)] + pub struct EntrySpec { + pub match_expr: Option, + pub name_transforms: Vec, + pub content_transform: ContentTransform, + } + + impl EntrySpec { + fn 
parse_extract_args( + args: impl IntoIterator, + ) -> Result, ArgParseError> { + let mut match_expr: Option = None; + let mut name_transforms: Vec = Vec::new(); + + let mut ret: Vec = Vec::new(); + + for arg in args.into_iter() { + match arg { + ExtractArg::Match(new_expr) => { + if let Some(prev_expr) = match_expr.take() { + return Err(Extract::exit_arg_invalid(&format!( + "more than one match expr was provided for the same entry: {prev_expr:?} and {new_expr:?}" + ))); + } + match_expr = Some(new_expr); + } + ExtractArg::NameTransform(n_trans) => { + name_transforms.push(n_trans); + } + ExtractArg::ContentTransform(c_trans) => { + let spec = Self { + match_expr: match_expr.take(), + name_transforms: mem::take(&mut name_transforms), + content_transform: c_trans, + }; + ret.push(spec); + } + } + } + if let Some(match_expr) = match_expr { + return Err(Extract::exit_arg_invalid(&format!( + "match expr {match_expr:?} was provided with no corresponding content \ +transform. add -x/--extract to construct a complete entry spec" + ))); + } + if !name_transforms.is_empty() { + return Err(Extract::exit_arg_invalid(&format!( + "name transforms {name_transforms:?} were provided with no corresponding \ +content transform. 
add -x/--extract to construct a complete entry spec" + ))); + } + + Ok(ret) + } + } + + #[derive(Debug)] + pub enum OutputCollation { + ConcatenateStdout, + Filesystem { + output_dir: Option, + mkdir: bool, + }, + } + + #[derive(Debug)] + pub enum InputType { + /* FIXME: not yet supported */ + StreamingStdin, + ZipPaths(Vec), + } + + #[derive(Debug)] + pub struct Extract { + pub output: OutputCollation, + pub entry_specs: Vec, + pub input: InputType, + } + + impl Extract { + #[cfg(feature = "deflate64")] + const DEFLATE64_HELP_LINE: &'static str = " - deflate64:\twith deflate64\n"; + #[cfg(not(feature = "deflate64"))] + const DEFLATE64_HELP_LINE: &'static str = ""; + + #[cfg(feature = "bzip2")] + const BZIP2_HELP_LINE: &'static str = " - bzip2:\twith bzip2\n"; + #[cfg(not(feature = "bzip2"))] + const BZIP2_HELP_LINE: &'static str = ""; + + #[cfg(feature = "zstd")] + const ZSTD_HELP_LINE: &'static str = " - zstd:\twith zstd\n"; + #[cfg(not(feature = "zstd"))] + const ZSTD_HELP_LINE: &'static str = ""; + + #[cfg(feature = "lzma")] + const LZMA_HELP_LINE: &'static str = " - lzma:\twith lzma\n"; + #[cfg(not(feature = "lzma"))] + const LZMA_HELP_LINE: &'static str = ""; + + #[cfg(feature = "xz")] + const XZ_HELP_LINE: &'static str = " - xz:\t\twith xz\n"; + #[cfg(not(feature = "xz"))] + const XZ_HELP_LINE: &'static str = ""; + } + + impl CommandFormat for Extract { + const COMMAND_NAME: &'static str = "extract"; + const COMMAND_TABS: &'static str = "\t"; + const COMMAND_DESCRIPTION: &'static str = + "Extract individual entries or an entire archive into a stream or the filesystem."; + + const USAGE_LINE: &'static str = + "[-h|--help] [OUTPUT-FLAGS] [ENTRY-SPEC]... [--stdin|[--] ZIP-PATH...]"; + + fn generate_help() -> String { + format!( + r#" + -h, --help Print help + +Output flags: +Where and how to collate the extracted entries. + + -d, --output-directory + Output directory path to write extracted entries into. 
+ Paths for extracted entries will be constructed by interpreting entry + names as relative paths to the provided directory. If the provided + path is not a directory, an error is produced. If the provided path + does not exist, an error is produced unless --mkdir is specified. + If not provided, entries will be extracted into the current directory + (as if '-d .' had been provided). + + --mkdir + If an output directory is provided with -d and the directory path does + not exist, create it along with any missing parent directories. + If the path provided to -d is not a directory, an error will still be + produced if this flag is also provided. + + --stdout + Concatenate all extracted entries and write them in order to stdout + instead of writing anything to the filesystem. + This disables some optimizations that are possible when extracting to + the filesystem. + This will write output to stdout even if stdout is a tty. + +# Entry specs: + +After output flags are provided, entry specs are processed in order until an +input argument is reached. Entry specs are modelled after the arguments to +find(1), although "actions" are separated from "matching" expressions with +test clauses instead of being fully recursive like find(1). + +The full specification of an entry spec is provided below +(we will use lowercase names to describe this grammar): + + entry-spec = [--expr match-expr --expr] [name-transform]... content-transform + +1. (match-expr) matches against entries, +2. (name-transform) may transform the entry name string, +3. (content-transform) processes the entry content and writes it + to the output. + +Note that only the "content transform" is required: each entry spec must +conclude with exactly one content transform, but the other arguments may +be omitted and will be set to their default values. + +If no entry specs are provided, by default all entries are decompressed and written to the +output collator without modification. 
This behavior can be requested explicitly +with the command line: + + --expr -true --expr --identity --extract + +*Note:* if a match-expr is provided, it *must* be surrounded with --expr arguments on both sides! +This is a necessary constraint of the current command line parsing. + + +## Match expressions (match-expr): + +Entry matching logic composes boolean arithmetic expressions ("expr") in terms +of basic "predicates" which test some component of the zip entry. Expressions +can be composed as follows, in order of precedence: + +expr = ( ) (grouping to force precedence) + = ! (negation) + = & (short-circuiting conjunction "and") + = (implicit &) + = | (disjunction "or") + = (evaluate on entry) + +### Operators: +The operators to compose match expressions must be quoted in shell commands +(e.g. as \( or '('), so alternatives are provided which do not require +special quoting: + +Grouping operators: + (, -open + ), -close + +Unary operators: + !, -not + +Binary operators: + |, -or + &, -and + +### Predicates (predicate): +These arguments are interpreted as basic predicates, returning true or false in +response to a specific zip entry. + +Trivial: +These results do not depend on the entry data at all: + + -true Always return true. + -false Always return false. + +If a match expression is not provided, it defaults to the behavior of -true. + +Basic: +These results are dependent on the entry data: + + -t, --type [file|dir|symlink] + Match entries of the given type. + Note that directory entries may have specific mode bits set, or they may just be + zero-length entries whose name ends in '/'. + + --compression-method + Match entries compressed with the given compression technique. + + Possible values: + - any: any compression method at all + - known: any compression method this binary is able to decompress + - stored: uncompressed + - deflated: with deflate +{}{}{}{}{} + Using e.g. 
'-not --compression-method known' as a filter enables + special handling of entries compressed with an unsupported method. + + --max-depth + Match entries with at *most* components of their containing directory. + --min-depth + Match entries with at *least* components of their containing directory. + + -m, --match[=][:] + Return true for entries whose name matches . + + See section on "Selector syntax" for and for how + the string argument is interpreted into a string matching + predicate against the entry name. + + +## Name transforms (name-transform): + +Name transforms modify the entry name before writing the entry to the +output. Unlike match expressions, name transforms do not involve any boolean +logic, and instead are composed linearly, each processing the string produced by +the prior name transform in the series. + +*Note:* name transforms do *not* perform any filtering, so if a string +replacement operation "fails", the entry name is simply returned unchanged. + +Trivial: + --identity Return the entry name string unchanged. + +If no name transforms are provided, it defaults to the behavior of --identity. + +Basic: +These transformers do not perform any complex pattern matching, and instead add +or remove a fixed string from the entry name: + + --strip-components + Remove at most directory components from the entry name. + If is greater than or equal the number of components in the + entry dirname, then the basename of the entry is returned. + --add-prefix + Prefix the entry name with a directory path . + A single separator '/' will be added after before the rest of + the entry name, and any trailing '/' in will be trimmed + before joining. + +Complex: +These transformers perform complex pattern matching and replacement upon the +entry name string: + + --transform[=][:] + Extract the portion of the entry name corresponding to , + search it against corresponding to , and then + replace the result with . 
+ + If == 'rx', then may contain references + to numbered capture groups specified by . Otherwise, + is interpreted as a literal string. + + --remove-prefix[:] + Equivalent to "--transform=path: ''", except the + search is anchored at the beginning of the string. + + +## Content transforms (content-transform): + +Content transforms determine how to interpret the content of the zip +entry itself. + +*Note:* when multiple entry specs are provided on the command line, a single +entry may be matched more than once. In this case, the entry's content will be +extracted more than once over the execution of this command. + + -x, --extract + Decompress the entry's contents (if necessary) before writing it to + the output. + + --raw + Do not decompress entry contents at all before writing its content to + the output. + + --log-to-stderr + Write the (possibly transformed) entry name to stderr, without reading + its content at all. + +Attempting to extract an entry using an unsupported compression method with +-x/--extract will produce an error. In this case, --compression-method can be +used to filter out such entries, and --raw may be used to avoid the failure and +decompress the entry later, or --log-to-stderr can be used to print the names of +all unsupported entries. + + +## Selector syntax: + +The string matching operations of --match and --transform expose an interface to +configure various pattern matching techniques on various components of the entry +name string. 
+ +These flags default to interpreting a argument as a glob string to +match against the entire entry name, which can be explicitly requested as +follows: + + --match=path:glob + +The entire range of search options is described below: + +### Component selector (comp-sel): +comp-sel = path [DEFAULT] (match full entry) + = basename (match only the final component of entry) + = dirname (match all except final component of entry) + = ext (match only the file extension, if available) + +### Pattern selector (pat-sel): +pat-sel = glob [DEFAULT] (interpret as a shell glob) + = lit (interpret as literal string) + = rx (interpret as a regular expression) + = :i (use case-insensitive matching for the given pattern) + + +# Input arguments: +Zip file inputs to extract from can be specified in exactly one of two ways: +streaming from stdin, or as at least one path pointing to an existing zip file. +Input arguments are always specified after all output flags and entry +specs on the command line. If no positional argument is provided and --stdin is +not present, an error will be produced. + + --stdin + If this argument is provided, the streaming API will be used to read + entries as they are encountered, instead of filtering them beforehand + as is done with file inputs. This disables some optimizations, but + also avoids waiting for the entire input to buffer to start writing + output, so can be used in a streaming context. + +Positional paths: + ZIP-PATH... + Apply the entry specs to filter and rename entries to extract from all + of the provided zip files. At least one zip path must be provided, and + all provided paths must exist and point to an existing zip file. Pipes + are not supported and will produce an error. 
+"#, + Self::DEFLATE64_HELP_LINE, + Self::BZIP2_HELP_LINE, + Self::ZSTD_HELP_LINE, + Self::LZMA_HELP_LINE, + Self::XZ_HELP_LINE, + ) + } + + fn parse_argv(mut argv: VecDeque) -> Result { + let mut output_dir: Option = None; + let mut mkdir_flag: bool = false; + let mut stdout_flag: bool = false; + let mut args: Vec = Vec::new(); + let mut stdin_flag: bool = false; + let mut positional_zips: Vec = Vec::new(); + + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + b"-h" | b"--help" => { + let help_text = Self::generate_full_help_text(); + return Err(ArgParseError::StdoutMessage(help_text)); + } + + /* Output args */ + b"-d" | b"--output-directory" => { + let new_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for -d/--output-directory") + })?; + if let Some(prev_path) = output_dir.take() { + return Err(Self::exit_arg_invalid(&format!( + "--output-directory provided twice: {prev_path:?} and {new_path:?}" + ))); + } else if stdout_flag { + return Err(Self::exit_arg_invalid( + "--stdout provided along with output dir", + )); + } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "-d/--output-directory provided after entry specs or inputs", + )); + } else { + output_dir = Some(new_path); + } + } + b"--mkdir" => { + if mkdir_flag { + return Err(Self::exit_arg_invalid("--mkdir provided twice")); + } else if stdout_flag { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --mkdir", + )); + } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "--mkdir provided after entry specs or inputs", + )); + } else { + mkdir_flag = true; + } + } + b"--stdout" => { + if let Some(output_dir) = output_dir.take() { + return Err(Self::exit_arg_invalid(&format!( + "--stdout provided along with output directory {output_dir:?}" + ))); + } else if stdout_flag { + return 
Err(Self::exit_arg_invalid("--stdout provided twice")); + } else if mkdir_flag { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --mkdir", + )); + } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "--stdout provided after entry specs or inputs", + )); + } else { + stdout_flag = true; + } + } + + /* Transition to entry specs */ + /* Try content transforms first, as they are unambiguous sentinel values. */ + b"-x" | b"--extract" => { + args.push(ExtractArg::ContentTransform(ContentTransform::Extract)); + } + b"--raw" => { + args.push(ExtractArg::ContentTransform(ContentTransform::Raw)); + } + b"--log-to-stderr" => { + args.push(ExtractArg::ContentTransform(ContentTransform::LogToStderr)); + } + + /* Try name transforms next, as they only stack linearly and do not require CFG + * parsing of paired delimiters. */ + /* FIXME: none of these name transforms have any effect if --stdout is + * provided. Should we error or warn about this? */ + b"--identity" => { + args.push(ExtractArg::NameTransform(NameTransform::Trivial( + TrivialTransform::Identity, + ))); + } + b"--strip-components" => { + let num: u8 = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for --strip-component") + })? + .into_string() + .map_err(|num| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --strip-component: {num:?}" + )) + })? + .parse::() + .map_err(|e| { + Self::exit_arg_invalid(&format!( + "failed to parse --strip-component arg {e:?} as u8" + )) + })?; + args.push(ExtractArg::NameTransform(NameTransform::Basic( + BasicTransform::StripComponents(num), + ))); + } + b"--add-prefix" => { + let prefix: PathBuf = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for --add-prefix") + })? 
+ .into_string() + .map_err(|prefix| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --add-prefix: {prefix:?}" + )) + })? + .into(); + args.push(ExtractArg::NameTransform(NameTransform::Basic( + BasicTransform::AddPrefix(prefix), + ))); + } + arg_bytes if arg_bytes.starts_with(b"--transform") => { + let (comp_sel, pat_sel) = + parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { + Self::exit_arg_invalid(&format!( + "invalid --transform argument modifiers: {arg:?}" + )) + })?; + let pattern = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid( + "no argument provided for --transform", + ) + })? + .into_string() + .map_err(|pattern| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --transform : {pattern:?}" + )) + })?; + let replacement_spec = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid( + "no argument provided for --transform", + ) + })? + .into_string() + .map_err(|replacement_spec| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --transform : {replacement_spec:?}" + )) + })?; + args.push(ExtractArg::NameTransform(NameTransform::Complex( + ComplexTransform::Transform(TransformArg { + comp_sel, + pat_sel, + pattern, + replacement_spec, + }), + ))); + } + arg_bytes if arg_bytes.starts_with(b"--remove-prefix") => { + let pat_sel = parse_only_pat_sel(arg_bytes).ok_or_else(|| { + Self::exit_arg_invalid(&format!( + "invalid --remove-prefix argument modifiers: {arg:?}" + )) + })?; + let pattern = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid( + "no argument provided for --remove-prefix", + ) + })? + .into_string() + .map_err(|pattern| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --remove-prefix : {pattern:?}" + )) + })?; + args.push(ExtractArg::NameTransform(NameTransform::Complex( + ComplexTransform::RemovePrefix(RemovePrefixArg { pat_sel, pattern }), + ))); + } + + /* Try parsing match specs! 
*/ + b"--expr" => { + let match_expr = MatchExpression::parse_argv(&mut argv)?; + args.push(ExtractArg::Match(match_expr)); + } + + /* Transition to input args */ + b"--stdin" => { + stdin_flag = true; + break; + } + b"--" => break, + arg_bytes => { + if arg_bytes.starts_with(b"-") { + return Err(Self::exit_arg_invalid(&format!( + "unrecognized flag {arg:?}" + ))); + } else { + argv.push_front(arg); + break; + } + } + } + } + + positional_zips.extend(argv.into_iter().map(|arg| arg.into())); + if stdin_flag && !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid(&format!( + "--stdin was provided at the same time as positional args {positional_zips:?}" + ))); + } + let input = if stdin_flag { + InputType::StreamingStdin + } else { + if positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "no zip input files were provided, and --stdin was not provided", + )); + } + InputType::ZipPaths(positional_zips) + }; + + let output = if stdout_flag { + OutputCollation::ConcatenateStdout + } else { + OutputCollation::Filesystem { + output_dir, + mkdir: mkdir_flag, + } + }; + + let entry_specs = EntrySpec::parse_extract_args(args)?; + + Ok(Self { + output, + entry_specs, + input, + }) + } + } + + impl crate::driver::ExecuteCommand for Extract { + fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { + crate::extract::execute_extract(err, self) + } + } +} diff --git a/cli/src/compress.rs b/cli/src/compress.rs new file mode 100644 index 000000000..a61fa2018 --- /dev/null +++ b/cli/src/compress.rs @@ -0,0 +1,494 @@ +use std::{ + fs, + io::{self, Cursor, IsTerminal, Seek, Write}, + mem, + path::Path, +}; + +use zip::{ + unstable::path_to_string, + write::{SimpleFileOptions, ZipWriter}, + CompressionMethod, ZIP64_BYTES_THR, +}; + +use crate::{args::compress::*, CommandError, OutputHandle, WrapCommandErr}; + +fn enter_recursive_dir_entries( + err: &mut impl Write, + base_rename: Option, + root: &Path, + writer: &mut ZipWriter, + 
options: SimpleFileOptions, +) -> Result<(), CommandError> { + let base_dirname: String = base_rename + .unwrap_or_else(|| path_to_string(root).into()) + .trim_end_matches('/') + .to_string(); + writeln!( + err, + "writing top-level directory entry for {base_dirname:?}" + ) + .unwrap(); + writer + .add_directory(&base_dirname, options) + .wrap_err_with(|| format!("error adding top-level directory entry {base_dirname}"))?; + + let mut readdir_stack: Vec<(fs::ReadDir, String)> = vec![( + fs::read_dir(root) + .wrap_err_with(|| format!("error reading directory contents for {}", root.display()))?, + base_dirname, + )]; + while let Some((mut readdir, top_component)) = readdir_stack.pop() { + if let Some(dir_entry) = readdir + .next() + .transpose() + .wrap_err("reading next dir entry")? + { + let mut components: Vec<&str> = readdir_stack.iter().map(|(_, s)| s.as_ref()).collect(); + components.push(&top_component); + + let entry_basename: String = dir_entry.file_name().into_string().map_err(|name| { + CommandError::InvalidArg(format!("failed to decode basename {name:?}")) + })?; + components.push(&entry_basename); + let full_path: String = components.join("/"); + readdir_stack.push((readdir, top_component)); + + let file_type = dir_entry.file_type().wrap_err_with(|| { + format!("failed to read file type for dir entry {dir_entry:?}") + })?; + if file_type.is_symlink() { + let target: String = path_to_string( + fs::read_link(dir_entry.path()) + .wrap_err_with(|| format!("failed to read symlink from {dir_entry:?}"))?, + ) + .into(); + if target.len() > ZIP64_BYTES_THR.try_into().unwrap() { + return Err(CommandError::InvalidArg(format!( + "symlink target for {full_path} is over {ZIP64_BYTES_THR} bytes (was: {})", + target.len() + ))); + } + writeln!( + err, + "writing recursive symlink entry with name {full_path:?} and target {target:?}" + ) + .unwrap(); + writer + .add_symlink(&full_path, &target, options) + .wrap_err_with(|| format!("error adding symlink from 
{full_path}->{target}"))?; + } else if file_type.is_file() { + writeln!(err, "writing recursive file entry with name {full_path:?}").unwrap(); + let mut f = fs::File::open(dir_entry.path()).wrap_err_with(|| { + format!("error opening file for {full_path} from dir entry {dir_entry:?}") + })?; + /* Get the length of the file before reading it and set large_file if needed. */ + let input_len: u64 = f + .metadata() + .wrap_err_with(|| format!("error reading file metadata for {f:?}"))? + .len(); + let maybe_large_file_options = if input_len > ZIP64_BYTES_THR { + writeln!( + err, + "temporarily ensuring .large_file(true) for current entry" + ) + .unwrap(); + options.large_file(true) + } else { + options + }; + writer + .start_file(&full_path, maybe_large_file_options) + .wrap_err_with(|| format!("error creating file entry for {full_path}"))?; + io::copy(&mut f, writer).wrap_err_with(|| { + format!("error copying content for {full_path} from file {f:?}") + })?; + } else { + assert!(file_type.is_dir()); + writeln!( + err, + "writing recursive directory entry with name {full_path:?}" + ) + .unwrap(); + writer + .add_directory(&full_path, options) + .wrap_err_with(|| format!("failed to create directory entry {full_path}"))?; + writeln!( + err, + "adding subdirectories depth-first for recursive directory entry {entry_basename:?}" + ).unwrap(); + let new_readdir = fs::read_dir(dir_entry.path()).wrap_err_with(|| { + format!("failed to read recursive directory contents from {dir_entry:?}") + })?; + readdir_stack.push((new_readdir, entry_basename)); + } + } + } + Ok(()) +} + +pub fn execute_compress(mut err: impl Write, args: Compress) -> Result<(), CommandError> { + let Compress { + output, + args, + positional_paths, + } = args; + + let (out, do_append) = match output { + OutputType::File { path, append } => { + if append { + writeln!( + err, + "reading compressed zip from output file path {path:?} for append" + ) + .unwrap(); + match fs::OpenOptions::new() + .read(true) + 
.write(true) + .create(false) + .open(&path) + { + Ok(f) => { + writeln!(err, "output zip file existed, appending").unwrap(); + (OutputHandle::File(f), true) + } + Err(e) if e.kind() == io::ErrorKind::NotFound => { + writeln!( + err, + "output zip file did not exist, creating new file instead of appending" + ) + .unwrap(); + let out = + OutputHandle::File(fs::File::create(&path).wrap_err_with(|| { + format!("failed to create new zip output file at {path:?}") + })?); + (out, false) + } + Err(e) => { + return Err(e).wrap_err_with(|| { + format!( + "unexpected error reading zip output file for append at {path:?}" + ) + }); + } + } + } else { + writeln!(err, "writing compressed zip to output file path {path:?}").unwrap(); + let out = OutputHandle::File(fs::File::create(&path).wrap_err_with(|| { + format!("failed to create output file at {}", path.display()) + })?); + (out, false) + } + } + OutputType::Stdout { allow_tty } => { + writeln!( + err, + "writing to stdout and buffering compressed zip in memory" + ) + .unwrap(); + if io::stdout().is_terminal() && !allow_tty { + /* TODO: maybe figure out some way to ensure --stdout is still the correct flag */ + return Err(CommandError::InvalidArg( + "stdout is a tty, but --stdout was not set".to_string(), + )); + } + let out = OutputHandle::InMem(Cursor::new(Vec::new())); + (out, false) + } + }; + let mut writer = if do_append { + ZipWriter::new_append(out) + .wrap_err("failed to initialize zip writer from existing zip file for append")? 
+ } else { + ZipWriter::new(out) + }; + + let mut options = SimpleFileOptions::default() + .compression_method(CompressionMethod::Deflated) + .large_file(false); + writeln!(err, "default zip entry options: {options:?}").unwrap(); + let mut last_name: Option = None; + let mut symlink_flag: bool = false; + + for arg in args.into_iter() { + match arg { + CompressionArg::CompressionMethod(method) => { + let method = match method { + CompressionMethodArg::Stored => CompressionMethod::Stored, + CompressionMethodArg::Deflate => CompressionMethod::Deflated, + #[cfg(feature = "deflate64")] + CompressionMethodArg::Deflate64 => CompressionMethod::Deflate64, + #[cfg(feature = "bzip2")] + CompressionMethodArg::Bzip2 => CompressionMethod::Bzip2, + #[cfg(feature = "zstd")] + CompressionMethodArg::Zstd => CompressionMethod::Zstd, + }; + writeln!(err, "setting compression method {method:?}").unwrap(); + options = options.compression_method(method); + } + CompressionArg::Level(CompressionLevel(level)) => { + writeln!(err, "setting compression level {level:?}").unwrap(); + options = options.compression_level(Some(level)); + } + CompressionArg::UnixPermissions(UnixPermissions(mode)) => { + writeln!(err, "setting file mode {mode:#o}").unwrap(); + options = options.unix_permissions(mode); + } + CompressionArg::LargeFile(large_file) => { + writeln!(err, "setting large file flag to {large_file:?}").unwrap(); + options = options.large_file(large_file); + } + CompressionArg::Name(name) => { + writeln!(err, "setting name of next entry to {name:?}").unwrap(); + if let Some(last_name) = last_name { + return Err(CommandError::InvalidArg(format!( + "got two names before an entry: {last_name} and {name}" + ))); + } + last_name = Some(name); + } + CompressionArg::Dir => { + writeln!(err, "writing dir entry").unwrap(); + if symlink_flag { + return Err(CommandError::InvalidArg( + "symlink flag provided before dir entry".to_string(), + )); + } + let dirname = last_name.take().ok_or_else(|| { + 
CommandError::InvalidArg("no name provided before dir entry".to_string()) + })?; + writer + .add_directory(&dirname, options) + .wrap_err_with(|| format!("failed to create dir entry {dirname}"))?; + } + CompressionArg::Symlink => { + writeln!(err, "setting symlink flag for next entry").unwrap(); + if symlink_flag { + /* TODO: make this a warning? */ + return Err(CommandError::InvalidArg( + "symlink flag provided twice before entry".to_string(), + )); + } + symlink_flag = true; + } + CompressionArg::Immediate(data) => { + let name = last_name.take().ok_or_else(|| { + CommandError::InvalidArg(format!( + "no name provided for immediate data {data:?}" + )) + })?; + /* It's highly unlikely any OS allows process args of this length, so even though + * we're using rust's env::args_os() and it would be very impressive for an attacker + * to get CLI args to overflow, it seems likely to be inefficient in any case, and + * very unlikely to be useful, so exit with a clear error. */ + if data.len() > ZIP64_BYTES_THR.try_into().unwrap() { + return Err(CommandError::InvalidArg(format!( + "length of immediate data argument is {}; use a file for inputs over {} bytes", + data.len(), + ZIP64_BYTES_THR + ))); + }; + if symlink_flag { + /* This is a symlink entry. */ + let target = data.into_string().map_err(|target| { + CommandError::InvalidArg(format!( + "failed to decode immediate symlink target {target:?}" + )) + })?; + writeln!( + err, + "writing immediate symlink entry with name {name:?} and target {target:?}" + ) + .unwrap(); + /* TODO: .add_symlink() should support OsString targets! */ + writer + .add_symlink(&name, &target, options) + .wrap_err_with(|| { + format!("failed to created symlink entry {name}->{target}") + })?; + symlink_flag = false; + } else { + /* This is a file entry. 
*/ + writeln!( + err, + "writing immediate file entry with name {name:?} and data {data:?}" + ) + .unwrap(); + let data = data.into_encoded_bytes(); + writer + .start_file(&name, options) + .wrap_err_with(|| format!("failed to create file entry {name}"))?; + writer.write_all(data.as_ref()).wrap_err_with(|| { + format!( + "failed writing immediate data of length {} to file entry {name}", + data.len() + ) + })?; + } + } + CompressionArg::FilePath(path) => { + let name = last_name + .take() + .unwrap_or_else(|| path_to_string(&path).into()); + if symlink_flag { + /* This is a symlink entry. */ + let target: String = + path_to_string(fs::read_link(&path).wrap_err_with(|| { + format!("failed to read symlink from path {}", path.display()) + })?) + .into(); + /* Similarly to immediate data arguments, we're simply not going to support + * symlinks over this length, which should be impossible anyway. */ + if target.len() > ZIP64_BYTES_THR.try_into().unwrap() { + return Err(CommandError::InvalidArg(format!( + "symlink target for {name} is over {ZIP64_BYTES_THR} bytes (was: {})", + target.len() + ))); + } + writeln!(err, "writing symlink entry from path {path:?} with name {name:?} and target {target:?}").unwrap(); + writer + .add_symlink(&name, &target, options) + .wrap_err_with(|| { + format!("failed to create symlink entry for {name}->{target}") + })?; + symlink_flag = false; + } else { + /* This is a file entry. */ + writeln!( + err, + "writing file entry from path {path:?} with name {name:?}" + ) + .unwrap(); + let mut f = fs::File::open(&path).wrap_err_with(|| { + format!("error opening file for {name} at {}", path.display()) + })?; + /* Get the length of the file before reading it and set large_file if needed. */ + let input_len: u64 = f + .metadata() + .wrap_err_with(|| format!("error reading file metadata for {f:?}"))? 
+ .len(); + writeln!(err, "entry is {input_len} bytes long").unwrap(); + let maybe_large_file_options = if input_len > ZIP64_BYTES_THR { + writeln!( + err, + "temporarily ensuring .large_file(true) for current entry" + ) + .unwrap(); + options.large_file(true) + } else { + options + }; + writer + .start_file(&name, maybe_large_file_options) + .wrap_err_with(|| format!("error creating file entry for {name}"))?; + io::copy(&mut f, &mut writer).wrap_err_with(|| { + format!("error copying content for {name} from file {f:?}") + })?; + } + } + CompressionArg::RecursiveDirPath(r) => { + if symlink_flag { + return Err(CommandError::InvalidArg( + "symlink flag provided before recursive dir entry".to_string(), + )); + } + writeln!( + err, + "writing recursive dir entries for path {r:?} with name {last_name:?}" + ) + .unwrap(); + enter_recursive_dir_entries(&mut err, last_name.take(), &r, &mut writer, options)?; + } + } + } + if symlink_flag { + return Err(CommandError::InvalidArg( + "symlink flag remaining after all entry flags processed".to_string(), + )); + } + if let Some(last_name) = last_name { + return Err(CommandError::InvalidArg(format!( + "name {last_name} remaining after all entry flags processed" + ))); + } + for pos_arg in positional_paths.into_iter() { + let file_type = fs::symlink_metadata(&pos_arg) + .wrap_err_with(|| format!("failed to read metadata from path {}", pos_arg.display()))? 
+ .file_type(); + if file_type.is_symlink() { + let target = fs::read_link(&pos_arg).wrap_err_with(|| { + format!("failed to read symlink content from {}", pos_arg.display()) + })?; + writeln!( + err, + "writing positional symlink entry with path {pos_arg:?} and target {target:?}" + ) + .unwrap(); + writer + .add_symlink_from_path(&pos_arg, &target, options) + .wrap_err_with(|| { + format!( + "failed to create symlink entry for {}->{}", + pos_arg.display(), + target.display() + ) + })?; + } else if file_type.is_file() { + writeln!(err, "writing positional file entry with path {pos_arg:?}").unwrap(); + let mut f = fs::File::open(&pos_arg) + .wrap_err_with(|| format!("failed to open file at {}", pos_arg.display()))?; + /* Get the length of the file before reading it and set large_file if needed. */ + let input_len: u64 = f + .metadata() + .wrap_err_with(|| format!("error reading file metadata for {f:?}"))? + .len(); + let maybe_large_file_options = if input_len > ZIP64_BYTES_THR { + writeln!( + err, + "temporarily ensuring .large_file(true) for current entry" + ) + .unwrap(); + options.large_file(true) + } else { + options + }; + writer + .start_file_from_path(&pos_arg, maybe_large_file_options) + .wrap_err_with(|| format!("failed to create file entry {}", pos_arg.display()))?; + io::copy(&mut f, &mut writer) + .wrap_err_with(|| format!("failed to copy file contents from {f:?}"))?; + } else { + assert!(file_type.is_dir()); + writeln!( + err, + "writing positional recursive dir entry for {pos_arg:?}" + ) + .unwrap(); + enter_recursive_dir_entries(&mut err, None, &pos_arg, &mut writer, options)?; + } + } + + let handle = writer + .finish() + .wrap_err("failed to write zip to output handle")?; + match handle { + OutputHandle::File(f) => { + let archive_len: u64 = f + .metadata() + .wrap_err_with(|| format!("failed reading metadata from file {f:?}"))? 
+ .len(); + writeln!(err, "file archive {f:?} was {archive_len} bytes").unwrap(); + mem::drop(f); /* Superfluous explicit drop. */ + } + OutputHandle::InMem(mut cursor) => { + let archive_len: u64 = cursor.position(); + writeln!(err, "in-memory archive was {archive_len} bytes").unwrap(); + cursor.rewind().wrap_err("failed to rewind cursor")?; + let mut stdout = io::stdout().lock(); + io::copy(&mut cursor, &mut stdout) + .wrap_err("failed to copy {archive_len} byte archive to stdout")?; + } + } + + Ok(()) +} diff --git a/cli/src/extract.rs b/cli/src/extract.rs new file mode 100644 index 000000000..0b1160182 --- /dev/null +++ b/cli/src/extract.rs @@ -0,0 +1,192 @@ +use std::{ + cell::{RefCell, RefMut}, + env, fs, + io::{self, Read, Seek, Write}, + marker::PhantomData, + mem, + path::PathBuf, +}; + +use zip::{read::ZipFile, result::ZipError, ZipArchive}; + +use crate::{args::extract::*, CommandError, WrapCommandErr}; + +trait EntryReceiver { + fn receive_entry(&self, entry: ZipFile, name: &str) -> Result<(), CommandError>; + fn finalize_entries(&self) -> Result<(), CommandError>; +} + +fn make_entry_receiver<'a>( + err: RefCell, + collation: OutputCollation, +) -> Result, CommandError> { + let ret: Box = match collation { + OutputCollation::ConcatenateStdout => Box::new(StdoutReceiver { + err, + stdout: io::stdout(), + }), + OutputCollation::Filesystem { output_dir, mkdir } => { + let output_dir = match output_dir { + Some(dir) => { + if mkdir { + fs::create_dir_all(&dir).wrap_err_with(|| { + format!("failed to create output directory {dir:?}") + })?; + } + dir + } + None => env::current_dir().wrap_err("failed to get current dir")?, + }; + Box::new(FilesystemReceiver { + err, + output_dir, + #[cfg(unix)] + perms_to_set: RefCell::new(Vec::new()), + }) + } + }; + Ok(ret) +} + +struct StdoutReceiver { + err: RefCell, + stdout: io::Stdout, +} + +impl EntryReceiver for StdoutReceiver +where + W: Write, +{ + fn receive_entry(&self, mut entry: ZipFile, name: &str) -> 
Result<(), CommandError> { + let mut err = self.err.borrow_mut(); + writeln!(err, "receiving entry {} with name {name}", entry.name()).unwrap(); + if entry.is_dir() { + writeln!(err, "entry is directory, ignoring").unwrap(); + } else if entry.is_symlink() { + writeln!(err, "entry is symlink, ignoring").unwrap(); + } else { + io::copy(&mut entry, &mut self.stdout.lock()) + .wrap_err_with(|| format!("failed to write entry {name} to stdout"))?; + } + Ok(()) + } + + fn finalize_entries(&self) -> Result<(), CommandError> { + Ok(()) + } +} + +struct FilesystemReceiver { + err: RefCell, + output_dir: PathBuf, + #[cfg(unix)] + perms_to_set: RefCell>, +} + +impl EntryReceiver for FilesystemReceiver +where + W: Write, +{ + fn receive_entry(&self, mut entry: ZipFile, name: &str) -> Result<(), CommandError> { + let mut err = self.err.borrow_mut(); + let full_output_path = self.output_dir.join(name); + writeln!( + err, + "receiving entry {} with name {name} and writing to path {full_output_path:?}", + entry.name() + ) + .unwrap(); + + #[cfg(unix)] + if let Some(mode) = entry.unix_mode() { + writeln!( + err, + "storing unix mode {mode} for path {full_output_path:?}" + ) + .unwrap(); + self.perms_to_set + .borrow_mut() + .push((full_output_path.clone(), mode)); + } + + if entry.is_dir() { + writeln!(err, "entry is directory, creating").unwrap(); + fs::create_dir_all(&full_output_path).wrap_err_with(|| { + format!("failed to create directory entry at {full_output_path:?}") + })?; + } else if entry.is_symlink() { + let mut target: Vec = Vec::with_capacity(entry.size().try_into().unwrap()); + entry.read_to_end(&mut target).wrap_err_with(|| { + format!( + "failed to read symlink target from zip archive entry {}", + entry.name() + ) + })?; + + #[cfg(unix)] + { + use std::{ + ffi::OsString, + os::unix::{ffi::OsStringExt, fs::symlink}, + }; + let target = OsString::from_vec(target); + writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); + symlink(&target, 
&full_output_path).wrap_err_with(|| { + format!( + "failed to create symlink at {full_output_path:?} with target {target:?}" + ) + })?; + } + #[cfg(not(unix))] + { + /* FIXME: non-unix symlink extraction not yet supported! */ + todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); + } + } else { + writeln!(err, "entry is file, creating").unwrap(); + if let Some(containing_dir) = full_output_path.parent() { + fs::create_dir_all(containing_dir).wrap_err_with(|| { + format!("failed to create parent dirs for file at {full_output_path:?}") + })?; + } else { + writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); + } + let mut outfile = fs::File::create(&full_output_path) + .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; + io::copy(&mut entry, &mut outfile).wrap_err_with(|| { + format!( + "failed to copy file contents from {} to {full_output_path:?}", + entry.name() + ) + })?; + } + Ok(()) + } + + fn finalize_entries(&self) -> Result<(), CommandError> { + #[cfg(unix)] + { + use std::{cmp::Reverse, os::unix::fs::PermissionsExt}; + + let mut perms_to_set = mem::take(&mut *self.perms_to_set.borrow_mut()); + perms_to_set.sort_unstable_by_key(|(path, _)| Reverse(path.clone())); + for (path, mode) in perms_to_set.into_iter() { + let perms = fs::Permissions::from_mode(mode); + fs::set_permissions(&path, perms.clone()) + .wrap_err_with(|| format!("error setting perms {perms:?} for path {path:?}"))?; + } + } + Ok(()) + } +} + +struct ZipFileInput { + err: RefCell, + inner: ZipArchive, +} + +pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), CommandError> { + writeln!(err, "asdf!").unwrap(); + dbg!(extract); + Ok(()) +} diff --git a/cli/src/lib.rs b/cli/src/lib.rs new file mode 100644 index 000000000..73d1c1397 --- /dev/null +++ b/cli/src/lib.rs @@ -0,0 +1,167 @@ +//! ??? 
+ +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +use std::{fs, io}; + +pub mod args; +pub mod compress; +pub mod extract; + +pub enum ErrHandle { + Output(W), + NoOutput, +} + +impl io::Write for ErrHandle +where + W: io::Write, +{ + fn write(&mut self, buf: &[u8]) -> io::Result { + match self { + Self::Output(w) => w.write(buf), + Self::NoOutput => Ok(buf.len()), + } + } + + fn flush(&mut self) -> io::Result<()> { + match self { + Self::Output(w) => w.flush(), + Self::NoOutput => Ok(()), + } + } +} + +pub enum OutputHandle { + File(fs::File), + InMem(io::Cursor>), +} + +impl io::Read for OutputHandle { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + Self::File(f) => f.read(buf), + Self::InMem(c) => c.read(buf), + } + } +} + +impl io::Write for OutputHandle { + fn write(&mut self, buf: &[u8]) -> io::Result { + match self { + Self::File(f) => f.write(buf), + Self::InMem(c) => c.write(buf), + } + } + + fn flush(&mut self) -> io::Result<()> { + match self { + Self::File(f) => f.flush(), + Self::InMem(c) => c.flush(), + } + } +} + +impl io::Seek for OutputHandle { + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + match self { + Self::File(f) => f.seek(pos), + Self::InMem(c) => c.seek(pos), + } + } +} + +#[derive(Debug)] +pub enum CommandError { + InvalidArg(String), + Io(String, io::Error), + Zip(String, zip::result::ZipError), +} + +pub trait WrapCommandErr: Sized { + fn wrap_err(self, context: &str) -> Result { + self.wrap_err_with(|| context.to_string()) + } + fn wrap_err_with(self, f: impl FnOnce() -> String) -> Result; +} + +impl WrapCommandErr for Result { + fn wrap_err_with(self, f: impl FnOnce() -> String) -> Result { + self.map_err(|e| CommandError::Io(f(), e)) + } +} + +impl WrapCommandErr for Result { + fn wrap_err_with(self, f: impl FnOnce() -> String) -> Result { + self.map_err(|e| CommandError::Zip(f(), e)) + } +} + +pub mod driver { + use std::env; + use std::io::{self, Write}; + use std::process; + + use 
super::args::{ArgParseError, CommandFormat, ZipCli, ZipCommand}; + use super::{CommandError, ErrHandle}; + + pub trait ExecuteCommand: CommandFormat { + fn execute(self, err: impl Write) -> Result<(), CommandError>; + + fn do_main(self, err: impl Write) -> ! + where + Self: Sized, + { + match self.execute(err) { + Ok(()) => process::exit(ZipCli::NON_FAILURE_EXIT_CODE), + Err(e) => match e { + CommandError::InvalidArg(msg) => { + let msg = Self::generate_brief_help_text(&msg); + let _ = io::stderr().write_all(msg.as_bytes()); + process::exit(ZipCli::ARGV_PARSE_FAILED_EXIT_CODE); + } + CommandError::Io(context, e) => { + let msg = format!("i/o error: {context}: {e}\n"); + let _ = io::stderr().write_all(msg.as_bytes()); + process::exit(ZipCli::INTERNAL_ERROR_EXIT_CODE); + } + CommandError::Zip(context, e) => { + let msg = format!("zip error: {context}: {e}\n"); + let _ = io::stderr().write_all(msg.as_bytes()); + process::exit(ZipCli::INTERNAL_ERROR_EXIT_CODE); + } + }, + } + } + } + + pub fn main() { + let ZipCli { verbose, command } = match ZipCli::parse_argv(env::args_os()) { + Ok(cli) => cli, + Err(e) => match e { + ArgParseError::StdoutMessage(msg) => { + io::stdout() + .write_all(msg.as_bytes()) + .expect("couldn't write message to stdout"); + process::exit(ZipCli::NON_FAILURE_EXIT_CODE); + } + ArgParseError::StderrMessage(msg) => { + /* If we can't write anything to stderr, no use aborting, so just exit. 
*/ + let _ = io::stderr().write_all(msg.as_bytes()); + process::exit(ZipCli::ARGV_PARSE_FAILED_EXIT_CODE); + } + }, + }; + let err = if verbose { + ErrHandle::Output(io::stderr()) + } else { + ErrHandle::NoOutput + }; + + match command { + ZipCommand::Info => todo!("info command not implemented"), + ZipCommand::Extract(extract) => extract.do_main(err), + ZipCommand::Compress(compress) => compress.do_main(err), + } + } +} diff --git a/cli/src/main.rs b/cli/src/main.rs new file mode 100644 index 000000000..95fae2ac9 --- /dev/null +++ b/cli/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + zip_cli::driver::main(); +} diff --git a/fuzz/fuzz_targets/fuzz_write.rs b/fuzz/fuzz_targets/fuzz_write.rs index 53653a60b..c3cc9089a 100755 --- a/fuzz/fuzz_targets/fuzz_write.rs +++ b/fuzz/fuzz_targets/fuzz_write.rs @@ -1,12 +1,12 @@ #![no_main] use arbitrary::Arbitrary; -use core::fmt::{Debug}; +use core::fmt::Debug; use libfuzzer_sys::fuzz_target; use replace_with::replace_with_or_abort; use std::fmt::{Arguments, Formatter, Write}; -use std::io::{Cursor, Seek, SeekFrom}; use std::io::Write as IoWrite; +use std::io::{Cursor, Seek, SeekFrom}; use std::path::PathBuf; use tikv_jemallocator::Jemalloc; use zip::result::{ZipError, ZipResult}; @@ -93,22 +93,36 @@ fn do_operation<'k>( flush_on_finish_file: bool, files_added: &mut usize, stringifier: &mut impl Write, - panic_on_error: bool + panic_on_error: bool, ) -> Result<(), Box> { writer.set_flush_on_finish_file(flush_on_finish_file); - let FileOperation { basic, mut path, reopen} = operation; + let FileOperation { + basic, + mut path, + reopen, + } = operation; match basic { BasicFileOperation::WriteNormalFile { - contents, mut options, .. + contents, + mut options, + .. 
} => { let uncompressed_size = contents.iter().map(|chunk| chunk.len()).sum::(); if uncompressed_size >= u32::MAX as usize { options = options.large_file(true); } if options == FullFileOptions::default() { - writeln!(stringifier, "writer.start_file_from_path({:?}, Default::default())?;", path)?; + writeln!( + stringifier, + "writer.start_file_from_path({:?}, Default::default())?;", + path + )?; } else { - writeln!(stringifier, "writer.start_file_from_path({:?}, {:?})?;", path, options)?; + writeln!( + stringifier, + "writer.start_file_from_path({:?}, {:?})?;", + path, options + )?; } writer.start_file_from_path(&*path, options)?; for chunk in contents.iter() { @@ -118,12 +132,20 @@ fn do_operation<'k>( *files_added += 1; } BasicFileOperation::WriteDirectory(options) => { - writeln!(stringifier, "writer.add_directory_from_path(&{:?}, {:?})?;", path, options)?; + writeln!( + stringifier, + "writer.add_directory_from_path(&{:?}, {:?})?;", + path, options + )?; writer.add_directory_from_path(&*path, options.to_owned())?; *files_added += 1; } BasicFileOperation::WriteSymlinkWithTarget { target, options } => { - writeln!(stringifier, "writer.add_symlink_from_path(&{:?}, {:?}, {:?});", path, target, options)?; + writeln!( + stringifier, + "writer.add_symlink_from_path(&{:?}, {:?}, {:?});", + path, target, options + )?; writer.add_symlink_from_path(&*path, target, options.to_owned())?; *files_added += 1; } @@ -132,8 +154,20 @@ fn do_operation<'k>( return Ok(()); }; deduplicate_paths(&mut path, &base_path); - do_operation(writer, *base, false, flush_on_finish_file, files_added, stringifier, panic_on_error)?; - writeln!(stringifier, "writer.shallow_copy_file_from_path({:?}, {:?});", base_path, path)?; + do_operation( + writer, + *base, + false, + flush_on_finish_file, + files_added, + stringifier, + panic_on_error, + )?; + writeln!( + stringifier, + "writer.shallow_copy_file_from_path({:?}, {:?});", + base_path, path + )?; writer.shallow_copy_file_from_path(&*base_path, 
&*path)?; *files_added += 1; } @@ -142,38 +176,65 @@ fn do_operation<'k>( return Ok(()); }; deduplicate_paths(&mut path, &base_path); - do_operation(writer, *base, false, flush_on_finish_file, files_added, stringifier, panic_on_error)?; - writeln!(stringifier, "writer.deep_copy_file_from_path({:?}, {:?});", base_path, path)?; + do_operation( + writer, + *base, + false, + flush_on_finish_file, + files_added, + stringifier, + panic_on_error, + )?; + writeln!( + stringifier, + "writer.deep_copy_file_from_path({:?}, {:?});", + base_path, path + )?; writer.deep_copy_file_from_path(&*base_path, path)?; *files_added += 1; } - BasicFileOperation::MergeWithOtherFile { operations, initial_junk } => { + BasicFileOperation::MergeWithOtherFile { + operations, + initial_junk, + } => { if initial_junk.is_empty() { - writeln!(stringifier, "let sub_writer = {{\n\ - let mut writer = ZipWriter::new(Cursor::new(Vec::new()));")?; + writeln!( + stringifier, + "let sub_writer = {{\n\ + let mut writer = ZipWriter::new(Cursor::new(Vec::new()));" + )?; } else { - writeln!(stringifier, - "let sub_writer = {{\n\ + writeln!( + stringifier, + "let sub_writer = {{\n\ let mut initial_junk = Cursor::new(vec!{:?});\n\ initial_junk.seek(SeekFrom::End(0))?; - let mut writer = ZipWriter::new(initial_junk);", initial_junk)?; + let mut writer = ZipWriter::new(initial_junk);", + initial_junk + )?; } let mut initial_junk = Cursor::new(initial_junk.into_vec()); initial_junk.seek(SeekFrom::End(0))?; let mut other_writer = zip::ZipWriter::new(initial_junk); let mut inner_files_added = 0; - operations.into_vec().into_iter().for_each(|(operation, abort)| { - let _ = do_operation( - &mut other_writer, - operation, - abort, - false, - &mut inner_files_added, - stringifier, - panic_on_error - ); - }); - writeln!(stringifier, "writer\n}};\nwriter.merge_archive(sub_writer.finish_into_readable()?)?;")?; + operations + .into_vec() + .into_iter() + .for_each(|(operation, abort)| { + let _ = do_operation( + &mut 
other_writer, + operation, + abort, + false, + &mut inner_files_added, + stringifier, + panic_on_error, + ); + }); + writeln!( + stringifier, + "writer\n}};\nwriter.merge_archive(sub_writer.finish_into_readable()?)?;" + )?; writer.merge_archive(other_writer.finish_into_readable()?)?; *files_added += inner_files_added; } @@ -193,15 +254,19 @@ fn do_operation<'k>( match reopen { ReopenOption::DoNotReopen => { writeln!(stringifier, "writer")?; - return Ok(()) - }, + return Ok(()); + } ReopenOption::ViaFinish => { let old_comment = writer.get_raw_comment().to_owned(); - writeln!(stringifier, "let mut writer = ZipWriter::new_append(writer.finish()?)?;")?; + writeln!( + stringifier, + "let mut writer = ZipWriter::new_append(writer.finish()?)?;" + )?; replace_with_or_abort(writer, |old_writer: zip::ZipWriter>>| { (|| -> ZipResult>>> { zip::ZipWriter::new_append(old_writer.finish()?) - })().unwrap_or_else(|_| { + })() + .unwrap_or_else(|_| { if panic_on_error { panic!("Failed to create new ZipWriter") } @@ -214,11 +279,15 @@ fn do_operation<'k>( } ReopenOption::ViaFinishIntoReadable => { let old_comment = writer.get_raw_comment().to_owned(); - writeln!(stringifier, "let mut writer = ZipWriter::new_append(writer.finish()?)?;")?; + writeln!( + stringifier, + "let mut writer = ZipWriter::new_append(writer.finish()?)?;" + )?; replace_with_or_abort(writer, |old_writer| { (|| -> ZipResult>>> { zip::ZipWriter::new_append(old_writer.finish()?) 
- })().unwrap_or_else(|_| { + })() + .unwrap_or_else(|_| { if panic_on_error { panic!("Failed to create new ZipWriter") } @@ -231,7 +300,7 @@ fn do_operation<'k>( Ok(()) } -impl <'k> FuzzTestCase<'k> { +impl<'k> FuzzTestCase<'k> { fn execute(self, stringifier: &mut impl Write, panic_on_error: bool) -> ZipResult<()> { let mut initial_junk = Cursor::new(self.initial_junk.into_vec()); initial_junk.seek(SeekFrom::End(0))?; @@ -253,7 +322,7 @@ impl <'k> FuzzTestCase<'k> { self.flush_on_finish_file, &mut files_added, stringifier, - panic_on_error + panic_on_error, ); } if final_reopen { @@ -265,14 +334,21 @@ impl <'k> FuzzTestCase<'k> { } } -impl <'k> Debug for FuzzTestCase<'k> { +impl<'k> Debug for FuzzTestCase<'k> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { if self.initial_junk.is_empty() { - writeln!(f, "let mut writer = ZipWriter::new(Cursor::new(Vec::new()));")?; + writeln!( + f, + "let mut writer = ZipWriter::new(Cursor::new(Vec::new()));" + )?; } else { - writeln!(f, "let mut initial_junk = Cursor::new(vec!{:?});\n\ + writeln!( + f, + "let mut initial_junk = Cursor::new(vec!{:?});\n\ initial_junk.seek(SeekFrom::End(0))?;\n\ - let mut writer = ZipWriter::new(initial_junk);", &self.initial_junk)?; + let mut writer = ZipWriter::new(initial_junk);", + &self.initial_junk + )?; } let _ = self.clone().execute(f, false); Ok(()) diff --git a/src/write.rs b/src/write.rs index 48276cb9d..9ba9a63a6 100644 --- a/src/write.rs +++ b/src/write.rs @@ -253,6 +253,7 @@ impl<'a> arbitrary::Arbitrary<'a> for EncryptWith<'a> { } /// Metadata for a file to be written +/* TODO: add accessors for this data as well so options can be introspected! */ #[derive(Clone, Debug, Copy, Eq, PartialEq)] pub struct FileOptions<'k, T: FileOptionExtension> { pub(crate) compression_method: CompressionMethod, @@ -1396,6 +1397,7 @@ impl ZipWriter { /// implementations may materialize a symlink as a regular file, possibly with the /// content incorrectly set to the symlink target. 
For maximum portability, consider /// storing a regular file instead. + /* TODO: support OsStr instead of just str, for non-unicode paths. */ pub fn add_symlink( &mut self, name: N, From 0039b063039ce74b4c8a594e02625fa0e40d6ab9 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 22 Aug 2024 06:06:19 -0400 Subject: [PATCH 02/68] implement matching logic except for --match --- cli/src/args.rs | 43 ++++++++++++++++++++++---- cli/src/extract.rs | 75 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 6 deletions(-) diff --git a/cli/src/args.rs b/cli/src/args.rs index 91070ecdf..6eeb80953 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -674,6 +674,8 @@ pub mod info { pub mod extract { use super::{ArgParseError, CommandFormat}; + use zip::CompressionMethod; + use std::{collections::VecDeque, ffi::OsString, mem, path::PathBuf}; #[derive(Debug)] @@ -821,11 +823,8 @@ pub mod extract { #[derive(Debug)] pub enum EntryType { - /// file File, - /// dir Dir, - /// symlink Symlink, } @@ -842,13 +841,11 @@ pub mod extract { #[derive(Debug, PartialEq, Eq)] pub enum NonSpecificCompressionMethodArg { - /// any Any, - /// known Known, } - #[derive(Debug, PartialEq, Eq)] + #[derive(Debug, PartialEq, Eq, Copy, Clone)] pub enum SpecificCompressionMethodArg { Stored, Deflated, @@ -864,6 +861,40 @@ pub mod extract { Xz, } + impl SpecificCompressionMethodArg { + pub const KNOWN_COMPRESSION_METHODS: &[CompressionMethod] = &[ + CompressionMethod::Stored, + CompressionMethod::Deflated, + #[cfg(feature = "deflate64")] + CompressionMethod::Deflate64, + #[cfg(feature = "bzip2")] + CompressionMethod::Bzip2, + #[cfg(feature = "zstd")] + CompressionMethod::Zstd, + #[cfg(feature = "lzma")] + CompressionMethod::Lzma, + #[cfg(feature = "xz")] + CompressionMethod::Xz, + ]; + + pub fn translate_to_zip(self) -> CompressionMethod { + match self { + Self::Stored => CompressionMethod::Stored, + Self::Deflated => 
CompressionMethod::Deflated, + #[cfg(feature = "deflate64")] + Self::Deflate64 => CompressionMethod::Deflate64, + #[cfg(feature = "bzip2")] + Self::Bzip2 => CompressionMethod::Bzip2, + #[cfg(feature = "zstd")] + Self::Zstd => CompressionMethod::Zstd, + #[cfg(feature = "lzma")] + Self::Lzma => CompressionMethod::Lzma, + #[cfg(feature = "xz")] + Self::Xz => CompressionMethod::Xz, + } + } + } + #[derive(Debug, PartialEq, Eq)] pub enum CompressionMethodArg { NonSpecific(NonSpecificCompressionMethodArg), diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 0b1160182..cb16b315b 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -180,6 +180,81 @@ where } } +struct Matcher { + err: RefCell, + expr: MatchExpression, +} + +impl Matcher +where + W: Write, +{ + pub fn evaluate(&self, entry: &ZipFile) -> Result { + let Self { err, expr } = self; + Self::recursive_match(err, &expr, entry) + } + + fn recursive_match( + err: &RefCell, + expr: &MatchExpression, + entry: &ZipFile, + ) -> Result { + match expr { + MatchExpression::PrimitivePredicate(predicate) => match predicate { + Predicate::Trivial(trivial) => match trivial { + TrivialPredicate::True => Ok(true), + TrivialPredicate::False => Ok(false), + }, + Predicate::EntryType(entry_type) => match entry_type { + EntryType::File => Ok(!entry.is_dir() && !entry.is_symlink()), + EntryType::Dir => Ok(entry.is_dir()), + EntryType::Symlink => Ok(entry.is_symlink()), + }, + Predicate::CompressionMethod(method_arg) => match method_arg { + CompressionMethodArg::NonSpecific(nonspecific_arg) => match nonspecific_arg { + NonSpecificCompressionMethodArg::Any => Ok(true), + NonSpecificCompressionMethodArg::Known => { + Ok(SpecificCompressionMethodArg::KNOWN_COMPRESSION_METHODS + .contains(&entry.compression())) + } + }, + CompressionMethodArg::Specific(specific_arg) => { + Ok(specific_arg.translate_to_zip() == entry.compression()) + } + }, + Predicate::DepthLimit(limit_arg) => match limit_arg { + DepthLimitArg::Max(max) => { + 
let max: usize = (*max).into(); + Ok(entry.name().split('/').count() <= max) + } + DepthLimitArg::Min(min) => { + let min: usize = (*min).into(); + Ok(entry.name().split('/').count() >= min) + } + }, + Predicate::Match(match_arg) => todo!("{match_arg:?}"), + }, + MatchExpression::Negated(inner) => { + Self::recursive_match(err, inner.as_ref(), entry).map(|result| !result) + } + MatchExpression::And { + explicit: _, + left, + right, + } => { + /* Short-circuiting, so do left first. */ + Ok(Self::recursive_match(err, left.as_ref(), entry)? + && Self::recursive_match(err, right.as_ref(), entry)?) + } + MatchExpression::Or { left, right } => { + Ok(Self::recursive_match(err, left.as_ref(), entry)? + || Self::recursive_match(err, right.as_ref(), entry)?) + } + MatchExpression::Grouped(inner) => Self::recursive_match(err, inner.as_ref(), entry), + } + } +} + struct ZipFileInput { err: RefCell, inner: ZipArchive, From 5bd3cd6c5c01f9ddf76844871534dab754760f50 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 10:08:03 -0400 Subject: [PATCH 03/68] FINALLY fix input zips iteration --- cli/src/args.rs | 1 - cli/src/extract.rs | 146 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 126 insertions(+), 21 deletions(-) diff --git a/cli/src/args.rs b/cli/src/args.rs index 6eeb80953..072a909e8 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -1417,7 +1417,6 @@ content transform. 
add -x/--extract to construct a complete entry spec" #[derive(Debug)] pub enum InputType { - /* FIXME: not yet supported */ StreamingStdin, ZipPaths(Vec), } diff --git a/cli/src/extract.rs b/cli/src/extract.rs index cb16b315b..ab19a059e 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,13 +1,16 @@ use std::{ - cell::{RefCell, RefMut}, + cell::RefCell, env, fs, - io::{self, Read, Seek, Write}, - marker::PhantomData, + io::{self, Read, Write}, mem, - path::PathBuf, + path::{Path, PathBuf}, + rc::Rc, }; -use zip::{read::ZipFile, result::ZipError, ZipArchive}; +use zip::{ + read::{read_zipfile_from_stream, ZipFile}, + ZipArchive, +}; use crate::{args::extract::*, CommandError, WrapCommandErr}; @@ -17,14 +20,11 @@ trait EntryReceiver { } fn make_entry_receiver<'a>( - err: RefCell, + err: Rc>, collation: OutputCollation, ) -> Result, CommandError> { let ret: Box = match collation { - OutputCollation::ConcatenateStdout => Box::new(StdoutReceiver { - err, - stdout: io::stdout(), - }), + OutputCollation::ConcatenateStdout => Box::new(StdoutReceiver::new(err)), OutputCollation::Filesystem { output_dir, mkdir } => { let output_dir = match output_dir { Some(dir) => { @@ -37,22 +37,26 @@ fn make_entry_receiver<'a>( } None => env::current_dir().wrap_err("failed to get current dir")?, }; - Box::new(FilesystemReceiver { - err, - output_dir, - #[cfg(unix)] - perms_to_set: RefCell::new(Vec::new()), - }) + Box::new(FilesystemReceiver::new(err, output_dir)) } }; Ok(ret) } struct StdoutReceiver { - err: RefCell, + err: Rc>, stdout: io::Stdout, } +impl StdoutReceiver { + pub fn new(err: Rc>) -> Self { + Self { + err, + stdout: io::stdout(), + } + } +} + impl EntryReceiver for StdoutReceiver where W: Write, @@ -77,12 +81,23 @@ where } struct FilesystemReceiver { - err: RefCell, + err: Rc>, output_dir: PathBuf, #[cfg(unix)] perms_to_set: RefCell>, } +impl FilesystemReceiver { + pub fn new(err: Rc>, output_dir: PathBuf) -> Self { + Self { + err, + output_dir, + #[cfg(unix)] + 
perms_to_set: RefCell::new(Vec::new()), + } + } +} + impl EntryReceiver for FilesystemReceiver where W: Write, @@ -181,7 +196,7 @@ where } struct Matcher { - err: RefCell, + err: Rc>, expr: MatchExpression, } @@ -255,9 +270,100 @@ where } } +trait IterateEntries { + fn next_entry(&mut self) -> Result, CommandError>; +} + +struct StdinInput { + err: Rc>, + inner: io::Stdin, +} + +impl StdinInput { + pub fn new(err: Rc>) -> Self { + Self { + err, + inner: io::stdin(), + } + } +} + +impl IterateEntries for StdinInput { + fn next_entry(&mut self) -> Result, CommandError> { + read_zipfile_from_stream(&mut self.inner).wrap_err("failed to read zip entries from stdin") + } +} + +#[derive(Debug)] struct ZipFileInput { - err: RefCell, + err: Rc>, inner: ZipArchive, + file_counter: usize, +} + +impl ZipFileInput { + pub fn new(err: Rc>, inner: ZipArchive) -> Self { + Self { + err, + inner: inner, + file_counter: 0, + } + } + + pub fn remaining(&self) -> usize { + self.inner.len() - self.file_counter + } + + pub fn none_left(&self) -> bool { + self.remaining() == 0 + } +} + +impl IterateEntries for ZipFileInput { + fn next_entry(&mut self) -> Result, CommandError> { + if self.none_left() { + return Ok(None); + } + let prev_counter = self.file_counter; + self.file_counter += 1; + self.inner + .by_index(prev_counter) + .map(Some) + .wrap_err_with(|| format!("failed to read entry #{prev_counter} from zip",)) + } +} + +struct AllInputZips { + err: Rc>, + zips_todo: Vec>, +} + +impl AllInputZips { + pub fn new( + err: Rc>, + zip_paths: impl IntoIterator>, + ) -> Result { + let zips_todo = zip_paths + .into_iter() + .map(|p| { + fs::File::open(p.as_ref()) + .wrap_err_with(|| { + format!("failed to open zip input file path {:?}", p.as_ref()) + }) + .and_then(|f| { + ZipArchive::new(f).wrap_err_with(|| { + format!("failed to create zip archive for file {:?}", p.as_ref()) + }) + }) + .map(|archive| ZipFileInput::new(Rc::clone(&err), archive)) + }) + .collect::, CommandError>>()?; + 
Ok(Self { err, zips_todo }) + } + + pub fn iter_zips(self) -> impl IntoIterator> { + self.zips_todo.into_iter() + } } pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), CommandError> { From fd539cc904232f9c2c7b8039b9ee19c303ac5110 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 10:40:10 -0400 Subject: [PATCH 04/68] implement a couple basic transforms --- cli/src/args.rs | 7 ++--- cli/src/extract.rs | 75 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/cli/src/args.rs b/cli/src/args.rs index 072a909e8..8adbc7443 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -1313,7 +1313,7 @@ pub mod extract { #[derive(Debug)] pub enum BasicTransform { StripComponents(u8), - AddPrefix(PathBuf), + AddPrefix(String), } #[derive(Debug)] @@ -1845,7 +1845,7 @@ Positional paths: ))); } b"--add-prefix" => { - let prefix: PathBuf = argv + let prefix = argv .pop_front() .ok_or_else(|| { Self::exit_arg_invalid("no argument provided for --add-prefix") @@ -1855,8 +1855,7 @@ Positional paths: Self::exit_arg_invalid(&format!( "invalid unicode provided for --add-prefix: {prefix:?}" )) - })? 
- .into(); + })?; args.push(ExtractArg::NameTransform(NameTransform::Basic( BasicTransform::AddPrefix(prefix), ))); diff --git a/cli/src/extract.rs b/cli/src/extract.rs index ab19a059e..dfa1984f2 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,5 +1,7 @@ use std::{ + borrow::Cow, cell::RefCell, + collections::VecDeque, env, fs, io::{self, Read, Write}, mem, @@ -200,6 +202,12 @@ struct Matcher { expr: MatchExpression, } +impl Matcher { + pub fn new(err: Rc>, expr: MatchExpression) -> Self { + Self { err, expr } + } +} + impl Matcher where W: Write, @@ -270,6 +278,73 @@ where } } +struct Transformer { + err: Rc>, + trans: NameTransform, +} + +impl Transformer { + pub fn new(err: Rc>, trans: NameTransform) -> Self { + Self { err, trans } + } +} + +impl Transformer +where + W: Write, +{ + pub fn evaluate<'s>(&self, name: &'s str) -> Result, CommandError> { + match self.trans { + NameTransform::Trivial(TrivialTransform::Identity) => Ok(Cow::Borrowed(name)), + NameTransform::Basic(basic_trans) => match basic_trans { + BasicTransform::StripComponents(num_components_to_strip) => { + /* If no directory components, then nothing to strip. */ + if !name.contains('/') { + return Ok(Cow::Borrowed(name)); + } + /* We allow stripping 0 components, which does nothing. */ + if num_components_to_strip == 0 { + return Ok(Cow::Borrowed(name)); + } + /* Pop off prefix components until only one is left or we have stripped all the + * requested prefix components. */ + let mut num_components_to_strip: usize = (*num_components_to_strip).into(); + let mut separator_indices: VecDeque = + name.match_indices('/').map(|(i, _)| i).collect(); + debug_assert!(separator_indices.len() > 0); + /* Always keep the final separator, as regardless of how many we strip, we want + * to keep the basename in all cases. 
*/ + while separator_indices.len() > 1 && num_components_to_strip > 0 { + let _ = separator_indices.pop_front().unwrap(); + num_components_to_strip -= 1; + } + debug_assert!(separator_indices.len() > 0); + let leftmost_remaining_separator_index: usize = + separator_indices.pop_front().unwrap(); + Ok(Cow::Borrowed( + name[(leftmost_remaining_separator_index + 1)..], + )) + } + BasicTransform::AddPrefix(prefix_to_add) => { + /* We allow an empty prefix, which means to do nothing. */ + if prefix_to_add.is_empty() { + return Ok(Cow::Borrowed(name)); + } + Ok(Cow::Owned(format!("{}/{}", prefix_to_add, name))) + } + }, + NameTransform::Complex(complex_trans) => match complex_trans { + ComplexTransform::RemovePrefix(remove_prefix_arg) => { + todo!("impl remove prefix: {:?}", remove_prefix_arg) + } + ComplexTransform::Transform(transform_arg) => { + todo!("impl transform: {:?}", transform_arg) + } + }, + } + } +} + trait IterateEntries { fn next_entry(&mut self) -> Result, CommandError>; } From 05985980f8be1af4d086dd9df58a6caf713394cc Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 10:53:52 -0400 Subject: [PATCH 05/68] add terrible ContentTransformer --- cli/src/args.rs | 3 ++- cli/src/extract.rs | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/cli/src/args.rs b/cli/src/args.rs index 8adbc7443..7adf992c4 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -681,7 +681,8 @@ pub mod extract { #[derive(Debug)] pub enum ContentTransform { Extract, - /* FIXME: not yet supported */ + /* FIXME: not yet supported -- could be done by exposing ZipFile::take_raw_reader(), but + * should probably just refactor extract.rs to avoid the need for that. 
*/ Raw, LogToStderr, } diff --git a/cli/src/extract.rs b/cli/src/extract.rs index dfa1984f2..663cfc59f 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -345,6 +345,39 @@ where } } +enum EntryContent<'a> { + Decompressed(ZipFile<'a>), + /* See ContentTransform::Raw -- need to refactor this file to avoid the need to convert + * a ZipFile into a Raw after it's already constructed. */ + #[allow(dead_code)] + Raw(ZipFile<'a>), + LogToStderr(ZipFile<'a>), +} + +struct ContentTransformer { + err: Rc>, + arg: ContentTransform, +} + +impl ContentTransformer { + pub fn new(err: Rc>, arg: ContentTransform) -> Self { + Self { err, arg } + } +} + +impl ContentTransformer +where + W: Write, +{ + pub fn transform_matched_entry<'a>(&self, entry: ZipFile<'a>) -> EntryContent<'a> { + match self.arg { + ContentTransform::Extract => EntryContent::Decompressed(entry), + ContentTransform::Raw => unreachable!("this has not been implemented"), + ContentTransform::LogToStderr => EntryContent::LogToStderr(entry), + } + } +} + trait IterateEntries { fn next_entry(&mut self) -> Result, CommandError>; } From 0c78496bf6eb01fc384d4a6174d9b8be2da2d501 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 11:01:58 -0400 Subject: [PATCH 06/68] add entry spec transformer --- cli/src/extract.rs | 47 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 663cfc59f..bbe50c7ed 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -294,7 +294,7 @@ where W: Write, { pub fn evaluate<'s>(&self, name: &'s str) -> Result, CommandError> { - match self.trans { + match &self.trans { NameTransform::Trivial(TrivialTransform::Identity) => Ok(Cow::Borrowed(name)), NameTransform::Basic(basic_trans) => match basic_trans { BasicTransform::StripComponents(num_components_to_strip) => { @@ -303,7 +303,7 @@ where return 
Ok(Cow::Borrowed(name)); } /* We allow stripping 0 components, which does nothing. */ - if num_components_to_strip == 0 { + if *num_components_to_strip == 0 { return Ok(Cow::Borrowed(name)); } /* Pop off prefix components until only one is left or we have stripped all the @@ -322,7 +322,7 @@ where let leftmost_remaining_separator_index: usize = separator_indices.pop_front().unwrap(); Ok(Cow::Borrowed( - name[(leftmost_remaining_separator_index + 1)..], + &name[(leftmost_remaining_separator_index + 1)..], )) } BasicTransform::AddPrefix(prefix_to_add) => { @@ -378,6 +378,45 @@ where } } +struct EntrySpecTransformer { + err: Rc>, + matcher: Option>, + name_transformers: Vec>, + content: ContentTransformer, +} + +impl EntrySpecTransformer { + pub fn new( + err: Rc>, + match_expr: Option, + name_transforms: impl IntoIterator, + content: ContentTransform, + ) -> Self { + let matcher = match_expr.map(|expr| Matcher::new(err.clone(), expr)); + let name_transformers: Vec<_> = name_transforms + .into_iter() + .map(|trans| Transformer::new(err.clone(), trans)) + .collect(); + let content = ContentTransformer::new(err.clone(), content); + Self { + err, + matcher, + name_transformers, + content, + } + } + + pub fn empty(err: Rc>) -> Self { + let content = ContentTransformer::new(err.clone(), ContentTransform::Extract); + Self { + err, + matcher: None, + name_transformers: Vec::new(), + content, + } + } +} + trait IterateEntries { fn next_entry(&mut self) -> Result, CommandError>; } @@ -476,6 +515,8 @@ impl AllInputZips { pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), CommandError> { writeln!(err, "asdf!").unwrap(); + dbg!(extract); + Ok(()) } From 459ee829904627761eb3ad939dafa3285d1a1bff Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 12:25:59 -0400 Subject: [PATCH 07/68] give up and use an unsafecell --- cli/src/extract.rs | 83 ++++++++++++++++++++++++++++++++++++---------- 1 
file changed, 65 insertions(+), 18 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index bbe50c7ed..daadefedc 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,6 +1,6 @@ use std::{ borrow::Cow, - cell::RefCell, + cell::{RefCell, UnsafeCell}, collections::VecDeque, env, fs, io::{self, Read, Write}, @@ -386,18 +386,18 @@ struct EntrySpecTransformer { } impl EntrySpecTransformer { - pub fn new( - err: Rc>, - match_expr: Option, - name_transforms: impl IntoIterator, - content: ContentTransform, - ) -> Self { + pub fn new(err: Rc>, entry_spec: EntrySpec) -> Self { + let EntrySpec { + match_expr, + name_transforms, + content_transform, + } = entry_spec; let matcher = match_expr.map(|expr| Matcher::new(err.clone(), expr)); let name_transformers: Vec<_> = name_transforms .into_iter() .map(|trans| Transformer::new(err.clone(), trans)) .collect(); - let content = ContentTransformer::new(err.clone(), content); + let content = ContentTransformer::new(err.clone(), content_transform); Self { err, matcher, @@ -421,6 +421,17 @@ trait IterateEntries { fn next_entry(&mut self) -> Result, CommandError>; } +fn make_entry_iterator<'a>( + err: Rc>, + input_type: InputType, +) -> Result, CommandError> { + let ret: Box = match input_type { + InputType::StreamingStdin => Box::new(StdinInput::new(err)), + InputType::ZipPaths(zip_paths) => Box::new(AllInputZips::new(err, zip_paths)?), + }; + Ok(ret) +} + struct StdinInput { err: Rc>, inner: io::Stdin, @@ -482,7 +493,8 @@ impl IterateEntries for ZipFileInput { struct AllInputZips { err: Rc>, - zips_todo: Vec>, + zips_todo: VecDeque>, + cur_zip: UnsafeCell>, } impl AllInputZips { @@ -490,7 +502,7 @@ impl AllInputZips { err: Rc>, zip_paths: impl IntoIterator>, ) -> Result { - let zips_todo = zip_paths + let mut zips_todo = zip_paths .into_iter() .map(|p| { fs::File::open(p.as_ref()) @@ -504,19 +516,54 @@ impl AllInputZips { }) .map(|archive| ZipFileInput::new(Rc::clone(&err), archive)) }) - .collect::, 
CommandError>>()?; - Ok(Self { err, zips_todo }) + .collect::, CommandError>>()?; + debug_assert!(!zips_todo.is_empty()); + let cur_zip = zips_todo.pop_front().unwrap(); + Ok(Self { + err, + zips_todo, + cur_zip: UnsafeCell::new(cur_zip), + }) } +} - pub fn iter_zips(self) -> impl IntoIterator> { - self.zips_todo.into_iter() +impl IterateEntries for AllInputZips { + fn next_entry(&mut self) -> Result, CommandError> { + loop { + if let Some(entry) = unsafe { &mut *self.cur_zip.get() }.next_entry()? { + return Ok(Some(entry)); + } + match self.zips_todo.pop_front() { + Some(zip) => { + self.cur_zip = UnsafeCell::new(zip); + } + None => { + return Ok(None); + } + } + } } } -pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), CommandError> { - writeln!(err, "asdf!").unwrap(); - - dbg!(extract); +pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { + dbg!(&extract); + let Extract { + output, + entry_specs, + input, + } = extract; + let err = Rc::new(RefCell::new(err)); + + let entry_receiver = make_entry_receiver(err.clone(), output)?; + let entry_spec_transformers: Vec> = if entry_specs.is_empty() { + vec![EntrySpecTransformer::empty(err.clone())] + } else { + entry_specs + .into_iter() + .map(|entry_spec| EntrySpecTransformer::new(err.clone(), entry_spec)) + .collect() + }; + let entry_iterator = make_entry_iterator(err.clone(), input)?; Ok(()) } From 9713c6cd9b030d581a373ad08ee7c147c899d129 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 13:30:57 -0400 Subject: [PATCH 08/68] impl transform_name --- cli/src/args.rs | 2 +- cli/src/extract.rs | 111 +++++++++++++++++++++++++++++---------------- 2 files changed, 74 insertions(+), 39 deletions(-) diff --git a/cli/src/args.rs b/cli/src/args.rs index 7adf992c4..b4d8a61be 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -678,7 +678,7 @@ pub mod extract { use std::{collections::VecDeque, 
ffi::OsString, mem, path::PathBuf}; - #[derive(Debug)] + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ContentTransform { Extract, /* FIXME: not yet supported -- could be done by exposing ZipFile::take_raw_reader(), but diff --git a/cli/src/extract.rs b/cli/src/extract.rs index daadefedc..269af28cf 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -345,44 +345,11 @@ where } } -enum EntryContent<'a> { - Decompressed(ZipFile<'a>), - /* See ContentTransform::Raw -- need to refactor this file to avoid the need to convert - * a ZipFile into a Raw after it's already constructed. */ - #[allow(dead_code)] - Raw(ZipFile<'a>), - LogToStderr(ZipFile<'a>), -} - -struct ContentTransformer { - err: Rc>, - arg: ContentTransform, -} - -impl ContentTransformer { - pub fn new(err: Rc>, arg: ContentTransform) -> Self { - Self { err, arg } - } -} - -impl ContentTransformer -where - W: Write, -{ - pub fn transform_matched_entry<'a>(&self, entry: ZipFile<'a>) -> EntryContent<'a> { - match self.arg { - ContentTransform::Extract => EntryContent::Decompressed(entry), - ContentTransform::Raw => unreachable!("this has not been implemented"), - ContentTransform::LogToStderr => EntryContent::LogToStderr(entry), - } - } -} - struct EntrySpecTransformer { err: Rc>, matcher: Option>, name_transformers: Vec>, - content: ContentTransformer, + content_transform: ContentTransform, } impl EntrySpecTransformer { @@ -397,24 +364,92 @@ impl EntrySpecTransformer { .into_iter() .map(|trans| Transformer::new(err.clone(), trans)) .collect(); - let content = ContentTransformer::new(err.clone(), content_transform); Self { err, matcher, name_transformers, - content, + content_transform, } } pub fn empty(err: Rc>) -> Self { - let content = ContentTransformer::new(err.clone(), ContentTransform::Extract); Self { err, matcher: None, name_transformers: Vec::new(), - content, + content_transform: ContentTransform::Extract, + } + } +} + +impl EntrySpecTransformer +where + W: 
Write, +{ + pub fn matches(&self, entry: &ZipFile) -> Result { + match &self.matcher { + None => Ok(true), + Some(matcher) => matcher.evaluate(entry), } } + + /// Transform the name from the zip entry, maintaining a few invariants: + /// 1. If the transformations all return substrings (no prefixing, non-empty replacements, or + /// empty replacements that lead to non-contiguous input chunks), return a slice of the + /// original input, pointing back to the ZipFile's memory location with associated lifetime. + /// 2. If some intermediate transformation requires an allocation (e.g. adding a prefix), do + /// not perform intermediate reallocations for subsequent substring-only transformations. + /// - TODO: The returned string may be reallocated from the initial allocation exactly once + /// at the end, if substring-only transformations reduced its length. This is because Cow + /// can only describe a substring of the original input or an entirely new allocated + /// string, as opposed to a more general sort of string view wrapper. + pub fn transform_name<'a>(&self, entry: &'a ZipFile<'a>) -> Result, CommandError> { + let mut original_name: &'a str = entry.name(); + let mut newly_allocated_name: Option = None; + let mut newly_allocated_str: Option<&str> = None; + for transformer in self.name_transformers.iter() { + match newly_allocated_str { + Some(s) => match transformer.evaluate(s)? { + Cow::Borrowed(t) => { + let _ = newly_allocated_str.replace(t); + } + Cow::Owned(t) => { + assert!(newly_allocated_name.replace(t).is_some()); + newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); + } + }, + None => match transformer.evaluate(original_name)? 
{ + Cow::Borrowed(t) => { + original_name = t; + } + Cow::Owned(t) => { + assert!(newly_allocated_name.replace(t).is_none()); + newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); + } + }, + } + } + let ret = if newly_allocated_name.is_none() { + /* If we have never allocated anything new, just return the substring of the original + * name! */ + Cow::Borrowed(original_name) + } else { + let subref = newly_allocated_str.unwrap(); + /* If the active substring is the same length as the backing string, assume it's + * unchanged, so we can return the backing string without reallocating. */ + if subref.len() == newly_allocated_name.as_ref().unwrap().len() { + Cow::Owned(newly_allocated_name.unwrap()) + } else { + let reallocated_string = subref.to_string(); + Cow::Owned(reallocated_string) + } + }; + Ok(ret) + } + + pub fn content_transform(&self) -> &ContentTransform { + &self.content_transform + } } trait IterateEntries { From 7dcf7a567a7ab3103ba81dc49c35b99a89eb8c40 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:30:43 -0400 Subject: [PATCH 09/68] initial extract impl --- cli/src/args.rs | 17 ++++-- cli/src/extract.rs | 129 ++++++++++++++++++++++++++++++++++----------- cli/src/lib.rs | 3 +- 3 files changed, 115 insertions(+), 34 deletions(-) diff --git a/cli/src/args.rs b/cli/src/args.rs index b4d8a61be..ad6909e04 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -1,4 +1,4 @@ -use std::{collections::VecDeque, ffi::OsString, sync::OnceLock}; +use std::{collections::VecDeque, ffi::OsString, fmt, sync::OnceLock}; #[derive(Debug)] pub enum ArgParseError { @@ -150,7 +150,7 @@ pub enum ZipCommand { Extract(extract::Extract), } -pub trait CommandFormat { +pub trait CommandFormat: fmt::Debug { const COMMAND_NAME: &'static str; const COMMAND_TABS: &'static str; const COMMAND_DESCRIPTION: &'static str; @@ -682,7 +682,9 @@ pub mod extract { pub enum ContentTransform { 
Extract, /* FIXME: not yet supported -- could be done by exposing ZipFile::take_raw_reader(), but - * should probably just refactor extract.rs to avoid the need for that. */ + * should probably just refactor extract.rs to avoid the need for that. + * NB: actually, we can't do that while supporting streaming archives unless we expose + * take_raw_reader()! */ Raw, LogToStderr, } @@ -1599,6 +1601,8 @@ These results are dependent on the entry data: the string argument is interpreted into a string matching predicate against the entry name. + TODO: this flag is not yet supported and will produce an error. + ## Name transforms (name-transform): @@ -1633,6 +1637,8 @@ Complex: These transformers perform complex pattern matching and replacement upon the entry name string: +TODO: these flags are not yet supported and will produce an error. + --transform[=][:] Extract the portion of the entry name corresponding to , search it against corresponding to , and then @@ -1656,6 +1662,9 @@ entry itself. entry may be matched more than once. In this case, the entry's content will be extracted more than once over the execution of this command. +TODO: multiple entry specs with content transforms that extract output more than once require entry +teeing, which is not yet supported, so will produce an error. + -x, --extract Decompress the entry's contents (if necessary) before writing it to the output. @@ -1664,6 +1673,8 @@ extracted more than once over the execution of this command. Do not decompress entry contents at all before writing its content to the output. + TODO: this flag is not yet supported and will produce an error. + --log-to-stderr Write the (possibly transformed) entry name to stderr, without reading its content at all. 
diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 269af28cf..b44b8842b 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -17,8 +17,12 @@ use zip::{ use crate::{args::extract::*, CommandError, WrapCommandErr}; trait EntryReceiver { - fn receive_entry(&self, entry: ZipFile, name: &str) -> Result<(), CommandError>; - fn finalize_entries(&self) -> Result<(), CommandError>; + fn receive_entry<'a>( + &mut self, + entry: &mut ZipFile<'a>, + name: &str, + ) -> Result<(), CommandError>; + fn finalize_entries(&mut self) -> Result<(), CommandError>; } fn make_entry_receiver<'a>( @@ -63,7 +67,11 @@ impl EntryReceiver for StdoutReceiver where W: Write, { - fn receive_entry(&self, mut entry: ZipFile, name: &str) -> Result<(), CommandError> { + fn receive_entry<'a>( + &mut self, + entry: &mut ZipFile<'a>, + name: &str, + ) -> Result<(), CommandError> { let mut err = self.err.borrow_mut(); writeln!(err, "receiving entry {} with name {name}", entry.name()).unwrap(); if entry.is_dir() { @@ -71,13 +79,13 @@ where } else if entry.is_symlink() { writeln!(err, "entry is symlink, ignoring").unwrap(); } else { - io::copy(&mut entry, &mut self.stdout.lock()) + io::copy(entry, &mut self.stdout) .wrap_err_with(|| format!("failed to write entry {name} to stdout"))?; } Ok(()) } - fn finalize_entries(&self) -> Result<(), CommandError> { + fn finalize_entries(&mut self) -> Result<(), CommandError> { Ok(()) } } @@ -86,7 +94,7 @@ struct FilesystemReceiver { err: Rc>, output_dir: PathBuf, #[cfg(unix)] - perms_to_set: RefCell>, + perms_to_set: Vec<(PathBuf, u32)>, } impl FilesystemReceiver { @@ -95,7 +103,7 @@ impl FilesystemReceiver { err, output_dir, #[cfg(unix)] - perms_to_set: RefCell::new(Vec::new()), + perms_to_set: Vec::new(), } } } @@ -104,7 +112,11 @@ impl EntryReceiver for FilesystemReceiver where W: Write, { - fn receive_entry(&self, mut entry: ZipFile, name: &str) -> Result<(), CommandError> { + fn receive_entry<'a>( + &mut self, + entry: &mut ZipFile<'a>, + name: 
&str, + ) -> Result<(), CommandError> { let mut err = self.err.borrow_mut(); let full_output_path = self.output_dir.join(name); writeln!( @@ -121,9 +133,7 @@ where "storing unix mode {mode} for path {full_output_path:?}" ) .unwrap(); - self.perms_to_set - .borrow_mut() - .push((full_output_path.clone(), mode)); + self.perms_to_set.push((full_output_path.clone(), mode)); } if entry.is_dir() { @@ -170,7 +180,7 @@ where } let mut outfile = fs::File::create(&full_output_path) .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; - io::copy(&mut entry, &mut outfile).wrap_err_with(|| { + io::copy(entry, &mut outfile).wrap_err_with(|| { format!( "failed to copy file contents from {} to {full_output_path:?}", entry.name() @@ -180,12 +190,12 @@ where Ok(()) } - fn finalize_entries(&self) -> Result<(), CommandError> { + fn finalize_entries(&mut self) -> Result<(), CommandError> { #[cfg(unix)] { use std::{cmp::Reverse, os::unix::fs::PermissionsExt}; - let mut perms_to_set = mem::take(&mut *self.perms_to_set.borrow_mut()); + let mut perms_to_set = mem::take(&mut self.perms_to_set); perms_to_set.sort_unstable_by_key(|(path, _)| Reverse(path.clone())); for (path, mode) in perms_to_set.into_iter() { let perms = fs::Permissions::from_mode(mode); @@ -403,8 +413,8 @@ where /// at the end, if substring-only transformations reduced its length. This is because Cow /// can only describe a substring of the original input or an entirely new allocated /// string, as opposed to a more general sort of string view wrapper. 
- pub fn transform_name<'a>(&self, entry: &'a ZipFile<'a>) -> Result, CommandError> { - let mut original_name: &'a str = entry.name(); + pub fn transform_name(&self, entry: &ZipFile) -> Result { + let mut original_name: &str = entry.name(); let mut newly_allocated_name: Option = None; let mut newly_allocated_str: Option<&str> = None; for transformer in self.name_transformers.iter() { @@ -432,16 +442,16 @@ where let ret = if newly_allocated_name.is_none() { /* If we have never allocated anything new, just return the substring of the original * name! */ - Cow::Borrowed(original_name) + original_name.to_string() } else { let subref = newly_allocated_str.unwrap(); /* If the active substring is the same length as the backing string, assume it's * unchanged, so we can return the backing string without reallocating. */ if subref.len() == newly_allocated_name.as_ref().unwrap().len() { - Cow::Owned(newly_allocated_name.unwrap()) + newly_allocated_name.unwrap() } else { let reallocated_string = subref.to_string(); - Cow::Owned(reallocated_string) + reallocated_string } }; Ok(ret) @@ -580,8 +590,48 @@ impl IterateEntries for AllInputZips { } } +fn process_entry_specs( + err: Rc>, + entry_specs: impl IntoIterator, +) -> Result>, CommandError> +where + W: Write, +{ + let entry_spec_transformers: Vec> = entry_specs + .into_iter() + .map(|spec| EntrySpecTransformer::new(err.clone(), spec)) + .collect(); + if entry_spec_transformers.is_empty() { + return Ok(vec![EntrySpecTransformer::empty(err.clone())]); + }; + + /* Perform some validation on the transforms since we don't currently support everything we + * want to. */ + if entry_spec_transformers + .iter() + .any(|t| *t.content_transform() == ContentTransform::Raw) + { + /* TODO: this can be solved if we can convert a ZipFile into a Raw reader! 
*/ + return Err(CommandError::InvalidArg( + "--raw extraction output is not yet supported".to_string(), + )); + } + if entry_spec_transformers + .iter() + .filter(|t| *t.content_transform() != ContentTransform::LogToStderr) + .count() + > 1 + { + /* TODO: this can be solved by separating data from entries! */ + return Err(CommandError::InvalidArg( + "more than one entry spec using a content transform which reads content (i.e. was not --log-to-stderr) was provided; this requires teeing entry contents which is not yet supported".to_string(), + )); + } + + Ok(entry_spec_transformers) +} + pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { - dbg!(&extract); let Extract { output, entry_specs, @@ -589,16 +639,35 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE } = extract; let err = Rc::new(RefCell::new(err)); - let entry_receiver = make_entry_receiver(err.clone(), output)?; - let entry_spec_transformers: Vec> = if entry_specs.is_empty() { - vec![EntrySpecTransformer::empty(err.clone())] - } else { - entry_specs - .into_iter() - .map(|entry_spec| EntrySpecTransformer::new(err.clone(), entry_spec)) - .collect() - }; - let entry_iterator = make_entry_iterator(err.clone(), input)?; + let mut entry_receiver = make_entry_receiver(err.clone(), output)?; + let entry_spec_transformers = process_entry_specs(err.clone(), entry_specs)?; + let mut stderr_log_output = io::stderr(); + let mut entry_iterator = make_entry_iterator(err.clone(), input)?; + + while let Some(mut entry) = entry_iterator.next_entry()? { + for transformer in entry_spec_transformers.iter() { + if !transformer.matches(&entry)? 
{ + continue; + } + let name: String = transformer.transform_name(&entry)?; + match transformer.content_transform() { + ContentTransform::Raw => unreachable!(), + ContentTransform::LogToStderr => { + writeln!( + &mut stderr_log_output, + "log to stderr: entry with original name {} and transformed name {}, compression method {}, uncompressed size {}", + entry.name(), name, entry.compression(), entry.size() + ) + .unwrap(); + continue; + } + ContentTransform::Extract => { + entry_receiver.receive_entry(&mut entry, &name)?; + } + } + } + } + entry_receiver.finalize_entries()?; Ok(()) } diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 73d1c1397..e3b526e61 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -108,10 +108,11 @@ pub mod driver { pub trait ExecuteCommand: CommandFormat { fn execute(self, err: impl Write) -> Result<(), CommandError>; - fn do_main(self, err: impl Write) -> ! + fn do_main(self, mut err: impl Write) -> ! where Self: Sized, { + writeln!(&mut err, "{} args: {:?}", Self::COMMAND_NAME, &self).unwrap(); match self.execute(err) { Ok(()) => process::exit(ZipCli::NON_FAILURE_EXIT_CODE), Err(e) => match e { From 095167c4a03c966e4d433e233524373ce5eb75ff Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 15:29:01 -0400 Subject: [PATCH 10/68] add name matchers --- cli/Cargo.toml | 3 ++ cli/src/args.rs | 24 ++++++++--- cli/src/extract.rs | 104 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index d787a5048..fb880e115 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -24,6 +24,9 @@ members = ["."] name = "zip-cli" [dependencies] +# TODO: make these optional deps? +glob = "0.3" +regex = "1" [dependencies.zip] path = ".." 
diff --git a/cli/src/args.rs b/cli/src/args.rs index ad6909e04..9fa8706dd 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -689,7 +689,7 @@ pub mod extract { LogToStderr, } - #[derive(Debug, Default, PartialEq, Eq)] + #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] pub enum ComponentSelector { #[default] Path, @@ -710,7 +710,7 @@ pub mod extract { } } - #[derive(Debug, Default, PartialEq, Eq)] + #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] pub enum PatternSelectorType { #[default] Glob, @@ -743,10 +743,15 @@ pub mod extract { } } + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] + pub struct PatternModifiers { + pub case_insensitive: bool, + } + #[derive(Debug, Default)] pub struct PatternSelector { pub pat_sel: PatternSelectorType, - pub modifiers: Vec, + pub modifiers: PatternModifiers, } impl PatternSelector { @@ -757,17 +762,26 @@ pub mod extract { let modifiers_str = &s[(modifiers_ind + 1)..]; let pat_sel = PatternSelectorType::parse(pat_sel_str)?; - let modifiers = modifiers_str + + let mut modifiers = PatternModifiers::default(); + let mod_els = modifiers_str .split(|c| *c == b':') .map(PatternSelectorModifier::parse) .collect::>>()?; + for m in mod_els.into_iter() { + match m { + PatternSelectorModifier::CaseInsensitive => { + modifiers.case_insensitive = true; + } + } + } Some(Self { pat_sel, modifiers }) } None => { let pat_sel = PatternSelectorType::parse(s)?; Some(Self { pat_sel, - modifiers: Vec::new(), + modifiers: Default::default(), }) } } diff --git a/cli/src/extract.rs b/cli/src/extract.rs index b44b8842b..b7c8e2840 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -9,6 +9,9 @@ use std::{ rc::Rc, }; +use glob; +use regex; + use zip::{ read::{read_zipfile_from_stream, ZipFile}, ZipArchive, @@ -207,6 +210,107 @@ where } } +fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { + let path = Path::new(name); + 
match sel { + ComponentSelector::Path => Some(name), + ComponentSelector::Basename => path.file_name().map(|bname| bname.to_str().unwrap()), + ComponentSelector::Dirname => path + .parent() + .map(|p| p.to_str().unwrap()) + /* "a".parent() becomes Some(""), which we want to treat as no parent */ + .filter(|s| !s.is_empty()), + ComponentSelector::FileExtension => path.extension().map(|ext| ext.to_str().unwrap()), + } +} + +trait NameMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized; + fn matches(&self, input: &str) -> bool; +} + +struct LiteralMatcher { + lit: String, + case_insensitive: bool, +} + +impl NameMatcher for LiteralMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized, + { + let PatternModifiers { case_insensitive } = opts; + Ok(Self { + lit: pattern.to_string(), + case_insensitive, + }) + } + + fn matches(&self, input: &str) -> bool { + if self.case_insensitive { + self.lit.eq_ignore_ascii_case(input) + } else { + input == &self.lit + } + } +} + +struct GlobMatcher { + pat: glob::Pattern, + glob_opts: glob::MatchOptions, +} + +impl NameMatcher for GlobMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized, + { + let PatternModifiers { case_insensitive } = opts; + let glob_opts = glob::MatchOptions { + case_sensitive: !case_insensitive, + ..Default::default() + }; + let pat = glob::Pattern::new(pattern).map_err(|e| { + CommandError::InvalidArg(format!( + "failed to construct glob matcher from pattern {pattern:?}: {e}" + )) + })?; + Ok(Self { pat, glob_opts }) + } + + fn matches(&self, input: &str) -> bool { + self.pat.matches_with(input, self.glob_opts) + } +} + +struct RegexMatcher { + pat: regex::Regex, +} + +impl NameMatcher for RegexMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized, + { + let PatternModifiers { case_insensitive } = opts; + let pat = regex::RegexBuilder::new(pattern) + 
.case_insensitive(case_insensitive) + .build() + .map_err(|e| { + CommandError::InvalidArg(format!( + "failed to construct regex matcher from pattern {pattern:?}: {e}" + )) + })?; + Ok(Self { pat }) + } + + fn matches(&self, input: &str) -> bool { + self.pat.is_match(input) + } +} + struct Matcher { err: Rc>, expr: MatchExpression, From b397d6a9f2fd45234fe0cf115ada667b14ce2e91 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 16:43:02 -0400 Subject: [PATCH 11/68] impl --match --- cli/src/extract.rs | 356 +++++++++++++++++++++++++++++++++------------ 1 file changed, 262 insertions(+), 94 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index b7c8e2840..70915a810 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -14,7 +14,7 @@ use regex; use zip::{ read::{read_zipfile_from_stream, ZipFile}, - ZipArchive, + CompressionMethod, ZipArchive, }; use crate::{args::extract::*, CommandError, WrapCommandErr}; @@ -210,6 +210,7 @@ where } } +#[inline(always)] fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { let path = Path::new(name); match sel { @@ -311,83 +312,249 @@ impl NameMatcher for RegexMatcher { } } -struct Matcher { - err: Rc>, - expr: MatchExpression, +trait EntryMatcher { + type Arg + where + Self: Sized; + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized; + fn matches(&self, entry: &ZipFile) -> bool; +} + +#[derive(Copy, Clone)] +enum TrivialMatcher { + True, + False, } -impl Matcher { - pub fn new(err: Rc>, expr: MatchExpression) -> Self { - Self { err, expr } +impl EntryMatcher for TrivialMatcher { + type Arg = TrivialPredicate where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + TrivialPredicate::True => Self::True, + TrivialPredicate::False => Self::False, + }) + } + + fn matches(&self, _entry: &ZipFile) -> bool { + match self { + Self::True => true, + 
Self::False => false, + } } } -impl Matcher -where - W: Write, -{ - pub fn evaluate(&self, entry: &ZipFile) -> Result { - let Self { err, expr } = self; - Self::recursive_match(err, &expr, entry) - } - - fn recursive_match( - err: &RefCell, - expr: &MatchExpression, - entry: &ZipFile, - ) -> Result { - match expr { - MatchExpression::PrimitivePredicate(predicate) => match predicate { - Predicate::Trivial(trivial) => match trivial { - TrivialPredicate::True => Ok(true), - TrivialPredicate::False => Ok(false), - }, - Predicate::EntryType(entry_type) => match entry_type { - EntryType::File => Ok(!entry.is_dir() && !entry.is_symlink()), - EntryType::Dir => Ok(entry.is_dir()), - EntryType::Symlink => Ok(entry.is_symlink()), - }, - Predicate::CompressionMethod(method_arg) => match method_arg { - CompressionMethodArg::NonSpecific(nonspecific_arg) => match nonspecific_arg { - NonSpecificCompressionMethodArg::Any => Ok(true), - NonSpecificCompressionMethodArg::Known => { - Ok(SpecificCompressionMethodArg::KNOWN_COMPRESSION_METHODS - .contains(&entry.compression())) - } - }, - CompressionMethodArg::Specific(specific_arg) => { - Ok(specific_arg.translate_to_zip() == entry.compression()) - } - }, - Predicate::DepthLimit(limit_arg) => match limit_arg { - DepthLimitArg::Max(max) => { - let max: usize = (*max).into(); - Ok(entry.name().split('/').count() <= max) - } - DepthLimitArg::Min(min) => { - let min: usize = (*min).into(); - Ok(entry.name().split('/').count() >= min) - } - }, - Predicate::Match(match_arg) => todo!("{match_arg:?}"), +#[derive(Copy, Clone)] +enum EntryTypeMatcher { + File, + Dir, + Symlink, +} + +impl EntryMatcher for EntryTypeMatcher { + type Arg = EntryType where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + EntryType::File => Self::File, + EntryType::Dir => Self::Dir, + EntryType::Symlink => Self::Symlink, + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match self { + Self::File => 
!entry.is_dir() && !entry.is_symlink(), + Self::Dir => entry.is_dir(), + Self::Symlink => entry.is_symlink(), + } + } +} + +#[derive(Copy, Clone)] +enum NonSpecificMethods { + Any, + Known, +} + +impl EntryMatcher for NonSpecificMethods { + type Arg = NonSpecificCompressionMethodArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + NonSpecificCompressionMethodArg::Any => Self::Any, + NonSpecificCompressionMethodArg::Known => Self::Known, + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match self { + Self::Any => true, + Self::Known => SpecificCompressionMethodArg::KNOWN_COMPRESSION_METHODS + .contains(&entry.compression()), + } + } +} + +struct SpecificMethods { + specific_method: CompressionMethod, +} + +impl EntryMatcher for SpecificMethods { + type Arg = SpecificCompressionMethodArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(Self { + specific_method: arg.translate_to_zip(), + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + self.specific_method == entry.compression() + } +} + +#[derive(Copy, Clone)] +enum DepthLimit { + Max(usize), + Min(usize), +} + +impl EntryMatcher for DepthLimit { + type Arg = DepthLimitArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + DepthLimitArg::Max(max) => Self::Max(max.into()), + DepthLimitArg::Min(min) => Self::Min(min.into()), + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + let num_components = entry.name().split('/').count(); + match self { + Self::Max(max) => num_components <= *max, + Self::Min(min) => num_components >= *min, + } + } +} + +struct PatternMatcher { + matcher: Box, + comp_sel: ComponentSelector, +} + +impl EntryMatcher for PatternMatcher { + type Arg = MatchArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + let MatchArg { + comp_sel, + pat_sel: PatternSelector { pat_sel, 
modifiers }, + pattern, + } = arg; + + let matcher: Box = match pat_sel { + PatternSelectorType::Glob => Box::new(GlobMatcher::create(&pattern, modifiers)?), + PatternSelectorType::Literal => Box::new(LiteralMatcher::create(&pattern, modifiers)?), + PatternSelectorType::Regexp => Box::new(RegexMatcher::create(&pattern, modifiers)?), + }; + + Ok(Self { matcher, comp_sel }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match process_component_selector(self.comp_sel, entry.name()) { + None => false, + Some(s) => self.matcher.matches(s), + } + } +} + +enum WrappedMatcher { + Primitive(Box), + Negated(Box), + And { + left: Box, + right: Box, + }, + Or { + left: Box, + right: Box, + }, +} + +impl WrappedMatcher { + fn create_primitive(arg: Predicate) -> Result { + Ok(Self::Primitive(match arg { + Predicate::Trivial(arg) => Box::new(TrivialMatcher::from_arg(arg)?), + Predicate::EntryType(arg) => Box::new(EntryTypeMatcher::from_arg(arg)?), + Predicate::CompressionMethod(method_arg) => match method_arg { + CompressionMethodArg::NonSpecific(arg) => { + Box::new(NonSpecificMethods::from_arg(arg)?) + } + CompressionMethodArg::Specific(arg) => Box::new(SpecificMethods::from_arg(arg)?), }, - MatchExpression::Negated(inner) => { - Self::recursive_match(err, inner.as_ref(), entry).map(|result| !result) - } + Predicate::DepthLimit(arg) => Box::new(DepthLimit::from_arg(arg)?), + Predicate::Match(arg) => Box::new(PatternMatcher::from_arg(arg)?), + })) + } +} + +impl EntryMatcher for WrappedMatcher { + type Arg = MatchExpression where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + MatchExpression::PrimitivePredicate(pred) => Self::create_primitive(pred)?, + MatchExpression::Negated(arg) => Self::Negated(Box::new(Self::from_arg(*arg)?)), MatchExpression::And { explicit: _, left, right, } => { - /* Short-circuiting, so do left first. */ - Ok(Self::recursive_match(err, left.as_ref(), entry)? 
- && Self::recursive_match(err, right.as_ref(), entry)?) + let left = Box::new(Self::from_arg(*left)?); + let right = Box::new(Self::from_arg(*right)?); + Self::And { left, right } } MatchExpression::Or { left, right } => { - Ok(Self::recursive_match(err, left.as_ref(), entry)? - || Self::recursive_match(err, right.as_ref(), entry)?) + let left = Box::new(Self::from_arg(*left)?); + let right = Box::new(Self::from_arg(*right)?); + Self::Or { left, right } } - MatchExpression::Grouped(inner) => Self::recursive_match(err, inner.as_ref(), entry), + MatchExpression::Grouped(inner) => Self::from_arg(*inner)?, + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match self { + Self::Primitive(m) => m.matches(entry), + Self::Negated(m) => !m.matches(entry), + Self::And { left, right } => left.matches(entry) && right.matches(entry), + Self::Or { left, right } => left.matches(entry) || right.matches(entry), } } } @@ -407,18 +574,18 @@ impl Transformer where W: Write, { - pub fn evaluate<'s>(&self, name: &'s str) -> Result, CommandError> { + pub fn evaluate<'s>(&self, name: &'s str) -> Cow<'s, str> { match &self.trans { - NameTransform::Trivial(TrivialTransform::Identity) => Ok(Cow::Borrowed(name)), + NameTransform::Trivial(TrivialTransform::Identity) => Cow::Borrowed(name), NameTransform::Basic(basic_trans) => match basic_trans { BasicTransform::StripComponents(num_components_to_strip) => { /* If no directory components, then nothing to strip. */ if !name.contains('/') { - return Ok(Cow::Borrowed(name)); + return Cow::Borrowed(name); } /* We allow stripping 0 components, which does nothing. */ if *num_components_to_strip == 0 { - return Ok(Cow::Borrowed(name)); + return Cow::Borrowed(name); } /* Pop off prefix components until only one is left or we have stripped all the * requested prefix components. 
*/ @@ -435,16 +602,14 @@ where debug_assert!(separator_indices.len() > 0); let leftmost_remaining_separator_index: usize = separator_indices.pop_front().unwrap(); - Ok(Cow::Borrowed( - &name[(leftmost_remaining_separator_index + 1)..], - )) + Cow::Borrowed(&name[(leftmost_remaining_separator_index + 1)..]) } BasicTransform::AddPrefix(prefix_to_add) => { /* We allow an empty prefix, which means to do nothing. */ if prefix_to_add.is_empty() { - return Ok(Cow::Borrowed(name)); + return Cow::Borrowed(name); } - Ok(Cow::Owned(format!("{}/{}", prefix_to_add, name))) + Cow::Owned(format!("{}/{}", prefix_to_add, name)) } }, NameTransform::Complex(complex_trans) => match complex_trans { @@ -461,29 +626,32 @@ where struct EntrySpecTransformer { err: Rc>, - matcher: Option>, + matcher: Option, name_transformers: Vec>, content_transform: ContentTransform, } impl EntrySpecTransformer { - pub fn new(err: Rc>, entry_spec: EntrySpec) -> Self { + pub fn new(err: Rc>, entry_spec: EntrySpec) -> Result { let EntrySpec { match_expr, name_transforms, content_transform, } = entry_spec; - let matcher = match_expr.map(|expr| Matcher::new(err.clone(), expr)); + let matcher = match match_expr { + None => None, + Some(expr) => Some(WrappedMatcher::from_arg(expr)?), + }; let name_transformers: Vec<_> = name_transforms .into_iter() .map(|trans| Transformer::new(err.clone(), trans)) .collect(); - Self { + Ok(Self { err, matcher, name_transformers, content_transform, - } + }) } pub fn empty(err: Rc>) -> Self { @@ -500,10 +668,10 @@ impl EntrySpecTransformer where W: Write, { - pub fn matches(&self, entry: &ZipFile) -> Result { + pub fn matches(&self, entry: &ZipFile) -> bool { match &self.matcher { - None => Ok(true), - Some(matcher) => matcher.evaluate(entry), + None => true, + Some(matcher) => matcher.matches(entry), } } @@ -517,13 +685,12 @@ where /// at the end, if substring-only transformations reduced its length. 
This is because Cow /// can only describe a substring of the original input or an entirely new allocated /// string, as opposed to a more general sort of string view wrapper. - pub fn transform_name(&self, entry: &ZipFile) -> Result { - let mut original_name: &str = entry.name(); + pub fn transform_name<'s>(&self, mut original_name: &'s str) -> Cow<'s, str> { let mut newly_allocated_name: Option = None; let mut newly_allocated_str: Option<&str> = None; for transformer in self.name_transformers.iter() { match newly_allocated_str { - Some(s) => match transformer.evaluate(s)? { + Some(s) => match transformer.evaluate(s) { Cow::Borrowed(t) => { let _ = newly_allocated_str.replace(t); } @@ -532,7 +699,7 @@ where newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); } }, - None => match transformer.evaluate(original_name)? { + None => match transformer.evaluate(original_name) { Cow::Borrowed(t) => { original_name = t; } @@ -543,22 +710,22 @@ where }, } } - let ret = if newly_allocated_name.is_none() { + + if newly_allocated_name.is_none() { /* If we have never allocated anything new, just return the substring of the original * name! */ - original_name.to_string() + Cow::Borrowed(original_name) } else { let subref = newly_allocated_str.unwrap(); /* If the active substring is the same length as the backing string, assume it's * unchanged, so we can return the backing string without reallocating. 
*/ if subref.len() == newly_allocated_name.as_ref().unwrap().len() { - newly_allocated_name.unwrap() + Cow::Owned(newly_allocated_name.unwrap()) } else { let reallocated_string = subref.to_string(); - reallocated_string + Cow::Owned(reallocated_string) } - }; - Ok(ret) + } } pub fn content_transform(&self) -> &ContentTransform { @@ -704,7 +871,7 @@ where let entry_spec_transformers: Vec> = entry_specs .into_iter() .map(|spec| EntrySpecTransformer::new(err.clone(), spec)) - .collect(); + .collect::>()?; if entry_spec_transformers.is_empty() { return Ok(vec![EntrySpecTransformer::empty(err.clone())]); }; @@ -750,10 +917,10 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE while let Some(mut entry) = entry_iterator.next_entry()? { for transformer in entry_spec_transformers.iter() { - if !transformer.matches(&entry)? { + if !transformer.matches(&entry) { continue; } - let name: String = transformer.transform_name(&entry)?; + let name = transformer.transform_name(entry.name()); match transformer.content_transform() { ContentTransform::Raw => unreachable!(), ContentTransform::LogToStderr => { @@ -766,6 +933,7 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE continue; } ContentTransform::Extract => { + let name = name.into_owned(); entry_receiver.receive_entry(&mut entry, &name)?; } } From d36190720fbb6f69a671385b3729ca3e5ae6cd6f Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 16:51:41 -0400 Subject: [PATCH 12/68] modularize extract --- cli/src/extract.rs | 911 +---------------------------------- cli/src/extract/entries.rs | 132 +++++ cli/src/extract/matcher.rs | 360 ++++++++++++++ cli/src/extract/receiver.rs | 203 ++++++++ cli/src/extract/transform.rs | 208 ++++++++ 5 files changed, 913 insertions(+), 901 deletions(-) create mode 100644 cli/src/extract/entries.rs create mode 100644 cli/src/extract/matcher.rs create mode 100644 
cli/src/extract/receiver.rs create mode 100644 cli/src/extract/transform.rs diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 70915a810..446886b9f 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,906 +1,15 @@ use std::{ - borrow::Cow, - cell::{RefCell, UnsafeCell}, - collections::VecDeque, - env, fs, - io::{self, Read, Write}, - mem, - path::{Path, PathBuf}, + cell::RefCell, + io::{self, Write}, rc::Rc, }; -use glob; -use regex; +use crate::{args::extract::*, CommandError}; -use zip::{ - read::{read_zipfile_from_stream, ZipFile}, - CompressionMethod, ZipArchive, -}; - -use crate::{args::extract::*, CommandError, WrapCommandErr}; - -trait EntryReceiver { - fn receive_entry<'a>( - &mut self, - entry: &mut ZipFile<'a>, - name: &str, - ) -> Result<(), CommandError>; - fn finalize_entries(&mut self) -> Result<(), CommandError>; -} - -fn make_entry_receiver<'a>( - err: Rc>, - collation: OutputCollation, -) -> Result, CommandError> { - let ret: Box = match collation { - OutputCollation::ConcatenateStdout => Box::new(StdoutReceiver::new(err)), - OutputCollation::Filesystem { output_dir, mkdir } => { - let output_dir = match output_dir { - Some(dir) => { - if mkdir { - fs::create_dir_all(&dir).wrap_err_with(|| { - format!("failed to create output directory {dir:?}") - })?; - } - dir - } - None => env::current_dir().wrap_err("failed to get current dir")?, - }; - Box::new(FilesystemReceiver::new(err, output_dir)) - } - }; - Ok(ret) -} - -struct StdoutReceiver { - err: Rc>, - stdout: io::Stdout, -} - -impl StdoutReceiver { - pub fn new(err: Rc>) -> Self { - Self { - err, - stdout: io::stdout(), - } - } -} - -impl EntryReceiver for StdoutReceiver -where - W: Write, -{ - fn receive_entry<'a>( - &mut self, - entry: &mut ZipFile<'a>, - name: &str, - ) -> Result<(), CommandError> { - let mut err = self.err.borrow_mut(); - writeln!(err, "receiving entry {} with name {name}", entry.name()).unwrap(); - if entry.is_dir() { - writeln!(err, "entry is directory, 
ignoring").unwrap(); - } else if entry.is_symlink() { - writeln!(err, "entry is symlink, ignoring").unwrap(); - } else { - io::copy(entry, &mut self.stdout) - .wrap_err_with(|| format!("failed to write entry {name} to stdout"))?; - } - Ok(()) - } - - fn finalize_entries(&mut self) -> Result<(), CommandError> { - Ok(()) - } -} - -struct FilesystemReceiver { - err: Rc>, - output_dir: PathBuf, - #[cfg(unix)] - perms_to_set: Vec<(PathBuf, u32)>, -} - -impl FilesystemReceiver { - pub fn new(err: Rc>, output_dir: PathBuf) -> Self { - Self { - err, - output_dir, - #[cfg(unix)] - perms_to_set: Vec::new(), - } - } -} - -impl EntryReceiver for FilesystemReceiver -where - W: Write, -{ - fn receive_entry<'a>( - &mut self, - entry: &mut ZipFile<'a>, - name: &str, - ) -> Result<(), CommandError> { - let mut err = self.err.borrow_mut(); - let full_output_path = self.output_dir.join(name); - writeln!( - err, - "receiving entry {} with name {name} and writing to path {full_output_path:?}", - entry.name() - ) - .unwrap(); - - #[cfg(unix)] - if let Some(mode) = entry.unix_mode() { - writeln!( - err, - "storing unix mode {mode} for path {full_output_path:?}" - ) - .unwrap(); - self.perms_to_set.push((full_output_path.clone(), mode)); - } - - if entry.is_dir() { - writeln!(err, "entry is directory, creating").unwrap(); - fs::create_dir_all(&full_output_path).wrap_err_with(|| { - format!("failed to create directory entry at {full_output_path:?}") - })?; - } else if entry.is_symlink() { - let mut target: Vec = Vec::with_capacity(entry.size().try_into().unwrap()); - entry.read_to_end(&mut target).wrap_err_with(|| { - format!( - "failed to read symlink target from zip archive entry {}", - entry.name() - ) - })?; - - #[cfg(unix)] - { - use std::{ - ffi::OsString, - os::unix::{ffi::OsStringExt, fs::symlink}, - }; - let target = OsString::from_vec(target); - writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); - symlink(&target, &full_output_path).wrap_err_with(|| { - format!( 
- "failed to create symlink at {full_output_path:?} with target {target:?}" - ) - })?; - } - #[cfg(not(unix))] - { - /* FIXME: non-unix symlink extraction not yet supported! */ - todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); - } - } else { - writeln!(err, "entry is file, creating").unwrap(); - if let Some(containing_dir) = full_output_path.parent() { - fs::create_dir_all(containing_dir).wrap_err_with(|| { - format!("failed to create parent dirs for file at {full_output_path:?}") - })?; - } else { - writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); - } - let mut outfile = fs::File::create(&full_output_path) - .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; - io::copy(entry, &mut outfile).wrap_err_with(|| { - format!( - "failed to copy file contents from {} to {full_output_path:?}", - entry.name() - ) - })?; - } - Ok(()) - } - - fn finalize_entries(&mut self) -> Result<(), CommandError> { - #[cfg(unix)] - { - use std::{cmp::Reverse, os::unix::fs::PermissionsExt}; - - let mut perms_to_set = mem::take(&mut self.perms_to_set); - perms_to_set.sort_unstable_by_key(|(path, _)| Reverse(path.clone())); - for (path, mode) in perms_to_set.into_iter() { - let perms = fs::Permissions::from_mode(mode); - fs::set_permissions(&path, perms.clone()) - .wrap_err_with(|| format!("error setting perms {perms:?} for path {path:?}"))?; - } - } - Ok(()) - } -} - -#[inline(always)] -fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { - let path = Path::new(name); - match sel { - ComponentSelector::Path => Some(name), - ComponentSelector::Basename => path.file_name().map(|bname| bname.to_str().unwrap()), - ComponentSelector::Dirname => path - .parent() - .map(|p| p.to_str().unwrap()) - /* "a".parent() becomes Some(""), which we want to treat as no parent */ - .filter(|s| !s.is_empty()), - ComponentSelector::FileExtension => path.extension().map(|ext| ext.to_str().unwrap()), - } -} 
- -trait NameMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result - where - Self: Sized; - fn matches(&self, input: &str) -> bool; -} - -struct LiteralMatcher { - lit: String, - case_insensitive: bool, -} - -impl NameMatcher for LiteralMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result - where - Self: Sized, - { - let PatternModifiers { case_insensitive } = opts; - Ok(Self { - lit: pattern.to_string(), - case_insensitive, - }) - } - - fn matches(&self, input: &str) -> bool { - if self.case_insensitive { - self.lit.eq_ignore_ascii_case(input) - } else { - input == &self.lit - } - } -} - -struct GlobMatcher { - pat: glob::Pattern, - glob_opts: glob::MatchOptions, -} - -impl NameMatcher for GlobMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result - where - Self: Sized, - { - let PatternModifiers { case_insensitive } = opts; - let glob_opts = glob::MatchOptions { - case_sensitive: !case_insensitive, - ..Default::default() - }; - let pat = glob::Pattern::new(pattern).map_err(|e| { - CommandError::InvalidArg(format!( - "failed to construct glob matcher from pattern {pattern:?}: {e}" - )) - })?; - Ok(Self { pat, glob_opts }) - } - - fn matches(&self, input: &str) -> bool { - self.pat.matches_with(input, self.glob_opts) - } -} - -struct RegexMatcher { - pat: regex::Regex, -} - -impl NameMatcher for RegexMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result - where - Self: Sized, - { - let PatternModifiers { case_insensitive } = opts; - let pat = regex::RegexBuilder::new(pattern) - .case_insensitive(case_insensitive) - .build() - .map_err(|e| { - CommandError::InvalidArg(format!( - "failed to construct regex matcher from pattern {pattern:?}: {e}" - )) - })?; - Ok(Self { pat }) - } - - fn matches(&self, input: &str) -> bool { - self.pat.is_match(input) - } -} - -trait EntryMatcher { - type Arg - where - Self: Sized; - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized; - fn matches(&self, 
entry: &ZipFile) -> bool; -} - -#[derive(Copy, Clone)] -enum TrivialMatcher { - True, - False, -} - -impl EntryMatcher for TrivialMatcher { - type Arg = TrivialPredicate where Self: Sized; - - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized, - { - Ok(match arg { - TrivialPredicate::True => Self::True, - TrivialPredicate::False => Self::False, - }) - } - - fn matches(&self, _entry: &ZipFile) -> bool { - match self { - Self::True => true, - Self::False => false, - } - } -} - -#[derive(Copy, Clone)] -enum EntryTypeMatcher { - File, - Dir, - Symlink, -} - -impl EntryMatcher for EntryTypeMatcher { - type Arg = EntryType where Self: Sized; - - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized, - { - Ok(match arg { - EntryType::File => Self::File, - EntryType::Dir => Self::Dir, - EntryType::Symlink => Self::Symlink, - }) - } - - fn matches(&self, entry: &ZipFile) -> bool { - match self { - Self::File => !entry.is_dir() && !entry.is_symlink(), - Self::Dir => entry.is_dir(), - Self::Symlink => entry.is_symlink(), - } - } -} - -#[derive(Copy, Clone)] -enum NonSpecificMethods { - Any, - Known, -} - -impl EntryMatcher for NonSpecificMethods { - type Arg = NonSpecificCompressionMethodArg where Self: Sized; - - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized, - { - Ok(match arg { - NonSpecificCompressionMethodArg::Any => Self::Any, - NonSpecificCompressionMethodArg::Known => Self::Known, - }) - } - - fn matches(&self, entry: &ZipFile) -> bool { - match self { - Self::Any => true, - Self::Known => SpecificCompressionMethodArg::KNOWN_COMPRESSION_METHODS - .contains(&entry.compression()), - } - } -} - -struct SpecificMethods { - specific_method: CompressionMethod, -} - -impl EntryMatcher for SpecificMethods { - type Arg = SpecificCompressionMethodArg where Self: Sized; - - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized, - { - Ok(Self { - specific_method: arg.translate_to_zip(), - }) - } - - fn matches(&self, entry: &ZipFile) -> bool { 
- self.specific_method == entry.compression() - } -} - -#[derive(Copy, Clone)] -enum DepthLimit { - Max(usize), - Min(usize), -} - -impl EntryMatcher for DepthLimit { - type Arg = DepthLimitArg where Self: Sized; - - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized, - { - Ok(match arg { - DepthLimitArg::Max(max) => Self::Max(max.into()), - DepthLimitArg::Min(min) => Self::Min(min.into()), - }) - } - - fn matches(&self, entry: &ZipFile) -> bool { - let num_components = entry.name().split('/').count(); - match self { - Self::Max(max) => num_components <= *max, - Self::Min(min) => num_components >= *min, - } - } -} - -struct PatternMatcher { - matcher: Box, - comp_sel: ComponentSelector, -} - -impl EntryMatcher for PatternMatcher { - type Arg = MatchArg where Self: Sized; - - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized, - { - let MatchArg { - comp_sel, - pat_sel: PatternSelector { pat_sel, modifiers }, - pattern, - } = arg; - - let matcher: Box = match pat_sel { - PatternSelectorType::Glob => Box::new(GlobMatcher::create(&pattern, modifiers)?), - PatternSelectorType::Literal => Box::new(LiteralMatcher::create(&pattern, modifiers)?), - PatternSelectorType::Regexp => Box::new(RegexMatcher::create(&pattern, modifiers)?), - }; - - Ok(Self { matcher, comp_sel }) - } - - fn matches(&self, entry: &ZipFile) -> bool { - match process_component_selector(self.comp_sel, entry.name()) { - None => false, - Some(s) => self.matcher.matches(s), - } - } -} - -enum WrappedMatcher { - Primitive(Box), - Negated(Box), - And { - left: Box, - right: Box, - }, - Or { - left: Box, - right: Box, - }, -} - -impl WrappedMatcher { - fn create_primitive(arg: Predicate) -> Result { - Ok(Self::Primitive(match arg { - Predicate::Trivial(arg) => Box::new(TrivialMatcher::from_arg(arg)?), - Predicate::EntryType(arg) => Box::new(EntryTypeMatcher::from_arg(arg)?), - Predicate::CompressionMethod(method_arg) => match method_arg { - CompressionMethodArg::NonSpecific(arg) => { - 
Box::new(NonSpecificMethods::from_arg(arg)?) - } - CompressionMethodArg::Specific(arg) => Box::new(SpecificMethods::from_arg(arg)?), - }, - Predicate::DepthLimit(arg) => Box::new(DepthLimit::from_arg(arg)?), - Predicate::Match(arg) => Box::new(PatternMatcher::from_arg(arg)?), - })) - } -} - -impl EntryMatcher for WrappedMatcher { - type Arg = MatchExpression where Self: Sized; - - fn from_arg(arg: Self::Arg) -> Result - where - Self: Sized, - { - Ok(match arg { - MatchExpression::PrimitivePredicate(pred) => Self::create_primitive(pred)?, - MatchExpression::Negated(arg) => Self::Negated(Box::new(Self::from_arg(*arg)?)), - MatchExpression::And { - explicit: _, - left, - right, - } => { - let left = Box::new(Self::from_arg(*left)?); - let right = Box::new(Self::from_arg(*right)?); - Self::And { left, right } - } - MatchExpression::Or { left, right } => { - let left = Box::new(Self::from_arg(*left)?); - let right = Box::new(Self::from_arg(*right)?); - Self::Or { left, right } - } - MatchExpression::Grouped(inner) => Self::from_arg(*inner)?, - }) - } - - fn matches(&self, entry: &ZipFile) -> bool { - match self { - Self::Primitive(m) => m.matches(entry), - Self::Negated(m) => !m.matches(entry), - Self::And { left, right } => left.matches(entry) && right.matches(entry), - Self::Or { left, right } => left.matches(entry) || right.matches(entry), - } - } -} - -struct Transformer { - err: Rc>, - trans: NameTransform, -} - -impl Transformer { - pub fn new(err: Rc>, trans: NameTransform) -> Self { - Self { err, trans } - } -} - -impl Transformer -where - W: Write, -{ - pub fn evaluate<'s>(&self, name: &'s str) -> Cow<'s, str> { - match &self.trans { - NameTransform::Trivial(TrivialTransform::Identity) => Cow::Borrowed(name), - NameTransform::Basic(basic_trans) => match basic_trans { - BasicTransform::StripComponents(num_components_to_strip) => { - /* If no directory components, then nothing to strip. 
*/ - if !name.contains('/') { - return Cow::Borrowed(name); - } - /* We allow stripping 0 components, which does nothing. */ - if *num_components_to_strip == 0 { - return Cow::Borrowed(name); - } - /* Pop off prefix components until only one is left or we have stripped all the - * requested prefix components. */ - let mut num_components_to_strip: usize = (*num_components_to_strip).into(); - let mut separator_indices: VecDeque = - name.match_indices('/').map(|(i, _)| i).collect(); - debug_assert!(separator_indices.len() > 0); - /* Always keep the final separator, as regardless of how many we strip, we want - * to keep the basename in all cases. */ - while separator_indices.len() > 1 && num_components_to_strip > 0 { - let _ = separator_indices.pop_front().unwrap(); - num_components_to_strip -= 1; - } - debug_assert!(separator_indices.len() > 0); - let leftmost_remaining_separator_index: usize = - separator_indices.pop_front().unwrap(); - Cow::Borrowed(&name[(leftmost_remaining_separator_index + 1)..]) - } - BasicTransform::AddPrefix(prefix_to_add) => { - /* We allow an empty prefix, which means to do nothing. 
*/ - if prefix_to_add.is_empty() { - return Cow::Borrowed(name); - } - Cow::Owned(format!("{}/{}", prefix_to_add, name)) - } - }, - NameTransform::Complex(complex_trans) => match complex_trans { - ComplexTransform::RemovePrefix(remove_prefix_arg) => { - todo!("impl remove prefix: {:?}", remove_prefix_arg) - } - ComplexTransform::Transform(transform_arg) => { - todo!("impl transform: {:?}", transform_arg) - } - }, - } - } -} - -struct EntrySpecTransformer { - err: Rc>, - matcher: Option, - name_transformers: Vec>, - content_transform: ContentTransform, -} - -impl EntrySpecTransformer { - pub fn new(err: Rc>, entry_spec: EntrySpec) -> Result { - let EntrySpec { - match_expr, - name_transforms, - content_transform, - } = entry_spec; - let matcher = match match_expr { - None => None, - Some(expr) => Some(WrappedMatcher::from_arg(expr)?), - }; - let name_transformers: Vec<_> = name_transforms - .into_iter() - .map(|trans| Transformer::new(err.clone(), trans)) - .collect(); - Ok(Self { - err, - matcher, - name_transformers, - content_transform, - }) - } - - pub fn empty(err: Rc>) -> Self { - Self { - err, - matcher: None, - name_transformers: Vec::new(), - content_transform: ContentTransform::Extract, - } - } -} - -impl EntrySpecTransformer -where - W: Write, -{ - pub fn matches(&self, entry: &ZipFile) -> bool { - match &self.matcher { - None => true, - Some(matcher) => matcher.matches(entry), - } - } - - /// Transform the name from the zip entry, maintaining a few invariants: - /// 1. If the transformations all return substrings (no prefixing, non-empty replacements, or - /// empty replacements that lead to non-contiguous input chunks), return a slice of the - /// original input, pointing back to the ZipFile's memory location with associated lifetime. - /// 2. If some intermediate transformation requires an allocation (e.g. adding a prefix), do - /// not perform intermediate reallocations for subsequent substring-only transformations. 
- /// - TODO: The returned string may be reallocated from the initial allocation exactly once - /// at the end, if substring-only transformations reduced its length. This is because Cow - /// can only describe a substring of the original input or an entirely new allocated - /// string, as opposed to a more general sort of string view wrapper. - pub fn transform_name<'s>(&self, mut original_name: &'s str) -> Cow<'s, str> { - let mut newly_allocated_name: Option = None; - let mut newly_allocated_str: Option<&str> = None; - for transformer in self.name_transformers.iter() { - match newly_allocated_str { - Some(s) => match transformer.evaluate(s) { - Cow::Borrowed(t) => { - let _ = newly_allocated_str.replace(t); - } - Cow::Owned(t) => { - assert!(newly_allocated_name.replace(t).is_some()); - newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); - } - }, - None => match transformer.evaluate(original_name) { - Cow::Borrowed(t) => { - original_name = t; - } - Cow::Owned(t) => { - assert!(newly_allocated_name.replace(t).is_none()); - newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); - } - }, - } - } - - if newly_allocated_name.is_none() { - /* If we have never allocated anything new, just return the substring of the original - * name! */ - Cow::Borrowed(original_name) - } else { - let subref = newly_allocated_str.unwrap(); - /* If the active substring is the same length as the backing string, assume it's - * unchanged, so we can return the backing string without reallocating. 
*/ - if subref.len() == newly_allocated_name.as_ref().unwrap().len() { - Cow::Owned(newly_allocated_name.unwrap()) - } else { - let reallocated_string = subref.to_string(); - Cow::Owned(reallocated_string) - } - } - } - - pub fn content_transform(&self) -> &ContentTransform { - &self.content_transform - } -} - -trait IterateEntries { - fn next_entry(&mut self) -> Result, CommandError>; -} - -fn make_entry_iterator<'a>( - err: Rc>, - input_type: InputType, -) -> Result, CommandError> { - let ret: Box = match input_type { - InputType::StreamingStdin => Box::new(StdinInput::new(err)), - InputType::ZipPaths(zip_paths) => Box::new(AllInputZips::new(err, zip_paths)?), - }; - Ok(ret) -} - -struct StdinInput { - err: Rc>, - inner: io::Stdin, -} - -impl StdinInput { - pub fn new(err: Rc>) -> Self { - Self { - err, - inner: io::stdin(), - } - } -} - -impl IterateEntries for StdinInput { - fn next_entry(&mut self) -> Result, CommandError> { - read_zipfile_from_stream(&mut self.inner).wrap_err("failed to read zip entries from stdin") - } -} - -#[derive(Debug)] -struct ZipFileInput { - err: Rc>, - inner: ZipArchive, - file_counter: usize, -} - -impl ZipFileInput { - pub fn new(err: Rc>, inner: ZipArchive) -> Self { - Self { - err, - inner: inner, - file_counter: 0, - } - } - - pub fn remaining(&self) -> usize { - self.inner.len() - self.file_counter - } - - pub fn none_left(&self) -> bool { - self.remaining() == 0 - } -} - -impl IterateEntries for ZipFileInput { - fn next_entry(&mut self) -> Result, CommandError> { - if self.none_left() { - return Ok(None); - } - let prev_counter = self.file_counter; - self.file_counter += 1; - self.inner - .by_index(prev_counter) - .map(Some) - .wrap_err_with(|| format!("failed to read entry #{prev_counter} from zip",)) - } -} - -struct AllInputZips { - err: Rc>, - zips_todo: VecDeque>, - cur_zip: UnsafeCell>, -} - -impl AllInputZips { - pub fn new( - err: Rc>, - zip_paths: impl IntoIterator>, - ) -> Result { - let mut zips_todo = zip_paths - 
.into_iter() - .map(|p| { - fs::File::open(p.as_ref()) - .wrap_err_with(|| { - format!("failed to open zip input file path {:?}", p.as_ref()) - }) - .and_then(|f| { - ZipArchive::new(f).wrap_err_with(|| { - format!("failed to create zip archive for file {:?}", p.as_ref()) - }) - }) - .map(|archive| ZipFileInput::new(Rc::clone(&err), archive)) - }) - .collect::, CommandError>>()?; - debug_assert!(!zips_todo.is_empty()); - let cur_zip = zips_todo.pop_front().unwrap(); - Ok(Self { - err, - zips_todo, - cur_zip: UnsafeCell::new(cur_zip), - }) - } -} - -impl IterateEntries for AllInputZips { - fn next_entry(&mut self) -> Result, CommandError> { - loop { - if let Some(entry) = unsafe { &mut *self.cur_zip.get() }.next_entry()? { - return Ok(Some(entry)); - } - match self.zips_todo.pop_front() { - Some(zip) => { - self.cur_zip = UnsafeCell::new(zip); - } - None => { - return Ok(None); - } - } - } - } -} - -fn process_entry_specs( - err: Rc>, - entry_specs: impl IntoIterator, -) -> Result>, CommandError> -where - W: Write, -{ - let entry_spec_transformers: Vec> = entry_specs - .into_iter() - .map(|spec| EntrySpecTransformer::new(err.clone(), spec)) - .collect::>()?; - if entry_spec_transformers.is_empty() { - return Ok(vec![EntrySpecTransformer::empty(err.clone())]); - }; - - /* Perform some validation on the transforms since we don't currently support everything we - * want to. */ - if entry_spec_transformers - .iter() - .any(|t| *t.content_transform() == ContentTransform::Raw) - { - /* TODO: this can be solved if we can convert a ZipFile into a Raw reader! */ - return Err(CommandError::InvalidArg( - "--raw extraction output is not yet supported".to_string(), - )); - } - if entry_spec_transformers - .iter() - .filter(|t| *t.content_transform() != ContentTransform::LogToStderr) - .count() - > 1 - { - /* TODO: this can be solved by separating data from entries! 
*/ - return Err(CommandError::InvalidArg( - "more than one entry spec using a content transform which reads content (i.e. was not --log-to-stderr) was provided; this requires teeing entry contents which is not yet supported".to_string(), - )); - } - - Ok(entry_spec_transformers) -} +mod entries; +mod matcher; +mod receiver; +mod transform; pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { let Extract { @@ -910,10 +19,10 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE } = extract; let err = Rc::new(RefCell::new(err)); - let mut entry_receiver = make_entry_receiver(err.clone(), output)?; - let entry_spec_transformers = process_entry_specs(err.clone(), entry_specs)?; + let mut entry_receiver = receiver::make_entry_receiver(err.clone(), output)?; + let entry_spec_transformers = transform::process_entry_specs(entry_specs)?; let mut stderr_log_output = io::stderr(); - let mut entry_iterator = make_entry_iterator(err.clone(), input)?; + let mut entry_iterator = entries::make_entry_iterator(input)?; while let Some(mut entry) = entry_iterator.next_entry()? 
{ for transformer in entry_spec_transformers.iter() { diff --git a/cli/src/extract/entries.rs b/cli/src/extract/entries.rs new file mode 100644 index 000000000..8e0e322d6 --- /dev/null +++ b/cli/src/extract/entries.rs @@ -0,0 +1,132 @@ +use std::{ + cell::UnsafeCell, + collections::VecDeque, + fs, + io::{self}, + path::Path, +}; + +use zip::{ + read::{read_zipfile_from_stream, ZipFile}, + ZipArchive, +}; + +use crate::{args::extract::*, CommandError, WrapCommandErr}; + +pub trait IterateEntries { + fn next_entry(&mut self) -> Result, CommandError>; +} + +pub fn make_entry_iterator<'a>( + input_type: InputType, +) -> Result, CommandError> { + let ret: Box = match input_type { + InputType::StreamingStdin => Box::new(StdinInput::new()), + InputType::ZipPaths(zip_paths) => Box::new(AllInputZips::new(zip_paths)?), + }; + Ok(ret) +} + +struct StdinInput { + inner: io::Stdin, +} + +impl StdinInput { + pub fn new() -> Self { + Self { inner: io::stdin() } + } +} + +impl IterateEntries for StdinInput { + fn next_entry(&mut self) -> Result, CommandError> { + read_zipfile_from_stream(&mut self.inner).wrap_err("failed to read zip entries from stdin") + } +} + +#[derive(Debug)] +struct ZipFileInput { + inner: ZipArchive, + file_counter: usize, +} + +impl ZipFileInput { + pub fn new(inner: ZipArchive) -> Self { + Self { + inner: inner, + file_counter: 0, + } + } + + pub fn remaining(&self) -> usize { + self.inner.len() - self.file_counter + } + + pub fn none_left(&self) -> bool { + self.remaining() == 0 + } +} + +impl IterateEntries for ZipFileInput { + fn next_entry(&mut self) -> Result, CommandError> { + if self.none_left() { + return Ok(None); + } + let prev_counter = self.file_counter; + self.file_counter += 1; + self.inner + .by_index(prev_counter) + .map(Some) + .wrap_err_with(|| format!("failed to read entry #{prev_counter} from zip",)) + } +} + +struct AllInputZips { + zips_todo: VecDeque, + cur_zip: UnsafeCell, +} + +impl AllInputZips { + pub fn new( + zip_paths: impl 
IntoIterator>, + ) -> Result { + let mut zips_todo = zip_paths + .into_iter() + .map(|p| { + fs::File::open(p.as_ref()) + .wrap_err_with(|| { + format!("failed to open zip input file path {:?}", p.as_ref()) + }) + .and_then(|f| { + ZipArchive::new(f).wrap_err_with(|| { + format!("failed to create zip archive for file {:?}", p.as_ref()) + }) + }) + .map(ZipFileInput::new) + }) + .collect::, CommandError>>()?; + debug_assert!(!zips_todo.is_empty()); + let cur_zip = zips_todo.pop_front().unwrap(); + Ok(Self { + zips_todo, + cur_zip: UnsafeCell::new(cur_zip), + }) + } +} + +impl IterateEntries for AllInputZips { + fn next_entry(&mut self) -> Result, CommandError> { + loop { + if let Some(entry) = unsafe { &mut *self.cur_zip.get() }.next_entry()? { + return Ok(Some(entry)); + } + match self.zips_todo.pop_front() { + Some(zip) => { + self.cur_zip = UnsafeCell::new(zip); + } + None => { + return Ok(None); + } + } + } + } +} diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs new file mode 100644 index 000000000..adb04e2ae --- /dev/null +++ b/cli/src/extract/matcher.rs @@ -0,0 +1,360 @@ +use std::path::Path; + +use glob; +use regex; + +use zip::{ + read::ZipFile, + CompressionMethod, +}; + +use crate::{args::extract::*, CommandError}; + +#[inline(always)] +fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { + let path = Path::new(name); + match sel { + ComponentSelector::Path => Some(name), + ComponentSelector::Basename => path.file_name().map(|bname| bname.to_str().unwrap()), + ComponentSelector::Dirname => path + .parent() + .map(|p| p.to_str().unwrap()) + /* "a".parent() becomes Some(""), which we want to treat as no parent */ + .filter(|s| !s.is_empty()), + ComponentSelector::FileExtension => path.extension().map(|ext| ext.to_str().unwrap()), + } +} + +trait NameMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized; + fn matches(&self, input: &str) -> bool; +} + +struct 
LiteralMatcher { + lit: String, + case_insensitive: bool, +} + +impl NameMatcher for LiteralMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized, + { + let PatternModifiers { case_insensitive } = opts; + Ok(Self { + lit: pattern.to_string(), + case_insensitive, + }) + } + + fn matches(&self, input: &str) -> bool { + if self.case_insensitive { + self.lit.eq_ignore_ascii_case(input) + } else { + input == &self.lit + } + } +} + +struct GlobMatcher { + pat: glob::Pattern, + glob_opts: glob::MatchOptions, +} + +impl NameMatcher for GlobMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized, + { + let PatternModifiers { case_insensitive } = opts; + let glob_opts = glob::MatchOptions { + case_sensitive: !case_insensitive, + ..Default::default() + }; + let pat = glob::Pattern::new(pattern).map_err(|e| { + CommandError::InvalidArg(format!( + "failed to construct glob matcher from pattern {pattern:?}: {e}" + )) + })?; + Ok(Self { pat, glob_opts }) + } + + fn matches(&self, input: &str) -> bool { + self.pat.matches_with(input, self.glob_opts) + } +} + +struct RegexMatcher { + pat: regex::Regex, +} + +impl NameMatcher for RegexMatcher { + fn create(pattern: &str, opts: PatternModifiers) -> Result + where + Self: Sized, + { + let PatternModifiers { case_insensitive } = opts; + let pat = regex::RegexBuilder::new(pattern) + .case_insensitive(case_insensitive) + .build() + .map_err(|e| { + CommandError::InvalidArg(format!( + "failed to construct regex matcher from pattern {pattern:?}: {e}" + )) + })?; + Ok(Self { pat }) + } + + fn matches(&self, input: &str) -> bool { + self.pat.is_match(input) + } +} + +pub trait EntryMatcher { + type Arg + where + Self: Sized; + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized; + fn matches(&self, entry: &ZipFile) -> bool; +} + +#[derive(Copy, Clone)] +enum TrivialMatcher { + True, + False, +} + +impl EntryMatcher for TrivialMatcher { + type Arg = 
TrivialPredicate where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + TrivialPredicate::True => Self::True, + TrivialPredicate::False => Self::False, + }) + } + + fn matches(&self, _entry: &ZipFile) -> bool { + match self { + Self::True => true, + Self::False => false, + } + } +} + +#[derive(Copy, Clone)] +enum EntryTypeMatcher { + File, + Dir, + Symlink, +} + +impl EntryMatcher for EntryTypeMatcher { + type Arg = EntryType where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + EntryType::File => Self::File, + EntryType::Dir => Self::Dir, + EntryType::Symlink => Self::Symlink, + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match self { + Self::File => !entry.is_dir() && !entry.is_symlink(), + Self::Dir => entry.is_dir(), + Self::Symlink => entry.is_symlink(), + } + } +} + +#[derive(Copy, Clone)] +enum NonSpecificMethods { + Any, + Known, +} + +impl EntryMatcher for NonSpecificMethods { + type Arg = NonSpecificCompressionMethodArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + NonSpecificCompressionMethodArg::Any => Self::Any, + NonSpecificCompressionMethodArg::Known => Self::Known, + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match self { + Self::Any => true, + Self::Known => SpecificCompressionMethodArg::KNOWN_COMPRESSION_METHODS + .contains(&entry.compression()), + } + } +} + +struct SpecificMethods { + specific_method: CompressionMethod, +} + +impl EntryMatcher for SpecificMethods { + type Arg = SpecificCompressionMethodArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(Self { + specific_method: arg.translate_to_zip(), + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + self.specific_method == entry.compression() + } +} + +#[derive(Copy, Clone)] +enum DepthLimit { + Max(usize), + Min(usize), +} + +impl EntryMatcher for 
DepthLimit { + type Arg = DepthLimitArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + DepthLimitArg::Max(max) => Self::Max(max.into()), + DepthLimitArg::Min(min) => Self::Min(min.into()), + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + let num_components = entry.name().split('/').count(); + match self { + Self::Max(max) => num_components <= *max, + Self::Min(min) => num_components >= *min, + } + } +} + +struct PatternMatcher { + matcher: Box, + comp_sel: ComponentSelector, +} + +impl EntryMatcher for PatternMatcher { + type Arg = MatchArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + let MatchArg { + comp_sel, + pat_sel: PatternSelector { pat_sel, modifiers }, + pattern, + } = arg; + + let matcher: Box = match pat_sel { + PatternSelectorType::Glob => Box::new(GlobMatcher::create(&pattern, modifiers)?), + PatternSelectorType::Literal => Box::new(LiteralMatcher::create(&pattern, modifiers)?), + PatternSelectorType::Regexp => Box::new(RegexMatcher::create(&pattern, modifiers)?), + }; + + Ok(Self { matcher, comp_sel }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match process_component_selector(self.comp_sel, entry.name()) { + None => false, + Some(s) => self.matcher.matches(s), + } + } +} + +pub enum WrappedMatcher { + Primitive(Box), + Negated(Box), + And { + left: Box, + right: Box, + }, + Or { + left: Box, + right: Box, + }, +} + +impl WrappedMatcher { + fn create_primitive(arg: Predicate) -> Result { + Ok(Self::Primitive(match arg { + Predicate::Trivial(arg) => Box::new(TrivialMatcher::from_arg(arg)?), + Predicate::EntryType(arg) => Box::new(EntryTypeMatcher::from_arg(arg)?), + Predicate::CompressionMethod(method_arg) => match method_arg { + CompressionMethodArg::NonSpecific(arg) => { + Box::new(NonSpecificMethods::from_arg(arg)?) 
+ } + CompressionMethodArg::Specific(arg) => Box::new(SpecificMethods::from_arg(arg)?), + }, + Predicate::DepthLimit(arg) => Box::new(DepthLimit::from_arg(arg)?), + Predicate::Match(arg) => Box::new(PatternMatcher::from_arg(arg)?), + })) + } +} + +impl EntryMatcher for WrappedMatcher { + type Arg = MatchExpression where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + MatchExpression::PrimitivePredicate(pred) => Self::create_primitive(pred)?, + MatchExpression::Negated(arg) => Self::Negated(Box::new(Self::from_arg(*arg)?)), + MatchExpression::And { + explicit: _, + left, + right, + } => { + let left = Box::new(Self::from_arg(*left)?); + let right = Box::new(Self::from_arg(*right)?); + Self::And { left, right } + } + MatchExpression::Or { left, right } => { + let left = Box::new(Self::from_arg(*left)?); + let right = Box::new(Self::from_arg(*right)?); + Self::Or { left, right } + } + MatchExpression::Grouped(inner) => Self::from_arg(*inner)?, + }) + } + + fn matches(&self, entry: &ZipFile) -> bool { + match self { + Self::Primitive(m) => m.matches(entry), + Self::Negated(m) => !m.matches(entry), + Self::And { left, right } => left.matches(entry) && right.matches(entry), + Self::Or { left, right } => left.matches(entry) || right.matches(entry), + } + } +} diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs new file mode 100644 index 000000000..57a6b1b38 --- /dev/null +++ b/cli/src/extract/receiver.rs @@ -0,0 +1,203 @@ +use std::{ + cell::RefCell, + env, fs, + io::{self, Read, Write}, + mem, + path::PathBuf, + rc::Rc, +}; + +use zip::read::ZipFile; + +use crate::{args::extract::*, CommandError, WrapCommandErr}; + +pub trait EntryReceiver { + fn receive_entry<'a>( + &mut self, + entry: &mut ZipFile<'a>, + name: &str, + ) -> Result<(), CommandError>; + fn finalize_entries(&mut self) -> Result<(), CommandError>; +} + +pub fn make_entry_receiver<'a>( + err: Rc>, + collation: OutputCollation, +) -> 
Result, CommandError> { + let ret: Box = match collation { + OutputCollation::ConcatenateStdout => Box::new(StdoutReceiver::new(err)), + OutputCollation::Filesystem { output_dir, mkdir } => { + let output_dir = match output_dir { + Some(dir) => { + if mkdir { + fs::create_dir_all(&dir).wrap_err_with(|| { + format!("failed to create output directory {dir:?}") + })?; + } + dir + } + None => env::current_dir().wrap_err("failed to get current dir")?, + }; + Box::new(FilesystemReceiver::new(err, output_dir)) + } + }; + Ok(ret) +} + +struct StdoutReceiver { + err: Rc>, + stdout: io::Stdout, +} + +impl StdoutReceiver { + pub fn new(err: Rc>) -> Self { + Self { + err, + stdout: io::stdout(), + } + } +} + +impl EntryReceiver for StdoutReceiver +where + W: Write, +{ + fn receive_entry<'a>( + &mut self, + entry: &mut ZipFile<'a>, + name: &str, + ) -> Result<(), CommandError> { + let mut err = self.err.borrow_mut(); + writeln!(err, "receiving entry {} with name {name}", entry.name()).unwrap(); + if entry.is_dir() { + writeln!(err, "entry is directory, ignoring").unwrap(); + } else if entry.is_symlink() { + writeln!(err, "entry is symlink, ignoring").unwrap(); + } else { + io::copy(entry, &mut self.stdout) + .wrap_err_with(|| format!("failed to write entry {name} to stdout"))?; + } + Ok(()) + } + + fn finalize_entries(&mut self) -> Result<(), CommandError> { + Ok(()) + } +} + +struct FilesystemReceiver { + err: Rc>, + output_dir: PathBuf, + #[cfg(unix)] + perms_to_set: Vec<(PathBuf, u32)>, +} + +impl FilesystemReceiver { + pub fn new(err: Rc>, output_dir: PathBuf) -> Self { + Self { + err, + output_dir, + #[cfg(unix)] + perms_to_set: Vec::new(), + } + } +} + +impl EntryReceiver for FilesystemReceiver +where + W: Write, +{ + fn receive_entry<'a>( + &mut self, + entry: &mut ZipFile<'a>, + name: &str, + ) -> Result<(), CommandError> { + let mut err = self.err.borrow_mut(); + let full_output_path = self.output_dir.join(name); + writeln!( + err, + "receiving entry {} with name 
{name} and writing to path {full_output_path:?}", + entry.name() + ) + .unwrap(); + + #[cfg(unix)] + if let Some(mode) = entry.unix_mode() { + writeln!( + err, + "storing unix mode {mode} for path {full_output_path:?}" + ) + .unwrap(); + self.perms_to_set.push((full_output_path.clone(), mode)); + } + + if entry.is_dir() { + writeln!(err, "entry is directory, creating").unwrap(); + fs::create_dir_all(&full_output_path).wrap_err_with(|| { + format!("failed to create directory entry at {full_output_path:?}") + })?; + } else if entry.is_symlink() { + let mut target: Vec = Vec::with_capacity(entry.size().try_into().unwrap()); + entry.read_to_end(&mut target).wrap_err_with(|| { + format!( + "failed to read symlink target from zip archive entry {}", + entry.name() + ) + })?; + + #[cfg(unix)] + { + use std::{ + ffi::OsString, + os::unix::{ffi::OsStringExt, fs::symlink}, + }; + let target = OsString::from_vec(target); + writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); + symlink(&target, &full_output_path).wrap_err_with(|| { + format!( + "failed to create symlink at {full_output_path:?} with target {target:?}" + ) + })?; + } + #[cfg(not(unix))] + { + /* FIXME: non-unix symlink extraction not yet supported! 
*/ + todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); + } + } else { + writeln!(err, "entry is file, creating").unwrap(); + if let Some(containing_dir) = full_output_path.parent() { + fs::create_dir_all(containing_dir).wrap_err_with(|| { + format!("failed to create parent dirs for file at {full_output_path:?}") + })?; + } else { + writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); + } + let mut outfile = fs::File::create(&full_output_path) + .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; + io::copy(entry, &mut outfile).wrap_err_with(|| { + format!( + "failed to copy file contents from {} to {full_output_path:?}", + entry.name() + ) + })?; + } + Ok(()) + } + + fn finalize_entries(&mut self) -> Result<(), CommandError> { + #[cfg(unix)] + { + use std::{cmp::Reverse, os::unix::fs::PermissionsExt}; + + let mut perms_to_set = mem::take(&mut self.perms_to_set); + perms_to_set.sort_unstable_by_key(|(path, _)| Reverse(path.clone())); + for (path, mode) in perms_to_set.into_iter() { + let perms = fs::Permissions::from_mode(mode); + fs::set_permissions(&path, perms.clone()) + .wrap_err_with(|| format!("error setting perms {perms:?} for path {path:?}"))?; + } + } + Ok(()) + } +} diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs new file mode 100644 index 000000000..4c6f6369d --- /dev/null +++ b/cli/src/extract/transform.rs @@ -0,0 +1,208 @@ +use std::{borrow::Cow, collections::VecDeque}; + +use zip::read::ZipFile; + +use crate::{args::extract::*, CommandError}; + +use super::matcher::{EntryMatcher, WrappedMatcher}; + +struct Transformer { + trans: NameTransform, +} + +impl Transformer { + pub fn new(trans: NameTransform) -> Self { + Self { trans } + } +} + +impl Transformer { + pub fn evaluate<'s>(&self, name: &'s str) -> Cow<'s, str> { + match &self.trans { + NameTransform::Trivial(TrivialTransform::Identity) => Cow::Borrowed(name), + NameTransform::Basic(basic_trans) => match 
basic_trans { + BasicTransform::StripComponents(num_components_to_strip) => { + /* If no directory components, then nothing to strip. */ + if !name.contains('/') { + return Cow::Borrowed(name); + } + /* We allow stripping 0 components, which does nothing. */ + if *num_components_to_strip == 0 { + return Cow::Borrowed(name); + } + /* Pop off prefix components until only one is left or we have stripped all the + * requested prefix components. */ + let mut num_components_to_strip: usize = (*num_components_to_strip).into(); + let mut separator_indices: VecDeque = + name.match_indices('/').map(|(i, _)| i).collect(); + debug_assert!(separator_indices.len() > 0); + /* Always keep the final separator, as regardless of how many we strip, we want + * to keep the basename in all cases. */ + while separator_indices.len() > 1 && num_components_to_strip > 0 { + let _ = separator_indices.pop_front().unwrap(); + num_components_to_strip -= 1; + } + debug_assert!(separator_indices.len() > 0); + let leftmost_remaining_separator_index: usize = + separator_indices.pop_front().unwrap(); + Cow::Borrowed(&name[(leftmost_remaining_separator_index + 1)..]) + } + BasicTransform::AddPrefix(prefix_to_add) => { + /* We allow an empty prefix, which means to do nothing. 
*/ + if prefix_to_add.is_empty() { + return Cow::Borrowed(name); + } + Cow::Owned(format!("{}/{}", prefix_to_add, name)) + } + }, + NameTransform::Complex(complex_trans) => match complex_trans { + ComplexTransform::RemovePrefix(remove_prefix_arg) => { + todo!("impl remove prefix: {:?}", remove_prefix_arg) + } + ComplexTransform::Transform(transform_arg) => { + todo!("impl transform: {:?}", transform_arg) + } + }, + } + } +} + +pub struct EntrySpecTransformer { + matcher: Option, + name_transformers: Vec, + content_transform: ContentTransform, +} + +impl EntrySpecTransformer { + pub fn new(entry_spec: EntrySpec) -> Result { + let EntrySpec { + match_expr, + name_transforms, + content_transform, + } = entry_spec; + let matcher = match match_expr { + None => None, + Some(expr) => Some(WrappedMatcher::from_arg(expr)?), + }; + let name_transformers: Vec<_> = name_transforms + .into_iter() + .map(|trans| Transformer::new(trans)) + .collect(); + Ok(Self { + matcher, + name_transformers, + content_transform, + }) + } + + pub fn empty() -> Self { + Self { + matcher: None, + name_transformers: Vec::new(), + content_transform: ContentTransform::Extract, + } + } +} + +impl EntrySpecTransformer { + pub fn matches(&self, entry: &ZipFile) -> bool { + match &self.matcher { + None => true, + Some(matcher) => matcher.matches(entry), + } + } + + /// Transform the name from the zip entry, maintaining a few invariants: + /// 1. If the transformations all return substrings (no prefixing, non-empty replacements, or + /// empty replacements that lead to non-contiguous input chunks), return a slice of the + /// original input, pointing back to the ZipFile's memory location with associated lifetime. + /// 2. If some intermediate transformation requires an allocation (e.g. adding a prefix), do + /// not perform intermediate reallocations for subsequent substring-only transformations. 
+ /// - TODO: The returned string may be reallocated from the initial allocation exactly once + /// at the end, if substring-only transformations reduced its length. This is because Cow + /// can only describe a substring of the original input or an entirely new allocated + /// string, as opposed to a more general sort of string view wrapper. + pub fn transform_name<'s>(&self, mut original_name: &'s str) -> Cow<'s, str> { + let mut newly_allocated_name: Option = None; + let mut newly_allocated_str: Option<&str> = None; + for transformer in self.name_transformers.iter() { + match newly_allocated_str { + Some(s) => match transformer.evaluate(s) { + Cow::Borrowed(t) => { + let _ = newly_allocated_str.replace(t); + } + Cow::Owned(t) => { + assert!(newly_allocated_name.replace(t).is_some()); + newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); + } + }, + None => match transformer.evaluate(original_name) { + Cow::Borrowed(t) => { + original_name = t; + } + Cow::Owned(t) => { + assert!(newly_allocated_name.replace(t).is_none()); + newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); + } + }, + } + } + + if newly_allocated_name.is_none() { + /* If we have never allocated anything new, just return the substring of the original + * name! */ + Cow::Borrowed(original_name) + } else { + let subref = newly_allocated_str.unwrap(); + /* If the active substring is the same length as the backing string, assume it's + * unchanged, so we can return the backing string without reallocating. 
*/ + if subref.len() == newly_allocated_name.as_ref().unwrap().len() { + Cow::Owned(newly_allocated_name.unwrap()) + } else { + let reallocated_string = subref.to_string(); + Cow::Owned(reallocated_string) + } + } + } + + pub fn content_transform(&self) -> &ContentTransform { + &self.content_transform + } +} + +pub fn process_entry_specs( + entry_specs: impl IntoIterator, +) -> Result, CommandError> { + let entry_spec_transformers: Vec = entry_specs + .into_iter() + .map(|spec| EntrySpecTransformer::new(spec)) + .collect::>()?; + if entry_spec_transformers.is_empty() { + return Ok(vec![EntrySpecTransformer::empty()]); + }; + + /* Perform some validation on the transforms since we don't currently support everything we + * want to. */ + if entry_spec_transformers + .iter() + .any(|t| *t.content_transform() == ContentTransform::Raw) + { + /* TODO: this can be solved if we can convert a ZipFile into a Raw reader! */ + return Err(CommandError::InvalidArg( + "--raw extraction output is not yet supported".to_string(), + )); + } + if entry_spec_transformers + .iter() + .filter(|t| *t.content_transform() != ContentTransform::LogToStderr) + .count() + > 1 + { + /* TODO: this can be solved by separating data from entries! */ + return Err(CommandError::InvalidArg( + "more than one entry spec using a content transform which reads content (i.e. 
was not --log-to-stderr) was provided; this requires teeing entry contents which is not yet supported".to_string(), + )); + } + + Ok(entry_spec_transformers) +} From e2ae4c5cc832830424250a0d648e99385a8c105b Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:18:48 -0400 Subject: [PATCH 13/68] begin impl transforms --- cli/src/extract/matcher.rs | 7 +- cli/src/extract/transform.rs | 161 +++++++++++++++++++++++------------ 2 files changed, 108 insertions(+), 60 deletions(-) diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index adb04e2ae..71ce892da 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -3,15 +3,12 @@ use std::path::Path; use glob; use regex; -use zip::{ - read::ZipFile, - CompressionMethod, -}; +use zip::{read::ZipFile, CompressionMethod}; use crate::{args::extract::*, CommandError}; #[inline(always)] -fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { +pub fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { let path = Path::new(name); match sel { ComponentSelector::Path => Some(name), diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index 4c6f6369d..f9e21d99b 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -6,54 +6,113 @@ use crate::{args::extract::*, CommandError}; use super::matcher::{EntryMatcher, WrappedMatcher}; -struct Transformer { - trans: NameTransform, +trait NameTransformer { + type Arg + where + Self: Sized; + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized; + fn transform_name<'s>(&self, name: &'s str) -> Cow<'s, str>; } -impl Transformer { - pub fn new(trans: NameTransform) -> Self { - Self { trans } +#[derive(Copy, Clone)] +enum Trivial { + Identity, +} + +impl NameTransformer for Trivial { + type Arg = TrivialTransform where Self: Sized; + fn from_arg(arg: Self::Arg) -> 
Result + where + Self: Sized, + { + Ok(match arg { + TrivialTransform::Identity => Self::Identity, + }) + } + fn transform_name<'s>(&self, name: &'s str) -> Cow<'s, str> { + match self { + Self::Identity => Cow::Borrowed(name), + } + } +} + +struct StripComponents { + num_components_to_strip: usize, +} + +impl NameTransformer for StripComponents { + type Arg = u8 where Self: Sized; + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(Self { + num_components_to_strip: arg.into(), + }) + } + fn transform_name<'s>(&self, name: &'s str) -> Cow<'s, str> { + /* If no directory components, then nothing to strip. */ + if !name.contains('/') { + return Cow::Borrowed(name); + } + /* We allow stripping 0 components, which does nothing. */ + if self.num_components_to_strip == 0 { + return Cow::Borrowed(name); + } + /* Pop off prefix components until only one is left or we have stripped all the + * requested prefix components. */ + let mut remaining_to_strip = self.num_components_to_strip; + let mut separator_indices: VecDeque = + name.match_indices('/').map(|(i, _)| i).collect(); + debug_assert!(separator_indices.len() > 0); + /* Always keep the final separator, as regardless of how many we strip, we want + * to keep the basename in all cases. 
*/ + while separator_indices.len() > 1 && remaining_to_strip > 0 { + let _ = separator_indices.pop_front().unwrap(); + remaining_to_strip -= 1; + } + debug_assert!(separator_indices.len() > 0); + let leftmost_remaining_separator_index: usize = separator_indices.pop_front().unwrap(); + Cow::Borrowed(&name[(leftmost_remaining_separator_index + 1)..]) } } -impl Transformer { - pub fn evaluate<'s>(&self, name: &'s str) -> Cow<'s, str> { - match &self.trans { - NameTransform::Trivial(TrivialTransform::Identity) => Cow::Borrowed(name), +struct AddPrefix { + prefix_to_add: String, +} + +impl NameTransformer for AddPrefix { + type Arg = String where Self: Sized; + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(Self { prefix_to_add: arg }) + } + fn transform_name<'s>(&self, name: &'s str) -> Cow<'s, str> { + /* We allow an empty prefix, which means to do nothing. */ + if self.prefix_to_add.is_empty() { + return Cow::Borrowed(name); + } + Cow::Owned(format!("{}/{}", self.prefix_to_add, name)) + } +} + +pub struct EntrySpecTransformer { + matcher: Option, + name_transformers: Vec>, + content_transform: ContentTransform, +} + +impl EntrySpecTransformer { + fn make_transformer(trans: NameTransform) -> Result, CommandError> { + Ok(match trans { + NameTransform::Trivial(arg) => Box::new(Trivial::from_arg(arg)?), NameTransform::Basic(basic_trans) => match basic_trans { - BasicTransform::StripComponents(num_components_to_strip) => { - /* If no directory components, then nothing to strip. */ - if !name.contains('/') { - return Cow::Borrowed(name); - } - /* We allow stripping 0 components, which does nothing. */ - if *num_components_to_strip == 0 { - return Cow::Borrowed(name); - } - /* Pop off prefix components until only one is left or we have stripped all the - * requested prefix components. 
*/ - let mut num_components_to_strip: usize = (*num_components_to_strip).into(); - let mut separator_indices: VecDeque = - name.match_indices('/').map(|(i, _)| i).collect(); - debug_assert!(separator_indices.len() > 0); - /* Always keep the final separator, as regardless of how many we strip, we want - * to keep the basename in all cases. */ - while separator_indices.len() > 1 && num_components_to_strip > 0 { - let _ = separator_indices.pop_front().unwrap(); - num_components_to_strip -= 1; - } - debug_assert!(separator_indices.len() > 0); - let leftmost_remaining_separator_index: usize = - separator_indices.pop_front().unwrap(); - Cow::Borrowed(&name[(leftmost_remaining_separator_index + 1)..]) - } - BasicTransform::AddPrefix(prefix_to_add) => { - /* We allow an empty prefix, which means to do nothing. */ - if prefix_to_add.is_empty() { - return Cow::Borrowed(name); - } - Cow::Owned(format!("{}/{}", prefix_to_add, name)) - } + BasicTransform::StripComponents(arg) => Box::new(StripComponents::from_arg(arg)?), + BasicTransform::AddPrefix(arg) => Box::new(AddPrefix::from_arg(arg)?), }, NameTransform::Complex(complex_trans) => match complex_trans { ComplexTransform::RemovePrefix(remove_prefix_arg) => { @@ -63,17 +122,9 @@ impl Transformer { todo!("impl transform: {:?}", transform_arg) } }, - } + }) } -} -pub struct EntrySpecTransformer { - matcher: Option, - name_transformers: Vec, - content_transform: ContentTransform, -} - -impl EntrySpecTransformer { pub fn new(entry_spec: EntrySpec) -> Result { let EntrySpec { match_expr, @@ -86,8 +137,8 @@ impl EntrySpecTransformer { }; let name_transformers: Vec<_> = name_transforms .into_iter() - .map(|trans| Transformer::new(trans)) - .collect(); + .map(Self::make_transformer) + .collect::>()?; Ok(Self { matcher, name_transformers, @@ -127,7 +178,7 @@ impl EntrySpecTransformer { let mut newly_allocated_str: Option<&str> = None; for transformer in self.name_transformers.iter() { match newly_allocated_str { - Some(s) => match 
transformer.evaluate(s) { + Some(s) => match transformer.transform_name(s) { Cow::Borrowed(t) => { let _ = newly_allocated_str.replace(t); } @@ -136,7 +187,7 @@ impl EntrySpecTransformer { newly_allocated_str = Some(newly_allocated_name.as_ref().unwrap().as_str()); } }, - None => match transformer.evaluate(original_name) { + None => match transformer.transform_name(original_name) { Cow::Borrowed(t) => { original_name = t; } From 12fb722047d9bd64a6ef9d299eb720107aedc900 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 18:35:54 -0400 Subject: [PATCH 14/68] refactor args modules --- cli/src/args.rs | 1806 +------------------------------------- cli/src/args/compress.rs | 448 ++++++++++ cli/src/args/extract.rs | 1352 ++++++++++++++++++++++++++++ 3 files changed, 1802 insertions(+), 1804 deletions(-) create mode 100644 cli/src/args/compress.rs create mode 100644 cli/src/args/extract.rs diff --git a/cli/src/args.rs b/cli/src/args.rs index 9fa8706dd..c2cdd94f6 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -202,462 +202,7 @@ error: {context} Self: Sized; } -pub mod compress { - use super::{ArgParseError, CommandFormat}; - - use std::{collections::VecDeque, ffi::OsString, num::ParseIntError, path::PathBuf}; - - #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] - pub enum CompressionMethodArg { - Stored, - Deflate, /* requires having zip/_deflate-any set to compile */ - #[cfg(feature = "deflate64")] - Deflate64, - #[cfg(feature = "bzip2")] - Bzip2, - #[cfg(feature = "zstd")] - Zstd, - } - - #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] - pub struct CompressionLevel(pub i64); - - #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] - pub struct UnixPermissions(pub u32); - - impl UnixPermissions { - pub fn parse(s: &str) -> Result { - Ok(Self(u32::from_str_radix(s, 8)?)) - } - } - - #[derive(Debug)] - pub enum CompressionArg { - 
CompressionMethod(CompressionMethodArg), - Level(CompressionLevel), - UnixPermissions(UnixPermissions), - LargeFile(bool), - Name(String), - Dir, - Symlink, - Immediate(OsString), - FilePath(PathBuf), - RecursiveDirPath(PathBuf), - } - - #[derive(Debug)] - pub enum OutputType { - Stdout { allow_tty: bool }, - File { path: PathBuf, append: bool }, - } - - #[derive(Debug)] - pub struct Compress { - pub output: OutputType, - pub args: Vec, - pub positional_paths: Vec, - } - - impl Compress { - #[cfg(feature = "deflate64")] - const DEFLATE64_HELP_LINE: &'static str = " - deflate64:\twith deflate64\n"; - #[cfg(not(feature = "deflate64"))] - const DEFLATE64_HELP_LINE: &'static str = ""; - - #[cfg(feature = "bzip2")] - const BZIP2_HELP_LINE: &'static str = " - bzip2:\twith bzip2\n"; - #[cfg(not(feature = "bzip2"))] - const BZIP2_HELP_LINE: &'static str = ""; - - #[cfg(feature = "zstd")] - const ZSTD_HELP_LINE: &'static str = " - zstd:\twith zstd\n"; - #[cfg(not(feature = "zstd"))] - const ZSTD_HELP_LINE: &'static str = ""; - } - - impl CommandFormat for Compress { - const COMMAND_NAME: &'static str = "compress"; - const COMMAND_TABS: &'static str = "\t"; - const COMMAND_DESCRIPTION: &'static str = "Generate a zip archive from files, directories, and symlinks provided as arguments or read from filesystem paths."; - - const USAGE_LINE: &'static str = "[-h|--help] [OUTPUT-FLAGS] [ENTRY]... [--] [PATH]..."; - - fn generate_help() -> String { - format!( - r#" - -h, --help Print help - -Output flags: -Where and how to write the generated zip archive. - - -o, --output-file - Output zip file path to write. - The output file is truncated if it already exists, unless --append is - provided. If not provided, output is written to stdout. - - --append - If an output path is provided with -o, open it as an existing zip - archive and append to it. If the output path does not already exist, - no error is produced, and a new zip file is created at the given path. 
- - --stdout - Allow writing output to stdout even if stdout is a tty. - -Entries: -After output flags are provided, the rest of the command line is -attributes and entry data. Attributes modify later entries. - -Sticky attributes: -These flags apply to everything that comes after them until reset by another -instance of the same attribute. Sticky attributes continue to apply to -positional arguments received after processing all flags. - - -c, --compression-method - Which compression technique to use. - Defaults to deflate if not specified. - - Possible values: - - stored: uncompressed - - deflate: with deflate (default) -{}{}{} - -l, --compression-level - How much compression to perform, from 0..=24. - The accepted range of values differs for each technique. - - -m, --mode - Unix permissions to apply to the file, in octal (like chmod). - - --large-file [true|false] - Whether to enable large file support. - This may take up more space for records, but allows files over 32 bits - in length to be written, up to 64 bit sizes. - File arguments over 32 bits in length (either provided explicitly or - encountered when traversing a recursive directory) will have this flag - set automatically, without affecting the sticky value for - later options. - Therefore, this option likely never has to be set explicitly by - the user. - -Non-sticky attributes: -These flags only apply to the next entry after them, and may not be repeated. - - -n, --name - The name to apply to the entry. This must be UTF-8 encoded. - - -s, --symlink - Make the next entry into a symlink entry. - A symlink entry may be immediate with -i, or it may copy the target - from an existing symlink with -f. - -Entry data: -Each of these flags creates an entry in the output zip archive. - - -d, --dir - Create a directory entry. - A name must be provided beforehand with -n. 
- - -i, --immediate - Write an entry containing the data in the argument, which need not be - UTF-8 encoded but will exit early upon encountering any null bytes. - A name must be provided beforehand with -n. - - -f, --file - Write an entry with the contents of this file path. - A name may be provided beforehand with -n, otherwise the name will be - inferred from relativizing the given path to the working directory. - Note that sockets are currently not supported and will produce an - error. Providing a path to a directory will produce an error. - - If -s was specified beforehand, the path will be read as a symlink, - which will produce an error if the path does not point to a symbolic - link. If -s was not specified beforehand and a symlink path was - provided, then the symbolic link will be interpreted as if it was - a file with the contents of the symlink target, but with its name - corresponding to the symlink path (unless overridden with -n). - - -r, --recursive-dir - Write all the recursive contents of this directory path. - A name may be provided beforehand with -n, which will be used as the - prefix for all recursive contents of this directory. Otherwise, the - name will be inferred from relativizing the given path to the - working directory. - - -s is not allowed before this argument. If a path to a symbolic link - is provided, it will be treated as if it pointed to a directory with - the recursive contents of the target directory, but with its name - corresponding to the symlink path (unless overridden with -n). - Providing a symlink path which points to a file will produce an error. - -Positional entries: - [PATH]... - Write the file or recursive directory contents, relativizing the path. - If the given path points to a file, then a single file entry will - be written. - If the given path is a symlink, then a single symlink entry will - be written. 
- If the given path refers to a directory, then the recursive contents - will be written, reproducing files and symlinks. - Socket paths will produce an error. -"#, - Self::DEFLATE64_HELP_LINE, - Self::BZIP2_HELP_LINE, - Self::ZSTD_HELP_LINE, - ) - } - - fn parse_argv(mut argv: VecDeque) -> Result { - let mut allow_stdout: bool = false; - let mut append_to_output_path: bool = false; - let mut output_path: Option = None; - let mut args: Vec = Vec::new(); - let mut positional_paths: Vec = Vec::new(); - - while let Some(arg) = argv.pop_front() { - match arg.as_encoded_bytes() { - b"-h" | b"--help" => { - let help_text = Self::generate_full_help_text(); - return Err(ArgParseError::StdoutMessage(help_text)); - } - - /* Output flags */ - b"--stdout" => { - if let Some(output_path) = output_path.take() { - return Err(Self::exit_arg_invalid(&format!( - "--stdout provided along with output file {output_path:?}" - ))); - } else if append_to_output_path { - return Err(Self::exit_arg_invalid( - "--stdout provided along with --append", - )); - } else if !args.is_empty() || !positional_paths.is_empty() { - return Err(Self::exit_arg_invalid("--stdout provided after entries")); - } else if allow_stdout { - return Err(Self::exit_arg_invalid("--stdout provided twice")); - } else { - allow_stdout = true; - } - } - b"--append" => { - if append_to_output_path { - return Err(Self::exit_arg_invalid("--append provided twice")); - } else if !args.is_empty() || !positional_paths.is_empty() { - return Err(Self::exit_arg_invalid("--append provided after entries")); - } else if allow_stdout { - return Err(Self::exit_arg_invalid( - "--stdout provided along with --append", - )); - } else { - append_to_output_path = true; - } - } - b"-o" | b"--output-file" => { - let new_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { - Self::exit_arg_invalid("no argument provided for -o/--output-file") - })?; - if let Some(prev_path) = output_path.take() { - return Err(Self::exit_arg_invalid(&format!( 
- "--output-file provided twice: {prev_path:?} and {new_path:?}" - ))); - } else if allow_stdout { - return Err(Self::exit_arg_invalid( - "--stdout provided along with output file", - )); - } else if !args.is_empty() || !positional_paths.is_empty() { - return Err(Self::exit_arg_invalid( - "-o/--output-file provided after entries", - )); - } else { - output_path = Some(new_path); - } - } - - /* Attributes */ - b"-c" | b"--compression-method" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for -c/--compression-method", - )) - } - Some(name) => match name.as_encoded_bytes() { - b"stored" => args.push(CompressionArg::CompressionMethod( - CompressionMethodArg::Stored, - )), - b"deflate" => args.push(CompressionArg::CompressionMethod( - CompressionMethodArg::Deflate, - )), - #[cfg(feature = "deflate64")] - b"deflate64" => args.push(CompressionArg::CompressionMethod( - CompressionMethodArg::Deflate64, - )), - #[cfg(feature = "bzip2")] - b"bzip2" => args.push(CompressionArg::CompressionMethod( - CompressionMethodArg::Bzip2, - )), - #[cfg(feature = "zstd")] - b"zstd" => args.push(CompressionArg::CompressionMethod( - CompressionMethodArg::Zstd, - )), - _ => { - return Err(Self::exit_arg_invalid( - "unrecognized compression method {name:?}", - )); - } - }, - }, - b"-l" | b"--compression-level" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for -l/--compression-level", - )); - } - Some(level) => match level.into_string() { - Err(level) => { - return Err(Self::exit_arg_invalid(&format!( - "invalid unicode provided for compression level: {level:?}" - ))); - } - Ok(level) => match level.parse::() { - Err(e) => { - return Err(Self::exit_arg_invalid(&format!( - "failed to parse integer for compression level: {e}" - ))); - } - Ok(level) => { - if (0..=24).contains(&level) { - args.push(CompressionArg::Level(CompressionLevel(level))) - } else { - return 
Err(Self::exit_arg_invalid(&format!( - "compression level {level} was not between 0 and 24" - ))); - } - } - }, - }, - }, - b"-m" | b"--mode" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for -m/--mode", - )); - } - Some(mode) => match mode.into_string() { - Err(mode) => { - return Err(Self::exit_arg_invalid(&format!( - "invalid unicode provided for mode: {mode:?}" - ))); - } - Ok(mode) => match UnixPermissions::parse(&mode) { - Err(e) => { - return Err(Self::exit_arg_invalid(&format!( - "failed to parse integer for mode: {e}" - ))); - } - Ok(mode) => args.push(CompressionArg::UnixPermissions(mode)), - }, - }, - }, - b"--large-file" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for --large-file", - )); - } - Some(large_file) => match large_file.as_encoded_bytes() { - b"true" => args.push(CompressionArg::LargeFile(true)), - b"false" => args.push(CompressionArg::LargeFile(false)), - _ => { - return Err(Self::exit_arg_invalid(&format!( - "unrecognized value for --large-file: {large_file:?}" - ))); - } - }, - }, - - /* Data */ - b"-n" | b"--name" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for -n/--name", - )) - } - Some(name) => match name.into_string() { - Err(name) => { - return Err(Self::exit_arg_invalid(&format!( - "invalid unicode provided for name: {name:?}" - ))); - } - Ok(name) => args.push(CompressionArg::Name(name)), - }, - }, - b"-s" | b"--symlink" => args.push(CompressionArg::Symlink), - b"-d" | b"--dir" => args.push(CompressionArg::Dir), - b"-i" | b"--immediate" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for -i/--immediate", - )); - } - Some(data) => args.push(CompressionArg::Immediate(data)), - }, - b"-f" | b"--file" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for -f/--file", - 
)); - } - Some(file) => args.push(CompressionArg::FilePath(file.into())), - }, - b"-r" | b"--recursive-dir" => match argv.pop_front() { - None => { - return Err(Self::exit_arg_invalid( - "no argument provided for -r/--recursive-dir", - )); - } - Some(dir) => args.push(CompressionArg::RecursiveDirPath(dir.into())), - }, - - /* Transition to positional args */ - b"--" => break, - arg_bytes => { - if arg_bytes.starts_with(b"-") { - return Err(Self::exit_arg_invalid(&format!( - "unrecognized flag {arg:?}" - ))); - } else { - argv.push_front(arg); - break; - } - } - } - } - - positional_paths.extend(argv.into_iter().map(|arg| arg.into())); - - let output = if let Some(path) = output_path { - OutputType::File { - path, - append: append_to_output_path, - } - } else { - OutputType::Stdout { - allow_tty: allow_stdout, - } - }; - - Ok(Self { - output, - args, - positional_paths, - }) - } - } - - impl crate::driver::ExecuteCommand for Compress { - fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { - crate::compress::execute_compress(err, self) - } - } -} +pub mod compress; pub mod info { #[derive(Debug)] @@ -671,1351 +216,4 @@ pub mod info { } } -pub mod extract { - use super::{ArgParseError, CommandFormat}; - - use zip::CompressionMethod; - - use std::{collections::VecDeque, ffi::OsString, mem, path::PathBuf}; - - #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub enum ContentTransform { - Extract, - /* FIXME: not yet supported -- could be done by exposing ZipFile::take_raw_reader(), but - * should probably just refactor extract.rs to avoid the need for that. - * NB: actually, we can't do that while supporting streaming archives unless we expose - * take_raw_reader()! 
*/ - Raw, - LogToStderr, - } - - #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] - pub enum ComponentSelector { - #[default] - Path, - Basename, - Dirname, - FileExtension, - } - - impl ComponentSelector { - pub fn parse(s: &[u8]) -> Option { - match s { - b"path" => Some(Self::Path), - b"basename" => Some(Self::Basename), - b"dirname" => Some(Self::Dirname), - b"ext" => Some(Self::FileExtension), - _ => None, - } - } - } - - #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] - pub enum PatternSelectorType { - #[default] - Glob, - Literal, - Regexp, - } - - impl PatternSelectorType { - pub fn parse(s: &[u8]) -> Option { - match s { - b"glob" => Some(Self::Glob), - b"lit" => Some(Self::Literal), - b"rx" => Some(Self::Regexp), - _ => None, - } - } - } - - #[derive(Debug)] - pub enum PatternSelectorModifier { - CaseInsensitive, - } - - impl PatternSelectorModifier { - pub fn parse(s: &[u8]) -> Option { - match s { - b"i" => Some(Self::CaseInsensitive), - _ => None, - } - } - } - - #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] - pub struct PatternModifiers { - pub case_insensitive: bool, - } - - #[derive(Debug, Default)] - pub struct PatternSelector { - pub pat_sel: PatternSelectorType, - pub modifiers: PatternModifiers, - } - - impl PatternSelector { - pub fn parse(s: &[u8]) -> Option { - match s.iter().position(|c| *c == b':') { - Some(modifiers_ind) => { - let pat_sel_str = &s[..modifiers_ind]; - let modifiers_str = &s[(modifiers_ind + 1)..]; - - let pat_sel = PatternSelectorType::parse(pat_sel_str)?; - - let mut modifiers = PatternModifiers::default(); - let mod_els = modifiers_str - .split(|c| *c == b':') - .map(PatternSelectorModifier::parse) - .collect::>>()?; - for m in mod_els.into_iter() { - match m { - PatternSelectorModifier::CaseInsensitive => { - modifiers.case_insensitive = true; - } - } - } - Some(Self { pat_sel, modifiers }) - } - None => { - let pat_sel = 
PatternSelectorType::parse(s)?; - Some(Self { - pat_sel, - modifiers: Default::default(), - }) - } - } - } - } - - pub fn parse_only_pat_sel(s: &[u8]) -> Option { - match s.iter().position(|c| *c == b':') { - Some(pat_sel_ind) => { - let pat_sel_str = &s[(pat_sel_ind + 1)..]; - - let pat_sel = PatternSelector::parse(pat_sel_str)?; - Some(pat_sel) - } - None => Some(PatternSelector::default()), - } - } - - pub fn parse_comp_and_pat_sel(s: &[u8]) -> Option<(ComponentSelector, PatternSelector)> { - match ( - s.iter().position(|c| *c == b'='), - s.iter().position(|c| *c == b':'), - ) { - (Some(comp_sel_ind), Some(pat_sel_ind)) => { - if comp_sel_ind >= pat_sel_ind { - return None; - } - let comp_sel_str = &s[(comp_sel_ind + 1)..pat_sel_ind]; - let pat_sel_str = &s[(pat_sel_ind + 1)..]; - - let comp_sel = ComponentSelector::parse(comp_sel_str)?; - let pat_sel = PatternSelector::parse(pat_sel_str)?; - Some((comp_sel, pat_sel)) - } - (Some(comp_sel_ind), None) => { - let comp_sel_str = &s[(comp_sel_ind + 1)..]; - - let comp_sel = ComponentSelector::parse(comp_sel_str)?; - let pat_sel = PatternSelector::default(); - Some((comp_sel, pat_sel)) - } - (None, Some(pat_sel_ind)) => { - let pat_sel_str = &s[(pat_sel_ind + 1)..]; - - let pat_sel = PatternSelector::parse(pat_sel_str)?; - let comp_sel = ComponentSelector::default(); - Some((comp_sel, pat_sel)) - } - (None, None) => { - let comp_sel = ComponentSelector::default(); - let pat_sel = PatternSelector::default(); - Some((comp_sel, pat_sel)) - } - } - } - - #[derive(Debug)] - pub enum EntryType { - File, - Dir, - Symlink, - } - - impl EntryType { - pub fn parse(s: &[u8]) -> Option { - match s { - b"file" => Some(Self::File), - b"dir" => Some(Self::Dir), - b"symlink" => Some(Self::Symlink), - _ => None, - } - } - } - - #[derive(Debug, PartialEq, Eq)] - pub enum NonSpecificCompressionMethodArg { - Any, - Known, - } - - #[derive(Debug, PartialEq, Eq, Copy, Clone)] - pub enum SpecificCompressionMethodArg { - Stored, - Deflated, 
- #[cfg(feature = "deflate64")] - Deflate64, - #[cfg(feature = "bzip2")] - Bzip2, - #[cfg(feature = "zstd")] - Zstd, - #[cfg(feature = "lzma")] - Lzma, - #[cfg(feature = "xz")] - Xz, - } - - impl SpecificCompressionMethodArg { - pub const KNOWN_COMPRESSION_METHODS: &[CompressionMethod] = &[ - CompressionMethod::Stored, - CompressionMethod::Deflated, - #[cfg(feature = "deflate64")] - CompressionMethod::Deflate64, - #[cfg(feature = "bzip2")] - CompressionMethod::Bzip2, - #[cfg(feature = "zstd")] - CompressionMethod::Zstd, - #[cfg(feature = "lzma")] - CompressionMethod::Lzma, - #[cfg(feature = "xz")] - CompressionMethod::Xz, - ]; - - pub fn translate_to_zip(self) -> CompressionMethod { - match self { - Self::Stored => CompressionMethod::Stored, - Self::Deflated => CompressionMethod::Deflated, - #[cfg(feature = "deflate64")] - Self::Deflate64 => CompressionMethod::Deflate64, - #[cfg(feature = "bzip2")] - Self::Bzip2 => CompressionMethod::Bzip2, - #[cfg(feature = "zstd")] - Self::Zstd => CompressionMethod::Zstd, - #[cfg(feature = "lzma")] - Self::Lzma => CompressionMethod::Lzma, - #[cfg(feature = "xz")] - Self::Xz => CompressionMethod::Xz, - } - } - } - - #[derive(Debug, PartialEq, Eq)] - pub enum CompressionMethodArg { - NonSpecific(NonSpecificCompressionMethodArg), - Specific(SpecificCompressionMethodArg), - } - - impl CompressionMethodArg { - pub fn parse(s: &[u8]) -> Option { - match s { - b"any" => Some(Self::NonSpecific(NonSpecificCompressionMethodArg::Any)), - b"known" => Some(Self::NonSpecific(NonSpecificCompressionMethodArg::Known)), - b"stored" => Some(Self::Specific(SpecificCompressionMethodArg::Stored)), - b"deflated" => Some(Self::Specific(SpecificCompressionMethodArg::Deflated)), - #[cfg(feature = "deflate64")] - b"deflate64" => Some(Self::Specific(SpecificCompressionMethodArg::Deflate64)), - #[cfg(feature = "bzip2")] - b"bzip2" => Some(Self::Specific(SpecificCompressionMethodArg::Bzip2)), - #[cfg(feature = "zstd")] - b"zstd" => 
Some(Self::Specific(SpecificCompressionMethodArg::Zstd)), - #[cfg(feature = "lzma")] - b"lzma" => Some(Self::Specific(SpecificCompressionMethodArg::Lzma)), - #[cfg(feature = "xz")] - b"xz" => Some(Self::Specific(SpecificCompressionMethodArg::Xz)), - _ => None, - } - } - } - - #[derive(Debug)] - pub enum DepthLimitArg { - Max(u8), - Min(u8), - } - - #[derive(Debug)] - pub struct MatchArg { - pub comp_sel: ComponentSelector, - pub pat_sel: PatternSelector, - pub pattern: String, - } - - #[derive(Debug)] - pub enum TrivialPredicate { - True, - False, - } - - #[derive(Debug)] - pub enum Predicate { - Trivial(TrivialPredicate), - EntryType(EntryType), - CompressionMethod(CompressionMethodArg), - DepthLimit(DepthLimitArg), - Match(MatchArg), - } - - #[derive(Debug)] - enum ExprOp { - Negation, - And, - Or, - } - - #[derive(Debug)] - enum ExprArg { - PrimitivePredicate(Predicate), - Op(ExprOp), - Subgroup(MatchExpression), - } - - #[derive(Debug, Default)] - struct SingleExprLevel { - expr_args: Vec, - } - - impl SingleExprLevel { - pub fn push_arg(&mut self, arg: ExprArg) { - self.expr_args.push(arg); - } - - fn get_negation( - expr_args: &mut VecDeque, - ) -> Result { - let negated_expr: MatchExpression = match expr_args.pop_front().ok_or_else(|| { - Extract::exit_arg_invalid(&format!( - "negation was only expression in list inside match expr (rest: {expr_args:?})" - )) - })? { - ExprArg::Subgroup(match_expr) => { - /* We have a valid match expression, so just negate it without - * wrapping. */ - MatchExpression::Negated(Box::new(match_expr)) - } - ExprArg::PrimitivePredicate(predicate) => { - /* We got a primitive predicate, so just negate it! */ - MatchExpression::Negated(Box::new(MatchExpression::PrimitivePredicate( - predicate, - ))) - } - ExprArg::Op(op) => { - /* Negation before any other operator is invalid. 
*/ - return Err(Extract::exit_arg_invalid(&format!( - "negation before operator {op:?} inside match expr is invalid (rest: {expr_args:?})" - ))); - } - }; - Ok(negated_expr) - } - - fn get_non_operator( - expr_args: &mut VecDeque, - ) -> Result { - let next_expr: MatchExpression = match expr_args.pop_front().ok_or_else(|| { - /* We can't fold an empty list. */ - Extract::exit_arg_invalid(&format!( - "empty expression list inside match expr (rest: {expr_args:?})" - )) - })? { - /* This is already an evaluated match expression, so just start with that. */ - ExprArg::Subgroup(match_expr) => match_expr, - ExprArg::PrimitivePredicate(predicate) => { - /* Success! We start with a simple predicate. */ - MatchExpression::PrimitivePredicate(predicate) - } - ExprArg::Op(op) => match op { - /* We started with negation, which means we need to get the next arg to resolve - * it. */ - ExprOp::Negation => Self::get_negation(expr_args)?, - /* Starting with a binary operator is invalid. */ - op @ (ExprOp::And | ExprOp::Or) => { - return Err(Extract::exit_arg_invalid(&format!( - "expression list cannot begin with binary operator {op:?} (rest: {expr_args:?})" - ))); - } - }, - }; - Ok(next_expr) - } - - pub fn fold(self) -> Result { - let Self { expr_args } = self; - let mut expr_args: VecDeque<_> = expr_args.into(); - - /* Get a valid match expression to start our fold with. */ - let mut cur_expr: MatchExpression = Self::get_non_operator(&mut expr_args)?; - - /* Now fold the expression rightwards! */ - while let Some(next_arg) = expr_args.pop_front() { - match next_arg { - /* Implicit AND, wrapping the primitive result into a match. */ - ExprArg::PrimitivePredicate(predicate) => { - let next_expr = MatchExpression::PrimitivePredicate(predicate); - cur_expr = MatchExpression::And { - explicit: false, - left: Box::new(cur_expr), - right: Box::new(next_expr), - }; - } - /* Implicit AND, without needing to wrap the result. 
*/ - ExprArg::Subgroup(match_expr) => { - cur_expr = MatchExpression::And { - explicit: false, - left: Box::new(cur_expr), - right: Box::new(match_expr), - }; - } - /* Evaluate the operator according to association. */ - ExprArg::Op(op) => match op { - /* Negation applies to the next element, so retrieve it! */ - ExprOp::Negation => { - let next_expr = Self::get_negation(&mut expr_args)?; - cur_expr = MatchExpression::And { - explicit: false, - left: Box::new(cur_expr), - right: Box::new(next_expr), - }; - } - /* Explicit AND requires the next element. */ - ExprOp::And => { - let next_expr = Self::get_non_operator(&mut expr_args)?; - cur_expr = MatchExpression::And { - explicit: true, - left: Box::new(cur_expr), - right: Box::new(next_expr), - }; - } - /* OR requires the next element. */ - ExprOp::Or => { - let next_expr = Self::get_non_operator(&mut expr_args)?; - cur_expr = MatchExpression::Or { - left: Box::new(cur_expr), - right: Box::new(next_expr), - }; - } - }, - } - } - - assert!(expr_args.is_empty()); - Ok(cur_expr) - } - } - - #[derive(Debug)] - pub enum MatchExpression { - PrimitivePredicate(Predicate), - Negated(Box), - And { - explicit: bool, - left: Box, - right: Box, - }, - Or { - left: Box, - right: Box, - }, - Grouped(Box), - } - - impl MatchExpression { - pub fn parse_argv(argv: &mut VecDeque) -> Result { - let mut expr_stack: Vec = Vec::new(); - let mut top_exprs = SingleExprLevel::default(); - - while let Some(arg) = argv.pop_front() { - match arg.as_encoded_bytes() { - /* Parse primitive predicates. 
*/ - b"-true" => { - top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Trivial( - TrivialPredicate::True, - ))); - } - b"-false" => { - top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Trivial( - TrivialPredicate::False, - ))); - } - b"-t" | b"--type" => { - let type_arg = argv.pop_front().ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for -t/--type") - })?; - let entry_type = - EntryType::parse(type_arg.as_encoded_bytes()).ok_or_else(|| { - Extract::exit_arg_invalid(&format!( - "invalid --type argument: {type_arg:?}" - )) - })?; - top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::EntryType( - entry_type, - ))); - } - b"--compression-method" => { - let method_arg = argv.pop_front().ok_or_else(|| { - Extract::exit_arg_invalid( - "no argument provided for --compression-method", - ) - })?; - let method = CompressionMethodArg::parse(method_arg.as_encoded_bytes()) - .ok_or_else(|| { - Extract::exit_arg_invalid(&format!( - "invalid --compression-method argument: {method_arg:?}" - )) - })?; - top_exprs.push_arg(ExprArg::PrimitivePredicate( - Predicate::CompressionMethod(method), - )); - } - b"--max-depth" => { - let max_depth: u8 = argv - .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --max-depth") - })? - .into_string() - .map_err(|depth_arg| { - Extract::exit_arg_invalid(&format!( - "invalid unicode provided for --max-depth: {depth_arg:?}" - )) - })? - .parse::() - .map_err(|e| { - Extract::exit_arg_invalid(&format!( - "failed to parse --max-depth arg {e:?} as u8" - )) - })?; - top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( - DepthLimitArg::Max(max_depth), - ))); - } - b"--min-depth" => { - let min_depth: u8 = argv - .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --min-depth") - })? 
- .into_string() - .map_err(|depth_arg| { - Extract::exit_arg_invalid(&format!( - "invalid unicode provided for --min-depth: {depth_arg:?}" - )) - })? - .parse::() - .map_err(|e| { - Extract::exit_arg_invalid(&format!( - "failed to parse --min-depth arg {e:?} as u8" - )) - })?; - top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( - DepthLimitArg::Min(min_depth), - ))); - } - b"-m" => { - let pattern: String = argv - .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for -m") - })? - .into_string() - .map_err(|pattern| { - Extract::exit_arg_invalid(&format!( - "invalid unicode provided for -m: {pattern:?}" - )) - })?; - let comp_sel = ComponentSelector::default(); - let pat_sel = PatternSelector::default(); - top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Match( - MatchArg { - comp_sel, - pat_sel, - pattern, - }, - ))); - } - arg_bytes if arg_bytes.starts_with(b"--match") => { - let (comp_sel, pat_sel) = - parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { - Extract::exit_arg_invalid(&format!( - "invalid --match argument modifiers: {arg:?}" - )) - })?; - let pattern: String = argv - .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --match") - })? - .into_string() - .map_err(|pattern| { - Extract::exit_arg_invalid(&format!( - "invalid unicode provided for --match: {pattern:?}" - )) - })?; - top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Match( - MatchArg { - comp_sel, - pat_sel, - pattern, - }, - ))); - } - - /* Parse operators. */ - b"!" | b"-not" => { - top_exprs.push_arg(ExprArg::Op(ExprOp::Negation)); - } - b"&" | b"-and" => { - top_exprs.push_arg(ExprArg::Op(ExprOp::And)); - } - b"|" | b"-or" => { - top_exprs.push_arg(ExprArg::Op(ExprOp::Or)); - } - - /* Process groups with stack logic! */ - b"(" | b"-open" => { - expr_stack.push(mem::take(&mut top_exprs)); - } - b")" | b"-close" => { - /* Get the unevaluated exprs from the previous nesting level. 
*/ - let prev_level = expr_stack.pop().ok_or_else(|| { - Extract::exit_arg_invalid("too many close parens inside match expr") - })?; - /* Move the previous nesting level into current, and evaluate the current - * nesting level. */ - let group_expr = mem::replace(&mut top_exprs, prev_level).fold()?; - /* Wrap the completed group in a Grouped. */ - let group_expr = MatchExpression::Grouped(Box::new(group_expr)); - /* Push the completed and evaluated group into the current nesting level. */ - top_exprs.push_arg(ExprArg::Subgroup(group_expr)); - } - - /* Conclude the match expr processing. */ - b"--expr" => { - break; - } - _ => { - return Err(Extract::exit_arg_invalid(&format!( - "unrecognized match expression component {arg:?}: all match expressions must start and end with a --expr flag" - ))); - } - } - } - - if !expr_stack.is_empty() { - return Err(Extract::exit_arg_invalid( - "not enough close parens inside match expr", - )); - } - top_exprs.fold() - } - } - - #[derive(Debug)] - pub enum TrivialTransform { - Identity, - } - - #[derive(Debug)] - pub enum BasicTransform { - StripComponents(u8), - AddPrefix(String), - } - - #[derive(Debug)] - pub struct TransformArg { - pub comp_sel: ComponentSelector, - pub pat_sel: PatternSelector, - pub pattern: String, - pub replacement_spec: String, - } - - #[derive(Debug)] - pub struct RemovePrefixArg { - pub pat_sel: PatternSelector, - pub pattern: String, - } - - #[derive(Debug)] - pub enum ComplexTransform { - Transform(TransformArg), - RemovePrefix(RemovePrefixArg), - } - - #[derive(Debug)] - pub enum NameTransform { - Trivial(TrivialTransform), - Basic(BasicTransform), - Complex(ComplexTransform), - } - - #[derive(Debug)] - enum ExtractArg { - Match(MatchExpression), - NameTransform(NameTransform), - ContentTransform(ContentTransform), - } - - #[derive(Debug)] - pub struct EntrySpec { - pub match_expr: Option, - pub name_transforms: Vec, - pub content_transform: ContentTransform, - } - - impl EntrySpec { - fn 
parse_extract_args( - args: impl IntoIterator, - ) -> Result, ArgParseError> { - let mut match_expr: Option = None; - let mut name_transforms: Vec = Vec::new(); - - let mut ret: Vec = Vec::new(); - - for arg in args.into_iter() { - match arg { - ExtractArg::Match(new_expr) => { - if let Some(prev_expr) = match_expr.take() { - return Err(Extract::exit_arg_invalid(&format!( - "more than one match expr was provided for the same entry: {prev_expr:?} and {new_expr:?}" - ))); - } - match_expr = Some(new_expr); - } - ExtractArg::NameTransform(n_trans) => { - name_transforms.push(n_trans); - } - ExtractArg::ContentTransform(c_trans) => { - let spec = Self { - match_expr: match_expr.take(), - name_transforms: mem::take(&mut name_transforms), - content_transform: c_trans, - }; - ret.push(spec); - } - } - } - if let Some(match_expr) = match_expr { - return Err(Extract::exit_arg_invalid(&format!( - "match expr {match_expr:?} was provided with no corresponding content \ -transform. add -x/--extract to construct a complete entry spec" - ))); - } - if !name_transforms.is_empty() { - return Err(Extract::exit_arg_invalid(&format!( - "name transforms {name_transforms:?} were provided with no corresponding \ -content transform. 
add -x/--extract to construct a complete entry spec" - ))); - } - - Ok(ret) - } - } - - #[derive(Debug)] - pub enum OutputCollation { - ConcatenateStdout, - Filesystem { - output_dir: Option, - mkdir: bool, - }, - } - - #[derive(Debug)] - pub enum InputType { - StreamingStdin, - ZipPaths(Vec), - } - - #[derive(Debug)] - pub struct Extract { - pub output: OutputCollation, - pub entry_specs: Vec, - pub input: InputType, - } - - impl Extract { - #[cfg(feature = "deflate64")] - const DEFLATE64_HELP_LINE: &'static str = " - deflate64:\twith deflate64\n"; - #[cfg(not(feature = "deflate64"))] - const DEFLATE64_HELP_LINE: &'static str = ""; - - #[cfg(feature = "bzip2")] - const BZIP2_HELP_LINE: &'static str = " - bzip2:\twith bzip2\n"; - #[cfg(not(feature = "bzip2"))] - const BZIP2_HELP_LINE: &'static str = ""; - - #[cfg(feature = "zstd")] - const ZSTD_HELP_LINE: &'static str = " - zstd:\twith zstd\n"; - #[cfg(not(feature = "zstd"))] - const ZSTD_HELP_LINE: &'static str = ""; - - #[cfg(feature = "lzma")] - const LZMA_HELP_LINE: &'static str = " - lzma:\twith lzma\n"; - #[cfg(not(feature = "lzma"))] - const LZMA_HELP_LINE: &'static str = ""; - - #[cfg(feature = "xz")] - const XZ_HELP_LINE: &'static str = " - xz:\t\twith xz\n"; - #[cfg(not(feature = "xz"))] - const XZ_HELP_LINE: &'static str = ""; - } - - impl CommandFormat for Extract { - const COMMAND_NAME: &'static str = "extract"; - const COMMAND_TABS: &'static str = "\t"; - const COMMAND_DESCRIPTION: &'static str = - "Extract individual entries or an entire archive into a stream or the filesystem."; - - const USAGE_LINE: &'static str = - "[-h|--help] [OUTPUT-FLAGS] [ENTRY-SPEC]... [--stdin|[--] ZIP-PATH...]"; - - fn generate_help() -> String { - format!( - r#" - -h, --help Print help - -Output flags: -Where and how to collate the extracted entries. - - -d, --output-directory - Output directory path to write extracted entries into. 
- Paths for extracted entries will be constructed by interpreting entry - names as relative paths to the provided directory. If the provided - path is not a directory, an error is produced. If the provided path - does not exist, an error is produced unless --mkdir is specified. - If not provided, entries will be extracted into the current directory - (as if '-d .' had been provided). - - --mkdir - If an output directory is provided with -d and the directory path does - not exist, create it along with any missing parent directories. - If the path provided to -d is not a directory, an error will still be - produced if this flag is also provided. - - --stdout - Concatenate all extracted entries and write them in order to stdout - instead of writing anything to the filesystem. - This disables some optimizations that are possible when extracting to - the filesystem. - This will write output to stdout even if stdout is a tty. - -# Entry specs: - -After output flags are provided, entry specs are processed in order until an -input argument is reached. Entry specs are modelled after the arguments to -find(1), although "actions" are separated from "matching" expressions with -test clauses instead of being fully recursive like find(1). - -The full specification of an entry spec is provided below -(we will use lowercase names to describe this grammar): - - entry-spec = [--expr match-expr --expr] [name-transform]... content-transform - -1. (match-expr) matches against entries, -2. (name-transform) may transform the entry name string, -3. (content-transform) processes the entry content and writes it - to the output. - -Note that only the "content transform" is required: each entry spec must -conclude with exactly one content transform, but the other arguments may -be omitted and will be set to their default values. - -If no entry specs are provided, by default all entries are decompressed and written to the -output collator without modification. 
This behavior can be requested explicitly -with the command line: - - --expr -true --expr --identity --extract - -*Note:* if a match-expr is provided, it *must* be surrounded with --expr arguments on both sides! -This is a necessary constraint of the current command line parsing. - - -## Match expressions (match-expr): - -Entry matching logic composes boolean arithmetic expressions ("expr") in terms -of basic "predicates" which test some component of the zip entry. Expressions -can be composed as follows, in order of precedence: - -expr = ( ) (grouping to force precedence) - = ! (negation) - = & (short-circuiting conjunction "and") - = (implicit &) - = | (disjunction "or") - = (evaluate on entry) - -### Operators: -The operators to compose match expressions must be quoted in shell commands -(e.g. as \( or '('), so alternatives are provided which do not require -special quoting: - -Grouping operators: - (, -open - ), -close - -Unary operators: - !, -not - -Binary operators: - |, -or - &, -and - -### Predicates (predicate): -These arguments are interpreted as basic predicates, returning true or false in -response to a specific zip entry. - -Trivial: -These results do not depend on the entry data at all: - - -true Always return true. - -false Always return false. - -If a match expression is not provided, it defaults to the behavior of -true. - -Basic: -These results are dependent on the entry data: - - -t, --type [file|dir|symlink] - Match entries of the given type. - Note that directory entries may have specific mode bits set, or they may just be - zero-length entries whose name ends in '/'. - - --compression-method - Match entries compressed with the given compression technique. - - Possible values: - - any: any compression method at all - - known: any compression method this binary is able to decompress - - stored: uncompressed - - deflated: with deflate -{}{}{}{}{} - Using e.g. 
'-not --compression-method known' as a filter enables - special handling of entries compressed with an unsupported method. - - --max-depth - Match entries with at *most* components of their containing directory. - --min-depth - Match entries with at *least* components of their containing directory. - - -m, --match[=][:] - Return true for entries whose name matches . - - See section on "Selector syntax" for and for how - the string argument is interpreted into a string matching - predicate against the entry name. - - TODO: this flag is not yet supported and will produce an error. - - -## Name transforms (name-transform): - -Name transforms modify the entry name before writing the entry to the -output. Unlike match expressions, name transforms do not involve any boolean -logic, and instead are composed linearly, each processing the string produced by -the prior name transform in the series. - -*Note:* name transforms do *not* perform any filtering, so if a string -replacement operation "fails", the entry name is simply returned unchanged. - -Trivial: - --identity Return the entry name string unchanged. - -If no name transforms are provided, it defaults to the behavior of --identity. - -Basic: -These transformers do not perform any complex pattern matching, and instead add -or remove a fixed string from the entry name: - - --strip-components - Remove at most directory components from the entry name. - If is greater than or equal the number of components in the - entry dirname, then the basename of the entry is returned. - --add-prefix - Prefix the entry name with a directory path . - A single separator '/' will be added after before the rest of - the entry name, and any trailing '/' in will be trimmed - before joining. - -Complex: -These transformers perform complex pattern matching and replacement upon the -entry name string: - -TODO: these flags are not yet supported and will produce an error. 
- - --transform[=][:] - Extract the portion of the entry name corresponding to , - search it against corresponding to , and then - replace the result with . - - If == 'rx', then may contain references - to numbered capture groups specified by . Otherwise, - is interpreted as a literal string. - - --remove-prefix[:] - Equivalent to "--transform=path: ''", except the - search is anchored at the beginning of the string. - - -## Content transforms (content-transform): - -Content transforms determine how to interpret the content of the zip -entry itself. - -*Note:* when multiple entry specs are provided on the command line, a single -entry may be matched more than once. In this case, the entry's content will be -extracted more than once over the execution of this command. - -TODO: multiple entry specs with content transforms that extract output more than once require entry -teeing, which is not yet supported, so will produce an error. - - -x, --extract - Decompress the entry's contents (if necessary) before writing it to - the output. - - --raw - Do not decompress entry contents at all before writing its content to - the output. - - TODO: this flag is not yet supported and will produce an error. - - --log-to-stderr - Write the (possibly transformed) entry name to stderr, without reading - its content at all. - -Attempting to extract an entry using an unsupported compression method with --x/--extract will produce an error. In this case, --compression-method can be -used to filter out such entries, and --raw may be used to avoid the failure and -decompress the entry later, or --log-to-stderr can be used to print the names of -all unsupported entries. - - -## Selector syntax: - -The string matching operations of --match and --transform expose an interface to -configure various pattern matching techniques on various components of the entry -name string. 
- -These flags default to interpreting a argument as a glob string to -match against the entire entry name, which can be explicitly requested as -follows: - - --match=path:glob - -The entire range of search options is described below: - -### Component selector (comp-sel): -comp-sel = path [DEFAULT] (match full entry) - = basename (match only the final component of entry) - = dirname (match all except final component of entry) - = ext (match only the file extension, if available) - -### Pattern selector (pat-sel): -pat-sel = glob [DEFAULT] (interpret as a shell glob) - = lit (interpret as literal string) - = rx (interpret as a regular expression) - = :i (use case-insensitive matching for the given pattern) - - -# Input arguments: -Zip file inputs to extract from can be specified in exactly one of two ways: -streaming from stdin, or as at least one path pointing to an existing zip file. -Input arguments are always specified after all output flags and entry -specs on the command line. If no positional argument is provided and --stdin is -not present, an error will be produced. - - --stdin - If this argument is provided, the streaming API will be used to read - entries as they are encountered, instead of filtering them beforehand - as is done with file inputs. This disables some optimizations, but - also avoids waiting for the entire input to buffer to start writing - output, so can be used in a streaming context. - -Positional paths: - ZIP-PATH... - Apply the entry specs to filter and rename entries to extract from all - of the provided zip files. At least one zip path must be provided, and - all provided paths must exist and point to an existing zip file. Pipes - are not supported and will produce an error. 
-"#, - Self::DEFLATE64_HELP_LINE, - Self::BZIP2_HELP_LINE, - Self::ZSTD_HELP_LINE, - Self::LZMA_HELP_LINE, - Self::XZ_HELP_LINE, - ) - } - - fn parse_argv(mut argv: VecDeque) -> Result { - let mut output_dir: Option = None; - let mut mkdir_flag: bool = false; - let mut stdout_flag: bool = false; - let mut args: Vec = Vec::new(); - let mut stdin_flag: bool = false; - let mut positional_zips: Vec = Vec::new(); - - while let Some(arg) = argv.pop_front() { - match arg.as_encoded_bytes() { - b"-h" | b"--help" => { - let help_text = Self::generate_full_help_text(); - return Err(ArgParseError::StdoutMessage(help_text)); - } - - /* Output args */ - b"-d" | b"--output-directory" => { - let new_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { - Self::exit_arg_invalid("no argument provided for -d/--output-directory") - })?; - if let Some(prev_path) = output_dir.take() { - return Err(Self::exit_arg_invalid(&format!( - "--output-directory provided twice: {prev_path:?} and {new_path:?}" - ))); - } else if stdout_flag { - return Err(Self::exit_arg_invalid( - "--stdout provided along with output dir", - )); - } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "-d/--output-directory provided after entry specs or inputs", - )); - } else { - output_dir = Some(new_path); - } - } - b"--mkdir" => { - if mkdir_flag { - return Err(Self::exit_arg_invalid("--mkdir provided twice")); - } else if stdout_flag { - return Err(Self::exit_arg_invalid( - "--stdout provided along with --mkdir", - )); - } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "--mkdir provided after entry specs or inputs", - )); - } else { - mkdir_flag = true; - } - } - b"--stdout" => { - if let Some(output_dir) = output_dir.take() { - return Err(Self::exit_arg_invalid(&format!( - "--stdout provided along with output directory {output_dir:?}" - ))); - } else if stdout_flag { - return 
Err(Self::exit_arg_invalid("--stdout provided twice")); - } else if mkdir_flag { - return Err(Self::exit_arg_invalid( - "--stdout provided along with --mkdir", - )); - } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "--stdout provided after entry specs or inputs", - )); - } else { - stdout_flag = true; - } - } - - /* Transition to entry specs */ - /* Try content transforms first, as they are unambiguous sentinel values. */ - b"-x" | b"--extract" => { - args.push(ExtractArg::ContentTransform(ContentTransform::Extract)); - } - b"--raw" => { - args.push(ExtractArg::ContentTransform(ContentTransform::Raw)); - } - b"--log-to-stderr" => { - args.push(ExtractArg::ContentTransform(ContentTransform::LogToStderr)); - } - - /* Try name transforms next, as they only stack linearly and do not require CFG - * parsing of paired delimiters. */ - /* FIXME: none of these name transforms have any effect if --stdout is - * provided. Should we error or warn about this? */ - b"--identity" => { - args.push(ExtractArg::NameTransform(NameTransform::Trivial( - TrivialTransform::Identity, - ))); - } - b"--strip-components" => { - let num: u8 = argv - .pop_front() - .ok_or_else(|| { - Self::exit_arg_invalid("no argument provided for --strip-component") - })? - .into_string() - .map_err(|num| { - Self::exit_arg_invalid(&format!( - "invalid unicode provided for --strip-component: {num:?}" - )) - })? - .parse::() - .map_err(|e| { - Self::exit_arg_invalid(&format!( - "failed to parse --strip-component arg {e:?} as u8" - )) - })?; - args.push(ExtractArg::NameTransform(NameTransform::Basic( - BasicTransform::StripComponents(num), - ))); - } - b"--add-prefix" => { - let prefix = argv - .pop_front() - .ok_or_else(|| { - Self::exit_arg_invalid("no argument provided for --add-prefix") - })? 
- .into_string() - .map_err(|prefix| { - Self::exit_arg_invalid(&format!( - "invalid unicode provided for --add-prefix: {prefix:?}" - )) - })?; - args.push(ExtractArg::NameTransform(NameTransform::Basic( - BasicTransform::AddPrefix(prefix), - ))); - } - arg_bytes if arg_bytes.starts_with(b"--transform") => { - let (comp_sel, pat_sel) = - parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { - Self::exit_arg_invalid(&format!( - "invalid --transform argument modifiers: {arg:?}" - )) - })?; - let pattern = argv - .pop_front() - .ok_or_else(|| { - Self::exit_arg_invalid( - "no argument provided for --transform", - ) - })? - .into_string() - .map_err(|pattern| { - Self::exit_arg_invalid(&format!( - "invalid unicode provided for --transform : {pattern:?}" - )) - })?; - let replacement_spec = argv - .pop_front() - .ok_or_else(|| { - Self::exit_arg_invalid( - "no argument provided for --transform", - ) - })? - .into_string() - .map_err(|replacement_spec| { - Self::exit_arg_invalid(&format!( - "invalid unicode provided for --transform : {replacement_spec:?}" - )) - })?; - args.push(ExtractArg::NameTransform(NameTransform::Complex( - ComplexTransform::Transform(TransformArg { - comp_sel, - pat_sel, - pattern, - replacement_spec, - }), - ))); - } - arg_bytes if arg_bytes.starts_with(b"--remove-prefix") => { - let pat_sel = parse_only_pat_sel(arg_bytes).ok_or_else(|| { - Self::exit_arg_invalid(&format!( - "invalid --remove-prefix argument modifiers: {arg:?}" - )) - })?; - let pattern = argv - .pop_front() - .ok_or_else(|| { - Self::exit_arg_invalid( - "no argument provided for --remove-prefix", - ) - })? - .into_string() - .map_err(|pattern| { - Self::exit_arg_invalid(&format!( - "invalid unicode provided for --remove-prefix : {pattern:?}" - )) - })?; - args.push(ExtractArg::NameTransform(NameTransform::Complex( - ComplexTransform::RemovePrefix(RemovePrefixArg { pat_sel, pattern }), - ))); - } - - /* Try parsing match specs! 
*/ - b"--expr" => { - let match_expr = MatchExpression::parse_argv(&mut argv)?; - args.push(ExtractArg::Match(match_expr)); - } - - /* Transition to input args */ - b"--stdin" => { - stdin_flag = true; - break; - } - b"--" => break, - arg_bytes => { - if arg_bytes.starts_with(b"-") { - return Err(Self::exit_arg_invalid(&format!( - "unrecognized flag {arg:?}" - ))); - } else { - argv.push_front(arg); - break; - } - } - } - } - - positional_zips.extend(argv.into_iter().map(|arg| arg.into())); - if stdin_flag && !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid(&format!( - "--stdin was provided at the same time as positional args {positional_zips:?}" - ))); - } - let input = if stdin_flag { - InputType::StreamingStdin - } else { - if positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "no zip input files were provided, and --stdin was not provided", - )); - } - InputType::ZipPaths(positional_zips) - }; - - let output = if stdout_flag { - OutputCollation::ConcatenateStdout - } else { - OutputCollation::Filesystem { - output_dir, - mkdir: mkdir_flag, - } - }; - - let entry_specs = EntrySpec::parse_extract_args(args)?; - - Ok(Self { - output, - entry_specs, - input, - }) - } - } - - impl crate::driver::ExecuteCommand for Extract { - fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { - crate::extract::execute_extract(err, self) - } - } -} +pub mod extract; diff --git a/cli/src/args/compress.rs b/cli/src/args/compress.rs new file mode 100644 index 000000000..1c07b1e45 --- /dev/null +++ b/cli/src/args/compress.rs @@ -0,0 +1,448 @@ +use super::{ArgParseError, CommandFormat}; + +use std::{collections::VecDeque, ffi::OsString, num::ParseIntError, path::PathBuf}; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum CompressionMethodArg { + Stored, + Deflate, /* requires having zip/_deflate-any set to compile */ + #[cfg(feature = "deflate64")] + Deflate64, + #[cfg(feature = "bzip2")] + Bzip2, + 
#[cfg(feature = "zstd")] + Zstd, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct CompressionLevel(pub i64); + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct UnixPermissions(pub u32); + +impl UnixPermissions { + pub fn parse(s: &str) -> Result { + Ok(Self(u32::from_str_radix(s, 8)?)) + } +} + +#[derive(Debug)] +pub enum CompressionArg { + CompressionMethod(CompressionMethodArg), + Level(CompressionLevel), + UnixPermissions(UnixPermissions), + LargeFile(bool), + Name(String), + Dir, + Symlink, + Immediate(OsString), + FilePath(PathBuf), + RecursiveDirPath(PathBuf), +} + +#[derive(Debug)] +pub enum OutputType { + Stdout { allow_tty: bool }, + File { path: PathBuf, append: bool }, +} + +#[derive(Debug)] +pub struct Compress { + pub output: OutputType, + pub args: Vec, + pub positional_paths: Vec, +} + +impl Compress { + #[cfg(feature = "deflate64")] + const DEFLATE64_HELP_LINE: &'static str = " - deflate64:\twith deflate64\n"; + #[cfg(not(feature = "deflate64"))] + const DEFLATE64_HELP_LINE: &'static str = ""; + + #[cfg(feature = "bzip2")] + const BZIP2_HELP_LINE: &'static str = " - bzip2:\twith bzip2\n"; + #[cfg(not(feature = "bzip2"))] + const BZIP2_HELP_LINE: &'static str = ""; + + #[cfg(feature = "zstd")] + const ZSTD_HELP_LINE: &'static str = " - zstd:\twith zstd\n"; + #[cfg(not(feature = "zstd"))] + const ZSTD_HELP_LINE: &'static str = ""; +} + +impl CommandFormat for Compress { + const COMMAND_NAME: &'static str = "compress"; + const COMMAND_TABS: &'static str = "\t"; + const COMMAND_DESCRIPTION: &'static str = "Generate a zip archive from files, directories, and symlinks provided as arguments or read from filesystem paths."; + + const USAGE_LINE: &'static str = "[-h|--help] [OUTPUT-FLAGS] [ENTRY]... [--] [PATH]..."; + + fn generate_help() -> String { + format!( + r#" + -h, --help Print help + +Output flags: +Where and how to write the generated zip archive. 
+ + -o, --output-file + Output zip file path to write. + The output file is truncated if it already exists, unless --append is + provided. If not provided, output is written to stdout. + + --append + If an output path is provided with -o, open it as an existing zip + archive and append to it. If the output path does not already exist, + no error is produced, and a new zip file is created at the given path. + + --stdout + Allow writing output to stdout even if stdout is a tty. + +Entries: +After output flags are provided, the rest of the command line is +attributes and entry data. Attributes modify later entries. + +Sticky attributes: +These flags apply to everything that comes after them until reset by another +instance of the same attribute. Sticky attributes continue to apply to +positional arguments received after processing all flags. + + -c, --compression-method + Which compression technique to use. + Defaults to deflate if not specified. + + Possible values: + - stored: uncompressed + - deflate: with deflate (default) +{}{}{} + -l, --compression-level + How much compression to perform, from 0..=24. + The accepted range of values differs for each technique. + + -m, --mode + Unix permissions to apply to the file, in octal (like chmod). + + --large-file [true|false] + Whether to enable large file support. + This may take up more space for records, but allows files over 32 bits + in length to be written, up to 64 bit sizes. + File arguments over 32 bits in length (either provided explicitly or + encountered when traversing a recursive directory) will have this flag + set automatically, without affecting the sticky value for + later options. + Therefore, this option likely never has to be set explicitly by + the user. + +Non-sticky attributes: +These flags only apply to the next entry after them, and may not be repeated. + + -n, --name + The name to apply to the entry. This must be UTF-8 encoded. + + -s, --symlink + Make the next entry into a symlink entry. 
+ A symlink entry may be immediate with -i, or it may copy the target + from an existing symlink with -f. + +Entry data: +Each of these flags creates an entry in the output zip archive. + + -d, --dir + Create a directory entry. + A name must be provided beforehand with -n. + + -i, --immediate + Write an entry containing the data in the argument, which need not be + UTF-8 encoded but will exit early upon encountering any null bytes. + A name must be provided beforehand with -n. + + -f, --file + Write an entry with the contents of this file path. + A name may be provided beforehand with -n, otherwise the name will be + inferred from relativizing the given path to the working directory. + Note that sockets are currently not supported and will produce an + error. Providing a path to a directory will produce an error. + + If -s was specified beforehand, the path will be read as a symlink, + which will produce an error if the path does not point to a symbolic + link. If -s was not specified beforehand and a symlink path was + provided, then the symbolic link will be interpreted as if it was + a file with the contents of the symlink target, but with its name + corresponding to the symlink path (unless overridden with -n). + + -r, --recursive-dir + Write all the recursive contents of this directory path. + A name may be provided beforehand with -n, which will be used as the + prefix for all recursive contents of this directory. Otherwise, the + name will be inferred from relativizing the given path to the + working directory. + + -s is not allowed before this argument. If a path to a symbolic link + is provided, it will be treated as if it pointed to a directory with + the recursive contents of the target directory, but with its name + corresponding to the symlink path (unless overridden with -n). + Providing a symlink path which points to a file will produce an error. + +Positional entries: + [PATH]... 
+ Write the file or recursive directory contents, relativizing the path. + If the given path points to a file, then a single file entry will + be written. + If the given path is a symlink, then a single symlink entry will + be written. + If the given path refers to a directory, then the recursive contents + will be written, reproducing files and symlinks. + Socket paths will produce an error. +"#, + Self::DEFLATE64_HELP_LINE, + Self::BZIP2_HELP_LINE, + Self::ZSTD_HELP_LINE, + ) + } + + fn parse_argv(mut argv: VecDeque) -> Result { + let mut allow_stdout: bool = false; + let mut append_to_output_path: bool = false; + let mut output_path: Option = None; + let mut args: Vec = Vec::new(); + let mut positional_paths: Vec = Vec::new(); + + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + b"-h" | b"--help" => { + let help_text = Self::generate_full_help_text(); + return Err(ArgParseError::StdoutMessage(help_text)); + } + + /* Output flags */ + b"--stdout" => { + if let Some(output_path) = output_path.take() { + return Err(Self::exit_arg_invalid(&format!( + "--stdout provided along with output file {output_path:?}" + ))); + } else if append_to_output_path { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --append", + )); + } else if !args.is_empty() || !positional_paths.is_empty() { + return Err(Self::exit_arg_invalid("--stdout provided after entries")); + } else if allow_stdout { + return Err(Self::exit_arg_invalid("--stdout provided twice")); + } else { + allow_stdout = true; + } + } + b"--append" => { + if append_to_output_path { + return Err(Self::exit_arg_invalid("--append provided twice")); + } else if !args.is_empty() || !positional_paths.is_empty() { + return Err(Self::exit_arg_invalid("--append provided after entries")); + } else if allow_stdout { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --append", + )); + } else { + append_to_output_path = true; + } + } + b"-o" | b"--output-file" => { + 
let new_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| {
+                        Self::exit_arg_invalid("no argument provided for -o/--output-file")
+                    })?;
+                    /* -o must be unique, must not combine with --stdout, and must
+                     * precede all entry arguments. */
+                    if let Some(prev_path) = output_path.take() {
+                        return Err(Self::exit_arg_invalid(&format!(
+                            "--output-file provided twice: {prev_path:?} and {new_path:?}"
+                        )));
+                    } else if allow_stdout {
+                        return Err(Self::exit_arg_invalid(
+                            "--stdout provided along with output file",
+                        ));
+                    } else if !args.is_empty() || !positional_paths.is_empty() {
+                        return Err(Self::exit_arg_invalid(
+                            "-o/--output-file provided after entries",
+                        ));
+                    } else {
+                        output_path = Some(new_path);
+                    }
+                }
+
+                /* Attributes */
+                b"-c" | b"--compression-method" => match argv.pop_front() {
+                    None => {
+                        return Err(Self::exit_arg_invalid(
+                            "no argument provided for -c/--compression-method",
+                        ))
+                    }
+                    Some(name) => match name.as_encoded_bytes() {
+                        b"stored" => args.push(CompressionArg::CompressionMethod(
+                            CompressionMethodArg::Stored,
+                        )),
+                        b"deflate" => args.push(CompressionArg::CompressionMethod(
+                            CompressionMethodArg::Deflate,
+                        )),
+                        #[cfg(feature = "deflate64")]
+                        b"deflate64" => args.push(CompressionArg::CompressionMethod(
+                            CompressionMethodArg::Deflate64,
+                        )),
+                        #[cfg(feature = "bzip2")]
+                        b"bzip2" => args.push(CompressionArg::CompressionMethod(
+                            CompressionMethodArg::Bzip2,
+                        )),
+                        #[cfg(feature = "zstd")]
+                        b"zstd" => args.push(CompressionArg::CompressionMethod(
+                            CompressionMethodArg::Zstd,
+                        )),
+                        _ => {
+                            /* BUG FIX: this message was a plain string literal, so the
+                             * "{name:?}" placeholder was printed verbatim instead of
+                             * being interpolated; wrap in format!() like sibling arms. */
+                            return Err(Self::exit_arg_invalid(&format!(
+                                "unrecognized compression method {name:?}"
+                            )));
+                        }
+                    },
+                },
+                b"-l" | b"--compression-level" => match argv.pop_front() {
+                    None => {
+                        return Err(Self::exit_arg_invalid(
+                            "no argument provided for -l/--compression-level",
+                        ));
+                    }
+                    Some(level) => match level.into_string() {
+                        Err(level) => {
+                            return Err(Self::exit_arg_invalid(&format!(
+                                "invalid unicode provided for compression level: {level:?}"
+                            )));
+                        }
+                        /* NOTE(review): the turbofish was lost in transit; i64 is assumed
+                         * since the level is range-checked against 0..=24 just below and
+                         * the zip crate takes compression_level as i64 -- TODO confirm. */
+                        Ok(level) => match level.parse::<i64>() {
+                            Err(e) => {
+                                return
Err(Self::exit_arg_invalid(&format!( + "failed to parse integer for compression level: {e}" + ))); + } + Ok(level) => { + if (0..=24).contains(&level) { + args.push(CompressionArg::Level(CompressionLevel(level))) + } else { + return Err(Self::exit_arg_invalid(&format!( + "compression level {level} was not between 0 and 24" + ))); + } + } + }, + }, + }, + b"-m" | b"--mode" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid("no argument provided for -m/--mode")); + } + Some(mode) => match mode.into_string() { + Err(mode) => { + return Err(Self::exit_arg_invalid(&format!( + "invalid unicode provided for mode: {mode:?}" + ))); + } + Ok(mode) => match UnixPermissions::parse(&mode) { + Err(e) => { + return Err(Self::exit_arg_invalid(&format!( + "failed to parse integer for mode: {e}" + ))); + } + Ok(mode) => args.push(CompressionArg::UnixPermissions(mode)), + }, + }, + }, + b"--large-file" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for --large-file", + )); + } + Some(large_file) => match large_file.as_encoded_bytes() { + b"true" => args.push(CompressionArg::LargeFile(true)), + b"false" => args.push(CompressionArg::LargeFile(false)), + _ => { + return Err(Self::exit_arg_invalid(&format!( + "unrecognized value for --large-file: {large_file:?}" + ))); + } + }, + }, + + /* Data */ + b"-n" | b"--name" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid("no argument provided for -n/--name")) + } + Some(name) => match name.into_string() { + Err(name) => { + return Err(Self::exit_arg_invalid(&format!( + "invalid unicode provided for name: {name:?}" + ))); + } + Ok(name) => args.push(CompressionArg::Name(name)), + }, + }, + b"-s" | b"--symlink" => args.push(CompressionArg::Symlink), + b"-d" | b"--dir" => args.push(CompressionArg::Dir), + b"-i" | b"--immediate" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for 
-i/--immediate", + )); + } + Some(data) => args.push(CompressionArg::Immediate(data)), + }, + b"-f" | b"--file" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid("no argument provided for -f/--file")); + } + Some(file) => args.push(CompressionArg::FilePath(file.into())), + }, + b"-r" | b"--recursive-dir" => match argv.pop_front() { + None => { + return Err(Self::exit_arg_invalid( + "no argument provided for -r/--recursive-dir", + )); + } + Some(dir) => args.push(CompressionArg::RecursiveDirPath(dir.into())), + }, + + /* Transition to positional args */ + b"--" => break, + arg_bytes => { + if arg_bytes.starts_with(b"-") { + return Err(Self::exit_arg_invalid(&format!( + "unrecognized flag {arg:?}" + ))); + } else { + argv.push_front(arg); + break; + } + } + } + } + + positional_paths.extend(argv.into_iter().map(|arg| arg.into())); + + let output = if let Some(path) = output_path { + OutputType::File { + path, + append: append_to_output_path, + } + } else { + OutputType::Stdout { + allow_tty: allow_stdout, + } + }; + + Ok(Self { + output, + args, + positional_paths, + }) + } +} + +impl crate::driver::ExecuteCommand for Compress { + fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { + crate::compress::execute_compress(err, self) + } +} diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs new file mode 100644 index 000000000..08b69b29e --- /dev/null +++ b/cli/src/args/extract.rs @@ -0,0 +1,1352 @@ +use super::{ArgParseError, CommandFormat}; + +use zip::CompressionMethod; + +use std::{collections::VecDeque, ffi::OsString, mem, path::PathBuf}; + +#[derive(Debug)] +pub enum ContentTransform { + Extract, + /* FIXME: not yet supported -- could be done by exposing ZipFile::take_raw_reader(), but + * should probably just refactor extract.rs to avoid the need for that. + * NB: actually, we can't do that while supporting streaming archives unless we expose + * take_raw_reader()! 
*/ + Raw, + LogToStderr, +} + +#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] +pub enum ComponentSelector { + #[default] + Path, + Basename, + Dirname, + FileExtension, +} + +impl ComponentSelector { + pub fn parse(s: &[u8]) -> Option { + match s { + b"path" => Some(Self::Path), + b"basename" => Some(Self::Basename), + b"dirname" => Some(Self::Dirname), + b"ext" => Some(Self::FileExtension), + _ => None, + } + } +} + +#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] +pub enum PatternSelectorType { + #[default] + Glob, + Literal, + Regexp, +} + +impl PatternSelectorType { + pub fn parse(s: &[u8]) -> Option { + match s { + b"glob" => Some(Self::Glob), + b"lit" => Some(Self::Literal), + b"rx" => Some(Self::Regexp), + _ => None, + } + } +} + +#[derive(Debug)] +pub enum PatternSelectorModifier { + CaseInsensitive, + MultipleMatches, +} + +impl PatternSelectorModifier { + pub fn parse(s: &[u8]) -> Option { + match s { + b"i" => Some(Self::CaseInsensitive), + b"g" => Some(Self::MultipleMatches), + _ => None, + } + } +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct PatternModifiers { + pub case_insensitive: bool, + pub multiple_matches: bool, +} + +#[derive(Debug, Default)] +pub struct PatternSelector { + pub pat_sel: PatternSelectorType, + pub modifiers: PatternModifiers, +} + +impl PatternSelector { + pub fn parse(s: &[u8]) -> Option { + match s.iter().position(|c| *c == b':') { + Some(modifiers_ind) => { + let pat_sel_str = &s[..modifiers_ind]; + let modifiers_str = &s[(modifiers_ind + 1)..]; + + let pat_sel = PatternSelectorType::parse(pat_sel_str)?; + + let mut modifiers = PatternModifiers::default(); + let mod_els = modifiers_str + .split(|c| *c == b':') + .map(PatternSelectorModifier::parse) + .collect::>>()?; + for m in mod_els.into_iter() { + match m { + PatternSelectorModifier::CaseInsensitive => { + modifiers.case_insensitive = true; + } + 
PatternSelectorModifier::MultipleMatches => { + modifiers.multiple_matches = true; + } + } + } + Some(Self { pat_sel, modifiers }) + } + None => { + let pat_sel = PatternSelectorType::parse(s)?; + Some(Self { + pat_sel, + modifiers: Default::default(), + }) + } + } + } +} + +pub fn parse_only_pat_sel(s: &[u8]) -> Option { + match s.iter().position(|c| *c == b':') { + Some(pat_sel_ind) => { + let pat_sel_str = &s[(pat_sel_ind + 1)..]; + + let pat_sel = PatternSelector::parse(pat_sel_str)?; + Some(pat_sel) + } + None => Some(PatternSelector::default()), + } +} + +pub fn parse_comp_and_pat_sel(s: &[u8]) -> Option<(ComponentSelector, PatternSelector)> { + match ( + s.iter().position(|c| *c == b'='), + s.iter().position(|c| *c == b':'), + ) { + (Some(comp_sel_ind), Some(pat_sel_ind)) => { + if comp_sel_ind >= pat_sel_ind { + return None; + } + let comp_sel_str = &s[(comp_sel_ind + 1)..pat_sel_ind]; + let pat_sel_str = &s[(pat_sel_ind + 1)..]; + + let comp_sel = ComponentSelector::parse(comp_sel_str)?; + let pat_sel = PatternSelector::parse(pat_sel_str)?; + Some((comp_sel, pat_sel)) + } + (Some(comp_sel_ind), None) => { + let comp_sel_str = &s[(comp_sel_ind + 1)..]; + + let comp_sel = ComponentSelector::parse(comp_sel_str)?; + let pat_sel = PatternSelector::default(); + Some((comp_sel, pat_sel)) + } + (None, Some(pat_sel_ind)) => { + let pat_sel_str = &s[(pat_sel_ind + 1)..]; + + let pat_sel = PatternSelector::parse(pat_sel_str)?; + let comp_sel = ComponentSelector::default(); + Some((comp_sel, pat_sel)) + } + (None, None) => { + let comp_sel = ComponentSelector::default(); + let pat_sel = PatternSelector::default(); + Some((comp_sel, pat_sel)) + } + } +} + +#[derive(Debug)] +pub enum EntryType { + File, + Dir, + Symlink, +} + +impl EntryType { + pub fn parse(s: &[u8]) -> Option { + match s { + b"file" => Some(Self::File), + b"dir" => Some(Self::Dir), + b"symlink" => Some(Self::Symlink), + _ => None, + } + } +} + +#[derive(Debug, PartialEq, Eq)] +pub enum 
NonSpecificCompressionMethodArg { + Any, + Known, +} + +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum SpecificCompressionMethodArg { + Stored, + Deflated, + #[cfg(feature = "deflate64")] + Deflate64, + #[cfg(feature = "bzip2")] + Bzip2, + #[cfg(feature = "zstd")] + Zstd, + #[cfg(feature = "lzma")] + Lzma, + #[cfg(feature = "xz")] + Xz, +} + +impl SpecificCompressionMethodArg { + pub const KNOWN_COMPRESSION_METHODS: &[CompressionMethod] = &[ + CompressionMethod::Stored, + CompressionMethod::Deflated, + #[cfg(feature = "deflate64")] + CompressionMethod::Deflate64, + #[cfg(feature = "bzip2")] + CompressionMethod::Bzip2, + #[cfg(feature = "zstd")] + CompressionMethod::Zstd, + #[cfg(feature = "lzma")] + CompressionMethod::Lzma, + #[cfg(feature = "xz")] + CompressionMethod::Xz, + ]; + + pub fn translate_to_zip(self) -> CompressionMethod { + match self { + Self::Stored => CompressionMethod::Stored, + Self::Deflated => CompressionMethod::Deflated, + #[cfg(feature = "deflate64")] + Self::Deflate64 => CompressionMethod::Deflate64, + #[cfg(feature = "bzip2")] + Self::Bzip2 => CompressionMethod::Bzip2, + #[cfg(feature = "zstd")] + Self::Zstd => CompressionMethod::Zstd, + #[cfg(feature = "lzma")] + Self::Lzma => CompressionMethod::Lzma, + #[cfg(feature = "xz")] + Self::Xz => CompressionMethod::Xz, + } + } +} + +#[derive(Debug, PartialEq, Eq)] +pub enum CompressionMethodArg { + NonSpecific(NonSpecificCompressionMethodArg), + Specific(SpecificCompressionMethodArg), +} + +impl CompressionMethodArg { + pub fn parse(s: &[u8]) -> Option { + match s { + b"any" => Some(Self::NonSpecific(NonSpecificCompressionMethodArg::Any)), + b"known" => Some(Self::NonSpecific(NonSpecificCompressionMethodArg::Known)), + b"stored" => Some(Self::Specific(SpecificCompressionMethodArg::Stored)), + b"deflated" => Some(Self::Specific(SpecificCompressionMethodArg::Deflated)), + #[cfg(feature = "deflate64")] + b"deflate64" => Some(Self::Specific(SpecificCompressionMethodArg::Deflate64)), + 
#[cfg(feature = "bzip2")] + b"bzip2" => Some(Self::Specific(SpecificCompressionMethodArg::Bzip2)), + #[cfg(feature = "zstd")] + b"zstd" => Some(Self::Specific(SpecificCompressionMethodArg::Zstd)), + #[cfg(feature = "lzma")] + b"lzma" => Some(Self::Specific(SpecificCompressionMethodArg::Lzma)), + #[cfg(feature = "xz")] + b"xz" => Some(Self::Specific(SpecificCompressionMethodArg::Xz)), + _ => None, + } + } +} + +#[derive(Debug)] +pub enum DepthLimitArg { + Max(u8), + Min(u8), +} + +#[derive(Debug)] +pub struct MatchArg { + pub comp_sel: ComponentSelector, + pub pat_sel: PatternSelector, + pub pattern: String, +} + +#[derive(Debug)] +pub enum TrivialPredicate { + True, + False, +} + +#[derive(Debug)] +pub enum Predicate { + Trivial(TrivialPredicate), + EntryType(EntryType), + CompressionMethod(CompressionMethodArg), + DepthLimit(DepthLimitArg), + Match(MatchArg), +} + +#[derive(Debug)] +enum ExprOp { + Negation, + And, + Or, +} + +#[derive(Debug)] +enum ExprArg { + PrimitivePredicate(Predicate), + Op(ExprOp), + Subgroup(MatchExpression), +} + +#[derive(Debug, Default)] +struct SingleExprLevel { + expr_args: Vec, +} + +impl SingleExprLevel { + pub fn push_arg(&mut self, arg: ExprArg) { + self.expr_args.push(arg); + } + + fn get_negation(expr_args: &mut VecDeque) -> Result { + let negated_expr: MatchExpression = match expr_args.pop_front().ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "negation was only expression in list inside match expr (rest: {expr_args:?})" + )) + })? { + ExprArg::Subgroup(match_expr) => { + /* We have a valid match expression, so just negate it without + * wrapping. */ + MatchExpression::Negated(Box::new(match_expr)) + } + ExprArg::PrimitivePredicate(predicate) => { + /* We got a primitive predicate, so just negate it! */ + MatchExpression::Negated(Box::new(MatchExpression::PrimitivePredicate(predicate))) + } + ExprArg::Op(op) => { + /* Negation before any other operator is invalid. 
*/ + return Err(Extract::exit_arg_invalid(&format!( + "negation before operator {op:?} inside match expr is invalid (rest: {expr_args:?})" + ))); + } + }; + Ok(negated_expr) + } + + fn get_non_operator( + expr_args: &mut VecDeque, + ) -> Result { + let next_expr: MatchExpression = match expr_args.pop_front().ok_or_else(|| { + /* We can't fold an empty list. */ + Extract::exit_arg_invalid(&format!( + "empty expression list inside match expr (rest: {expr_args:?})" + )) + })? { + /* This is already an evaluated match expression, so just start with that. */ + ExprArg::Subgroup(match_expr) => match_expr, + ExprArg::PrimitivePredicate(predicate) => { + /* Success! We start with a simple predicate. */ + MatchExpression::PrimitivePredicate(predicate) + } + ExprArg::Op(op) => match op { + /* We started with negation, which means we need to get the next arg to resolve + * it. */ + ExprOp::Negation => Self::get_negation(expr_args)?, + /* Starting with a binary operator is invalid. */ + op @ (ExprOp::And | ExprOp::Or) => { + return Err(Extract::exit_arg_invalid(&format!( + "expression list cannot begin with binary operator {op:?} (rest: {expr_args:?})" + ))); + } + }, + }; + Ok(next_expr) + } + + pub fn fold(self) -> Result { + let Self { expr_args } = self; + let mut expr_args: VecDeque<_> = expr_args.into(); + + /* Get a valid match expression to start our fold with. */ + let mut cur_expr: MatchExpression = Self::get_non_operator(&mut expr_args)?; + + /* Now fold the expression rightwards! */ + while let Some(next_arg) = expr_args.pop_front() { + match next_arg { + /* Implicit AND, wrapping the primitive result into a match. */ + ExprArg::PrimitivePredicate(predicate) => { + let next_expr = MatchExpression::PrimitivePredicate(predicate); + cur_expr = MatchExpression::And { + explicit: false, + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + /* Implicit AND, without needing to wrap the result. 
*/ + ExprArg::Subgroup(match_expr) => { + cur_expr = MatchExpression::And { + explicit: false, + left: Box::new(cur_expr), + right: Box::new(match_expr), + }; + } + /* Evaluate the operator according to association. */ + ExprArg::Op(op) => match op { + /* Negation applies to the next element, so retrieve it! */ + ExprOp::Negation => { + let next_expr = Self::get_negation(&mut expr_args)?; + cur_expr = MatchExpression::And { + explicit: false, + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + /* Explicit AND requires the next element. */ + ExprOp::And => { + let next_expr = Self::get_non_operator(&mut expr_args)?; + cur_expr = MatchExpression::And { + explicit: true, + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + /* OR requires the next element. */ + ExprOp::Or => { + let next_expr = Self::get_non_operator(&mut expr_args)?; + cur_expr = MatchExpression::Or { + left: Box::new(cur_expr), + right: Box::new(next_expr), + }; + } + }, + } + } + + assert!(expr_args.is_empty()); + Ok(cur_expr) + } +} + +#[derive(Debug)] +pub enum MatchExpression { + PrimitivePredicate(Predicate), + Negated(Box), + And { + explicit: bool, + left: Box, + right: Box, + }, + Or { + left: Box, + right: Box, + }, + Grouped(Box), +} + +impl MatchExpression { + pub fn parse_argv(argv: &mut VecDeque) -> Result { + let mut expr_stack: Vec = Vec::new(); + let mut top_exprs = SingleExprLevel::default(); + + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + /* Parse primitive predicates. 
*/ + b"-true" => { + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Trivial( + TrivialPredicate::True, + ))); + } + b"-false" => { + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Trivial( + TrivialPredicate::False, + ))); + } + b"-t" | b"--type" => { + let type_arg = argv.pop_front().ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for -t/--type") + })?; + let entry_type = + EntryType::parse(type_arg.as_encoded_bytes()).ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "invalid --type argument: {type_arg:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::EntryType( + entry_type, + ))); + } + b"--compression-method" => { + let method_arg = argv.pop_front().ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --compression-method") + })?; + let method = CompressionMethodArg::parse(method_arg.as_encoded_bytes()) + .ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "invalid --compression-method argument: {method_arg:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::CompressionMethod( + method, + ))); + } + b"--max-depth" => { + let max_depth: u8 = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --max-depth") + })? + .into_string() + .map_err(|depth_arg| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --max-depth: {depth_arg:?}" + )) + })? + .parse::() + .map_err(|e| { + Extract::exit_arg_invalid(&format!( + "failed to parse --max-depth arg {e:?} as u8" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( + DepthLimitArg::Max(max_depth), + ))); + } + b"--min-depth" => { + let min_depth: u8 = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --min-depth") + })? + .into_string() + .map_err(|depth_arg| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --min-depth: {depth_arg:?}" + )) + })? 
+ .parse::() + .map_err(|e| { + Extract::exit_arg_invalid(&format!( + "failed to parse --min-depth arg {e:?} as u8" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( + DepthLimitArg::Min(min_depth), + ))); + } + b"-m" => { + let pattern: String = argv + .pop_front() + .ok_or_else(|| Extract::exit_arg_invalid("no argument provided for -m"))? + .into_string() + .map_err(|pattern| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for -m: {pattern:?}" + )) + })?; + let comp_sel = ComponentSelector::default(); + let pat_sel = PatternSelector::default(); + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Match(MatchArg { + comp_sel, + pat_sel, + pattern, + }))); + } + arg_bytes if arg_bytes.starts_with(b"--match") => { + let (comp_sel, pat_sel) = + parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { + Extract::exit_arg_invalid(&format!( + "invalid --match argument modifiers: {arg:?}" + )) + })?; + if pat_sel.modifiers.multiple_matches { + return Err(Extract::exit_arg_invalid(&format!( + "multimatch modifier :g is unused in match expressions: {arg:?}" + ))); + } + let pattern: String = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --match") + })? + .into_string() + .map_err(|pattern| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --match: {pattern:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Match(MatchArg { + comp_sel, + pat_sel, + pattern, + }))); + } + + /* Parse operators. */ + b"!" | b"-not" => { + top_exprs.push_arg(ExprArg::Op(ExprOp::Negation)); + } + b"&" | b"-and" => { + top_exprs.push_arg(ExprArg::Op(ExprOp::And)); + } + b"|" | b"-or" => { + top_exprs.push_arg(ExprArg::Op(ExprOp::Or)); + } + + /* Process groups with stack logic! */ + b"(" | b"-open" => { + expr_stack.push(mem::take(&mut top_exprs)); + } + b")" | b"-close" => { + /* Get the unevaluated exprs from the previous nesting level. 
*/ + let prev_level = expr_stack.pop().ok_or_else(|| { + Extract::exit_arg_invalid("too many close parens inside match expr") + })?; + /* Move the previous nesting level into current, and evaluate the current + * nesting level. */ + let group_expr = mem::replace(&mut top_exprs, prev_level).fold()?; + /* Wrap the completed group in a Grouped. */ + let group_expr = MatchExpression::Grouped(Box::new(group_expr)); + /* Push the completed and evaluated group into the current nesting level. */ + top_exprs.push_arg(ExprArg::Subgroup(group_expr)); + } + + /* Conclude the match expr processing. */ + b"--expr" => { + break; + } + _ => { + return Err(Extract::exit_arg_invalid(&format!( + "unrecognized match expression component {arg:?}: all match expressions must start and end with a --expr flag" + ))); + } + } + } + + if !expr_stack.is_empty() { + return Err(Extract::exit_arg_invalid( + "not enough close parens inside match expr", + )); + } + top_exprs.fold() + } +} + +#[derive(Debug)] +pub enum TrivialTransform { + Identity, +} + +#[derive(Debug)] +pub enum BasicTransform { + StripComponents(u8), + AddPrefix(String), +} + +#[derive(Debug)] +pub struct TransformArg { + pub comp_sel: ComponentSelector, + pub pat_sel: PatternSelector, + pub pattern: String, + pub replacement_spec: String, +} + +#[derive(Debug)] +pub struct RemovePrefixArg { + pub pat_sel: PatternSelector, + pub pattern: String, +} + +#[derive(Debug)] +pub enum ComplexTransform { + Transform(TransformArg), + RemovePrefix(RemovePrefixArg), +} + +#[derive(Debug)] +pub enum NameTransform { + Trivial(TrivialTransform), + Basic(BasicTransform), + Complex(ComplexTransform), +} + +#[derive(Debug)] +enum ExtractArg { + Match(MatchExpression), + NameTransform(NameTransform), + ContentTransform(ContentTransform), +} + +#[derive(Debug)] +pub struct EntrySpec { + pub match_expr: Option, + pub name_transforms: Vec, + pub content_transform: ContentTransform, +} + +impl EntrySpec { + fn parse_extract_args( + args: impl 
IntoIterator, + ) -> Result, ArgParseError> { + let mut match_expr: Option = None; + let mut name_transforms: Vec = Vec::new(); + + let mut ret: Vec = Vec::new(); + + for arg in args.into_iter() { + match arg { + ExtractArg::Match(new_expr) => { + if let Some(prev_expr) = match_expr.take() { + return Err(Extract::exit_arg_invalid(&format!( + "more than one match expr was provided for the same entry: {prev_expr:?} and {new_expr:?}" + ))); + } + match_expr = Some(new_expr); + } + ExtractArg::NameTransform(n_trans) => { + name_transforms.push(n_trans); + } + ExtractArg::ContentTransform(c_trans) => { + let spec = Self { + match_expr: match_expr.take(), + name_transforms: mem::take(&mut name_transforms), + content_transform: c_trans, + }; + ret.push(spec); + } + } + } + if let Some(match_expr) = match_expr { + return Err(Extract::exit_arg_invalid(&format!( + "match expr {match_expr:?} was provided with no corresponding content \ +transform. add -x/--extract to construct a complete entry spec" + ))); + } + if !name_transforms.is_empty() { + return Err(Extract::exit_arg_invalid(&format!( + "name transforms {name_transforms:?} were provided with no corresponding \ +content transform. 
add -x/--extract to construct a complete entry spec" + ))); + } + + Ok(ret) + } +} + +#[derive(Debug)] +pub enum OutputCollation { + ConcatenateStdout, + Filesystem { + output_dir: Option, + mkdir: bool, + }, +} + +#[derive(Debug)] +pub enum InputType { + StreamingStdin, + ZipPaths(Vec), +} + +#[derive(Debug)] +pub struct Extract { + pub output: OutputCollation, + pub entry_specs: Vec, + pub input: InputType, +} + +impl Extract { + #[cfg(feature = "deflate64")] + const DEFLATE64_HELP_LINE: &'static str = " - deflate64:\twith deflate64\n"; + #[cfg(not(feature = "deflate64"))] + const DEFLATE64_HELP_LINE: &'static str = ""; + + #[cfg(feature = "bzip2")] + const BZIP2_HELP_LINE: &'static str = " - bzip2:\twith bzip2\n"; + #[cfg(not(feature = "bzip2"))] + const BZIP2_HELP_LINE: &'static str = ""; + + #[cfg(feature = "zstd")] + const ZSTD_HELP_LINE: &'static str = " - zstd:\twith zstd\n"; + #[cfg(not(feature = "zstd"))] + const ZSTD_HELP_LINE: &'static str = ""; + + #[cfg(feature = "lzma")] + const LZMA_HELP_LINE: &'static str = " - lzma:\twith lzma\n"; + #[cfg(not(feature = "lzma"))] + const LZMA_HELP_LINE: &'static str = ""; + + #[cfg(feature = "xz")] + const XZ_HELP_LINE: &'static str = " - xz:\t\twith xz\n"; + #[cfg(not(feature = "xz"))] + const XZ_HELP_LINE: &'static str = ""; +} + +impl CommandFormat for Extract { + const COMMAND_NAME: &'static str = "extract"; + const COMMAND_TABS: &'static str = "\t"; + const COMMAND_DESCRIPTION: &'static str = + "Extract individual entries or an entire archive into a stream or the filesystem."; + + const USAGE_LINE: &'static str = + "[-h|--help] OUTPUT-SPEC... [ENTRY-SPEC]... [--stdin|[--] ZIP-PATH...]"; + + fn generate_help() -> String { + format!( + r#" + -h, --help Print help + +# Output specs: +Where and how to collate the extracted entries. + + -d, --output-directory + Output directory path to write extracted entries into. 
+ Paths for extracted entries will be constructed by interpreting entry + names as relative paths to the provided directory. If the provided + path is not a directory, an error is produced. If the provided path + does not exist, an error is produced unless --mkdir is specified. + If not provided, entries will be extracted into the current directory + (as if '-d .' had been provided). + + --mkdir + If an output directory is provided with -d and the directory path does + not exist, create it along with any missing parent directories. + If the path provided to -d is not a directory, an error will still be + produced if this flag is also provided. + + --stdout + Concatenate all extracted entries and write them in order to stdout + instead of writing anything to the filesystem. + This disables some optimizations that are possible when extracting to + the filesystem. + This will write output to stdout even if stdout is a tty. + +# Entry specs: + +After output flags are provided, entry specs are processed in order until an +input argument is reached. Entry specs are modelled after the arguments to +find(1), although "actions" are separated from "matching" expressions with +test clauses instead of being fully recursive like find(1). + +The full specification of an entry spec is provided below +(we will use lowercase names to describe this grammar): + + entry-spec = [--expr match-expr --expr] [name-transform]... content-transform + +1. (match-expr) matches against entries, +2. (name-transform) may transform the entry name string, +3. (content-transform) processes the entry content and writes it + to the output. + +Note that only the "content transform" is required: each entry spec must +conclude with exactly one content transform, but the other arguments may +be omitted and will be set to their default values. + +If no entry specs are provided, by default all entries are decompressed and written to the +output collator without modification. 
This behavior can be requested explicitly +with the command line: + + --expr -true --expr --identity --extract + +*Note:* if a match-expr is provided, it *must* be surrounded with --expr arguments on both sides! +This is a necessary constraint of the current command line parsing. + + +## Match expressions (match-expr): + +Entry matching logic composes boolean arithmetic expressions ("expr") in terms +of basic "predicates" which test some component of the zip entry. Expressions +can be composed as follows, in order of precedence: + +expr = ( ) (grouping to force precedence) + = ! (negation) + = & (short-circuiting conjunction "and") + = (implicit &) + = | (disjunction "or") + = (evaluate on entry) + +### Operators: +The operators to compose match expressions must be quoted in shell commands +(e.g. as \( or '('), so alternatives are provided which do not require +special quoting: + +Grouping operators: + (, -open + ), -close + +Unary operators: + !, -not + +Binary operators: + |, -or + &, -and + +### Predicates (predicate): +These arguments are interpreted as basic predicates, returning true or false in +response to a specific zip entry. + +Trivial: +These results do not depend on the entry data at all: + + -true Always return true. + -false Always return false. + +If a match expression is not provided, it defaults to the behavior of -true. + +Basic: +These results are dependent on the entry data: + + -t, --type [file|dir|symlink] + Match entries of the given type. + Note that directory entries may have specific mode bits set, or they may just be + zero-length entries whose name ends in '/'. + + --compression-method + Match entries compressed with the given compression technique. + + Possible values: + - any: any compression method at all + - known: any compression method this binary is able to decompress + - stored: uncompressed + - deflated: with deflate +{}{}{}{}{} + Using e.g. 
'-not --compression-method known' as a filter enables
+          special handling of entries compressed with an unsupported method.
+
+  --max-depth <n>
+          Match entries with at *most* <n> components of their containing directory.
+  --min-depth <n>
+          Match entries with at *least* <n> components of their containing directory.
+
+  -m, --match[=<comp-sel>][:<pat-sel>] <pattern>
+          Return true for entries whose name matches <pattern>.
+
+          See section on "Selector syntax" for <comp-sel> and <pat-sel> for how
+          the <pattern> string argument is interpreted into a string matching
+          predicate against the entry name.
+
+          TODO: this flag is not yet supported and will produce an error.
+
+
+## Name transforms (name-transform):
+
+Name transforms modify the entry name before writing the entry to the
+output. Unlike match expressions, name transforms do not involve any boolean
+logic, and instead are composed linearly, each processing the string produced by
+the prior name transform in the series.
+
+*Note:* name transforms do *not* perform any filtering, so if a string
+replacement operation "fails", the entry name is simply returned unchanged.
+
+Trivial:
+  --identity    Return the entry name string unchanged.
+
+If no name transforms are provided, it defaults to the behavior of --identity.
+
+Basic:
+These transformers do not perform any complex pattern matching, and instead add
+or remove a fixed string from the entry name:
+
+  --strip-components <n>
+          Remove at most <n> directory components from the entry name.
+          If <n> is greater than or equal to the number of components in the
+          entry dirname, then the basename of the entry is returned.
+  --add-prefix <prefix>
+          Prefix the entry name with a directory path <prefix>.
+          A single separator '/' will be added after <prefix> before the rest of
+          the entry name, and any trailing '/' in <prefix> will be trimmed
+          before joining.
+
+Complex:
+These transformers perform complex pattern matching and replacement upon the
+entry name string:
+
+TODO: these flags are not yet supported and will produce an error.
+ + --transform[=][:] + Extract the portion of the entry name corresponding to , + search it against corresponding to , and then + replace the result with . + + If == 'rx', then may contain references + to numbered capture groups specified by . Otherwise, + is interpreted as a literal string. + + --remove-prefix[:] + Equivalent to "--transform=path: ''", except the + search is anchored at the beginning of the string. + + +## Content transforms (content-transform): + +Content transforms determine how to interpret the content of the zip +entry itself. + +*Note:* when multiple entry specs are provided on the command line, a single +entry may be matched more than once. In this case, the entry's content will be +extracted more than once over the execution of this command. + +TODO: multiple entry specs with content transforms that extract output more than once require entry +teeing, which is not yet supported, so will produce an error. + + -x, --extract + Decompress the entry's contents (if necessary) before writing it to + the output. + + --raw + Do not decompress entry contents at all before writing its content to + the output. + + TODO: this flag is not yet supported and will produce an error. + + --log-to-stderr + Write the (possibly transformed) entry name to stderr, without reading + its content at all. + +Attempting to extract an entry using an unsupported compression method with +-x/--extract will produce an error. In this case, --compression-method can be +used to filter out such entries, and --raw may be used to avoid the failure and +decompress the entry later, or --log-to-stderr can be used to print the names of +all unsupported entries. + + +## Selector syntax: + +The string matching operations of --match and --transform expose an interface to +configure various pattern matching techniques on various components of the entry +name string. 
+ +These flags default to interpreting a argument as a glob string to +match against the entire entry name, which can be explicitly requested as +follows: + + --match=path:glob + +The entire range of search options is described below: + +### Component selector (comp-sel): +comp-sel = path [DEFAULT] (match full entry) + = basename (match only the final component of entry) + = dirname (match all except final component of entry) + = ext (match only the file extension, if available) + +### Pattern selector (pat-sel): +pat-sel = glob [DEFAULT] (interpret as a shell glob) + = lit (interpret as literal string) + = rx (interpret as a regular expression) + = (apply search modifiers from ) + +#### Pattern modifiers (pat-mod): +pat-mod = :i (use case-insensitive matching for the given pattern) + = :g (use multi-match behavior for string replacements) + +Pattern modifiers from (pat-mod) can be sequenced, e.g. ':i:g'. + +*Note:* not all pattern modifiers apply everywhere. In particular, ':g' only +applies to string replacement, and using it for a match expression like +'--match:rx:g' will produce an error. + +# Input arguments: +Zip file inputs to extract from can be specified in exactly one of two ways: +streaming from stdin, or as at least one path pointing to an existing zip file. +Input arguments are always specified after all output flags and entry +specs on the command line. If no positional argument is provided and --stdin is +not present, an error will be produced. + + --stdin + If this argument is provided, the streaming API will be used to read + entries as they are encountered, instead of filtering them beforehand + as is done with file inputs. This disables some optimizations, but + also avoids waiting for the entire input to buffer to start writing + output, so can be used in a streaming context. + +Positional paths: + ZIP-PATH... + Apply the entry specs to filter and rename entries to extract from all + of the provided zip files. 
At least one zip path must be provided, and + all provided paths must exist and point to an existing zip file. Pipes + are not supported and will produce an error. +"#, + Self::DEFLATE64_HELP_LINE, + Self::BZIP2_HELP_LINE, + Self::ZSTD_HELP_LINE, + Self::LZMA_HELP_LINE, + Self::XZ_HELP_LINE, + ) + } + + fn parse_argv(mut argv: VecDeque) -> Result { + let mut output_dir: Option = None; + let mut mkdir_flag: bool = false; + let mut stdout_flag: bool = false; + let mut args: Vec = Vec::new(); + let mut stdin_flag: bool = false; + let mut positional_zips: Vec = Vec::new(); + + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + b"-h" | b"--help" => { + let help_text = Self::generate_full_help_text(); + return Err(ArgParseError::StdoutMessage(help_text)); + } + + /* Output args */ + b"-d" | b"--output-directory" => { + let new_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for -d/--output-directory") + })?; + if let Some(prev_path) = output_dir.take() { + return Err(Self::exit_arg_invalid(&format!( + "--output-directory provided twice: {prev_path:?} and {new_path:?}" + ))); + } else if stdout_flag { + return Err(Self::exit_arg_invalid( + "--stdout provided along with output dir", + )); + } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "-d/--output-directory provided after entry specs or inputs", + )); + } else { + output_dir = Some(new_path); + } + } + b"--mkdir" => { + if mkdir_flag { + return Err(Self::exit_arg_invalid("--mkdir provided twice")); + } else if stdout_flag { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --mkdir", + )); + } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "--mkdir provided after entry specs or inputs", + )); + } else { + mkdir_flag = true; + } + } + b"--stdout" => { + if let Some(output_dir) = 
output_dir.take() { + return Err(Self::exit_arg_invalid(&format!( + "--stdout provided along with output directory {output_dir:?}" + ))); + } else if stdout_flag { + return Err(Self::exit_arg_invalid("--stdout provided twice")); + } else if mkdir_flag { + return Err(Self::exit_arg_invalid( + "--stdout provided along with --mkdir", + )); + } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "--stdout provided after entry specs or inputs", + )); + } else { + stdout_flag = true; + } + } + + /* Transition to entry specs */ + /* Try content transforms first, as they are unambiguous sentinel values. */ + b"-x" | b"--extract" => { + args.push(ExtractArg::ContentTransform(ContentTransform::Extract)); + } + b"--raw" => { + args.push(ExtractArg::ContentTransform(ContentTransform::Raw)); + } + b"--log-to-stderr" => { + args.push(ExtractArg::ContentTransform(ContentTransform::LogToStderr)); + } + + /* Try name transforms next, as they only stack linearly and do not require CFG + * parsing of paired delimiters. */ + /* FIXME: none of these name transforms have any effect if --stdout is + * provided. Should we error or warn about this? */ + b"--identity" => { + args.push(ExtractArg::NameTransform(NameTransform::Trivial( + TrivialTransform::Identity, + ))); + } + b"--strip-components" => { + let num: u8 = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for --strip-component") + })? + .into_string() + .map_err(|num| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --strip-component: {num:?}" + )) + })? 
+ .parse::() + .map_err(|e| { + Self::exit_arg_invalid(&format!( + "failed to parse --strip-component arg {e:?} as u8" + )) + })?; + args.push(ExtractArg::NameTransform(NameTransform::Basic( + BasicTransform::StripComponents(num), + ))); + } + b"--add-prefix" => { + let prefix = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for --add-prefix") + })? + .into_string() + .map_err(|prefix| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --add-prefix: {prefix:?}" + )) + })?; + args.push(ExtractArg::NameTransform(NameTransform::Basic( + BasicTransform::AddPrefix(prefix), + ))); + } + arg_bytes if arg_bytes.starts_with(b"--transform") => { + let (comp_sel, pat_sel) = + parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { + Self::exit_arg_invalid(&format!( + "invalid --transform argument modifiers: {arg:?}" + )) + })?; + let pattern = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for --transform") + })? + .into_string() + .map_err(|pattern| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --transform : {pattern:?}" + )) + })?; + let replacement_spec = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid( + "no argument provided for --transform", + ) + })? + .into_string() + .map_err(|replacement_spec| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --transform : {replacement_spec:?}" + )) + })?; + args.push(ExtractArg::NameTransform(NameTransform::Complex( + ComplexTransform::Transform(TransformArg { + comp_sel, + pat_sel, + pattern, + replacement_spec, + }), + ))); + } + arg_bytes if arg_bytes.starts_with(b"--remove-prefix") => { + let pat_sel = parse_only_pat_sel(arg_bytes).ok_or_else(|| { + Self::exit_arg_invalid(&format!( + "invalid --remove-prefix argument modifiers: {arg:?}" + )) + })?; + let pattern = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid( + "no argument provided for --remove-prefix", + ) + })? 
+ .into_string() + .map_err(|pattern| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided for --remove-prefix : {pattern:?}" + )) + })?; + args.push(ExtractArg::NameTransform(NameTransform::Complex( + ComplexTransform::RemovePrefix(RemovePrefixArg { pat_sel, pattern }), + ))); + } + + /* Try parsing match specs! */ + b"--expr" => { + let match_expr = MatchExpression::parse_argv(&mut argv)?; + args.push(ExtractArg::Match(match_expr)); + } + + /* Transition to input args */ + b"--stdin" => { + stdin_flag = true; + break; + } + b"--" => break, + arg_bytes => { + if arg_bytes.starts_with(b"-") { + return Err(Self::exit_arg_invalid(&format!( + "unrecognized flag {arg:?}" + ))); + } else { + argv.push_front(arg); + break; + } + } + } + } + + positional_zips.extend(argv.into_iter().map(|arg| arg.into())); + if stdin_flag && !positional_zips.is_empty() { + return Err(Self::exit_arg_invalid(&format!( + "--stdin was provided at the same time as positional args {positional_zips:?}" + ))); + } + let input = if stdin_flag { + InputType::StreamingStdin + } else { + if positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "no zip input files were provided, and --stdin was not provided", + )); + } + InputType::ZipPaths(positional_zips) + }; + + let output = if stdout_flag { + OutputCollation::ConcatenateStdout + } else { + OutputCollation::Filesystem { + output_dir, + mkdir: mkdir_flag, + } + }; + + let entry_specs = EntrySpec::parse_extract_args(args)?; + + Ok(Self { + output, + entry_specs, + input, + }) + } +} + +impl crate::driver::ExecuteCommand for Extract { + fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { + crate::extract::execute_extract(err, self) + } +} From adcc1f268cbf608fb21a343aded185617871296b Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 18:42:21 -0400 Subject: [PATCH 15/68] init entry data --- cli/src/args/extract.rs | 2 +- 
cli/src/extract.rs | 3 ++- cli/src/extract/matcher.rs | 49 +++++++++++++++++++++--------------- cli/src/extract/receiver.rs | 38 +++++++++++++++++++++++++++- cli/src/extract/transform.rs | 17 +++++++++++-- src/compression.rs | 2 +- 6 files changed, 85 insertions(+), 26 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index 08b69b29e..d1fec51f0 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -4,7 +4,7 @@ use zip::CompressionMethod; use std::{collections::VecDeque, ffi::OsString, mem, path::PathBuf}; -#[derive(Debug)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum ContentTransform { Extract, /* FIXME: not yet supported -- could be done by exposing ZipFile::take_raw_reader(), but diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 446886b9f..b3119020b 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -10,6 +10,7 @@ mod entries; mod matcher; mod receiver; mod transform; +use receiver::EntryData; pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { let Extract { @@ -26,7 +27,7 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE while let Some(mut entry) = entry_iterator.next_entry()? 
{ for transformer in entry_spec_transformers.iter() { - if !transformer.matches(&entry) { + if !transformer.matches(&EntryData::from_entry(&entry)) { continue; } let name = transformer.transform_name(entry.name()); diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index 71ce892da..8c086cd3f 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -5,6 +5,7 @@ use regex; use zip::{read::ZipFile, CompressionMethod}; +use super::receiver::{EntryData, EntryKind}; use crate::{args::extract::*, CommandError}; #[inline(always)] @@ -39,7 +40,9 @@ impl NameMatcher for LiteralMatcher { where Self: Sized, { - let PatternModifiers { case_insensitive } = opts; + let PatternModifiers { + case_insensitive, .. + } = opts; Ok(Self { lit: pattern.to_string(), case_insensitive, @@ -65,7 +68,9 @@ impl NameMatcher for GlobMatcher { where Self: Sized, { - let PatternModifiers { case_insensitive } = opts; + let PatternModifiers { + case_insensitive, .. + } = opts; let glob_opts = glob::MatchOptions { case_sensitive: !case_insensitive, ..Default::default() @@ -92,7 +97,9 @@ impl NameMatcher for RegexMatcher { where Self: Sized, { - let PatternModifiers { case_insensitive } = opts; + let PatternModifiers { + case_insensitive, .. 
+ } = opts; let pat = regex::RegexBuilder::new(pattern) .case_insensitive(case_insensitive) .build() @@ -116,7 +123,7 @@ pub trait EntryMatcher { fn from_arg(arg: Self::Arg) -> Result where Self: Sized; - fn matches(&self, entry: &ZipFile) -> bool; + fn matches(&self, entry: &EntryData) -> bool; } #[derive(Copy, Clone)] @@ -138,7 +145,7 @@ impl EntryMatcher for TrivialMatcher { }) } - fn matches(&self, _entry: &ZipFile) -> bool { + fn matches(&self, _entry: &EntryData) -> bool { match self { Self::True => true, Self::False => false, @@ -167,11 +174,12 @@ impl EntryMatcher for EntryTypeMatcher { }) } - fn matches(&self, entry: &ZipFile) -> bool { - match self { - Self::File => !entry.is_dir() && !entry.is_symlink(), - Self::Dir => entry.is_dir(), - Self::Symlink => entry.is_symlink(), + fn matches(&self, entry: &EntryData) -> bool { + match (self, entry.kind) { + (Self::File, EntryKind::File) => true, + (Self::Dir, EntryKind::Dir) => true, + (Self::Symlink, EntryKind::Symlink) => true, + _ => false, } } } @@ -195,11 +203,12 @@ impl EntryMatcher for NonSpecificMethods { }) } - fn matches(&self, entry: &ZipFile) -> bool { + fn matches(&self, entry: &EntryData) -> bool { match self { Self::Any => true, - Self::Known => SpecificCompressionMethodArg::KNOWN_COMPRESSION_METHODS - .contains(&entry.compression()), + Self::Known => { + SpecificCompressionMethodArg::KNOWN_COMPRESSION_METHODS.contains(&entry.compression) + } } } } @@ -220,8 +229,8 @@ impl EntryMatcher for SpecificMethods { }) } - fn matches(&self, entry: &ZipFile) -> bool { - self.specific_method == entry.compression() + fn matches(&self, entry: &EntryData) -> bool { + self.specific_method == entry.compression } } @@ -244,8 +253,8 @@ impl EntryMatcher for DepthLimit { }) } - fn matches(&self, entry: &ZipFile) -> bool { - let num_components = entry.name().split('/').count(); + fn matches(&self, entry: &EntryData) -> bool { + let num_components = entry.name.split('/').count(); match self { Self::Max(max) => 
num_components <= *max, Self::Min(min) => num_components >= *min, @@ -280,8 +289,8 @@ impl EntryMatcher for PatternMatcher { Ok(Self { matcher, comp_sel }) } - fn matches(&self, entry: &ZipFile) -> bool { - match process_component_selector(self.comp_sel, entry.name()) { + fn matches(&self, entry: &EntryData) -> bool { + match process_component_selector(self.comp_sel, entry.name) { None => false, Some(s) => self.matcher.matches(s), } @@ -346,7 +355,7 @@ impl EntryMatcher for WrappedMatcher { }) } - fn matches(&self, entry: &ZipFile) -> bool { + fn matches(&self, entry: &EntryData) -> bool { match self { Self::Primitive(m) => m.matches(entry), Self::Negated(m) => !m.matches(entry), diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 57a6b1b38..dd54ec528 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -1,4 +1,5 @@ use std::{ + borrow::Cow, cell::RefCell, env, fs, io::{self, Read, Write}, @@ -7,10 +8,45 @@ use std::{ rc::Rc, }; -use zip::read::ZipFile; +use zip::{read::ZipFile, CompressionMethod}; use crate::{args::extract::*, CommandError, WrapCommandErr}; +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum EntryKind { + File, + Dir, + Symlink, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct EntryData<'a> { + pub name: &'a str, + pub kind: EntryKind, + pub compression: CompressionMethod, + pub unix_mode: Option, + pub size: u64, +} + +impl<'a> EntryData<'a> { + #[inline(always)] + pub fn from_entry<'b>(entry: &'a ZipFile<'b>) -> Self { + Self { + name: entry.name(), + kind: if entry.is_dir() { + EntryKind::Dir + } else if entry.is_symlink() { + EntryKind::Symlink + } else { + EntryKind::File + }, + compression: entry.compression(), + unix_mode: entry.unix_mode(), + size: entry.size(), + } + } +} + pub trait EntryReceiver { fn receive_entry<'a>( &mut self, diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index 
f9e21d99b..34f05a977 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -4,7 +4,10 @@ use zip::read::ZipFile; use crate::{args::extract::*, CommandError}; -use super::matcher::{EntryMatcher, WrappedMatcher}; +use super::{ + matcher::{process_component_selector, EntryMatcher, WrappedMatcher}, + receiver::{EntryData, EntryReceiver}, +}; trait NameTransformer { type Arg @@ -100,6 +103,16 @@ impl NameTransformer for AddPrefix { } } +enum ContentProcessor { + StderrLog, + /* FileLog(fs::File), */ + WriteContent, +} + +impl ContentProcessor { + /* pub fn process_entry(&mut self, entry: &mut ZipFile, ) */ +} + pub struct EntrySpecTransformer { matcher: Option, name_transformers: Vec>, @@ -156,7 +169,7 @@ impl EntrySpecTransformer { } impl EntrySpecTransformer { - pub fn matches(&self, entry: &ZipFile) -> bool { + pub fn matches(&self, entry: &EntryData) -> bool { match &self.matcher { None => true, Some(matcher) => matcher.matches(entry), diff --git a/src/compression.rs b/src/compression.rs index 83a7669bd..02c264641 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -10,7 +10,7 @@ use std::{fmt, io}; /// /// When creating ZIP files, you may choose the method to use with /// [`crate::write::FileOptions::compression_method`] -#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[derive(Copy, Clone, PartialEq, Eq, Debug, Hash, PartialOrd, Ord)] #[cfg_attr(fuzzing, derive(arbitrary::Arbitrary))] #[non_exhaustive] pub enum CompressionMethod { From c7176a4ca3a86e78e4fad56c659a6b7bd90bc6e0 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:35:25 -0400 Subject: [PATCH 16/68] stub out utterly absurd cli spec --- cli/src/args/extract.rs | 97 +++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 33 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index d1fec51f0..d733b19d1 100644 --- a/cli/src/args/extract.rs +++ 
b/cli/src/args/extract.rs @@ -804,38 +804,77 @@ impl CommandFormat for Extract { "Extract individual entries or an entire archive into a stream or the filesystem."; const USAGE_LINE: &'static str = - "[-h|--help] OUTPUT-SPEC... [ENTRY-SPEC]... [--stdin|[--] ZIP-PATH...]"; + "[-h|--help] [OUTPUT-SPEC]... [ENTRY-SPEC]... [--stdin] [--] [ZIP-PATH]..."; fn generate_help() -> String { format!( r#" -h, --help Print help -# Output specs: +# Output flags: Where and how to collate the extracted entries. - -d, --output-directory +## Directory extraction: +Extract entries into relative paths of a named directory according to the +entry's name. + + -d, --output-directory[:mkdir] Output directory path to write extracted entries into. Paths for extracted entries will be constructed by interpreting entry - names as relative paths to the provided directory. If the provided - path is not a directory, an error is produced. If the provided path - does not exist, an error is produced unless --mkdir is specified. + names as relative paths to the provided directory. + + If the provided path is not a directory, an error is produced. If the + provided path does not exist, an error is produced, unless :mkdir is + specified, which attempts to create the specified directory along with + any missing parent directories. + If not provided, entries will be extracted into the current directory (as if '-d .' had been provided). - --mkdir - If an output directory is provided with -d and the directory path does - not exist, create it along with any missing parent directories. - If the path provided to -d is not a directory, an error will still be - produced if this flag is also provided. +## Pipe decompression: +Concatenate decompressed entry data into a pipe or file. Entry names are +effectively ignored. This disables some optimizations that are possible when +extracting to the filesystem. 
--stdout Concatenate all extracted entries and write them in order to stdout instead of writing anything to the filesystem. - This disables some optimizations that are possible when extracting to - the filesystem. This will write output to stdout even if stdout is a tty. + -f, --output-file[:append] + Write all entries into the specified file path . + + The output file will be truncated if it already exists, unless :append + is provided. If the specified file path could not be created + (e.g. because the containing directory does not exist, or because the + path exists but does not point to a regular file), an error + is produced. + +## Output teeing: +Entries may be *received* by one or more named outputs. Without any output names specified, the +above flags will produce a single receiver named "default". This is the default receiver used for +the -x/--extract argument unless otherwise specified. However, multiple named receivers may be +specified in sequence, separated by the --name flag: + + --name + Assign the output receiver created from the following output flags to the name . + +Note that the first output in a list need not have a name, as it will be assigned to the name +"default" if not provided. + +'--stdout' Creates a single default receiver decompressing contents to stdout. +'-d ./a' Creates a single default receiver extracting entries into './a'. + +'--name one -d ./a' + Creates a single named receiver "one" extracting into './a'. -x/--extract + must specify the name "one", or an error will be produced. +'--output-directory:mkdir ./a --name two --stdout' + Creates a default receiver extracting into './a', which will be created if + it does not exist, and a named receiver "two" concatenating into stdout. +'--name one -d ./a --name two -f ./b' + Creates a named receiver "one" extracting into './a', and a second named receiver "two" + concatenating into the file './b'. 
+ # Entry specs: After output flags are provided, entry specs are processed in order until an @@ -1004,25 +1043,14 @@ extracted more than once over the execution of this command. TODO: multiple entry specs with content transforms that extract output more than once require entry teeing, which is not yet supported, so will produce an error. - -x, --extract + -x, --extract[=] Decompress the entry's contents (if necessary) before writing it to - the output. - - --raw - Do not decompress entry contents at all before writing its content to - the output. - - TODO: this flag is not yet supported and will produce an error. - - --log-to-stderr - Write the (possibly transformed) entry name to stderr, without reading - its content at all. + the named output , or the default output if the receiver name is + not specified. Attempting to extract an entry using an unsupported compression method with -x/--extract will produce an error. In this case, --compression-method can be -used to filter out such entries, and --raw may be used to avoid the failure and -decompress the entry later, or --log-to-stderr can be used to print the names of -all unsupported entries. +used to filter out such entries. ## Selector syntax: @@ -1062,11 +1090,11 @@ applies to string replacement, and using it for a match expression like '--match:rx:g' will produce an error. # Input arguments: -Zip file inputs to extract from can be specified in exactly one of two ways: -streaming from stdin, or as at least one path pointing to an existing zip file. -Input arguments are always specified after all output flags and entry -specs on the command line. If no positional argument is provided and --stdin is -not present, an error will be produced. +Zip file inputs to extract from can be specified by streaming from stdin, or as +at least one path pointing to an existing zip file. Input arguments are always +specified after all output flags and entry specs on the command line. 
If no +positional argument is provided and --stdin is not present, an error will +be produced. --stdin If this argument is provided, the streaming API will be used to read @@ -1081,6 +1109,9 @@ Positional paths: of the provided zip files. At least one zip path must be provided, and all provided paths must exist and point to an existing zip file. Pipes are not supported and will produce an error. + + If --stdin is provided, it will be read in a streaming manner before + reading entries from any positional zip paths. "#, Self::DEFLATE64_HELP_LINE, Self::BZIP2_HELP_LINE, From 28654f6079e35b17952e7c419aa97bc416916fe9 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 20:36:38 -0400 Subject: [PATCH 17/68] parse our absurd cli spec --- cli/src/args/extract.rs | 313 ++++++++++++++++++++++++++-------------- 1 file changed, 206 insertions(+), 107 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index d733b19d1..fa85ad83e 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -4,15 +4,9 @@ use zip::CompressionMethod; use std::{collections::VecDeque, ffi::OsString, mem, path::PathBuf}; -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum ContentTransform { - Extract, - /* FIXME: not yet supported -- could be done by exposing ZipFile::take_raw_reader(), but - * should probably just refactor extract.rs to avoid the need for that. - * NB: actually, we can't do that while supporting streaming archives unless we expose - * take_raw_reader()! */ - Raw, - LogToStderr, + Extract { name: Option }, } #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] @@ -751,23 +745,190 @@ content transform. 
add -x/--extract to construct a complete entry spec" #[derive(Debug)] pub enum OutputCollation { ConcatenateStdout, - Filesystem { - output_dir: Option, - mkdir: bool, - }, + ConcatenateFile { path: PathBuf, append: bool }, + Filesystem { output_dir: PathBuf, mkdir: bool }, +} + +#[derive(Debug)] +pub struct NamedOutput { + pub name: String, + pub output: OutputCollation, +} + +#[derive(Debug)] +pub struct OutputSpecs { + pub default: Option, + pub named: Vec, +} + +impl Default for OutputSpecs { + fn default() -> Self { + Self { + default: Some(OutputCollation::Filesystem { + output_dir: PathBuf::from("."), + mkdir: false, + }), + named: Vec::new(), + } + } +} + +impl OutputSpecs { + pub fn parse_argv(argv: &mut VecDeque) -> Result { + let mut default: Option = None; + let mut named: Vec = Vec::new(); + let mut cur_name: Option = None; + + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + b"-h" | b"--help" => { + let help_text = Extract::generate_full_help_text(); + return Err(ArgParseError::StdoutMessage(help_text)); + } + b"--name" => { + let name = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --name") + })? 
+ .into_string() + .map_err(|name| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --name: {name:?}" + )) + })?; + if let Some(prev_name) = cur_name.take() { + return Err(Extract::exit_arg_invalid(&format!( + "multiple names provided for output: {prev_name:?} and {name:?}" + ))); + } + cur_name = Some(name); + } + b"-d" => { + let dir_path = argv + .pop_front() + .map(PathBuf::from) + .ok_or_else(|| Extract::exit_arg_invalid("no argument provided for -d"))?; + let output = OutputCollation::Filesystem { + output_dir: dir_path, + mkdir: false, + }; + if let Some(name) = cur_name.take() { + named.push(NamedOutput { name, output }); + } else if let Some(default) = default.replace(output) { + return Err(Extract::exit_arg_invalid( + "multiple unnamed outputs provided: {default:?} and {output:?}", + )); + } + } + arg_bytes if arg_bytes.starts_with(b"--output-directory") => { + let mkdir = match arg_bytes { + b"--output-directory" => false, + b"--output-directory:mkdir" => true, + _ => { + return Err(Extract::exit_arg_invalid(&format!( + "invalid suffix provided to --output-directory: {arg:?}" + ))); + } + }; + let dir_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --output-directory") + })?; + let output = OutputCollation::Filesystem { + output_dir: dir_path, + mkdir, + }; + if let Some(name) = cur_name.take() { + named.push(NamedOutput { name, output }); + } else if let Some(default) = default.replace(output) { + return Err(Extract::exit_arg_invalid( + "multiple unnamed outputs provided: {default:?} and {output:?}", + )); + } + } + b"--stdout" => { + let output = OutputCollation::ConcatenateStdout; + if let Some(name) = cur_name.take() { + named.push(NamedOutput { name, output }); + } else if let Some(default) = default.replace(output) { + return Err(Extract::exit_arg_invalid( + "multiple unnamed outputs provided: {default:?} and {output:?}", + )); + } + } + b"-f" => { + let 
file_path = argv + .pop_front() + .map(PathBuf::from) + .ok_or_else(|| Extract::exit_arg_invalid("no argument provided for -f"))?; + let output = OutputCollation::ConcatenateFile { + path: file_path, + append: false, + }; + if let Some(name) = cur_name.take() { + named.push(NamedOutput { name, output }); + } else if let Some(default) = default.replace(output) { + return Err(Extract::exit_arg_invalid( + "multiple unnamed outputs provided: {default:?} and {output:?}", + )); + } + } + arg_bytes if arg_bytes.starts_with(b"--output-file") => { + let append = match arg_bytes { + b"--output-file" => false, + b"--output-file:append" => true, + _ => { + return Err(Extract::exit_arg_invalid(&format!( + "invalid suffix provided to --output-file: {arg:?}" + ))); + } + }; + let file_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --output-file") + })?; + let output = OutputCollation::ConcatenateFile { + path: file_path, + append, + }; + if let Some(name) = cur_name.take() { + named.push(NamedOutput { name, output }); + } else if let Some(default) = default.replace(output) { + return Err(Extract::exit_arg_invalid( + "multiple unnamed outputs provided: {default:?} and {output:?}", + )); + } + } + _ => { + argv.push_front(arg); + break; + } + } + } + if let Some(name) = cur_name { + return Err(Extract::exit_arg_invalid(&format!( + "trailing --name argument provided without output spec: {name:?}" + ))); + } + + Ok(if default.is_none() && named.is_empty() { + Self::default() + } else { + Self { default, named } + }) + } } #[derive(Debug)] -pub enum InputType { - StreamingStdin, - ZipPaths(Vec), +pub struct InputSpec { + pub stdin_stream: bool, + pub zip_paths: Vec, } #[derive(Debug)] pub struct Extract { - pub output: OutputCollation, + pub output_specs: OutputSpecs, pub entry_specs: Vec, - pub input: InputType, + pub input_spec: InputSpec, } impl Extract { @@ -1122,13 +1283,12 @@ Positional paths: } fn parse_argv(mut 
argv: VecDeque) -> Result { - let mut output_dir: Option = None; - let mut mkdir_flag: bool = false; - let mut stdout_flag: bool = false; let mut args: Vec = Vec::new(); let mut stdin_flag: bool = false; let mut positional_zips: Vec = Vec::new(); + let output_specs = OutputSpecs::parse_argv(&mut argv)?; + while let Some(arg) = argv.pop_front() { match arg.as_encoded_bytes() { b"-h" | b"--help" => { @@ -1136,72 +1296,27 @@ Positional paths: return Err(ArgParseError::StdoutMessage(help_text)); } - /* Output args */ - b"-d" | b"--output-directory" => { - let new_path = argv.pop_front().map(PathBuf::from).ok_or_else(|| { - Self::exit_arg_invalid("no argument provided for -d/--output-directory") - })?; - if let Some(prev_path) = output_dir.take() { - return Err(Self::exit_arg_invalid(&format!( - "--output-directory provided twice: {prev_path:?} and {new_path:?}" - ))); - } else if stdout_flag { - return Err(Self::exit_arg_invalid( - "--stdout provided along with output dir", - )); - } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "-d/--output-directory provided after entry specs or inputs", - )); - } else { - output_dir = Some(new_path); - } - } - b"--mkdir" => { - if mkdir_flag { - return Err(Self::exit_arg_invalid("--mkdir provided twice")); - } else if stdout_flag { - return Err(Self::exit_arg_invalid( - "--stdout provided along with --mkdir", - )); - } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "--mkdir provided after entry specs or inputs", - )); - } else { - mkdir_flag = true; - } - } - b"--stdout" => { - if let Some(output_dir) = output_dir.take() { - return Err(Self::exit_arg_invalid(&format!( - "--stdout provided along with output directory {output_dir:?}" - ))); - } else if stdout_flag { - return Err(Self::exit_arg_invalid("--stdout provided twice")); - } else if mkdir_flag { - return Err(Self::exit_arg_invalid( - "--stdout 
provided along with --mkdir", - )); - } else if !args.is_empty() || stdin_flag || !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "--stdout provided after entry specs or inputs", - )); - } else { - stdout_flag = true; - } - } - /* Transition to entry specs */ /* Try content transforms first, as they are unambiguous sentinel values. */ b"-x" | b"--extract" => { - args.push(ExtractArg::ContentTransform(ContentTransform::Extract)); + args.push(ExtractArg::ContentTransform(ContentTransform::Extract { + name: None, + })); } - b"--raw" => { - args.push(ExtractArg::ContentTransform(ContentTransform::Raw)); - } - b"--log-to-stderr" => { - args.push(ExtractArg::ContentTransform(ContentTransform::LogToStderr)); + arg_bytes if arg_bytes.starts_with(b"--extract=") => { + let name = arg + .into_string() + .map_err(|arg| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided to --extract=: {arg:?}" + )) + })? + .strip_prefix("--extract=") + .unwrap() + .to_string(); + args.push(ExtractArg::ContentTransform(ContentTransform::Extract { + name: Some(name), + })); } /* Try name transforms next, as they only stack linearly and do not require CFG @@ -1324,7 +1439,6 @@ Positional paths: /* Transition to input args */ b"--stdin" => { stdin_flag = true; - break; } b"--" => break, arg_bytes => { @@ -1341,37 +1455,22 @@ Positional paths: } positional_zips.extend(argv.into_iter().map(|arg| arg.into())); - if stdin_flag && !positional_zips.is_empty() { - return Err(Self::exit_arg_invalid(&format!( - "--stdin was provided at the same time as positional args {positional_zips:?}" - ))); - } - let input = if stdin_flag { - InputType::StreamingStdin - } else { - if positional_zips.is_empty() { - return Err(Self::exit_arg_invalid( - "no zip input files were provided, and --stdin was not provided", - )); - } - InputType::ZipPaths(positional_zips) + if !stdin_flag && positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "no zip input files were provided, 
and --stdin was not provided", + )); }; - - let output = if stdout_flag { - OutputCollation::ConcatenateStdout - } else { - OutputCollation::Filesystem { - output_dir, - mkdir: mkdir_flag, - } + let input_spec = InputSpec { + stdin_stream: stdin_flag, + zip_paths: positional_zips, }; let entry_specs = EntrySpec::parse_extract_args(args)?; Ok(Self { - output, + output_specs, entry_specs, - input, + input_spec, }) } } From ea507f1a0d1cfc30c9f128ded6ae7a5a89427c4e Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 25 Aug 2024 21:13:48 -0400 Subject: [PATCH 18/68] impl merged input --- cli/src/args/extract.rs | 50 ++++++++++++++++++++------------- cli/src/extract.rs | 7 +++-- cli/src/extract/entries.rs | 57 +++++++++++++++++++++++++++++++------- 3 files changed, 81 insertions(+), 33 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index fa85ad83e..e34701c3f 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -815,10 +815,12 @@ impl OutputSpecs { }; if let Some(name) = cur_name.take() { named.push(NamedOutput { name, output }); - } else if let Some(default) = default.replace(output) { - return Err(Extract::exit_arg_invalid( - "multiple unnamed outputs provided: {default:?} and {output:?}", - )); + } else if let Some(default) = default.take() { + return Err(Extract::exit_arg_invalid(&format!( + "multiple unnamed outputs provided: {default:?} and {output:?}" + ))); + } else { + default = Some(output); } } arg_bytes if arg_bytes.starts_with(b"--output-directory") => { @@ -840,20 +842,24 @@ impl OutputSpecs { }; if let Some(name) = cur_name.take() { named.push(NamedOutput { name, output }); - } else if let Some(default) = default.replace(output) { - return Err(Extract::exit_arg_invalid( - "multiple unnamed outputs provided: {default:?} and {output:?}", - )); + } else if let Some(default) = default.take() { + return Err(Extract::exit_arg_invalid(&format!( + "multiple 
unnamed outputs provided: {default:?} and {output:?}" + ))); + } else { + default = Some(output); } } b"--stdout" => { let output = OutputCollation::ConcatenateStdout; if let Some(name) = cur_name.take() { named.push(NamedOutput { name, output }); - } else if let Some(default) = default.replace(output) { - return Err(Extract::exit_arg_invalid( - "multiple unnamed outputs provided: {default:?} and {output:?}", - )); + } else if let Some(default) = default.take() { + return Err(Extract::exit_arg_invalid(&format!( + "multiple unnamed outputs provided: {default:?} and {output:?}" + ))); + } else { + default = Some(output); } } b"-f" => { @@ -867,10 +873,12 @@ impl OutputSpecs { }; if let Some(name) = cur_name.take() { named.push(NamedOutput { name, output }); - } else if let Some(default) = default.replace(output) { - return Err(Extract::exit_arg_invalid( - "multiple unnamed outputs provided: {default:?} and {output:?}", - )); + } else if let Some(default) = default.take() { + return Err(Extract::exit_arg_invalid(&format!( + "multiple unnamed outputs provided: {default:?} and {output:?}" + ))); + } else { + default = Some(output); } } arg_bytes if arg_bytes.starts_with(b"--output-file") => { @@ -892,10 +900,12 @@ impl OutputSpecs { }; if let Some(name) = cur_name.take() { named.push(NamedOutput { name, output }); - } else if let Some(default) = default.replace(output) { - return Err(Extract::exit_arg_invalid( - "multiple unnamed outputs provided: {default:?} and {output:?}", - )); + } else if let Some(default) = default.take() { + return Err(Extract::exit_arg_invalid(&format!( + "multiple unnamed outputs provided: {default:?} and {output:?}" + ))); + } else { + default = Some(output); } } _ => { diff --git a/cli/src/extract.rs b/cli/src/extract.rs index b3119020b..12e7d402f 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -10,20 +10,21 @@ mod entries; mod matcher; mod receiver; mod transform; +use entries::IterateEntries; use receiver::EntryData; pub fn 
execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { let Extract { - output, + output_specs, entry_specs, - input, + input_spec, } = extract; let err = Rc::new(RefCell::new(err)); let mut entry_receiver = receiver::make_entry_receiver(err.clone(), output)?; let entry_spec_transformers = transform::process_entry_specs(entry_specs)?; let mut stderr_log_output = io::stderr(); - let mut entry_iterator = entries::make_entry_iterator(input)?; + let mut entry_iterator = entries::MergedInput::from_spec(input_spec)?; while let Some(mut entry) = entry_iterator.next_entry()? { for transformer in entry_spec_transformers.iter() { diff --git a/cli/src/extract/entries.rs b/cli/src/extract/entries.rs index 8e0e322d6..dcf45a6bf 100644 --- a/cli/src/extract/entries.rs +++ b/cli/src/extract/entries.rs @@ -17,16 +17,6 @@ pub trait IterateEntries { fn next_entry(&mut self) -> Result, CommandError>; } -pub fn make_entry_iterator<'a>( - input_type: InputType, -) -> Result, CommandError> { - let ret: Box = match input_type { - InputType::StreamingStdin => Box::new(StdinInput::new()), - InputType::ZipPaths(zip_paths) => Box::new(AllInputZips::new(zip_paths)?), - }; - Ok(ret) -} - struct StdinInput { inner: io::Stdin, } @@ -130,3 +120,50 @@ impl IterateEntries for AllInputZips { } } } + +pub struct MergedInput { + stdin_stream: Option>, + zips: Option, +} + +impl MergedInput { + pub fn from_spec(spec: InputSpec) -> Result { + let InputSpec { + stdin_stream, + zip_paths, + } = spec; + Ok(Self { + stdin_stream: if stdin_stream { + Some(UnsafeCell::new(StdinInput::new())) + } else { + None + }, + zips: if zip_paths.is_empty() { + None + } else { + Some(AllInputZips::new(zip_paths)?) + }, + }) + } +} + +impl IterateEntries for MergedInput { + fn next_entry(&mut self) -> Result, CommandError> { + let mut completed_stdin: bool = false; + if let Some(stdin_stream) = self.stdin_stream.as_mut() { + if let Some(entry) = unsafe { &mut *stdin_stream.get() }.next_entry()? 
{ + return Ok(Some(entry)); + } + completed_stdin = true; + } + if completed_stdin { + self.stdin_stream = None; + } + if let Some(zips) = self.zips.as_mut() { + if let Some(entry) = zips.next_entry()? { + return Ok(Some(entry)); + } + } + Ok(None) + } +} From 8851583890d3f958996155be84879c445039cd12 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 01:07:40 -0400 Subject: [PATCH 19/68] do absurd stuff without checking for compilation success --- cli/src/extract.rs | 112 ++++++-- cli/src/extract/matcher.rs | 6 +- cli/src/extract/receiver.rs | 535 +++++++++++++++++++++++++---------- cli/src/extract/transform.rs | 115 ++------ 4 files changed, 499 insertions(+), 269 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 12e7d402f..aaed429f0 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,6 +1,6 @@ use std::{ cell::RefCell, - io::{self, Write}, + io::{self, Read, Write}, rc::Rc, }; @@ -11,46 +11,106 @@ mod matcher; mod receiver; mod transform; use entries::IterateEntries; -use receiver::EntryData; +use matcher::EntryMatcher; +use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryReceiver, ExtractEntry}; +use transform::NameTransformer; -pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { +pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), CommandError> { let Extract { output_specs, entry_specs, input_spec, } = extract; - let err = Rc::new(RefCell::new(err)); - let mut entry_receiver = receiver::make_entry_receiver(err.clone(), output)?; - let entry_spec_transformers = transform::process_entry_specs(entry_specs)?; - let mut stderr_log_output = io::stderr(); + let compiled_specs = receiver::process_entry_and_output_specs(entry_specs, output_specs)?; let mut entry_iterator = entries::MergedInput::from_spec(input_spec)?; + let mut copy_buf: Vec = vec![0u8; 1024 * 16]; + while let Some(mut entry) = 
entry_iterator.next_entry()? { - for transformer in entry_spec_transformers.iter() { - if !transformer.matches(&EntryData::from_entry(&entry)) { - continue; - } - let name = transformer.transform_name(entry.name()); - match transformer.content_transform() { - ContentTransform::Raw => unreachable!(), - ContentTransform::LogToStderr => { - writeln!( - &mut stderr_log_output, - "log to stderr: entry with original name {} and transformed name {}, compression method {}, uncompressed size {}", - entry.name(), name, entry.compression(), entry.size() - ) - .unwrap(); - continue; + let data = EntryData::from_entry(&entry); + + let mut matching_concats: Vec>> = Vec::new(); + let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); + for spec in compiled_specs.iter() { + match spec { + CompiledEntrySpec::Concat(ConcatEntry { matcher, stream }) => { + if matcher.map(|m| m.matches(&data)).unwrap_or(true) { + matching_concats.push(stream.clone()); + } } - ContentTransform::Extract => { - let name = name.into_owned(); - entry_receiver.receive_entry(&mut entry, &name)?; + CompiledEntrySpec::Extract(ExtractEntry { + matcher, + transforms, + recv, + }) => { + if matcher.map(|m| m.matches(&data)).unwrap_or(true) { + let new_name = transforms + .map(|t| t.transform_name(&data.name)) + .unwrap_or_else(|| Cow::Borrowed(&data.name)); + matching_extracts.push((new_name, recv.clone())); + } } } } + if matching_concats.is_empty() && matching_extracts.is_empty() { + continue; + } + + /* Split output handles for concat, and split generated handles by extract source and + * name. use Rc::ptr_eq() to split, and Cow::<'s, str>::eq() with str AsRef. 
*/ + let mut deduped_concat_writers: Vec>> = Vec::new(); + for concat_p in matching_concats.into_iter() { + if deduped_concat_writers + .iter() + .any(|p| Rc::ptr_eq(p, &concat_p)) + { + writeln!(&mut err, "skipping repeated concat").unwrap(); + } else { + deduped_concat_writers.push(concat_p); + } + } + let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); + for (name, extract_p) in matching_extracts.into_iter() { + if deduped_matching_extracts + .iter() + .any(|(n, p)| Rc::ptr_eq(p, &extract_p) && name.as_ref() == n.as_ref()) + { + writeln!(&mut err, "skipping repeated extract").unwrap(); + } else { + deduped_matching_extracts.push((name, extract_p)); + } + } + + let matching_handles: Vec> = deduped_matching_extracts + .into_iter() + .map(|(name, recv)| recv.generate_entry_handle(name)) + .collect::>()?; + + let mut read_len: usize = 0; + loop { + read_len = entry.read(&mut copy_buf).wrap_err("read of entry failed")?; + if read_len == 0 { + break; + } + let cur_data: &[u8] = ©_buf[..read_len]; + for concat_writer in deduped_concat_writers.iter() { + concat_writer.borrow_mut().write_all(cur_data)?; + } + for extract_writer in matching_handles.iter() { + extract_writer.write_all(cur_data)?; + } + } + } + /* Finalize all extract entries. */ + for spec in compiled_specs.into_iter() { + match spec { + CompiledEntrySpec::Concat(_) => (), + CompiledEntrySpec::Extract(ExtractEntry { recv, .. 
}) => { + recv.finalize_entries()?; + } + } } - entry_receiver.finalize_entries()?; Ok(()) } diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index 8c086cd3f..b8e2ca35b 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -297,7 +297,7 @@ impl EntryMatcher for PatternMatcher { } } -pub enum WrappedMatcher { +pub enum CompiledMatcher { Primitive(Box), Negated(Box), And { @@ -310,7 +310,7 @@ pub enum WrappedMatcher { }, } -impl WrappedMatcher { +impl CompiledMatcher { fn create_primitive(arg: Predicate) -> Result { Ok(Self::Primitive(match arg { Predicate::Trivial(arg) => Box::new(TrivialMatcher::from_arg(arg)?), @@ -327,7 +327,7 @@ impl WrappedMatcher { } } -impl EntryMatcher for WrappedMatcher { +impl EntryMatcher for CompiledMatcher { type Arg = MatchExpression where Self: Sized; fn from_arg(arg: Self::Arg) -> Result diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index dd54ec528..6c326209e 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -1,8 +1,9 @@ use std::{ borrow::Cow, cell::RefCell, + collections::{HashMap, HashSet}, env, fs, - io::{self, Read, Write}, + io::{self, Read, Seek, Write}, mem, path::PathBuf, rc::Rc, @@ -10,6 +11,8 @@ use std::{ use zip::{read::ZipFile, CompressionMethod}; +use super::matcher::{CompiledMatcher, EntryMatcher}; +use super::transform::{CompiledTransformer, NameTransformer}; use crate::{args::extract::*, CommandError, WrapCommandErr}; #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -47,186 +50,428 @@ impl<'a> EntryData<'a> { } } -pub trait EntryReceiver { - fn receive_entry<'a>( - &mut self, - entry: &mut ZipFile<'a>, - name: &str, - ) -> Result<(), CommandError>; - fn finalize_entries(&mut self) -> Result<(), CommandError>; -} - -pub fn make_entry_receiver<'a>( - err: Rc>, - collation: OutputCollation, -) -> Result, CommandError> { - let ret: Box = match collation { - OutputCollation::ConcatenateStdout => 
Box::new(StdoutReceiver::new(err)), - OutputCollation::Filesystem { output_dir, mkdir } => { - let output_dir = match output_dir { - Some(dir) => { - if mkdir { - fs::create_dir_all(&dir).wrap_err_with(|| { - format!("failed to create output directory {dir:?}") - })?; - } - dir - } - None => env::current_dir().wrap_err("failed to get current dir")?, - }; - Box::new(FilesystemReceiver::new(err, output_dir)) - } - }; - Ok(ret) +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct OutputName(pub String); + +impl OutputName { + pub fn default_name() -> Self { + Self("default".to_string()) + } } -struct StdoutReceiver { - err: Rc>, - stdout: io::Stdout, +pub struct ParsedEntrySpecArg { + matcher: Option, + transforms: Option, + output_name: OutputName, } -impl StdoutReceiver { - pub fn new(err: Rc>) -> Self { - Self { - err, - stdout: io::stdout(), - } +impl ParsedEntrySpecArg { + pub fn from_entry_spec(spec: EntrySpec) -> Result { + let EntrySpec { + match_expr, + name_transforms, + content_transform, + } = spec; + let matcher = match match_expr { + None => None, + Some(expr) => Some(CompiledMatcher::from_arg(expr)?), + }; + let transforms = if name_transforms.is_empty() { + None + } else { + Some(CompiledTransformer::from_arg()?) 
+ }; + let output_name = match content_transform { + ContentTransform::Extract { name } => name + .map(OutputName) + .unwrap_or_else(OutputName::default_name), + }; + Ok(Self { + matcher, + transforms, + output_name, + }) } } -impl EntryReceiver for StdoutReceiver -where - W: Write, -{ - fn receive_entry<'a>( - &mut self, - entry: &mut ZipFile<'a>, - name: &str, +pub struct ConcatEntry { + pub matcher: Option, + pub stream: Rc>, +} + +pub struct ExtractEntry { + pub matcher: Option, + pub transforms: Option, + pub recv: Rc, +} + +pub enum CompiledEntrySpec { + Concat(ConcatEntry), + Extract(ExtractEntry), +} + +pub struct ParsedNamedOutputs { + concats: HashMap>>, + extracts: HashMap>, +} + +pub fn process_entry_and_output_specs( + entry_specs: impl IntoIterator, + output_specs: OutputSpecs, +) -> Result, CommandError> { + let entry_specs: Vec = entry_specs + .into_iter() + .map(ParsedEntrySpecArg::from_entry_spec) + .collect::>()?; + assert!(!entry_specs.is_empty()); + let parsed_outputs = ParsedNamedOutputs::from_output_specs(output_specs)?; + parsed_outputs.process_entry_specs_for_outputs(entry_specs) +} + +impl ParsedNamedOutputs { + pub fn process_entry_specs_for_outputs( + self, + args: impl IntoIterator, + ) -> Result> { + args.into_iter() + .map(|arg| self.lookup_entry_spec_arg(arg)) + .collect() + } + + fn lookup_entry_spec_arg( + &self, + arg: ParsedEntrySpecArg, + ) -> Result { + let ParsedEntrySpecArg { + matcher, + transforms, + output_name, + } = arg; + if let Some(stream) = self.concats.get(&output_name) { + if transforms.is_some() { + return Err(CommandError::InvalidArg( + format!("entry name transforms {transforms:?} do not apply to concat output {output_name:?}") + )); + } + return Ok(CompiledEntrySpec::Concat(ConcatEntry { + matcher, + stream: stream.clone(), + })); + } + let Some(recv) = self.extracts.get(&output_name) else { + return Err(CommandError::InvalidArg(format!( + "output name {output_name:?} was not found" + ))); + }; + 
Ok(CompiledEntrySpec::Extract(ExtractEntry { + matcher, + transforms, + recv: recv.clone(), + })) + } + + fn add_stdout( + seen_stdout: &mut bool, + name: OutputName, + seen_names: &mut HashSet, + concats: &mut HashMap>>, ) -> Result<(), CommandError> { - let mut err = self.err.borrow_mut(); - writeln!(err, "receiving entry {} with name {name}", entry.name()).unwrap(); - if entry.is_dir() { - writeln!(err, "entry is directory, ignoring").unwrap(); - } else if entry.is_symlink() { - writeln!(err, "entry is symlink, ignoring").unwrap(); - } else { - io::copy(entry, &mut self.stdout) - .wrap_err_with(|| format!("failed to write entry {name} to stdout"))?; + if *seen_stdout { + return Err(CommandError::InvalidArg( + "--stdout output provided for more than one receiver".to_string(), + )); + } + if seen_names.contains(&name) { + return Err(CommandError::InvalidArg(format!( + "output name {name:?} provided more than once" + ))); } + assert!(!concats.contains(&name)); + + let handle: Rc> = Rc::new(RefCell::new(io::stdout())); + + *seen_stdout = true; + assert!(seen_names.insert(name.clone())); + assert!(concats.insert(name, handle).is_none()); Ok(()) } - fn finalize_entries(&mut self) -> Result<(), CommandError> { + fn add_file( + path: PathBuf, + append: bool, + name: OutputName, + seen_files: &mut HashSet, + seen_names: &mut HashSet, + concats: &mut HashMap>>, + ) -> Result<(), CommandError> { + if seen_names.contains(&name) { + return Err(CommandError::InvalidArg(format!( + "output name {name:?} provided more than once" + ))); + } + assert!(!concats.contains(&name)); + let canon_path = path + .canonicalize() + .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; + if seen_files.contains(&canon_path) { + return Err(CommandError::InvalidArg(format!( + "canonical output file path {canon_path:?} provided more than once" + ))); + } + + let handle: Rc> = { + let mut f: fs::File = if append { + fs::OpenOptions::new() + .write(true) + .create(true) + .open(&path) 
+ .wrap_err_with(|| format!("failed to open file for append at {path:?}"))? + } else { + fs::File::create(&path) + .wrap_err_with(|| format!("failed to open file with truncation at {path:?}"))? + }; + f.seek(io::SeekFrom::End(0)) + .wrap_err_with(|| format!("failed to seek to end of opened file {f:?}"))?; + Rc::new(RefCell::new(f)) + }; + + assert!(seen_files.insert(canon_path)); + assert!(seen_names.insert(name.clone())); + assert!(concats.insert(name, handle).is_none()); Ok(()) } + + fn add_dir( + output_dir: PathBuf, + mkdir: bool, + name: OutputName, + seen_dirs: &mut HashSet, + seen_names: &mut HashSet, + extracts: &mut HashMap>, + ) -> Result<(), CommandError> { + if seen_names.contains(&name) { + return Err(CommandError::InvalidArg(format!( + "output name {name:?} provided more than once" + ))); + } + assert!(!extracts.contains(&name)); + let canon_path = path + .canonicalize() + .wrap_err_with(|| format!("canonicalizing dir path {path:?} failed"))?; + if seen_dirs.contains(&canon_path) { + return Err(CommandError::InvalidArg(format!( + "canonical output dir path {canon_path:?} provided more than once" + ))); + } + + let handle: Rc = { + if mkdir { + fs::create_dir_all(&output_dir).wrap_err_with(|| { + format!("failed to create output directory {output_dir:?}") + })?; + }; + let d = FilesystemReceiver::new(output_dir); + Rc::new(d) + }; + + assert!(seen_dirs.insert(canon_path)); + assert!(seen_names.insert(name.clone())); + assert!(extracts.insert(name, handle).is_none()); + Ok(()) + } + + pub fn from_output_specs(spec: OutputSpecs) -> Result { + let OutputSpecs { default, named } = spec; + + let mut concats: HashMap>> = HashMap::new(); + let mut extracts: HashMap> = HashMap::new(); + + let mut seen_stdout: bool = false; + let mut seen_files: HashSet = HashSet::new(); + let mut seen_dirs: HashSet = HashSet::new(); + let mut seen_names: HashSet = HashSet::new(); + + if let Some(default) = default { + match default { + OutputCollation::ConcatenateStdout => { + 
Self::add_stdout( + &mut seen_stdout, + OutputName::default_name(), + &mut seen_names, + &mut concats, + )?; + } + OutputCollation::ConcatenateFile { path, append } => { + Self::add_file( + path, + append, + OutputName::default_name(), + &mut seen_files, + &mut seen_names, + &mut concats, + )?; + } + OutputCollation::Filesystem { output_dir, mkdir } => { + Self::add_dir( + output_dir, + mkdir, + OutputName::default_name(), + &mut seen_dirs, + &mut seen_names, + &mut extracts, + )?; + } + } + } + for NamedOutput { name, output } in named.into_iter() { + match output { + OutputCollation::ConcatenateStdout => { + Self::add_stdout(&mut seen_stdout, name, &mut seen_names, &mut concats)?; + } + OutputCollation::ConcatenateFile { path, append } => { + Self::add_file( + path, + append, + name, + &mut seen_files, + &mut seen_names, + &mut concats, + )?; + } + OutputCollation::Filesystem { output_dir, mkdir } => { + Self::add_dir( + output_dir, + mkdir, + name, + &mut seen_dirs, + &mut seen_names, + &mut extracts, + )?; + } + } + } + + Ok(Self { concats, extracts }) + } } -struct FilesystemReceiver { - err: Rc>, +pub trait EntryReceiver { + fn generate_entry_handle<'s>(&self, name: Cow<'s, str>) + -> Result, CommandError>; + + fn finalize_entries(&self) -> Result<(), CommandError>; +} + +struct FilesystemReceiver { output_dir: PathBuf, #[cfg(unix)] - perms_to_set: Vec<(PathBuf, u32)>, + perms_to_set: RefCell>, } -impl FilesystemReceiver { - pub fn new(err: Rc>, output_dir: PathBuf) -> Self { +impl FilesystemReceiver { + pub fn new(output_dir: PathBuf) -> Self { Self { - err, output_dir, #[cfg(unix)] - perms_to_set: Vec::new(), + perms_to_set: RefCell::new(Vec::new()), } } } -impl EntryReceiver for FilesystemReceiver -where - W: Write, -{ - fn receive_entry<'a>( - &mut self, - entry: &mut ZipFile<'a>, - name: &str, - ) -> Result<(), CommandError> { - let mut err = self.err.borrow_mut(); - let full_output_path = self.output_dir.join(name); - writeln!( - err, - "receiving entry 
{} with name {name} and writing to path {full_output_path:?}", - entry.name() - ) - .unwrap(); +impl EntryReceiver for FilesystemReceiver { + fn generate_entry_handle<'s>( + &self, + name: Cow<'s, str>, + ) -> Result, CommandError> { + todo!("wow!") + } - #[cfg(unix)] - if let Some(mode) = entry.unix_mode() { - writeln!( - err, - "storing unix mode {mode} for path {full_output_path:?}" - ) - .unwrap(); - self.perms_to_set.push((full_output_path.clone(), mode)); - } + /* fn receive_entry<'a>( */ + /* &mut self, */ + /* entry: &mut ZipFile<'a>, */ + /* name: &str, */ + /* ) -> Result<(), CommandError> { */ + /* let mut err = self.err.borrow_mut(); */ + /* let full_output_path = self.output_dir.join(name); */ + /* writeln!( */ + /* err, */ + /* "receiving entry {} with name {name} and writing to path {full_output_path:?}", */ + /* entry.name() */ + /* ) */ + /* .unwrap(); */ - if entry.is_dir() { - writeln!(err, "entry is directory, creating").unwrap(); - fs::create_dir_all(&full_output_path).wrap_err_with(|| { - format!("failed to create directory entry at {full_output_path:?}") - })?; - } else if entry.is_symlink() { - let mut target: Vec = Vec::with_capacity(entry.size().try_into().unwrap()); - entry.read_to_end(&mut target).wrap_err_with(|| { - format!( - "failed to read symlink target from zip archive entry {}", - entry.name() - ) - })?; + /* #[cfg(unix)] */ + /* if let Some(mode) = entry.unix_mode() { */ + /* writeln!( */ + /* err, */ + /* "storing unix mode {mode} for path {full_output_path:?}" */ + /* ) */ + /* .unwrap(); */ + /* self.perms_to_set */ + /* .borrow_mut() */ + /* .push((full_output_path.clone(), mode)); */ + /* } */ - #[cfg(unix)] - { - use std::{ - ffi::OsString, - os::unix::{ffi::OsStringExt, fs::symlink}, - }; - let target = OsString::from_vec(target); - writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); - symlink(&target, &full_output_path).wrap_err_with(|| { - format!( - "failed to create symlink at {full_output_path:?} with 
target {target:?}" - ) - })?; - } - #[cfg(not(unix))] - { - /* FIXME: non-unix symlink extraction not yet supported! */ - todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); - } - } else { - writeln!(err, "entry is file, creating").unwrap(); - if let Some(containing_dir) = full_output_path.parent() { - fs::create_dir_all(containing_dir).wrap_err_with(|| { - format!("failed to create parent dirs for file at {full_output_path:?}") - })?; - } else { - writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); - } - let mut outfile = fs::File::create(&full_output_path) - .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; - io::copy(entry, &mut outfile).wrap_err_with(|| { - format!( - "failed to copy file contents from {} to {full_output_path:?}", - entry.name() - ) - })?; - } - Ok(()) - } + /* if entry.is_dir() { */ + /* writeln!(err, "entry is directory, creating").unwrap(); */ + /* fs::create_dir_all(&full_output_path).wrap_err_with(|| { */ + /* format!("failed to create directory entry at {full_output_path:?}") */ + /* })?; */ + /* } else if entry.is_symlink() { */ + /* let mut target: Vec = Vec::with_capacity(entry.size().try_into().unwrap()); */ + /* entry.read_to_end(&mut target).wrap_err_with(|| { */ + /* format!( */ + /* "failed to read symlink target from zip archive entry {}", */ + /* entry.name() */ + /* ) */ + /* })?; */ + + /* #[cfg(unix)] */ + /* { */ + /* use std::{ */ + /* ffi::OsString, */ + /* os::unix::{ffi::OsStringExt, fs::symlink}, */ + /* }; */ + /* let target = OsString::from_vec(target); */ + /* writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); */ + /* symlink(&target, &full_output_path).wrap_err_with(|| { */ + /* format!( */ + /* "failed to create symlink at {full_output_path:?} with target {target:?}" */ + /* ) */ + /* })?; */ + /* } */ + /* #[cfg(not(unix))] */ + /* { */ + /* /\* FIXME: non-unix symlink extraction not yet supported! 
*\/ */ + /* todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); */ + /* } */ + /* } else { */ + /* writeln!(err, "entry is file, creating").unwrap(); */ + /* if let Some(containing_dir) = full_output_path.parent() { */ + /* fs::create_dir_all(containing_dir).wrap_err_with(|| { */ + /* format!("failed to create parent dirs for file at {full_output_path:?}") */ + /* })?; */ + /* } else { */ + /* writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); */ + /* } */ + /* let mut outfile = fs::File::create(&full_output_path) */ + /* .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; */ + /* io::copy(entry, &mut outfile).wrap_err_with(|| { */ + /* format!( */ + /* "failed to copy file contents from {} to {full_output_path:?}", */ + /* entry.name() */ + /* ) */ + /* })?; */ + /* } */ + /* Ok(()) */ + /* } */ fn finalize_entries(&mut self) -> Result<(), CommandError> { #[cfg(unix)] { use std::{cmp::Reverse, os::unix::fs::PermissionsExt}; - let mut perms_to_set = mem::take(&mut self.perms_to_set); + let mut perms_to_set = mem::take(&mut *self.perms_to_set.borrow_mut()); perms_to_set.sort_unstable_by_key(|(path, _)| Reverse(path.clone())); for (path, mode) in perms_to_set.into_iter() { let perms = fs::Permissions::from_mode(mode); diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index 34f05a977..2e1427816 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -5,11 +5,11 @@ use zip::read::ZipFile; use crate::{args::extract::*, CommandError}; use super::{ - matcher::{process_component_selector, EntryMatcher, WrappedMatcher}, + matcher::process_component_selector, receiver::{EntryData, EntryReceiver}, }; -trait NameTransformer { +pub trait NameTransformer { type Arg where Self: Sized; @@ -103,24 +103,12 @@ impl NameTransformer for AddPrefix { } } -enum ContentProcessor { - StderrLog, - /* FileLog(fs::File), */ - WriteContent, +pub struct CompiledTransformer { + 
transformers: Vec>, } -impl ContentProcessor { - /* pub fn process_entry(&mut self, entry: &mut ZipFile, ) */ -} - -pub struct EntrySpecTransformer { - matcher: Option, - name_transformers: Vec>, - content_transform: ContentTransform, -} - -impl EntrySpecTransformer { - fn make_transformer(trans: NameTransform) -> Result, CommandError> { +impl CompiledTransformer { + fn make_single(trans: NameTransform) -> Result, CommandError> { Ok(match trans { NameTransform::Trivial(arg) => Box::new(Trivial::from_arg(arg)?), NameTransform::Basic(basic_trans) => match basic_trans { @@ -137,45 +125,23 @@ impl EntrySpecTransformer { }, }) } +} - pub fn new(entry_spec: EntrySpec) -> Result { - let EntrySpec { - match_expr, - name_transforms, - content_transform, - } = entry_spec; - let matcher = match match_expr { - None => None, - Some(expr) => Some(WrappedMatcher::from_arg(expr)?), - }; - let name_transformers: Vec<_> = name_transforms - .into_iter() - .map(Self::make_transformer) - .collect::>()?; +impl NameTransformer for CompiledTransformer { + type Arg = Vec where Self: Sized; + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + assert!(!arg.is_empty()); Ok(Self { - matcher, - name_transformers, - content_transform, + transformers: arg + .into_iter() + .map(Self::make_single) + .collect::>()?, }) } - pub fn empty() -> Self { - Self { - matcher: None, - name_transformers: Vec::new(), - content_transform: ContentTransform::Extract, - } - } -} - -impl EntrySpecTransformer { - pub fn matches(&self, entry: &EntryData) -> bool { - match &self.matcher { - None => true, - Some(matcher) => matcher.matches(entry), - } - } - /// Transform the name from the zip entry, maintaining a few invariants: /// 1. 
If the transformations all return substrings (no prefixing, non-empty replacements, or /// empty replacements that lead to non-contiguous input chunks), return a slice of the @@ -186,10 +152,10 @@ impl EntrySpecTransformer { /// at the end, if substring-only transformations reduced its length. This is because Cow /// can only describe a substring of the original input or an entirely new allocated /// string, as opposed to a more general sort of string view wrapper. - pub fn transform_name<'s>(&self, mut original_name: &'s str) -> Cow<'s, str> { + fn transform_name<'s>(&self, mut original_name: &'s str) -> Cow<'s, str> { let mut newly_allocated_name: Option = None; let mut newly_allocated_str: Option<&str> = None; - for transformer in self.name_transformers.iter() { + for transformer in self.transformers.iter() { match newly_allocated_str { Some(s) => match transformer.transform_name(s) { Cow::Borrowed(t) => { @@ -228,45 +194,4 @@ impl EntrySpecTransformer { } } } - - pub fn content_transform(&self) -> &ContentTransform { - &self.content_transform - } -} - -pub fn process_entry_specs( - entry_specs: impl IntoIterator, -) -> Result, CommandError> { - let entry_spec_transformers: Vec = entry_specs - .into_iter() - .map(|spec| EntrySpecTransformer::new(spec)) - .collect::>()?; - if entry_spec_transformers.is_empty() { - return Ok(vec![EntrySpecTransformer::empty()]); - }; - - /* Perform some validation on the transforms since we don't currently support everything we - * want to. */ - if entry_spec_transformers - .iter() - .any(|t| *t.content_transform() == ContentTransform::Raw) - { - /* TODO: this can be solved if we can convert a ZipFile into a Raw reader! */ - return Err(CommandError::InvalidArg( - "--raw extraction output is not yet supported".to_string(), - )); - } - if entry_spec_transformers - .iter() - .filter(|t| *t.content_transform() != ContentTransform::LogToStderr) - .count() - > 1 - { - /* TODO: this can be solved by separating data from entries! 
*/ - return Err(CommandError::InvalidArg( - "more than one entry spec using a content transform which reads content (i.e. was not --log-to-stderr) was provided; this requires teeing entry contents which is not yet supported".to_string(), - )); - } - - Ok(entry_spec_transformers) } From 93c3bb958121c0b056c7a195d650375b28f41c92 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 01:15:41 -0400 Subject: [PATCH 20/68] ok it compiles now --- cli/src/extract.rs | 21 ++++++++++++++------- cli/src/extract/receiver.rs | 23 ++++++++++++----------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index aaed429f0..5fc049929 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,10 +1,11 @@ use std::{ + borrow::Cow, cell::RefCell, io::{self, Read, Write}, rc::Rc, }; -use crate::{args::extract::*, CommandError}; +use crate::{args::extract::*, CommandError, WrapCommandErr}; mod entries; mod matcher; @@ -35,7 +36,7 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm for spec in compiled_specs.iter() { match spec { CompiledEntrySpec::Concat(ConcatEntry { matcher, stream }) => { - if matcher.map(|m| m.matches(&data)).unwrap_or(true) { + if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { matching_concats.push(stream.clone()); } } @@ -44,8 +45,9 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm transforms, recv, }) => { - if matcher.map(|m| m.matches(&data)).unwrap_or(true) { + if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { let new_name = transforms + .as_ref() .map(|t| t.transform_name(&data.name)) .unwrap_or_else(|| Cow::Borrowed(&data.name)); matching_extracts.push((new_name, recv.clone())); @@ -82,7 +84,7 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm } } - let matching_handles: Vec> = deduped_matching_extracts + let mut 
matching_handles: Vec> = deduped_matching_extracts .into_iter() .map(|(name, recv)| recv.generate_entry_handle(name)) .collect::>()?; @@ -95,10 +97,15 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm } let cur_data: &[u8] = ©_buf[..read_len]; for concat_writer in deduped_concat_writers.iter() { - concat_writer.borrow_mut().write_all(cur_data)?; + concat_writer + .borrow_mut() + .write_all(cur_data) + .wrap_err("failed to write data to concat output")?; } - for extract_writer in matching_handles.iter() { - extract_writer.write_all(cur_data)?; + for extract_writer in matching_handles.iter_mut() { + extract_writer + .write_all(cur_data) + .wrap_err("failed to write data to extract output")?; } } } diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 6c326209e..020b2e9da 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -79,7 +79,7 @@ impl ParsedEntrySpecArg { let transforms = if name_transforms.is_empty() { None } else { - Some(CompiledTransformer::from_arg()?) + Some(CompiledTransformer::from_arg(name_transforms)?) 
}; let output_name = match content_transform { ContentTransform::Extract { name } => name @@ -132,7 +132,7 @@ impl ParsedNamedOutputs { pub fn process_entry_specs_for_outputs( self, args: impl IntoIterator, - ) -> Result> { + ) -> Result, CommandError> { args.into_iter() .map(|arg| self.lookup_entry_spec_arg(arg)) .collect() @@ -149,9 +149,9 @@ impl ParsedNamedOutputs { } = arg; if let Some(stream) = self.concats.get(&output_name) { if transforms.is_some() { - return Err(CommandError::InvalidArg( - format!("entry name transforms {transforms:?} do not apply to concat output {output_name:?}") - )); + return Err(CommandError::InvalidArg(format!( + "entry name transforms do not apply to concat output {output_name:?}" + ))); } return Ok(CompiledEntrySpec::Concat(ConcatEntry { matcher, @@ -186,7 +186,7 @@ impl ParsedNamedOutputs { "output name {name:?} provided more than once" ))); } - assert!(!concats.contains(&name)); + assert!(!concats.contains_key(&name)); let handle: Rc> = Rc::new(RefCell::new(io::stdout())); @@ -209,7 +209,7 @@ impl ParsedNamedOutputs { "output name {name:?} provided more than once" ))); } - assert!(!concats.contains(&name)); + assert!(!concats.contains_key(&name)); let canon_path = path .canonicalize() .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; @@ -254,10 +254,10 @@ impl ParsedNamedOutputs { "output name {name:?} provided more than once" ))); } - assert!(!extracts.contains(&name)); - let canon_path = path + assert!(!extracts.contains_key(&name)); + let canon_path = output_dir .canonicalize() - .wrap_err_with(|| format!("canonicalizing dir path {path:?} failed"))?; + .wrap_err_with(|| format!("canonicalizing dir path {output_dir:?} failed"))?; if seen_dirs.contains(&canon_path) { return Err(CommandError::InvalidArg(format!( "canonical output dir path {canon_path:?} provided more than once" @@ -324,6 +324,7 @@ impl ParsedNamedOutputs { } } for NamedOutput { name, output } in named.into_iter() { + let name = OutputName(name); 
match output { OutputCollation::ConcatenateStdout => { Self::add_stdout(&mut seen_stdout, name, &mut seen_names, &mut concats)?; @@ -466,7 +467,7 @@ impl EntryReceiver for FilesystemReceiver { /* Ok(()) */ /* } */ - fn finalize_entries(&mut self) -> Result<(), CommandError> { + fn finalize_entries(&self) -> Result<(), CommandError> { #[cfg(unix)] { use std::{cmp::Reverse, os::unix::fs::PermissionsExt}; From bda400de451cbfc7bcd68bb52d6f5cd804ee13bd Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 01:38:41 -0400 Subject: [PATCH 21/68] ok it might even run correctly now? --- cli/src/extract.rs | 31 +++++- cli/src/extract/matcher.rs | 2 +- cli/src/extract/receiver.rs | 192 +++++++++++++++++------------------ cli/src/extract/transform.rs | 7 -- 4 files changed, 123 insertions(+), 109 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 5fc049929..d6eb46c0f 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, cell::RefCell, - io::{self, Read, Write}, + io::{Read, Write}, rc::Rc, }; @@ -13,7 +13,7 @@ mod receiver; mod transform; use entries::IterateEntries; use matcher::EntryMatcher; -use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryReceiver, ExtractEntry}; +use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryKind, EntryReceiver, ExtractEntry}; use transform::NameTransformer; pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), CommandError> { @@ -29,6 +29,22 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm let mut copy_buf: Vec = vec![0u8; 1024 * 16]; while let Some(mut entry) = entry_iterator.next_entry()? 
{ + let symlink_target: Option> = { + let (kind, size) = { + let data = EntryData::from_entry(&entry); + (data.kind, data.size) + }; + match kind { + EntryKind::Symlink => { + let mut target: Vec = Vec::with_capacity(size.try_into().unwrap()); + entry + .read_to_end(&mut target) + .wrap_err("failed to read symlink target from zip archive entry")?; + Some(target) + } + _ => None, + } + }; let data = EntryData::from_entry(&entry); let mut matching_concats: Vec>> = Vec::new(); @@ -86,10 +102,15 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm let mut matching_handles: Vec> = deduped_matching_extracts .into_iter() - .map(|(name, recv)| recv.generate_entry_handle(name)) - .collect::>()?; + .map(|(name, recv)| { + recv.generate_entry_handle(data, symlink_target.as_ref().map(|t| t.as_ref()), name) + }) + .collect::, _>>()? + .into_iter() + .flatten() + .collect(); - let mut read_len: usize = 0; + let mut read_len: usize; loop { read_len = entry.read(&mut copy_buf).wrap_err("read of entry failed")?; if read_len == 0 { diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index b8e2ca35b..641850d9f 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -3,7 +3,7 @@ use std::path::Path; use glob; use regex; -use zip::{read::ZipFile, CompressionMethod}; +use zip::CompressionMethod; use super::receiver::{EntryData, EntryKind}; use crate::{args::extract::*, CommandError}; diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 020b2e9da..e0d8d47d5 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -2,8 +2,8 @@ use std::{ borrow::Cow, cell::RefCell, collections::{HashMap, HashSet}, - env, fs, - io::{self, Read, Seek, Write}, + fs, + io::{self, Seek, Write}, mem, path::PathBuf, rc::Rc, @@ -60,9 +60,9 @@ impl OutputName { } pub struct ParsedEntrySpecArg { - matcher: Option, - transforms: Option, - output_name: OutputName, + pub matcher: Option, + pub transforms: 
Option, + pub output_name: OutputName, } impl ParsedEntrySpecArg { @@ -119,11 +119,17 @@ pub fn process_entry_and_output_specs( entry_specs: impl IntoIterator, output_specs: OutputSpecs, ) -> Result, CommandError> { - let entry_specs: Vec = entry_specs + let mut entry_specs: Vec = entry_specs .into_iter() .map(ParsedEntrySpecArg::from_entry_spec) .collect::>()?; - assert!(!entry_specs.is_empty()); + if entry_specs.is_empty() { + entry_specs.push(ParsedEntrySpecArg { + matcher: None, + transforms: None, + output_name: OutputName::default_name(), + }); + } let parsed_outputs = ParsedNamedOutputs::from_output_specs(output_specs)?; parsed_outputs.process_entry_specs_for_outputs(entry_specs) } @@ -255,6 +261,12 @@ impl ParsedNamedOutputs { ))); } assert!(!extracts.contains_key(&name)); + + if mkdir { + fs::create_dir_all(&output_dir) + .wrap_err_with(|| format!("failed to create output directory {output_dir:?}"))?; + }; + let canon_path = output_dir .canonicalize() .wrap_err_with(|| format!("canonicalizing dir path {output_dir:?} failed"))?; @@ -265,11 +277,6 @@ impl ParsedNamedOutputs { } let handle: Rc = { - if mkdir { - fs::create_dir_all(&output_dir).wrap_err_with(|| { - format!("failed to create output directory {output_dir:?}") - })?; - }; let d = FilesystemReceiver::new(output_dir); Rc::new(d) }; @@ -357,8 +364,12 @@ impl ParsedNamedOutputs { } pub trait EntryReceiver { - fn generate_entry_handle<'s>(&self, name: Cow<'s, str>) - -> Result, CommandError>; + fn generate_entry_handle<'s>( + &self, + data: EntryData<'s>, + symlink_target: Option<&[u8]>, + name: Cow<'s, str>, + ) -> Result>, CommandError>; fn finalize_entries(&self) -> Result<(), CommandError>; } @@ -382,90 +393,79 @@ impl FilesystemReceiver { impl EntryReceiver for FilesystemReceiver { fn generate_entry_handle<'s>( &self, + data: EntryData<'s>, + symlink_target: Option<&[u8]>, name: Cow<'s, str>, - ) -> Result, CommandError> { - todo!("wow!") - } + ) -> Result>, CommandError> { + /* let mut err = 
self.err.borrow_mut(); */ + let full_output_path = self.output_dir.join(name.as_ref()); + /* writeln!( */ + /* err, */ + /* "receiving entry {} with name {name} and writing to path {full_output_path:?}", */ + /* entry.name() */ + /* ) */ + /* .unwrap(); */ - /* fn receive_entry<'a>( */ - /* &mut self, */ - /* entry: &mut ZipFile<'a>, */ - /* name: &str, */ - /* ) -> Result<(), CommandError> { */ - /* let mut err = self.err.borrow_mut(); */ - /* let full_output_path = self.output_dir.join(name); */ - /* writeln!( */ - /* err, */ - /* "receiving entry {} with name {name} and writing to path {full_output_path:?}", */ - /* entry.name() */ - /* ) */ - /* .unwrap(); */ - - /* #[cfg(unix)] */ - /* if let Some(mode) = entry.unix_mode() { */ - /* writeln!( */ - /* err, */ - /* "storing unix mode {mode} for path {full_output_path:?}" */ - /* ) */ - /* .unwrap(); */ - /* self.perms_to_set */ - /* .borrow_mut() */ - /* .push((full_output_path.clone(), mode)); */ - /* } */ - - /* if entry.is_dir() { */ - /* writeln!(err, "entry is directory, creating").unwrap(); */ - /* fs::create_dir_all(&full_output_path).wrap_err_with(|| { */ - /* format!("failed to create directory entry at {full_output_path:?}") */ - /* })?; */ - /* } else if entry.is_symlink() { */ - /* let mut target: Vec = Vec::with_capacity(entry.size().try_into().unwrap()); */ - /* entry.read_to_end(&mut target).wrap_err_with(|| { */ - /* format!( */ - /* "failed to read symlink target from zip archive entry {}", */ - /* entry.name() */ - /* ) */ - /* })?; */ - - /* #[cfg(unix)] */ - /* { */ - /* use std::{ */ - /* ffi::OsString, */ - /* os::unix::{ffi::OsStringExt, fs::symlink}, */ - /* }; */ - /* let target = OsString::from_vec(target); */ - /* writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); */ - /* symlink(&target, &full_output_path).wrap_err_with(|| { */ - /* format!( */ - /* "failed to create symlink at {full_output_path:?} with target {target:?}" */ - /* ) */ - /* })?; */ - /* } */ - /* 
#[cfg(not(unix))] */ - /* { */ - /* /\* FIXME: non-unix symlink extraction not yet supported! *\/ */ - /* todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); */ - /* } */ - /* } else { */ - /* writeln!(err, "entry is file, creating").unwrap(); */ - /* if let Some(containing_dir) = full_output_path.parent() { */ - /* fs::create_dir_all(containing_dir).wrap_err_with(|| { */ - /* format!("failed to create parent dirs for file at {full_output_path:?}") */ - /* })?; */ - /* } else { */ - /* writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); */ - /* } */ - /* let mut outfile = fs::File::create(&full_output_path) */ - /* .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; */ - /* io::copy(entry, &mut outfile).wrap_err_with(|| { */ - /* format!( */ - /* "failed to copy file contents from {} to {full_output_path:?}", */ - /* entry.name() */ - /* ) */ - /* })?; */ - /* } */ - /* Ok(()) */ - /* } */ + #[cfg(unix)] + if let Some(mode) = data.unix_mode { + /* writeln!( */ + /* err, */ + /* "storing unix mode {mode} for path {full_output_path:?}" */ + /* ) */ + /* .unwrap(); */ + self.perms_to_set + .borrow_mut() + .push((full_output_path.clone(), mode)); + } + + match data.kind { + EntryKind::Dir => { + /* writeln!(err, "entry is directory, creating").unwrap(); */ + fs::create_dir_all(&full_output_path).wrap_err_with(|| { + format!("failed to create directory entry at {full_output_path:?}") + })?; + } + EntryKind::Symlink => { + let target: Vec = symlink_target + .expect("we should have generated this") + .to_vec(); + + #[cfg(unix)] + { + use std::{ + ffi::OsString, + os::unix::{ffi::OsStringExt, fs::symlink}, + }; + let target = OsString::from_vec(target); + /* writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); */ + symlink(&target, &full_output_path).wrap_err_with(|| { + format!( + "failed to create symlink at {full_output_path:?} with target {target:?}" + ) + })?; + } + #[cfg(not(unix))] + { + /* 
FIXME: non-unix symlink extraction not yet supported! */ + todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); + } + } + EntryKind::File => { + /* writeln!(err, "entry is file, creating").unwrap(); */ + if let Some(containing_dir) = full_output_path.parent() { + fs::create_dir_all(containing_dir).wrap_err_with(|| { + format!("failed to create parent dirs for file at {full_output_path:?}") + })?; + } else { + /* writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); */ + } + let outfile = fs::File::create(&full_output_path) + .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; + return Ok(Some(Box::new(outfile))); + } + } + Ok(None) + } fn finalize_entries(&self) -> Result<(), CommandError> { #[cfg(unix)] diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index 2e1427816..cfa9df547 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -1,14 +1,7 @@ use std::{borrow::Cow, collections::VecDeque}; -use zip::read::ZipFile; - use crate::{args::extract::*, CommandError}; -use super::{ - matcher::process_component_selector, - receiver::{EntryData, EntryReceiver}, -}; - pub trait NameTransformer { type Arg where From 90e19516fe5bdbe66fa7a70d0327229291a6775e Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 01:52:12 -0400 Subject: [PATCH 22/68] make output files work --- cli/src/extract/receiver.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index e0d8d47d5..9e0bc2ae7 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -216,14 +216,6 @@ impl ParsedNamedOutputs { ))); } assert!(!concats.contains_key(&name)); - let canon_path = path - .canonicalize() - .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; - if seen_files.contains(&canon_path) { - return 
Err(CommandError::InvalidArg(format!( - "canonical output file path {canon_path:?} provided more than once" - ))); - } let handle: Rc> = { let mut f: fs::File = if append { @@ -241,6 +233,15 @@ impl ParsedNamedOutputs { Rc::new(RefCell::new(f)) }; + let canon_path = path + .canonicalize() + .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; + if seen_files.contains(&canon_path) { + return Err(CommandError::InvalidArg(format!( + "canonical output file path {canon_path:?} provided more than once" + ))); + } + assert!(seen_files.insert(canon_path)); assert!(seen_names.insert(name.clone())); assert!(concats.insert(name, handle).is_none()); From 559691db8a4296727c7398f0c977c1bc45a8bea7 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:11:42 -0400 Subject: [PATCH 23/68] default to regexp for replacement and disallow globs from replacement --- cli/src/args/extract.rs | 113 ++++++++++++++++++++++++++++------------ 1 file changed, 81 insertions(+), 32 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index e34701c3f..b37658a3e 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -30,9 +30,8 @@ impl ComponentSelector { } } -#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)] pub enum PatternSelectorType { - #[default] Glob, Literal, Regexp, @@ -47,6 +46,14 @@ impl PatternSelectorType { _ => None, } } + + pub const fn default_for_match() -> Self { + Self::Glob + } + + pub const fn default_for_replacement() -> Self { + Self::Regexp + } } #[derive(Debug)] @@ -71,7 +78,7 @@ pub struct PatternModifiers { pub multiple_matches: bool, } -#[derive(Debug, Default)] +#[derive(Debug)] pub struct PatternSelector { pub pat_sel: PatternSelectorType, pub modifiers: PatternModifiers, @@ -112,9 +119,36 @@ impl PatternSelector { } } } + + pub fn default_for_context(ctx: 
PatternContext) -> Self { + match ctx { + PatternContext::Match => Self::default_for_match(), + PatternContext::Replacement => Self::default_for_replacement(), + } + } + + pub fn default_for_match() -> Self { + Self { + pat_sel: PatternSelectorType::default_for_match(), + modifiers: PatternModifiers::default(), + } + } + + pub fn default_for_replacement() -> Self { + Self { + pat_sel: PatternSelectorType::default_for_replacement(), + modifiers: PatternModifiers::default(), + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum PatternContext { + Match, + Replacement, } -pub fn parse_only_pat_sel(s: &[u8]) -> Option { +pub fn parse_only_pat_sel(s: &[u8], ctx: PatternContext) -> Option { match s.iter().position(|c| *c == b':') { Some(pat_sel_ind) => { let pat_sel_str = &s[(pat_sel_ind + 1)..]; @@ -122,11 +156,14 @@ pub fn parse_only_pat_sel(s: &[u8]) -> Option { let pat_sel = PatternSelector::parse(pat_sel_str)?; Some(pat_sel) } - None => Some(PatternSelector::default()), + None => Some(PatternSelector::default_for_context(ctx)), } } -pub fn parse_comp_and_pat_sel(s: &[u8]) -> Option<(ComponentSelector, PatternSelector)> { +pub fn parse_comp_and_pat_sel( + s: &[u8], + ctx: PatternContext, +) -> Option<(ComponentSelector, PatternSelector)> { match ( s.iter().position(|c| *c == b'='), s.iter().position(|c| *c == b':'), @@ -146,7 +183,7 @@ pub fn parse_comp_and_pat_sel(s: &[u8]) -> Option<(ComponentSelector, PatternSel let comp_sel_str = &s[(comp_sel_ind + 1)..]; let comp_sel = ComponentSelector::parse(comp_sel_str)?; - let pat_sel = PatternSelector::default(); + let pat_sel = PatternSelector::default_for_context(ctx); Some((comp_sel, pat_sel)) } (None, Some(pat_sel_ind)) => { @@ -158,7 +195,7 @@ pub fn parse_comp_and_pat_sel(s: &[u8]) -> Option<(ComponentSelector, PatternSel } (None, None) => { let comp_sel = ComponentSelector::default(); - let pat_sel = PatternSelector::default(); + let pat_sel = 
PatternSelector::default_for_context(ctx); Some((comp_sel, pat_sel)) } } @@ -554,7 +591,7 @@ impl MatchExpression { )) })?; let comp_sel = ComponentSelector::default(); - let pat_sel = PatternSelector::default(); + let pat_sel = PatternSelector::default_for_context(PatternContext::Match); top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Match(MatchArg { comp_sel, pat_sel, @@ -563,11 +600,13 @@ impl MatchExpression { } arg_bytes if arg_bytes.starts_with(b"--match") => { let (comp_sel, pat_sel) = - parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { - Extract::exit_arg_invalid(&format!( - "invalid --match argument modifiers: {arg:?}" - )) - })?; + parse_comp_and_pat_sel(arg_bytes, PatternContext::Match).ok_or_else( + || { + Extract::exit_arg_invalid(&format!( + "invalid --match argument modifiers: {arg:?}" + )) + }, + )?; if pat_sel.modifiers.multiple_matches { return Err(Extract::exit_arg_invalid(&format!( "multimatch modifier :g is unused in match expressions: {arg:?}" @@ -1150,9 +1189,6 @@ These results are dependent on the entry data: the string argument is interpreted into a string matching predicate against the entry name. - TODO: this flag is not yet supported and will produce an error. - - ## Name transforms (name-transform): Name transforms modify the entry name before writing the entry to the @@ -1211,9 +1247,6 @@ entry itself. entry may be matched more than once. In this case, the entry's content will be extracted more than once over the execution of this command. -TODO: multiple entry specs with content transforms that extract output more than once require entry -teeing, which is not yet supported, so will produce an error. 
- -x, --extract[=] Decompress the entry's contents (if necessary) before writing it to the named output , or the default output if the receiver name is @@ -1245,11 +1278,14 @@ comp-sel = path [DEFAULT] (match full entry) = ext (match only the file extension, if available) ### Pattern selector (pat-sel): -pat-sel = glob [DEFAULT] (interpret as a shell glob) +pat-sel = glob [DEFAULT for matching] (interpret as a shell glob) = lit (interpret as literal string) - = rx (interpret as a regular expression) + = rx [DEFAULT for replacement] (interpret as a regular expression) = (apply search modifiers from ) +*Note:* glob patterns are not supported for replacement, and attempting to use +them with e.g '--transform:glob' will produce an error. + #### Pattern modifiers (pat-mod): pat-mod = :i (use case-insensitive matching for the given pattern) = :g (use multi-match behavior for string replacements) @@ -1378,11 +1414,18 @@ Positional paths: } arg_bytes if arg_bytes.starts_with(b"--transform") => { let (comp_sel, pat_sel) = - parse_comp_and_pat_sel(arg_bytes).ok_or_else(|| { - Self::exit_arg_invalid(&format!( - "invalid --transform argument modifiers: {arg:?}" - )) - })?; + parse_comp_and_pat_sel(arg_bytes, PatternContext::Replacement).ok_or_else( + || { + Self::exit_arg_invalid(&format!( + "invalid --transform argument modifiers: {arg:?}" + )) + }, + )?; + if pat_sel.pat_sel == PatternSelectorType::Glob { + return Err(Self::exit_arg_invalid(&format!( + ":glob pattern type is unsupported in transform expressions: {arg:?}" + ))); + } let pattern = argv .pop_front() .ok_or_else(|| { @@ -1417,11 +1460,17 @@ Positional paths: ))); } arg_bytes if arg_bytes.starts_with(b"--remove-prefix") => { - let pat_sel = parse_only_pat_sel(arg_bytes).ok_or_else(|| { - Self::exit_arg_invalid(&format!( - "invalid --remove-prefix argument modifiers: {arg:?}" - )) - })?; + let pat_sel = parse_only_pat_sel(arg_bytes, PatternContext::Replacement) + .ok_or_else(|| { + Self::exit_arg_invalid(&format!( 
+ "invalid --remove-prefix argument modifiers: {arg:?}" + )) + })?; + if pat_sel.pat_sel == PatternSelectorType::Glob { + return Err(Self::exit_arg_invalid(&format!( + ":glob pattern type is unsupported in transform expressions: {arg:?}" + ))); + } let pattern = argv .pop_front() .ok_or_else(|| { From 8127539d28b9be349036a2970791fcecc7e9a022 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:57:02 -0400 Subject: [PATCH 24/68] add --{min,max}-size match exprs --- cli/src/args/extract.rs | 72 +++++++++++++++++++++++++++++++++++--- cli/src/extract/matcher.rs | 28 +++++++++++++++ 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index b37658a3e..97628899c 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -309,6 +309,12 @@ pub enum DepthLimitArg { Min(u8), } +#[derive(Debug)] +pub enum SizeArg { + Max(u64), + Min(u64), +} + #[derive(Debug)] pub struct MatchArg { pub comp_sel: ComponentSelector, @@ -328,6 +334,7 @@ pub enum Predicate { EntryType(EntryType), CompressionMethod(CompressionMethodArg), DepthLimit(DepthLimitArg), + Size(SizeArg), Match(MatchArg), } @@ -551,7 +558,7 @@ impl MatchExpression { .parse::() .map_err(|e| { Extract::exit_arg_invalid(&format!( - "failed to parse --max-depth arg {e:?} as u8" + "failed to parse --max-depth arg as u8: {e:?}" )) })?; top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( @@ -573,13 +580,57 @@ impl MatchExpression { .parse::() .map_err(|e| { Extract::exit_arg_invalid(&format!( - "failed to parse --min-depth arg {e:?} as u8" + "failed to parse --min-depth arg as u8: {e:?}" )) })?; top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::DepthLimit( DepthLimitArg::Min(min_depth), ))); } + b"--max-size" => { + let max_size: u64 = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --max-size") + })? 
+ .into_string() + .map_err(|size_arg| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --max-size: {size_arg:?}" + )) + })? + .parse::() + .map_err(|e| { + Extract::exit_arg_invalid(&format!( + "failed to parse --max-size arg as u64: {e:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Size(SizeArg::Max( + max_size, + )))); + } + b"--min-size" => { + let min_size: u64 = argv + .pop_front() + .ok_or_else(|| { + Extract::exit_arg_invalid("no argument provided for --min-size") + })? + .into_string() + .map_err(|size_arg| { + Extract::exit_arg_invalid(&format!( + "invalid unicode provided for --min-size: {size_arg:?}" + )) + })? + .parse::() + .map_err(|e| { + Extract::exit_arg_invalid(&format!( + "failed to parse --min-size arg as u64: {e:?}" + )) + })?; + top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::Size(SizeArg::Min( + min_size, + )))); + } b"-m" => { let pattern: String = argv .pop_front() @@ -1178,9 +1229,22 @@ These results are dependent on the entry data: special handling of entries compressed with an unsupported method. --max-depth - Match entries with at *most* components of their containing directory. + Match entries with at *most* components of their + containing directory. --min-depth - Match entries with at *least* components of their containing directory. + Match entries with at *least* components of their + containing directory. + + --max-size + Match entries of at *most* in *uncompressed* size. + --min-size + Match entries of at *least* in *uncompressed* size. + + Directory entries are 0 bytes in size, and symlink entries are the + size required to store their target. + + TODO: Abbrevations such as 1k, 1M are not currently supported; the + precise byte number must be provided, parseable as a u64. -m, --match[=][:] Return true for entries whose name matches . 
diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index 641850d9f..b2ff1c2f7 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -262,6 +262,33 @@ impl EntryMatcher for DepthLimit { } } +#[derive(Copy, Clone)] +enum Size { + Max(u64), + Min(u64), +} + +impl EntryMatcher for Size { + type Arg = SizeArg where Self: Sized; + + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + Ok(match arg { + SizeArg::Max(max) => Self::Max(max), + SizeArg::Min(min) => Self::Min(min), + }) + } + + fn matches(&self, entry: &EntryData) -> bool { + match self { + Self::Max(max) => entry.size <= *max, + Self::Min(min) => entry.size >= *min, + } + } +} + struct PatternMatcher { matcher: Box, comp_sel: ComponentSelector, @@ -322,6 +349,7 @@ impl CompiledMatcher { CompressionMethodArg::Specific(arg) => Box::new(SpecificMethods::from_arg(arg)?), }, Predicate::DepthLimit(arg) => Box::new(DepthLimit::from_arg(arg)?), + Predicate::Size(arg) => Box::new(Size::from_arg(arg)?), Predicate::Match(arg) => Box::new(PatternMatcher::from_arg(arg)?), })) } From c34fbdfba76d71890383d88b883a30e1406297ea Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:28:16 -0400 Subject: [PATCH 25/68] impl pattern transformers --- cli/src/args/extract.rs | 4 +- cli/src/extract/transform.rs | 212 ++++++++++++++++++++++++++++++++++- 2 files changed, 211 insertions(+), 5 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index 97628899c..984e39b87 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -1225,8 +1225,8 @@ These results are dependent on the entry data: - stored: uncompressed - deflated: with deflate {}{}{}{}{} - Using e.g. '-not --compression-method known' as a filter enables - special handling of entries compressed with an unsupported method. + Using e.g. 
'--compression-method known' as a match expression filters + entries to only those which can be successfully decompressed. --max-depth Match entries with at *most* components of their diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index cfa9df547..329fa858f 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -1,5 +1,7 @@ use std::{borrow::Cow, collections::VecDeque}; +use regex; + use crate::{args::extract::*, CommandError}; pub trait NameTransformer { @@ -96,6 +98,210 @@ impl NameTransformer for AddPrefix { } } +trait PatternTransformer { + type Replacement + where + Self: Sized; + fn create( + pattern: &str, + opts: PatternModifiers, + rep: Self::Replacement, + ) -> Result + where + Self: Sized; + + fn replace<'s>(&self, input: &'s str) -> Cow<'s, str>; +} + +struct LiteralTransformer { + lit: String, + case_insensitive: bool, + multiple_matches: bool, + rep: String, +} + +impl LiteralTransformer { + fn format_single_replacement( + input: &str, + lit_len: usize, + rep: &str, + match_index: usize, + ) -> String { + debug_assert!(lit_len > 0); + debug_assert!(input.len() > 0); + debug_assert!(rep.len() > 0); + format!( + "{}{}{}", + &input[..match_index], + rep, + &input[(match_index + lit_len)..] + ) + } + + fn replace_single_exact<'s>(input: &'s str, lit: &str, rep: &str) -> Cow<'s, str> { + match input.find(lit) { + None => Cow::Borrowed(input), + Some(i) => Cow::Owned(Self::format_single_replacement(input, lit.len(), rep, i)), + } + } + + fn replace_single_icase<'s>(input: &'s str, lit: &str, rep: &str) -> Cow<'s, str> { + /* NB: literal was already changed to uppercase upon construction in Self::create()! 
*/ + match input.to_ascii_uppercase().find(&lit) { + None => Cow::Borrowed(input), + Some(i) => Cow::Owned(Self::format_single_replacement(input, lit.len(), rep, i)), + } + } + + fn format_multiple_replacements( + input: &str, + lit_len: usize, + rep: &str, + match_indices: Vec, + ) -> String { + debug_assert!(lit_len > 0); + debug_assert!(input.len() > 0); + debug_assert!(rep.len() > 0); + let expected_len: usize = + input.len() - (lit_len * match_indices.len()) + (rep.len() * match_indices.len()); + let mut ret = String::with_capacity(expected_len); + let mut last_source_position: usize = 0; + for i in match_indices.into_iter() { + ret.push_str(&input[last_source_position..i]); + ret.push_str(rep); + last_source_position = i + lit_len; + } + assert_eq!(ret.len(), expected_len); + ret + } + + fn replace_multiple_exact<'s>(input: &'s str, lit: &str, rep: &str) -> Cow<'s, str> { + let match_indices: Vec = input.match_indices(lit).map(|(i, _)| i).collect(); + if match_indices.is_empty() { + return Cow::Borrowed(input); + } + Cow::Owned(Self::format_multiple_replacements( + input, + lit.len(), + rep, + match_indices, + )) + } + + fn replace_multiple_icase<'s>(input: &'s str, lit: &str, rep: &str) -> Cow<'s, str> { + let match_indices: Vec = input + .to_ascii_uppercase() + /* NB: literal was already changed to uppercase upon construction in Self::create()! 
*/ + .match_indices(&lit) + .map(|(i, _)| i) + .collect(); + if match_indices.is_empty() { + return Cow::Borrowed(input); + } + Cow::Owned(Self::format_multiple_replacements( + input, + lit.len(), + rep, + match_indices, + )) + } +} + +impl PatternTransformer for LiteralTransformer { + type Replacement = String where Self: Sized; + fn create( + pattern: &str, + opts: PatternModifiers, + rep: Self::Replacement, + ) -> Result + where + Self: Sized, + { + let PatternModifiers { + case_insensitive, + multiple_matches, + } = opts; + Ok(Self { + lit: match case_insensitive { + false => pattern.to_string(), + true => pattern.to_ascii_uppercase(), + }, + case_insensitive, + multiple_matches, + rep, + }) + } + + fn replace<'s>(&self, input: &'s str) -> Cow<'s, str> { + /* Empty replacement or literal is allowed, it just does nothing. */ + if self.lit.is_empty() || self.rep.is_empty() || input.is_empty() { + return Cow::Borrowed(input); + } + match self.multiple_matches { + false => match self.case_insensitive { + /* Single replacement, case-sensitive (exact) match: */ + false => Self::replace_single_exact(input, &self.lit, &self.rep), + /* Single replacement, case-insensitive match: */ + true => Self::replace_single_icase(input, &self.lit, &self.rep), + }, + true => match self.case_insensitive { + /* Multiple replacements, case-sensitive (exact) match: */ + false => Self::replace_multiple_exact(input, &self.lit, &self.rep), + /* Multiple replacements, case-insensitive match: */ + true => Self::replace_multiple_icase(input, &self.lit, &self.rep), + }, + } + } +} + +struct RegexpTransformer { + pat: regex::Regex, + multiple_matches: bool, + rep: String, +} + +impl PatternTransformer for RegexpTransformer { + type Replacement = String where Self: Sized; + fn create( + pattern: &str, + opts: PatternModifiers, + rep: Self::Replacement, + ) -> Result + where + Self: Sized, + { + let PatternModifiers { + case_insensitive, + multiple_matches, + } = opts; + let pat = 
regex::RegexBuilder::new(pattern) + .case_insensitive(case_insensitive) + .build() + .map_err(|e| { + CommandError::InvalidArg(format!( + "failed to construct regex replacer from search pattern {pattern:?}: {e}" + )) + })?; + Ok(Self { + pat, + multiple_matches, + rep, + }) + } + + fn replace<'s>(&self, input: &'s str) -> Cow<'s, str> { + match self.multiple_matches { + false => self.pat.replace(input, &self.rep), + true => self.pat.replace_all(input, &self.rep), + } + } +} + +/* struct ComponentTransformer { */ +/* pattern_trans: Box, */ +/* comp_sel: ComponentSelector, */ +/* } */ + pub struct CompiledTransformer { transformers: Vec>, } @@ -109,12 +315,12 @@ impl CompiledTransformer { BasicTransform::AddPrefix(arg) => Box::new(AddPrefix::from_arg(arg)?), }, NameTransform::Complex(complex_trans) => match complex_trans { - ComplexTransform::RemovePrefix(remove_prefix_arg) => { - todo!("impl remove prefix: {:?}", remove_prefix_arg) - } ComplexTransform::Transform(transform_arg) => { todo!("impl transform: {:?}", transform_arg) } + ComplexTransform::RemovePrefix(remove_prefix_arg) => { + todo!("impl remove prefix: {:?}", remove_prefix_arg) + } }, }) } From 764250eaa7936e8497b9089a370bd181df7d8550 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 17:51:03 -0400 Subject: [PATCH 26/68] support --transform!! --- cli/src/args/extract.rs | 4 +- cli/src/extract.rs | 2 + cli/src/extract/matcher.rs | 21 ++-- cli/src/extract/transform.rs | 211 +++++++++++++++++++++++++++++++++-- 4 files changed, 214 insertions(+), 24 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index 984e39b87..cb0149bac 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -1286,8 +1286,6 @@ Complex: These transformers perform complex pattern matching and replacement upon the entry name string: -TODO: these flags are not yet supported and will produce an error. 
- --transform[=][:] Extract the portion of the entry name corresponding to , search it against corresponding to , and then @@ -1301,6 +1299,8 @@ TODO: these flags are not yet supported and will produce an error. Equivalent to "--transform=path: ''", except the search is anchored at the beginning of the string. + TODO: this flag is not yet supported and will produce an error. + ## Content transforms (content-transform): diff --git a/cli/src/extract.rs b/cli/src/extract.rs index d6eb46c0f..a0589ee72 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -66,6 +66,8 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm .as_ref() .map(|t| t.transform_name(&data.name)) .unwrap_or_else(|| Cow::Borrowed(&data.name)); + writeln!(&mut err, "{data:?}").unwrap(); + writeln!(&mut err, "{new_name:?}").unwrap(); matching_extracts.push((new_name, recv.clone())); } } diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index b2ff1c2f7..c1a5f6753 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -1,26 +1,19 @@ -use std::path::Path; - use glob; use regex; use zip::CompressionMethod; use super::receiver::{EntryData, EntryKind}; +use super::transform::ComponentSplit; use crate::{args::extract::*, CommandError}; #[inline(always)] -pub fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { - let path = Path::new(name); - match sel { - ComponentSelector::Path => Some(name), - ComponentSelector::Basename => path.file_name().map(|bname| bname.to_str().unwrap()), - ComponentSelector::Dirname => path - .parent() - .map(|p| p.to_str().unwrap()) - /* "a".parent() becomes Some(""), which we want to treat as no parent */ - .filter(|s| !s.is_empty()), - ComponentSelector::FileExtension => path.extension().map(|ext| ext.to_str().unwrap()), - } +fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Option<&'s str> { + 
ComponentSplit::split_by_component_selector(sel, name).map(|split| match split { + ComponentSplit::LeftAnchored { selected_left, .. } => selected_left, + ComponentSplit::RightAnchored { selected_right, .. } => selected_right, + ComponentSplit::Whole(s) => s, + }) } trait NameMatcher { diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index 329fa858f..5d3864a28 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -1,4 +1,4 @@ -use std::{borrow::Cow, collections::VecDeque}; +use std::{borrow::Cow, collections::VecDeque, path::Path, slice, str}; use regex; @@ -297,10 +297,207 @@ impl PatternTransformer for RegexpTransformer { } } -/* struct ComponentTransformer { */ -/* pattern_trans: Box, */ -/* comp_sel: ComponentSelector, */ -/* } */ +pub enum ComponentSplit<'s> { + LeftAnchored { + selected_left: &'s str, + right: &'s str, + }, + RightAnchored { + left: &'s str, + selected_right: &'s str, + }, + Whole(&'s str), +} + +impl<'s> ComponentSplit<'s> { + #[inline(always)] + pub fn split_by_component_selector(sel: ComponentSelector, name: &'s str) -> Option { + let path = Path::new(name); + match sel { + ComponentSelector::Path => Some(ComponentSplit::Whole(name)), + ComponentSelector::Basename => path + .file_name() + .map(|bname| bname.to_str().unwrap()) + .map(|bname| name.split_at(name.len() - bname.len())) + .map(|(pfx, bname)| ComponentSplit::RightAnchored { + left: pfx, + selected_right: bname, + }), + ComponentSelector::Dirname => path + .parent() + .map(|p| p.to_str().unwrap()) + /* "a".parent() becomes Some(""), which we want to treat as no parent */ + .filter(|s| !s.is_empty()) + .map(|dirname| name.split_at(dirname.len())) + .map(|(dirname, sfx)| ComponentSplit::LeftAnchored { + selected_left: dirname, + right: sfx, + }), + ComponentSelector::FileExtension => path + .extension() + .map(|ext| ext.to_str().unwrap()) + .map(|ext| name.split_at(name.len() - ext.len())) + .map(|(pfx, ext)| 
ComponentSplit::RightAnchored { + left: pfx, + selected_right: ext, + }), + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +enum SubstringAnchoring { + RetainsLeftAnchor, + RetainsRightAnchor, + RetainsBothAnchors, + LosesBothAnchors, +} + +impl SubstringAnchoring { + #[inline(always)] + pub fn analyze<'s, 't>(parent: &'s str, sub: &'t str) -> Self + where + 't: 's, + { + let p = parent.as_bytes().as_ptr_range(); + let s = sub.as_bytes().as_ptr_range(); + assert!(s.start >= p.start); + assert!(s.end <= p.end); + if p.start == s.start { + if p.end == s.end { + debug_assert_eq!(parent, sub); + Self::RetainsBothAnchors + } else { + Self::RetainsLeftAnchor + } + } else { + if p.end == s.end { + Self::RetainsRightAnchor + } else { + Self::LosesBothAnchors + } + } + } + + #[inline(always)] + pub fn split_then_transform_then_reformulate<'s>( + input: &'s str, + split: impl FnOnce(&'s str) -> Option>, + transform: impl FnOnce(&'s str) -> Cow<'s, str>, + ) -> Cow<'s, str> { + let components = match split(input) { + /* If the given name doesn't have the specified component, return it unchanged. */ + None => return Cow::Borrowed(input), + Some(s) => s, + }; + match components { + /* If there was no splitting (the whole path was selected), then we don't need to do + * any work to hook things back up! */ + ComponentSplit::Whole(s) => transform(s), + /* If there was splitting, we need to do more work. */ + ComponentSplit::LeftAnchored { + selected_left, + right, + } => match transform(selected_left) { + /* If we reallocated, then we have to reallocate the whole thing, so reuse the + * returned String. */ + Cow::Owned(mut new_left) => { + new_left.push_str(right); + Cow::Owned(new_left) + } + /* If no reallocation, we now have to figure out whether the result is still + * contiguous. 
*/ + Cow::Borrowed(left_sub) => match Self::analyze(selected_left, left_sub) { + Self::RetainsBothAnchors => Cow::Borrowed(input), + Self::RetainsRightAnchor => { + Cow::Borrowed(Self::join_adjacent_strings(left_sub, right)) + } + _ => Cow::Owned(format!("{}{}", left_sub, right)), + }, + }, + ComponentSplit::RightAnchored { + left, + selected_right, + } => match transform(selected_right) { + Cow::Owned(mut new_right) => { + new_right.insert_str(0, left); + Cow::Owned(new_right) + } + Cow::Borrowed(right_sub) => match Self::analyze(selected_right, right_sub) { + Self::RetainsBothAnchors => Cow::Borrowed(input), + Self::RetainsLeftAnchor => { + Cow::Borrowed(Self::join_adjacent_strings(left, right_sub)) + } + _ => Cow::Owned(format!("{}{}", left, right_sub)), + }, + }, + } + } + + #[inline(always)] + fn join_adjacent_strings<'s>(left: &'s str, right: &'s str) -> &'s str { + assert!(left.len() + right.len() <= isize::MAX as usize); + let left = left.as_bytes().as_ptr_range(); + let right = right.as_bytes().as_ptr_range(); + assert_eq!(left.end, right.start); + let start: *const u8 = left.start; + let end: *const u8 = right.end; + unsafe { + let len: usize = end.offset_from(start) as usize; + let joined_slice = slice::from_raw_parts(start, len); + str::from_utf8_unchecked(joined_slice) + } + } +} + +struct ComponentTransformer { + pattern_trans: Box, + comp_sel: ComponentSelector, +} + +impl NameTransformer for ComponentTransformer { + type Arg = TransformArg where Self: Sized; + fn from_arg(arg: Self::Arg) -> Result + where + Self: Sized, + { + let TransformArg { + comp_sel, + pat_sel: PatternSelector { pat_sel, modifiers }, + pattern, + replacement_spec, + } = arg; + + let pattern_trans: Box = match pat_sel { + PatternSelectorType::Glob => { + unreachable!("glob patterns are not supported for name transformations") + } + PatternSelectorType::Literal => Box::new(LiteralTransformer::create( + &pattern, + modifiers, + replacement_spec, + )?), + 
PatternSelectorType::Regexp => Box::new(RegexpTransformer::create( + &pattern, + modifiers, + replacement_spec, + )?), + }; + + Ok(Self { + pattern_trans, + comp_sel, + }) + } + + fn transform_name<'s>(&self, name: &'s str) -> Cow<'s, str> { + SubstringAnchoring::split_then_transform_then_reformulate( + name, + move |name| ComponentSplit::split_by_component_selector(self.comp_sel, name), + |name| self.pattern_trans.replace(name), + ) + } +} pub struct CompiledTransformer { transformers: Vec>, @@ -315,9 +512,7 @@ impl CompiledTransformer { BasicTransform::AddPrefix(arg) => Box::new(AddPrefix::from_arg(arg)?), }, NameTransform::Complex(complex_trans) => match complex_trans { - ComplexTransform::Transform(transform_arg) => { - todo!("impl transform: {:?}", transform_arg) - } + ComplexTransform::Transform(arg) => Box::new(ComponentTransformer::from_arg(arg)?), ComplexTransform::RemovePrefix(remove_prefix_arg) => { todo!("impl remove prefix: {:?}", remove_prefix_arg) } From 0dc57d2ee68b57e6a06837b14f1d6ff20eb68233 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:25:52 -0400 Subject: [PATCH 27/68] anchoring, prefixes, the whole shebang. 
i think we're done here --- cli/src/args/extract.rs | 85 ++++------- cli/src/extract/matcher.rs | 160 ++++++++++++++++---- cli/src/extract/transform.rs | 281 +++++++++++++++++++++++------------ 3 files changed, 348 insertions(+), 178 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index cb0149bac..6f57ceee1 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -60,6 +60,8 @@ impl PatternSelectorType { pub enum PatternSelectorModifier { CaseInsensitive, MultipleMatches, + PrefixAnchored, + SuffixAnchored, } impl PatternSelectorModifier { @@ -67,21 +69,25 @@ impl PatternSelectorModifier { match s { b"i" => Some(Self::CaseInsensitive), b"g" => Some(Self::MultipleMatches), + b"p" => Some(Self::PrefixAnchored), + b"s" => Some(Self::SuffixAnchored), _ => None, } } } #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct PatternModifiers { +pub struct PatternModifierFlags { pub case_insensitive: bool, pub multiple_matches: bool, + pub prefix_anchored: bool, + pub suffix_anchored: bool, } #[derive(Debug)] pub struct PatternSelector { pub pat_sel: PatternSelectorType, - pub modifiers: PatternModifiers, + pub modifiers: PatternModifierFlags, } impl PatternSelector { @@ -93,7 +99,7 @@ impl PatternSelector { let pat_sel = PatternSelectorType::parse(pat_sel_str)?; - let mut modifiers = PatternModifiers::default(); + let mut modifiers = PatternModifierFlags::default(); let mod_els = modifiers_str .split(|c| *c == b':') .map(PatternSelectorModifier::parse) @@ -106,6 +112,12 @@ impl PatternSelector { PatternSelectorModifier::MultipleMatches => { modifiers.multiple_matches = true; } + PatternSelectorModifier::PrefixAnchored => { + modifiers.prefix_anchored = true; + } + PatternSelectorModifier::SuffixAnchored => { + modifiers.suffix_anchored = true; + } } } Some(Self { pat_sel, modifiers }) @@ -130,14 +142,14 @@ impl PatternSelector { pub fn default_for_match() -> Self { Self { pat_sel: 
PatternSelectorType::default_for_match(), - modifiers: PatternModifiers::default(), + modifiers: PatternModifierFlags::default(), } } pub fn default_for_replacement() -> Self { Self { pat_sel: PatternSelectorType::default_for_replacement(), - modifiers: PatternModifiers::default(), + modifiers: PatternModifierFlags::default(), } } } @@ -658,11 +670,6 @@ impl MatchExpression { )) }, )?; - if pat_sel.modifiers.multiple_matches { - return Err(Extract::exit_arg_invalid(&format!( - "multimatch modifier :g is unused in match expressions: {arg:?}" - ))); - } let pattern: String = argv .pop_front() .ok_or_else(|| { @@ -750,16 +757,9 @@ pub struct TransformArg { pub replacement_spec: String, } -#[derive(Debug)] -pub struct RemovePrefixArg { - pub pat_sel: PatternSelector, - pub pattern: String, -} - #[derive(Debug)] pub enum ComplexTransform { Transform(TransformArg), - RemovePrefix(RemovePrefixArg), } #[derive(Debug)] @@ -1295,12 +1295,6 @@ entry name string: to numbered capture groups specified by . Otherwise, is interpreted as a literal string. - --remove-prefix[:] - Equivalent to "--transform=path: ''", except the - search is anchored at the beginning of the string. - - TODO: this flag is not yet supported and will produce an error. - ## Content transforms (content-transform): @@ -1353,12 +1347,19 @@ them with e.g '--transform:glob' will produce an error. #### Pattern modifiers (pat-mod): pat-mod = :i (use case-insensitive matching for the given pattern) = :g (use multi-match behavior for string replacements) + = :p (perform left-anchored "prefix" searches) + = :s (perform right-anchored "suffix" searches) -Pattern modifiers from (pat-mod) can be sequenced, e.g. ':i:g'. +Pattern modifiers from (pat-mod) can be sequenced, e.g. ':i:g'. If ':p' and ':s' +are provided together, the result is to perform a doubly-anchored match, against +the entire string. For regexp matching with ':rx', ':p' and ':s' are converted +to '^' or '$' anchors in the regexp pattern string. 
If the pattern string also +contains '^' or '$' as well, no error is produced. *Note:* not all pattern modifiers apply everywhere. In particular, ':g' only applies to string replacement, and using it for a match expression like -'--match:rx:g' will produce an error. +'--match:rx:g' will produce an error. Additionally, ':p' and ':s' are +incompatible with glob search and will produce an error. # Input arguments: Zip file inputs to extract from can be specified by streaming from stdin, or as @@ -1485,11 +1486,6 @@ Positional paths: )) }, )?; - if pat_sel.pat_sel == PatternSelectorType::Glob { - return Err(Self::exit_arg_invalid(&format!( - ":glob pattern type is unsupported in transform expressions: {arg:?}" - ))); - } let pattern = argv .pop_front() .ok_or_else(|| { @@ -1523,35 +1519,6 @@ Positional paths: }), ))); } - arg_bytes if arg_bytes.starts_with(b"--remove-prefix") => { - let pat_sel = parse_only_pat_sel(arg_bytes, PatternContext::Replacement) - .ok_or_else(|| { - Self::exit_arg_invalid(&format!( - "invalid --remove-prefix argument modifiers: {arg:?}" - )) - })?; - if pat_sel.pat_sel == PatternSelectorType::Glob { - return Err(Self::exit_arg_invalid(&format!( - ":glob pattern type is unsupported in transform expressions: {arg:?}" - ))); - } - let pattern = argv - .pop_front() - .ok_or_else(|| { - Self::exit_arg_invalid( - "no argument provided for --remove-prefix", - ) - })? - .into_string() - .map_err(|pattern| { - Self::exit_arg_invalid(&format!( - "invalid unicode provided for --remove-prefix : {pattern:?}" - )) - })?; - args.push(ExtractArg::NameTransform(NameTransform::Complex( - ComplexTransform::RemovePrefix(RemovePrefixArg { pat_sel, pattern }), - ))); - } /* Try parsing match specs! 
*/ b"--expr" => { diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index c1a5f6753..1d7a69e42 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + use glob; use regex; @@ -16,8 +18,85 @@ fn process_component_selector<'s>(sel: ComponentSelector, name: &'s str) -> Opti }) } +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum SearchAnchoring { + #[default] + Unanchored, + LeftAnchored, + RightAnchored, + DoublyAnchored, +} + +impl SearchAnchoring { + pub const fn from_prefix_suffix_flags(prefix_anchored: bool, suffix_anchored: bool) -> Self { + match (prefix_anchored, suffix_anchored) { + (true, true) => Self::DoublyAnchored, + (true, false) => Self::LeftAnchored, + (false, true) => Self::RightAnchored, + (false, false) => Self::Unanchored, + } + } + + pub fn wrap_regex_pattern<'s>(self, pattern: &'s str) -> Cow<'s, str> { + match self { + Self::Unanchored => Cow::Borrowed(pattern), + Self::LeftAnchored => Cow::Owned(format!("^(?:{pattern})")), + Self::RightAnchored => Cow::Owned(format!("(?:{pattern})$")), + Self::DoublyAnchored => Cow::Owned(format!("^(?:{pattern})$")), + } + } +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CaseSensitivity { + #[default] + Sensitive, + Insensitive, +} + +impl CaseSensitivity { + pub const fn from_case_insensitive_flag(case_insensitive: bool) -> Self { + match case_insensitive { + true => Self::Insensitive, + false => Self::Sensitive, + } + } + + pub fn string_equal(self, a: &str, b: &str) -> bool { + match self { + Self::Insensitive => a.eq_ignore_ascii_case(b), + Self::Sensitive => a == b, + } + } +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MatchModifiers { + pub anchoring: SearchAnchoring, + pub case: CaseSensitivity, +} + +impl MatchModifiers { + pub fn from_flags(flags: PatternModifierFlags) -> Result { + let 
PatternModifierFlags { + case_insensitive, + multiple_matches, + prefix_anchored, + suffix_anchored, + } = flags; + if multiple_matches { + return Err(CommandError::InvalidArg(format!( + "multimatch modifier :g is unused in match expressions: {flags:?}" + ))); + } + let case = CaseSensitivity::from_case_insensitive_flag(case_insensitive); + let anchoring = SearchAnchoring::from_prefix_suffix_flags(prefix_anchored, suffix_anchored); + Ok(Self { anchoring, case }) + } +} + trait NameMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result + fn create(pattern: String, opts: MatchModifiers) -> Result where Self: Sized; fn matches(&self, input: &str) -> bool; @@ -25,28 +104,44 @@ trait NameMatcher { struct LiteralMatcher { lit: String, - case_insensitive: bool, + case: CaseSensitivity, + anchoring: SearchAnchoring, } impl NameMatcher for LiteralMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result + fn create(pattern: String, opts: MatchModifiers) -> Result where Self: Sized, { - let PatternModifiers { - case_insensitive, .. 
- } = opts; + let MatchModifiers { case, anchoring } = opts; Ok(Self { - lit: pattern.to_string(), - case_insensitive, + lit: match case { + CaseSensitivity::Sensitive => pattern, + CaseSensitivity::Insensitive => pattern.to_ascii_uppercase(), + }, + case, + anchoring, }) } fn matches(&self, input: &str) -> bool { - if self.case_insensitive { - self.lit.eq_ignore_ascii_case(input) - } else { - input == &self.lit + if input.len() < self.lit.len() { + return false; + } + match self.anchoring { + SearchAnchoring::Unanchored => match self.case { + CaseSensitivity::Insensitive => input.to_ascii_uppercase().contains(&self.lit), + CaseSensitivity::Sensitive => input.contains(&self.lit), + }, + SearchAnchoring::DoublyAnchored => self.case.string_equal(&self.lit, input), + SearchAnchoring::LeftAnchored => { + let prefix = &input[..self.lit.len()]; + self.case.string_equal(&self.lit, prefix) + } + SearchAnchoring::RightAnchored => { + let suffix = &input[(input.len() - self.lit.len())..]; + self.case.string_equal(&self.lit, suffix) + } } } } @@ -57,18 +152,24 @@ struct GlobMatcher { } impl NameMatcher for GlobMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result + fn create(pattern: String, opts: MatchModifiers) -> Result where Self: Sized, { - let PatternModifiers { - case_insensitive, .. 
- } = opts; + let MatchModifiers { anchoring, case } = opts; + if !matches!(anchoring, SearchAnchoring::Unanchored) { + return Err(CommandError::InvalidArg(format!( + "anchored search with :p or :s is incompatible with glob patterns: {opts:?}" + ))); + } let glob_opts = glob::MatchOptions { - case_sensitive: !case_insensitive, + case_sensitive: match case { + CaseSensitivity::Sensitive => true, + CaseSensitivity::Insensitive => false, + }, ..Default::default() }; - let pat = glob::Pattern::new(pattern).map_err(|e| { + let pat = glob::Pattern::new(&pattern).map_err(|e| { CommandError::InvalidArg(format!( "failed to construct glob matcher from pattern {pattern:?}: {e}" )) @@ -86,15 +187,19 @@ struct RegexMatcher { } impl NameMatcher for RegexMatcher { - fn create(pattern: &str, opts: PatternModifiers) -> Result + fn create(pattern: String, opts: MatchModifiers) -> Result where Self: Sized, { - let PatternModifiers { - case_insensitive, .. - } = opts; - let pat = regex::RegexBuilder::new(pattern) - .case_insensitive(case_insensitive) + let MatchModifiers { case, anchoring } = opts; + + let pattern = anchoring.wrap_regex_pattern(&pattern); + + let pat = regex::RegexBuilder::new(&pattern) + .case_insensitive(match case { + CaseSensitivity::Sensitive => false, + CaseSensitivity::Insensitive => true, + }) .build() .map_err(|e| { CommandError::InvalidArg(format!( @@ -300,10 +405,11 @@ impl EntryMatcher for PatternMatcher { pattern, } = arg; + let opts = MatchModifiers::from_flags(modifiers)?; let matcher: Box = match pat_sel { - PatternSelectorType::Glob => Box::new(GlobMatcher::create(&pattern, modifiers)?), - PatternSelectorType::Literal => Box::new(LiteralMatcher::create(&pattern, modifiers)?), - PatternSelectorType::Regexp => Box::new(RegexMatcher::create(&pattern, modifiers)?), + PatternSelectorType::Glob => Box::new(GlobMatcher::create(pattern, opts)?), + PatternSelectorType::Literal => Box::new(LiteralMatcher::create(pattern, opts)?), + PatternSelectorType::Regexp 
=> Box::new(RegexMatcher::create(pattern, opts)?), }; Ok(Self { matcher, comp_sel }) diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index 5d3864a28..fbc9a1152 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -1,7 +1,8 @@ -use std::{borrow::Cow, collections::VecDeque, path::Path, slice, str}; +use std::{borrow::Cow, collections::VecDeque, ops, path::Path, slice, str}; use regex; +use super::matcher::{CaseSensitivity, SearchAnchoring}; use crate::{args::extract::*, CommandError}; pub trait NameTransformer { @@ -98,13 +99,55 @@ impl NameTransformer for AddPrefix { } } +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Multiplicity { + #[default] + Single, + All, +} + +impl Multiplicity { + pub const fn from_multiple_matches_flag(multiple_matches: bool) -> Self { + match multiple_matches { + true => Self::All, + false => Self::Single, + } + } +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ReplaceModifiers { + pub anchoring: SearchAnchoring, + pub case: CaseSensitivity, + pub multi: Multiplicity, +} + +impl ReplaceModifiers { + pub const fn from_flags(flags: PatternModifierFlags) -> Self { + let PatternModifierFlags { + case_insensitive, + multiple_matches, + prefix_anchored, + suffix_anchored, + } = flags; + let multi = Multiplicity::from_multiple_matches_flag(multiple_matches); + let case = CaseSensitivity::from_case_insensitive_flag(case_insensitive); + let anchoring = SearchAnchoring::from_prefix_suffix_flags(prefix_anchored, suffix_anchored); + Self { + anchoring, + case, + multi, + } + } +} + trait PatternTransformer { type Replacement where Self: Sized; fn create( - pattern: &str, - opts: PatternModifiers, + pattern: String, + opts: ReplaceModifiers, rep: Self::Replacement, ) -> Result where @@ -115,33 +158,58 @@ trait PatternTransformer { struct LiteralTransformer { lit: String, - case_insensitive: bool, - 
multiple_matches: bool, + case: CaseSensitivity, + anchoring: SearchAnchoring, + multi: Multiplicity, rep: String, } impl LiteralTransformer { - fn format_single_replacement( - input: &str, + fn format_single_replacement<'s>( + input: &'s str, lit_len: usize, rep: &str, match_index: usize, - ) -> String { - debug_assert!(lit_len > 0); - debug_assert!(input.len() > 0); - debug_assert!(rep.len() > 0); - format!( + ) -> Cow<'s, str> { + /* If the replacement is empty, we have the opportunity to return a borrowed Cow. */ + if rep.is_empty() { + /* Remove the prefix alone! */ + if match_index == 0 { + return Cow::Borrowed(&input[lit_len..]); + } + /* Remove the suffix alone! */ + if match_index == input.len() - lit_len { + return Cow::Borrowed(&input[..match_index]); + } + } + /* Otherwise, we allocate a new string. */ + Cow::Owned(format!( "{}{}{}", &input[..match_index], rep, &input[(match_index + lit_len)..] - ) + )) + } + + fn replace_single_anchored<'s>( + input: &'s str, + lit: &str, + rep: &str, + range: ops::Range, + case: CaseSensitivity, + ) -> Cow<'s, str> { + let sub = &input[range.clone()]; + if case.string_equal(lit, sub) { + Self::format_single_replacement(input, lit.len(), rep, range.start) + } else { + Cow::Borrowed(input) + } } fn replace_single_exact<'s>(input: &'s str, lit: &str, rep: &str) -> Cow<'s, str> { match input.find(lit) { None => Cow::Borrowed(input), - Some(i) => Cow::Owned(Self::format_single_replacement(input, lit.len(), rep, i)), + Some(i) => Self::format_single_replacement(input, lit.len(), rep, i), } } @@ -149,19 +217,22 @@ impl LiteralTransformer { /* NB: literal was already changed to uppercase upon construction in Self::create()! 
*/ match input.to_ascii_uppercase().find(&lit) { None => Cow::Borrowed(input), - Some(i) => Cow::Owned(Self::format_single_replacement(input, lit.len(), rep, i)), + Some(i) => Self::format_single_replacement(input, lit.len(), rep, i), } } - fn format_multiple_replacements( - input: &str, + fn format_multiple_replacements<'s>( + input: &'s str, lit_len: usize, rep: &str, match_indices: Vec, - ) -> String { - debug_assert!(lit_len > 0); - debug_assert!(input.len() > 0); - debug_assert!(rep.len() > 0); + ) -> Cow<'s, str> { + if match_indices.is_empty() { + return Cow::Borrowed(input); + } + if match_indices.len() == 1 { + return Self::format_single_replacement(input, lit_len, rep, match_indices[0]); + } let expected_len: usize = input.len() - (lit_len * match_indices.len()) + (rep.len() * match_indices.len()); let mut ret = String::with_capacity(expected_len); @@ -172,20 +243,12 @@ impl LiteralTransformer { last_source_position = i + lit_len; } assert_eq!(ret.len(), expected_len); - ret + Cow::Owned(ret) } fn replace_multiple_exact<'s>(input: &'s str, lit: &str, rep: &str) -> Cow<'s, str> { let match_indices: Vec = input.match_indices(lit).map(|(i, _)| i).collect(); - if match_indices.is_empty() { - return Cow::Borrowed(input); - } - Cow::Owned(Self::format_multiple_replacements( - input, - lit.len(), - rep, - match_indices, - )) + Self::format_multiple_replacements(input, lit.len(), rep, match_indices) } fn replace_multiple_icase<'s>(input: &'s str, lit: &str, rep: &str) -> Cow<'s, str> { @@ -195,60 +258,96 @@ impl LiteralTransformer { .match_indices(&lit) .map(|(i, _)| i) .collect(); - if match_indices.is_empty() { - return Cow::Borrowed(input); - } - Cow::Owned(Self::format_multiple_replacements( - input, - lit.len(), - rep, - match_indices, - )) + Self::format_multiple_replacements(input, lit.len(), rep, match_indices) } } impl PatternTransformer for LiteralTransformer { type Replacement = String where Self: Sized; fn create( - pattern: &str, - opts: 
PatternModifiers, + pattern: String, + opts: ReplaceModifiers, rep: Self::Replacement, ) -> Result where Self: Sized, { - let PatternModifiers { - case_insensitive, - multiple_matches, + let ReplaceModifiers { + case, + anchoring, + multi, } = opts; + + if matches!(multi, Multiplicity::All) && !matches!(anchoring, SearchAnchoring::Unanchored) { + return Err(CommandError::InvalidArg(format!( + "multimatch replacement with :g is not supported with anchoring flags :p or :s for literal transforms: {opts:?} {pattern:?}" + ))); + } + Ok(Self { - lit: match case_insensitive { - false => pattern.to_string(), - true => pattern.to_ascii_uppercase(), + lit: match case { + CaseSensitivity::Sensitive => pattern, + CaseSensitivity::Insensitive => pattern.to_ascii_uppercase(), }, - case_insensitive, - multiple_matches, + case, + anchoring, + multi, rep, }) } fn replace<'s>(&self, input: &'s str) -> Cow<'s, str> { /* Empty replacement or literal is allowed, it just does nothing. */ - if self.lit.is_empty() || self.rep.is_empty() || input.is_empty() { + if self.lit.is_empty() || input.is_empty() { + return Cow::Borrowed(input); + } + /* Can't match input longer than the literal. 
*/ + if self.lit.len() > input.len() { return Cow::Borrowed(input); } - match self.multiple_matches { - false => match self.case_insensitive { - /* Single replacement, case-sensitive (exact) match: */ - false => Self::replace_single_exact(input, &self.lit, &self.rep), - /* Single replacement, case-insensitive match: */ - true => Self::replace_single_icase(input, &self.lit, &self.rep), + + match self.multi { + Multiplicity::Single => match self.anchoring { + SearchAnchoring::DoublyAnchored => Self::replace_single_anchored( + input, + &self.lit, + &self.rep, + 0..input.len(), + self.case, + ), + SearchAnchoring::LeftAnchored => Self::replace_single_anchored( + input, + &self.lit, + &self.rep, + 0..self.lit.len(), + self.case, + ), + SearchAnchoring::RightAnchored => Self::replace_single_anchored( + input, + &self.lit, + &self.rep, + (input.len() - self.lit.len())..input.len(), + self.case, + ), + SearchAnchoring::Unanchored => match self.case { + CaseSensitivity::Sensitive => { + Self::replace_single_exact(input, &self.lit, &self.rep) + } + CaseSensitivity::Insensitive => { + Self::replace_single_icase(input, &self.lit, &self.rep) + } + }, }, - true => match self.case_insensitive { - /* Multiple replacements, case-sensitive (exact) match: */ - false => Self::replace_multiple_exact(input, &self.lit, &self.rep), - /* Multiple replacements, case-insensitive match: */ - true => Self::replace_multiple_icase(input, &self.lit, &self.rep), + Multiplicity::All => match self.anchoring { + SearchAnchoring::Unanchored => match self.case { + CaseSensitivity::Sensitive => { + Self::replace_multiple_exact(input, &self.lit, &self.rep) + } + CaseSensitivity::Insensitive => { + Self::replace_multiple_icase(input, &self.lit, &self.rep) + } + }, + _ => unreachable!("checked during construction"), }, } } @@ -256,43 +355,45 @@ impl PatternTransformer for LiteralTransformer { struct RegexpTransformer { pat: regex::Regex, - multiple_matches: bool, + multi: Multiplicity, rep: String, } impl 
PatternTransformer for RegexpTransformer { type Replacement = String where Self: Sized; fn create( - pattern: &str, - opts: PatternModifiers, + pattern: String, + opts: ReplaceModifiers, rep: Self::Replacement, ) -> Result where Self: Sized, { - let PatternModifiers { - case_insensitive, - multiple_matches, + let ReplaceModifiers { + case, + anchoring, + multi, } = opts; - let pat = regex::RegexBuilder::new(pattern) - .case_insensitive(case_insensitive) + let pattern = anchoring.wrap_regex_pattern(&pattern); + + let pat = regex::RegexBuilder::new(&pattern) + .case_insensitive(match case { + CaseSensitivity::Insensitive => true, + CaseSensitivity::Sensitive => false, + }) .build() .map_err(|e| { CommandError::InvalidArg(format!( "failed to construct regex replacer from search pattern {pattern:?}: {e}" )) })?; - Ok(Self { - pat, - multiple_matches, - rep, - }) + Ok(Self { pat, multi, rep }) } fn replace<'s>(&self, input: &'s str) -> Cow<'s, str> { - match self.multiple_matches { - false => self.pat.replace(input, &self.rep), - true => self.pat.replace_all(input, &self.rep), + match self.multi { + Multiplicity::Single => self.pat.replace(input, &self.rep), + Multiplicity::All => self.pat.replace_all(input, &self.rep), } } } @@ -468,20 +569,19 @@ impl NameTransformer for ComponentTransformer { replacement_spec, } = arg; + let opts = ReplaceModifiers::from_flags(modifiers); let pattern_trans: Box = match pat_sel { PatternSelectorType::Glob => { - unreachable!("glob patterns are not supported for name transformations") + return Err(CommandError::InvalidArg(format!( + "glob patterns are not supported for name transformations: {pattern:?}" + ))); + } + PatternSelectorType::Literal => { + Box::new(LiteralTransformer::create(pattern, opts, replacement_spec)?) + } + PatternSelectorType::Regexp => { + Box::new(RegexpTransformer::create(pattern, opts, replacement_spec)?) 
} - PatternSelectorType::Literal => Box::new(LiteralTransformer::create( - &pattern, - modifiers, - replacement_spec, - )?), - PatternSelectorType::Regexp => Box::new(RegexpTransformer::create( - &pattern, - modifiers, - replacement_spec, - )?), }; Ok(Self { @@ -513,9 +613,6 @@ impl CompiledTransformer { }, NameTransform::Complex(complex_trans) => match complex_trans { ComplexTransform::Transform(arg) => Box::new(ComponentTransformer::from_arg(arg)?), - ComplexTransform::RemovePrefix(remove_prefix_arg) => { - todo!("impl remove prefix: {:?}", remove_prefix_arg) - } }, }) } From b4914b7ad149b2650f19631ca8a103b2c1417d12 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:43:50 -0400 Subject: [PATCH 28/68] make glob and rx optional dependencies --- cli/Cargo.toml | 10 +++++++--- cli/src/args/extract.rs | 20 +++++++++++++++++--- cli/src/extract/matcher.rs | 33 +++++++++++++++++++++++++++++++-- cli/src/extract/transform.rs | 14 +++++++++++++- 4 files changed, 68 insertions(+), 9 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index fb880e115..135270248 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -24,9 +24,8 @@ members = ["."] name = "zip-cli" [dependencies] -# TODO: make these optional deps? -glob = "0.3" -regex = "1" +glob = { version = "0.3", optional = true } +regex = { version = "1", optional = true } [dependencies.zip] path = ".." 
@@ -47,6 +46,9 @@ time = ["zip/time"] xz = ["zip/xz"] zstd = ["zip/zstd"] +glob = ["dep:glob"] +rx = ["dep:regex"] + default = [ "aes-crypto", "bzip2", @@ -56,6 +58,8 @@ default = [ "time", "xz", "zstd", + "glob", + "rx", ] diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index 6f57ceee1..7973760cc 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -48,11 +48,19 @@ impl PatternSelectorType { } pub const fn default_for_match() -> Self { - Self::Glob + if cfg!(feature = "glob") { + Self::Glob + } else { + Self::Literal + } } pub const fn default_for_replacement() -> Self { - Self::Regexp + if cfg!(feature = "rx") { + Self::Regexp + } else { + Self::Literal + } } } @@ -1303,7 +1311,7 @@ entry itself. *Note:* when multiple entry specs are provided on the command line, a single entry may be matched more than once. In this case, the entry's content will be -extracted more than once over the execution of this command. +teed to all the specified outputs. -x, --extract[=] Decompress the entry's contents (if necessary) before writing it to @@ -1344,6 +1352,12 @@ pat-sel = glob [DEFAULT for matching] (interpret as a shell glob) *Note:* glob patterns are not supported for replacement, and attempting to use them with e.g '--transform:glob' will produce an error. +Also note that glob and regex patterns require building this binary with the +"glob" and "rx" cargo features respectively. Specifying ':glob' or ':rx' without +the requisite feature support will produce an error. If the requisite feature is +not provided, the default is to use literal matching, which is supported in +all cases. 
+ #### Pattern modifiers (pat-mod): pat-mod = :i (use case-insensitive matching for the given pattern) = :g (use multi-match behavior for string replacements) diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index 1d7a69e42..1fabf0220 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -1,6 +1,8 @@ use std::borrow::Cow; +#[cfg(feature = "glob")] use glob; +#[cfg(feature = "rx")] use regex; use zip::CompressionMethod; @@ -146,11 +148,13 @@ impl NameMatcher for LiteralMatcher { } } +#[cfg(feature = "glob")] struct GlobMatcher { pat: glob::Pattern, glob_opts: glob::MatchOptions, } +#[cfg(feature = "glob")] impl NameMatcher for GlobMatcher { fn create(pattern: String, opts: MatchModifiers) -> Result where @@ -182,10 +186,12 @@ impl NameMatcher for GlobMatcher { } } +#[cfg(feature = "rx")] struct RegexMatcher { pat: regex::Regex, } +#[cfg(feature = "rx")] impl NameMatcher for RegexMatcher { fn create(pattern: String, opts: MatchModifiers) -> Result where @@ -407,9 +413,32 @@ impl EntryMatcher for PatternMatcher { let opts = MatchModifiers::from_flags(modifiers)?; let matcher: Box = match pat_sel { - PatternSelectorType::Glob => Box::new(GlobMatcher::create(pattern, opts)?), + PatternSelectorType::Glob => { + #[cfg(feature = "glob")] + { + Box::new(GlobMatcher::create(pattern, opts)?) + } + #[cfg(not(feature = "glob"))] + { + return Err(CommandError::InvalidArg(format!( + "glob patterns were requested, but this binary was built without the \"glob\" feature: {pattern:?}" + ))); + } + } + PatternSelectorType::Literal => Box::new(LiteralMatcher::create(pattern, opts)?), - PatternSelectorType::Regexp => Box::new(RegexMatcher::create(pattern, opts)?), + PatternSelectorType::Regexp => { + #[cfg(feature = "rx")] + { + Box::new(RegexMatcher::create(pattern, opts)?) 
+ } + #[cfg(not(feature = "rx"))] + { + return Err(CommandError::InvalidArg(format!( + "regexp patterns were requested, but this binary was built without the \"rx\" feature: {pattern:?}" + ))); + } + } }; Ok(Self { matcher, comp_sel }) diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index fbc9a1152..dc36f57b0 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -1,5 +1,6 @@ use std::{borrow::Cow, collections::VecDeque, ops, path::Path, slice, str}; +#[cfg(feature = "rx")] use regex; use super::matcher::{CaseSensitivity, SearchAnchoring}; @@ -353,12 +354,14 @@ impl PatternTransformer for LiteralTransformer { } } +#[cfg(feature = "rx")] struct RegexpTransformer { pat: regex::Regex, multi: Multiplicity, rep: String, } +#[cfg(feature = "rx")] impl PatternTransformer for RegexpTransformer { type Replacement = String where Self: Sized; fn create( @@ -580,7 +583,16 @@ impl NameTransformer for ComponentTransformer { Box::new(LiteralTransformer::create(pattern, opts, replacement_spec)?) } PatternSelectorType::Regexp => { - Box::new(RegexpTransformer::create(pattern, opts, replacement_spec)?) + #[cfg(feature = "rx")] + { + Box::new(RegexpTransformer::create(pattern, opts, replacement_spec)?) 
+ } + #[cfg(not(feature = "rx"))] + { + return Err(CommandError::InvalidArg(format!( + "regexp patterns were requested, but this binary was built without the \"rx\" feature: {pattern:?}" + ))); + } } }; From 972d77a87cce0219aba959dbe69696bead685d26 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:10:24 -0400 Subject: [PATCH 29/68] add stub for info command --- cli/src/args.rs | 21 +++++-------------- cli/src/args/info.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++ cli/src/lib.rs | 2 +- 3 files changed, 56 insertions(+), 17 deletions(-) create mode 100644 cli/src/args/info.rs diff --git a/cli/src/args.rs b/cli/src/args.rs index c2cdd94f6..b59fd0cbd 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -50,7 +50,7 @@ impl ZipCli { Commands: {}{}{} - {} {} + {}{}{} {}{}{} Options: @@ -64,6 +64,7 @@ Options: compress::Compress::COMMAND_TABS, compress::Compress::COMMAND_DESCRIPTION, info::Info::COMMAND_NAME, + info::Info::COMMAND_TABS, info::Info::COMMAND_DESCRIPTION, extract::Extract::COMMAND_NAME, extract::Extract::COMMAND_TABS, @@ -135,7 +136,7 @@ For more information, try '--help'. .expect("exe name already written"); let (verbose, subcommand_name) = Self::parse_up_to_subcommand_name(&mut argv)?; let command = match subcommand_name { - SubcommandName::Info => ZipCommand::Info, + SubcommandName::Info => ZipCommand::Info(info::Info::parse_argv(argv)?), SubcommandName::Extract => ZipCommand::Extract(extract::Extract::parse_argv(argv)?), SubcommandName::Compress => ZipCommand::Compress(compress::Compress::parse_argv(argv)?), }; @@ -146,7 +147,7 @@ For more information, try '--help'. 
#[derive(Debug)] pub enum ZipCommand { Compress(compress::Compress), - Info, + Info(info::Info), Extract(extract::Extract), } @@ -203,17 +204,5 @@ error: {context} } pub mod compress; - -pub mod info { - #[derive(Debug)] - pub struct Info {} - - impl Info { - pub const COMMAND_NAME: &'static str = "info"; - pub const COMMAND_DESCRIPTION: &'static str = - "(TODO) Print info about archive contents and individual entries."; - pub const COMMAND_TABS: &'static str = "\t\t"; - } -} - pub mod extract; +pub mod info; diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs new file mode 100644 index 000000000..d227c58e0 --- /dev/null +++ b/cli/src/args/info.rs @@ -0,0 +1,50 @@ +use super::{ + extract::{InputSpec, MatchExpression}, + ArgParseError, CommandFormat, +}; + +use std::{collections::VecDeque, ffi::OsString}; + +#[derive(Debug)] +pub struct Info { + pub match_expr: Option, + pub input_spec: InputSpec, +} + +impl CommandFormat for Info { + const COMMAND_NAME: &'static str = "info"; + const COMMAND_TABS: &'static str = "\t\t"; + const COMMAND_DESCRIPTION: &'static str = + "Print info about archive contents and individual entries."; + + const USAGE_LINE: &'static str = + "[-h|--help] [FORMAT-SPEC] [--expr MATCH-EXPR --expr] [--stdin] [--] [ZIP-PATH]..."; + + fn generate_help() -> String { + r#" + -h, --help Print help + +... 
+"# + .to_string() + } + + fn parse_argv(mut argv: VecDeque) -> Result { + while let Some(arg) = argv.pop_front() { + match arg.as_encoded_bytes() { + b"-h" | b"--help" => { + let help_text = Self::generate_full_help_text(); + return Err(ArgParseError::StdoutMessage(help_text)); + } + _ => todo!(), + } + } + todo!() + } +} + +impl crate::driver::ExecuteCommand for Info { + fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { + todo!() + } +} diff --git a/cli/src/lib.rs b/cli/src/lib.rs index e3b526e61..246f2ab9c 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -160,7 +160,7 @@ pub mod driver { }; match command { - ZipCommand::Info => todo!("info command not implemented"), + ZipCommand::Info(info) => info.do_main(err), ZipCommand::Extract(extract) => extract.do_main(err), ZipCommand::Compress(compress) => compress.do_main(err), } From 5a1f812505614a0f19c5068d8e7374ccab18ae81 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 26 Aug 2024 22:56:28 -0400 Subject: [PATCH 30/68] parameterize the match help text to reuse for info --- cli/src/args/extract.rs | 327 ++++++++++++++++++++++------------------ cli/src/args/info.rs | 18 ++- 2 files changed, 198 insertions(+), 147 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index 7973760cc..c7e47b026 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -1064,6 +1064,186 @@ impl Extract { const XZ_HELP_LINE: &'static str = " - xz:\t\twith xz\n"; #[cfg(not(feature = "xz"))] const XZ_HELP_LINE: &'static str = ""; + + pub fn generate_match_expr_help_text() -> String { + format!( + r#" +## Match expressions (match-expr): + +Entry matching logic composes boolean arithmetic expressions ("expr") in terms +of basic "predicates" which test some component of the zip entry. Expressions +can be composed as follows, in order of precedence: + +expr = ( ) (grouping to force precedence) + = ! 
(negation) + = & (short-circuiting conjunction "and") + = (implicit &) + = | (disjunction "or") + = (evaluate on entry) + +### Operators: +The operators to compose match expressions must be quoted in shell commands +(e.g. as \( or '('), so alternatives are provided which do not require +special quoting: + +Grouping operators: + (, -open + ), -close + +Unary operators: + !, -not + +Binary operators: + |, -or + &, -and + +### Predicates (predicate): +These arguments are interpreted as basic predicates, returning true or false in +response to a specific zip entry. + +Trivial: +These results do not depend on the entry data at all: + + -true Always return true. + -false Always return false. + +If a match expression is not provided, it defaults to the behavior of -true. + +Basic: +These results are dependent on the entry data: + + -t, --type [file|dir|symlink] + Match entries of the given type. + Note that directory entries may have specific mode bits set, or they may just be + zero-length entries whose name ends in '/'. + + --compression-method + Match entries compressed with the given compression technique. + + Possible values: + - any: any compression method at all + - known: any compression method this binary is able to decompress + - stored: uncompressed + - deflated: with deflate +{}{}{}{}{} + Using e.g. '--compression-method known' as a match expression filters + entries to only those which can be successfully decompressed. + + --max-depth + Match entries with at *most* components of their + containing directory. + --min-depth + Match entries with at *least* components of their + containing directory. + + --max-size + Match entries of at *most* in *uncompressed* size. + --min-size + Match entries of at *least* in *uncompressed* size. + + Directory entries are 0 bytes in size, and symlink entries are the + size required to store their target. 
+ + TODO: Abbrevations such as 1k, 1M are not currently supported; the + precise byte number must be provided, parseable as a u64. + + -m, --match[=][:] + Return true for entries whose name matches . + + See section on "Selector syntax" for and for how + the string argument is interpreted into a string matching + predicate against the entry name. +"#, + Self::DEFLATE64_HELP_LINE, + Self::BZIP2_HELP_LINE, + Self::ZSTD_HELP_LINE, + Self::LZMA_HELP_LINE, + Self::XZ_HELP_LINE, + ) + } + + pub fn generate_pattern_selector_help_text(match_only: bool) -> String { + format!( + r#" +## Selector syntax: + +The string matching operations of {} expose an interface to +configure various pattern matching techniques on various components of the entry +name string. + +These flags default to interpreting a argument as a glob string to +match against the entire entry name, which can be explicitly requested as +follows: + + --match=path:glob + +The entire range of search options is described below: + +### Component selector (comp-sel): +comp-sel = path [DEFAULT] (match full entry) + = basename (match only the final component of entry) + = dirname (match all except final component of entry) + = ext (match only the file extension, if available) + +### Pattern selector (pat-sel): +pat-sel = glob [DEFAULT{}] (interpret as a shell glob) + = lit (interpret as literal string) + = rx {}(interpret as a regular expression) + = (apply search modifiers from ) + +{} + +Also note that glob and regex patterns require building this binary with the +"glob" and "rx" cargo features respectively. Specifying ':glob' or ':rx' without +the requisite feature support will produce an error. If the requisite feature is +not provided, the default is to use literal matching, which is supported in +all cases. 
+ +#### Pattern modifiers (pat-mod): +pat-mod = :i (use case-insensitive matching for the given pattern) +{} = :p (perform left-anchored "prefix" searches) + = :s (perform right-anchored "suffix" searches) + +Pattern modifiers from (pat-mod) can be sequenced, e.g. ':i:p'. If ':p' and ':s' +are provided together, the result is to perform a doubly-anchored match, against +the entire string. For regexp matching with ':rx', ':p' and ':s' are converted +to '^' or '$' anchors in the regexp pattern string. If the pattern string also +contains '^' or '$' as well, no error is produced. + +*Note:* not all pattern modifiers apply everywhere. In particular, {}':p' and ':s' are +incompatible with glob search and will produce an error. +"#, + if match_only { + "--match" + } else { + "--match and --transform" + }, + if match_only { "" } else { " for matching" }, + if match_only { + "" + } else { + "[DEFAULT for replacement] " + }, + if match_only { + "" + } else { + "*Note:* glob patterns are not supported for replacement, and attempting to use +them with e.g '--transform:glob' will produce an error." + }, + if match_only { + "" + } else { + " = :g (use multi-match behavior for string replacements)\n" + }, + if match_only { + "" + } else { + "':g' only +applies to string replacement, and using it for a match expression like +'--match:rx:g' will produce an error. Additionally, " + } + ) + } } impl CommandFormat for Extract { @@ -1174,92 +1354,7 @@ with the command line: *Note:* if a match-expr is provided, it *must* be surrounded with --expr arguments on both sides! This is a necessary constraint of the current command line parsing. - -## Match expressions (match-expr): - -Entry matching logic composes boolean arithmetic expressions ("expr") in terms -of basic "predicates" which test some component of the zip entry. Expressions -can be composed as follows, in order of precedence: - -expr = ( ) (grouping to force precedence) - = ! 
(negation) - = & (short-circuiting conjunction "and") - = (implicit &) - = | (disjunction "or") - = (evaluate on entry) - -### Operators: -The operators to compose match expressions must be quoted in shell commands -(e.g. as \( or '('), so alternatives are provided which do not require -special quoting: - -Grouping operators: - (, -open - ), -close - -Unary operators: - !, -not - -Binary operators: - |, -or - &, -and - -### Predicates (predicate): -These arguments are interpreted as basic predicates, returning true or false in -response to a specific zip entry. - -Trivial: -These results do not depend on the entry data at all: - - -true Always return true. - -false Always return false. - -If a match expression is not provided, it defaults to the behavior of -true. - -Basic: -These results are dependent on the entry data: - - -t, --type [file|dir|symlink] - Match entries of the given type. - Note that directory entries may have specific mode bits set, or they may just be - zero-length entries whose name ends in '/'. - - --compression-method - Match entries compressed with the given compression technique. - - Possible values: - - any: any compression method at all - - known: any compression method this binary is able to decompress - - stored: uncompressed - - deflated: with deflate -{}{}{}{}{} - Using e.g. '--compression-method known' as a match expression filters - entries to only those which can be successfully decompressed. - - --max-depth - Match entries with at *most* components of their - containing directory. - --min-depth - Match entries with at *least* components of their - containing directory. - - --max-size - Match entries of at *most* in *uncompressed* size. - --min-size - Match entries of at *least* in *uncompressed* size. - - Directory entries are 0 bytes in size, and symlink entries are the - size required to store their target. 
- - TODO: Abbrevations such as 1k, 1M are not currently supported; the - precise byte number must be provided, parseable as a u64. - - -m, --match[=][:] - Return true for entries whose name matches . - - See section on "Selector syntax" for and for how - the string argument is interpreted into a string matching - predicate against the entry name. +{} ## Name transforms (name-transform): @@ -1322,58 +1417,7 @@ Attempting to extract an entry using an unsupported compression method with -x/--extract will produce an error. In this case, --compression-method can be used to filter out such entries. - -## Selector syntax: - -The string matching operations of --match and --transform expose an interface to -configure various pattern matching techniques on various components of the entry -name string. - -These flags default to interpreting a argument as a glob string to -match against the entire entry name, which can be explicitly requested as -follows: - - --match=path:glob - -The entire range of search options is described below: - -### Component selector (comp-sel): -comp-sel = path [DEFAULT] (match full entry) - = basename (match only the final component of entry) - = dirname (match all except final component of entry) - = ext (match only the file extension, if available) - -### Pattern selector (pat-sel): -pat-sel = glob [DEFAULT for matching] (interpret as a shell glob) - = lit (interpret as literal string) - = rx [DEFAULT for replacement] (interpret as a regular expression) - = (apply search modifiers from ) - -*Note:* glob patterns are not supported for replacement, and attempting to use -them with e.g '--transform:glob' will produce an error. - -Also note that glob and regex patterns require building this binary with the -"glob" and "rx" cargo features respectively. Specifying ':glob' or ':rx' without -the requisite feature support will produce an error. 
If the requisite feature is -not provided, the default is to use literal matching, which is supported in -all cases. - -#### Pattern modifiers (pat-mod): -pat-mod = :i (use case-insensitive matching for the given pattern) - = :g (use multi-match behavior for string replacements) - = :p (perform left-anchored "prefix" searches) - = :s (perform right-anchored "suffix" searches) - -Pattern modifiers from (pat-mod) can be sequenced, e.g. ':i:g'. If ':p' and ':s' -are provided together, the result is to perform a doubly-anchored match, against -the entire string. For regexp matching with ':rx', ':p' and ':s' are converted -to '^' or '$' anchors in the regexp pattern string. If the pattern string also -contains '^' or '$' as well, no error is produced. - -*Note:* not all pattern modifiers apply everywhere. In particular, ':g' only -applies to string replacement, and using it for a match expression like -'--match:rx:g' will produce an error. Additionally, ':p' and ':s' are -incompatible with glob search and will produce an error. +{} # Input arguments: Zip file inputs to extract from can be specified by streaming from stdin, or as @@ -1399,11 +1443,8 @@ Positional paths: If --stdin is provided, it will be read in a streaming manner before reading entries from any positional zip paths. 
"#, - Self::DEFLATE64_HELP_LINE, - Self::BZIP2_HELP_LINE, - Self::ZSTD_HELP_LINE, - Self::LZMA_HELP_LINE, - Self::XZ_HELP_LINE, + Self::generate_match_expr_help_text(), + Self::generate_pattern_selector_help_text(false), ) } diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index d227c58e0..087ec0751 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -1,5 +1,5 @@ use super::{ - extract::{InputSpec, MatchExpression}, + extract::{Extract, InputSpec, MatchExpression}, ArgParseError, CommandFormat, }; @@ -21,12 +21,22 @@ impl CommandFormat for Info { "[-h|--help] [FORMAT-SPEC] [--expr MATCH-EXPR --expr] [--stdin] [--] [ZIP-PATH]..."; fn generate_help() -> String { - r#" + format!( + r#" -h, --help Print help ... -"# - .to_string() + +*Note:* if a match-expr is provided, it *must* be surrounded with --expr arguments on both sides! +This is a necessary constraint of the current command line parsing. + +{} + +{} +"#, + Extract::generate_match_expr_help_text(), + Extract::generate_pattern_selector_help_text(true), + ) } fn parse_argv(mut argv: VecDeque) -> Result { From 12542362f747b5fdcf7230344b65896c283ab3ad Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:08:09 -0400 Subject: [PATCH 31/68] stub out info format specs --- cli/src/args/extract.rs | 53 +++++++++-------- cli/src/args/info.rs | 126 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 150 insertions(+), 29 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index c7e47b026..b98b105a2 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -1244,6 +1244,32 @@ applies to string replacement, and using it for a match expression like } ) } + + pub const INPUT_HELP_TEXT: &'static str = r#" +# Input arguments: +Zip file inputs to extract from can be specified by streaming from stdin, or as +at least one path pointing to an existing zip file. 
Input arguments are always +specified after all output flags and entry specs on the command line. If no +positional argument is provided and --stdin is not present, an error will +be produced. + + --stdin + If this argument is provided, the streaming API will be used to read + entries as they are encountered, instead of filtering them beforehand + as is done with file inputs. This disables some optimizations, but + also avoids waiting for the entire input to buffer to start writing + output, so can be used in a streaming context. + +Positional paths: + ZIP-PATH... + Apply the entry specs to filter and rename entries to extract from all + of the provided zip files. At least one zip path must be provided, and + all provided paths must exist and point to an existing zip file. Pipes + are not supported and will produce an error. + + If --stdin is provided, it will be read in a streaming manner before + reading entries from any positional zip paths. +"#; } impl CommandFormat for Extract { @@ -1418,33 +1444,10 @@ Attempting to extract an entry using an unsupported compression method with used to filter out such entries. {} - -# Input arguments: -Zip file inputs to extract from can be specified by streaming from stdin, or as -at least one path pointing to an existing zip file. Input arguments are always -specified after all output flags and entry specs on the command line. If no -positional argument is provided and --stdin is not present, an error will -be produced. - - --stdin - If this argument is provided, the streaming API will be used to read - entries as they are encountered, instead of filtering them beforehand - as is done with file inputs. This disables some optimizations, but - also avoids waiting for the entire input to buffer to start writing - output, so can be used in a streaming context. - -Positional paths: - ZIP-PATH... - Apply the entry specs to filter and rename entries to extract from all - of the provided zip files. 
At least one zip path must be provided, and - all provided paths must exist and point to an existing zip file. Pipes - are not supported and will produce an error. - - If --stdin is provided, it will be read in a streaming manner before - reading entries from any positional zip paths. -"#, +{}"#, Self::generate_match_expr_help_text(), Self::generate_pattern_selector_help_text(false), + Self::INPUT_HELP_TEXT, ) } diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 087ec0751..f5d7ec236 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -5,8 +5,126 @@ use super::{ use std::{collections::VecDeque, ffi::OsString}; +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum ByteSizeFormat { + #[default] + FullDecimal, + HumanAbbreviated, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum OffsetFormat { + Decimal, + #[default] + Hexadecimal, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum BinaryStringFormat { + #[default] + PrintAsString, + WriteBinaryContents, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum ArchiveOverviewFormatDirective { + ArchiveName, + TotalSize(ByteSizeFormat), + NumEntries, + ArchiveComment(BinaryStringFormat), + FirstEntryStart(OffsetFormat), + CentralDirectoryStart(OffsetFormat), +} + +#[derive(Debug)] +pub enum ArchiveOverviewFormatComponent { + Directive(ArchiveOverviewFormatDirective), + Literal(String), +} + +#[derive(Debug)] +pub struct ArchiveOverviewFormatSpec { + components: Vec, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum UnixModeFormat { + #[default] + Octal, + Pretty, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum TimestampFormat { + UnixEpochMilliseconds, + DateOnly, + TimeOnly, + #[default] + DateAndTime, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, 
PartialOrd, Ord, Hash)] +pub enum CompressionMethodFormat { + Abbreviated, + #[default] + Full, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum BinaryNumericValueFormat { + Decimal, + #[default] + Hexadecimal, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum FileTypeFormat { + Abbreviated, + #[default] + Full, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum EntryFormatDirective { + Name, + FileType(FileTypeFormat), + Comment(BinaryStringFormat), + LocalHeaderStart(OffsetFormat), + ContentStart(OffsetFormat), + ContentEnd(OffsetFormat), + CompressedSize(ByteSizeFormat), + UncompressedSize(ByteSizeFormat), + UnixMode(UnixModeFormat), + CompressionMethod(CompressionMethodFormat), + CrcValue(BinaryNumericValueFormat), + Timestamp(TimestampFormat), +} + +#[derive(Debug)] +pub enum EntryFormatComponent { + Directive(EntryFormatDirective), + Literal(String), +} + +#[derive(Debug)] +pub struct EntryFormatSpec { + components: Vec, +} + +#[derive(Debug, Default)] +pub enum FormatSpec { + #[default] + Compact, + Extended, + Custom { + overview: ArchiveOverviewFormatSpec, + entry: EntryFormatSpec, + }, +} + #[derive(Debug)] pub struct Info { + pub format_spec: FormatSpec, pub match_expr: Option, pub input_spec: InputSpec, } @@ -25,17 +143,17 @@ impl CommandFormat for Info { r#" -h, --help Print help -... - -*Note:* if a match-expr is provided, it *must* be surrounded with --expr arguments on both sides! -This is a necessary constraint of the current command line parsing. +# Format specs: +??? 
{} +{} {} "#, Extract::generate_match_expr_help_text(), Extract::generate_pattern_selector_help_text(true), + Extract::INPUT_HELP_TEXT, ) } From fbc426471ef8de8c8698e89dec21cf5af5811b4d Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:49:32 -0400 Subject: [PATCH 32/68] write out some more help text --- cli/src/args/compress.rs | 3 +- cli/src/args/extract.rs | 2 +- cli/src/args/info.rs | 134 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 132 insertions(+), 7 deletions(-) diff --git a/cli/src/args/compress.rs b/cli/src/args/compress.rs index 1c07b1e45..01afa648a 100644 --- a/cli/src/args/compress.rs +++ b/cli/src/args/compress.rs @@ -73,7 +73,8 @@ impl Compress { impl CommandFormat for Compress { const COMMAND_NAME: &'static str = "compress"; const COMMAND_TABS: &'static str = "\t"; - const COMMAND_DESCRIPTION: &'static str = "Generate a zip archive from files, directories, and symlinks provided as arguments or read from filesystem paths."; + const COMMAND_DESCRIPTION: &'static str = + "Generate an archive from data in argument strings or read from the filesystem."; const USAGE_LINE: &'static str = "[-h|--help] [OUTPUT-FLAGS] [ENTRY]... [--] [PATH]..."; diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index b98b105a2..8b179fde4 100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -1276,7 +1276,7 @@ impl CommandFormat for Extract { const COMMAND_NAME: &'static str = "extract"; const COMMAND_TABS: &'static str = "\t"; const COMMAND_DESCRIPTION: &'static str = - "Extract individual entries or an entire archive into a stream or the filesystem."; + "Decompress and transform matching entries into a stream or directory."; const USAGE_LINE: &'static str = "[-h|--help] [OUTPUT-SPEC]... [ENTRY-SPEC]... 
[--stdin] [--] [ZIP-PATH]..."; diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index f5d7ec236..c9c5e0cca 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -3,7 +3,7 @@ use super::{ ArgParseError, CommandFormat, }; -use std::{collections::VecDeque, ffi::OsString}; +use std::{collections::VecDeque, ffi::OsString, path::PathBuf}; #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ByteSizeFormat { @@ -122,6 +122,15 @@ pub enum FormatSpec { }, } +impl FormatSpec { + pub fn parse_format_strings( + archive_format: String, + entry_format: String, + ) -> Result { + todo!() + } +} + #[derive(Debug)] pub struct Info { pub format_spec: FormatSpec, @@ -136,21 +145,44 @@ impl CommandFormat for Info { "Print info about archive contents and individual entries."; const USAGE_LINE: &'static str = - "[-h|--help] [FORMAT-SPEC] [--expr MATCH-EXPR --expr] [--stdin] [--] [ZIP-PATH]..."; + "[-h|--help] [--extended|--format ] [--expr MATCH-EXPR --expr] [--stdin] [--] [ZIP-PATH]..."; fn generate_help() -> String { format!( r#" -h, --help Print help +By default, a compact representation of the metadata within the top-level +archive and individual entries is printed to stdout. This format, along with the +"extended" format from --extended, is not stable for processing by external +tools. For stable output, a custom format string should be provided with +--format. + +Note that the contents of individual entries are not accessible with this +command, and should instead be extracted with the '{}' subcommand, which can +write entries to stdout or a given file path as well as extracted into an +output directory. + + --extended + Print a verbose description of all top-level archive and individual + entry fields. + + --format + Print a custom description of the top-level archive and individual + entry metadata. + + Both format specs must be provided, but empty strings are + accepted. 
Explicit trailing newlines must be specified and will not be + inserted automatically. + # Format specs: -??? {} {} {} "#, + Extract::COMMAND_NAME, Extract::generate_match_expr_help_text(), Extract::generate_pattern_selector_help_text(true), Extract::INPUT_HELP_TEXT, @@ -158,16 +190,108 @@ impl CommandFormat for Info { } fn parse_argv(mut argv: VecDeque) -> Result { + let mut format_spec: Option = None; + let mut match_expr: Option = None; + let mut stdin_flag = false; + let mut positional_zips: Vec = Vec::new(); + while let Some(arg) = argv.pop_front() { match arg.as_encoded_bytes() { b"-h" | b"--help" => { let help_text = Self::generate_full_help_text(); return Err(ArgParseError::StdoutMessage(help_text)); } - _ => todo!(), + + /* Try parsing format specs. */ + b"--extended" => { + if let Some(prev_spec) = format_spec.take() { + return Err(Self::exit_arg_invalid(&format!( + "format spec already provided before --extended: {prev_spec:?}" + ))); + } + format_spec = Some(FormatSpec::Extended); + } + b"--format" => { + if let Some(prev_spec) = format_spec.take() { + return Err(Self::exit_arg_invalid(&format!( + "format spec already provided before --format: {prev_spec:?}" + ))); + } + let archive_format = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid("no arg provided to --format") + })? + .into_string() + .map_err(|fmt_arg| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided to --format: {fmt_arg:?}" + )) + })?; + let entry_format = argv + .pop_front() + .ok_or_else(|| { + Self::exit_arg_invalid("no arg provided to --format") + })? + .into_string() + .map_err(|fmt_arg| { + Self::exit_arg_invalid(&format!( + "invalid unicode provided to --format: {fmt_arg:?}" + )) + })?; + format_spec = Some(FormatSpec::parse_format_strings( + archive_format, + entry_format, + )?); + } + + /* Try parsing match specs! 
*/ + b"--expr" => { + let new_expr = MatchExpression::parse_argv(&mut argv)?; + if let Some(prev_expr) = match_expr.take() { + return Err(Self::exit_arg_invalid(&format!( + "multiple match expressions provided: {prev_expr:?} and {new_expr:?}" + ))); + } + match_expr = Some(new_expr); + } + + /* Transition to input args */ + b"--stdin" => { + stdin_flag = true; + } + b"--" => break, + arg_bytes => { + if arg_bytes.starts_with(b"-") { + return Err(Self::exit_arg_invalid(&format!( + "unrecognized flag {arg:?}" + ))); + } else { + argv.push_front(arg); + break; + } + } } } - todo!() + + positional_zips.extend(argv.into_iter().map(|arg| arg.into())); + if !stdin_flag && positional_zips.is_empty() { + return Err(Self::exit_arg_invalid( + "no zip input files were provided, and --stdin was not provided", + )); + }; + let input_spec = InputSpec { + stdin_stream: stdin_flag, + zip_paths: positional_zips, + }; + + let format_spec = format_spec.unwrap_or_default(); + + Ok(Self { + format_spec, + match_expr, + input_spec, + }) } } From d5b14d7d962374245b9294e973d1cf4159e362cb Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:02:15 -0400 Subject: [PATCH 33/68] remove unnecessary unsafe --- cli/src/extract/transform.rs | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index dc36f57b0..2ae3ade27 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -514,7 +514,7 @@ impl SubstringAnchoring { Cow::Borrowed(left_sub) => match Self::analyze(selected_left, left_sub) { Self::RetainsBothAnchors => Cow::Borrowed(input), Self::RetainsRightAnchor => { - Cow::Borrowed(Self::join_adjacent_strings(left_sub, right)) + Cow::Borrowed(Self::join_adjacent_strings(input, left_sub, right)) } _ => Cow::Owned(format!("{}{}", left_sub, right)), }, @@ -530,7 +530,7 @@ impl SubstringAnchoring { 
Cow::Borrowed(right_sub) => match Self::analyze(selected_right, right_sub) { Self::RetainsBothAnchors => Cow::Borrowed(input), Self::RetainsLeftAnchor => { - Cow::Borrowed(Self::join_adjacent_strings(left, right_sub)) + Cow::Borrowed(Self::join_adjacent_strings(input, left, right_sub)) } _ => Cow::Owned(format!("{}{}", left, right_sub)), }, @@ -539,18 +539,19 @@ impl SubstringAnchoring { } #[inline(always)] - fn join_adjacent_strings<'s>(left: &'s str, right: &'s str) -> &'s str { - assert!(left.len() + right.len() <= isize::MAX as usize); + fn join_adjacent_strings<'s, 't>(parent: &'s str, left: &'t str, right: &'t str) -> &'s str + where + 't: 's, + { + let parent_range = parent.as_bytes().as_ptr_range(); let left = left.as_bytes().as_ptr_range(); + debug_assert!(left.start >= parent_range.start && left.end <= parent_range.end); let right = right.as_bytes().as_ptr_range(); - assert_eq!(left.end, right.start); - let start: *const u8 = left.start; - let end: *const u8 = right.end; - unsafe { - let len: usize = end.offset_from(start) as usize; - let joined_slice = slice::from_raw_parts(start, len); - str::from_utf8_unchecked(joined_slice) - } + debug_assert!(right.start >= parent_range.start && right.end <= parent_range.end); + debug_assert_eq!(left.end, right.start); + let start_offset = (left.start as usize) - (parent_range.start as usize); + let end_offset = (parent_range.end as usize) - (right.end as usize); + &parent[start_offset..(parent.len() - end_offset)] } } From 3502d4fa3542985d553bad701d390e94aa5cf800 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:22:59 -0400 Subject: [PATCH 34/68] parse archive overview format strings --- cli/src/args/extract.rs | 83 ++++++++---------- cli/src/args/info.rs | 183 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 216 insertions(+), 50 deletions(-) diff --git a/cli/src/args/extract.rs b/cli/src/args/extract.rs index 8b179fde4..1a580ad54 
100644 --- a/cli/src/args/extract.rs +++ b/cli/src/args/extract.rs @@ -518,7 +518,9 @@ pub enum MatchExpression { } impl MatchExpression { - pub fn parse_argv(argv: &mut VecDeque) -> Result { + pub fn parse_argv( + argv: &mut VecDeque, + ) -> Result { let mut expr_stack: Vec = Vec::new(); let mut top_exprs = SingleExprLevel::default(); @@ -536,14 +538,12 @@ impl MatchExpression { ))); } b"-t" | b"--type" => { - let type_arg = argv.pop_front().ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for -t/--type") - })?; + let type_arg = argv + .pop_front() + .ok_or_else(|| C::exit_arg_invalid("no argument provided for -t/--type"))?; let entry_type = EntryType::parse(type_arg.as_encoded_bytes()).ok_or_else(|| { - Extract::exit_arg_invalid(&format!( - "invalid --type argument: {type_arg:?}" - )) + C::exit_arg_invalid(&format!("invalid --type argument: {type_arg:?}")) })?; top_exprs.push_arg(ExprArg::PrimitivePredicate(Predicate::EntryType( entry_type, @@ -551,11 +551,11 @@ impl MatchExpression { } b"--compression-method" => { let method_arg = argv.pop_front().ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --compression-method") + C::exit_arg_invalid("no argument provided for --compression-method") })?; let method = CompressionMethodArg::parse(method_arg.as_encoded_bytes()) .ok_or_else(|| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "invalid --compression-method argument: {method_arg:?}" )) })?; @@ -566,18 +566,16 @@ impl MatchExpression { b"--max-depth" => { let max_depth: u8 = argv .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --max-depth") - })? + .ok_or_else(|| C::exit_arg_invalid("no argument provided for --max-depth"))? .into_string() .map_err(|depth_arg| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "invalid unicode provided for --max-depth: {depth_arg:?}" )) })? 
.parse::() .map_err(|e| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "failed to parse --max-depth arg as u8: {e:?}" )) })?; @@ -588,18 +586,16 @@ impl MatchExpression { b"--min-depth" => { let min_depth: u8 = argv .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --min-depth") - })? + .ok_or_else(|| C::exit_arg_invalid("no argument provided for --min-depth"))? .into_string() .map_err(|depth_arg| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "invalid unicode provided for --min-depth: {depth_arg:?}" )) })? .parse::() .map_err(|e| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "failed to parse --min-depth arg as u8: {e:?}" )) })?; @@ -610,18 +606,16 @@ impl MatchExpression { b"--max-size" => { let max_size: u64 = argv .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --max-size") - })? + .ok_or_else(|| C::exit_arg_invalid("no argument provided for --max-size"))? .into_string() .map_err(|size_arg| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "invalid unicode provided for --max-size: {size_arg:?}" )) })? .parse::() .map_err(|e| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "failed to parse --max-size arg as u64: {e:?}" )) })?; @@ -632,18 +626,16 @@ impl MatchExpression { b"--min-size" => { let min_size: u64 = argv .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --min-size") - })? + .ok_or_else(|| C::exit_arg_invalid("no argument provided for --min-size"))? .into_string() .map_err(|size_arg| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "invalid unicode provided for --min-size: {size_arg:?}" )) })? 
.parse::() .map_err(|e| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "failed to parse --min-size arg as u64: {e:?}" )) })?; @@ -654,10 +646,10 @@ impl MatchExpression { b"-m" => { let pattern: String = argv .pop_front() - .ok_or_else(|| Extract::exit_arg_invalid("no argument provided for -m"))? + .ok_or_else(|| C::exit_arg_invalid("no argument provided for -m"))? .into_string() .map_err(|pattern| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "invalid unicode provided for -m: {pattern:?}" )) })?; @@ -670,22 +662,19 @@ impl MatchExpression { }))); } arg_bytes if arg_bytes.starts_with(b"--match") => { - let (comp_sel, pat_sel) = - parse_comp_and_pat_sel(arg_bytes, PatternContext::Match).ok_or_else( - || { - Extract::exit_arg_invalid(&format!( - "invalid --match argument modifiers: {arg:?}" - )) - }, - )?; + let (comp_sel, pat_sel) = parse_comp_and_pat_sel( + arg_bytes, + PatternContext::Match, + ) + .ok_or_else(|| { + C::exit_arg_invalid(&format!("invalid --match argument modifiers: {arg:?}")) + })?; let pattern: String = argv .pop_front() - .ok_or_else(|| { - Extract::exit_arg_invalid("no argument provided for --match") - })? + .ok_or_else(|| C::exit_arg_invalid("no argument provided for --match"))? .into_string() .map_err(|pattern| { - Extract::exit_arg_invalid(&format!( + C::exit_arg_invalid(&format!( "invalid unicode provided for --match: {pattern:?}" )) })?; @@ -714,7 +703,7 @@ impl MatchExpression { b")" | b"-close" => { /* Get the unevaluated exprs from the previous nesting level. */ let prev_level = expr_stack.pop().ok_or_else(|| { - Extract::exit_arg_invalid("too many close parens inside match expr") + C::exit_arg_invalid("too many close parens inside match expr") })?; /* Move the previous nesting level into current, and evaluate the current * nesting level. 
*/ @@ -730,7 +719,7 @@ impl MatchExpression { break; } _ => { - return Err(Extract::exit_arg_invalid(&format!( + return Err(C::exit_arg_invalid(&format!( "unrecognized match expression component {arg:?}: all match expressions must start and end with a --expr flag" ))); } @@ -738,7 +727,7 @@ impl MatchExpression { } if !expr_stack.is_empty() { - return Err(Extract::exit_arg_invalid( + return Err(C::exit_arg_invalid( "not enough close parens inside match expr", )); } @@ -1580,7 +1569,7 @@ used to filter out such entries. /* Try parsing match specs! */ b"--expr" => { - let match_expr = MatchExpression::parse_argv(&mut argv)?; + let match_expr = MatchExpression::parse_argv::(&mut argv)?; args.push(ExtractArg::Match(match_expr)); } diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index c9c5e0cca..3c606ff4e 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -3,7 +3,54 @@ use super::{ ArgParseError, CommandFormat, }; -use std::{collections::VecDeque, ffi::OsString, path::PathBuf}; +use std::{collections::VecDeque, ffi::OsString, fmt, path::PathBuf}; + +#[derive(Debug)] +struct ModifierParseError(pub String); + +impl fmt::Display for ModifierParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", &self.0) + } +} + +#[derive(Debug)] +enum DirectiveParseError { + Modifier(String, ModifierParseError), + Unrecognized(String), +} + +impl fmt::Display for DirectiveParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Modifier(d, e) => { + write!(f, "unrecognized modifier in directive {d:?}: {e}") + } + Self::Unrecognized(d) => { + write!(f, "unrecognized directive: {d:?}") + } + } + } +} + +#[derive(Debug)] +enum FormatParseError { + Directive(DirectiveParseError), + Search(String), +} + +impl fmt::Display for FormatParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Directive(e) => { + write!(f, "{e}") + } + Self::Search(e) => { + write!(f, "error 
in parsing logic: {e}") + } + } + } +} #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ByteSizeFormat { @@ -12,6 +59,19 @@ pub enum ByteSizeFormat { HumanAbbreviated, } +impl ByteSizeFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), + ":decimal" => Ok(Self::FullDecimal), + ":human" => Ok(Self::HumanAbbreviated), + _ => Err(ModifierParseError(format!( + "unrecognized byte size format: {s:?}" + ))), + } + } +} + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum OffsetFormat { Decimal, @@ -19,13 +79,41 @@ pub enum OffsetFormat { Hexadecimal, } +impl OffsetFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), + ":decimal" => Ok(Self::Decimal), + ":hex" => Ok(Self::Hexadecimal), + _ => Err(ModifierParseError(format!( + "unrecognized offset format: {s:?}" + ))), + } + } +} + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum BinaryStringFormat { #[default] PrintAsString, + EscapeBinary, WriteBinaryContents, } +impl BinaryStringFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), + ":print" => Ok(Self::PrintAsString), + ":escape" => Ok(Self::EscapeBinary), + ":write" => Ok(Self::WriteBinaryContents), + _ => Err(ModifierParseError(format!( + "unrecognized string format: {s:?}" + ))), + } + } +} + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ArchiveOverviewFormatDirective { ArchiveName, @@ -36,9 +124,40 @@ pub enum ArchiveOverviewFormatDirective { CentralDirectoryStart(OffsetFormat), } +impl ArchiveOverviewFormatDirective { + pub fn parse(s: &str) -> Result { + match s { + "name" => Ok(Self::ArchiveName), + s if s.starts_with("size") => { + let size_fmt = ByteSizeFormat::parse(&s["size".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::TotalSize(size_fmt)) + } + "num" => Ok(Self::NumEntries), + s if 
s.starts_with("comment") => { + let str_fmt = BinaryStringFormat::parse(&s["comment".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::ArchiveComment(str_fmt)) + } + s if s.starts_with("offset") => { + let offset_fmt = OffsetFormat::parse(&s["offset".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::FirstEntryStart(offset_fmt)) + } + s if s.starts_with("cde-offset") => { + let offset_fmt = OffsetFormat::parse(&s["cde-offset".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::CentralDirectoryStart(offset_fmt)) + } + _ => Err(DirectiveParseError::Unrecognized(s.to_string())), + } + } +} + #[derive(Debug)] pub enum ArchiveOverviewFormatComponent { Directive(ArchiveOverviewFormatDirective), + Escaped(char), Literal(String), } @@ -47,6 +166,51 @@ pub struct ArchiveOverviewFormatSpec { components: Vec, } +impl ArchiveOverviewFormatSpec { + pub fn parse(s: &str) -> Result { + let mut components: Vec = Vec::new(); + let mut last_source_position: usize = 0; + while let Some(pcnt_pos) = s[last_source_position..] + .find('%') + .map(|p| p + last_source_position) + { + /* Anything in between directives is a literal string. */ + if pcnt_pos > last_source_position { + components.push(ArchiveOverviewFormatComponent::Literal( + s[last_source_position..pcnt_pos].to_string(), + )); + last_source_position = pcnt_pos; + } + let next_pcnt = s[(pcnt_pos + 1)..] + .find('%') + .map(|p| p + pcnt_pos + 1) + .ok_or_else(|| { + FormatParseError::Search("% directive opened but not closed".to_string()) + })?; + let directive_contents = &s[pcnt_pos..=next_pcnt]; + match directive_contents { + /* An empty directive is a literal percent. */ + "%%" => { + components.push(ArchiveOverviewFormatComponent::Escaped('%')); + } + /* Otherwise, parse the space between percents. 
*/ + d => { + let directive = ArchiveOverviewFormatDirective::parse(&d[1..(d.len() - 1)]) + .map_err(FormatParseError::Directive)?; + components.push(ArchiveOverviewFormatComponent::Directive(directive)); + } + } + last_source_position += directive_contents.len(); + } + if s.len() > last_source_position { + components.push(ArchiveOverviewFormatComponent::Literal( + s[last_source_position..].to_string(), + )); + } + Ok(Self { components }) + } +} + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum UnixModeFormat { #[default] @@ -111,6 +275,12 @@ pub struct EntryFormatSpec { components: Vec, } +impl EntryFormatSpec { + pub fn parse(s: &str) -> Result { + todo!() + } +} + #[derive(Debug, Default)] pub enum FormatSpec { #[default] @@ -127,7 +297,14 @@ impl FormatSpec { archive_format: String, entry_format: String, ) -> Result { - todo!() + let overview = ArchiveOverviewFormatSpec::parse(&archive_format).map_err(|e| { + Info::exit_arg_invalid(&format!( + "failed to parse archive format string {archive_format:?}: {e}" + )) + })?; + dbg!(&overview); + let entry = EntryFormatSpec::parse(&entry_format)?; + Ok(Self::Custom { overview, entry }) } } @@ -247,7 +424,7 @@ output directory. /* Try parsing match specs! 
*/ b"--expr" => { - let new_expr = MatchExpression::parse_argv(&mut argv)?; + let new_expr = MatchExpression::parse_argv::(&mut argv)?; if let Some(prev_expr) = match_expr.take() { return Err(Self::exit_arg_invalid(&format!( "multiple match expressions provided: {prev_expr:?} and {new_expr:?}" From 4ca7ca1bae64a507a7aeba4f5cc226a173809034 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:36:44 -0400 Subject: [PATCH 35/68] make a trait for format parsing --- cli/src/args/info.rs | 70 ++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 3c606ff4e..1a2ca0f17 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -154,21 +154,15 @@ impl ArchiveOverviewFormatDirective { } } -#[derive(Debug)] -pub enum ArchiveOverviewFormatComponent { - Directive(ArchiveOverviewFormatDirective), - Escaped(char), - Literal(String), -} - -#[derive(Debug)] -pub struct ArchiveOverviewFormatSpec { - components: Vec, -} - -impl ArchiveOverviewFormatSpec { - pub fn parse(s: &str) -> Result { - let mut components: Vec = Vec::new(); +trait ParseableFormat: Sized { + type Component: Sized; + const ESCAPED: Self::Component; + fn make_literal(s: &str) -> Self::Component; + fn parse_directive(s: &str) -> Result; + fn from_components(components: Vec) -> Self; + + fn parse_format(s: &str) -> Result { + let mut components: Vec = Vec::new(); let mut last_source_position: usize = 0; while let Some(pcnt_pos) = s[last_source_position..] .find('%') @@ -176,9 +170,7 @@ impl ArchiveOverviewFormatSpec { { /* Anything in between directives is a literal string. 
*/ if pcnt_pos > last_source_position { - components.push(ArchiveOverviewFormatComponent::Literal( - s[last_source_position..pcnt_pos].to_string(), - )); + components.push(Self::make_literal(&s[last_source_position..pcnt_pos])); last_source_position = pcnt_pos; } let next_pcnt = s[(pcnt_pos + 1)..] @@ -191,23 +183,49 @@ impl ArchiveOverviewFormatSpec { match directive_contents { /* An empty directive is a literal percent. */ "%%" => { - components.push(ArchiveOverviewFormatComponent::Escaped('%')); + components.push(Self::ESCAPED); } /* Otherwise, parse the space between percents. */ d => { - let directive = ArchiveOverviewFormatDirective::parse(&d[1..(d.len() - 1)]) + let directive = Self::parse_directive(&d[1..(d.len() - 1)]) .map_err(FormatParseError::Directive)?; - components.push(ArchiveOverviewFormatComponent::Directive(directive)); + components.push(directive); } } last_source_position += directive_contents.len(); } if s.len() > last_source_position { - components.push(ArchiveOverviewFormatComponent::Literal( - s[last_source_position..].to_string(), - )); + components.push(Self::make_literal(&s[last_source_position..])); } - Ok(Self { components }) + Ok(Self::from_components(components)) + } +} + +#[derive(Debug)] +pub enum ArchiveOverviewFormatComponent { + Directive(ArchiveOverviewFormatDirective), + EscapedPercent, + Literal(String), +} + +#[derive(Debug)] +pub struct ArchiveOverviewFormatSpec { + pub components: Vec, +} + +impl ParseableFormat for ArchiveOverviewFormatSpec { + type Component = ArchiveOverviewFormatComponent; + const ESCAPED: Self::Component = ArchiveOverviewFormatComponent::EscapedPercent; + fn make_literal(s: &str) -> Self::Component { + ArchiveOverviewFormatComponent::Literal(s.to_string()) + } + fn parse_directive(s: &str) -> Result { + Ok(ArchiveOverviewFormatComponent::Directive( + ArchiveOverviewFormatDirective::parse(s)?, + )) + } + fn from_components(components: Vec) -> Self { + Self { components } } } @@ -297,7 +315,7 @@ impl 
FormatSpec { archive_format: String, entry_format: String, ) -> Result { - let overview = ArchiveOverviewFormatSpec::parse(&archive_format).map_err(|e| { + let overview = ArchiveOverviewFormatSpec::parse_format(&archive_format).map_err(|e| { Info::exit_arg_invalid(&format!( "failed to parse archive format string {archive_format:?}: {e}" )) From f9cad05ae0b6ec0efd39c4c04f7a1a3c7c5d21f8 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:58:13 -0400 Subject: [PATCH 36/68] parse entry format --- cli/src/args/info.rs | 157 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 151 insertions(+), 6 deletions(-) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 1a2ca0f17..083d0ab6a 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -236,6 +236,19 @@ pub enum UnixModeFormat { Pretty, } +impl UnixModeFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), + ":octal" => Ok(Self::Octal), + ":pretty" => Ok(Self::Pretty), + _ => Err(ModifierParseError(format!( + "unrecognized unix mode format: {s:?}" + ))), + } + } +} + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum TimestampFormat { UnixEpochMilliseconds, @@ -245,6 +258,21 @@ pub enum TimestampFormat { DateAndTime, } +impl TimestampFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), + ":epoch" => Ok(Self::UnixEpochMilliseconds), + ":date" => Ok(Self::DateOnly), + ":time" => Ok(Self::TimeOnly), + ":date-time" => Ok(Self::DateAndTime), + _ => Err(ModifierParseError(format!( + "unrecognized timestamp format: {s:?}" + ))), + } + } +} + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum CompressionMethodFormat { Abbreviated, @@ -252,6 +280,19 @@ pub enum CompressionMethodFormat { Full, } +impl CompressionMethodFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), 
+ ":abbrev" => Ok(Self::Abbreviated), + ":full" => Ok(Self::Full), + _ => Err(ModifierParseError(format!( + "unrecognized compression method format: {s:?}" + ))), + } + } +} + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum BinaryNumericValueFormat { Decimal, @@ -259,6 +300,19 @@ pub enum BinaryNumericValueFormat { Hexadecimal, } +impl BinaryNumericValueFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), + ":decimal" => Ok(Self::Decimal), + ":hex" => Ok(Self::Hexadecimal), + _ => Err(ModifierParseError(format!( + "unrecognized binary numeric value format: {s:?}" + ))), + } + } +} + #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum FileTypeFormat { Abbreviated, @@ -266,6 +320,19 @@ pub enum FileTypeFormat { Full, } +impl FileTypeFormat { + pub fn parse(s: &str) -> Result { + match s { + "" => Ok(Self::default()), + ":abbrev" => Ok(Self::Abbreviated), + ":full" => Ok(Self::Full), + _ => Err(ModifierParseError(format!( + "unrecognized file type format: {s:?}" + ))), + } + } +} + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum EntryFormatDirective { Name, @@ -282,20 +349,95 @@ pub enum EntryFormatDirective { Timestamp(TimestampFormat), } +impl EntryFormatDirective { + pub fn parse(s: &str) -> Result { + match s { + "name" => Ok(Self::Name), + s if s.starts_with("type") => { + let type_fmt = FileTypeFormat::parse(&s["type".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::FileType(type_fmt)) + } + s if s.starts_with("comment") => { + let str_fmt = BinaryStringFormat::parse(&s["comment".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::Comment(str_fmt)) + } + s if s.starts_with("header-start") => { + let offset_fmt = OffsetFormat::parse(&s["header-start".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + 
Ok(Self::LocalHeaderStart(offset_fmt)) + } + s if s.starts_with("content-start") => { + let offset_fmt = OffsetFormat::parse(&s["content-start".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::ContentStart(offset_fmt)) + } + s if s.starts_with("content-end") => { + let offset_fmt = OffsetFormat::parse(&s["content-end".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::ContentEnd(offset_fmt)) + } + s if s.starts_with("compressed-size") => { + let size_fmt = ByteSizeFormat::parse(&s["compressed-size".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::CompressedSize(size_fmt)) + } + s if s.starts_with("uncompressed-size") => { + let size_fmt = ByteSizeFormat::parse(&s["uncompressed-size".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::UncompressedSize(size_fmt)) + } + s if s.starts_with("unix-mode") => { + let mode_fmt = UnixModeFormat::parse(&s["unix-mode".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::UnixMode(mode_fmt)) + } + s if s.starts_with("compression-method") => { + let method_fmt = CompressionMethodFormat::parse(&s["compression-method".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::CompressionMethod(method_fmt)) + } + s if s.starts_with("crc") => { + let num_fmt = BinaryNumericValueFormat::parse(&s["crc".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::CrcValue(num_fmt)) + } + s if s.starts_with("timestamp") => { + let ts_fmt = TimestampFormat::parse(&s["timestamp".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::Timestamp(ts_fmt)) + } + _ => Err(DirectiveParseError::Unrecognized(s.to_string())), + } + } +} + #[derive(Debug)] pub enum EntryFormatComponent { Directive(EntryFormatDirective), + EscapedPercent, Literal(String), } #[derive(Debug)] pub struct 
EntryFormatSpec { - components: Vec, + pub components: Vec, } -impl EntryFormatSpec { - pub fn parse(s: &str) -> Result { - todo!() +impl ParseableFormat for EntryFormatSpec { + type Component = EntryFormatComponent; + const ESCAPED: Self::Component = EntryFormatComponent::EscapedPercent; + fn make_literal(s: &str) -> Self::Component { + EntryFormatComponent::Literal(s.to_string()) + } + fn parse_directive(s: &str) -> Result { + Ok(EntryFormatComponent::Directive( + EntryFormatDirective::parse(s)?, + )) + } + fn from_components(components: Vec) -> Self { + Self { components } } } @@ -320,8 +462,11 @@ impl FormatSpec { "failed to parse archive format string {archive_format:?}: {e}" )) })?; - dbg!(&overview); - let entry = EntryFormatSpec::parse(&entry_format)?; + let entry = EntryFormatSpec::parse_format(&entry_format).map_err(|e| { + Info::exit_arg_invalid(&format!( + "failed to parse entry format string {entry_format:?}: {e}" + )) + })?; Ok(Self::Custom { overview, entry }) } } From 9967050d4aa3374b2c6122404e9c94dd35ab9787 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:44:07 -0400 Subject: [PATCH 37/68] finish help text for info --- cli/src/args/compress.rs | 1 + cli/src/args/info.rs | 129 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/cli/src/args/compress.rs b/cli/src/args/compress.rs index 01afa648a..d1b82a448 100644 --- a/cli/src/args/compress.rs +++ b/cli/src/args/compress.rs @@ -70,6 +70,7 @@ impl Compress { const ZSTD_HELP_LINE: &'static str = ""; } +/* TODO: add support for entry and file comments! */ impl CommandFormat for Compress { const COMMAND_NAME: &'static str = "compress"; const COMMAND_TABS: &'static str = "\t"; diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 083d0ab6a..5ed0b56f0 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -498,6 +498,9 @@ archive and individual entries is printed to stdout. 
This format, along with the tools. For stable output, a custom format string should be provided with --format. +*Note:* the archive metadata is printed *after* the metadata for each entry, +because zip files store metadata at the end of the file! + Note that the contents of individual entries are not accessible with this command, and should instead be extracted with the '{}' subcommand, which can write entries to stdout or a given file path as well as extracted into an @@ -515,7 +518,133 @@ output directory. accepted. Explicit trailing newlines must be specified and will not be inserted automatically. + Note again that archive metadata is printed after all entries + are formatted. + # Format specs: +Format specs are literal strings interspersed with directives, which are +surrounded by *paired* '%' characters. This is different from typical %-encoded +format strings which only use a single '%'. A doubled '%%' produces a literal +'%', while '%name%' encodes a directive "name". The directives for archive and +entry format strings are different, but certain directives are parsed with +modifier strings which are shared across both format types. These modifiers are +discussed in the section on . + +## Archive format directives: +This is printed at the bottom of the output, after all entries are formatted. + +%name% + The name of the file provided as input, or '' for stdin. + +%size% + The size of the entire archive. + +%num% + The number of entries in the archive. + +%comment% + The archive comment, if provided (otherwise an empty string). + +%offset% + The offset of the first entry's local header from the start of the + file. This is where the zip file content starts, and arbitrary data may be + present in the space before this point. + +%cde-offset% + The offset of the central directory record from the start of the file. This + is where entry contents end, and after this point is only zip metadata until + the end of the file. 
+ +## Entry format directives: +This is printed for each entry. Note again that no newlines are inserted +automatically, so an explicit trailing newline must be provided to avoid writing +all the output to a single line. + +%name% + The name of the entry in the archive. This is the relative path that the + entry would be extracted to. + +%type% + The type of the entry (file, directory, or symlink). + +%comment% + The entry comment, if provided (otherwise an empty string). + +%header-start% + The offset of the entry's local header, which comes before any + entry contents. + +%content-start% + The offset of the entry's possibly-compressed content, which comes after the + local header. + +%content-end% + The offset of the end of the entry's possibly-compressed content. The next + entry's local header begins immediately after. + +%compressed-size% + The size of the entry's possibly-compressed content as stored in + the archive. + +%uncompressed-size% + The size of the entry's content after decompression, as it would be + after extraction. + +%unix-mode% + The mode bits for the entry, if set. If unset, this is interpreted as + a value of 0. + +%compression-method% + The method used to compress the entry. + +%crc% + The CRC32 value for the entry. + +%timestamp% + The timestamp for the entry. + + Note that zip timestamps only have precision down to the minute. + +## Entry format directives: + +## Modifiers : +byte-size = '' [DEFAULT => decimal] + = ':decimal' (decimal numeric representation) + = ':human' (human-abbreviated size e.g. 
1K, 1M) + +offset = '' [DEFAULT => hex] + = ':decimal' (decimal numeric representation) + = ':hex' (hexadecimal numeric representation) + +bin-str = '' [DEFAULT => print] + = ':print' (print string, erroring upon invalid unicode) + = ':escape' (surround with "" and escape non-unicode characters) + = ':write' (write string to output without checking for unicode) + +unix-mode = '' [DEFAULT => octal] + = ':octal' (octal numeric representation) + = ':pretty' (`ls`-like permissions string) + +timestamp = '' [DEFAULT => date-time] + = ':epoch' (milliseconds since unix epoch as a decimal number) + = ':date' (ISO 8601 string representation of date) + = ':time' (HH:MM string representation of time) + = ':date-time' + (ISO 8601 date then HH:MM time joined by a space) + +compression-method + = '' [DEFAULT => full] + = ':abbrev' (abbreviated name of method) + = ':full' (full name of method) + +bin-num = '' [DEFAULT => hex] + = ':decimal' (decimal numeric representation) + = ':hex' (hexadecimal numeric representation) + +file-type = '' [DEFAULT => full] + = ':abbrev' (abbreviated name of file type) + = ':full' (full name of file type) + {} From c5018ee2051daf0ded36cbd6f296dd2ac3b17b45 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:46:46 -0400 Subject: [PATCH 38/68] add info command stub --- cli/src/args/info.rs | 2 +- cli/src/info.rs | 18 ++++++++++++++++++ cli/src/lib.rs | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 cli/src/info.rs diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 5ed0b56f0..737a7117b 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -766,6 +766,6 @@ file-type = '' [DEFAULT => full] impl crate::driver::ExecuteCommand for Info { fn execute(self, err: impl std::io::Write) -> Result<(), crate::CommandError> { - todo!() + crate::info::execute_info(err, self) } } diff --git a/cli/src/info.rs b/cli/src/info.rs new file mode 100644 index 
000000000..79ebade99 --- /dev/null +++ b/cli/src/info.rs @@ -0,0 +1,18 @@ +use std::{ + fs, + io::{self, Cursor, IsTerminal, Seek, Write}, + mem, + path::Path, +}; + +use zip::{ + unstable::path_to_string, + write::{SimpleFileOptions, ZipWriter}, + CompressionMethod, ZIP64_BYTES_THR, +}; + +use crate::{args::info::*, CommandError, OutputHandle, WrapCommandErr}; + +pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> { + todo!() +} diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 246f2ab9c..b29151eec 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -7,6 +7,7 @@ use std::{fs, io}; pub mod args; pub mod compress; pub mod extract; +pub mod info; pub enum ErrHandle { Output(W), From e641d41ba694ee254c8f45e251d4925f4ce284d8 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 27 Aug 2024 22:52:42 -0400 Subject: [PATCH 39/68] implement basic entry info --- cli/src/args/info.rs | 25 +- cli/src/extract.rs | 8 +- cli/src/extract/transform.rs | 2 +- cli/src/info.rs | 508 ++++++++++++++++++++++++++++++++++- cli/src/lib.rs | 6 + 5 files changed, 530 insertions(+), 19 deletions(-) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 737a7117b..cef24e8c2 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -156,7 +156,8 @@ impl ArchiveOverviewFormatDirective { trait ParseableFormat: Sized { type Component: Sized; - const ESCAPED: Self::Component; + const ESCAPED_PERCENT: Self::Component; + const ESCAPED_NEWLINE: Self::Component; fn make_literal(s: &str) -> Self::Component; fn parse_directive(s: &str) -> Result; fn from_components(components: Vec) -> Self; @@ -183,7 +184,11 @@ trait ParseableFormat: Sized { match directive_contents { /* An empty directive is a literal percent. */ "%%" => { - components.push(Self::ESCAPED); + components.push(Self::ESCAPED_PERCENT); + } + /* A single '!' directive is a literal newline. 
*/ + "%!%" => { + components.push(Self::ESCAPED_NEWLINE); } /* Otherwise, parse the space between percents. */ d => { @@ -205,6 +210,7 @@ trait ParseableFormat: Sized { pub enum ArchiveOverviewFormatComponent { Directive(ArchiveOverviewFormatDirective), EscapedPercent, + EscapedNewline, Literal(String), } @@ -215,7 +221,8 @@ pub struct ArchiveOverviewFormatSpec { impl ParseableFormat for ArchiveOverviewFormatSpec { type Component = ArchiveOverviewFormatComponent; - const ESCAPED: Self::Component = ArchiveOverviewFormatComponent::EscapedPercent; + const ESCAPED_PERCENT: Self::Component = ArchiveOverviewFormatComponent::EscapedPercent; + const ESCAPED_NEWLINE: Self::Component = ArchiveOverviewFormatComponent::EscapedNewline; fn make_literal(s: &str) -> Self::Component { ArchiveOverviewFormatComponent::Literal(s.to_string()) } @@ -417,6 +424,7 @@ impl EntryFormatDirective { pub enum EntryFormatComponent { Directive(EntryFormatDirective), EscapedPercent, + EscapedNewline, Literal(String), } @@ -427,7 +435,8 @@ pub struct EntryFormatSpec { impl ParseableFormat for EntryFormatSpec { type Component = EntryFormatComponent; - const ESCAPED: Self::Component = EntryFormatComponent::EscapedPercent; + const ESCAPED_PERCENT: Self::Component = EntryFormatComponent::EscapedPercent; + const ESCAPED_NEWLINE: Self::Component = EntryFormatComponent::EscapedNewline; fn make_literal(s: &str) -> Self::Component { EntryFormatComponent::Literal(s.to_string()) } @@ -462,6 +471,7 @@ impl FormatSpec { "failed to parse archive format string {archive_format:?}: {e}" )) })?; + dbg!(&entry_format); let entry = EntryFormatSpec::parse_format(&entry_format).map_err(|e| { Info::exit_arg_invalid(&format!( "failed to parse entry format string {entry_format:?}: {e}" @@ -530,6 +540,13 @@ entry format strings are different, but certain directives are parsed with modifier strings which are shared across both format types. These modifiers are discussed in the section on . 
+## Escape characters: +%% + Prints a literal percent '%'. + +%!% + Prints a single literal newline '\n'. + ## Archive format directives: This is printed at the bottom of the output, after all entries are formatted. diff --git a/cli/src/extract.rs b/cli/src/extract.rs index a0589ee72..3cda277f4 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -7,10 +7,10 @@ use std::{ use crate::{args::extract::*, CommandError, WrapCommandErr}; -mod entries; -mod matcher; -mod receiver; -mod transform; +pub mod entries; +pub mod matcher; +pub mod receiver; +pub mod transform; use entries::IterateEntries; use matcher::EntryMatcher; use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryKind, EntryReceiver, ExtractEntry}; diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index 2ae3ade27..cc6ad710d 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -1,4 +1,4 @@ -use std::{borrow::Cow, collections::VecDeque, ops, path::Path, slice, str}; +use std::{borrow::Cow, collections::VecDeque, ops, path::Path, str}; #[cfg(feature = "rx")] use regex; diff --git a/cli/src/info.rs b/cli/src/info.rs index 79ebade99..60744927d 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -1,18 +1,506 @@ use std::{ - fs, - io::{self, Cursor, IsTerminal, Seek, Write}, - mem, - path::Path, + borrow::Cow, + collections::HashMap, + convert::Infallible, + fmt, fs, + io::{self, Write}, + marker::PhantomData, + path::PathBuf, + sync::{Arc, LazyLock, Mutex}, }; use zip::{ - unstable::path_to_string, - write::{SimpleFileOptions, ZipWriter}, - CompressionMethod, ZIP64_BYTES_THR, + read::{read_zipfile_from_stream, ZipArchive, ZipFile}, + CompressionMethod, }; -use crate::{args::info::*, CommandError, OutputHandle, WrapCommandErr}; +use crate::{ + args::{extract::InputSpec, info::*}, + extract::{ + matcher::{CompiledMatcher, EntryMatcher}, + receiver::{EntryData, EntryKind}, + }, + CommandError, WrapCommandErr, +}; + +trait FormatValue { + type 
Input<'a>; + type Output<'a>: AsRef; + type E: fmt::Display; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E>; +} + +#[derive(Copy, Clone)] +struct NameString; + +impl FormatValue for NameString { + type Input<'a> = &'a str; + type Output<'a> = &'a str; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(input) + } +} + +#[derive(Copy, Clone)] +struct FileTypeValue(FileTypeFormat); + +impl FormatValue for FileTypeValue { + type Input<'a> = EntryKind; + type Output<'a> = &'static str; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(match self.0 { + FileTypeFormat::Full => match input { + EntryKind::File => "file", + EntryKind::Dir => "directory", + EntryKind::Symlink => "symlink", + }, + FileTypeFormat::Abbreviated => match input { + EntryKind::File => "f", + EntryKind::Dir => "d", + EntryKind::Symlink => "s", + }, + }) + } +} + +#[derive(Copy, Clone)] +struct CompressionMethodValue(CompressionMethodFormat); + +impl FormatValue for CompressionMethodValue { + type Input<'a> = CompressionMethod; + type Output<'a> = &'static str; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(match self.0 { + CompressionMethodFormat::Full => match input { + CompressionMethod::Stored => "stored", + CompressionMethod::Deflated => "deflate", + #[cfg(feature = "deflate64")] + CompressionMethod::Deflate64 => "deflate64", + #[cfg(feature = "bzip2")] + CompressionMethod::Bzip2 => "bzip2", + #[cfg(feature = "zstd")] + CompressionMethod::Zstd => "zstd", + #[cfg(feature = "lzma")] + CompressionMethod::Lzma => "lzma", + #[cfg(feature = "xz")] + CompressionMethod::Xz => "xz", + _ => "unknown", + }, + CompressionMethodFormat::Abbreviated => match input { + CompressionMethod::Stored => "stor", + CompressionMethod::Deflated => "defl", + #[cfg(feature = "deflate64")] + CompressionMethod::Deflate64 => "df64", + 
#[cfg(feature = "bzip2")] + CompressionMethod::Bzip2 => "bz2", + #[cfg(feature = "zstd")] + CompressionMethod::Zstd => "zst", + #[cfg(feature = "lzma")] + CompressionMethod::Lzma => "lz", + #[cfg(feature = "xz")] + CompressionMethod::Xz => "xz", + _ => "?", + }, + }) + } +} + +static GENERATED_MODE_STRINGS: LazyLock, UnixModeFormat), Arc>>> = + LazyLock::new(|| Mutex::new(HashMap::new())); + +#[derive(Copy, Clone)] +struct UnixModeValue(UnixModeFormat); + +impl UnixModeValue { + const S_IRUSR: u32 = 256; + const S_IWUSR: u32 = 128; + const S_IXUSR: u32 = 64; + + const S_IRGRP: u32 = 32; + const S_IWGRP: u32 = 16; + const S_IXGRP: u32 = 8; + + const S_IROTH: u32 = 4; + const S_IWOTH: u32 = 2; + const S_IXOTH: u32 = 1; + + fn pretty_format_mode_bits(mode: u32) -> [u8; 9] { + let mut ret = [b'-'; 9]; + + if mode & Self::S_IRUSR == Self::S_IRUSR { + ret[0] = b'r'; + } + if mode & Self::S_IWUSR == Self::S_IWUSR { + ret[1] = b'w'; + } + if mode & Self::S_IXUSR == Self::S_IXUSR { + ret[2] = b'x'; + } + + if mode & Self::S_IRGRP == Self::S_IRGRP { + ret[3] = b'r'; + } + if mode & Self::S_IWGRP == Self::S_IWGRP { + ret[4] = b'w'; + } + if mode & Self::S_IXGRP == Self::S_IXGRP { + ret[5] = b'x'; + } + + if mode & Self::S_IROTH == Self::S_IROTH { + ret[6] = b'r'; + } + if mode & Self::S_IWOTH == Self::S_IWOTH { + ret[7] = b'w'; + } + if mode & Self::S_IXOTH == Self::S_IXOTH { + ret[8] = b'x'; + } + + ret + } +} + +impl FormatValue for UnixModeValue { + type Input<'a> = Option; + type Output<'a> = Arc; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(Arc::clone( + GENERATED_MODE_STRINGS + .lock() + .unwrap() + .entry((input, self.0)) + .or_insert_with(|| { + let x = input.unwrap_or(0); + Arc::from(match self.0 { + UnixModeFormat::Octal => format!("{x:o}"), + UnixModeFormat::Pretty => { + String::from_utf8(Self::pretty_format_mode_bits(x).to_vec()).unwrap() + } + }) + }), + )) + } +} + +#[derive(Copy, Clone)] +struct 
ByteSizeValue(ByteSizeFormat); + +static ZERO_SIZE: &'static str = "0"; + +impl FormatValue for ByteSizeValue { + type Input<'a> = u64; + type Output<'a> = Cow<'static, str>; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + if input == 0 { + return Ok(Cow::Borrowed(ZERO_SIZE)); + } + Ok(Cow::Owned(match self.0 { + ByteSizeFormat::FullDecimal => format!("{}", input), + ByteSizeFormat::HumanAbbreviated => todo!("human abbreviated byte sizes"), + })) + } +} + +struct ArchiveWithPath { + pub path: PathBuf, + pub archive: ZipArchive, +} + +impl ArchiveWithPath { + pub fn open(path: PathBuf) -> Result { + let f = fs::File::open(&path) + .wrap_err_with(|| format!("failed to open zip input file path {:?}", &path))?; + let archive = ZipArchive::new(f) + .wrap_err_with(|| format!("failed to create zip archive from file {:?}", &path))?; + Ok(Self { path, archive }) + } +} + +trait FormatDirective { + type Data<'a>; + type FieldType: FormatValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a>; + fn value_formatter(&self) -> Self::FieldType; + + fn format_field<'a>( + &self, + data: Self::Data<'a>, + ) -> Result<::Output<'a>, ::E> + { + self.value_formatter() + .format_value(self.extract_field(data)) + } +} + +struct EntryNameField(NameString); + +impl FormatDirective for EntryNameField { + type Data<'a> = EntryData<'a>; + type FieldType = NameString; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.name + } + fn value_formatter(&self) -> NameString { + self.0 + } +} + +struct FileTypeField(FileTypeValue); + +impl FormatDirective for FileTypeField { + type Data<'a> = EntryData<'a>; + type FieldType = FileTypeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.kind + } + fn value_formatter(&self) -> FileTypeValue { + self.0 + } +} + +struct CompressionMethodField(CompressionMethodValue); + +impl FormatDirective for 
CompressionMethodField { + type Data<'a> = EntryData<'a>; + type FieldType = CompressionMethodValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.compression + } + fn value_formatter(&self) -> CompressionMethodValue { + self.0 + } +} + +struct UnixModeField(UnixModeValue); + +impl FormatDirective for UnixModeField { + type Data<'a> = EntryData<'a>; + type FieldType = UnixModeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.unix_mode + } + fn value_formatter(&self) -> UnixModeValue { + self.0 + } +} + +struct UncompressedSizeField(ByteSizeValue); + +impl FormatDirective for UncompressedSizeField { + type Data<'a> = EntryData<'a>; + type FieldType = ByteSizeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.size + } + fn value_formatter(&self) -> ByteSizeValue { + self.0 + } +} + +trait ComponentFormatter { + type Data<'a>; + + fn write_component<'a>( + &self, + data: Self::Data<'a>, + out: &mut dyn Write, + ) -> Result; +} + +impl ComponentFormatter for FD +where + FD: FormatDirective, +{ + type Data<'a> = ::Data<'a>; + + fn write_component<'a>( + &self, + data: Self::Data<'a>, + out: &mut dyn Write, + ) -> Result { + let output = self + .format_field(data) + .map_err(|e| CommandError::InvalidData(format!("error formatting field: {e}")))?; + let output: &str = output.as_ref(); + let n = output.len(); + out.write_all(output.as_bytes()) + .wrap_err_with(|| format!("failed to write output to stream: {output:?}"))?; + Ok(n) + } +} + +trait EntryComponentFormatter { + fn write_entry_component<'a>( + &self, + data: EntryData<'a>, + out: &mut dyn Write, + ) -> Result; +} + +impl EntryComponentFormatter for CF +where + CF: for<'a> ComponentFormatter = EntryData<'a>>, +{ + fn write_entry_component<'a>( + &self, + data: EntryData<'a>, + out: &mut dyn Write, + ) -> Result { + self.write_component(data, out) + } +} + +enum CompiledEntryFormatComponent { + 
Directive(Box), + EscapedPercent, + EscapedNewline, + Literal(String), +} + +impl CompiledEntryFormatComponent { + fn compile_directive( + spec: EntryFormatDirective, + ) -> Result, CommandError> { + Ok(match spec { + EntryFormatDirective::Name => Box::new(EntryNameField(NameString)), + EntryFormatDirective::FileType(f) => Box::new(FileTypeField(FileTypeValue(f))), + EntryFormatDirective::UncompressedSize(f) => { + Box::new(UncompressedSizeField(ByteSizeValue(f))) + } + EntryFormatDirective::UnixMode(f) => Box::new(UnixModeField(UnixModeValue(f))), + EntryFormatDirective::CompressionMethod(f) => { + Box::new(CompressionMethodField(CompressionMethodValue(f))) + } + _ => todo!(), + }) + } + + pub fn from_spec(spec: EntryFormatComponent) -> Result { + match spec { + EntryFormatComponent::Directive(directive) => { + Ok(Self::Directive(Self::compile_directive(directive)?)) + } + EntryFormatComponent::EscapedPercent => Ok(Self::EscapedPercent), + EntryFormatComponent::EscapedNewline => Ok(Self::EscapedNewline), + EntryFormatComponent::Literal(lit) => Ok(Self::Literal(lit)), + } + } + + pub fn write_component<'a>( + &self, + data: EntryData<'a>, + mut out: impl Write, + ) -> Result { + match self { + Self::Directive(directive) => directive.write_entry_component(data, &mut out), + Self::EscapedPercent => out + .write_all(b"%") + .wrap_err("failed to write escaped % to output") + .map(|()| 1), + Self::EscapedNewline => out + .write_all(b"\n") + .wrap_err("failed to write escaped newline to output") + .map(|()| 1), + Self::Literal(lit) => out + .write_all(lit.as_bytes()) + .wrap_err_with(|| format!("failed to write literal {lit:?} to output")) + .map(|()| lit.len()), + } + } +} + +struct CompiledEntryFormatter { + components: Vec, +} + +impl CompiledEntryFormatter { + pub fn from_spec(spec: EntryFormatSpec) -> Result { + let EntryFormatSpec { components } = spec; + let components: Vec<_> = components + .into_iter() + .map(CompiledEntryFormatComponent::from_spec) + 
.collect::>()?; + Ok(Self { components }) + } + + pub fn write_entry<'a>( + &self, + data: EntryData<'a>, + mut out: impl Write, + ) -> Result { + let mut written: usize = 0; + for c in self.components.iter() { + written += c.write_component(data, &mut out)?; + } + Ok(written) + } +} + +pub fn execute_info(err: impl Write, args: Info) -> Result<(), CommandError> { + let Info { + format_spec, + match_expr, + input_spec: InputSpec { + stdin_stream, + zip_paths, + }, + } = args; + + let matcher = match match_expr { + None => None, + Some(expr) => Some(CompiledMatcher::from_arg(expr)?), + }; + let (archive_formatter, entry_formatter) = match format_spec { + FormatSpec::Compact => todo!(), + FormatSpec::Extended => todo!(), + FormatSpec::Custom { overview, entry } => ((), CompiledEntryFormatter::from_spec(entry)?), + }; + let mut output_stream = io::stdout().lock(); + + if stdin_stream { + let mut stdin = io::stdin().lock(); + while let Some(entry) = + read_zipfile_from_stream(&mut stdin).wrap_err("error reading zip entry from stdin")? 
+ { + let data = EntryData::from_entry(&entry); + entry_formatter.write_entry(data, &mut output_stream)?; + } + } + for p in zip_paths.into_iter() { + let mut zip = ArchiveWithPath::open(p.clone())?; + for i in 0..zip.archive.len() { + let entry = zip + .archive + .by_index(i) + .wrap_err_with(|| format!("failed to read entry {i} from zip at {p:?}"))?; + let data = EntryData::from_entry(&entry); + entry_formatter.write_entry(data, &mut output_stream)?; + } + } -pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> { - todo!() + Ok(()) } diff --git a/cli/src/lib.rs b/cli/src/lib.rs index b29151eec..24db1aaae 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -75,6 +75,7 @@ impl io::Seek for OutputHandle { #[derive(Debug)] pub enum CommandError { InvalidArg(String), + InvalidData(String), Io(String, io::Error), Zip(String, zip::result::ZipError), } @@ -122,6 +123,11 @@ pub mod driver { let _ = io::stderr().write_all(msg.as_bytes()); process::exit(ZipCli::ARGV_PARSE_FAILED_EXIT_CODE); } + CommandError::InvalidData(msg) => { + let msg = format!("error processing zip data: {msg}\n"); + let _ = io::stderr().write_all(msg.as_bytes()); + process::exit(ZipCli::ARGV_PARSE_FAILED_EXIT_CODE); + } CommandError::Io(context, e) => { let msg = format!("i/o error: {context}: {e}\n"); let _ = io::stderr().write_all(msg.as_bytes()); From 0c189b237de5cecab000ca7f12fac975beee47a6 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 00:54:05 -0400 Subject: [PATCH 40/68] write directly to the output stream, don't allocate a string --- cli/src/info.rs | 123 +++++++++++++++++++++++++++--------------------- 1 file changed, 69 insertions(+), 54 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index 60744927d..6b86bab01 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -1,16 +1,12 @@ use std::{ - borrow::Cow, - collections::HashMap, convert::Infallible, fmt, fs, io::{self, Write}, - 
marker::PhantomData, path::PathBuf, - sync::{Arc, LazyLock, Mutex}, }; use zip::{ - read::{read_zipfile_from_stream, ZipArchive, ZipFile}, + read::{read_zipfile_from_stream, ZipArchive}, CompressionMethod, }; @@ -23,10 +19,23 @@ use crate::{ CommandError, WrapCommandErr, }; +trait Writeable { + fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error>; +} + +impl Writeable for S +where + S: fmt::Display, +{ + fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { + write!(out, "{}", self) + } +} + trait FormatValue { type Input<'a>; - type Output<'a>: AsRef; - type E: fmt::Display; + type Output<'a>; + type E; fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E>; } @@ -108,9 +117,6 @@ impl FormatValue for CompressionMethodValue { } } -static GENERATED_MODE_STRINGS: LazyLock, UnixModeFormat), Arc>>> = - LazyLock::new(|| Mutex::new(HashMap::new())); - #[derive(Copy, Clone)] struct UnixModeValue(UnixModeFormat); @@ -164,46 +170,59 @@ impl UnixModeValue { } } +#[derive(Debug)] +enum ModeValueWriter { + Octal(u32), + Pretty([u8; 9]), +} + +impl Writeable for ModeValueWriter { + fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { + match self { + Self::Octal(mode) => write!(out, "{:o}", mode), + Self::Pretty(bits) => out.write_all(bits.as_ref()), + } + } +} + impl FormatValue for UnixModeValue { type Input<'a> = Option; - type Output<'a> = Arc; + type Output<'a> = ModeValueWriter; type E = Infallible; fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - Ok(Arc::clone( - GENERATED_MODE_STRINGS - .lock() - .unwrap() - .entry((input, self.0)) - .or_insert_with(|| { - let x = input.unwrap_or(0); - Arc::from(match self.0 { - UnixModeFormat::Octal => format!("{x:o}"), - UnixModeFormat::Pretty => { - String::from_utf8(Self::pretty_format_mode_bits(x).to_vec()).unwrap() - } - }) - }), - )) + let x = input.unwrap_or(0); + Ok(match self.0 { + UnixModeFormat::Octal => ModeValueWriter::Octal(x), + 
UnixModeFormat::Pretty => ModeValueWriter::Pretty(Self::pretty_format_mode_bits(x)), + }) } } #[derive(Copy, Clone)] struct ByteSizeValue(ByteSizeFormat); -static ZERO_SIZE: &'static str = "0"; +#[derive(Debug)] +enum ByteSizeWriter { + FullDecimal(u64), +} + +impl fmt::Display for ByteSizeWriter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::FullDecimal(n) => write!(f, "{}", n), + } + } +} impl FormatValue for ByteSizeValue { type Input<'a> = u64; - type Output<'a> = Cow<'static, str>; + type Output<'a> = ByteSizeWriter; type E = Infallible; fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - if input == 0 { - return Ok(Cow::Borrowed(ZERO_SIZE)); - } - Ok(Cow::Owned(match self.0 { - ByteSizeFormat::FullDecimal => format!("{}", input), + Ok(match self.0 { + ByteSizeFormat::FullDecimal => ByteSizeWriter::FullDecimal(input), ByteSizeFormat::HumanAbbreviated => todo!("human abbreviated byte sizes"), - })) + }) } } @@ -328,12 +347,14 @@ trait ComponentFormatter { &self, data: Self::Data<'a>, out: &mut dyn Write, - ) -> Result; + ) -> Result<(), CommandError>; } impl ComponentFormatter for FD where FD: FormatDirective, + for<'a> <::FieldType as FormatValue>::Output<'a>: Writeable + fmt::Debug, + <::FieldType as FormatValue>::E: fmt::Display, { type Data<'a> = ::Data<'a>; @@ -341,15 +362,13 @@ where &self, data: Self::Data<'a>, out: &mut dyn Write, - ) -> Result { + ) -> Result<(), CommandError> { let output = self .format_field(data) .map_err(|e| CommandError::InvalidData(format!("error formatting field: {e}")))?; - let output: &str = output.as_ref(); - let n = output.len(); - out.write_all(output.as_bytes()) - .wrap_err_with(|| format!("failed to write output to stream: {output:?}"))?; - Ok(n) + output + .write_to(out) + .wrap_err_with(|| format!("failed to write output to stream: {output:?}")) } } @@ -358,7 +377,7 @@ trait EntryComponentFormatter { &self, data: EntryData<'a>, out: &mut dyn Write, - ) -> Result; 
+ ) -> Result<(), CommandError>; } impl EntryComponentFormatter for CF @@ -369,7 +388,7 @@ where &self, data: EntryData<'a>, out: &mut dyn Write, - ) -> Result { + ) -> Result<(), CommandError> { self.write_component(data, out) } } @@ -414,21 +433,18 @@ impl CompiledEntryFormatComponent { &self, data: EntryData<'a>, mut out: impl Write, - ) -> Result { + ) -> Result<(), CommandError> { match self { Self::Directive(directive) => directive.write_entry_component(data, &mut out), Self::EscapedPercent => out .write_all(b"%") - .wrap_err("failed to write escaped % to output") - .map(|()| 1), + .wrap_err("failed to write escaped % to output"), Self::EscapedNewline => out .write_all(b"\n") - .wrap_err("failed to write escaped newline to output") - .map(|()| 1), + .wrap_err("failed to write escaped newline to output"), Self::Literal(lit) => out .write_all(lit.as_bytes()) - .wrap_err_with(|| format!("failed to write literal {lit:?} to output")) - .map(|()| lit.len()), + .wrap_err_with(|| format!("failed to write literal {lit:?} to output")), } } } @@ -451,12 +467,11 @@ impl CompiledEntryFormatter { &self, data: EntryData<'a>, mut out: impl Write, - ) -> Result { - let mut written: usize = 0; + ) -> Result<(), CommandError> { for c in self.components.iter() { - written += c.write_component(data, &mut out)?; + c.write_component(data, &mut out)?; } - Ok(written) + Ok(()) } } From a76244b60dae0a580485b063cabb927eb433aa90 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 01:02:58 -0400 Subject: [PATCH 41/68] add escaped tab component --- cli/src/args/info.rs | 8 ++++++++ cli/src/info.rs | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index cef24e8c2..3b34b2c9e 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -158,6 +158,7 @@ trait ParseableFormat: Sized { type Component: Sized; const ESCAPED_PERCENT: Self::Component; const ESCAPED_NEWLINE: 
Self::Component; + const ESCAPED_TAB: Self::Component; fn make_literal(s: &str) -> Self::Component; fn parse_directive(s: &str) -> Result; fn from_components(components: Vec) -> Self; @@ -190,6 +191,9 @@ trait ParseableFormat: Sized { "%!%" => { components.push(Self::ESCAPED_NEWLINE); } + "%,%" => { + components.push(Self::ESCAPED_TAB); + } /* Otherwise, parse the space between percents. */ d => { let directive = Self::parse_directive(&d[1..(d.len() - 1)]) @@ -211,6 +215,7 @@ pub enum ArchiveOverviewFormatComponent { Directive(ArchiveOverviewFormatDirective), EscapedPercent, EscapedNewline, + EscapedTab, Literal(String), } @@ -223,6 +228,7 @@ impl ParseableFormat for ArchiveOverviewFormatSpec { type Component = ArchiveOverviewFormatComponent; const ESCAPED_PERCENT: Self::Component = ArchiveOverviewFormatComponent::EscapedPercent; const ESCAPED_NEWLINE: Self::Component = ArchiveOverviewFormatComponent::EscapedNewline; + const ESCAPED_TAB: Self::Component = ArchiveOverviewFormatComponent::EscapedTab; fn make_literal(s: &str) -> Self::Component { ArchiveOverviewFormatComponent::Literal(s.to_string()) } @@ -425,6 +431,7 @@ pub enum EntryFormatComponent { Directive(EntryFormatDirective), EscapedPercent, EscapedNewline, + EscapedTab, Literal(String), } @@ -437,6 +444,7 @@ impl ParseableFormat for EntryFormatSpec { type Component = EntryFormatComponent; const ESCAPED_PERCENT: Self::Component = EntryFormatComponent::EscapedPercent; const ESCAPED_NEWLINE: Self::Component = EntryFormatComponent::EscapedNewline; + const ESCAPED_TAB: Self::Component = EntryFormatComponent::EscapedTab; fn make_literal(s: &str) -> Self::Component { EntryFormatComponent::Literal(s.to_string()) } diff --git a/cli/src/info.rs b/cli/src/info.rs index 6b86bab01..7e03e403f 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -397,6 +397,7 @@ enum CompiledEntryFormatComponent { Directive(Box), EscapedPercent, EscapedNewline, + EscapedTab, Literal(String), } @@ -425,6 +426,7 @@ impl 
CompiledEntryFormatComponent { } EntryFormatComponent::EscapedPercent => Ok(Self::EscapedPercent), EntryFormatComponent::EscapedNewline => Ok(Self::EscapedNewline), + EntryFormatComponent::EscapedTab => Ok(Self::EscapedTab), EntryFormatComponent::Literal(lit) => Ok(Self::Literal(lit)), } } @@ -442,6 +444,9 @@ impl CompiledEntryFormatComponent { Self::EscapedNewline => out .write_all(b"\n") .wrap_err("failed to write escaped newline to output"), + Self::EscapedTab => out + .write_all(b"\t") + .wrap_err("failed to write escaped tab to output"), Self::Literal(lit) => out .write_all(lit.as_bytes()) .wrap_err_with(|| format!("failed to write literal {lit:?} to output")), From cd8a9c7539025f884e9a115fe37d5a2f57bad1a2 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 01:15:27 -0400 Subject: [PATCH 42/68] rename some traits and methods --- cli/src/info.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index 7e03e403f..3713a2538 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -340,17 +340,17 @@ impl FormatDirective for UncompressedSizeField { } } -trait ComponentFormatter { +trait DirectiveFormatter { type Data<'a>; - fn write_component<'a>( + fn write_directive<'a>( &self, data: Self::Data<'a>, out: &mut dyn Write, ) -> Result<(), CommandError>; } -impl ComponentFormatter for FD +impl DirectiveFormatter for FD where FD: FormatDirective, for<'a> <::FieldType as FormatValue>::Output<'a>: Writeable + fmt::Debug, @@ -358,7 +358,7 @@ where { type Data<'a> = ::Data<'a>; - fn write_component<'a>( + fn write_directive<'a>( &self, data: Self::Data<'a>, out: &mut dyn Write, @@ -372,29 +372,29 @@ where } } -trait EntryComponentFormatter { - fn write_entry_component<'a>( +trait EntryDirectiveFormatter { + fn write_entry_directive<'a>( &self, data: EntryData<'a>, out: &mut dyn Write, ) -> Result<(), CommandError>; } -impl 
EntryComponentFormatter for CF +impl EntryDirectiveFormatter for CF where - CF: for<'a> ComponentFormatter = EntryData<'a>>, + CF: for<'a> DirectiveFormatter = EntryData<'a>>, { - fn write_entry_component<'a>( + fn write_entry_directive<'a>( &self, data: EntryData<'a>, out: &mut dyn Write, ) -> Result<(), CommandError> { - self.write_component(data, out) + self.write_directive(data, out) } } enum CompiledEntryFormatComponent { - Directive(Box), + Directive(Box), EscapedPercent, EscapedNewline, EscapedTab, @@ -404,7 +404,7 @@ enum CompiledEntryFormatComponent { impl CompiledEntryFormatComponent { fn compile_directive( spec: EntryFormatDirective, - ) -> Result, CommandError> { + ) -> Result, CommandError> { Ok(match spec { EntryFormatDirective::Name => Box::new(EntryNameField(NameString)), EntryFormatDirective::FileType(f) => Box::new(FileTypeField(FileTypeValue(f))), @@ -437,7 +437,7 @@ impl CompiledEntryFormatComponent { mut out: impl Write, ) -> Result<(), CommandError> { match self { - Self::Directive(directive) => directive.write_entry_component(data, &mut out), + Self::Directive(directive) => directive.write_entry_directive(data, &mut out), Self::EscapedPercent => out .write_all(b"%") .wrap_err("failed to write escaped % to output"), From 4efaa43b6d27c21a4ca1b5b4b03fe1c266f9d5ca Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 01:23:24 -0400 Subject: [PATCH 43/68] pass around a reference to EntryData instead --- cli/src/info.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index 3713a2538..91159099f 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -263,7 +263,7 @@ trait FormatDirective { struct EntryNameField(NameString); impl FormatDirective for EntryNameField { - type Data<'a> = EntryData<'a>; + type Data<'a> = &'a EntryData<'a>; type FieldType = NameString; fn extract_field<'a>( &self, @@ -279,7 +279,7 @@ 
impl FormatDirective for EntryNameField { struct FileTypeField(FileTypeValue); impl FormatDirective for FileTypeField { - type Data<'a> = EntryData<'a>; + type Data<'a> = &'a EntryData<'a>; type FieldType = FileTypeValue; fn extract_field<'a>( &self, @@ -295,7 +295,7 @@ impl FormatDirective for FileTypeField { struct CompressionMethodField(CompressionMethodValue); impl FormatDirective for CompressionMethodField { - type Data<'a> = EntryData<'a>; + type Data<'a> = &'a EntryData<'a>; type FieldType = CompressionMethodValue; fn extract_field<'a>( &self, @@ -311,7 +311,7 @@ impl FormatDirective for CompressionMethodField { struct UnixModeField(UnixModeValue); impl FormatDirective for UnixModeField { - type Data<'a> = EntryData<'a>; + type Data<'a> = &'a EntryData<'a>; type FieldType = UnixModeValue; fn extract_field<'a>( &self, @@ -327,7 +327,7 @@ impl FormatDirective for UnixModeField { struct UncompressedSizeField(ByteSizeValue); impl FormatDirective for UncompressedSizeField { - type Data<'a> = EntryData<'a>; + type Data<'a> = &'a EntryData<'a>; type FieldType = ByteSizeValue; fn extract_field<'a>( &self, @@ -375,18 +375,18 @@ where trait EntryDirectiveFormatter { fn write_entry_directive<'a>( &self, - data: EntryData<'a>, + data: &EntryData<'a>, out: &mut dyn Write, ) -> Result<(), CommandError>; } impl EntryDirectiveFormatter for CF where - CF: for<'a> DirectiveFormatter = EntryData<'a>>, + CF: for<'a> DirectiveFormatter = &'a EntryData<'a>>, { fn write_entry_directive<'a>( &self, - data: EntryData<'a>, + data: &EntryData<'a>, out: &mut dyn Write, ) -> Result<(), CommandError> { self.write_directive(data, out) @@ -433,7 +433,7 @@ impl CompiledEntryFormatComponent { pub fn write_component<'a>( &self, - data: EntryData<'a>, + data: &EntryData<'a>, mut out: impl Write, ) -> Result<(), CommandError> { match self { @@ -474,7 +474,7 @@ impl CompiledEntryFormatter { mut out: impl Write, ) -> Result<(), CommandError> { for c in self.components.iter() { - 
c.write_component(data, &mut out)?; + c.write_component(&data, &mut out)?; } Ok(()) } From 2b824783b5969f46fcb0033703484915e6eb1ac5 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:03:25 -0400 Subject: [PATCH 44/68] mess with mode bits to make them look more like ls --- cli/src/info.rs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index 91159099f..13d9eac10 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -66,9 +66,9 @@ impl FormatValue for FileTypeValue { EntryKind::Symlink => "symlink", }, FileTypeFormat::Abbreviated => match input { - EntryKind::File => "f", + EntryKind::File => "-", EntryKind::Dir => "d", - EntryKind::Symlink => "s", + EntryKind::Symlink => "l", }, }) } @@ -133,6 +133,8 @@ impl UnixModeValue { const S_IWOTH: u32 = 2; const S_IXOTH: u32 = 1; + const UNKNOWN_MODE_BITS: [u8; 9] = [b'?'; 9]; + fn pretty_format_mode_bits(mode: u32) -> [u8; 9] { let mut ret = [b'-'; 9]; @@ -172,14 +174,17 @@ impl UnixModeValue { #[derive(Debug)] enum ModeValueWriter { - Octal(u32), + Octal(Option), Pretty([u8; 9]), } impl Writeable for ModeValueWriter { fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { match self { - Self::Octal(mode) => write!(out, "{:o}", mode), + Self::Octal(mode) => match mode { + Some(bits) => write!(out, "{:o}", bits), + None => write!(out, "?"), + }, Self::Pretty(bits) => out.write_all(bits.as_ref()), } } @@ -190,10 +195,12 @@ impl FormatValue for UnixModeValue { type Output<'a> = ModeValueWriter; type E = Infallible; fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - let x = input.unwrap_or(0); Ok(match self.0 { - UnixModeFormat::Octal => ModeValueWriter::Octal(x), - UnixModeFormat::Pretty => ModeValueWriter::Pretty(Self::pretty_format_mode_bits(x)), + UnixModeFormat::Octal => ModeValueWriter::Octal(input), + UnixModeFormat::Pretty => 
ModeValueWriter::Pretty(match input { + Some(bits) => Self::pretty_format_mode_bits(bits), + None => Self::UNKNOWN_MODE_BITS, + }), }) } } From 1c9c081a35aa566cb9f5da78baeb423004a1f2e2 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:14:55 -0400 Subject: [PATCH 45/68] add some verbose logs --- cli/src/info.rs | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index 13d9eac10..ec6e2e3ee 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -235,6 +235,7 @@ impl FormatValue for ByteSizeValue { struct ArchiveWithPath { pub path: PathBuf, + /* TODO: Debug impl for ZipArchive? what about ZipFile? */ pub archive: ZipArchive, } @@ -475,6 +476,10 @@ impl CompiledEntryFormatter { Ok(Self { components }) } + pub fn is_empty(&self) -> bool { + self.components.is_empty() + } + pub fn write_entry<'a>( &self, data: EntryData<'a>, @@ -487,7 +492,7 @@ impl CompiledEntryFormatter { } } -pub fn execute_info(err: impl Write, args: Info) -> Result<(), CommandError> { +pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> { let Info { format_spec, match_expr, @@ -510,22 +515,39 @@ pub fn execute_info(err: impl Write, args: Info) -> Result<(), CommandError> { if stdin_stream { let mut stdin = io::stdin().lock(); - while let Some(entry) = - read_zipfile_from_stream(&mut stdin).wrap_err("error reading zip entry from stdin")? - { - let data = EntryData::from_entry(&entry); - entry_formatter.write_entry(data, &mut output_stream)?; + if entry_formatter.is_empty() { + writeln!(&mut err, "empty entry format, skipping stdin entries").unwrap(); + } else { + while let Some(entry) = read_zipfile_from_stream(&mut stdin) + .wrap_err("error reading zip entry from stdin")? 
+ { + let data = EntryData::from_entry(&entry); + entry_formatter.write_entry(data, &mut output_stream)?; + } } + writeln!( + &mut err, + "stdin currently cannot provide archive format info" + ) + .unwrap(); } for p in zip_paths.into_iter() { let mut zip = ArchiveWithPath::open(p.clone())?; - for i in 0..zip.archive.len() { - let entry = zip - .archive - .by_index(i) - .wrap_err_with(|| format!("failed to read entry {i} from zip at {p:?}"))?; - let data = EntryData::from_entry(&entry); - entry_formatter.write_entry(data, &mut output_stream)?; + if entry_formatter.is_empty() { + writeln!( + &mut err, + "empty entry format, skipping entries for file {p:?}" + ) + .unwrap(); + } else { + for i in 0..zip.archive.len() { + let entry = zip + .archive + .by_index(i) + .wrap_err_with(|| format!("failed to read entry {i} from zip at {p:?}"))?; + let data = EntryData::from_entry(&entry); + entry_formatter.write_entry(data, &mut output_stream)?; + } } } From 2aa66de522fd73566b3bb148b3714daf2fab5b96 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:40:43 -0400 Subject: [PATCH 46/68] refactor info modules --- cli/src/info.rs | 395 +++---------------------------------- cli/src/info/directives.rs | 164 +++++++++++++++ cli/src/info/formats.rs | 211 ++++++++++++++++++++ 3 files changed, 403 insertions(+), 367 deletions(-) create mode 100644 cli/src/info/directives.rs create mode 100644 cli/src/info/formats.rs diff --git a/cli/src/info.rs b/cli/src/info.rs index ec6e2e3ee..eebf9926c 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -1,384 +1,29 @@ use std::{ - convert::Infallible, - fmt, fs, + fs, io::{self, Write}, path::PathBuf, }; -use zip::{ - read::{read_zipfile_from_stream, ZipArchive}, - CompressionMethod, -}; +use zip::read::{read_zipfile_from_stream, ZipArchive}; use crate::{ args::{extract::InputSpec, info::*}, extract::{ matcher::{CompiledMatcher, EntryMatcher}, - receiver::{EntryData, 
EntryKind}, + receiver::EntryData, }, CommandError, WrapCommandErr, }; -trait Writeable { - fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error>; -} - -impl Writeable for S -where - S: fmt::Display, -{ - fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { - write!(out, "{}", self) - } -} - -trait FormatValue { - type Input<'a>; - type Output<'a>; - type E; - fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E>; -} - -#[derive(Copy, Clone)] -struct NameString; - -impl FormatValue for NameString { - type Input<'a> = &'a str; - type Output<'a> = &'a str; - type E = Infallible; - fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - Ok(input) - } -} - -#[derive(Copy, Clone)] -struct FileTypeValue(FileTypeFormat); - -impl FormatValue for FileTypeValue { - type Input<'a> = EntryKind; - type Output<'a> = &'static str; - type E = Infallible; - fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - Ok(match self.0 { - FileTypeFormat::Full => match input { - EntryKind::File => "file", - EntryKind::Dir => "directory", - EntryKind::Symlink => "symlink", - }, - FileTypeFormat::Abbreviated => match input { - EntryKind::File => "-", - EntryKind::Dir => "d", - EntryKind::Symlink => "l", - }, - }) - } -} - -#[derive(Copy, Clone)] -struct CompressionMethodValue(CompressionMethodFormat); - -impl FormatValue for CompressionMethodValue { - type Input<'a> = CompressionMethod; - type Output<'a> = &'static str; - type E = Infallible; - fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - Ok(match self.0 { - CompressionMethodFormat::Full => match input { - CompressionMethod::Stored => "stored", - CompressionMethod::Deflated => "deflate", - #[cfg(feature = "deflate64")] - CompressionMethod::Deflate64 => "deflate64", - #[cfg(feature = "bzip2")] - CompressionMethod::Bzip2 => "bzip2", - #[cfg(feature = "zstd")] - CompressionMethod::Zstd => "zstd", - #[cfg(feature = "lzma")] - 
CompressionMethod::Lzma => "lzma", - #[cfg(feature = "xz")] - CompressionMethod::Xz => "xz", - _ => "unknown", - }, - CompressionMethodFormat::Abbreviated => match input { - CompressionMethod::Stored => "stor", - CompressionMethod::Deflated => "defl", - #[cfg(feature = "deflate64")] - CompressionMethod::Deflate64 => "df64", - #[cfg(feature = "bzip2")] - CompressionMethod::Bzip2 => "bz2", - #[cfg(feature = "zstd")] - CompressionMethod::Zstd => "zst", - #[cfg(feature = "lzma")] - CompressionMethod::Lzma => "lz", - #[cfg(feature = "xz")] - CompressionMethod::Xz => "xz", - _ => "?", - }, - }) - } -} - -#[derive(Copy, Clone)] -struct UnixModeValue(UnixModeFormat); - -impl UnixModeValue { - const S_IRUSR: u32 = 256; - const S_IWUSR: u32 = 128; - const S_IXUSR: u32 = 64; - - const S_IRGRP: u32 = 32; - const S_IWGRP: u32 = 16; - const S_IXGRP: u32 = 8; - - const S_IROTH: u32 = 4; - const S_IWOTH: u32 = 2; - const S_IXOTH: u32 = 1; - - const UNKNOWN_MODE_BITS: [u8; 9] = [b'?'; 9]; - - fn pretty_format_mode_bits(mode: u32) -> [u8; 9] { - let mut ret = [b'-'; 9]; - - if mode & Self::S_IRUSR == Self::S_IRUSR { - ret[0] = b'r'; - } - if mode & Self::S_IWUSR == Self::S_IWUSR { - ret[1] = b'w'; - } - if mode & Self::S_IXUSR == Self::S_IXUSR { - ret[2] = b'x'; - } - - if mode & Self::S_IRGRP == Self::S_IRGRP { - ret[3] = b'r'; - } - if mode & Self::S_IWGRP == Self::S_IWGRP { - ret[4] = b'w'; - } - if mode & Self::S_IXGRP == Self::S_IXGRP { - ret[5] = b'x'; - } - - if mode & Self::S_IROTH == Self::S_IROTH { - ret[6] = b'r'; - } - if mode & Self::S_IWOTH == Self::S_IWOTH { - ret[7] = b'w'; - } - if mode & Self::S_IXOTH == Self::S_IXOTH { - ret[8] = b'x'; - } - - ret - } -} - -#[derive(Debug)] -enum ModeValueWriter { - Octal(Option), - Pretty([u8; 9]), -} - -impl Writeable for ModeValueWriter { - fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { - match self { - Self::Octal(mode) => match mode { - Some(bits) => write!(out, "{:o}", bits), - None => write!(out, "?"), - 
}, - Self::Pretty(bits) => out.write_all(bits.as_ref()), - } - } -} - -impl FormatValue for UnixModeValue { - type Input<'a> = Option; - type Output<'a> = ModeValueWriter; - type E = Infallible; - fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - Ok(match self.0 { - UnixModeFormat::Octal => ModeValueWriter::Octal(input), - UnixModeFormat::Pretty => ModeValueWriter::Pretty(match input { - Some(bits) => Self::pretty_format_mode_bits(bits), - None => Self::UNKNOWN_MODE_BITS, - }), - }) - } -} - -#[derive(Copy, Clone)] -struct ByteSizeValue(ByteSizeFormat); - -#[derive(Debug)] -enum ByteSizeWriter { - FullDecimal(u64), -} - -impl fmt::Display for ByteSizeWriter { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::FullDecimal(n) => write!(f, "{}", n), - } - } -} - -impl FormatValue for ByteSizeValue { - type Input<'a> = u64; - type Output<'a> = ByteSizeWriter; - type E = Infallible; - fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - Ok(match self.0 { - ByteSizeFormat::FullDecimal => ByteSizeWriter::FullDecimal(input), - ByteSizeFormat::HumanAbbreviated => todo!("human abbreviated byte sizes"), - }) - } -} - -struct ArchiveWithPath { - pub path: PathBuf, - /* TODO: Debug impl for ZipArchive? what about ZipFile? 
*/ - pub archive: ZipArchive, -} - -impl ArchiveWithPath { - pub fn open(path: PathBuf) -> Result { - let f = fs::File::open(&path) - .wrap_err_with(|| format!("failed to open zip input file path {:?}", &path))?; - let archive = ZipArchive::new(f) - .wrap_err_with(|| format!("failed to create zip archive from file {:?}", &path))?; - Ok(Self { path, archive }) - } -} - -trait FormatDirective { - type Data<'a>; - type FieldType: FormatValue; - fn extract_field<'a>( - &self, - data: Self::Data<'a>, - ) -> ::Input<'a>; - fn value_formatter(&self) -> Self::FieldType; - - fn format_field<'a>( - &self, - data: Self::Data<'a>, - ) -> Result<::Output<'a>, ::E> - { - self.value_formatter() - .format_value(self.extract_field(data)) - } -} - -struct EntryNameField(NameString); - -impl FormatDirective for EntryNameField { - type Data<'a> = &'a EntryData<'a>; - type FieldType = NameString; - fn extract_field<'a>( - &self, - data: Self::Data<'a>, - ) -> ::Input<'a> { - data.name - } - fn value_formatter(&self) -> NameString { - self.0 - } -} - -struct FileTypeField(FileTypeValue); - -impl FormatDirective for FileTypeField { - type Data<'a> = &'a EntryData<'a>; - type FieldType = FileTypeValue; - fn extract_field<'a>( - &self, - data: Self::Data<'a>, - ) -> ::Input<'a> { - data.kind - } - fn value_formatter(&self) -> FileTypeValue { - self.0 - } -} - -struct CompressionMethodField(CompressionMethodValue); - -impl FormatDirective for CompressionMethodField { - type Data<'a> = &'a EntryData<'a>; - type FieldType = CompressionMethodValue; - fn extract_field<'a>( - &self, - data: Self::Data<'a>, - ) -> ::Input<'a> { - data.compression - } - fn value_formatter(&self) -> CompressionMethodValue { - self.0 - } -} - -struct UnixModeField(UnixModeValue); - -impl FormatDirective for UnixModeField { - type Data<'a> = &'a EntryData<'a>; - type FieldType = UnixModeValue; - fn extract_field<'a>( - &self, - data: Self::Data<'a>, - ) -> ::Input<'a> { - data.unix_mode - } - fn 
value_formatter(&self) -> UnixModeValue { - self.0 - } -} - -struct UncompressedSizeField(ByteSizeValue); - -impl FormatDirective for UncompressedSizeField { - type Data<'a> = &'a EntryData<'a>; - type FieldType = ByteSizeValue; - fn extract_field<'a>( - &self, - data: Self::Data<'a>, - ) -> ::Input<'a> { - data.size - } - fn value_formatter(&self) -> ByteSizeValue { - self.0 - } -} - -trait DirectiveFormatter { - type Data<'a>; - - fn write_directive<'a>( - &self, - data: Self::Data<'a>, - out: &mut dyn Write, - ) -> Result<(), CommandError>; -} - -impl DirectiveFormatter for FD -where - FD: FormatDirective, - for<'a> <::FieldType as FormatValue>::Output<'a>: Writeable + fmt::Debug, - <::FieldType as FormatValue>::E: fmt::Display, -{ - type Data<'a> = ::Data<'a>; - - fn write_directive<'a>( - &self, - data: Self::Data<'a>, - out: &mut dyn Write, - ) -> Result<(), CommandError> { - let output = self - .format_field(data) - .map_err(|e| CommandError::InvalidData(format!("error formatting field: {e}")))?; - output - .write_to(out) - .wrap_err_with(|| format!("failed to write output to stream: {output:?}")) - } -} +mod directives; +mod formats; +use directives::{ + entry::{ + CompressionMethodField, EntryNameField, FileTypeField, UncompressedSizeField, UnixModeField, + }, + DirectiveFormatter, +}; +use formats::{ByteSizeValue, CompressionMethodValue, FileTypeValue, NameString, UnixModeValue}; trait EntryDirectiveFormatter { fn write_entry_directive<'a>( @@ -492,6 +137,22 @@ impl CompiledEntryFormatter { } } +struct ArchiveWithPath { + pub path: PathBuf, + /* TODO: Debug impl for ZipArchive? what about ZipFile? 
*/ + pub archive: ZipArchive, +} + +impl ArchiveWithPath { + pub fn open(path: PathBuf) -> Result { + let f = fs::File::open(&path) + .wrap_err_with(|| format!("failed to open zip input file path {:?}", &path))?; + let archive = ZipArchive::new(f) + .wrap_err_with(|| format!("failed to create zip archive from file {:?}", &path))?; + Ok(Self { path, archive }) + } +} + pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> { let Info { format_spec, diff --git a/cli/src/info/directives.rs b/cli/src/info/directives.rs new file mode 100644 index 000000000..9b734d328 --- /dev/null +++ b/cli/src/info/directives.rs @@ -0,0 +1,164 @@ +use std::{ + fmt, + io::{self, Write}, +}; + +use super::formats::FormatValue; +use crate::{CommandError, WrapCommandErr}; + +pub trait Writeable { + fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error>; +} + +impl Writeable for S +where + S: fmt::Display, +{ + fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { + write!(out, "{}", self) + } +} + +pub trait FormatDirective { + type Data<'a>; + type FieldType: FormatValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a>; + fn value_formatter(&self) -> Self::FieldType; + + fn format_field<'a>( + &self, + data: Self::Data<'a>, + ) -> Result<::Output<'a>, ::E> + { + self.value_formatter() + .format_value(self.extract_field(data)) + } +} + +/// Wrap a [`FormatDirective`] and write it to a stream. This isn't directly type-eraseable, but it +/// removes one layer of polymorphism to enable us to do that in a subsequent wrapper trait. 
+pub trait DirectiveFormatter { + type Data<'a>; + + fn write_directive<'a>( + &self, + data: Self::Data<'a>, + out: &mut dyn Write, + ) -> Result<(), CommandError>; +} + +impl DirectiveFormatter for FD +where + FD: FormatDirective, + for<'a> <::FieldType as FormatValue>::Output<'a>: Writeable + fmt::Debug, + <::FieldType as FormatValue>::E: fmt::Display, +{ + type Data<'a> = ::Data<'a>; + + fn write_directive<'a>( + &self, + data: Self::Data<'a>, + out: &mut dyn Write, + ) -> Result<(), CommandError> { + let output = self + .format_field(data) + .map_err(|e| CommandError::InvalidData(format!("error formatting field: {e}")))?; + output + .write_to(out) + .wrap_err_with(|| format!("failed to write output to stream: {output:?}")) + } +} + +pub mod entry { + use super::{ + super::formats::{ + ByteSizeValue, CompressionMethodValue, FileTypeValue, FormatValue, NameString, + UnixModeValue, + }, + FormatDirective, + }; + use crate::extract::receiver::EntryData; + + pub struct EntryNameField(pub NameString); + + impl FormatDirective for EntryNameField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = NameString; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.name + } + fn value_formatter(&self) -> NameString { + self.0 + } + } + + pub struct FileTypeField(pub FileTypeValue); + + impl FormatDirective for FileTypeField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = FileTypeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.kind + } + fn value_formatter(&self) -> FileTypeValue { + self.0 + } + } + + pub struct CompressionMethodField(pub CompressionMethodValue); + + impl FormatDirective for CompressionMethodField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = CompressionMethodValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.compression + } + fn value_formatter(&self) -> CompressionMethodValue { + self.0 + } + } + + pub struct 
UnixModeField(pub UnixModeValue); + + impl FormatDirective for UnixModeField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = UnixModeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.unix_mode + } + fn value_formatter(&self) -> UnixModeValue { + self.0 + } + } + + pub struct UncompressedSizeField(pub ByteSizeValue); + + impl FormatDirective for UncompressedSizeField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = ByteSizeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.size + } + fn value_formatter(&self) -> ByteSizeValue { + self.0 + } + } +} diff --git a/cli/src/info/formats.rs b/cli/src/info/formats.rs new file mode 100644 index 000000000..571f384c5 --- /dev/null +++ b/cli/src/info/formats.rs @@ -0,0 +1,211 @@ +use std::{ + convert::Infallible, + fmt, + io::{self, Write}, +}; + +use zip::CompressionMethod; + +use super::directives::Writeable; +use crate::{args::info::*, extract::receiver::EntryKind}; + +pub trait FormatValue { + type Input<'a>; + type Output<'a>; + type E; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E>; +} + +#[derive(Copy, Clone)] +pub struct NameString; + +impl FormatValue for NameString { + type Input<'a> = &'a str; + type Output<'a> = &'a str; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(input) + } +} + +#[derive(Copy, Clone)] +pub struct FileTypeValue(pub FileTypeFormat); + +impl FormatValue for FileTypeValue { + type Input<'a> = EntryKind; + type Output<'a> = &'static str; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(match self.0 { + FileTypeFormat::Full => match input { + EntryKind::File => "file", + EntryKind::Dir => "directory", + EntryKind::Symlink => "symlink", + }, + FileTypeFormat::Abbreviated => match input { + EntryKind::File => "-", + EntryKind::Dir => "d", + EntryKind::Symlink => "l", + 
}, + }) + } +} + +#[derive(Copy, Clone)] +pub struct CompressionMethodValue(pub CompressionMethodFormat); + +impl FormatValue for CompressionMethodValue { + type Input<'a> = CompressionMethod; + type Output<'a> = &'static str; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(match self.0 { + CompressionMethodFormat::Full => match input { + CompressionMethod::Stored => "stored", + CompressionMethod::Deflated => "deflate", + #[cfg(feature = "deflate64")] + CompressionMethod::Deflate64 => "deflate64", + #[cfg(feature = "bzip2")] + CompressionMethod::Bzip2 => "bzip2", + #[cfg(feature = "zstd")] + CompressionMethod::Zstd => "zstd", + #[cfg(feature = "lzma")] + CompressionMethod::Lzma => "lzma", + #[cfg(feature = "xz")] + CompressionMethod::Xz => "xz", + _ => "unknown", + }, + CompressionMethodFormat::Abbreviated => match input { + CompressionMethod::Stored => "stor", + CompressionMethod::Deflated => "defl", + #[cfg(feature = "deflate64")] + CompressionMethod::Deflate64 => "df64", + #[cfg(feature = "bzip2")] + CompressionMethod::Bzip2 => "bz2", + #[cfg(feature = "zstd")] + CompressionMethod::Zstd => "zst", + #[cfg(feature = "lzma")] + CompressionMethod::Lzma => "lz", + #[cfg(feature = "xz")] + CompressionMethod::Xz => "xz", + _ => "?", + }, + }) + } +} + +#[derive(Copy, Clone)] +pub struct UnixModeValue(pub UnixModeFormat); + +impl UnixModeValue { + const S_IRUSR: u32 = 256; + const S_IWUSR: u32 = 128; + const S_IXUSR: u32 = 64; + + const S_IRGRP: u32 = 32; + const S_IWGRP: u32 = 16; + const S_IXGRP: u32 = 8; + + const S_IROTH: u32 = 4; + const S_IWOTH: u32 = 2; + const S_IXOTH: u32 = 1; + + const UNKNOWN_MODE_BITS: [u8; 9] = [b'?'; 9]; + + fn pretty_format_mode_bits(mode: u32) -> [u8; 9] { + let mut ret = [b'-'; 9]; + + if mode & Self::S_IRUSR == Self::S_IRUSR { + ret[0] = b'r'; + } + if mode & Self::S_IWUSR == Self::S_IWUSR { + ret[1] = b'w'; + } + if mode & Self::S_IXUSR == Self::S_IXUSR { + ret[2] = b'x'; + } + + 
if mode & Self::S_IRGRP == Self::S_IRGRP { + ret[3] = b'r'; + } + if mode & Self::S_IWGRP == Self::S_IWGRP { + ret[4] = b'w'; + } + if mode & Self::S_IXGRP == Self::S_IXGRP { + ret[5] = b'x'; + } + + if mode & Self::S_IROTH == Self::S_IROTH { + ret[6] = b'r'; + } + if mode & Self::S_IWOTH == Self::S_IWOTH { + ret[7] = b'w'; + } + if mode & Self::S_IXOTH == Self::S_IXOTH { + ret[8] = b'x'; + } + + ret + } +} + +#[derive(Debug)] +pub enum ModeValueWriter { + Octal(Option), + Pretty([u8; 9]), +} + +impl Writeable for ModeValueWriter { + fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { + match self { + Self::Octal(mode) => match mode { + Some(bits) => write!(out, "{:o}", bits), + None => write!(out, "?"), + }, + Self::Pretty(bits) => out.write_all(bits.as_ref()), + } + } +} + +impl FormatValue for UnixModeValue { + type Input<'a> = Option; + type Output<'a> = ModeValueWriter; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(match self.0 { + UnixModeFormat::Octal => ModeValueWriter::Octal(input), + UnixModeFormat::Pretty => ModeValueWriter::Pretty(match input { + Some(bits) => Self::pretty_format_mode_bits(bits), + None => Self::UNKNOWN_MODE_BITS, + }), + }) + } +} + +#[derive(Copy, Clone)] +pub struct ByteSizeValue(pub ByteSizeFormat); + +#[derive(Debug)] +pub enum ByteSizeWriter { + FullDecimal(u64), +} + +impl fmt::Display for ByteSizeWriter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::FullDecimal(n) => write!(f, "{}", n), + } + } +} + +impl FormatValue for ByteSizeValue { + type Input<'a> = u64; + type Output<'a> = ByteSizeWriter; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(match self.0 { + ByteSizeFormat::FullDecimal => ByteSizeWriter::FullDecimal(input), + ByteSizeFormat::HumanAbbreviated => todo!("human abbreviated byte sizes"), + }) + } +} From 3d3afb9957b092442906ce5f48e53ccce5fb5ef8 Mon Sep 17 
00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:22:43 -0400 Subject: [PATCH 47/68] refactor parseable directive --- cli/src/args/info.rs | 159 ++++++++++++++++--------------------------- cli/src/info.rs | 65 ++++++++++++------ 2 files changed, 103 insertions(+), 121 deletions(-) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 3b34b2c9e..4a2711828 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -6,7 +6,7 @@ use super::{ use std::{collections::VecDeque, ffi::OsString, fmt, path::PathBuf}; #[derive(Debug)] -struct ModifierParseError(pub String); +pub struct ModifierParseError(pub String); impl fmt::Display for ModifierParseError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -15,7 +15,7 @@ impl fmt::Display for ModifierParseError { } #[derive(Debug)] -enum DirectiveParseError { +pub enum DirectiveParseError { Modifier(String, ModifierParseError), Unrecognized(String), } @@ -34,7 +34,7 @@ impl fmt::Display for DirectiveParseError { } #[derive(Debug)] -enum FormatParseError { +pub enum FormatParseError { Directive(DirectiveParseError), Search(String), } @@ -124,8 +124,8 @@ pub enum ArchiveOverviewFormatDirective { CentralDirectoryStart(OffsetFormat), } -impl ArchiveOverviewFormatDirective { - pub fn parse(s: &str) -> Result { +impl ParseableDirective for ArchiveOverviewFormatDirective { + fn parse_directive(s: &str) -> Result { match s { "name" => Ok(Self::ArchiveName), s if s.starts_with("size") => { @@ -154,17 +154,28 @@ impl ArchiveOverviewFormatDirective { } } -trait ParseableFormat: Sized { - type Component: Sized; - const ESCAPED_PERCENT: Self::Component; - const ESCAPED_NEWLINE: Self::Component; - const ESCAPED_TAB: Self::Component; - fn make_literal(s: &str) -> Self::Component; - fn parse_directive(s: &str) -> Result; - fn from_components(components: Vec) -> Self; +#[derive(Debug)] +pub enum ParseableFormatComponent { + Directive(D), + Escaped(&'static 
str), + Literal(String), +} - fn parse_format(s: &str) -> Result { - let mut components: Vec = Vec::new(); +#[derive(Debug)] +pub struct ParseableFormatSpec { + pub components: Vec>, +} + +pub trait ParseableDirective: Sized { + fn parse_directive(s: &str) -> Result; +} + +impl ParseableFormatSpec +where + D: ParseableDirective, +{ + pub fn parse_format(s: &str) -> Result { + let mut components: Vec> = Vec::new(); let mut last_source_position: usize = 0; while let Some(pcnt_pos) = s[last_source_position..] .find('%') @@ -172,7 +183,9 @@ trait ParseableFormat: Sized { { /* Anything in between directives is a literal string. */ if pcnt_pos > last_source_position { - components.push(Self::make_literal(&s[last_source_position..pcnt_pos])); + components.push(ParseableFormatComponent::Literal( + s[last_source_position..pcnt_pos].to_string(), + )); last_source_position = pcnt_pos; } let next_pcnt = s[(pcnt_pos + 1)..] @@ -185,60 +198,30 @@ trait ParseableFormat: Sized { match directive_contents { /* An empty directive is a literal percent. */ "%%" => { - components.push(Self::ESCAPED_PERCENT); + components.push(ParseableFormatComponent::Escaped("%")); } /* A single '!' directive is a literal newline. */ "%!%" => { - components.push(Self::ESCAPED_NEWLINE); + components.push(ParseableFormatComponent::Escaped("\n")); } "%,%" => { - components.push(Self::ESCAPED_TAB); + components.push(ParseableFormatComponent::Escaped("\t")); } /* Otherwise, parse the space between percents. 
*/ d => { - let directive = Self::parse_directive(&d[1..(d.len() - 1)]) + let directive = D::parse_directive(&d[1..(d.len() - 1)]) .map_err(FormatParseError::Directive)?; - components.push(directive); + components.push(ParseableFormatComponent::Directive(directive)); } } last_source_position += directive_contents.len(); } if s.len() > last_source_position { - components.push(Self::make_literal(&s[last_source_position..])); + components.push(ParseableFormatComponent::Literal( + s[last_source_position..].to_string(), + )); } - Ok(Self::from_components(components)) - } -} - -#[derive(Debug)] -pub enum ArchiveOverviewFormatComponent { - Directive(ArchiveOverviewFormatDirective), - EscapedPercent, - EscapedNewline, - EscapedTab, - Literal(String), -} - -#[derive(Debug)] -pub struct ArchiveOverviewFormatSpec { - pub components: Vec, -} - -impl ParseableFormat for ArchiveOverviewFormatSpec { - type Component = ArchiveOverviewFormatComponent; - const ESCAPED_PERCENT: Self::Component = ArchiveOverviewFormatComponent::EscapedPercent; - const ESCAPED_NEWLINE: Self::Component = ArchiveOverviewFormatComponent::EscapedNewline; - const ESCAPED_TAB: Self::Component = ArchiveOverviewFormatComponent::EscapedTab; - fn make_literal(s: &str) -> Self::Component { - ArchiveOverviewFormatComponent::Literal(s.to_string()) - } - fn parse_directive(s: &str) -> Result { - Ok(ArchiveOverviewFormatComponent::Directive( - ArchiveOverviewFormatDirective::parse(s)?, - )) - } - fn from_components(components: Vec) -> Self { - Self { components } + Ok(Self { components }) } } @@ -362,8 +345,8 @@ pub enum EntryFormatDirective { Timestamp(TimestampFormat), } -impl EntryFormatDirective { - pub fn parse(s: &str) -> Result { +impl ParseableDirective for EntryFormatDirective { + fn parse_directive(s: &str) -> Result { match s { "name" => Ok(Self::Name), s if s.starts_with("type") => { @@ -426,46 +409,14 @@ impl EntryFormatDirective { } } -#[derive(Debug)] -pub enum EntryFormatComponent { - 
Directive(EntryFormatDirective), - EscapedPercent, - EscapedNewline, - EscapedTab, - Literal(String), -} - -#[derive(Debug)] -pub struct EntryFormatSpec { - pub components: Vec, -} - -impl ParseableFormat for EntryFormatSpec { - type Component = EntryFormatComponent; - const ESCAPED_PERCENT: Self::Component = EntryFormatComponent::EscapedPercent; - const ESCAPED_NEWLINE: Self::Component = EntryFormatComponent::EscapedNewline; - const ESCAPED_TAB: Self::Component = EntryFormatComponent::EscapedTab; - fn make_literal(s: &str) -> Self::Component { - EntryFormatComponent::Literal(s.to_string()) - } - fn parse_directive(s: &str) -> Result { - Ok(EntryFormatComponent::Directive( - EntryFormatDirective::parse(s)?, - )) - } - fn from_components(components: Vec) -> Self { - Self { components } - } -} - #[derive(Debug, Default)] pub enum FormatSpec { #[default] Compact, Extended, Custom { - overview: ArchiveOverviewFormatSpec, - entry: EntryFormatSpec, + overview: ParseableFormatSpec, + entry: ParseableFormatSpec, }, } @@ -474,17 +425,20 @@ impl FormatSpec { archive_format: String, entry_format: String, ) -> Result { - let overview = ArchiveOverviewFormatSpec::parse_format(&archive_format).map_err(|e| { - Info::exit_arg_invalid(&format!( - "failed to parse archive format string {archive_format:?}: {e}" - )) - })?; + let overview = + ParseableFormatSpec::::parse_format(&archive_format) + .map_err(|e| { + Info::exit_arg_invalid(&format!( + "failed to parse archive format string {archive_format:?}: {e}" + )) + })?; dbg!(&entry_format); - let entry = EntryFormatSpec::parse_format(&entry_format).map_err(|e| { - Info::exit_arg_invalid(&format!( - "failed to parse entry format string {entry_format:?}: {e}" - )) - })?; + let entry = ParseableFormatSpec::::parse_format(&entry_format) + .map_err(|e| { + Info::exit_arg_invalid(&format!( + "failed to parse entry format string {entry_format:?}: {e}" + )) + })?; Ok(Self::Custom { overview, entry }) } } @@ -555,6 +509,9 @@ discussed in the 
section on . %!% Prints a single literal newline '\n'. +%,% + Prints a single literal tab character '\t'. + ## Archive format directives: This is printed at the bottom of the output, after all entries are formatted. diff --git a/cli/src/info.rs b/cli/src/info.rs index eebf9926c..497b9988f 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -46,11 +46,34 @@ where } } +/* enum CompiledFormatComponent { */ +/* Directive(F), */ +/* EscapedPercent, */ +/* EscapedNewline, */ +/* EscapedTab, */ +/* Literal(String), */ +/* } */ + +/* trait CompiledFormat { */ +/* type DirectiveSpec; */ +/* type CompiledDirective; */ +/* fn compile_directive( */ +/* spec: Self::DirectiveSpec, */ +/* ) -> Result; */ + +/* type FormatSpec: ParseableFormat; */ +/* fn compile_format(spec: Self::FormatSpec) -> Result<(), CommandError>; */ + +/* fn compile_component( */ +/* spec: ::Component, */ +/* ) -> Result<(), CommandError>; */ + +/* type Data<'a>; */ +/* } */ + enum CompiledEntryFormatComponent { Directive(Box), - EscapedPercent, - EscapedNewline, - EscapedTab, + Escaped(&'static str), Literal(String), } @@ -72,15 +95,15 @@ impl CompiledEntryFormatComponent { }) } - pub fn from_spec(spec: EntryFormatComponent) -> Result { + pub fn from_spec( + spec: ParseableFormatComponent, + ) -> Result { match spec { - EntryFormatComponent::Directive(directive) => { + ParseableFormatComponent::Directive(directive) => { Ok(Self::Directive(Self::compile_directive(directive)?)) } - EntryFormatComponent::EscapedPercent => Ok(Self::EscapedPercent), - EntryFormatComponent::EscapedNewline => Ok(Self::EscapedNewline), - EntryFormatComponent::EscapedTab => Ok(Self::EscapedTab), - EntryFormatComponent::Literal(lit) => Ok(Self::Literal(lit)), + ParseableFormatComponent::Escaped(s) => Ok(Self::Escaped(s)), + ParseableFormatComponent::Literal(lit) => Ok(Self::Literal(lit)), } } @@ -91,15 +114,9 @@ impl CompiledEntryFormatComponent { ) -> Result<(), CommandError> { match self { Self::Directive(directive) => 
directive.write_entry_directive(data, &mut out), - Self::EscapedPercent => out - .write_all(b"%") - .wrap_err("failed to write escaped % to output"), - Self::EscapedNewline => out - .write_all(b"\n") - .wrap_err("failed to write escaped newline to output"), - Self::EscapedTab => out - .write_all(b"\t") - .wrap_err("failed to write escaped tab to output"), + Self::Escaped(s) => out + .write_all(s.as_bytes()) + .wrap_err_with(|| format!("failed to write escaped string {s:?} to output")), Self::Literal(lit) => out .write_all(lit.as_bytes()) .wrap_err_with(|| format!("failed to write literal {lit:?} to output")), @@ -112,8 +129,10 @@ struct CompiledEntryFormatter { } impl CompiledEntryFormatter { - pub fn from_spec(spec: EntryFormatSpec) -> Result { - let EntryFormatSpec { components } = spec; + pub fn from_spec( + spec: ParseableFormatSpec, + ) -> Result { + let ParseableFormatSpec { components } = spec; let components: Vec<_> = components .into_iter() .map(CompiledEntryFormatComponent::from_spec) @@ -183,6 +202,9 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> .wrap_err("error reading zip entry from stdin")? { let data = EntryData::from_entry(&entry); + if matcher.as_ref().is_some_and(|m| !m.matches(&data)) { + continue; + } entry_formatter.write_entry(data, &mut output_stream)?; } } @@ -207,6 +229,9 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> .by_index(i) .wrap_err_with(|| format!("failed to read entry {i} from zip at {p:?}"))?; let data = EntryData::from_entry(&entry); + if matcher.as_ref().is_some_and(|m| !m.matches(&data)) { + continue; + } entry_formatter.write_entry(data, &mut output_stream)?; } } From cad8dce1a4e411f498367a5aa070a79172e7e637 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:55:05 -0400 Subject: [PATCH 48/68] make compiled format strings much more generic (!) 
--- cli/src/info.rs | 202 +++++++++++++++++++++++++++--------------------- 1 file changed, 116 insertions(+), 86 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index 497b9988f..8aac660e4 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -1,6 +1,7 @@ use std::{ fs, io::{self, Write}, + marker::PhantomData, path::PathBuf, }; @@ -46,116 +47,142 @@ where } } -/* enum CompiledFormatComponent { */ -/* Directive(F), */ -/* EscapedPercent, */ -/* EscapedNewline, */ -/* EscapedTab, */ -/* Literal(String), */ -/* } */ - -/* trait CompiledFormat { */ -/* type DirectiveSpec; */ -/* type CompiledDirective; */ -/* fn compile_directive( */ -/* spec: Self::DirectiveSpec, */ -/* ) -> Result; */ - -/* type FormatSpec: ParseableFormat; */ -/* fn compile_format(spec: Self::FormatSpec) -> Result<(), CommandError>; */ - -/* fn compile_component( */ -/* spec: ::Component, */ -/* ) -> Result<(), CommandError>; */ - -/* type Data<'a>; */ -/* } */ - -enum CompiledEntryFormatComponent { - Directive(Box), - Escaped(&'static str), - Literal(String), +enum CompiledFormatComponent { + Directive(F), + ContiguousLiteral(String), } -impl CompiledEntryFormatComponent { - fn compile_directive( - spec: EntryFormatDirective, - ) -> Result, CommandError> { - Ok(match spec { - EntryFormatDirective::Name => Box::new(EntryNameField(NameString)), - EntryFormatDirective::FileType(f) => Box::new(FileTypeField(FileTypeValue(f))), - EntryFormatDirective::UncompressedSize(f) => { - Box::new(UncompressedSizeField(ByteSizeValue(f))) - } - EntryFormatDirective::UnixMode(f) => Box::new(UnixModeField(UnixModeValue(f))), - EntryFormatDirective::CompressionMethod(f) => { - Box::new(CompressionMethodField(CompressionMethodValue(f))) - } - _ => todo!(), - }) - } - - pub fn from_spec( - spec: ParseableFormatComponent, - ) -> Result { - match spec { - ParseableFormatComponent::Directive(directive) => { - Ok(Self::Directive(Self::compile_directive(directive)?)) - } - ParseableFormatComponent::Escaped(s) => 
Ok(Self::Escaped(s)), - ParseableFormatComponent::Literal(lit) => Ok(Self::Literal(lit)), - } - } - +impl CompiledFormatComponent +where + F: DirectiveFormatter, +{ pub fn write_component<'a>( &self, - data: &EntryData<'a>, + data: ::Data<'a>, mut out: impl Write, ) -> Result<(), CommandError> { match self { - Self::Directive(directive) => directive.write_entry_directive(data, &mut out), - Self::Escaped(s) => out - .write_all(s.as_bytes()) - .wrap_err_with(|| format!("failed to write escaped string {s:?} to output")), - Self::Literal(lit) => out + Self::Directive(d) => d.write_directive(data, &mut out), + Self::ContiguousLiteral(lit) => out .write_all(lit.as_bytes()) .wrap_err_with(|| format!("failed to write literal {lit:?} to output")), } } } -struct CompiledEntryFormatter { - components: Vec, +struct CompiledFormatSpec { + pub components: Vec>, } -impl CompiledEntryFormatter { - pub fn from_spec( - spec: ParseableFormatSpec, - ) -> Result { - let ParseableFormatSpec { components } = spec; - let components: Vec<_> = components - .into_iter() - .map(CompiledEntryFormatComponent::from_spec) - .collect::>()?; - Ok(Self { components }) - } - +impl CompiledFormatSpec { pub fn is_empty(&self) -> bool { self.components.is_empty() } +} + +impl CompiledFormatSpec +where + F: DirectiveFormatter, +{ + pub fn from_spec( + spec: ParseableFormatSpec<::Spec>, + ) -> Result + where + CF: CompiledFormat, + { + let ParseableFormatSpec { + components: spec_components, + } = spec; + + let mut components: Vec> = Vec::new(); + for c in spec_components.into_iter() { + match c { + ParseableFormatComponent::Directive(d) => { + let d = CF::from_directive_spec(d)?; + components.push(CompiledFormatComponent::Directive(d)); + } + ParseableFormatComponent::Escaped(s) => match components.last_mut() { + Some(CompiledFormatComponent::ContiguousLiteral(ref mut last_lit)) => { + last_lit.push_str(s); + } + _ => { + components.push(CompiledFormatComponent::ContiguousLiteral(s.to_string())); + } + 
}, + ParseableFormatComponent::Literal(new_lit) => match components.last_mut() { + Some(CompiledFormatComponent::ContiguousLiteral(ref mut last_lit)) => { + last_lit.push_str(new_lit.as_str()); + } + _ => { + components.push(CompiledFormatComponent::ContiguousLiteral(new_lit)); + } + }, + } + } + + Ok(Self { components }) + } - pub fn write_entry<'a>( + pub fn execute_format<'a>( &self, - data: EntryData<'a>, + data: ::Data<'a>, mut out: impl Write, - ) -> Result<(), CommandError> { + ) -> Result<(), CommandError> + where + ::Data<'a>: Clone, + { for c in self.components.iter() { - c.write_component(&data, &mut out)?; + c.write_component(data.clone(), &mut out)? } Ok(()) } } +struct CompiledEntryDirective(Box); + +impl DirectiveFormatter for CompiledEntryDirective { + type Data<'a> = EntryData<'a>; + + fn write_directive<'a>( + &self, + data: Self::Data<'a>, + out: &mut dyn Write, + ) -> Result<(), CommandError> { + self.0.write_entry_directive(&data, out) + } +} + +trait CompiledFormat { + type Spec: ParseableDirective; + type Fmt: DirectiveFormatter; + + fn from_directive_spec(spec: Self::Spec) -> Result; +} + +struct CompiledEntryFormat; + +impl CompiledFormat for CompiledEntryFormat { + type Spec = EntryFormatDirective; + type Fmt = CompiledEntryDirective; + + fn from_directive_spec( + spec: EntryFormatDirective, + ) -> Result { + Ok(CompiledEntryDirective(match spec { + EntryFormatDirective::Name => Box::new(EntryNameField(NameString)), + EntryFormatDirective::FileType(f) => Box::new(FileTypeField(FileTypeValue(f))), + EntryFormatDirective::UncompressedSize(f) => { + Box::new(UncompressedSizeField(ByteSizeValue(f))) + } + EntryFormatDirective::UnixMode(f) => Box::new(UnixModeField(UnixModeValue(f))), + EntryFormatDirective::CompressionMethod(f) => { + Box::new(CompressionMethodField(CompressionMethodValue(f))) + } + _ => todo!(), + })) + } +} + struct ArchiveWithPath { pub path: PathBuf, /* TODO: Debug impl for ZipArchive? what about ZipFile? 
*/ @@ -189,7 +216,10 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> let (archive_formatter, entry_formatter) = match format_spec { FormatSpec::Compact => todo!(), FormatSpec::Extended => todo!(), - FormatSpec::Custom { overview, entry } => ((), CompiledEntryFormatter::from_spec(entry)?), + FormatSpec::Custom { overview, entry } => ( + (), + CompiledFormatSpec::from_spec::(entry)?, + ), }; let mut output_stream = io::stdout().lock(); @@ -205,7 +235,7 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> if matcher.as_ref().is_some_and(|m| !m.matches(&data)) { continue; } - entry_formatter.write_entry(data, &mut output_stream)?; + entry_formatter.execute_format(data, &mut output_stream)?; } } writeln!( @@ -232,7 +262,7 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> if matcher.as_ref().is_some_and(|m| !m.matches(&data)) { continue; } - entry_formatter.write_entry(data, &mut output_stream)?; + entry_formatter.execute_format(data, &mut output_stream)?; } } } From c5f5ebbcd3c13a9128209bcda76f03863e4c26c1 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:11:01 -0400 Subject: [PATCH 49/68] refactor modules of compiled formatting --- cli/src/info.rs | 166 +--------------------------------- cli/src/info/directives.rs | 178 ++++++++++++++++++++++++++++++++++++- 2 files changed, 177 insertions(+), 167 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index 8aac660e4..aeacc8d28 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -1,7 +1,6 @@ use std::{ fs, io::{self, Write}, - marker::PhantomData, path::PathBuf, }; @@ -18,170 +17,7 @@ use crate::{ mod directives; mod formats; -use directives::{ - entry::{ - CompressionMethodField, EntryNameField, FileTypeField, UncompressedSizeField, UnixModeField, - }, - DirectiveFormatter, -}; -use formats::{ByteSizeValue, CompressionMethodValue, 
FileTypeValue, NameString, UnixModeValue}; - -trait EntryDirectiveFormatter { - fn write_entry_directive<'a>( - &self, - data: &EntryData<'a>, - out: &mut dyn Write, - ) -> Result<(), CommandError>; -} - -impl EntryDirectiveFormatter for CF -where - CF: for<'a> DirectiveFormatter = &'a EntryData<'a>>, -{ - fn write_entry_directive<'a>( - &self, - data: &EntryData<'a>, - out: &mut dyn Write, - ) -> Result<(), CommandError> { - self.write_directive(data, out) - } -} - -enum CompiledFormatComponent { - Directive(F), - ContiguousLiteral(String), -} - -impl CompiledFormatComponent -where - F: DirectiveFormatter, -{ - pub fn write_component<'a>( - &self, - data: ::Data<'a>, - mut out: impl Write, - ) -> Result<(), CommandError> { - match self { - Self::Directive(d) => d.write_directive(data, &mut out), - Self::ContiguousLiteral(lit) => out - .write_all(lit.as_bytes()) - .wrap_err_with(|| format!("failed to write literal {lit:?} to output")), - } - } -} - -struct CompiledFormatSpec { - pub components: Vec>, -} - -impl CompiledFormatSpec { - pub fn is_empty(&self) -> bool { - self.components.is_empty() - } -} - -impl CompiledFormatSpec -where - F: DirectiveFormatter, -{ - pub fn from_spec( - spec: ParseableFormatSpec<::Spec>, - ) -> Result - where - CF: CompiledFormat, - { - let ParseableFormatSpec { - components: spec_components, - } = spec; - - let mut components: Vec> = Vec::new(); - for c in spec_components.into_iter() { - match c { - ParseableFormatComponent::Directive(d) => { - let d = CF::from_directive_spec(d)?; - components.push(CompiledFormatComponent::Directive(d)); - } - ParseableFormatComponent::Escaped(s) => match components.last_mut() { - Some(CompiledFormatComponent::ContiguousLiteral(ref mut last_lit)) => { - last_lit.push_str(s); - } - _ => { - components.push(CompiledFormatComponent::ContiguousLiteral(s.to_string())); - } - }, - ParseableFormatComponent::Literal(new_lit) => match components.last_mut() { - 
Some(CompiledFormatComponent::ContiguousLiteral(ref mut last_lit)) => { - last_lit.push_str(new_lit.as_str()); - } - _ => { - components.push(CompiledFormatComponent::ContiguousLiteral(new_lit)); - } - }, - } - } - - Ok(Self { components }) - } - - pub fn execute_format<'a>( - &self, - data: ::Data<'a>, - mut out: impl Write, - ) -> Result<(), CommandError> - where - ::Data<'a>: Clone, - { - for c in self.components.iter() { - c.write_component(data.clone(), &mut out)? - } - Ok(()) - } -} - -struct CompiledEntryDirective(Box); - -impl DirectiveFormatter for CompiledEntryDirective { - type Data<'a> = EntryData<'a>; - - fn write_directive<'a>( - &self, - data: Self::Data<'a>, - out: &mut dyn Write, - ) -> Result<(), CommandError> { - self.0.write_entry_directive(&data, out) - } -} - -trait CompiledFormat { - type Spec: ParseableDirective; - type Fmt: DirectiveFormatter; - - fn from_directive_spec(spec: Self::Spec) -> Result; -} - -struct CompiledEntryFormat; - -impl CompiledFormat for CompiledEntryFormat { - type Spec = EntryFormatDirective; - type Fmt = CompiledEntryDirective; - - fn from_directive_spec( - spec: EntryFormatDirective, - ) -> Result { - Ok(CompiledEntryDirective(match spec { - EntryFormatDirective::Name => Box::new(EntryNameField(NameString)), - EntryFormatDirective::FileType(f) => Box::new(FileTypeField(FileTypeValue(f))), - EntryFormatDirective::UncompressedSize(f) => { - Box::new(UncompressedSizeField(ByteSizeValue(f))) - } - EntryFormatDirective::UnixMode(f) => Box::new(UnixModeField(UnixModeValue(f))), - EntryFormatDirective::CompressionMethod(f) => { - Box::new(CompressionMethodField(CompressionMethodValue(f))) - } - _ => todo!(), - })) - } -} +use directives::{compiled::CompiledFormatSpec, entry::compiled::CompiledEntryFormat}; struct ArchiveWithPath { pub path: PathBuf, diff --git a/cli/src/info/directives.rs b/cli/src/info/directives.rs index 9b734d328..80a2c7782 100644 --- a/cli/src/info/directives.rs +++ b/cli/src/info/directives.rs @@ -4,7 
+4,10 @@ use std::{ }; use super::formats::FormatValue; -use crate::{CommandError, WrapCommandErr}; +use crate::{ + args::info::{ParseableDirective, ParseableFormatComponent, ParseableFormatSpec}, + CommandError, WrapCommandErr, +}; pub trait Writeable { fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error>; @@ -72,13 +75,116 @@ where } } +pub mod compiled { + use super::*; + + enum CompiledFormatComponent { + Directive(F), + ContiguousLiteral(String), + } + + impl CompiledFormatComponent + where + F: DirectiveFormatter, + { + pub fn write_component<'a>( + &self, + data: ::Data<'a>, + mut out: impl Write, + ) -> Result<(), CommandError> { + match self { + Self::Directive(d) => d.write_directive(data, &mut out), + Self::ContiguousLiteral(lit) => out + .write_all(lit.as_bytes()) + .wrap_err_with(|| format!("failed to write literal {lit:?} to output")), + } + } + } + + pub trait CompiledFormat { + type Spec: ParseableDirective; + type Fmt: DirectiveFormatter; + + fn from_directive_spec(spec: Self::Spec) -> Result; + } + + pub struct CompiledFormatSpec { + pub components: Vec>, + } + + impl CompiledFormatSpec { + pub fn is_empty(&self) -> bool { + self.components.is_empty() + } + } + + impl CompiledFormatSpec + where + F: DirectiveFormatter, + { + pub fn from_spec( + spec: ParseableFormatSpec<::Spec>, + ) -> Result + where + CF: CompiledFormat, + { + let ParseableFormatSpec { + components: spec_components, + } = spec; + + let mut components: Vec> = Vec::new(); + for c in spec_components.into_iter() { + match c { + ParseableFormatComponent::Directive(d) => { + let d = CF::from_directive_spec(d)?; + components.push(CompiledFormatComponent::Directive(d)); + } + ParseableFormatComponent::Escaped(s) => match components.last_mut() { + Some(CompiledFormatComponent::ContiguousLiteral(ref mut last_lit)) => { + last_lit.push_str(s); + } + _ => { + components + .push(CompiledFormatComponent::ContiguousLiteral(s.to_string())); + } + }, + 
ParseableFormatComponent::Literal(new_lit) => match components.last_mut() { + Some(CompiledFormatComponent::ContiguousLiteral(ref mut last_lit)) => { + last_lit.push_str(new_lit.as_str()); + } + _ => { + components.push(CompiledFormatComponent::ContiguousLiteral(new_lit)); + } + }, + } + } + + Ok(Self { components }) + } + + pub fn execute_format<'a>( + &self, + data: ::Data<'a>, + mut out: impl Write, + ) -> Result<(), CommandError> + where + ::Data<'a>: Clone, + { + for c in self.components.iter() { + c.write_component(data.clone(), &mut out)? + } + Ok(()) + } + } +} + pub mod entry { use super::{ super::formats::{ ByteSizeValue, CompressionMethodValue, FileTypeValue, FormatValue, NameString, UnixModeValue, }, - FormatDirective, + DirectiveFormatter, FormatDirective, }; use crate::extract::receiver::EntryData; @@ -161,4 +267,72 @@ pub mod entry { self.0 } } + + pub mod compiled { + use super::{super::compiled::CompiledFormat, *}; + use crate::{args::info::EntryFormatDirective, CommandError}; + + use std::io::Write; + + /// Used for type erasure by removing the lifetime-bounded associated type. + trait EntryDirectiveFormatter { + fn write_entry_directive<'a>( + &self, + data: &EntryData<'a>, + out: &mut dyn Write, + ) -> Result<(), CommandError>; + } + + impl EntryDirectiveFormatter for CF + where + CF: for<'a> DirectiveFormatter = &'a EntryData<'a>>, + { + fn write_entry_directive<'a>( + &self, + data: &EntryData<'a>, + out: &mut dyn Write, + ) -> Result<(), CommandError> { + self.write_directive(data, out) + } + } + + /// This re-implements the generic trait using the type-erased boxed vtable. 
+ pub struct CompiledEntryDirective(Box); + + impl DirectiveFormatter for CompiledEntryDirective { + type Data<'a> = EntryData<'a>; + + fn write_directive<'a>( + &self, + data: Self::Data<'a>, + out: &mut dyn Write, + ) -> Result<(), CommandError> { + self.0.write_entry_directive(&data, out) + } + } + + pub struct CompiledEntryFormat; + + impl CompiledFormat for CompiledEntryFormat { + type Spec = EntryFormatDirective; + type Fmt = CompiledEntryDirective; + + fn from_directive_spec( + spec: EntryFormatDirective, + ) -> Result { + Ok(CompiledEntryDirective(match spec { + EntryFormatDirective::Name => Box::new(EntryNameField(NameString)), + EntryFormatDirective::FileType(f) => Box::new(FileTypeField(FileTypeValue(f))), + EntryFormatDirective::UncompressedSize(f) => { + Box::new(UncompressedSizeField(ByteSizeValue(f))) + } + EntryFormatDirective::UnixMode(f) => Box::new(UnixModeField(UnixModeValue(f))), + EntryFormatDirective::CompressionMethod(f) => { + Box::new(CompressionMethodField(CompressionMethodValue(f))) + } + _ => todo!(), + })) + } + } + } } From 6a29b56543a6b5f11cd24d36414c104ec52344ab Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:08:05 -0400 Subject: [PATCH 50/68] archive format works!!!!! 
omg --- cli/src/info.rs | 26 ++++-- cli/src/info/directives.rs | 174 ++++++++++++++++++++++++++++++++++++- cli/src/info/formats.rs | 55 ++++++++++++ 3 files changed, 248 insertions(+), 7 deletions(-) diff --git a/cli/src/info.rs b/cli/src/info.rs index aeacc8d28..00ef495c7 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -17,11 +17,14 @@ use crate::{ mod directives; mod formats; -use directives::{compiled::CompiledFormatSpec, entry::compiled::CompiledEntryFormat}; +use directives::{ + archive::compiled::CompiledArchiveFormat, compiled::CompiledFormatSpec, + entry::compiled::CompiledEntryFormat, +}; -struct ArchiveWithPath { +pub struct ArchiveWithPath { pub path: PathBuf, - /* TODO: Debug impl for ZipArchive? what about ZipFile? */ + pub len: u64, pub archive: ZipArchive, } @@ -29,9 +32,13 @@ impl ArchiveWithPath { pub fn open(path: PathBuf) -> Result { let f = fs::File::open(&path) .wrap_err_with(|| format!("failed to open zip input file path {:?}", &path))?; + let len = f + .metadata() + .wrap_err("failed to extract file metadata")? 
+ .len(); let archive = ZipArchive::new(f) .wrap_err_with(|| format!("failed to create zip archive from file {:?}", &path))?; - Ok(Self { path, archive }) + Ok(Self { path, len, archive }) } } @@ -53,7 +60,7 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> FormatSpec::Compact => todo!(), FormatSpec::Extended => todo!(), FormatSpec::Custom { overview, entry } => ( - (), + CompiledFormatSpec::from_spec::(overview)?, CompiledFormatSpec::from_spec::(entry)?, ), }; @@ -101,6 +108,15 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> entry_formatter.execute_format(data, &mut output_stream)?; } } + if archive_formatter.is_empty() { + writeln!( + &mut err, + "empty archive format, skipping archive overview for file {p:?}" + ) + .unwrap(); + } else { + archive_formatter.execute_format(&zip, &mut output_stream)?; + } } Ok(()) diff --git a/cli/src/info/directives.rs b/cli/src/info/directives.rs index 80a2c7782..ac11ca8a0 100644 --- a/cli/src/info/directives.rs +++ b/cli/src/info/directives.rs @@ -184,7 +184,7 @@ pub mod entry { ByteSizeValue, CompressionMethodValue, FileTypeValue, FormatValue, NameString, UnixModeValue, }, - DirectiveFormatter, FormatDirective, + FormatDirective, }; use crate::extract::receiver::EntryData; @@ -269,7 +269,10 @@ pub mod entry { } pub mod compiled { - use super::{super::compiled::CompiledFormat, *}; + use super::{ + super::{compiled::CompiledFormat, DirectiveFormatter}, + *, + }; use crate::{args::info::EntryFormatDirective, CommandError}; use std::io::Write; @@ -336,3 +339,170 @@ pub mod entry { } } } + +pub mod archive { + use super::{ + super::{ + formats::{ByteSizeValue, DecimalNumberValue, FormatValue, OffsetValue, PathString}, + ArchiveWithPath, + }, + FormatDirective, + }; + + pub struct ArchiveNameField(pub PathString); + + impl FormatDirective for ArchiveNameField { + type Data<'a> = &'a ArchiveWithPath; + type FieldType = PathString; + fn extract_field<'a>( + &self, + 
data: Self::Data<'a>, + ) -> ::Input<'a> { + data.path.as_path() + } + fn value_formatter(&self) -> PathString { + self.0 + } + } + + pub struct ArchiveSizeField(pub ByteSizeValue); + + impl FormatDirective for ArchiveSizeField { + type Data<'a> = &'a ArchiveWithPath; + type FieldType = ByteSizeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.len + } + fn value_formatter(&self) -> ByteSizeValue { + self.0 + } + } + + pub struct NumEntriesField(pub DecimalNumberValue); + + impl FormatDirective for NumEntriesField { + type Data<'a> = &'a ArchiveWithPath; + type FieldType = DecimalNumberValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.archive.len().try_into().unwrap() + } + fn value_formatter(&self) -> DecimalNumberValue { + self.0 + } + } + + pub struct FirstEntryStartField(pub OffsetValue); + + impl FormatDirective for FirstEntryStartField { + type Data<'a> = &'a ArchiveWithPath; + type FieldType = OffsetValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.archive.offset() + } + fn value_formatter(&self) -> OffsetValue { + self.0 + } + } + + pub struct CentralDirectoryStartField(pub OffsetValue); + + impl FormatDirective for CentralDirectoryStartField { + type Data<'a> = &'a ArchiveWithPath; + type FieldType = OffsetValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.archive.central_directory_start() + } + fn value_formatter(&self) -> OffsetValue { + self.0 + } + } + + pub mod compiled { + use super::{ + super::{compiled::CompiledFormat, DirectiveFormatter}, + *, + }; + use crate::{args::info::ArchiveOverviewFormatDirective, CommandError}; + + use std::io::Write; + + trait ArchiveDirectiveFormatter { + fn write_archive_directive<'a>( + &self, + data: &'a ArchiveWithPath, + out: &mut dyn Write, + ) -> Result<(), CommandError>; + } + + impl ArchiveDirectiveFormatter for CF + where + CF: for<'a> 
DirectiveFormatter = &'a ArchiveWithPath>, + { + fn write_archive_directive<'a>( + &self, + data: &'a ArchiveWithPath, + out: &mut dyn Write, + ) -> Result<(), CommandError> { + self.write_directive(data, out) + } + } + + pub struct CompiledArchiveDirective(Box); + + impl DirectiveFormatter for CompiledArchiveDirective { + type Data<'a> = &'a ArchiveWithPath; + + fn write_directive<'a>( + &self, + data: Self::Data<'a>, + out: &mut dyn Write, + ) -> Result<(), CommandError> { + self.0.write_archive_directive(data, out) + } + } + + pub struct CompiledArchiveFormat; + + impl CompiledFormat for CompiledArchiveFormat { + type Spec = ArchiveOverviewFormatDirective; + type Fmt = CompiledArchiveDirective; + + fn from_directive_spec( + spec: ArchiveOverviewFormatDirective, + ) -> Result { + Ok(CompiledArchiveDirective(match spec { + ArchiveOverviewFormatDirective::ArchiveName => { + Box::new(ArchiveNameField(PathString)) + } + ArchiveOverviewFormatDirective::TotalSize(f) => { + Box::new(ArchiveSizeField(ByteSizeValue(f))) + } + ArchiveOverviewFormatDirective::NumEntries => { + Box::new(NumEntriesField(DecimalNumberValue)) + } + ArchiveOverviewFormatDirective::ArchiveComment(x) => { + todo!("comment not supported yet: {:?}", x) + } + ArchiveOverviewFormatDirective::FirstEntryStart(f) => { + Box::new(FirstEntryStartField(OffsetValue(f))) + } + ArchiveOverviewFormatDirective::CentralDirectoryStart(f) => { + Box::new(CentralDirectoryStartField(OffsetValue(f))) + } + })) + } + } + } +} diff --git a/cli/src/info/formats.rs b/cli/src/info/formats.rs index 571f384c5..4ff5384cf 100644 --- a/cli/src/info/formats.rs +++ b/cli/src/info/formats.rs @@ -2,6 +2,7 @@ use std::{ convert::Infallible, fmt, io::{self, Write}, + path, }; use zip::CompressionMethod; @@ -28,6 +29,18 @@ impl FormatValue for NameString { } } +#[derive(Copy, Clone)] +pub struct PathString; + +impl FormatValue for PathString { + type Input<'a> = &'a path::Path; + type Output<'a> = path::Display<'a>; + type E = 
Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(input.display()) + } +} + #[derive(Copy, Clone)] pub struct FileTypeValue(pub FileTypeFormat); @@ -209,3 +222,45 @@ impl FormatValue for ByteSizeValue { }) } } + +#[derive(Copy, Clone)] +pub struct DecimalNumberValue; + +impl FormatValue for DecimalNumberValue { + type Input<'a> = u64; + type Output<'a> = u64; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(input) + } +} + +#[derive(Copy, Clone)] +pub struct OffsetValue(pub OffsetFormat); + +#[derive(Debug)] +pub enum OffsetWriter { + Decimal(u64), + Hexadecimal(u64), +} + +impl fmt::Display for OffsetWriter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Decimal(x) => write!(f, "{}", x), + Self::Hexadecimal(x) => write!(f, "{:x}", x), + } + } +} + +impl FormatValue for OffsetValue { + type Input<'a> = u64; + type Output<'a> = OffsetWriter; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + Ok(match self.0 { + OffsetFormat::Decimal => OffsetWriter::Decimal(input), + OffsetFormat::Hexadecimal => OffsetWriter::Hexadecimal(input), + }) + } +} From f08b59f1ad383b935d6cca8eeb04073d54c4fbe5 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 21:42:28 -0400 Subject: [PATCH 51/68] move entry and archive iteration into helper methods --- cli/src/extract/entries.rs | 35 +++++++---- cli/src/info.rs | 119 +++++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 56 deletions(-) diff --git a/cli/src/extract/entries.rs b/cli/src/extract/entries.rs index dcf45a6bf..0ad374254 100644 --- a/cli/src/extract/entries.rs +++ b/cli/src/extract/entries.rs @@ -3,6 +3,7 @@ use std::{ collections::VecDeque, fs, io::{self}, + ops, path::Path, }; @@ -17,7 +18,7 @@ pub trait IterateEntries { fn next_entry(&mut self) -> Result, 
CommandError>; } -struct StdinInput { +pub struct StdinInput { inner: io::Stdin, } @@ -25,6 +26,10 @@ impl StdinInput { pub fn new() -> Self { Self { inner: io::stdin() } } + + pub fn into_inner(self) -> io::Stdin { + self.inner + } } impl IterateEntries for StdinInput { @@ -34,19 +39,24 @@ impl IterateEntries for StdinInput { } #[derive(Debug)] -struct ZipFileInput { - inner: ZipArchive, +pub struct ZipFileInput { + inner: A, file_counter: usize, } -impl ZipFileInput { - pub fn new(inner: ZipArchive) -> Self { +impl ZipFileInput { + pub fn new(inner: A) -> Self { Self { - inner: inner, + inner, file_counter: 0, } } +} +impl ZipFileInput +where + A: ops::Deref>, +{ pub fn remaining(&self) -> usize { self.inner.len() - self.file_counter } @@ -56,7 +66,10 @@ impl ZipFileInput { } } -impl IterateEntries for ZipFileInput { +impl IterateEntries for ZipFileInput +where + A: ops::DerefMut>, +{ fn next_entry(&mut self) -> Result, CommandError> { if self.none_left() { return Ok(None); @@ -70,9 +83,9 @@ impl IterateEntries for ZipFileInput { } } -struct AllInputZips { - zips_todo: VecDeque, - cur_zip: UnsafeCell, +pub struct AllInputZips { + zips_todo: VecDeque>>>, + cur_zip: UnsafeCell>>>, } impl AllInputZips { @@ -91,7 +104,7 @@ impl AllInputZips { format!("failed to create zip archive for file {:?}", p.as_ref()) }) }) - .map(ZipFileInput::new) + .map(|zip| ZipFileInput::new(Box::new(zip))) }) .collect::, CommandError>>()?; debug_assert!(!zips_todo.is_empty()); diff --git a/cli/src/info.rs b/cli/src/info.rs index 00ef495c7..eafd97edb 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -1,14 +1,16 @@ use std::{ fs, io::{self, Write}, + ops, path::PathBuf, }; -use zip::read::{read_zipfile_from_stream, ZipArchive}; +use zip::read::{read_zipfile_from_stream, ZipArchive, ZipFile}; use crate::{ args::{extract::InputSpec, info::*}, extract::{ + entries::{IterateEntries, StdinInput, ZipFileInput}, matcher::{CompiledMatcher, EntryMatcher}, receiver::EntryData, }, @@ -18,8 +20,9 
@@ use crate::{ mod directives; mod formats; use directives::{ - archive::compiled::CompiledArchiveFormat, compiled::CompiledFormatSpec, - entry::compiled::CompiledEntryFormat, + archive::compiled::{CompiledArchiveDirective, CompiledArchiveFormat}, + compiled::CompiledFormatSpec, + entry::compiled::{CompiledEntryDirective, CompiledEntryFormat}, }; pub struct ArchiveWithPath { @@ -42,6 +45,48 @@ impl ArchiveWithPath { } } +fn format_entry_info( + mut err: impl Write, + entry_formatter: &CompiledFormatSpec, + matcher: Option<&CompiledMatcher>, + mut output_stream: impl Write, + source: &mut impl IterateEntries, +) -> Result<(), CommandError> { + if entry_formatter.is_empty() { + writeln!( + &mut err, + "empty entry format, skipping reading from any entries" + ) + .unwrap(); + return Ok(()); + } + + while let Some(entry) = source.next_entry()? { + let data = EntryData::from_entry(&entry); + if matcher.as_ref().is_some_and(|m| !m.matches(&data)) { + writeln!(&mut err, "matcher ignored entry: {:?}", data.name).unwrap(); + continue; + } + entry_formatter.execute_format(data, &mut output_stream)?; + } + Ok(()) +} + +fn format_archive_info( + mut err: impl Write, + archive_formatter: &CompiledFormatSpec, + mut output_stream: impl Write, + zip: &ArchiveWithPath, +) -> Result<(), CommandError> { + if archive_formatter.is_empty() { + writeln!(&mut err, "empty archive format, skipping archive overview").unwrap(); + return Ok(()); + } + + archive_formatter.execute_format(&zip, &mut output_stream)?; + Ok(()) +} + pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> { let Info { format_spec, @@ -67,56 +112,40 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> let mut output_stream = io::stdout().lock(); if stdin_stream { - let mut stdin = io::stdin().lock(); - if entry_formatter.is_empty() { - writeln!(&mut err, "empty entry format, skipping stdin entries").unwrap(); - } else { - while let Some(entry) = 
read_zipfile_from_stream(&mut stdin) - .wrap_err("error reading zip entry from stdin")? - { - let data = EntryData::from_entry(&entry); - if matcher.as_ref().is_some_and(|m| !m.matches(&data)) { - continue; - } - entry_formatter.execute_format(data, &mut output_stream)?; - } - } - writeln!( + let mut stdin = StdinInput::new(); + + format_entry_info( &mut err, - "stdin currently cannot provide archive format info" - ) - .unwrap(); - } - for p in zip_paths.into_iter() { - let mut zip = ArchiveWithPath::open(p.clone())?; - if entry_formatter.is_empty() { + &entry_formatter, + matcher.as_ref(), + &mut output_stream, + &mut stdin, + )?; + + if !archive_formatter.is_empty() { writeln!( &mut err, - "empty entry format, skipping entries for file {p:?}" + "archive format was provided but stdin currently cannot provide archive format info" ) .unwrap(); - } else { - for i in 0..zip.archive.len() { - let entry = zip - .archive - .by_index(i) - .wrap_err_with(|| format!("failed to read entry {i} from zip at {p:?}"))?; - let data = EntryData::from_entry(&entry); - if matcher.as_ref().is_some_and(|m| !m.matches(&data)) { - continue; - } - entry_formatter.execute_format(data, &mut output_stream)?; - } } - if archive_formatter.is_empty() { - writeln!( + } + + for p in zip_paths.into_iter() { + let mut zip = ArchiveWithPath::open(p.clone())?; + + { + let mut zip_entry_counter = ZipFileInput::new(&mut zip.archive); + format_entry_info( &mut err, - "empty archive format, skipping archive overview for file {p:?}" - ) - .unwrap(); - } else { - archive_formatter.execute_format(&zip, &mut output_stream)?; + &entry_formatter, + matcher.as_ref(), + &mut output_stream, + &mut zip_entry_counter, + )?; } + + format_archive_info(&mut err, &archive_formatter, &mut output_stream, &zip)?; } Ok(()) From 8ecc1b1ba9722100938582dda66d9a6f89351acb Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 23:10:01 -0400 Subject: [PATCH 
52/68] make archive formatting work fully with ArchiveData --- cli/src/args/info.rs | 9 ++-- cli/src/info.rs | 15 ++++--- cli/src/info/directives.rs | 86 +++++++++++++++++++++++++++++-------- cli/src/info/formats.rs | 87 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 165 insertions(+), 32 deletions(-) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 4a2711828..5af76436d 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -96,7 +96,7 @@ impl OffsetFormat { pub enum BinaryStringFormat { #[default] PrintAsString, - EscapeBinary, + EscapeAscii, WriteBinaryContents, } @@ -105,7 +105,7 @@ impl BinaryStringFormat { match s { "" => Ok(Self::default()), ":print" => Ok(Self::PrintAsString), - ":escape" => Ok(Self::EscapeBinary), + ":escape" => Ok(Self::EscapeAscii), ":write" => Ok(Self::WriteBinaryContents), _ => Err(ModifierParseError(format!( "unrecognized string format: {s:?}" @@ -599,8 +599,9 @@ offset = '' [DEFAULT => hex] = ':hex' (hexadecimal numeric representation) bin-str = '' [DEFAULT => print] - = ':print' (print string, erroring upon invalid unicode) - = ':escape' (surround with "" and escape non-unicode characters) + = ':print' (non-unicode chunks are replaced with + the unicode replacement character '�') + = ':escape' (surround with "" and escape each byte as ascii) = ':write' (write string to output without checking for unicode) unix-mode = '' [DEFAULT => octal] diff --git a/cli/src/info.rs b/cli/src/info.rs index eafd97edb..13c3d7241 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -1,11 +1,10 @@ use std::{ fs, io::{self, Write}, - ops, path::PathBuf, }; -use zip::read::{read_zipfile_from_stream, ZipArchive, ZipFile}; +use zip::read::ZipArchive; use crate::{ args::{extract::InputSpec, info::*}, @@ -20,7 +19,10 @@ use crate::{ mod directives; mod formats; use directives::{ - archive::compiled::{CompiledArchiveDirective, CompiledArchiveFormat}, + archive::{ + compiled::{CompiledArchiveDirective, CompiledArchiveFormat}, + 
ArchiveData, + }, compiled::CompiledFormatSpec, entry::compiled::{CompiledEntryDirective, CompiledEntryFormat}, }; @@ -76,14 +78,14 @@ fn format_archive_info( mut err: impl Write, archive_formatter: &CompiledFormatSpec, mut output_stream: impl Write, - zip: &ArchiveWithPath, + zip: ArchiveData, ) -> Result<(), CommandError> { if archive_formatter.is_empty() { writeln!(&mut err, "empty archive format, skipping archive overview").unwrap(); return Ok(()); } - archive_formatter.execute_format(&zip, &mut output_stream)?; + archive_formatter.execute_format(zip, &mut output_stream)?; Ok(()) } @@ -145,7 +147,8 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> )?; } - format_archive_info(&mut err, &archive_formatter, &mut output_stream, &zip)?; + let data = ArchiveData::from_archive_with_path(&zip); + format_archive_info(&mut err, &archive_formatter, &mut output_stream, data)?; } Ok(()) diff --git a/cli/src/info/directives.rs b/cli/src/info/directives.rs index ac11ca8a0..785227bc6 100644 --- a/cli/src/info/directives.rs +++ b/cli/src/info/directives.rs @@ -109,7 +109,7 @@ pub mod compiled { } pub struct CompiledFormatSpec { - pub components: Vec>, + components: Vec>, } impl CompiledFormatSpec { @@ -343,22 +343,56 @@ pub mod entry { pub mod archive { use super::{ super::{ - formats::{ByteSizeValue, DecimalNumberValue, FormatValue, OffsetValue, PathString}, + formats::{ + BinaryStringValue, ByteSizeValue, DecimalNumberValue, FormatValue, OffsetValue, + PathString, + }, ArchiveWithPath, }, FormatDirective, }; + use std::path::Path; + + #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] + pub struct ArchiveData<'a> { + pub path: Option<&'a Path>, + pub stream_length: u64, + pub num_entries: usize, + pub comment: Option<&'a [u8]>, + pub first_entry_start: Option, + pub central_directory_start: Option, + } + + impl<'a> ArchiveData<'a> { + pub fn from_archive_with_path(zip: &'a ArchiveWithPath) -> Self { + let path = 
zip.path.as_path(); + let stream_length = zip.len; + let num_entries = zip.archive.len(); + let comment = zip.archive.comment(); + let first_entry_start = zip.archive.offset(); + let central_directory_start = zip.archive.central_directory_start(); + Self { + path: Some(path), + stream_length, + num_entries, + comment: Some(comment), + first_entry_start: Some(first_entry_start), + central_directory_start: Some(central_directory_start), + } + } + } + pub struct ArchiveNameField(pub PathString); impl FormatDirective for ArchiveNameField { - type Data<'a> = &'a ArchiveWithPath; + type Data<'a> = ArchiveData<'a>; type FieldType = PathString; fn extract_field<'a>( &self, data: Self::Data<'a>, ) -> ::Input<'a> { - data.path.as_path() + data.path } fn value_formatter(&self) -> PathString { self.0 @@ -368,13 +402,13 @@ pub mod archive { pub struct ArchiveSizeField(pub ByteSizeValue); impl FormatDirective for ArchiveSizeField { - type Data<'a> = &'a ArchiveWithPath; + type Data<'a> = ArchiveData<'a>; type FieldType = ByteSizeValue; fn extract_field<'a>( &self, data: Self::Data<'a>, ) -> ::Input<'a> { - data.len + data.stream_length } fn value_formatter(&self) -> ByteSizeValue { self.0 @@ -384,29 +418,45 @@ pub mod archive { pub struct NumEntriesField(pub DecimalNumberValue); impl FormatDirective for NumEntriesField { - type Data<'a> = &'a ArchiveWithPath; + type Data<'a> = ArchiveData<'a>; type FieldType = DecimalNumberValue; fn extract_field<'a>( &self, data: Self::Data<'a>, ) -> ::Input<'a> { - data.archive.len().try_into().unwrap() + data.num_entries.try_into().unwrap() } fn value_formatter(&self) -> DecimalNumberValue { self.0 } } + pub struct ArchiveCommentField(pub BinaryStringValue); + + impl FormatDirective for ArchiveCommentField { + type Data<'a> = ArchiveData<'a>; + type FieldType = BinaryStringValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.comment + } + fn value_formatter(&self) -> BinaryStringValue { + self.0 + } + } 
+ pub struct FirstEntryStartField(pub OffsetValue); impl FormatDirective for FirstEntryStartField { - type Data<'a> = &'a ArchiveWithPath; + type Data<'a> = ArchiveData<'a>; type FieldType = OffsetValue; fn extract_field<'a>( &self, data: Self::Data<'a>, ) -> ::Input<'a> { - data.archive.offset() + data.first_entry_start } fn value_formatter(&self) -> OffsetValue { self.0 @@ -416,13 +466,13 @@ pub mod archive { pub struct CentralDirectoryStartField(pub OffsetValue); impl FormatDirective for CentralDirectoryStartField { - type Data<'a> = &'a ArchiveWithPath; + type Data<'a> = ArchiveData<'a>; type FieldType = OffsetValue; fn extract_field<'a>( &self, data: Self::Data<'a>, ) -> ::Input<'a> { - data.archive.central_directory_start() + data.central_directory_start } fn value_formatter(&self) -> OffsetValue { self.0 @@ -441,18 +491,18 @@ pub mod archive { trait ArchiveDirectiveFormatter { fn write_archive_directive<'a>( &self, - data: &'a ArchiveWithPath, + data: ArchiveData<'a>, out: &mut dyn Write, ) -> Result<(), CommandError>; } impl ArchiveDirectiveFormatter for CF where - CF: for<'a> DirectiveFormatter = &'a ArchiveWithPath>, + CF: for<'a> DirectiveFormatter = ArchiveData<'a>>, { fn write_archive_directive<'a>( &self, - data: &'a ArchiveWithPath, + data: ArchiveData<'a>, out: &mut dyn Write, ) -> Result<(), CommandError> { self.write_directive(data, out) @@ -462,7 +512,7 @@ pub mod archive { pub struct CompiledArchiveDirective(Box); impl DirectiveFormatter for CompiledArchiveDirective { - type Data<'a> = &'a ArchiveWithPath; + type Data<'a> = ArchiveData<'a>; fn write_directive<'a>( &self, @@ -492,8 +542,8 @@ pub mod archive { ArchiveOverviewFormatDirective::NumEntries => { Box::new(NumEntriesField(DecimalNumberValue)) } - ArchiveOverviewFormatDirective::ArchiveComment(x) => { - todo!("comment not supported yet: {:?}", x) + ArchiveOverviewFormatDirective::ArchiveComment(f) => { + Box::new(ArchiveCommentField(BinaryStringValue(f))) } 
ArchiveOverviewFormatDirective::FirstEntryStart(f) => { Box::new(FirstEntryStartField(OffsetValue(f))) diff --git a/cli/src/info/formats.rs b/cli/src/info/formats.rs index 4ff5384cf..0506f33c8 100644 --- a/cli/src/info/formats.rs +++ b/cli/src/info/formats.rs @@ -32,12 +32,30 @@ impl FormatValue for NameString { #[derive(Copy, Clone)] pub struct PathString; +#[derive(Debug)] +pub enum PathWriter<'a> { + Path(path::Display<'a>), + None, +} + +impl<'a> fmt::Display for PathWriter<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Path(p) => path::Display::fmt(p, f), + Self::None => write!(f, ""), + } + } +} + impl FormatValue for PathString { - type Input<'a> = &'a path::Path; - type Output<'a> = path::Display<'a>; + type Input<'a> = Option<&'a path::Path>; + type Output<'a> = PathWriter<'a>; type E = Infallible; fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { - Ok(input.display()) + Ok(match input { + Some(p) => PathWriter::Path(p.display()), + None => PathWriter::None, + }) } } @@ -240,6 +258,7 @@ pub struct OffsetValue(pub OffsetFormat); #[derive(Debug)] pub enum OffsetWriter { + Unknown, Decimal(u64), Hexadecimal(u64), } @@ -247,6 +266,7 @@ pub enum OffsetWriter { impl fmt::Display for OffsetWriter { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + Self::Unknown => write!(f, "?"), Self::Decimal(x) => write!(f, "{}", x), Self::Hexadecimal(x) => write!(f, "{:x}", x), } @@ -254,13 +274,72 @@ impl fmt::Display for OffsetWriter { } impl FormatValue for OffsetValue { - type Input<'a> = u64; + type Input<'a> = Option; type Output<'a> = OffsetWriter; type E = Infallible; fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + let input = match input { + None => return Ok(OffsetWriter::Unknown), + Some(input) => input, + }; Ok(match self.0 { OffsetFormat::Decimal => OffsetWriter::Decimal(input), OffsetFormat::Hexadecimal => OffsetWriter::Hexadecimal(input), }) } } + 
+#[derive(Copy, Clone)] +pub struct BinaryStringValue(pub BinaryStringFormat); + +#[derive(Debug)] +pub enum BinaryStringWriter<'a> { + ReplaceNonUnicode(&'a [u8]), + EscapeAscii(&'a [u8]), + WriteExactly(&'a [u8]), +} + +impl<'a> BinaryStringWriter<'a> { + const INVALID_CHUNK_BUFS: [&'static str; 4] = ["", "�", "��", "���"]; +} + +impl<'a> Writeable for BinaryStringWriter<'a> { + fn write_to(&self, out: &mut dyn Write) -> Result<(), io::Error> { + match self { + Self::ReplaceNonUnicode(s) => { + for chunk in s.utf8_chunks() { + write!(out, "{}", chunk.valid())?; + /* The length of invalid bytes is never longer than 3. */ + write!(out, "{}", Self::INVALID_CHUNK_BUFS[chunk.invalid().len()])?; + } + Ok(()) + } + Self::EscapeAscii(s) => { + if s.is_empty() { + return write!(out, "\"\""); + } + write!(out, "\" ")?; + for b in s.iter().copied() { + write!(out, "{} ", b.escape_ascii())?; + } + write!(out, "\"")?; + Ok(()) + } + Self::WriteExactly(s) => out.write_all(s), + } + } +} + +impl FormatValue for BinaryStringValue { + type Input<'a> = Option<&'a [u8]>; + type Output<'a> = BinaryStringWriter<'a>; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + let input = input.unwrap_or(&[]); + Ok(match self.0 { + BinaryStringFormat::PrintAsString => BinaryStringWriter::ReplaceNonUnicode(input), + BinaryStringFormat::EscapeAscii => BinaryStringWriter::EscapeAscii(input), + BinaryStringFormat::WriteBinaryContents => BinaryStringWriter::WriteExactly(input), + }) + } +} From 1d1d3473e4c9c4b597b55ff8ae792acfc87d305d Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 28 Aug 2024 23:59:46 -0400 Subject: [PATCH 53/68] archive data works for stdin! 
--- cli/src/extract/entries.rs | 92 ++++++++++++++++++++++++++++++-------- cli/src/info.rs | 30 +++++++++---- 2 files changed, 94 insertions(+), 28 deletions(-) diff --git a/cli/src/extract/entries.rs b/cli/src/extract/entries.rs index 0ad374254..52c8a07a2 100644 --- a/cli/src/extract/entries.rs +++ b/cli/src/extract/entries.rs @@ -1,11 +1,4 @@ -use std::{ - cell::UnsafeCell, - collections::VecDeque, - fs, - io::{self}, - ops, - path::Path, -}; +use std::{cell::UnsafeCell, collections::VecDeque, fs, io, ops, path::Path}; use zip::{ read::{read_zipfile_from_stream, ZipFile}, @@ -18,23 +11,84 @@ pub trait IterateEntries { fn next_entry(&mut self) -> Result, CommandError>; } -pub struct StdinInput { - inner: io::Stdin, +pub struct ReadChecker { + inner: R, + bytes_read: u64, } -impl StdinInput { - pub fn new() -> Self { - Self { inner: io::stdin() } +impl ReadChecker { + pub const fn current_bytes_read(&self) -> u64 { + self.bytes_read } +} - pub fn into_inner(self) -> io::Stdin { - self.inner +impl ReadChecker +where + R: io::Read, +{ + pub fn exhaust(mut self) -> io::Result<(R, u64)> { + io::copy(&mut self, &mut io::sink())?; + let Self { inner, bytes_read } = self; + Ok((inner, bytes_read)) + } +} + +impl io::Read for ReadChecker +where + R: io::Read, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let n = self.inner.read(buf)?; + let num_read: u64 = n.try_into().unwrap(); + self.bytes_read += num_read; + Ok(n) } } -impl IterateEntries for StdinInput { +pub struct StreamInput { + inner: ReadChecker, + entries_read: usize, +} + +impl StreamInput { + pub fn stdin() -> Self { + Self::new(io::stdin()) + } +} + +impl StreamInput { + pub fn new(inner: R) -> Self { + Self { + inner: ReadChecker { + inner, + bytes_read: 0, + }, + entries_read: 0, + } + } + + pub fn into_inner(self) -> (ReadChecker, usize) { + let Self { + inner, + entries_read, + } = self; + (inner, entries_read) + } +} + +impl IterateEntries for StreamInput +where + R: io::Read, +{ fn 
next_entry(&mut self) -> Result, CommandError> { - read_zipfile_from_stream(&mut self.inner).wrap_err("failed to read zip entries from stdin") + if let Some(entry) = read_zipfile_from_stream(&mut self.inner) + .wrap_err("failed to read zip entries from stdin")? + { + self.entries_read += 1; + Ok(Some(entry)) + } else { + Ok(None) + } } } @@ -135,7 +189,7 @@ impl IterateEntries for AllInputZips { } pub struct MergedInput { - stdin_stream: Option>, + stdin_stream: Option>>, zips: Option, } @@ -147,7 +201,7 @@ impl MergedInput { } = spec; Ok(Self { stdin_stream: if stdin_stream { - Some(UnsafeCell::new(StdinInput::new())) + Some(UnsafeCell::new(StreamInput::stdin())) } else { None }, diff --git a/cli/src/info.rs b/cli/src/info.rs index 13c3d7241..4a12f0253 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -9,7 +9,7 @@ use zip::read::ZipArchive; use crate::{ args::{extract::InputSpec, info::*}, extract::{ - entries::{IterateEntries, StdinInput, ZipFileInput}, + entries::{IterateEntries, StreamInput, ZipFileInput}, matcher::{CompiledMatcher, EntryMatcher}, receiver::EntryData, }, @@ -114,7 +114,7 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> let mut output_stream = io::stdout().lock(); if stdin_stream { - let mut stdin = StdinInput::new(); + let mut stdin = StreamInput::new(io::stdin().lock()); format_entry_info( &mut err, @@ -124,13 +124,25 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> &mut stdin, )?; - if !archive_formatter.is_empty() { - writeln!( - &mut err, - "archive format was provided but stdin currently cannot provide archive format info" - ) - .unwrap(); - } + let (stdin, num_entries) = stdin.into_inner(); + /* NB: The read_zipfile_from_stream() method overruns the size of a single local header into + * the CDE after reading the last input. There are unstable APIs to address this, but for + * now just rely on that internal knowledge. See e.g. 
zip::read::stream on master or + * zip::unstable::read in https://github.com/zip-rs/zip2/pull/233. */ + let cde_start = stdin.current_bytes_read() - 30; + let (_stdin, stream_length) = stdin + .exhaust() + .wrap_err("failed to exhaust all of stdin after reading all zip entries")?; + + let data = ArchiveData { + path: None, + stream_length, + num_entries, + comment: None, + first_entry_start: Some(0), + central_directory_start: Some(cde_start), + }; + format_archive_info(&mut err, &archive_formatter, &mut output_stream, data)?; } for p in zip_paths.into_iter() { From b88afb8b098ef108745b065289c9dcca70a0bd3c Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 00:56:28 -0400 Subject: [PATCH 54/68] add more logging to extraction --- cli/src/extract.rs | 14 ++--- cli/src/extract/receiver.rs | 104 ++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 52 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 3cda277f4..3f772017c 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -16,14 +16,16 @@ use matcher::EntryMatcher; use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryKind, EntryReceiver, ExtractEntry}; use transform::NameTransformer; -pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), CommandError> { +pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { let Extract { output_specs, entry_specs, input_spec, } = extract; + let err = Rc::new(RefCell::new(err)); - let compiled_specs = receiver::process_entry_and_output_specs(entry_specs, output_specs)?; + let compiled_specs = + receiver::process_entry_and_output_specs(err.clone(), entry_specs, output_specs)?; let mut entry_iterator = entries::MergedInput::from_spec(input_spec)?; let mut copy_buf: Vec = vec![0u8; 1024 * 16]; @@ -66,8 +68,8 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm .as_ref() .map(|t| 
t.transform_name(&data.name)) .unwrap_or_else(|| Cow::Borrowed(&data.name)); - writeln!(&mut err, "{data:?}").unwrap(); - writeln!(&mut err, "{new_name:?}").unwrap(); + writeln!(&mut err.borrow_mut(), "{data:?}").unwrap(); + writeln!(&mut err.borrow_mut(), "{new_name:?}").unwrap(); matching_extracts.push((new_name, recv.clone())); } } @@ -85,7 +87,7 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm .iter() .any(|p| Rc::ptr_eq(p, &concat_p)) { - writeln!(&mut err, "skipping repeated concat").unwrap(); + writeln!(&mut err.borrow_mut(), "skipping repeated concat").unwrap(); } else { deduped_concat_writers.push(concat_p); } @@ -96,7 +98,7 @@ pub fn execute_extract(mut err: impl Write, extract: Extract) -> Result<(), Comm .iter() .any(|(n, p)| Rc::ptr_eq(p, &extract_p) && name.as_ref() == n.as_ref()) { - writeln!(&mut err, "skipping repeated extract").unwrap(); + writeln!(&mut err.borrow_mut(), "skipping repeated extract").unwrap(); } else { deduped_matching_extracts.push((name, extract_p)); } diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 9e0bc2ae7..0a8bf1594 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -94,31 +94,32 @@ impl ParsedEntrySpecArg { } } -pub struct ConcatEntry { +pub struct ConcatEntry<'w> { pub matcher: Option, - pub stream: Rc>, + pub stream: Rc>, } -pub struct ExtractEntry { +pub struct ExtractEntry<'w> { pub matcher: Option, pub transforms: Option, - pub recv: Rc, + pub recv: Rc, } -pub enum CompiledEntrySpec { - Concat(ConcatEntry), - Extract(ExtractEntry), +pub enum CompiledEntrySpec<'w> { + Concat(ConcatEntry<'w>), + Extract(ExtractEntry<'w>), } -pub struct ParsedNamedOutputs { - concats: HashMap>>, - extracts: HashMap>, +pub struct ParsedNamedOutputs<'w> { + concats: HashMap>>, + extracts: HashMap>, } -pub fn process_entry_and_output_specs( +pub fn process_entry_and_output_specs<'w>( + err: Rc>, entry_specs: impl IntoIterator, output_specs: OutputSpecs, 
-) -> Result, CommandError> { +) -> Result>, CommandError> { let mut entry_specs: Vec = entry_specs .into_iter() .map(ParsedEntrySpecArg::from_entry_spec) @@ -130,15 +131,15 @@ pub fn process_entry_and_output_specs( output_name: OutputName::default_name(), }); } - let parsed_outputs = ParsedNamedOutputs::from_output_specs(output_specs)?; + let parsed_outputs = ParsedNamedOutputs::from_output_specs(err, output_specs)?; parsed_outputs.process_entry_specs_for_outputs(entry_specs) } -impl ParsedNamedOutputs { +impl<'w> ParsedNamedOutputs<'w> { pub fn process_entry_specs_for_outputs( self, args: impl IntoIterator, - ) -> Result, CommandError> { + ) -> Result>, CommandError> { args.into_iter() .map(|arg| self.lookup_entry_spec_arg(arg)) .collect() @@ -147,7 +148,7 @@ impl ParsedNamedOutputs { fn lookup_entry_spec_arg( &self, arg: ParsedEntrySpecArg, - ) -> Result { + ) -> Result, CommandError> { let ParsedEntrySpecArg { matcher, transforms, @@ -180,7 +181,7 @@ impl ParsedNamedOutputs { seen_stdout: &mut bool, name: OutputName, seen_names: &mut HashSet, - concats: &mut HashMap>>, + concats: &mut HashMap>>, ) -> Result<(), CommandError> { if *seen_stdout { return Err(CommandError::InvalidArg( @@ -194,7 +195,7 @@ impl ParsedNamedOutputs { } assert!(!concats.contains_key(&name)); - let handle: Rc> = Rc::new(RefCell::new(io::stdout())); + let handle: Rc> = Rc::new(RefCell::new(io::stdout())); *seen_stdout = true; assert!(seen_names.insert(name.clone())); @@ -208,7 +209,7 @@ impl ParsedNamedOutputs { name: OutputName, seen_files: &mut HashSet, seen_names: &mut HashSet, - concats: &mut HashMap>>, + concats: &mut HashMap>>, ) -> Result<(), CommandError> { if seen_names.contains(&name) { return Err(CommandError::InvalidArg(format!( @@ -217,7 +218,7 @@ impl ParsedNamedOutputs { } assert!(!concats.contains_key(&name)); - let handle: Rc> = { + let handle: Rc> = { let mut f: fs::File = if append { fs::OpenOptions::new() .write(true) @@ -249,12 +250,13 @@ impl ParsedNamedOutputs { } 
fn add_dir( + err: Rc>, output_dir: PathBuf, mkdir: bool, name: OutputName, seen_dirs: &mut HashSet, seen_names: &mut HashSet, - extracts: &mut HashMap>, + extracts: &mut HashMap>, ) -> Result<(), CommandError> { if seen_names.contains(&name) { return Err(CommandError::InvalidArg(format!( @@ -277,8 +279,8 @@ impl ParsedNamedOutputs { ))); } - let handle: Rc = { - let d = FilesystemReceiver::new(output_dir); + let handle: Rc = { + let d = FilesystemReceiver::new(err, output_dir); Rc::new(d) }; @@ -288,11 +290,14 @@ impl ParsedNamedOutputs { Ok(()) } - pub fn from_output_specs(spec: OutputSpecs) -> Result { + pub fn from_output_specs( + err: Rc>, + spec: OutputSpecs, + ) -> Result { let OutputSpecs { default, named } = spec; - let mut concats: HashMap>> = HashMap::new(); - let mut extracts: HashMap> = HashMap::new(); + let mut concats: HashMap>> = HashMap::new(); + let mut extracts: HashMap> = HashMap::new(); let mut seen_stdout: bool = false; let mut seen_files: HashSet = HashSet::new(); @@ -321,6 +326,7 @@ impl ParsedNamedOutputs { } OutputCollation::Filesystem { output_dir, mkdir } => { Self::add_dir( + err.clone(), output_dir, mkdir, OutputName::default_name(), @@ -349,6 +355,7 @@ impl ParsedNamedOutputs { } OutputCollation::Filesystem { output_dir, mkdir } => { Self::add_dir( + err.clone(), output_dir, mkdir, name, @@ -375,15 +382,17 @@ pub trait EntryReceiver { fn finalize_entries(&self) -> Result<(), CommandError>; } -struct FilesystemReceiver { +struct FilesystemReceiver { + err: Rc>, output_dir: PathBuf, #[cfg(unix)] perms_to_set: RefCell>, } -impl FilesystemReceiver { - pub fn new(output_dir: PathBuf) -> Self { +impl FilesystemReceiver { + pub fn new(err: Rc>, output_dir: PathBuf) -> Self { Self { + err, output_dir, #[cfg(unix)] perms_to_set: RefCell::new(Vec::new()), @@ -391,29 +400,32 @@ impl FilesystemReceiver { } } -impl EntryReceiver for FilesystemReceiver { +impl EntryReceiver for FilesystemReceiver +where + W: Write, +{ fn generate_entry_handle<'s>( 
&self, data: EntryData<'s>, symlink_target: Option<&[u8]>, name: Cow<'s, str>, ) -> Result>, CommandError> { - /* let mut err = self.err.borrow_mut(); */ + let mut err = self.err.borrow_mut(); let full_output_path = self.output_dir.join(name.as_ref()); - /* writeln!( */ - /* err, */ - /* "receiving entry {} with name {name} and writing to path {full_output_path:?}", */ - /* entry.name() */ - /* ) */ - /* .unwrap(); */ + writeln!( + err, + "receiving entry {} with name {name} and writing to path {full_output_path:?}", + data.name + ) + .unwrap(); #[cfg(unix)] if let Some(mode) = data.unix_mode { - /* writeln!( */ - /* err, */ - /* "storing unix mode {mode} for path {full_output_path:?}" */ - /* ) */ - /* .unwrap(); */ + writeln!( + err, + "storing unix mode {mode} for path {full_output_path:?}" + ) + .unwrap(); self.perms_to_set .borrow_mut() .push((full_output_path.clone(), mode)); @@ -421,7 +433,7 @@ impl EntryReceiver for FilesystemReceiver { match data.kind { EntryKind::Dir => { - /* writeln!(err, "entry is directory, creating").unwrap(); */ + writeln!(err, "entry is directory, creating").unwrap(); fs::create_dir_all(&full_output_path).wrap_err_with(|| { format!("failed to create directory entry at {full_output_path:?}") })?; @@ -438,7 +450,7 @@ impl EntryReceiver for FilesystemReceiver { os::unix::{ffi::OsStringExt, fs::symlink}, }; let target = OsString::from_vec(target); - /* writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); */ + writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); symlink(&target, &full_output_path).wrap_err_with(|| { format!( "failed to create symlink at {full_output_path:?} with target {target:?}" @@ -452,13 +464,13 @@ impl EntryReceiver for FilesystemReceiver { } } EntryKind::File => { - /* writeln!(err, "entry is file, creating").unwrap(); */ + writeln!(err, "entry is file, creating").unwrap(); if let Some(containing_dir) = full_output_path.parent() { fs::create_dir_all(containing_dir).wrap_err_with(|| { 
format!("failed to create parent dirs for file at {full_output_path:?}") })?; } else { - /* writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); */ + writeln!(err, "entry had no parent dir (in root dir?)").unwrap(); } let outfile = fs::File::create(&full_output_path) .wrap_err_with(|| format!("failed to create file at {full_output_path:?}"))?; From eee542f8a4b7fce6f2d1b84560cf9a4714af4a93 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 01:24:54 -0400 Subject: [PATCH 55/68] remove allocations in perms todo sorting --- cli/src/extract/receiver.rs | 79 ++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 0a8bf1594..3118cc2fd 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -382,11 +382,18 @@ pub trait EntryReceiver { fn finalize_entries(&self) -> Result<(), CommandError>; } +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[cfg(unix)] +struct PermsEntry { + path: PathBuf, + mode: u32, +} + struct FilesystemReceiver { err: Rc>, output_dir: PathBuf, #[cfg(unix)] - perms_to_set: RefCell>, + perms_to_set: RefCell>, } impl FilesystemReceiver { @@ -419,18 +426,6 @@ where ) .unwrap(); - #[cfg(unix)] - if let Some(mode) = data.unix_mode { - writeln!( - err, - "storing unix mode {mode} for path {full_output_path:?}" - ) - .unwrap(); - self.perms_to_set - .borrow_mut() - .push((full_output_path.clone(), mode)); - } - match data.kind { EntryKind::Dir => { writeln!(err, "entry is directory, creating").unwrap(); @@ -451,15 +446,36 @@ where }; let target = OsString::from_vec(target); writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); - symlink(&target, &full_output_path).wrap_err_with(|| { - format!( - "failed to create symlink at {full_output_path:?} with target {target:?}" - ) - })?; + /* The stdlib symlink function has no 
functionality like OpenOptions to + * truncate a symlink if it already exists, so we have to do that ourselves + * here. */ + if let Err(e) = symlink(&target, &full_output_path) { + let e = match e.kind() { + io::ErrorKind::AlreadyExists => { + writeln!(err, "a file already existed at the symlink target {full_output_path:?}, removing") + .unwrap(); + fs::remove_file(&full_output_path) + .wrap_err_with(|| format!("failed to remove file at symlink target {full_output_path:?}"))?; + writeln!( + err, + "successfully removed file entry, creating symlink again" + ) + .unwrap(); + symlink(&target, &full_output_path).err() + } + _ => Some(e), + }; + if let Some(e) = e { + return Err(e).wrap_err_with(|| { + format!( + "failed to create symlink at {full_output_path:?} with target {target:?}" + ) + }); + } + } } #[cfg(not(unix))] { - /* FIXME: non-unix symlink extraction not yet supported! */ todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); } } @@ -477,17 +493,36 @@ where return Ok(Some(Box::new(outfile))); } } + + #[cfg(unix)] + if let Some(mode) = data.unix_mode { + writeln!( + err, + "storing unix mode {mode} for path {full_output_path:?}" + ) + .unwrap(); + self.perms_to_set.borrow_mut().push(PermsEntry { + path: full_output_path, + mode, + }); + } + Ok(None) } fn finalize_entries(&self) -> Result<(), CommandError> { #[cfg(unix)] { - use std::{cmp::Reverse, os::unix::fs::PermissionsExt}; + use std::os::unix::fs::PermissionsExt; let mut perms_to_set = mem::take(&mut *self.perms_to_set.borrow_mut()); - perms_to_set.sort_unstable_by_key(|(path, _)| Reverse(path.clone())); - for (path, mode) in perms_to_set.into_iter() { + perms_to_set.sort_unstable(); + writeln!( + &mut self.err.borrow_mut(), + "perms to set (these are done in reverse order): {perms_to_set:?}" + ) + .unwrap(); + for PermsEntry { path, mode } in perms_to_set.into_iter().rev() { let perms = fs::Permissions::from_mode(mode); fs::set_permissions(&path, perms.clone()) .wrap_err_with(|| 
format!("error setting perms {perms:?} for path {path:?}"))?; From cc8fab99fb8407690c27a79fb3091cd2110ef7ab Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 02:09:46 -0400 Subject: [PATCH 56/68] several preliminary notes added to extract command --- cli/src/extract.rs | 43 ++++++++++++++++++++++++++----------- cli/src/extract/receiver.rs | 6 +++--- cli/src/info.rs | 2 +- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 3f772017c..16bd89a8b 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -30,6 +30,12 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE let mut copy_buf: Vec = vec![0u8; 1024 * 16]; + let mut matching_concats: Vec>> = Vec::new(); + /* let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); */ + let mut deduped_concat_writers: Vec>> = Vec::new(); + /* let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); */ + let mut matching_handles: Vec> = Vec::new(); + while let Some(mut entry) = entry_iterator.next_entry()? { let symlink_target: Option> = { let (kind, size) = { @@ -49,7 +55,6 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE }; let data = EntryData::from_entry(&entry); - let mut matching_concats: Vec>> = Vec::new(); let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); for spec in compiled_specs.iter() { match spec { @@ -81,8 +86,7 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE /* Split output handles for concat, and split generated handles by extract source and * name. use Rc::ptr_eq() to split, and Cow::<'s, str>::eq() with str AsRef. */ - let mut deduped_concat_writers: Vec>> = Vec::new(); - for concat_p in matching_concats.into_iter() { + for concat_p in matching_concats.drain(..) 
{ if deduped_concat_writers .iter() .any(|p| Rc::ptr_eq(p, &concat_p)) @@ -104,16 +108,25 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE } } - let mut matching_handles: Vec> = deduped_matching_extracts - .into_iter() - .map(|(name, recv)| { - recv.generate_entry_handle(data, symlink_target.as_ref().map(|t| t.as_ref()), name) - }) - .collect::, _>>()? - .into_iter() - .flatten() - .collect(); + matching_handles.extend( + deduped_matching_extracts + .into_iter() + .map(|(name, recv)| { + recv.generate_entry_handle( + data, + symlink_target.as_ref().map(|t| t.as_ref()), + name, + ) + }) + .collect::, _>>()? + .into_iter() + .flatten(), + ); + /* let mut derefed_concat_writers: Vec> = deduped_concat_writers */ + /* .drain(..) */ + /* .map(|w| w.borrow_mut()) */ + /* .collect(); */ let mut read_len: usize; loop { read_len = entry.read(&mut copy_buf).wrap_err("read of entry failed")?; @@ -133,6 +146,12 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE .wrap_err("failed to write data to extract output")?; } } + + /* matching_concats.clear(); */ + /* matching_extracts.clear(); */ + deduped_concat_writers.clear(); + /* deduped_matching_extracts.clear(); */ + matching_handles.clear(); } /* Finalize all extract entries. 
*/ for spec in compiled_specs.into_iter() { diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 3118cc2fd..bc4450eef 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -51,7 +51,7 @@ impl<'a> EntryData<'a> { } #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct OutputName(pub String); +struct OutputName(pub String); impl OutputName { pub fn default_name() -> Self { @@ -59,7 +59,7 @@ impl OutputName { } } -pub struct ParsedEntrySpecArg { +struct ParsedEntrySpecArg { pub matcher: Option, pub transforms: Option, pub output_name: OutputName, @@ -110,7 +110,7 @@ pub enum CompiledEntrySpec<'w> { Extract(ExtractEntry<'w>), } -pub struct ParsedNamedOutputs<'w> { +struct ParsedNamedOutputs<'w> { concats: HashMap>>, extracts: HashMap>, } diff --git a/cli/src/info.rs b/cli/src/info.rs index 4a12f0253..4a206bdce 100644 --- a/cli/src/info.rs +++ b/cli/src/info.rs @@ -146,7 +146,7 @@ pub fn execute_info(mut err: impl Write, args: Info) -> Result<(), CommandError> } for p in zip_paths.into_iter() { - let mut zip = ArchiveWithPath::open(p.clone())?; + let mut zip = ArchiveWithPath::open(p)?; { let mut zip_entry_counter = ZipFileInput::new(&mut zip.archive); From a09f020649bdac70fe0775f9c202ceacbdb37287 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 02:26:22 -0400 Subject: [PATCH 57/68] make process_entry() helper method --- cli/src/extract.rs | 256 +++++++++++++++++++++++++-------------------- 1 file changed, 140 insertions(+), 116 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 16bd89a8b..b732b454c 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -5,6 +5,8 @@ use std::{ rc::Rc, }; +use zip::read::ZipFile; + use crate::{args::extract::*, CommandError, WrapCommandErr}; pub mod entries; @@ -16,6 +18,133 @@ use matcher::EntryMatcher; use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryKind, 
EntryReceiver, ExtractEntry}; use transform::NameTransformer; +fn process_entry<'a, 'w, 'it>( + mut entry: ZipFile<'a>, + err: &Rc>, + compiled_specs: impl Iterator>, + copy_buf: &mut [u8], + matching_concats: &mut Vec>>, + deduped_concat_writers: &mut Vec>>, + matching_handles: &mut Vec>, +) -> Result<(), CommandError> +where + 'w: 'it, +{ + let symlink_target: Option> = { + let (kind, size) = { + let data = EntryData::from_entry(&entry); + (data.kind, data.size) + }; + match kind { + EntryKind::Symlink => { + let mut target: Vec = Vec::with_capacity(size.try_into().unwrap()); + entry + .read_to_end(&mut target) + .wrap_err("failed to read symlink target from zip archive entry")?; + Some(target) + } + _ => None, + } + }; + let data = EntryData::from_entry(&entry); + + let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); + for spec in compiled_specs { + match spec { + CompiledEntrySpec::Concat(ConcatEntry { matcher, stream }) => { + if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { + matching_concats.push(stream.clone()); + } + } + CompiledEntrySpec::Extract(ExtractEntry { + matcher, + transforms, + recv, + }) => { + if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { + let new_name = transforms + .as_ref() + .map(|t| t.transform_name(&data.name)) + .unwrap_or_else(|| Cow::Borrowed(&data.name)); + writeln!(&mut err.borrow_mut(), "{data:?}").unwrap(); + writeln!(&mut err.borrow_mut(), "{new_name:?}").unwrap(); + matching_extracts.push((new_name, recv.clone())); + } + } + } + } + if matching_concats.is_empty() && matching_extracts.is_empty() { + return Ok(()); + } + + /* Split output handles for concat, and split generated handles by extract source and + * name. use Rc::ptr_eq() to split, and Cow::<'s, str>::eq() with str AsRef. */ + for concat_p in matching_concats.drain(..) 
{ + if deduped_concat_writers + .iter() + .any(|p| Rc::ptr_eq(p, &concat_p)) + { + writeln!(&mut err.borrow_mut(), "skipping repeated concat").unwrap(); + } else { + deduped_concat_writers.push(concat_p); + } + } + let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); + for (name, extract_p) in matching_extracts.into_iter() { + if deduped_matching_extracts + .iter() + .any(|(n, p)| Rc::ptr_eq(p, &extract_p) && name.as_ref() == n.as_ref()) + { + writeln!(&mut err.borrow_mut(), "skipping repeated extract").unwrap(); + } else { + deduped_matching_extracts.push((name, extract_p)); + } + } + + matching_handles.extend( + deduped_matching_extracts + .into_iter() + .map(|(name, recv)| { + recv.generate_entry_handle(data, symlink_target.as_ref().map(|t| t.as_ref()), name) + }) + .collect::, _>>()? + .into_iter() + .flatten(), + ); + + /* let mut derefed_concat_writers: Vec> = deduped_concat_writers */ + /* .drain(..) */ + /* .map(|w| w.borrow_mut()) */ + /* .collect(); */ + let mut read_len: usize; + loop { + read_len = entry.read(copy_buf).wrap_err("read of entry failed")?; + if read_len == 0 { + break; + } + let cur_data: &[u8] = ©_buf[..read_len]; + for concat_writer in deduped_concat_writers.iter() { + concat_writer + .borrow_mut() + .write_all(cur_data) + .wrap_err("failed to write data to concat output")?; + } + for extract_writer in matching_handles.iter_mut() { + extract_writer + .write_all(cur_data) + .wrap_err("failed to write data to extract output")?; + } + } + + /* matching_concats.clear(); */ + /* matching_extracts.clear(); */ + deduped_concat_writers.clear(); + /* deduped_matching_extracts.clear(); */ + matching_handles.clear(); + + Ok(()) +} + pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandError> { let Extract { output_specs, @@ -36,123 +165,18 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE /* let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); */ let 
mut matching_handles: Vec> = Vec::new(); - while let Some(mut entry) = entry_iterator.next_entry()? { - let symlink_target: Option> = { - let (kind, size) = { - let data = EntryData::from_entry(&entry); - (data.kind, data.size) - }; - match kind { - EntryKind::Symlink => { - let mut target: Vec = Vec::with_capacity(size.try_into().unwrap()); - entry - .read_to_end(&mut target) - .wrap_err("failed to read symlink target from zip archive entry")?; - Some(target) - } - _ => None, - } - }; - let data = EntryData::from_entry(&entry); - - let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); - for spec in compiled_specs.iter() { - match spec { - CompiledEntrySpec::Concat(ConcatEntry { matcher, stream }) => { - if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { - matching_concats.push(stream.clone()); - } - } - CompiledEntrySpec::Extract(ExtractEntry { - matcher, - transforms, - recv, - }) => { - if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { - let new_name = transforms - .as_ref() - .map(|t| t.transform_name(&data.name)) - .unwrap_or_else(|| Cow::Borrowed(&data.name)); - writeln!(&mut err.borrow_mut(), "{data:?}").unwrap(); - writeln!(&mut err.borrow_mut(), "{new_name:?}").unwrap(); - matching_extracts.push((new_name, recv.clone())); - } - } - } - } - if matching_concats.is_empty() && matching_extracts.is_empty() { - continue; - } - - /* Split output handles for concat, and split generated handles by extract source and - * name. use Rc::ptr_eq() to split, and Cow::<'s, str>::eq() with str AsRef. */ - for concat_p in matching_concats.drain(..) 
{ - if deduped_concat_writers - .iter() - .any(|p| Rc::ptr_eq(p, &concat_p)) - { - writeln!(&mut err.borrow_mut(), "skipping repeated concat").unwrap(); - } else { - deduped_concat_writers.push(concat_p); - } - } - let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); - for (name, extract_p) in matching_extracts.into_iter() { - if deduped_matching_extracts - .iter() - .any(|(n, p)| Rc::ptr_eq(p, &extract_p) && name.as_ref() == n.as_ref()) - { - writeln!(&mut err.borrow_mut(), "skipping repeated extract").unwrap(); - } else { - deduped_matching_extracts.push((name, extract_p)); - } - } - - matching_handles.extend( - deduped_matching_extracts - .into_iter() - .map(|(name, recv)| { - recv.generate_entry_handle( - data, - symlink_target.as_ref().map(|t| t.as_ref()), - name, - ) - }) - .collect::, _>>()? - .into_iter() - .flatten(), - ); - - /* let mut derefed_concat_writers: Vec> = deduped_concat_writers */ - /* .drain(..) */ - /* .map(|w| w.borrow_mut()) */ - /* .collect(); */ - let mut read_len: usize; - loop { - read_len = entry.read(&mut copy_buf).wrap_err("read of entry failed")?; - if read_len == 0 { - break; - } - let cur_data: &[u8] = ©_buf[..read_len]; - for concat_writer in deduped_concat_writers.iter() { - concat_writer - .borrow_mut() - .write_all(cur_data) - .wrap_err("failed to write data to concat output")?; - } - for extract_writer in matching_handles.iter_mut() { - extract_writer - .write_all(cur_data) - .wrap_err("failed to write data to extract output")?; - } - } - - /* matching_concats.clear(); */ - /* matching_extracts.clear(); */ - deduped_concat_writers.clear(); - /* deduped_matching_extracts.clear(); */ - matching_handles.clear(); + while let Some(entry) = entry_iterator.next_entry()? { + process_entry( + entry, + &err, + compiled_specs.iter(), + &mut copy_buf, + &mut matching_concats, + &mut deduped_concat_writers, + &mut matching_handles, + )?; } + /* Finalize all extract entries. 
*/ for spec in compiled_specs.into_iter() { match spec { From b5046a527082d2ef8e7e29cdfc353006589fe8a3 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 02:39:15 -0400 Subject: [PATCH 58/68] remove UnsafeCell!!! --- cli/src/extract.rs | 65 ++++++++++++++++------ cli/src/extract/entries.rs | 108 +------------------------------------ 2 files changed, 52 insertions(+), 121 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index b732b454c..2c942211d 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,11 +1,12 @@ use std::{ borrow::Cow, cell::RefCell, - io::{Read, Write}, + fs, + io::{self, Read, Write}, rc::Rc, }; -use zip::read::ZipFile; +use zip::read::{ZipArchive, ZipFile}; use crate::{args::extract::*, CommandError, WrapCommandErr}; @@ -13,7 +14,7 @@ pub mod entries; pub mod matcher; pub mod receiver; pub mod transform; -use entries::IterateEntries; +use entries::{IterateEntries, StreamInput, ZipFileInput}; use matcher::EntryMatcher; use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryKind, EntryReceiver, ExtractEntry}; use transform::NameTransformer; @@ -149,13 +150,15 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE let Extract { output_specs, entry_specs, - input_spec, + input_spec: InputSpec { + stdin_stream, + zip_paths, + }, } = extract; let err = Rc::new(RefCell::new(err)); let compiled_specs = receiver::process_entry_and_output_specs(err.clone(), entry_specs, output_specs)?; - let mut entry_iterator = entries::MergedInput::from_spec(input_spec)?; let mut copy_buf: Vec = vec![0u8; 1024 * 16]; @@ -165,16 +168,48 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE /* let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); */ let mut matching_handles: Vec> = Vec::new(); - while let Some(entry) = entry_iterator.next_entry()? 
{ - process_entry( - entry, - &err, - compiled_specs.iter(), - &mut copy_buf, - &mut matching_concats, - &mut deduped_concat_writers, - &mut matching_handles, - )?; + if stdin_stream { + writeln!(&mut err.borrow_mut(), "extracting from stdin").unwrap(); + let mut stdin = StreamInput::new(io::stdin().lock()); + + while let Some(entry) = stdin.next_entry()? { + process_entry( + entry, + &err, + compiled_specs.iter(), + &mut copy_buf, + &mut matching_concats, + &mut deduped_concat_writers, + &mut matching_handles, + )?; + } + } + + for p in zip_paths.into_iter() { + writeln!( + &mut err.borrow_mut(), + "extracting from zip input file {p:?}", + ) + .unwrap(); + let zip = fs::File::open(&p) + .wrap_err_with(|| format!("failed to open zip input file path {p:?}")) + .and_then(|f| { + ZipArchive::new(f) + .wrap_err_with(|| format!("failed to create zip archive for file {p:?}")) + })?; + let mut zip_entries = ZipFileInput::new(Box::new(zip)); + + while let Some(entry) = zip_entries.next_entry()? { + process_entry( + entry, + &err, + compiled_specs.iter(), + &mut copy_buf, + &mut matching_concats, + &mut deduped_concat_writers, + &mut matching_handles, + )?; + } } /* Finalize all extract entries. 
*/ diff --git a/cli/src/extract/entries.rs b/cli/src/extract/entries.rs index 52c8a07a2..bb46fb79b 100644 --- a/cli/src/extract/entries.rs +++ b/cli/src/extract/entries.rs @@ -1,11 +1,11 @@ -use std::{cell::UnsafeCell, collections::VecDeque, fs, io, ops, path::Path}; +use std::{fs, io, ops}; use zip::{ read::{read_zipfile_from_stream, ZipFile}, ZipArchive, }; -use crate::{args::extract::*, CommandError, WrapCommandErr}; +use crate::{CommandError, WrapCommandErr}; pub trait IterateEntries { fn next_entry(&mut self) -> Result, CommandError>; @@ -50,12 +50,6 @@ pub struct StreamInput { entries_read: usize, } -impl StreamInput { - pub fn stdin() -> Self { - Self::new(io::stdin()) - } -} - impl StreamInput { pub fn new(inner: R) -> Self { Self { @@ -136,101 +130,3 @@ where .wrap_err_with(|| format!("failed to read entry #{prev_counter} from zip",)) } } - -pub struct AllInputZips { - zips_todo: VecDeque>>>, - cur_zip: UnsafeCell>>>, -} - -impl AllInputZips { - pub fn new( - zip_paths: impl IntoIterator>, - ) -> Result { - let mut zips_todo = zip_paths - .into_iter() - .map(|p| { - fs::File::open(p.as_ref()) - .wrap_err_with(|| { - format!("failed to open zip input file path {:?}", p.as_ref()) - }) - .and_then(|f| { - ZipArchive::new(f).wrap_err_with(|| { - format!("failed to create zip archive for file {:?}", p.as_ref()) - }) - }) - .map(|zip| ZipFileInput::new(Box::new(zip))) - }) - .collect::, CommandError>>()?; - debug_assert!(!zips_todo.is_empty()); - let cur_zip = zips_todo.pop_front().unwrap(); - Ok(Self { - zips_todo, - cur_zip: UnsafeCell::new(cur_zip), - }) - } -} - -impl IterateEntries for AllInputZips { - fn next_entry(&mut self) -> Result, CommandError> { - loop { - if let Some(entry) = unsafe { &mut *self.cur_zip.get() }.next_entry()? 
{ - return Ok(Some(entry)); - } - match self.zips_todo.pop_front() { - Some(zip) => { - self.cur_zip = UnsafeCell::new(zip); - } - None => { - return Ok(None); - } - } - } - } -} - -pub struct MergedInput { - stdin_stream: Option>>, - zips: Option, -} - -impl MergedInput { - pub fn from_spec(spec: InputSpec) -> Result { - let InputSpec { - stdin_stream, - zip_paths, - } = spec; - Ok(Self { - stdin_stream: if stdin_stream { - Some(UnsafeCell::new(StreamInput::stdin())) - } else { - None - }, - zips: if zip_paths.is_empty() { - None - } else { - Some(AllInputZips::new(zip_paths)?) - }, - }) - } -} - -impl IterateEntries for MergedInput { - fn next_entry(&mut self) -> Result, CommandError> { - let mut completed_stdin: bool = false; - if let Some(stdin_stream) = self.stdin_stream.as_mut() { - if let Some(entry) = unsafe { &mut *stdin_stream.get() }.next_entry()? { - return Ok(Some(entry)); - } - completed_stdin = true; - } - if completed_stdin { - self.stdin_stream = None; - } - if let Some(zips) = self.zips.as_mut() { - if let Some(entry) = zips.next_entry()? 
{ - return Ok(Some(entry)); - } - } - Ok(None) - } -} From 22c417ba1f3fa0192316466dd03b1e3122cb9183 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 02:56:34 -0400 Subject: [PATCH 59/68] don't reallocate the symlink target --- cli/src/extract.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 2c942211d..1524bc0a9 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -24,6 +24,7 @@ fn process_entry<'a, 'w, 'it>( err: &Rc>, compiled_specs: impl Iterator>, copy_buf: &mut [u8], + symlink_target: &mut Vec, matching_concats: &mut Vec>>, deduped_concat_writers: &mut Vec>>, matching_handles: &mut Vec>, @@ -31,18 +32,22 @@ fn process_entry<'a, 'w, 'it>( where 'w: 'it, { - let symlink_target: Option> = { + deduped_concat_writers.clear(); + matching_handles.clear(); + + let symlink_target: Option<&mut Vec> = { let (kind, size) = { let data = EntryData::from_entry(&entry); (data.kind, data.size) }; match kind { EntryKind::Symlink => { - let mut target: Vec = Vec::with_capacity(size.try_into().unwrap()); + symlink_target.clear(); entry - .read_to_end(&mut target) + .read_to_end(symlink_target) .wrap_err("failed to read symlink target from zip archive entry")?; - Some(target) + debug_assert_eq!(symlink_target.len(), size.try_into().unwrap()); + Some(symlink_target) } _ => None, } @@ -137,12 +142,6 @@ where } } - /* matching_concats.clear(); */ - /* matching_extracts.clear(); */ - deduped_concat_writers.clear(); - /* deduped_matching_extracts.clear(); */ - matching_handles.clear(); - Ok(()) } @@ -161,6 +160,7 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE receiver::process_entry_and_output_specs(err.clone(), entry_specs, output_specs)?; let mut copy_buf: Vec = vec![0u8; 1024 * 16]; + let mut symlink_target: Vec = Vec::new(); let mut matching_concats: Vec>> = Vec::new(); /* let 
mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); */ @@ -178,6 +178,7 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE &err, compiled_specs.iter(), &mut copy_buf, + &mut symlink_target, &mut matching_concats, &mut deduped_concat_writers, &mut matching_handles, @@ -205,6 +206,7 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE &err, compiled_specs.iter(), &mut copy_buf, + &mut symlink_target, &mut matching_concats, &mut deduped_concat_writers, &mut matching_handles, From e2ec8682efe992be4e7157f2ca9fe541b1a451a6 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 03:12:27 -0400 Subject: [PATCH 60/68] move symlink processing to a helper --- cli/src/extract.rs | 52 +++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 1524bc0a9..43bfe6e0f 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -19,6 +19,39 @@ use matcher::EntryMatcher; use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryKind, EntryReceiver, ExtractEntry}; use transform::NameTransformer; +fn maybe_process_symlink<'a, 't>( + entry: &mut ZipFile<'a>, + err: &Rc>, + symlink_target: &'t mut Vec, +) -> Result, CommandError> { + let (kind, size) = { + /* FIXME: the ZipFile<'a> struct contains a *mutable* reference to the parent archive, + * and this actually imposes a mutable reference upon any references to the + * immutable ZipFileData contents. This means we cannot have any immutable + * references to the ZipFileData contents at the same time as a mutable + * reference. What this means here is that we have to create a temporary EntryData + * struct and then immediately throw it away in order to be able to read the entry + * contents with io::Read. ZipEntry<'a, R> from + * https://github.com/zip-rs/zip2/pull/233 avoids this issue!!! 
*/ + let data = EntryData::from_entry(&entry); + (data.kind, data.size) + }; + if !matches!(kind, EntryKind::Symlink) { + return Ok(None); + } + + /* We can't read the entry name from EntryData because we can't have any immutable + * references to ZipFileData like the name at the same time we use the entry as + * a reader! That means our log message here is very unclear! */ + writeln!(&mut err.borrow_mut(), "reading symlink target").unwrap(); + symlink_target.clear(); + entry + .read_to_end(symlink_target) + .wrap_err("failed to read symlink target from zip archive entry")?; + debug_assert_eq!(symlink_target.len(), size.try_into().unwrap()); + Ok(Some(symlink_target)) +} + fn process_entry<'a, 'w, 'it>( mut entry: ZipFile<'a>, err: &Rc>, @@ -35,23 +68,8 @@ where deduped_concat_writers.clear(); matching_handles.clear(); - let symlink_target: Option<&mut Vec> = { - let (kind, size) = { - let data = EntryData::from_entry(&entry); - (data.kind, data.size) - }; - match kind { - EntryKind::Symlink => { - symlink_target.clear(); - entry - .read_to_end(symlink_target) - .wrap_err("failed to read symlink target from zip archive entry")?; - debug_assert_eq!(symlink_target.len(), size.try_into().unwrap()); - Some(symlink_target) - } - _ => None, - } - }; + let symlink_target = maybe_process_symlink(&mut entry, err, symlink_target)?; + /* We dropped any mutable handles to the entry, so now we can access its metadata again. 
*/ let data = EntryData::from_entry(&entry); let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); From bc738568d06d4486eb8a91d504b0163727b92727 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 05:48:26 -0400 Subject: [PATCH 61/68] refactor a lot of extraction --- cli/src/args/info.rs | 1 - cli/src/extract.rs | 80 +++++------------------ cli/src/extract/matcher.rs | 22 ++++--- cli/src/extract/receiver.rs | 123 ++++++++++++++++++++++++++++++++++- cli/src/extract/transform.rs | 14 ++-- 5 files changed, 161 insertions(+), 79 deletions(-) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 5af76436d..780b696df 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -432,7 +432,6 @@ impl FormatSpec { "failed to parse archive format string {archive_format:?}: {e}" )) })?; - dbg!(&entry_format); let entry = ParseableFormatSpec::::parse_format(&entry_format) .map_err(|e| { Info::exit_arg_invalid(&format!( diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 43bfe6e0f..6719d5001 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, cell::RefCell, - fs, + fmt, fs, io::{self, Read, Write}, rc::Rc, }; @@ -15,9 +15,7 @@ pub mod matcher; pub mod receiver; pub mod transform; use entries::{IterateEntries, StreamInput, ZipFileInput}; -use matcher::EntryMatcher; -use receiver::{CompiledEntrySpec, ConcatEntry, EntryData, EntryKind, EntryReceiver, ExtractEntry}; -use transform::NameTransformer; +use receiver::{CompiledEntrySpec, EntryData, EntryKind, EntryReceiver, ExtractEntry}; fn maybe_process_symlink<'a, 't>( entry: &mut ZipFile<'a>, @@ -52,18 +50,18 @@ fn maybe_process_symlink<'a, 't>( Ok(Some(symlink_target)) } -fn process_entry<'a, 'w, 'it>( +fn process_entry<'a, 'w, 'c, 'it>( mut entry: ZipFile<'a>, err: &Rc>, - compiled_specs: impl Iterator>, + compiled_specs: impl Iterator> + fmt::Debug, copy_buf: &mut [u8], symlink_target: 
&mut Vec, - matching_concats: &mut Vec>>, - deduped_concat_writers: &mut Vec>>, + deduped_concat_writers: &mut Vec<&'c Rc>>, matching_handles: &mut Vec>, ) -> Result<(), CommandError> where 'w: 'it, + 'it: 'c, { deduped_concat_writers.clear(); matching_handles.clear(); @@ -72,63 +70,20 @@ where /* We dropped any mutable handles to the entry, so now we can access its metadata again. */ let data = EntryData::from_entry(&entry); - let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); - for spec in compiled_specs { - match spec { - CompiledEntrySpec::Concat(ConcatEntry { matcher, stream }) => { - if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { - matching_concats.push(stream.clone()); - } - } - CompiledEntrySpec::Extract(ExtractEntry { - matcher, - transforms, - recv, - }) => { - if matcher.as_ref().map(|m| m.matches(&data)).unwrap_or(true) { - let new_name = transforms - .as_ref() - .map(|t| t.transform_name(&data.name)) - .unwrap_or_else(|| Cow::Borrowed(&data.name)); - writeln!(&mut err.borrow_mut(), "{data:?}").unwrap(); - writeln!(&mut err.borrow_mut(), "{new_name:?}").unwrap(); - matching_extracts.push((new_name, recv.clone())); - } - } - } - } - if matching_concats.is_empty() && matching_extracts.is_empty() { - return Ok(()); - } - - /* Split output handles for concat, and split generated handles by extract source and - * name. use Rc::ptr_eq() to split, and Cow::<'s, str>::eq() with str AsRef. */ - for concat_p in matching_concats.drain(..) 
{ - if deduped_concat_writers - .iter() - .any(|p| Rc::ptr_eq(p, &concat_p)) - { - writeln!(&mut err.borrow_mut(), "skipping repeated concat").unwrap(); - } else { - deduped_concat_writers.push(concat_p); - } - } - let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); - for (name, extract_p) in matching_extracts.into_iter() { - if deduped_matching_extracts - .iter() - .any(|(n, p)| Rc::ptr_eq(p, &extract_p) && name.as_ref() == n.as_ref()) + let mut deduped_matching_extracts: Vec<(&'c Rc, Vec>)> = + Vec::new(); + for matching_spec in compiled_specs.filter_map(|spec| spec.try_match_and_transform(&data)) { + if matching_spec.is_nested_duplicate(deduped_concat_writers, &mut deduped_matching_extracts) { - writeln!(&mut err.borrow_mut(), "skipping repeated extract").unwrap(); - } else { - deduped_matching_extracts.push((name, extract_p)); + writeln!(&mut err.borrow_mut(), "skipping repeated output").unwrap(); } } matching_handles.extend( deduped_matching_extracts .into_iter() - .map(|(name, recv)| { + .flat_map(|(recv, names)| names.into_iter().map(move |n| (recv, n))) + .map(|(recv, name)| { recv.generate_entry_handle(data, symlink_target.as_ref().map(|t| t.as_ref()), name) }) .collect::, _>>()? 
@@ -174,16 +129,15 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE } = extract; let err = Rc::new(RefCell::new(err)); + writeln!(&mut err.borrow_mut(), "entry specs: {entry_specs:?}").unwrap(); let compiled_specs = receiver::process_entry_and_output_specs(err.clone(), entry_specs, output_specs)?; + writeln!(&mut err.borrow_mut(), "compiled specs: {compiled_specs:?}").unwrap(); let mut copy_buf: Vec = vec![0u8; 1024 * 16]; let mut symlink_target: Vec = Vec::new(); - let mut matching_concats: Vec>> = Vec::new(); - /* let mut matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); */ - let mut deduped_concat_writers: Vec>> = Vec::new(); - /* let mut deduped_matching_extracts: Vec<(Cow<'_, str>, Rc)> = Vec::new(); */ + let mut deduped_concat_writers: Vec<&Rc>> = Vec::new(); let mut matching_handles: Vec> = Vec::new(); if stdin_stream { @@ -197,7 +151,6 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE compiled_specs.iter(), &mut copy_buf, &mut symlink_target, - &mut matching_concats, &mut deduped_concat_writers, &mut matching_handles, )?; @@ -225,7 +178,6 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE compiled_specs.iter(), &mut copy_buf, &mut symlink_target, - &mut matching_concats, &mut deduped_concat_writers, &mut matching_handles, )?; diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index 1fabf0220..af382369d 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -1,4 +1,4 @@ -use std::borrow::Cow; +use std::{borrow::Cow, fmt}; #[cfg(feature = "glob")] use glob; @@ -97,13 +97,14 @@ impl MatchModifiers { } } -trait NameMatcher { +trait NameMatcher: fmt::Debug { fn create(pattern: String, opts: MatchModifiers) -> Result where Self: Sized; fn matches(&self, input: &str) -> bool; } +#[derive(Debug)] struct LiteralMatcher { lit: String, case: CaseSensitivity, @@ -148,6 +149,7 @@ impl NameMatcher for LiteralMatcher { } } 
+#[derive(Debug)] #[cfg(feature = "glob")] struct GlobMatcher { pat: glob::Pattern, @@ -186,6 +188,7 @@ impl NameMatcher for GlobMatcher { } } +#[derive(Debug)] #[cfg(feature = "rx")] struct RegexMatcher { pat: regex::Regex, @@ -220,7 +223,7 @@ impl NameMatcher for RegexMatcher { } } -pub trait EntryMatcher { +pub trait EntryMatcher: fmt::Debug { type Arg where Self: Sized; @@ -230,7 +233,7 @@ pub trait EntryMatcher { fn matches(&self, entry: &EntryData) -> bool; } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] enum TrivialMatcher { True, False, @@ -257,7 +260,7 @@ impl EntryMatcher for TrivialMatcher { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] enum EntryTypeMatcher { File, Dir, @@ -288,7 +291,7 @@ impl EntryMatcher for EntryTypeMatcher { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] enum NonSpecificMethods { Any, Known, @@ -317,6 +320,7 @@ impl EntryMatcher for NonSpecificMethods { } } +#[derive(Debug)] struct SpecificMethods { specific_method: CompressionMethod, } @@ -338,7 +342,7 @@ impl EntryMatcher for SpecificMethods { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] enum DepthLimit { Max(usize), Min(usize), @@ -366,7 +370,7 @@ impl EntryMatcher for DepthLimit { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] enum Size { Max(u64), Min(u64), @@ -393,6 +397,7 @@ impl EntryMatcher for Size { } } +#[derive(Debug)] struct PatternMatcher { matcher: Box, comp_sel: ComponentSelector, @@ -452,6 +457,7 @@ impl EntryMatcher for PatternMatcher { } } +#[derive(Debug)] pub enum CompiledMatcher { Primitive(Box), Negated(Box), diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index bc4450eef..9fcc08799 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -2,7 +2,7 @@ use std::{ borrow::Cow, cell::RefCell, collections::{HashMap, HashSet}, - fs, + fmt, fs, io::{self, Seek, Write}, mem, path::PathBuf, @@ -99,17 +99,126 @@ pub struct ConcatEntry<'w> { pub stream: Rc>, } 
+impl<'w> fmt::Debug for ConcatEntry<'w> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "ConcatEntry {{ matcher: {:?}, stream: {:p} }}", + &self.matcher, &self.stream + ) + } +} + +impl<'w> ConcatEntry<'w> { + pub fn do_match<'a>(&self, data: &EntryData<'a>) -> Option<&Rc>> { + if self + .matcher + .as_ref() + .map(|m| m.matches(data)) + .unwrap_or(true) + { + Some(&self.stream) + } else { + None + } + } +} + +#[derive(Debug)] pub struct ExtractEntry<'w> { pub matcher: Option, pub transforms: Option, pub recv: Rc, } +impl<'w> ExtractEntry<'w> { + pub fn do_match_and_transform<'a>( + &self, + data: &EntryData<'a>, + ) -> Option<(Cow<'a, str>, &Rc)> { + if self + .matcher + .as_ref() + .map(|m| m.matches(data)) + .unwrap_or(true) + { + let new_name = self + .transforms + .as_ref() + .map(|t| t.transform_name(data.name)) + .unwrap_or_else(|| Cow::Borrowed(data.name)); + Some((new_name, &self.recv)) + } else { + None + } + } +} + +#[derive(Debug)] pub enum CompiledEntrySpec<'w> { Concat(ConcatEntry<'w>), Extract(ExtractEntry<'w>), } +impl<'w> CompiledEntrySpec<'w> { + pub fn try_match_and_transform<'a>( + &self, + data: &EntryData<'a>, + ) -> Option> { + match self { + Self::Concat(c) => c.do_match(data).map(MatchingEntrySpec::Concat), + Self::Extract(e) => e + .do_match_and_transform(data) + .map(|(n, p)| MatchingEntrySpec::Extract(n, p)), + } + } +} + +pub enum MatchingEntrySpec<'a, 'c, 'w> { + Concat(&'c Rc>), + Extract(Cow<'a, str>, &'c Rc), +} + +impl<'a, 'c, 'w> MatchingEntrySpec<'a, 'c, 'w> { + /* Split output handles for concat, and split generated handles by extract source and + * name. use ptr::eq() to split, and Cow::<'s, str>::eq() with str AsRef. 
*/ + pub fn is_nested_duplicate( + self, + deduped_concat_writers: &mut Vec<&'c Rc>>, + deduped_matching_extracts: &mut Vec<(&'c Rc, Vec>)>, + ) -> bool { + match self { + MatchingEntrySpec::Concat(concat_writer) => { + if deduped_concat_writers + .iter() + .any(|p| Rc::ptr_eq(p, &concat_writer)) + { + true + } else { + deduped_concat_writers.push(concat_writer); + false + } + } + MatchingEntrySpec::Extract(name, extract_receiver) => { + if let Some((_, names)) = deduped_matching_extracts + .iter_mut() + .find(|(p, _)| Rc::ptr_eq(p, &extract_receiver)) + { + if !names.iter().any(|n| n.as_ref() == name.as_ref()) { + names.push(name); + false + } else { + true + } + } else { + deduped_matching_extracts.push((extract_receiver, vec![name])); + false + } + } + } + } +} + struct ParsedNamedOutputs<'w> { concats: HashMap>>, extracts: HashMap>, @@ -371,7 +480,7 @@ impl<'w> ParsedNamedOutputs<'w> { } } -pub trait EntryReceiver { +pub trait EntryReceiver: fmt::Debug { fn generate_entry_handle<'s>( &self, data: EntryData<'s>, @@ -407,6 +516,16 @@ impl FilesystemReceiver { } } +impl fmt::Debug for FilesystemReceiver { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "FilesystemReceiver {{ output_dir: {:?} }}", + &self.output_dir + ) + } +} + impl EntryReceiver for FilesystemReceiver where W: Write, diff --git a/cli/src/extract/transform.rs b/cli/src/extract/transform.rs index cc6ad710d..9494da36d 100644 --- a/cli/src/extract/transform.rs +++ b/cli/src/extract/transform.rs @@ -1,4 +1,4 @@ -use std::{borrow::Cow, collections::VecDeque, ops, path::Path, str}; +use std::{borrow::Cow, collections::VecDeque, fmt, ops, path::Path, str}; #[cfg(feature = "rx")] use regex; @@ -6,7 +6,7 @@ use regex; use super::matcher::{CaseSensitivity, SearchAnchoring}; use crate::{args::extract::*, CommandError}; -pub trait NameTransformer { +pub trait NameTransformer: fmt::Debug { type Arg where Self: Sized; @@ -16,7 +16,7 @@ pub trait NameTransformer { fn 
transform_name<'s>(&self, name: &'s str) -> Cow<'s, str>; } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] enum Trivial { Identity, } @@ -38,6 +38,7 @@ impl NameTransformer for Trivial { } } +#[derive(Debug)] struct StripComponents { num_components_to_strip: usize, } @@ -79,6 +80,7 @@ impl NameTransformer for StripComponents { } } +#[derive(Debug)] struct AddPrefix { prefix_to_add: String, } @@ -142,7 +144,7 @@ impl ReplaceModifiers { } } -trait PatternTransformer { +trait PatternTransformer: fmt::Debug { type Replacement where Self: Sized; @@ -157,6 +159,7 @@ trait PatternTransformer { fn replace<'s>(&self, input: &'s str) -> Cow<'s, str>; } +#[derive(Debug)] struct LiteralTransformer { lit: String, case: CaseSensitivity, @@ -354,6 +357,7 @@ impl PatternTransformer for LiteralTransformer { } } +#[derive(Debug)] #[cfg(feature = "rx")] struct RegexpTransformer { pat: regex::Regex, @@ -555,6 +559,7 @@ impl SubstringAnchoring { } } +#[derive(Debug)] struct ComponentTransformer { pattern_trans: Box, comp_sel: ComponentSelector, @@ -612,6 +617,7 @@ impl NameTransformer for ComponentTransformer { } } +#[derive(Debug)] pub struct CompiledTransformer { transformers: Vec>, } From 0ccd578cc328b440f3ce81beba92266bb3401953 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 06:12:37 -0400 Subject: [PATCH 62/68] ok extract makes a lot more sense now --- cli/src/extract.rs | 14 ++++---------- cli/src/extract/receiver.rs | 6 +++--- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 6719d5001..3724e88a8 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, cell::RefCell, - fmt, fs, + fs, io::{self, Read, Write}, rc::Rc, }; @@ -53,7 +53,7 @@ fn maybe_process_symlink<'a, 't>( fn process_entry<'a, 'w, 'c, 'it>( mut entry: ZipFile<'a>, err: &Rc>, - compiled_specs: impl Iterator> + fmt::Debug, + compiled_specs: 
impl Iterator>, copy_buf: &mut [u8], symlink_target: &mut Vec, deduped_concat_writers: &mut Vec<&'c Rc>>, @@ -70,7 +70,7 @@ where /* We dropped any mutable handles to the entry, so now we can access its metadata again. */ let data = EntryData::from_entry(&entry); - let mut deduped_matching_extracts: Vec<(&'c Rc, Vec>)> = + let mut deduped_matching_extracts: Vec<(&'c Rc, Vec>)> = Vec::new(); for matching_spec in compiled_specs.filter_map(|spec| spec.try_match_and_transform(&data)) { if matching_spec.is_nested_duplicate(deduped_concat_writers, &mut deduped_matching_extracts) @@ -83,18 +83,12 @@ where deduped_matching_extracts .into_iter() .flat_map(|(recv, names)| names.into_iter().map(move |n| (recv, n))) - .map(|(recv, name)| { - recv.generate_entry_handle(data, symlink_target.as_ref().map(|t| t.as_ref()), name) - }) + .map(|(recv, name)| recv.generate_entry_handle(data, symlink_target.as_deref(), name)) .collect::, _>>()? .into_iter() .flatten(), ); - /* let mut derefed_concat_writers: Vec> = deduped_concat_writers */ - /* .drain(..) 
*/ - /* .map(|w| w.borrow_mut()) */ - /* .collect(); */ let mut read_len: usize; loop { read_len = entry.read(copy_buf).wrap_err("read of entry failed")?; diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 9fcc08799..75333213e 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -204,11 +204,11 @@ impl<'a, 'c, 'w> MatchingEntrySpec<'a, 'c, 'w> { .iter_mut() .find(|(p, _)| Rc::ptr_eq(p, &extract_receiver)) { - if !names.iter().any(|n| n.as_ref() == name.as_ref()) { + if names.iter().any(|n| n.as_ref() == name.as_ref()) { + true + } else { names.push(name); false - } else { - true } } else { deduped_matching_extracts.push((extract_receiver, vec![name])); From 4167b1151697ecb2e49c11b5cd274eab2fd2f0ba Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 06:25:03 -0400 Subject: [PATCH 63/68] support --archive-comment for compression --- cli/src/args/compress.rs | 31 ++++++++++++++++++++++++++++++- cli/src/compress.rs | 7 +++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/cli/src/args/compress.rs b/cli/src/args/compress.rs index d1b82a448..796b47990 100644 --- a/cli/src/args/compress.rs +++ b/cli/src/args/compress.rs @@ -49,6 +49,7 @@ pub enum OutputType { #[derive(Debug)] pub struct Compress { pub output: OutputType, + pub archive_comment: Option, pub args: Vec, pub positional_paths: Vec, } @@ -77,7 +78,8 @@ impl CommandFormat for Compress { const COMMAND_DESCRIPTION: &'static str = "Generate an archive from data in argument strings or read from the filesystem."; - const USAGE_LINE: &'static str = "[-h|--help] [OUTPUT-FLAGS] [ENTRY]... [--] [PATH]..."; + const USAGE_LINE: &'static str = + "[-h|--help] [OUTPUT-FLAGS] [--archive-comment ] [ENTRY]... [--] [PATH]..."; fn generate_help() -> String { format!( @@ -100,6 +102,13 @@ Where and how to write the generated zip archive. 
--stdout Allow writing output to stdout even if stdout is a tty. +Global flags: +These flags describe information set for the entire produced archive. + + --archive-comment + If provided, this will set the archive's comment field to the + specified bytes. This does not need to be valid unicode. + Entries: After output flags are provided, the rest of the command line is attributes and entry data. Attributes modify later entries. @@ -206,6 +215,7 @@ Positional entries: let mut allow_stdout: bool = false; let mut append_to_output_path: bool = false; let mut output_path: Option = None; + let mut archive_comment: Option = None; let mut args: Vec = Vec::new(); let mut positional_paths: Vec = Vec::new(); @@ -268,6 +278,24 @@ Positional entries: } } + /* Global flags */ + b"--archive-comment" => { + let new_comment = argv.pop_front().ok_or_else(|| { + Self::exit_arg_invalid("no argument provided for --archive-comment") + })?; + if let Some(prev_comment) = archive_comment.take() { + return Err(Self::exit_arg_invalid(&format!( + "--archive-comment provided twice: {prev_comment:?} and {new_comment:?}" + ))); + } else if !args.is_empty() || !positional_paths.is_empty() { + return Err(Self::exit_arg_invalid( + "--archive-comment provided after entries", + )); + } else { + archive_comment = Some(new_comment); + } + } + /* Attributes */ b"-c" | b"--compression-method" => match argv.pop_front() { None => { @@ -437,6 +465,7 @@ Positional entries: Ok(Self { output, + archive_comment, args, positional_paths, }) diff --git a/cli/src/compress.rs b/cli/src/compress.rs index a61fa2018..784f835b1 100644 --- a/cli/src/compress.rs +++ b/cli/src/compress.rs @@ -130,6 +130,7 @@ fn enter_recursive_dir_entries( pub fn execute_compress(mut err: impl Write, args: Compress) -> Result<(), CommandError> { let Compress { output, + archive_comment, args, positional_paths, } = args; @@ -203,6 +204,12 @@ pub fn execute_compress(mut err: impl Write, args: Compress) -> Result<(), Comma ZipWriter::new(out) 
}; + if let Some(comment) = archive_comment { + writeln!(err, "comment was provided: {comment:?}").unwrap(); + let comment = comment.into_encoded_bytes(); + writer.set_raw_comment(comment.into()); + } + let mut options = SimpleFileOptions::default() .compression_method(CompressionMethod::Deflated) .large_file(false); From fc90ce65cfdc387d2edd8c085312d94b40610946 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 06:30:32 -0400 Subject: [PATCH 64/68] add a TODO --- src/write.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/write.rs b/src/write.rs index 9ba9a63a6..a8bd4ecd5 100644 --- a/src/write.rs +++ b/src/write.rs @@ -781,6 +781,8 @@ impl ZipWriter { } } +/* TODO: consider a ZipWriter which works with just a Write bound to support streaming output? This + * would require some work, but is possible in the protocol. */ impl ZipWriter { /// Initializes the archive. /// From c19b5549c610abfabc3e6d88945604d462de61e0 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 06:49:52 -0400 Subject: [PATCH 65/68] make symlink creation more readable --- cli/src/extract.rs | 8 ++-- cli/src/extract/receiver.rs | 94 ++++++++++++++++++++----------------- 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 3724e88a8..579b760b3 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -42,6 +42,8 @@ fn maybe_process_symlink<'a, 't>( * references to ZipFileData like the name at the same time we use the entry as * a reader! That means our log message here is very unclear! */ writeln!(&mut err.borrow_mut(), "reading symlink target").unwrap(); + /* Re-use the vector allocation, but make sure to avoid re-using the symlink data from + * a previous iteration. 
*/ symlink_target.clear(); entry .read_to_end(symlink_target) @@ -57,7 +59,7 @@ fn process_entry<'a, 'w, 'c, 'it>( copy_buf: &mut [u8], symlink_target: &mut Vec, deduped_concat_writers: &mut Vec<&'c Rc>>, - matching_handles: &mut Vec>, + matching_handles: &mut Vec>, ) -> Result<(), CommandError> where 'w: 'it, @@ -131,8 +133,8 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), CommandE let mut copy_buf: Vec = vec![0u8; 1024 * 16]; let mut symlink_target: Vec = Vec::new(); - let mut deduped_concat_writers: Vec<&Rc>> = Vec::new(); - let mut matching_handles: Vec> = Vec::new(); + let mut deduped_concat_writers: Vec<&Rc>> = Vec::new(); + let mut matching_handles: Vec> = Vec::new(); if stdin_stream { writeln!(&mut err.borrow_mut(), "extracting from stdin").unwrap(); diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 75333213e..021efeada 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -5,7 +5,7 @@ use std::{ fmt, fs, io::{self, Seek, Write}, mem, - path::PathBuf, + path::{Path, PathBuf}, rc::Rc, }; @@ -526,6 +526,54 @@ impl fmt::Debug for FilesystemReceiver { } } +impl FilesystemReceiver +where + W: Write, +{ + #[cfg(unix)] + fn create_or_overwrite_symlink( + err: &mut impl Write, + target: &[u8], + full_output_path: &Path, + ) -> Result<(), CommandError> { + use std::{ + ffi::OsStr, + os::unix::{ffi::OsStrExt, fs::symlink}, + }; + let target = OsStr::from_bytes(target); + writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); + /* The stdlib symlink function has no functionality like OpenOptions to + * truncate a symlink if it already exists, so we have to do that ourselves + * here. 
*/ + if let Err(e) = symlink(target, full_output_path) { + let e = match e.kind() { + io::ErrorKind::AlreadyExists => { + writeln!(err, "a file already existed at the symlink target {full_output_path:?}, removing") + .unwrap(); + fs::remove_file(full_output_path).wrap_err_with(|| { + format!("failed to remove file at symlink target {full_output_path:?}") + })?; + writeln!( + err, + "successfully removed file entry, creating symlink again" + ) + .unwrap(); + symlink(target, full_output_path).err() + } + _ => Some(e), + }; + if let Some(e) = e { + return Err(e).wrap_err_with(|| { + format!( + "failed to create symlink at {full_output_path:?} with target {target:?}" + ) + }); + } + } + Ok(()) + } +} + impl EntryReceiver for FilesystemReceiver where W: Write, @@ -553,50 +601,12 @@ where })?; } EntryKind::Symlink => { - let target: Vec = symlink_target - .expect("we should have generated this") - .to_vec(); + let target = symlink_target.expect("we should have generated this"); #[cfg(unix)] - { - use std::{ - ffi::OsString, - os::unix::{ffi::OsStringExt, fs::symlink}, - }; - let target = OsString::from_vec(target); - writeln!(err, "entry is symlink to {target:?}, creating").unwrap(); - /* The stdlib symlink function has no functionality like OpenOptions to - * truncate a symlink if it already exists, so we have to do that ourselves - * here. 
*/ - if let Err(e) = symlink(&target, &full_output_path) { - let e = match e.kind() { - io::ErrorKind::AlreadyExists => { - writeln!(err, "a file already existed at the symlink target {full_output_path:?}, removing") - .unwrap(); - fs::remove_file(&full_output_path) - .wrap_err_with(|| format!("failed to remove file at symlink target {full_output_path:?}"))?; - writeln!( - err, - "successfully removed file entry, creating symlink again" - ) - .unwrap(); - symlink(&target, &full_output_path).err() - } - _ => Some(e), - }; - if let Some(e) = e { - return Err(e).wrap_err_with(|| { - format!( - "failed to create symlink at {full_output_path:?} with target {target:?}" - ) - }); - } - } - } + Self::create_or_overwrite_symlink(&mut *err, target, &full_output_path)?; #[cfg(not(unix))] - { - todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); - } + todo!("TODO: cannot create symlink for entry {name} on non-unix yet!"); } EntryKind::File => { writeln!(err, "entry is file, creating").unwrap(); From bc1c6ac541c100cf3390335647a9862edad06107 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 06:59:22 -0400 Subject: [PATCH 66/68] refactor output parsing into modules --- cli/src/extract.rs | 3 +- cli/src/extract/named_outputs.rs | 320 +++++++++++++++++++++++++++++++ cli/src/extract/receiver.rs | 312 +----------------------------- 3 files changed, 325 insertions(+), 310 deletions(-) create mode 100644 cli/src/extract/named_outputs.rs diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 579b760b3..69efe3deb 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -12,6 +12,7 @@ use crate::{args::extract::*, CommandError, WrapCommandErr}; pub mod entries; pub mod matcher; +pub mod named_outputs; pub mod receiver; pub mod transform; use entries::{IterateEntries, StreamInput, ZipFileInput}; @@ -127,7 +128,7 @@ pub fn execute_extract(err: impl Write, extract: Extract) -> Result<(), 
CommandE writeln!(&mut err.borrow_mut(), "entry specs: {entry_specs:?}").unwrap(); let compiled_specs = - receiver::process_entry_and_output_specs(err.clone(), entry_specs, output_specs)?; + named_outputs::process_entry_and_output_specs(err.clone(), entry_specs, output_specs)?; writeln!(&mut err.borrow_mut(), "compiled specs: {compiled_specs:?}").unwrap(); let mut copy_buf: Vec = vec![0u8; 1024 * 16]; diff --git a/cli/src/extract/named_outputs.rs b/cli/src/extract/named_outputs.rs new file mode 100644 index 000000000..0406a36aa --- /dev/null +++ b/cli/src/extract/named_outputs.rs @@ -0,0 +1,320 @@ +use std::{ + cell::RefCell, + collections::{HashMap, HashSet}, + fs, + io::{self, Seek, Write}, + path::PathBuf, + rc::Rc, +}; + +use super::matcher::{CompiledMatcher, EntryMatcher}; +use super::receiver::{ + CompiledEntrySpec, ConcatEntry, EntryReceiver, ExtractEntry, FilesystemReceiver, +}; +use super::transform::{CompiledTransformer, NameTransformer}; +use crate::{args::extract::*, CommandError, WrapCommandErr}; + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct OutputName(pub String); + +impl OutputName { + pub fn default_name() -> Self { + Self("default".to_string()) + } +} + +struct ParsedEntrySpecArg { + pub matcher: Option, + pub transforms: Option, + pub output_name: OutputName, +} + +impl ParsedEntrySpecArg { + pub fn from_entry_spec(spec: EntrySpec) -> Result { + let EntrySpec { + match_expr, + name_transforms, + content_transform, + } = spec; + let matcher = match match_expr { + None => None, + Some(expr) => Some(CompiledMatcher::from_arg(expr)?), + }; + let transforms = if name_transforms.is_empty() { + None + } else { + Some(CompiledTransformer::from_arg(name_transforms)?) 
+ }; + let output_name = match content_transform { + ContentTransform::Extract { name } => name + .map(OutputName) + .unwrap_or_else(OutputName::default_name), + }; + Ok(Self { + matcher, + transforms, + output_name, + }) + } +} + +struct ParsedNamedOutputs<'w> { + concats: HashMap>>, + extracts: HashMap>, +} + +pub fn process_entry_and_output_specs<'w>( + err: Rc>, + entry_specs: impl IntoIterator, + output_specs: OutputSpecs, +) -> Result>, CommandError> { + let mut entry_specs: Vec = entry_specs + .into_iter() + .map(ParsedEntrySpecArg::from_entry_spec) + .collect::>()?; + if entry_specs.is_empty() { + entry_specs.push(ParsedEntrySpecArg { + matcher: None, + transforms: None, + output_name: OutputName::default_name(), + }); + } + let parsed_outputs = ParsedNamedOutputs::from_output_specs(err, output_specs)?; + parsed_outputs.process_entry_specs_for_outputs(entry_specs) +} + +impl<'w> ParsedNamedOutputs<'w> { + pub fn process_entry_specs_for_outputs( + self, + args: impl IntoIterator, + ) -> Result>, CommandError> { + args.into_iter() + .map(|arg| self.lookup_entry_spec_arg(arg)) + .collect() + } + + fn lookup_entry_spec_arg( + &self, + arg: ParsedEntrySpecArg, + ) -> Result, CommandError> { + let ParsedEntrySpecArg { + matcher, + transforms, + output_name, + } = arg; + if let Some(stream) = self.concats.get(&output_name) { + if transforms.is_some() { + return Err(CommandError::InvalidArg(format!( + "entry name transforms do not apply to concat output {output_name:?}" + ))); + } + return Ok(CompiledEntrySpec::Concat(ConcatEntry { + matcher, + stream: stream.clone(), + })); + } + let Some(recv) = self.extracts.get(&output_name) else { + return Err(CommandError::InvalidArg(format!( + "output name {output_name:?} was not found" + ))); + }; + Ok(CompiledEntrySpec::Extract(ExtractEntry { + matcher, + transforms, + recv: recv.clone(), + })) + } + + fn add_stdout( + seen_stdout: &mut bool, + name: OutputName, + seen_names: &mut HashSet, + concats: &mut HashMap>>, + ) -> 
Result<(), CommandError> { + if *seen_stdout { + return Err(CommandError::InvalidArg( + "--stdout output provided for more than one receiver".to_string(), + )); + } + if seen_names.contains(&name) { + return Err(CommandError::InvalidArg(format!( + "output name {name:?} provided more than once" + ))); + } + assert!(!concats.contains_key(&name)); + + let handle: Rc> = Rc::new(RefCell::new(io::stdout())); + + *seen_stdout = true; + assert!(seen_names.insert(name.clone())); + assert!(concats.insert(name, handle).is_none()); + Ok(()) + } + + fn add_file( + path: PathBuf, + append: bool, + name: OutputName, + seen_files: &mut HashSet, + seen_names: &mut HashSet, + concats: &mut HashMap>>, + ) -> Result<(), CommandError> { + if seen_names.contains(&name) { + return Err(CommandError::InvalidArg(format!( + "output name {name:?} provided more than once" + ))); + } + assert!(!concats.contains_key(&name)); + + let handle: Rc> = { + let mut f: fs::File = if append { + fs::OpenOptions::new() + .write(true) + .create(true) + .open(&path) + .wrap_err_with(|| format!("failed to open file for append at {path:?}"))? + } else { + fs::File::create(&path) + .wrap_err_with(|| format!("failed to open file with truncation at {path:?}"))? 
+ }; + f.seek(io::SeekFrom::End(0)) + .wrap_err_with(|| format!("failed to seek to end of opened file {f:?}"))?; + Rc::new(RefCell::new(f)) + }; + + let canon_path = path + .canonicalize() + .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; + if seen_files.contains(&canon_path) { + return Err(CommandError::InvalidArg(format!( + "canonical output file path {canon_path:?} provided more than once" + ))); + } + + assert!(seen_files.insert(canon_path)); + assert!(seen_names.insert(name.clone())); + assert!(concats.insert(name, handle).is_none()); + Ok(()) + } + + fn add_dir( + err: Rc>, + output_dir: PathBuf, + mkdir: bool, + name: OutputName, + seen_dirs: &mut HashSet, + seen_names: &mut HashSet, + extracts: &mut HashMap>, + ) -> Result<(), CommandError> { + if seen_names.contains(&name) { + return Err(CommandError::InvalidArg(format!( + "output name {name:?} provided more than once" + ))); + } + assert!(!extracts.contains_key(&name)); + + if mkdir { + fs::create_dir_all(&output_dir) + .wrap_err_with(|| format!("failed to create output directory {output_dir:?}"))?; + }; + + let canon_path = output_dir + .canonicalize() + .wrap_err_with(|| format!("canonicalizing dir path {output_dir:?} failed"))?; + if seen_dirs.contains(&canon_path) { + return Err(CommandError::InvalidArg(format!( + "canonical output dir path {canon_path:?} provided more than once" + ))); + } + + let handle: Rc = { + let d = FilesystemReceiver::new(err, output_dir); + Rc::new(d) + }; + + assert!(seen_dirs.insert(canon_path)); + assert!(seen_names.insert(name.clone())); + assert!(extracts.insert(name, handle).is_none()); + Ok(()) + } + + pub fn from_output_specs( + err: Rc>, + spec: OutputSpecs, + ) -> Result { + let OutputSpecs { default, named } = spec; + + let mut concats: HashMap>> = HashMap::new(); + let mut extracts: HashMap> = HashMap::new(); + + let mut seen_stdout: bool = false; + let mut seen_files: HashSet = HashSet::new(); + let mut seen_dirs: HashSet = HashSet::new(); + 
let mut seen_names: HashSet = HashSet::new(); + + if let Some(default) = default { + match default { + OutputCollation::ConcatenateStdout => { + Self::add_stdout( + &mut seen_stdout, + OutputName::default_name(), + &mut seen_names, + &mut concats, + )?; + } + OutputCollation::ConcatenateFile { path, append } => { + Self::add_file( + path, + append, + OutputName::default_name(), + &mut seen_files, + &mut seen_names, + &mut concats, + )?; + } + OutputCollation::Filesystem { output_dir, mkdir } => { + Self::add_dir( + err.clone(), + output_dir, + mkdir, + OutputName::default_name(), + &mut seen_dirs, + &mut seen_names, + &mut extracts, + )?; + } + } + } + for NamedOutput { name, output } in named.into_iter() { + let name = OutputName(name); + match output { + OutputCollation::ConcatenateStdout => { + Self::add_stdout(&mut seen_stdout, name, &mut seen_names, &mut concats)?; + } + OutputCollation::ConcatenateFile { path, append } => { + Self::add_file( + path, + append, + name, + &mut seen_files, + &mut seen_names, + &mut concats, + )?; + } + OutputCollation::Filesystem { output_dir, mkdir } => { + Self::add_dir( + err.clone(), + output_dir, + mkdir, + name, + &mut seen_dirs, + &mut seen_names, + &mut extracts, + )?; + } + } + } + + Ok(Self { concats, extracts }) + } +} diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 021efeada..6b106dbd2 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -1,9 +1,8 @@ use std::{ borrow::Cow, cell::RefCell, - collections::{HashMap, HashSet}, fmt, fs, - io::{self, Seek, Write}, + io::{self, Write}, mem, path::{Path, PathBuf}, rc::Rc, @@ -13,7 +12,7 @@ use zip::{read::ZipFile, CompressionMethod}; use super::matcher::{CompiledMatcher, EntryMatcher}; use super::transform::{CompiledTransformer, NameTransformer}; -use crate::{args::extract::*, CommandError, WrapCommandErr}; +use crate::{CommandError, WrapCommandErr}; #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub 
enum EntryKind { @@ -50,50 +49,6 @@ impl<'a> EntryData<'a> { } } -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -struct OutputName(pub String); - -impl OutputName { - pub fn default_name() -> Self { - Self("default".to_string()) - } -} - -struct ParsedEntrySpecArg { - pub matcher: Option, - pub transforms: Option, - pub output_name: OutputName, -} - -impl ParsedEntrySpecArg { - pub fn from_entry_spec(spec: EntrySpec) -> Result { - let EntrySpec { - match_expr, - name_transforms, - content_transform, - } = spec; - let matcher = match match_expr { - None => None, - Some(expr) => Some(CompiledMatcher::from_arg(expr)?), - }; - let transforms = if name_transforms.is_empty() { - None - } else { - Some(CompiledTransformer::from_arg(name_transforms)?) - }; - let output_name = match content_transform { - ContentTransform::Extract { name } => name - .map(OutputName) - .unwrap_or_else(OutputName::default_name), - }; - Ok(Self { - matcher, - transforms, - output_name, - }) - } -} - pub struct ConcatEntry<'w> { pub matcher: Option, pub stream: Rc>, @@ -219,267 +174,6 @@ impl<'a, 'c, 'w> MatchingEntrySpec<'a, 'c, 'w> { } } -struct ParsedNamedOutputs<'w> { - concats: HashMap>>, - extracts: HashMap>, -} - -pub fn process_entry_and_output_specs<'w>( - err: Rc>, - entry_specs: impl IntoIterator, - output_specs: OutputSpecs, -) -> Result>, CommandError> { - let mut entry_specs: Vec = entry_specs - .into_iter() - .map(ParsedEntrySpecArg::from_entry_spec) - .collect::>()?; - if entry_specs.is_empty() { - entry_specs.push(ParsedEntrySpecArg { - matcher: None, - transforms: None, - output_name: OutputName::default_name(), - }); - } - let parsed_outputs = ParsedNamedOutputs::from_output_specs(err, output_specs)?; - parsed_outputs.process_entry_specs_for_outputs(entry_specs) -} - -impl<'w> ParsedNamedOutputs<'w> { - pub fn process_entry_specs_for_outputs( - self, - args: impl IntoIterator, - ) -> Result>, CommandError> { - args.into_iter() - .map(|arg| 
self.lookup_entry_spec_arg(arg)) - .collect() - } - - fn lookup_entry_spec_arg( - &self, - arg: ParsedEntrySpecArg, - ) -> Result, CommandError> { - let ParsedEntrySpecArg { - matcher, - transforms, - output_name, - } = arg; - if let Some(stream) = self.concats.get(&output_name) { - if transforms.is_some() { - return Err(CommandError::InvalidArg(format!( - "entry name transforms do not apply to concat output {output_name:?}" - ))); - } - return Ok(CompiledEntrySpec::Concat(ConcatEntry { - matcher, - stream: stream.clone(), - })); - } - let Some(recv) = self.extracts.get(&output_name) else { - return Err(CommandError::InvalidArg(format!( - "output name {output_name:?} was not found" - ))); - }; - Ok(CompiledEntrySpec::Extract(ExtractEntry { - matcher, - transforms, - recv: recv.clone(), - })) - } - - fn add_stdout( - seen_stdout: &mut bool, - name: OutputName, - seen_names: &mut HashSet, - concats: &mut HashMap>>, - ) -> Result<(), CommandError> { - if *seen_stdout { - return Err(CommandError::InvalidArg( - "--stdout output provided for more than one receiver".to_string(), - )); - } - if seen_names.contains(&name) { - return Err(CommandError::InvalidArg(format!( - "output name {name:?} provided more than once" - ))); - } - assert!(!concats.contains_key(&name)); - - let handle: Rc> = Rc::new(RefCell::new(io::stdout())); - - *seen_stdout = true; - assert!(seen_names.insert(name.clone())); - assert!(concats.insert(name, handle).is_none()); - Ok(()) - } - - fn add_file( - path: PathBuf, - append: bool, - name: OutputName, - seen_files: &mut HashSet, - seen_names: &mut HashSet, - concats: &mut HashMap>>, - ) -> Result<(), CommandError> { - if seen_names.contains(&name) { - return Err(CommandError::InvalidArg(format!( - "output name {name:?} provided more than once" - ))); - } - assert!(!concats.contains_key(&name)); - - let handle: Rc> = { - let mut f: fs::File = if append { - fs::OpenOptions::new() - .write(true) - .create(true) - .open(&path) - .wrap_err_with(|| 
format!("failed to open file for append at {path:?}"))? - } else { - fs::File::create(&path) - .wrap_err_with(|| format!("failed to open file with truncation at {path:?}"))? - }; - f.seek(io::SeekFrom::End(0)) - .wrap_err_with(|| format!("failed to seek to end of opened file {f:?}"))?; - Rc::new(RefCell::new(f)) - }; - - let canon_path = path - .canonicalize() - .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; - if seen_files.contains(&canon_path) { - return Err(CommandError::InvalidArg(format!( - "canonical output file path {canon_path:?} provided more than once" - ))); - } - - assert!(seen_files.insert(canon_path)); - assert!(seen_names.insert(name.clone())); - assert!(concats.insert(name, handle).is_none()); - Ok(()) - } - - fn add_dir( - err: Rc>, - output_dir: PathBuf, - mkdir: bool, - name: OutputName, - seen_dirs: &mut HashSet, - seen_names: &mut HashSet, - extracts: &mut HashMap>, - ) -> Result<(), CommandError> { - if seen_names.contains(&name) { - return Err(CommandError::InvalidArg(format!( - "output name {name:?} provided more than once" - ))); - } - assert!(!extracts.contains_key(&name)); - - if mkdir { - fs::create_dir_all(&output_dir) - .wrap_err_with(|| format!("failed to create output directory {output_dir:?}"))?; - }; - - let canon_path = output_dir - .canonicalize() - .wrap_err_with(|| format!("canonicalizing dir path {output_dir:?} failed"))?; - if seen_dirs.contains(&canon_path) { - return Err(CommandError::InvalidArg(format!( - "canonical output dir path {canon_path:?} provided more than once" - ))); - } - - let handle: Rc = { - let d = FilesystemReceiver::new(err, output_dir); - Rc::new(d) - }; - - assert!(seen_dirs.insert(canon_path)); - assert!(seen_names.insert(name.clone())); - assert!(extracts.insert(name, handle).is_none()); - Ok(()) - } - - pub fn from_output_specs( - err: Rc>, - spec: OutputSpecs, - ) -> Result { - let OutputSpecs { default, named } = spec; - - let mut concats: HashMap>> = HashMap::new(); - let mut 
extracts: HashMap> = HashMap::new(); - - let mut seen_stdout: bool = false; - let mut seen_files: HashSet = HashSet::new(); - let mut seen_dirs: HashSet = HashSet::new(); - let mut seen_names: HashSet = HashSet::new(); - - if let Some(default) = default { - match default { - OutputCollation::ConcatenateStdout => { - Self::add_stdout( - &mut seen_stdout, - OutputName::default_name(), - &mut seen_names, - &mut concats, - )?; - } - OutputCollation::ConcatenateFile { path, append } => { - Self::add_file( - path, - append, - OutputName::default_name(), - &mut seen_files, - &mut seen_names, - &mut concats, - )?; - } - OutputCollation::Filesystem { output_dir, mkdir } => { - Self::add_dir( - err.clone(), - output_dir, - mkdir, - OutputName::default_name(), - &mut seen_dirs, - &mut seen_names, - &mut extracts, - )?; - } - } - } - for NamedOutput { name, output } in named.into_iter() { - let name = OutputName(name); - match output { - OutputCollation::ConcatenateStdout => { - Self::add_stdout(&mut seen_stdout, name, &mut seen_names, &mut concats)?; - } - OutputCollation::ConcatenateFile { path, append } => { - Self::add_file( - path, - append, - name, - &mut seen_files, - &mut seen_names, - &mut concats, - )?; - } - OutputCollation::Filesystem { output_dir, mkdir } => { - Self::add_dir( - err.clone(), - output_dir, - mkdir, - name, - &mut seen_dirs, - &mut seen_names, - &mut extracts, - )?; - } - } - } - - Ok(Self { concats, extracts }) - } -} - pub trait EntryReceiver: fmt::Debug { fn generate_entry_handle<'s>( &self, @@ -498,7 +192,7 @@ struct PermsEntry { mode: u32, } -struct FilesystemReceiver { +pub struct FilesystemReceiver { err: Rc>, output_dir: PathBuf, #[cfg(unix)] From 404de6a5f69f5361c68d3728817cf6be633261cf Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 07:41:27 -0400 Subject: [PATCH 67/68] make parsing outputs much MUCH more readable with a builder --- cli/src/extract/named_outputs.rs | 
355 +++++++++++++++++-------------- 1 file changed, 191 insertions(+), 164 deletions(-) diff --git a/cli/src/extract/named_outputs.rs b/cli/src/extract/named_outputs.rs index 0406a36aa..535cde155 100644 --- a/cli/src/extract/named_outputs.rs +++ b/cli/src/extract/named_outputs.rs @@ -14,6 +14,26 @@ use super::receiver::{ use super::transform::{CompiledTransformer, NameTransformer}; use crate::{args::extract::*, CommandError, WrapCommandErr}; +pub fn process_entry_and_output_specs<'w>( + err: Rc>, + entry_specs: impl IntoIterator, + output_specs: OutputSpecs, +) -> Result>, CommandError> { + let mut entry_specs: Vec = entry_specs + .into_iter() + .map(ParsedEntrySpecArg::from_entry_spec) + .collect::>()?; + if entry_specs.is_empty() { + entry_specs.push(ParsedEntrySpecArg { + matcher: None, + transforms: None, + output_name: OutputName::default_name(), + }); + } + let parsed_outputs = ParsedNamedOutputs::from_output_specs(err, output_specs)?; + parsed_outputs.process_entry_specs_for_outputs(entry_specs) +} + #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] struct OutputName(pub String); @@ -58,115 +78,111 @@ impl ParsedEntrySpecArg { } } -struct ParsedNamedOutputs<'w> { +struct NamedOutputsBuilder<'w, W> { + err: Rc>, concats: HashMap>>, extracts: HashMap>, + seen_stdout: bool, + seen_files: HashSet, + seen_dirs: HashSet, + seen_names: HashSet, } -pub fn process_entry_and_output_specs<'w>( - err: Rc>, - entry_specs: impl IntoIterator, - output_specs: OutputSpecs, -) -> Result>, CommandError> { - let mut entry_specs: Vec = entry_specs - .into_iter() - .map(ParsedEntrySpecArg::from_entry_spec) - .collect::>()?; - if entry_specs.is_empty() { - entry_specs.push(ParsedEntrySpecArg { - matcher: None, - transforms: None, - output_name: OutputName::default_name(), - }); +impl<'w, W> NamedOutputsBuilder<'w, W> { + pub fn new(err: Rc>) -> Self { + Self { + err, + concats: HashMap::new(), + extracts: HashMap::new(), + seen_stdout: false, + seen_files: 
HashSet::new(), + seen_dirs: HashSet::new(), + seen_names: HashSet::new(), + } } - let parsed_outputs = ParsedNamedOutputs::from_output_specs(err, output_specs)?; - parsed_outputs.process_entry_specs_for_outputs(entry_specs) -} -impl<'w> ParsedNamedOutputs<'w> { - pub fn process_entry_specs_for_outputs( + pub fn into_tables( self, - args: impl IntoIterator, - ) -> Result>, CommandError> { - args.into_iter() - .map(|arg| self.lookup_entry_spec_arg(arg)) - .collect() + ) -> ( + HashMap>>, + HashMap>, + ) { + let Self { + concats, extracts, .. + } = self; + (concats, extracts) } - fn lookup_entry_spec_arg( - &self, - arg: ParsedEntrySpecArg, - ) -> Result, CommandError> { - let ParsedEntrySpecArg { - matcher, - transforms, - output_name, - } = arg; - if let Some(stream) = self.concats.get(&output_name) { - if transforms.is_some() { - return Err(CommandError::InvalidArg(format!( - "entry name transforms do not apply to concat output {output_name:?}" - ))); - } - return Ok(CompiledEntrySpec::Concat(ConcatEntry { - matcher, - stream: stream.clone(), - })); - } - let Some(recv) = self.extracts.get(&output_name) else { + fn add_name( + &mut self, + name: OutputName, + f: impl FnOnce() -> Result, + ) -> Result { + if self.seen_names.contains(&name) { return Err(CommandError::InvalidArg(format!( - "output name {output_name:?} was not found" + "output name {name:?} provided more than once" ))); - }; - Ok(CompiledEntrySpec::Extract(ExtractEntry { - matcher, - transforms, - recv: recv.clone(), - })) + } + + let ret = f()?; + + assert!(self.seen_names.insert(name)); + + Ok(ret) } - fn add_stdout( - seen_stdout: &mut bool, + fn add_concat( + &mut self, name: OutputName, - seen_names: &mut HashSet, - concats: &mut HashMap>>, + handle: impl Write + 'w, ) -> Result<(), CommandError> { - if *seen_stdout { + /* This should be assured by the check against self.seen_names. 
*/ + assert!(!self.concats.contains_key(&name)); + + let handle = Rc::new(RefCell::new(handle)); + + assert!(self.concats.insert(name, handle).is_none()); + + Ok(()) + } + + pub fn add_stdout(&mut self, name: OutputName) -> Result<(), CommandError> { + if self.seen_stdout { return Err(CommandError::InvalidArg( "--stdout output provided for more than one receiver".to_string(), )); } - if seen_names.contains(&name) { + + let handle = self.add_name(name.clone(), || Ok(io::stdout()))?; + self.add_concat(name, handle)?; + + self.seen_stdout = true; + Ok(()) + } + + fn add_seen_file(&mut self, path: PathBuf) -> Result<(), CommandError> { + let canon_path = path + .canonicalize() + .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; + + if self.seen_files.contains(&canon_path) { return Err(CommandError::InvalidArg(format!( - "output name {name:?} provided more than once" + "canonical output file path {canon_path:?} provided more than once" ))); } - assert!(!concats.contains_key(&name)); - let handle: Rc> = Rc::new(RefCell::new(io::stdout())); + assert!(self.seen_files.insert(canon_path)); - *seen_stdout = true; - assert!(seen_names.insert(name.clone())); - assert!(concats.insert(name, handle).is_none()); Ok(()) } - fn add_file( + pub fn add_file( + &mut self, path: PathBuf, append: bool, name: OutputName, - seen_files: &mut HashSet, - seen_names: &mut HashSet, - concats: &mut HashMap>>, ) -> Result<(), CommandError> { - if seen_names.contains(&name) { - return Err(CommandError::InvalidArg(format!( - "output name {name:?} provided more than once" - ))); - } - assert!(!concats.contains_key(&name)); - - let handle: Rc> = { + let handle = self.add_name(name.clone(), || { let mut f: fs::File = if append { fs::OpenOptions::new() .write(true) @@ -179,63 +195,113 @@ impl<'w> ParsedNamedOutputs<'w> { }; f.seek(io::SeekFrom::End(0)) .wrap_err_with(|| format!("failed to seek to end of opened file {f:?}"))?; - Rc::new(RefCell::new(f)) - }; + Ok(f) + })?; + 
self.add_seen_file(path)?; + self.add_concat(name, handle)?; + Ok(()) + } + fn add_seen_dir(&mut self, path: PathBuf) -> Result<(), CommandError> { let canon_path = path .canonicalize() - .wrap_err_with(|| format!("canonicalizing path {path:?} failed"))?; - if seen_files.contains(&canon_path) { + .wrap_err_with(|| format!("canonicalizing dir path {path:?} failed"))?; + if self.seen_dirs.contains(&canon_path) { return Err(CommandError::InvalidArg(format!( - "canonical output file path {canon_path:?} provided more than once" + "canonical output dir path {canon_path:?} provided more than once" ))); } - assert!(seen_files.insert(canon_path)); - assert!(seen_names.insert(name.clone())); - assert!(concats.insert(name, handle).is_none()); + assert!(self.seen_dirs.insert(canon_path)); + Ok(()) } - fn add_dir( - err: Rc>, + fn add_extract( + &mut self, + name: OutputName, + handle: impl EntryReceiver + 'w, + ) -> Result<(), CommandError> { + assert!(!self.extracts.contains_key(&name)); + + let handle = Rc::new(handle); + + assert!(self.extracts.insert(name, handle).is_none()); + + Ok(()) + } +} + +impl<'w, W> NamedOutputsBuilder<'w, W> +where + W: Write + 'w, +{ + pub fn add_dir( + &mut self, output_dir: PathBuf, mkdir: bool, name: OutputName, - seen_dirs: &mut HashSet, - seen_names: &mut HashSet, - extracts: &mut HashMap>, ) -> Result<(), CommandError> { - if seen_names.contains(&name) { - return Err(CommandError::InvalidArg(format!( - "output name {name:?} provided more than once" - ))); - } - assert!(!extracts.contains_key(&name)); + let err = self.err.clone(); + let handle = self.add_name(name.clone(), || { + if mkdir { + fs::create_dir_all(&output_dir).wrap_err_with(|| { + format!("failed to create output directory {output_dir:?}") + })?; + }; + Ok(FilesystemReceiver::new(err, output_dir.clone())) + })?; + self.add_seen_dir(output_dir.clone())?; + self.add_extract(name, handle)?; + Ok(()) + } +} - if mkdir { - fs::create_dir_all(&output_dir) - .wrap_err_with(|| 
format!("failed to create output directory {output_dir:?}"))?; - }; +struct ParsedNamedOutputs<'w> { + concats: HashMap>>, + extracts: HashMap>, +} - let canon_path = output_dir - .canonicalize() - .wrap_err_with(|| format!("canonicalizing dir path {output_dir:?} failed"))?; - if seen_dirs.contains(&canon_path) { +impl<'w> ParsedNamedOutputs<'w> { + pub fn process_entry_specs_for_outputs( + self, + args: impl IntoIterator, + ) -> Result>, CommandError> { + args.into_iter() + .map(|arg| self.lookup_entry_spec_arg(arg)) + .collect() + } + + fn lookup_entry_spec_arg( + &self, + arg: ParsedEntrySpecArg, + ) -> Result, CommandError> { + let ParsedEntrySpecArg { + matcher, + transforms, + output_name, + } = arg; + if let Some(stream) = self.concats.get(&output_name) { + if transforms.is_some() { + return Err(CommandError::InvalidArg(format!( + "entry name transforms do not apply to concat output {output_name:?}" + ))); + } + return Ok(CompiledEntrySpec::Concat(ConcatEntry { + matcher, + stream: stream.clone(), + })); + } + let Some(recv) = self.extracts.get(&output_name) else { return Err(CommandError::InvalidArg(format!( - "canonical output dir path {canon_path:?} provided more than once" + "output name {output_name:?} was not found" ))); - } - - let handle: Rc = { - let d = FilesystemReceiver::new(err, output_dir); - Rc::new(d) }; - - assert!(seen_dirs.insert(canon_path)); - assert!(seen_names.insert(name.clone())); - assert!(extracts.insert(name, handle).is_none()); - Ok(()) + Ok(CompiledEntrySpec::Extract(ExtractEntry { + matcher, + transforms, + recv: recv.clone(), + })) } pub fn from_output_specs( @@ -244,44 +310,19 @@ impl<'w> ParsedNamedOutputs<'w> { ) -> Result { let OutputSpecs { default, named } = spec; - let mut concats: HashMap>> = HashMap::new(); - let mut extracts: HashMap> = HashMap::new(); - - let mut seen_stdout: bool = false; - let mut seen_files: HashSet = HashSet::new(); - let mut seen_dirs: HashSet = HashSet::new(); - let mut seen_names: HashSet = 
HashSet::new(); + let mut builder = NamedOutputsBuilder::new(err); if let Some(default) = default { + let name = OutputName::default_name(); match default { OutputCollation::ConcatenateStdout => { - Self::add_stdout( - &mut seen_stdout, - OutputName::default_name(), - &mut seen_names, - &mut concats, - )?; + builder.add_stdout(name)?; } OutputCollation::ConcatenateFile { path, append } => { - Self::add_file( - path, - append, - OutputName::default_name(), - &mut seen_files, - &mut seen_names, - &mut concats, - )?; + builder.add_file(path, append, name)?; } OutputCollation::Filesystem { output_dir, mkdir } => { - Self::add_dir( - err.clone(), - output_dir, - mkdir, - OutputName::default_name(), - &mut seen_dirs, - &mut seen_names, - &mut extracts, - )?; + builder.add_dir(output_dir, mkdir, name)?; } } } @@ -289,32 +330,18 @@ impl<'w> ParsedNamedOutputs<'w> { let name = OutputName(name); match output { OutputCollation::ConcatenateStdout => { - Self::add_stdout(&mut seen_stdout, name, &mut seen_names, &mut concats)?; + builder.add_stdout(name)?; } OutputCollation::ConcatenateFile { path, append } => { - Self::add_file( - path, - append, - name, - &mut seen_files, - &mut seen_names, - &mut concats, - )?; + builder.add_file(path, append, name)?; } OutputCollation::Filesystem { output_dir, mkdir } => { - Self::add_dir( - err.clone(), - output_dir, - mkdir, - name, - &mut seen_dirs, - &mut seen_names, - &mut extracts, - )?; + builder.add_dir(output_dir, mkdir, name)?; } } } + let (concats, extracts) = builder.into_tables(); Ok(Self { concats, extracts }) } } From 69670e35376d20709e694466190d6c866f932074 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:18:46 -0400 Subject: [PATCH 68/68] all info directives are now supported --- cli/src/args/info.rs | 17 ++- cli/src/compress.rs | 1 + cli/src/extract.rs | 4 +- cli/src/extract/matcher.rs | 4 +- cli/src/extract/receiver.rs | 44 +++++- 
cli/src/info/directives.rs | 187 ++++++++++++++++++++++--- cli/src/info/formats.rs | 82 ++++++++++- src/extra_fields/extended_timestamp.rs | 2 +- 8 files changed, 302 insertions(+), 39 deletions(-) diff --git a/cli/src/args/info.rs b/cli/src/args/info.rs index 780b696df..f571829c5 100644 --- a/cli/src/args/info.rs +++ b/cli/src/args/info.rs @@ -247,7 +247,6 @@ impl UnixModeFormat { #[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum TimestampFormat { - UnixEpochMilliseconds, DateOnly, TimeOnly, #[default] @@ -258,7 +257,6 @@ impl TimestampFormat { pub fn parse(s: &str) -> Result { match s { "" => Ok(Self::default()), - ":epoch" => Ok(Self::UnixEpochMilliseconds), ":date" => Ok(Self::DateOnly), ":time" => Ok(Self::TimeOnly), ":date-time" => Ok(Self::DateAndTime), @@ -337,6 +335,7 @@ pub enum EntryFormatDirective { LocalHeaderStart(OffsetFormat), ContentStart(OffsetFormat), ContentEnd(OffsetFormat), + CentralHeaderStart(OffsetFormat), CompressedSize(ByteSizeFormat), UncompressedSize(ByteSizeFormat), UnixMode(UnixModeFormat), @@ -374,6 +373,11 @@ impl ParseableDirective for EntryFormatDirective { .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; Ok(Self::ContentEnd(offset_fmt)) } + s if s.starts_with("central-header-start") => { + let offset_fmt = OffsetFormat::parse(&s["central-header-start".len()..]) + .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; + Ok(Self::CentralHeaderStart(offset_fmt)) + } s if s.starts_with("compressed-size") => { let size_fmt = ByteSizeFormat::parse(&s["compressed-size".len()..]) .map_err(|e| DirectiveParseError::Modifier(s.to_string(), e))?; @@ -563,6 +567,10 @@ all the output to a single line. The offset of the end of the entry's possibly-compressed content. The next entry's local header begins immediately after. +%central-header-start% + The offset of the entry's central directory header, at the end of the + zip file. 
+ %compressed-size% The size of the entry's possibly-compressed content as stored in the archive. @@ -584,7 +592,7 @@ all the output to a single line. %timestamp% The timestamp for the entry. - Note that zip timestamps only have precision down to the minute. + Note that zip timestamps only have precision down to 2 seconds. ## Entry format directives: @@ -608,9 +616,8 @@ unix-mode = '' [DEFAULT => octal] = ':pretty' (`ls`-like permissions string) timestamp = '' [DEFAULT => date-time] - = ':epoch' (milliseconds since unix epoch as a decimal number) = ':date' (ISO 8601 string representation of date) - = ':time' (HH:MM string representation of time) + = ':time' (HH:MM:SS string representation of time) = ':date-time' (ISO 8601 date then HH:MM time joined by a space) diff --git a/cli/src/compress.rs b/cli/src/compress.rs index 784f835b1..e35058273 100644 --- a/cli/src/compress.rs +++ b/cli/src/compress.rs @@ -418,6 +418,7 @@ pub fn execute_compress(mut err: impl Write, args: Compress) -> Result<(), Comma "name {last_name} remaining after all entry flags processed" ))); } + for pos_arg in positional_paths.into_iter() { let file_type = fs::symlink_metadata(&pos_arg) .wrap_err_with(|| format!("failed to read metadata from path {}", pos_arg.display()))? diff --git a/cli/src/extract.rs b/cli/src/extract.rs index 69efe3deb..f5aaa28c7 100644 --- a/cli/src/extract.rs +++ b/cli/src/extract.rs @@ -33,7 +33,7 @@ fn maybe_process_symlink<'a, 't>( * contents with io::Read. ZipEntry<'a, R> from * https://github.com/zip-rs/zip2/pull/233 avoids this issue!!! 
*/ let data = EntryData::from_entry(&entry); - (data.kind, data.size) + (data.kind, data.uncompressed_size) }; if !matches!(kind, EntryKind::Symlink) { return Ok(None); @@ -86,7 +86,7 @@ where deduped_matching_extracts .into_iter() .flat_map(|(recv, names)| names.into_iter().map(move |n| (recv, n))) - .map(|(recv, name)| recv.generate_entry_handle(data, symlink_target.as_deref(), name)) + .map(|(recv, name)| recv.generate_entry_handle(&data, symlink_target.as_deref(), name)) .collect::, _>>()? .into_iter() .flatten(), diff --git a/cli/src/extract/matcher.rs b/cli/src/extract/matcher.rs index af382369d..9e3eb463f 100644 --- a/cli/src/extract/matcher.rs +++ b/cli/src/extract/matcher.rs @@ -391,8 +391,8 @@ impl EntryMatcher for Size { fn matches(&self, entry: &EntryData) -> bool { match self { - Self::Max(max) => entry.size <= *max, - Self::Min(min) => entry.size >= *min, + Self::Max(max) => entry.uncompressed_size <= *max, + Self::Min(min) => entry.uncompressed_size >= *min, } } } diff --git a/cli/src/extract/receiver.rs b/cli/src/extract/receiver.rs index 6b106dbd2..6495ccd60 100644 --- a/cli/src/extract/receiver.rs +++ b/cli/src/extract/receiver.rs @@ -8,7 +8,11 @@ use std::{ rc::Rc, }; -use zip::{read::ZipFile, CompressionMethod}; +use zip::{ + extra_fields::{ExtendedTimestamp, ExtraField}, + read::ZipFile, + CompressionMethod, DateTime, +}; use super::matcher::{CompiledMatcher, EntryMatcher}; use super::transform::{CompiledTransformer, NameTransformer}; @@ -21,13 +25,21 @@ pub enum EntryKind { Symlink, } -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct EntryData<'a> { pub name: &'a str, pub kind: EntryKind, pub compression: CompressionMethod, pub unix_mode: Option, - pub size: u64, + pub comment: &'a str, + pub uncompressed_size: u64, + pub compressed_size: u64, + pub local_header_start: u64, + pub content_start: u64, + pub central_header_start: u64, + pub crc32: u32, + pub 
last_modified_time: Option, + pub extended_timestamp: Option, } impl<'a> EntryData<'a> { @@ -44,9 +56,27 @@ impl<'a> EntryData<'a> { }, compression: entry.compression(), unix_mode: entry.unix_mode(), - size: entry.size(), + comment: entry.comment(), + uncompressed_size: entry.size(), + compressed_size: entry.compressed_size(), + local_header_start: entry.header_start(), + content_start: entry.data_start(), + central_header_start: entry.central_header_start(), + crc32: entry.crc32(), + last_modified_time: entry.last_modified(), + extended_timestamp: entry + .extra_data_fields() + .find_map(|f| match f { + ExtraField::ExtendedTimestamp(ts) => Some(ts), + }) + .cloned(), } } + + #[inline(always)] + pub const fn content_end(&self) -> u64 { + self.content_start + self.compressed_size + } } pub struct ConcatEntry<'w> { @@ -136,7 +166,7 @@ pub enum MatchingEntrySpec<'a, 'c, 'w> { impl<'a, 'c, 'w> MatchingEntrySpec<'a, 'c, 'w> { /* Split output handles for concat, and split generated handles by extract source and - * name. use ptr::eq() to split, and Cow::<'s, str>::eq() with str AsRef. */ + * name. use Rc::ptr_eq() to split, and Cow::<'s, str>::eq() with str AsRef. 
*/ pub fn is_nested_duplicate( self, deduped_concat_writers: &mut Vec<&'c Rc>>, @@ -177,7 +207,7 @@ impl<'a, 'c, 'w> MatchingEntrySpec<'a, 'c, 'w> { pub trait EntryReceiver: fmt::Debug { fn generate_entry_handle<'s>( &self, - data: EntryData<'s>, + data: &EntryData<'s>, symlink_target: Option<&[u8]>, name: Cow<'s, str>, ) -> Result>, CommandError>; @@ -274,7 +304,7 @@ where { fn generate_entry_handle<'s>( &self, - data: EntryData<'s>, + data: &EntryData<'s>, symlink_target: Option<&[u8]>, name: Cow<'s, str>, ) -> Result>, CommandError> { diff --git a/cli/src/info/directives.rs b/cli/src/info/directives.rs index 785227bc6..e4e3e5bfd 100644 --- a/cli/src/info/directives.rs +++ b/cli/src/info/directives.rs @@ -181,8 +181,8 @@ pub mod compiled { pub mod entry { use super::{ super::formats::{ - ByteSizeValue, CompressionMethodValue, FileTypeValue, FormatValue, NameString, - UnixModeValue, + BinaryNumericValue, BinaryStringValue, ByteSizeValue, CompressionMethodValue, + FileTypeValue, FormatValue, NameString, OffsetValue, TimestampValue, UnixModeValue, }, FormatDirective, }; @@ -220,6 +220,118 @@ pub mod entry { } } + pub struct EntryCommentField(pub BinaryStringValue); + + impl FormatDirective for EntryCommentField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = BinaryStringValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + Some(data.comment.as_bytes()) + } + fn value_formatter(&self) -> BinaryStringValue { + self.0 + } + } + + pub struct LocalHeaderStartField(pub OffsetValue); + + impl FormatDirective for LocalHeaderStartField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = OffsetValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + Some(data.local_header_start) + } + fn value_formatter(&self) -> OffsetValue { + self.0 + } + } + + pub struct ContentStartField(pub OffsetValue); + + impl FormatDirective for ContentStartField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = 
OffsetValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + Some(data.content_start) + } + fn value_formatter(&self) -> OffsetValue { + self.0 + } + } + + pub struct UncompressedSizeField(pub ByteSizeValue); + + impl FormatDirective for UncompressedSizeField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = ByteSizeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.uncompressed_size + } + fn value_formatter(&self) -> ByteSizeValue { + self.0 + } + } + + pub struct CompressedSizeField(pub ByteSizeValue); + + impl FormatDirective for CompressedSizeField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = ByteSizeValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.compressed_size + } + fn value_formatter(&self) -> ByteSizeValue { + self.0 + } + } + + pub struct ContentEndField(pub OffsetValue); + + impl FormatDirective for ContentEndField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = OffsetValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + Some(data.content_end()) + } + fn value_formatter(&self) -> OffsetValue { + self.0 + } + } + + pub struct CentralHeaderStartField(pub OffsetValue); + + impl FormatDirective for CentralHeaderStartField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = OffsetValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + Some(data.central_header_start) + } + fn value_formatter(&self) -> OffsetValue { + self.0 + } + } + pub struct CompressionMethodField(pub CompressionMethodValue); impl FormatDirective for CompressionMethodField { @@ -252,18 +364,34 @@ pub mod entry { } } - pub struct UncompressedSizeField(pub ByteSizeValue); + pub struct Crc32Field(pub BinaryNumericValue); - impl FormatDirective for UncompressedSizeField { + impl FormatDirective for Crc32Field { type Data<'a> = &'a EntryData<'a>; - type FieldType = ByteSizeValue; + 
type FieldType = BinaryNumericValue; fn extract_field<'a>( &self, data: Self::Data<'a>, ) -> ::Input<'a> { - data.size + data.crc32 } - fn value_formatter(&self) -> ByteSizeValue { + fn value_formatter(&self) -> BinaryNumericValue { + self.0 + } + } + + pub struct TimestampField(pub TimestampValue); + + impl FormatDirective for TimestampField { + type Data<'a> = &'a EntryData<'a>; + type FieldType = TimestampValue; + fn extract_field<'a>( + &self, + data: Self::Data<'a>, + ) -> ::Input<'a> { + data.last_modified_time + } + fn value_formatter(&self) -> TimestampValue { self.0 } } @@ -326,6 +454,9 @@ pub mod entry { Ok(CompiledEntryDirective(match spec { EntryFormatDirective::Name => Box::new(EntryNameField(NameString)), EntryFormatDirective::FileType(f) => Box::new(FileTypeField(FileTypeValue(f))), + EntryFormatDirective::CompressedSize(f) => { + Box::new(CompressedSizeField(ByteSizeValue(f))) + } EntryFormatDirective::UncompressedSize(f) => { Box::new(UncompressedSizeField(ByteSizeValue(f))) } @@ -333,7 +464,27 @@ pub mod entry { EntryFormatDirective::CompressionMethod(f) => { Box::new(CompressionMethodField(CompressionMethodValue(f))) } - _ => todo!(), + EntryFormatDirective::Comment(f) => { + Box::new(EntryCommentField(BinaryStringValue(f))) + } + EntryFormatDirective::LocalHeaderStart(f) => { + Box::new(LocalHeaderStartField(OffsetValue(f))) + } + EntryFormatDirective::ContentStart(f) => { + Box::new(ContentStartField(OffsetValue(f))) + } + EntryFormatDirective::ContentEnd(f) => { + Box::new(ContentEndField(OffsetValue(f))) + } + EntryFormatDirective::CentralHeaderStart(f) => { + Box::new(CentralHeaderStartField(OffsetValue(f))) + } + EntryFormatDirective::CrcValue(f) => { + Box::new(Crc32Field(BinaryNumericValue(f))) + } + EntryFormatDirective::Timestamp(f) => { + Box::new(TimestampField(TimestampValue(f))) + } })) } } @@ -354,7 +505,7 @@ pub mod archive { use std::path::Path; - #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] + 
#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct ArchiveData<'a> { pub path: Option<&'a Path>, pub stream_length: u64, @@ -366,19 +517,13 @@ pub mod archive { impl<'a> ArchiveData<'a> { pub fn from_archive_with_path(zip: &'a ArchiveWithPath) -> Self { - let path = zip.path.as_path(); - let stream_length = zip.len; - let num_entries = zip.archive.len(); - let comment = zip.archive.comment(); - let first_entry_start = zip.archive.offset(); - let central_directory_start = zip.archive.central_directory_start(); Self { - path: Some(path), - stream_length, - num_entries, - comment: Some(comment), - first_entry_start: Some(first_entry_start), - central_directory_start: Some(central_directory_start), + path: Some(zip.path.as_path()), + stream_length: zip.len, + num_entries: zip.archive.len(), + comment: Some(zip.archive.comment()), + first_entry_start: Some(zip.archive.offset()), + central_directory_start: Some(zip.archive.central_directory_start()), } } } diff --git a/cli/src/info/formats.rs b/cli/src/info/formats.rs index 0506f33c8..a320fb122 100644 --- a/cli/src/info/formats.rs +++ b/cli/src/info/formats.rs @@ -5,7 +5,7 @@ use std::{ path, }; -use zip::CompressionMethod; +use zip::{CompressionMethod, DateTime}; use super::directives::Writeable; use crate::{args::info::*, extract::receiver::EntryKind}; @@ -289,6 +289,36 @@ impl FormatValue for OffsetValue { } } +#[derive(Copy, Clone)] +pub struct BinaryNumericValue(pub BinaryNumericValueFormat); + +#[derive(Debug)] +pub enum BinaryNumericValueWriter { + Decimal(u32), + Hexadecimal(u32), +} + +impl fmt::Display for BinaryNumericValueWriter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Decimal(x) => write!(f, "{}", x), + Self::Hexadecimal(x) => write!(f, "{:x}", x), + } + } +} + +impl FormatValue for BinaryNumericValue { + type Input<'a> = u32; + type Output<'a> = BinaryNumericValueWriter; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, 
Self::E> { + Ok(match self.0 { + BinaryNumericValueFormat::Decimal => BinaryNumericValueWriter::Decimal(input), + BinaryNumericValueFormat::Hexadecimal => BinaryNumericValueWriter::Hexadecimal(input), + }) + } +} + #[derive(Copy, Clone)] pub struct BinaryStringValue(pub BinaryStringFormat); @@ -343,3 +373,53 @@ impl FormatValue for BinaryStringValue { }) } } + +#[derive(Copy, Clone)] +pub struct TimestampValue(pub TimestampFormat); + +#[derive(Debug)] +pub enum TimestampValueWriter { + None, + DateOnly(DateTime), + TimeOnly(DateTime), + DateAndTime(DateTime), +} + +impl fmt::Display for TimestampValueWriter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::None => write!(f, "?"), + Self::DateOnly(d) => write!(f, "{}-{}-{}", d.year(), d.month(), d.day()), + Self::TimeOnly(t) => write!(f, "{}:{}:{}", t.hour(), t.minute(), t.second()), + Self::DateAndTime(dt) => { + write!( + f, + "{}-{}-{} {}:{}:{}", + dt.year(), + dt.month(), + dt.day(), + dt.hour(), + dt.minute(), + dt.second() + ) + } + } + } +} + +impl FormatValue for TimestampValue { + type Input<'a> = Option; + type Output<'a> = TimestampValueWriter; + type E = Infallible; + fn format_value<'a>(&self, input: Self::Input<'a>) -> Result, Self::E> { + let input = match input { + None => return Ok(TimestampValueWriter::None), + Some(input) => input, + }; + Ok(match self.0 { + TimestampFormat::DateOnly => TimestampValueWriter::DateOnly(input), + TimestampFormat::TimeOnly => TimestampValueWriter::TimeOnly(input), + TimestampFormat::DateAndTime => TimestampValueWriter::DateAndTime(input), + }) + } +} diff --git a/src/extra_fields/extended_timestamp.rs b/src/extra_fields/extended_timestamp.rs index 1cc0f1de4..0cf794c3c 100644 --- a/src/extra_fields/extended_timestamp.rs +++ b/src/extra_fields/extended_timestamp.rs @@ -4,7 +4,7 @@ use std::io::Read; /// extended timestamp, as described in -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct ExtendedTimestamp { 
mod_time: Option, ac_time: Option,