diff --git a/cli/src/crawl.rs b/cli/src/crawl.rs index c969301..5685d14 100644 --- a/cli/src/crawl.rs +++ b/cli/src/crawl.rs @@ -136,6 +136,11 @@ pub struct MedusaCrawl { /// These patterns will not read through symlinks. #[arg(short, long, default_values_t = Vec::::new())] pub ignore_patterns: Vec, + /// Where [`paths_to_crawl`](Self::paths_to_crawl) is relative to. + /// + /// Defaults to the process's current working directory if not provided. + #[arg(short, long, default_value = None)] + pub working_dir: Option, } impl From for lib_crawl::MedusaCrawl { @@ -143,6 +148,7 @@ impl From for lib_crawl::MedusaCrawl { let MedusaCrawl { paths_to_crawl, ignore_patterns, + working_dir, } = x; let ignore_patterns = RegexSet::new( ignore_patterns @@ -153,6 +159,7 @@ impl From for lib_crawl::MedusaCrawl { Self { paths_to_crawl: paths_to_crawl.into_iter().map(PathBuf::from).collect(), ignores: lib_crawl::Ignores::new(ignore_patterns), + cwd: working_dir, } } } diff --git a/lib/benches/my_benchmark.rs b/lib/benches/my_benchmark.rs index f49b796..5be4643 100644 --- a/lib/benches/my_benchmark.rs +++ b/lib/benches/my_benchmark.rs @@ -53,6 +53,14 @@ mod parallel_merge { Ok((input_files, extract_dir)) } + async fn execute_medusa_crawl( + extracted_dir: &Path, + ) -> Result { + let ignores = lib::crawl::Ignores::default(); + let crawl_spec = lib::crawl::MedusaCrawl::for_single_dir(extracted_dir.to_path_buf(), ignores); + crawl_spec.crawl_paths().await + } + async fn execute_medusa_zip( input_files: Vec, parallelism: lib::zip::Parallelism, @@ -155,7 +163,14 @@ mod parallel_merge { /* FIXME: assigning `_` to the second arg of this tuple will destroy the * extract dir, which is only a silent error producing an empty file!!! * AWFUL UX!!! */ - let (input_files, _tmp_extract_dir) = extract_example_zip(&target).unwrap(); + let (input_files, extracted_dir) = extract_example_zip(&target).unwrap(); + group.bench_function( + BenchmarkId::new(&id, ""), + |b| { + b.to_async(&rt) + .iter(|| execute_medusa_crawl(extracted_dir.path())) + }, + ); /* Run the parallel implementation. */ let parallelism = lib::zip::Parallelism::ParallelMerge; diff --git a/lib/src/crawl.rs b/lib/src/crawl.rs index 4cd3960..ef5be61 100644 --- a/lib/src/crawl.rs +++ b/lib/src/crawl.rs @@ -69,10 +69,12 @@ impl ResolvedPath { } } - pub fn from_path(path: PathBuf) -> Self { + pub(crate) fn from_path_and_root(relative_path: PathBuf, root: &Path) -> Self { + assert!(relative_path.is_relative()); + assert!(root.is_absolute()); Self { - unresolved_path: path.clone(), - resolved_path: path, + resolved_path: root.join(&relative_path), + unresolved_path: relative_path, } } @@ -285,21 +287,51 @@ impl Input { pub struct MedusaCrawl { pub paths_to_crawl: Vec, pub ignores: Ignores, + pub cwd: Option, +} + +impl Default for MedusaCrawl { + fn default() -> Self { + Self { + paths_to_crawl: vec![PathBuf::from(".")], + ignores: Ignores::default(), + cwd: None, + } + } } impl MedusaCrawl { + pub fn for_single_dir(dir: PathBuf, ignores: Ignores) -> Self { + Self { + paths_to_crawl: vec![PathBuf::from(".")], + ignores, + cwd: Some(dir), + } + } + pub async fn crawl_paths(self) -> Result { let Self { paths_to_crawl, ignores, + cwd, } = self; - let cwd = env::current_dir()?; + let cwd: PathBuf = cwd.map(Ok).unwrap_or_else(env::current_dir)?; + + /* Validate all paths in a rayon blast before touching the filesystem. */ + paths_to_crawl + .par_iter() + .map(|p| { + if p.is_absolute() { + Err(MedusaCrawlFormatError::PathWasAbsolute(p.clone())) + } else { + Ok(()) + } + }) + .collect::>()?; - let results: Vec = try_join_all( - paths_to_crawl - .into_iter() - .map(|path| Input::Path(ResolvedPath::from_path(path)).crawl_single(&ignores)), - ) + let results: Vec = try_join_all(paths_to_crawl.into_iter().map(|relative_path| { + Input::Path(ResolvedPath::from_path_and_root(relative_path, &cwd)).crawl_single(&ignores) + })) .await?; let mut result = CrawlResult::merge(results); diff --git a/py/medusa_zip/crawl.pyi b/py/medusa_zip/crawl.pyi index 277b7e8..51e0007 100644 --- a/py/medusa_zip/crawl.pyi +++ b/py/medusa_zip/crawl.pyi @@ -5,6 +5,8 @@ # # Licensed under the Apache License, Version 2.0 (see LICENSE). +# FIXME: remove Optional and Union for future annotations! + from pathlib import Path from typing import Iterable, Optional, Union @@ -51,14 +53,19 @@ class MedusaCrawl: self, paths_to_crawl: Iterable[Union[str, Path]], ignores: Optional[Ignores] = None, + cwd: Optional[Union[str, Path]] = None, ) -> None: ... @property def paths_to_crawl(self) -> Iterable[Path]: ... + @property def ignores(self) -> Ignores: ... + @property + def cwd(self) -> Optional[Path]: ... + async def crawl_paths(self) -> CrawlResult: ... def crawl_paths_sync(self) -> CrawlResult: ... diff --git a/py/src/crawl.rs b/py/src/crawl.rs index f95987d..87c6b98 100644 --- a/py/src/crawl.rs +++ b/py/src/crawl.rs @@ -202,24 +202,30 @@ impl From for Ignores { #[pyclass] #[derive(Clone)] pub struct MedusaCrawl { + /* FIXME: make these both get and set!!! */ #[pyo3(get)] pub paths_to_crawl: Vec, #[pyo3(get)] pub ignores: Ignores, + #[pyo3(get)] + pub cwd: Option, } #[pymethods] impl MedusaCrawl { #[new] - fn new(paths_to_crawl: &PyAny, ignores: Option) -> PyResult { + #[pyo3(signature = (paths_to_crawl, ignores = None, cwd = None))] + fn new(paths_to_crawl: &PyAny, ignores: Option, cwd: Option<&PyAny>) -> PyResult { let ignores = ignores.unwrap_or_default(); let paths_to_crawl: Vec = paths_to_crawl .iter()? .map(|p| p.and_then(PyAny::extract::)) .collect::>()?; + let cwd = cwd.map(|cwd| PyAny::extract::(cwd)).transpose()?; Ok(Self { paths_to_crawl, ignores, + cwd, }) } @@ -227,12 +233,14 @@ impl MedusaCrawl { let Self { paths_to_crawl, ignores, + cwd, } = self; let paths_to_crawl = repr(py, paths_to_crawl.clone())?; let ignores = repr(py, ignores.clone())?; + let cwd = repr(py, cwd.clone())?; Ok(format!( - "MedusaCrawl(paths_to_crawl={}, ignores={})", - paths_to_crawl, ignores + "MedusaCrawl(paths_to_crawl={}, ignores={}, cwd={})", + paths_to_crawl, ignores, cwd, )) } @@ -270,10 +278,12 @@ impl From for lib_crawl::MedusaCrawl { let MedusaCrawl { paths_to_crawl, ignores, + cwd, } = x; Self { paths_to_crawl, ignores: ignores.into(), + cwd, } } }