Skip to content

Commit

Permalink
add cwd field to MedusaCrawl and add crawl to benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
cosmicexplorer committed Aug 24, 2023
1 parent 54ed792 commit 737dae3
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 13 deletions.
7 changes: 7 additions & 0 deletions cli/src/crawl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,19 @@ pub struct MedusaCrawl {
/// These patterns will not read through symlinks.
#[arg(short, long, default_values_t = Vec::<RegexWrapper>::new())]
pub ignore_patterns: Vec<RegexWrapper>,
/// Where [`paths_to_crawl`](Self::paths_to_crawl) is relative to.
///
/// Defaults to the process's current working directory if not provided.
#[arg(short, long, default_value = None)]
pub working_dir: Option<PathBuf>,
}

impl From<MedusaCrawl> for lib_crawl::MedusaCrawl {
fn from(x: MedusaCrawl) -> Self {
let MedusaCrawl {
paths_to_crawl,
ignore_patterns,
working_dir,
} = x;
let ignore_patterns = RegexSet::new(
ignore_patterns
Expand All @@ -153,6 +159,7 @@ impl From<MedusaCrawl> for lib_crawl::MedusaCrawl {
Self {
paths_to_crawl: paths_to_crawl.into_iter().map(PathBuf::from).collect(),
ignores: lib_crawl::Ignores::new(ignore_patterns),
cwd: working_dir,
}
}
}
17 changes: 16 additions & 1 deletion lib/benches/my_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ mod parallel_merge {
Ok((input_files, extract_dir))
}

/// Crawl `extracted_dir` with `Ignores::default()` and return the crawl result.
///
/// The spec is built through `MedusaCrawl::for_single_dir`, so the given
/// directory is supplied as the crawl's `cwd` instead of relying on the
/// process's current working directory.
async fn execute_medusa_crawl(
  extracted_dir: &Path,
) -> Result<lib::crawl::CrawlResult, lib::crawl::MedusaCrawlError> {
  lib::crawl::MedusaCrawl::for_single_dir(
    extracted_dir.to_path_buf(),
    lib::crawl::Ignores::default(),
  )
  .crawl_paths()
  .await
}

async fn execute_medusa_zip(
input_files: Vec<lib::FileSource>,
parallelism: lib::zip::Parallelism,
Expand Down Expand Up @@ -155,7 +163,14 @@ mod parallel_merge {
/* FIXME: binding the second element of this tuple to `_` drops (and thereby
* destroys) the extract dir immediately, which is only a silent error
* producing an empty file!!! AWFUL UX!!! */
let (input_files, _tmp_extract_dir) = extract_example_zip(&target).unwrap();
let (input_files, extracted_dir) = extract_example_zip(&target).unwrap();
group.bench_function(
BenchmarkId::new(&id, "<crawling the extracted contents>"),
|b| {
b.to_async(&rt)
.iter(|| execute_medusa_crawl(extracted_dir.path()))
},
);

/* Run the parallel implementation. */
let parallelism = lib::zip::Parallelism::ParallelMerge;
Expand Down
50 changes: 41 additions & 9 deletions lib/src/crawl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,12 @@ impl ResolvedPath {
}
}

pub fn from_path(path: PathBuf) -> Self {
pub(crate) fn from_path_and_root(relative_path: PathBuf, root: &Path) -> Self {
assert!(relative_path.is_relative());
assert!(root.is_absolute());
Self {
unresolved_path: path.clone(),
resolved_path: path,
resolved_path: root.join(&relative_path),
unresolved_path: relative_path,
}
}

Expand Down Expand Up @@ -285,21 +287,51 @@ impl Input {
pub struct MedusaCrawl {
/// Paths to crawl, each joined onto the crawl root (`cwd`). `crawl_paths()`
/// rejects any absolute entry with `MedusaCrawlFormatError::PathWasAbsolute`.
pub paths_to_crawl: Vec<PathBuf>,
/// Ignore rules passed to the crawl of every entry in `paths_to_crawl`.
pub ignores: Ignores,
/// Root directory the relative paths are resolved against. When `None`,
/// `crawl_paths()` falls back to `env::current_dir()`.
pub cwd: Option<PathBuf>,
}

impl Default for MedusaCrawl {
  /// A spec that crawls `"."` with `Ignores::default()` and no explicit `cwd`.
  fn default() -> Self {
    let paths_to_crawl = vec![PathBuf::from(".")];
    let ignores = Ignores::default();
    Self {
      paths_to_crawl,
      ignores,
      cwd: None,
    }
  }
}

impl MedusaCrawl {
/// Build a spec that crawls `"."` with `dir` as the crawl root.
///
/// `dir` is stored as `cwd`, and the only path to crawl is the relative
/// path `"."`; `ignores` is stored unchanged.
pub fn for_single_dir(dir: PathBuf, ignores: Ignores) -> Self {
  let cwd = Some(dir);
  let paths_to_crawl = vec![PathBuf::from(".")];
  Self {
    paths_to_crawl,
    ignores,
    cwd,
  }
}

pub async fn crawl_paths(self) -> Result<CrawlResult, MedusaCrawlError> {
let Self {
paths_to_crawl,
ignores,
cwd,
} = self;
let cwd = env::current_dir()?;
let cwd: PathBuf = cwd.map(Ok).unwrap_or_else(env::current_dir)?;

/* Validate all paths in a rayon blast before touching the filesystem. */
paths_to_crawl
.par_iter()
.map(|p| {
if p.is_absolute() {
Err(MedusaCrawlFormatError::PathWasAbsolute(p.clone()))
} else {
Ok(())
}
})
.collect::<Result<(), MedusaCrawlFormatError>>()?;

let results: Vec<CrawlResult> = try_join_all(
paths_to_crawl
.into_iter()
.map(|path| Input::Path(ResolvedPath::from_path(path)).crawl_single(&ignores)),
)
let results: Vec<CrawlResult> = try_join_all(paths_to_crawl.into_iter().map(|relative_path| {
Input::Path(ResolvedPath::from_path_and_root(relative_path, &cwd)).crawl_single(&ignores)
}))
.await?;
let mut result = CrawlResult::merge(results);

Expand Down
7 changes: 7 additions & 0 deletions py/medusa_zip/crawl.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#
# Licensed under the Apache License, Version 2.0 (see LICENSE).

# FIXME: remove Optional and Union for future annotations!

from pathlib import Path
from typing import Iterable, Optional, Union

Expand Down Expand Up @@ -51,14 +53,19 @@ class MedusaCrawl:
self,
paths_to_crawl: Iterable[Union[str, Path]],
ignores: Optional[Ignores] = None,
cwd: Optional[Union[str, Path]] = None,
) -> None:
...

@property
def paths_to_crawl(self) -> Iterable[Path]: ...

@property
def ignores(self) -> Ignores: ...

# The `cwd` value given to the constructor (which defaults to None), exposed as
# an optional Path.
@property
def cwd(self) -> Optional[Path]: ...

async def crawl_paths(self) -> CrawlResult: ...

def crawl_paths_sync(self) -> CrawlResult: ...
16 changes: 13 additions & 3 deletions py/src/crawl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,37 +202,45 @@ impl From<lib_crawl::Ignores> for Ignores {
#[pyclass]
#[derive(Clone)]
pub struct MedusaCrawl {
/* FIXME: make these both get and set!!! */
/// Paths to crawl; exposed to Python as a read-only attribute.
#[pyo3(get)]
pub paths_to_crawl: Vec<PathBuf>,
/// Ignore rules for the crawl; exposed to Python as a read-only attribute.
#[pyo3(get)]
pub ignores: Ignores,
/// Optional crawl root, `None` when the constructor received no `cwd`; it is
/// forwarded unchanged when converting into `lib_crawl::MedusaCrawl`.
#[pyo3(get)]
pub cwd: Option<PathBuf>,
}

#[pymethods]
impl MedusaCrawl {
#[new]
fn new(paths_to_crawl: &PyAny, ignores: Option<Ignores>) -> PyResult<Self> {
#[pyo3(signature = (paths_to_crawl, ignores = None, cwd = None))]
fn new(paths_to_crawl: &PyAny, ignores: Option<Ignores>, cwd: Option<&PyAny>) -> PyResult<Self> {
let ignores = ignores.unwrap_or_default();
let paths_to_crawl: Vec<PathBuf> = paths_to_crawl
.iter()?
.map(|p| p.and_then(PyAny::extract::<PathBuf>))
.collect::<PyResult<_>>()?;
let cwd = cwd.map(|cwd| PyAny::extract::<PathBuf>(cwd)).transpose()?;
Ok(Self {
paths_to_crawl,
ignores,
cwd,
})
}

fn __repr__(&self, py: Python<'_>) -> PyResult<String> {
let Self {
paths_to_crawl,
ignores,
cwd,
} = self;
let paths_to_crawl = repr(py, paths_to_crawl.clone())?;
let ignores = repr(py, ignores.clone())?;
let cwd = repr(py, cwd.clone())?;
Ok(format!(
"MedusaCrawl(paths_to_crawl={}, ignores={})",
paths_to_crawl, ignores
"MedusaCrawl(paths_to_crawl={}, ignores={}, cwd={})",
paths_to_crawl, ignores, cwd,
))
}

Expand Down Expand Up @@ -270,10 +278,12 @@ impl From<MedusaCrawl> for lib_crawl::MedusaCrawl {
let MedusaCrawl {
paths_to_crawl,
ignores,
cwd,
} = x;
Self {
paths_to_crawl,
ignores: ignores.into(),
cwd,
}
}
}
Expand Down

0 comments on commit 737dae3

Please sign in to comment.