From 3ad6a71cf84332f72611c5ec0166a9c2da870ef6 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 12 Sep 2024 11:24:09 -0400 Subject: [PATCH] Use globwalk --- Cargo.lock | 2 +- Cargo.toml | 1 + crates/uv-cache-info/Cargo.toml | 2 +- crates/uv-cache-info/src/cache_info.rs | 134 +++++++++--------- crates/uv-distribution/src/error.rs | 2 + .../src/index/built_wheel_index.rs | 3 +- crates/uv-distribution/src/source/mod.rs | 3 +- 7 files changed, 72 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1ac78dd7f5e1..b57616fd7f67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4610,7 +4610,7 @@ name = "uv-cache-info" version = "0.0.1" dependencies = [ "fs-err", - "glob", + "globwalk", "schemars", "serde", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index c8132050e08e..70d70a3ba69c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,6 +91,7 @@ fs-err = { version = "2.11.0" } fs2 = { version = "0.4.3" } futures = { version = "0.3.30" } glob = { version = "0.3.1" } +globwalk = { version = "0.9.1" } goblin = { version = "0.8.2", default-features = false, features = ["std", "elf32", "elf64", "endian_fd"] } hex = { version = "0.4.3" } home = { version = "0.5.9" } diff --git a/crates/uv-cache-info/Cargo.toml b/crates/uv-cache-info/Cargo.toml index db8bff21262e..d1c594a66d09 100644 --- a/crates/uv-cache-info/Cargo.toml +++ b/crates/uv-cache-info/Cargo.toml @@ -14,7 +14,7 @@ workspace = true [dependencies] fs-err = { workspace = true } -glob = { workspace = true } +globwalk = { workspace = true } schemars = { workspace = true, optional = true } serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } diff --git a/crates/uv-cache-info/src/cache_info.rs b/crates/uv-cache-info/src/cache_info.rs index 2ce0dea03aff..a6ee83959b47 100644 --- a/crates/uv-cache-info/src/cache_info.rs +++ b/crates/uv-cache-info/src/cache_info.rs @@ -1,13 +1,19 @@ use crate::commit_info::CacheCommit; use crate::timestamp::Timestamp; -use glob::MatchOptions; use serde::Deserialize; use std::cmp::max; -use std::io; use std::path::{Path, PathBuf}; use tracing::{debug, warn}; +#[derive(Debug, thiserror::Error)] +pub enum CacheInfoError { + #[error("Failed to parse glob patterns for `cache-keys`: {0}")] + Glob(#[from] globwalk::GlobError), + #[error(transparent)] + Io(#[from] std::io::Error), +} + /// The information used to determine whether a built distribution is up-to-date, based on the /// timestamps of relevant files, the current commit of a repository, etc. #[derive(Default, Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] @@ -33,17 +39,17 @@ impl CacheInfo { } /// Compute the cache info for a given path, which may be a file or a directory. - pub fn from_path(path: &Path) -> io::Result { + pub fn from_path(path: &Path) -> Result { let metadata = fs_err::metadata(path)?; if metadata.is_file() { - Self::from_file(path) + Ok(Self::from_file(path)?) } else { Self::from_directory(path) } } /// Compute the cache info for a given directory. - pub fn from_directory(directory: &Path) -> io::Result { + pub fn from_directory(directory: &Path) -> Result { let mut commit = None; let mut timestamp = None; @@ -71,75 +77,34 @@ impl CacheInfo { ] }); - // Incorporate any additional timestamps or VCS information. + // Incorporate timestamps from any direct filepaths. + let mut globs = vec![]; for cache_key in &cache_keys { match cache_key { CacheKey::Path(file) | CacheKey::File { file } => { - if file.chars().any(|c| matches!(c, '*' | '?' | '[')) { - // Treat the path as a glob. - let path = directory.join(file); - let Some(pattern) = path.to_str() else { - warn!("Failed to convert pattern to string: {}", path.display()); + if file.chars().any(|c| matches!(c, '*' | '?' | '[' | '{')) { + // Defer globs to a separate pass. + globs.push(file); + continue; + } + + // Treat the path as a file. + let path = directory.join(file); + let metadata = match path.metadata() { + Ok(metadata) => metadata, + Err(err) => { + warn!("Failed to read metadata for file: {err}"); continue; - }; - let paths = match glob::glob_with( - pattern, - MatchOptions { - case_sensitive: true, - require_literal_separator: true, - require_literal_leading_dot: false, - }, - ) { - Ok(paths) => paths, - Err(err) => { - warn!("Failed to parse glob pattern: {err}"); - continue; - } - }; - for entry in paths { - let entry = match entry { - Ok(entry) => entry, - Err(err) => { - warn!("Failed to read glob entry: {err}"); - continue; - } - }; - let metadata = match entry.metadata() { - Ok(metadata) => metadata, - Err(err) => { - warn!("Failed to read metadata for glob entry: {err}"); - continue; - } - }; - if metadata.is_file() { - timestamp = - max(timestamp, Some(Timestamp::from_metadata(&metadata))); - } else { - warn!( - "Expected file for cache key, but found directory: `{}`", - entry.display() - ); - } - } - } else { - // Treat the path as a file. - let path = directory.join(file); - let metadata = match path.metadata() { - Ok(metadata) => metadata, - Err(err) => { - warn!("Failed to read metadata for file: {err}"); - continue; - } - }; - if metadata.is_file() { - timestamp = max(timestamp, Some(Timestamp::from_metadata(&metadata))); - } else { - warn!( - "Expected file for cache key, but found directory: `{}`", - path.display() - ); } + }; + if !metadata.is_file() { + warn!( + "Expected file for cache key, but found directory: `{}`", + path.display() + ); + continue; } + timestamp = max(timestamp, Some(Timestamp::from_metadata(&metadata))); } CacheKey::Git { git: true } => match CacheCommit::from_repository(directory) { Ok(commit_info) => commit = Some(commit_info), @@ -151,12 +116,43 @@ impl CacheInfo { } } + // If we have any globs, process them in a single pass. + if !globs.is_empty() { + let walker = globwalk::GlobWalkerBuilder::from_patterns(directory, &globs) + .file_type(globwalk::FileType::FILE | globwalk::FileType::SYMLINK) + .build()?; + for entry in walker { + let entry = match entry { + Ok(entry) => entry, + Err(err) => { + warn!("Failed to read glob entry: {err}"); + continue; + } + }; + let metadata = match entry.metadata() { + Ok(metadata) => metadata, + Err(err) => { + warn!("Failed to read metadata for glob entry: {err}"); + continue; + } + }; + if !metadata.is_file() { + warn!( + "Expected file for cache key, but found directory: `{}`", + entry.path().display() + ); + continue; + } + timestamp = max(timestamp, Some(Timestamp::from_metadata(&metadata))); + } + } + Ok(Self { timestamp, commit }) } /// Compute the cache info for a given file, assumed to be a binary or source distribution /// represented as (e.g.) a `.whl` or `.tar.gz` archive. - pub fn from_file(path: impl AsRef) -> Result { + pub fn from_file(path: impl AsRef) -> std::io::Result { let metadata = fs_err::metadata(path.as_ref())?; let timestamp = Timestamp::from_metadata(&metadata); Ok(Self { diff --git a/crates/uv-distribution/src/error.rs b/crates/uv-distribution/src/error.rs index 068bf7c50f72..3a0c12b7f688 100644 --- a/crates/uv-distribution/src/error.rs +++ b/crates/uv-distribution/src/error.rs @@ -46,6 +46,8 @@ pub enum Error { CacheEncode(#[from] rmp_serde::encode::Error), #[error("Failed to walk the distribution cache")] CacheWalk(#[source] walkdir::Error), + #[error(transparent)] + CacheInfo(#[from] uv_cache_info::CacheInfoError), // Build error #[error(transparent)] diff --git a/crates/uv-distribution/src/index/built_wheel_index.rs b/crates/uv-distribution/src/index/built_wheel_index.rs index 1f5caf94ce4b..2138d7f7394f 100644 --- a/crates/uv-distribution/src/index/built_wheel_index.rs +++ b/crates/uv-distribution/src/index/built_wheel_index.rs @@ -132,8 +132,7 @@ impl<'a> BuiltWheelIndex<'a> { }; // If the distribution is stale, omit it from the index. - let cache_info = - CacheInfo::from_directory(&source_dist.install_path).map_err(Error::CacheRead)?; + let cache_info = CacheInfo::from_directory(&source_dist.install_path)?; if cache_info != *pointer.cache_info() { return Ok(None); diff --git a/crates/uv-distribution/src/source/mod.rs b/crates/uv-distribution/src/source/mod.rs index bedf6bab40be..acf53285432a 100644 --- a/crates/uv-distribution/src/source/mod.rs +++ b/crates/uv-distribution/src/source/mod.rs @@ -1112,8 +1112,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> { } // Determine the last-modified time of the source distribution. - let cache_info = - CacheInfo::from_directory(&resource.install_path).map_err(Error::CacheRead)?; + let cache_info = CacheInfo::from_directory(&resource.install_path)?; // Read the existing metadata from the cache. let entry = cache_shard.entry(LOCAL_REVISION);