diff --git a/Cargo.lock b/Cargo.lock index 79c2fa1b06..12f7e2e4bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1760,6 +1760,19 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "globset" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f1ce686646e7f1e19bf7d5533fe443a45dbfb990e00629110797578b42fb19" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.6", + "regex-syntax 0.8.3", +] + [[package]] name = "gloo-timers" version = "0.2.6" @@ -2219,6 +2232,22 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.6", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -2370,16 +2399,6 @@ dependencies = [ "simple_asn1", ] -[[package]] -name = "jwalk" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2735847566356cd2179a2a38264839308f7079fa96e6bd5a42d740460e003c56" -dependencies = [ - "crossbeam", - "rayon", -] - [[package]] name = "kv-log-macro" version = "1.0.7" @@ -2535,8 +2554,8 @@ dependencies = [ "html5gum", "http 1.1.0", "hyper 1.4.1", + "ignore", "ip_network", - "jwalk", "linkify", "log", "octocrab", diff --git a/README.md b/README.md index 57ba948e5e..08beaa1842 100644 --- a/README.md +++ b/README.md @@ -315,7 +315,7 @@ Arguments: Options: -c, --config Configuration file to use - + [default: lychee.toml] -v, --verbose... @@ -333,7 +333,7 @@ Options: --max-cache-age Discard all cached requests older than this duration - + [default: 1d] --dump @@ -344,7 +344,7 @@ Options: --archive Specify the use of a specific web archive. Can be used in combination with `--suggest` - + [possible values: wayback] --suggest @@ -352,17 +352,17 @@ Options: -m, --max-redirects Maximum number of allowed redirects - + [default: 5] --max-retries Maximum number of retries per request - + [default: 3] --max-concurrency Maximum number of concurrent network requests - + [default: 128] -T, --threads @@ -370,7 +370,7 @@ Options: -u, --user-agent User agent - + [default: lychee/x.y.z] -i, --insecure @@ -420,7 +420,7 @@ Options: Test the specified file extensions for URIs when checking files locally. Multiple extensions can be separated by commas. Extensions will be checked in order of appearance. - + Example: --fallback-extensions html,htm,php,asp,aspx,jsp,cgi --header
@@ -428,20 +428,20 @@ Options: -a, --accept A List of accepted status codes for valid links - + The following accept range syntax is supported: [start]..[=]end|code. Some valid examples are: - + - 200..=204 - 200..204 - ..=204 - ..204 - 200 - + Use "lychee --accept '200..=204, 429, 500' ..." to provide a comma- separated list of accepted status codes. This example will accept 200, 201, 202, 203, 204, 429, and 500 as valid status codes. - + [default: 100..=103,200..=299] --include-fragments @@ -449,17 +449,17 @@ Options: -t, --timeout Website timeout in seconds from connect to response finished - + [default: 20] -r, --retry-wait-time Minimum wait time in seconds between retries of failed requests - + [default: 1] -X, --method Request method - + [default: get] -b, --base @@ -470,12 +470,18 @@ Options: --github-token GitHub API token to use when checking github.com links, to avoid rate limiting - + [env: GITHUB_TOKEN] --skip-missing Skip missing input files (default is to error if they don't exist) + --no-ignore + Do not skip files that would otherwise be ignored by '.gitignore', '.ignore', or the global ignore file + + --hidden + Do not skip hidden directories and files + --include-verbatim Find links in verbatim sections like `pre`- and `code` blocks @@ -487,13 +493,13 @@ Options: --mode Set the output display mode. Determines how results are presented in the terminal - + [default: color] [possible values: plain, color, emoji] -f, --format Output format of final status report - + [default: compact] [possible values: compact, detailed, json, markdown, raw] diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs index d6c466271d..4a86924c56 100644 --- a/examples/collect_links/collect_links.rs +++ b/examples/collect_links/collect_links.rs @@ -4,7 +4,6 @@ use std::path::PathBuf; use tokio_stream::StreamExt; #[tokio::main] -#[allow(clippy::trivial_regex)] async fn main() -> Result<()> { // Collect all links from the following inputs let inputs = vec![ @@ -24,6 +23,8 @@ async fn main() -> Result<()> { let links = Collector::new(None) // base .skip_missing_inputs(false) // don't skip missing inputs? (default=false) + .skip_hidden(false) // skip hidden files? (default=true) + .skip_ignored(false) // skip files that are ignored by git? (default=true) .use_html5ever(false) // use html5ever for parsing? (default=false) .collect_links(inputs) // base url or directory .collect::>>() diff --git a/fixtures/hidden/.hidden/file.md b/fixtures/hidden/.hidden/file.md new file mode 100644 index 0000000000..0c0f933fbf --- /dev/null +++ b/fixtures/hidden/.hidden/file.md @@ -0,0 +1 @@ +https://wikipedia.org diff --git a/fixtures/ignore/.ignore b/fixtures/ignore/.ignore new file mode 100644 index 0000000000..54bff1ba51 --- /dev/null +++ b/fixtures/ignore/.ignore @@ -0,0 +1 @@ +ignored-file.md diff --git a/fixtures/ignore/ignored-file.md b/fixtures/ignore/ignored-file.md new file mode 100644 index 0000000000..23af390692 --- /dev/null +++ b/fixtures/ignore/ignored-file.md @@ -0,0 +1 @@ +https://archlinux.org diff --git a/fixtures/ignore/.lycheeignore b/fixtures/lycheeignore/.lycheeignore similarity index 100% rename from fixtures/ignore/.lycheeignore rename to fixtures/lycheeignore/.lycheeignore diff --git a/fixtures/ignore/TEST.md b/fixtures/lycheeignore/TEST.md similarity index 100% rename from fixtures/ignore/TEST.md rename to fixtures/lycheeignore/TEST.md diff --git a/fixtures/ignore/normal-exclude-file b/fixtures/lycheeignore/normal-exclude-file similarity index 100% rename from fixtures/ignore/normal-exclude-file rename to fixtures/lycheeignore/normal-exclude-file diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index a482dfc10a..6a382ba41e 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -292,6 +292,8 @@ async fn run(opts: &LycheeOptions) -> Result { let mut collector = Collector::new(opts.config.base.clone()) .skip_missing_inputs(opts.config.skip_missing) + .skip_hidden(!opts.config.hidden) + .skip_ignored(!opts.config.no_ignore) .include_verbatim(opts.config.include_verbatim) // File a bug if you rely on this envvar! It's going to go away eventually. .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1")); diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ea12c56b1d..b8c23be7f4 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -438,6 +438,17 @@ separated list of accepted status codes. This example will accept 200, 201, #[serde(default)] pub(crate) skip_missing: bool, + /// Do not skip files that would otherwise be ignored by + /// '.gitignore', '.ignore', or the global ignore file. + #[arg(long)] + #[serde(default)] + pub(crate) no_ignore: bool, + + /// Do not skip hidden directories and files. + #[arg(long)] + #[serde(default)] + pub(crate) hidden: bool, + /// Find links in verbatim sections like `pre`- and `code` blocks #[arg(long)] #[serde(default)] diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 65aa8d2f47..9b07ec224c 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -467,6 +467,44 @@ mod cli { cmd.arg(&filename).arg("--skip-missing").assert().success(); } + #[test] + fn test_skips_hidden_files_by_default() { + main_command() + .arg(fixtures_path().join("hidden/")) + .assert() + .success() + .stdout(contains("0 Total")); + } + + #[test] + fn test_include_hidden_file() { + main_command() + .arg(fixtures_path().join("hidden/")) + .arg("--hidden") + .assert() + .success() + .stdout(contains("1 Total")); + } + + #[test] + fn test_skips_ignored_files_by_default() { + main_command() + .arg(fixtures_path().join("ignore/")) + .assert() + .success() + .stdout(contains("0 Total")); + } + + #[test] + fn test_include_ignored_file() { + main_command() + .arg(fixtures_path().join("ignore/")) + .arg("--no-ignore") + .assert() + .success() + .stdout(contains("1 Total")); + } + #[tokio::test] async fn test_glob() -> Result<()> { // using Result to be able to use `?` @@ -755,7 +793,7 @@ mod cli { #[test] fn test_lycheeignore_file() -> Result<()> { let mut cmd = main_command(); - let test_path = fixtures_path().join("ignore"); + let test_path = fixtures_path().join("lycheeignore"); let cmd = cmd .current_dir(test_path) @@ -776,7 +814,7 @@ mod cli { #[test] fn test_lycheeignore_and_exclude_file() -> Result<()> { let mut cmd = main_command(); - let test_path = fixtures_path().join("ignore"); + let test_path = fixtures_path().join("lycheeignore"); let excludes_path = test_path.join("normal-exclude-file"); cmd.current_dir(test_path) diff --git a/lychee-bin/tests/usage.rs b/lychee-bin/tests/usage.rs index b1594891a9..7a1983d3c5 100644 --- a/lychee-bin/tests/usage.rs +++ b/lychee-bin/tests/usage.rs @@ -20,6 +20,22 @@ mod readme { fs::read_to_string(readme_path).unwrap() } + /// Remove line `[default: lychee/x.y.z]` from the string + fn remove_lychee_version_line(string: &str) -> String { + string + .lines() + .filter(|line| !line.contains("[default: lychee/")) + .collect::>() + .join("\n") + } + + fn trim_empty_lines(str: &str) -> String { + str.lines() + .map(|line| if line.trim().is_empty() { "" } else { line }) + .collect::>() + .join("\n") + } + /// Test that the USAGE section in `README.md` is up to date with /// `lychee --help`. /// Only unix: might not work with windows CRLF line-endings returned from @@ -37,13 +53,7 @@ mod readme { .ok_or("Usage not found in help")?; let usage_in_help = &help_output[usage_in_help_start..]; - // Remove line `[default: lychee/0.1.0]` from the help output - let usage_in_help = usage_in_help - .lines() - .filter(|line| !line.contains("[default: lychee/")) - .collect::>() - .join("\n"); - + let usage_in_help = trim_empty_lines(&remove_lychee_version_line(usage_in_help)); let readme = load_readme_text(); let usage_start = readme .find(USAGE_STRING) @@ -52,13 +62,7 @@ mod readme { .find("\n```") .ok_or("End of usage not found in README")?; let usage_in_readme = &readme[usage_start..usage_start + usage_end]; - - // Remove line `[default: lychee/0.1.0]` from the README - let usage_in_readme = usage_in_readme - .lines() - .filter(|line| !line.contains("[default: lychee/")) - .collect::>() - .join("\n"); + let usage_in_readme = remove_lychee_version_line(usage_in_readme); assert_eq!(usage_in_readme, usage_in_help); Ok(()) diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index f0fa878e93..1a0660d61f 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -24,8 +24,8 @@ html5ever = "0.28.0" html5gum = "0.5.7" http = "1.0.0" hyper = "1.3.1" +ignore = "0.4.23" ip_network = "0.4.1" -jwalk = "0.8.1" linkify = "0.10.0" log = "0.4.22" octocrab = "0.39.0" diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index ed048abc9e..40c24ce542 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -11,10 +11,13 @@ use par_stream::ParStreamExt; /// Collector keeps the state of link collection /// It drives the link extraction from inputs +#[allow(clippy::struct_excessive_bools)] #[derive(Debug, Clone)] pub struct Collector { basic_auth_extractor: Option, skip_missing_inputs: bool, + skip_ignored: bool, + skip_hidden: bool, include_verbatim: bool, use_html5ever: bool, base: Option, @@ -29,6 +32,8 @@ impl Collector { skip_missing_inputs: false, include_verbatim: false, use_html5ever: false, + skip_hidden: true, + skip_ignored: true, base, } } @@ -40,6 +45,20 @@ impl Collector { self } + /// Skip files that are hidden + #[must_use] + pub const fn skip_hidden(mut self, yes: bool) -> Self { + self.skip_hidden = yes; + self + } + + /// Skip files that are ignored + #[must_use] + pub const fn skip_ignored(mut self, yes: bool) -> Self { + self.skip_ignored = yes; + self + } + /// Use `html5ever` to parse HTML instead of `html5gum`. #[must_use] pub const fn use_html5ever(mut self, yes: bool) -> Self { @@ -80,11 +99,14 @@ impl Collector { /// /// Will return `Err` if links cannot be extracted from an input pub fn collect_links(self, inputs: Vec) -> impl Stream> { - let skip_missing_inputs = self.skip_missing_inputs; let base = self.base; stream::iter(inputs) .par_then_unordered(None, move |input| async move { - input.get_contents(skip_missing_inputs) + input.get_contents( + self.skip_missing_inputs, + self.skip_hidden, + self.skip_ignored, + ) }) .flatten() .par_then_unordered(None, move |content| { @@ -139,7 +161,10 @@ mod tests { let file_path = temp_dir.path().join("README"); let _file = File::create(&file_path).unwrap(); let input = Input::new(&file_path.as_path().display().to_string(), None, true, None)?; - let contents: Vec<_> = input.get_contents(true).collect::>().await; + let contents: Vec<_> = input + .get_contents(true, true, true) + .collect::>() + .await; assert_eq!(contents.len(), 1); assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Plaintext); @@ -149,7 +174,10 @@ mod tests { #[tokio::test] async fn test_url_without_extension_is_html() -> Result<()> { let input = Input::new("https://example.com/", None, true, None)?; - let contents: Vec<_> = input.get_contents(true).collect::>().await; + let contents: Vec<_> = input + .get_contents(true, true, true) + .collect::>() + .await; assert_eq!(contents.len(), 1); assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Html); diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index f2104741ea..0aa6eddedc 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -96,7 +96,7 @@ pub enum ErrorKind { /// Error while traversing an input directory #[error("Cannot traverse input directory: {0}")] - DirTraversal(#[from] jwalk::Error), + DirTraversal(#[from] ignore::Error), /// The given glob pattern is not valid #[error("UNIX glob pattern is invalid")] diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 47ef22bc2f..c32be7feb8 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -3,7 +3,7 @@ use crate::{utils, ErrorKind, Result}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; -use jwalk::WalkDir; +use ignore::WalkBuilder; use reqwest::Url; use serde::{Deserialize, Serialize}; use shellexpand::tilde; @@ -204,7 +204,12 @@ impl Input { /// Returns an error if the contents can not be retrieved /// because of an underlying I/O error (e.g. an error while making a /// network request or retrieving the contents from the file system) - pub fn get_contents(self, skip_missing: bool) -> impl Stream> { + pub fn get_contents( + self, + skip_missing: bool, + skip_hidden: bool, + skip_gitignored: bool, + ) -> impl Stream> { try_stream! { match self.source { InputSource::RemoteUrl(ref url) => { @@ -226,34 +231,22 @@ impl Input { } InputSource::FsPath(ref path) => { if path.is_dir() { - for entry in WalkDir::new(path).skip_hidden(true) - .process_read_dir(move |_, _, (), children| { - children.retain(|child| { - let Ok(entry) = child.as_ref() else { return true }; - - if self.is_excluded_path(&entry.path()) { - return false; - } - - let file_type = entry.file_type(); - - if file_type.is_dir() { - // Required for recursion - return true; - } - if file_type.is_symlink() { - return false; - } - if !file_type.is_file() { - return false; - } - valid_extension(&entry.path()) - }); - }) { + for entry in WalkBuilder::new(path).standard_filters(skip_gitignored).hidden(skip_hidden).build() { let entry = entry?; - if entry.file_type().is_dir() { + + if self.is_excluded_path(&entry.path().to_path_buf()) { continue; } + + match entry.file_type() { + None => continue, + Some(file_type) => { + if !file_type.is_file() || !valid_extension(entry.path()) { + continue; + } + } + }; + let content = Self::path_content(entry.path()).await?; yield content }