Skip to content

Commit

Permalink
Allow excluding cache based on status code (#1403)
Browse files Browse the repository at this point in the history
This introduces an option `--cache-exclude-status`, which allows specifying a range of HTTP status codes which will be ignored from the cache.

Closes #1400.
  • Loading branch information
dmathieu authored and mre committed Oct 27, 2024
1 parent d0034f3 commit ac16ae7
Show file tree
Hide file tree
Showing 12 changed files with 491 additions and 47 deletions.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,22 @@ Options:
[default: 1d]
--cache-exclude-status <CACHE_EXCLUDE_STATUS>
A list of status codes that will be ignored from the cache
The following accept range syntax is supported: [start]..[=]end|code. Some valid
examples are:
- 429
- 500..=599
- 500..
Use "lychee --cache-exclude-status '429, 500..502' <inputs>..." to provide a comma- separated
list of excluded status codes. This example will not cache results with a status code of 429, 500,
501 and 502.
[default: ]
--dump
Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked
Expand Down
94 changes: 89 additions & 5 deletions lychee-bin/src/commands/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use reqwest::Url;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

use lychee_lib::{Client, ErrorKind, Request, Response};
use lychee_lib::{Client, ErrorKind, Request, Response, Uri};
use lychee_lib::{InputSource, Result};
use lychee_lib::{ResponseBody, Status};

Expand Down Expand Up @@ -46,6 +46,7 @@ where

let client = params.client;
let cache = params.cache;
let cache_exclude_status = params.cfg.cache_exclude_status.into_set();
let accept = params.cfg.accept.into_set();

let pb = if params.cfg.no_progress || params.cfg.verbose.log_level() >= log::Level::Info {
Expand All @@ -61,6 +62,7 @@ where
max_concurrency,
client,
cache,
cache_exclude_status,
accept,
));

Expand Down Expand Up @@ -219,14 +221,22 @@ async fn request_channel_task(
max_concurrency: usize,
client: Client,
cache: Arc<Cache>,
cache_exclude_status: HashSet<u16>,
accept: HashSet<u16>,
) {
StreamExt::for_each_concurrent(
ReceiverStream::new(recv_req),
max_concurrency,
|request: Result<Request>| async {
let request = request.expect("cannot read request");
let response = handle(&client, cache.clone(), request, accept.clone()).await;
let response = handle(
&client,
cache.clone(),
cache_exclude_status.clone(),
request,
accept.clone(),
)
.await;

send_resp
.send(response)
Expand Down Expand Up @@ -260,6 +270,7 @@ async fn check_url(client: &Client, request: Request) -> Response {
async fn handle(
client: &Client,
cache: Arc<Cache>,
cache_exclude_status: HashSet<u16>,
request: Request,
accept: HashSet<u16>,
) -> Response {
Expand Down Expand Up @@ -287,16 +298,37 @@ async fn handle(
// benefit.
// - Skip caching unsupported URLs as they might be supported in a
// future run.
// - Skip caching excluded links; they might not be excluded in the next run
// - Skip caching excluded links; they might not be excluded in the next run.
// - Skip caching links for which the status code has been explicitly excluded from the cache.
let status = response.status();
if uri.is_file() || status.is_excluded() || status.is_unsupported() || status.is_unknown() {
if ignore_cache(&uri, status, &cache_exclude_status) {
return response;
}

cache.insert(uri, status.into());
response
}

/// Decides whether a response must be kept out of the cache.
///
/// Evaluates to `true` (do not cache) as soon as one of these holds:
/// - `uri` is a file URI,
/// - `status` is excluded, unsupported or unknown,
/// - `status` carries a status code that is listed in `cache_exclude_status`.
///
/// A status without a status code is never matched against
/// `cache_exclude_status`.
fn ignore_cache(uri: &Uri, status: &Status, cache_exclude_status: &HashSet<u16>) -> bool {
    // Conditions that do not depend on the user-supplied exclusion set.
    if uri.is_file() || status.is_excluded() || status.is_unsupported() || status.is_unknown() {
        return true;
    }

    // Only a status that actually has a code can hit the exclusion set.
    match status.code() {
        Some(code) => cache_exclude_status.contains(&code.as_u16()),
        None => false,
    }
}

fn show_progress(
output: &mut dyn Write,
progress_bar: &Option<ProgressBar>,
Expand Down Expand Up @@ -352,8 +384,9 @@ fn get_failed_urls(stats: &mut ResponseStats) -> Vec<(InputSource, Url)> {
#[cfg(test)]
mod tests {
use crate::{formatters::get_response_formatter, options};
use http::StatusCode;
use log::info;
use lychee_lib::{CacheStatus, ClientBuilder, InputSource, Uri};
use lychee_lib::{CacheStatus, ClientBuilder, ErrorKind, InputSource, Uri};

use super::*;

Expand Down Expand Up @@ -414,4 +447,55 @@ mod tests {
Status::Error(ErrorKind::InvalidURI(_))
));
}

// A successful response for a non-file URI is cached when no status
// codes are excluded.
#[test]
fn test_cache_by_default() {
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::Ok(StatusCode::OK);
    let no_exclusions: HashSet<u16> = HashSet::new();

    assert!(!ignore_cache(&uri, &status, &no_exclusions));
}

// File URLs bypass the cache, even with a successful status.
#[test]
fn test_cache_ignore_file_urls() {
    let uri = Uri::try_from("file:///home").unwrap();
    let status = Status::Ok(StatusCode::OK);
    let no_exclusions: HashSet<u16> = HashSet::new();

    assert!(ignore_cache(&uri, &status, &no_exclusions));
}

// An unsupported status bypasses the cache.
#[test]
fn test_cache_ignore_unsupported_status() {
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::Unsupported(ErrorKind::EmptyUrl);
    let no_exclusions: HashSet<u16> = HashSet::new();

    assert!(ignore_cache(&uri, &status, &no_exclusions));
}

// An unknown status code bypasses the cache.
#[test]
fn test_cache_ignore_unknown_status() {
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::UnknownStatusCode(StatusCode::IM_A_TEAPOT);
    let no_exclusions: HashSet<u16> = HashSet::new();

    assert!(ignore_cache(&uri, &status, &no_exclusions));
}

// A status whose code is listed in the exclusion set bypasses the cache.
#[test]
fn test_cache_ignore_excluded_status() {
    let exclude = HashSet::from([StatusCode::OK.as_u16()]);
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::Ok(StatusCode::OK);

    assert!(ignore_cache(&uri, &status, &exclude));
}
}
45 changes: 39 additions & 6 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ use clap::builder::PossibleValuesParser;
use clap::{arg, builder::TypedValueParser, Parser};
use const_format::{concatcp, formatcp};
use lychee_lib::{
AcceptSelector, Base, BasicAuthSelector, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS,
DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
};
use secrecy::{ExposeSecret, SecretString};
use serde::Deserialize;
Expand Down Expand Up @@ -145,7 +145,8 @@ default_function! {
retry_wait_time: usize = DEFAULT_RETRY_WAIT_TIME_SECS;
method: String = DEFAULT_METHOD.to_string();
verbosity: Verbosity = Verbosity::default();
accept_selector: AcceptSelector = AcceptSelector::default();
cache_exclude_selector: StatusCodeExcluder = StatusCodeExcluder::new();
accept_selector: StatusCodeSelector = StatusCodeSelector::default();
}

// Macro for merging configuration values
Expand Down Expand Up @@ -231,6 +232,26 @@ pub(crate) struct Config {
#[serde(with = "humantime_serde")]
pub(crate) max_cache_age: Duration,

/// A list of status codes that will be excluded from the cache
#[arg(
long,
default_value_t,
long_help = "A list of status codes that will be ignored from the cache
The following accept range syntax is supported: [start]..[=]end|code. Some valid
examples are:
- 429
- 500..=599
- 500..
Use \"lychee --cache-exclude-status '429, 500..502' <inputs>...\" to provide a comma- separated
list of excluded status codes. This example will not cache results with a status code of 429, 500,
501 and 502."
)]
#[serde(default = "cache_exclude_selector")]
pub(crate) cache_exclude_status: StatusCodeExcluder,

/// Don't perform any link checking.
/// Instead, dump all the links extracted from inputs that would be checked
#[arg(long)]
Expand Down Expand Up @@ -394,7 +415,7 @@ separated list of accepted status codes. This example will accept 200, 201,
202, 203, 204, 429, and 500 as valid status codes."
)]
#[serde(default = "accept_selector")]
pub(crate) accept: AcceptSelector,
pub(crate) accept: StatusCodeSelector,

/// Enable the checking of fragments in links.
#[arg(long)]
Expand Down Expand Up @@ -509,6 +530,7 @@ impl Config {
max_retries: DEFAULT_MAX_RETRIES;
max_concurrency: DEFAULT_MAX_CONCURRENCY;
max_cache_age: humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap();
cache_exclude_status: StatusCodeExcluder::default();
threads: None;
user_agent: DEFAULT_USER_AGENT;
insecure: false;
Expand Down Expand Up @@ -538,7 +560,7 @@ impl Config {
require_https: false;
cookie_jar: None;
include_fragments: false;
accept: AcceptSelector::default();
accept: StatusCodeSelector::default();
}

if self
Expand All @@ -564,7 +586,7 @@ mod tests {
#[test]
fn test_accept_status_codes() {
let toml = Config {
accept: AcceptSelector::from_str("200..=204, 429, 500").unwrap(),
accept: StatusCodeSelector::from_str("200..=204, 429, 500").unwrap(),
..Default::default()
};

Expand All @@ -577,4 +599,15 @@ mod tests {
assert!(cli.accept.contains(204));
assert!(!cli.accept.contains(205));
}

// The default configuration accepts the codes 100..=103 and 200..=299, and
// its cache exclusion list equals a freshly constructed `StatusCodeExcluder`.
#[test]
fn test_default() {
    let expected_accept =
        StatusCodeSelector::from_str("100..=103,200..=299").expect("no error");
    let expected_cache_exclude = StatusCodeExcluder::new();

    let cli = Config::default();

    assert_eq!(cli.accept, expected_accept);
    assert_eq!(cli.cache_exclude_status, expected_cache_exclude);
}
}
59 changes: 59 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,65 @@ mod cli {
Ok(())
}

/// Checks that responses whose status code is passed via
/// `--cache-exclude-status` are not written to the cache file, while a
/// response with a non-excluded status code still is.
#[tokio::test]
async fn test_lycheecache_exclude_custom_status_codes() -> Result<()> {
let base_path = fixtures_path().join("cache");
let cache_file = base_path.join(LYCHEE_CACHE_FILE);

// Unconditionally remove the cache file if it exists; the result is
// deliberately ignored because a missing file is fine here.
let _ = fs::remove_file(&cache_file);

// One mock server per status code under test: 200, 204 and 429.
let mock_server_ok = mock_server!(StatusCode::OK);
let mock_server_no_content = mock_server!(StatusCode::NO_CONTENT);
let mock_server_too_many_requests = mock_server!(StatusCode::TOO_MANY_REQUESTS);

// Write the three mock server URLs into a temporary Markdown input file.
let dir = tempfile::tempdir()?;
let mut file = File::create(dir.path().join("c.md"))?;

writeln!(file, "{}", mock_server_ok.uri().as_str())?;
writeln!(file, "{}", mock_server_no_content.uri().as_str())?;
writeln!(file, "{}", mock_server_too_many_requests.uri().as_str())?;

// Enable caching, but exclude the status codes 204 and 429 from the cache.
let mut cmd = main_command();
let test_cmd = cmd
.current_dir(&base_path)
.arg(dir.path().join("c.md"))
.arg("--verbose")
.arg("--no-progress")
.arg("--cache")
.arg("--cache-exclude-status")
.arg("204,429");

assert!(
!cache_file.exists(),
"cache file should not exist before this test"
);

// Run with no pre-existing cache file; this run is expected to create it.
// All three links must still be checked and reported on stderr.
test_cmd
.assert()
.stderr(contains(format!("[200] {}/\n", mock_server_ok.uri())))
.stderr(contains(format!(
"[204] {}/ | OK (204 No Content): No Content\n",
mock_server_no_content.uri()
)))
.stderr(contains(format!(
"[429] {}/ | Failed: Network error: Too Many Requests\n",
mock_server_too_many_requests.uri()
)));

// Only the 200 response may appear in the cache file; the 204 and 429
// responses were excluded and must be absent.
let data = fs::read_to_string(&cache_file)?;
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri())));
assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri())));

// Clean up the generated cache file.
fs::remove_file(&cache_file)?;

Ok(())
}

#[tokio::test]
async fn test_lycheecache_accept_custom_status_codes() -> Result<()> {
let base_path = fixtures_path().join("cache_accept_custom_status_codes");
Expand Down
5 changes: 3 additions & 2 deletions lychee-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ pub use crate::{
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{
uri::valid::Uri, AcceptRange, AcceptRangeError, AcceptSelector, Base, BasicAuthCredentials,
uri::valid::Uri, AcceptRange, AcceptRangeError, Base, BasicAuthCredentials,
BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent,
InputSource, Request, Response, ResponseBody, Result, Status,
InputSource, Request, Response, ResponseBody, Result, Status, StatusCodeExcluder,
StatusCodeSelector,
},
};
2 changes: 0 additions & 2 deletions lychee-lib/src/types/accept/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
mod range;
mod selector;

pub use range::*;
pub use selector::*;
4 changes: 2 additions & 2 deletions lychee-lib/src/types/accept/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ use thiserror::Error;
static RANGE_PATTERN: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^([0-9]{3})?\.\.(=?)([0-9]{3})+$|^([0-9]{3})$").unwrap());

/// The [`AcceptRangeParseError`] indicates that the parsing process of an
/// [`AcceptRange`] from a string failed due to various underlying reasons.
/// Indicates that the parsing process of an [`AcceptRange`] from a string
/// failed due to various underlying reasons.
#[derive(Debug, Error, PartialEq)]
pub enum AcceptRangeError {
/// The string input didn't contain any range pattern.
Expand Down
Loading

0 comments on commit ac16ae7

Please sign in to comment.