Skip to content

Commit f0ebac2

Browse files
authored
Allow excluding cache based on status code (#1403)
This introduces an option `--cache-exclude-status`, which allows specifying a range of HTTP status codes which will be ignored from the cache. Closes #1400.
1 parent 2a9f11a commit f0ebac2

File tree

12 files changed

+491
-47
lines changed

12 files changed

+491
-47
lines changed

README.md

+16
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,22 @@ Options:
335335
336336
[default: 1d]
337337
338+
--cache-exclude-status <CACHE_EXCLUDE_STATUS>
339+
A list of status codes that will be ignored from the cache
340+
341+
The following range syntax is supported: [start]..[=]end|code. Some valid
342+
examples are:
343+
344+
- 429
345+
- 500..=599
346+
- ..500
347+
348+
Use "lychee --cache-exclude-status '429, 500..502' <inputs>..." to provide a comma-separated
349+
list of excluded status codes. This example will not cache results with a status code of 429, 500
350+
and 501 (the upper bound of 500..502 is exclusive).
351+
352+
[default: ]
353+
338354
--dump
339355
Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked
340356

lychee-bin/src/commands/check.rs

+89-5
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use reqwest::Url;
1010
use tokio::sync::mpsc;
1111
use tokio_stream::wrappers::ReceiverStream;
1212

13-
use lychee_lib::{Client, ErrorKind, Request, Response};
13+
use lychee_lib::{Client, ErrorKind, Request, Response, Uri};
1414
use lychee_lib::{InputSource, Result};
1515
use lychee_lib::{ResponseBody, Status};
1616

@@ -46,6 +46,7 @@ where
4646

4747
let client = params.client;
4848
let cache = params.cache;
49+
let cache_exclude_status = params.cfg.cache_exclude_status.into_set();
4950
let accept = params.cfg.accept.into_set();
5051

5152
let pb = if params.cfg.no_progress || params.cfg.verbose.log_level() >= log::Level::Info {
@@ -61,6 +62,7 @@ where
6162
max_concurrency,
6263
client,
6364
cache,
65+
cache_exclude_status,
6466
accept,
6567
));
6668

@@ -219,14 +221,22 @@ async fn request_channel_task(
219221
max_concurrency: usize,
220222
client: Client,
221223
cache: Arc<Cache>,
224+
cache_exclude_status: HashSet<u16>,
222225
accept: HashSet<u16>,
223226
) {
224227
StreamExt::for_each_concurrent(
225228
ReceiverStream::new(recv_req),
226229
max_concurrency,
227230
|request: Result<Request>| async {
228231
let request = request.expect("cannot read request");
229-
let response = handle(&client, cache.clone(), request, accept.clone()).await;
232+
let response = handle(
233+
&client,
234+
cache.clone(),
235+
cache_exclude_status.clone(),
236+
request,
237+
accept.clone(),
238+
)
239+
.await;
230240

231241
send_resp
232242
.send(response)
@@ -260,6 +270,7 @@ async fn check_url(client: &Client, request: Request) -> Response {
260270
async fn handle(
261271
client: &Client,
262272
cache: Arc<Cache>,
273+
cache_exclude_status: HashSet<u16>,
263274
request: Request,
264275
accept: HashSet<u16>,
265276
) -> Response {
@@ -287,16 +298,37 @@ async fn handle(
287298
// benefit.
288299
// - Skip caching unsupported URLs as they might be supported in a
289300
// future run.
290-
// - Skip caching excluded links; they might not be excluded in the next run
301+
// - Skip caching excluded links; they might not be excluded in the next run.
302+
// - Skip caching links for which the status code has been explicitly excluded from the cache.
291303
let status = response.status();
292-
if uri.is_file() || status.is_excluded() || status.is_unsupported() || status.is_unknown() {
304+
if ignore_cache(&uri, status, &cache_exclude_status) {
293305
return response;
294306
}
295307

296308
cache.insert(uri, status.into());
297309
response
298310
}
299311

312+
/// Returns `true` if the response should be ignored in the cache.
313+
///
314+
/// The response should be ignored if:
315+
/// - The URI is a file URI.
316+
/// - The status is excluded.
317+
/// - The status is unsupported.
318+
/// - The status is unknown.
319+
/// - The status code is excluded from the cache.
320+
fn ignore_cache(uri: &Uri, status: &Status, cache_exclude_status: &HashSet<u16>) -> bool {
321+
let status_code_excluded = status
322+
.code()
323+
.map_or(false, |code| cache_exclude_status.contains(&code.as_u16()));
324+
325+
uri.is_file()
326+
|| status.is_excluded()
327+
|| status.is_unsupported()
328+
|| status.is_unknown()
329+
|| status_code_excluded
330+
}
331+
300332
fn show_progress(
301333
output: &mut dyn Write,
302334
progress_bar: &Option<ProgressBar>,
@@ -344,8 +376,9 @@ fn get_failed_urls(stats: &mut ResponseStats) -> Vec<(InputSource, Url)> {
344376
#[cfg(test)]
345377
mod tests {
346378
use crate::{formatters::get_response_formatter, options};
379+
use http::StatusCode;
347380
use log::info;
348-
use lychee_lib::{CacheStatus, ClientBuilder, InputSource, Uri};
381+
use lychee_lib::{CacheStatus, ClientBuilder, ErrorKind, InputSource, Uri};
349382

350383
use super::*;
351384

@@ -406,4 +439,55 @@ mod tests {
406439
Status::Error(ErrorKind::InvalidURI(_))
407440
));
408441
}
442+
443+
#[test]
444+
fn test_cache_by_default() {
445+
assert!(!ignore_cache(
446+
&Uri::try_from("https://[::1]").unwrap(),
447+
&Status::Ok(StatusCode::OK),
448+
&HashSet::default()
449+
));
450+
}
451+
452+
#[test]
453+
// Cache is ignored for file URLs
454+
fn test_cache_ignore_file_urls() {
455+
assert!(ignore_cache(
456+
&Uri::try_from("file:///home").unwrap(),
457+
&Status::Ok(StatusCode::OK),
458+
&HashSet::default()
459+
));
460+
}
461+
462+
#[test]
463+
// Cache is ignored for unsupported status
464+
fn test_cache_ignore_unsupported_status() {
465+
assert!(ignore_cache(
466+
&Uri::try_from("https://[::1]").unwrap(),
467+
&Status::Unsupported(ErrorKind::EmptyUrl),
468+
&HashSet::default()
469+
));
470+
}
471+
472+
#[test]
473+
// Cache is ignored for unknown status
474+
fn test_cache_ignore_unknown_status() {
475+
assert!(ignore_cache(
476+
&Uri::try_from("https://[::1]").unwrap(),
477+
&Status::UnknownStatusCode(StatusCode::IM_A_TEAPOT),
478+
&HashSet::default()
479+
));
480+
}
481+
482+
#[test]
483+
fn test_cache_ignore_excluded_status() {
484+
// Cache is ignored for excluded status codes
485+
let exclude = [StatusCode::OK.as_u16()].iter().copied().collect();
486+
487+
assert!(ignore_cache(
488+
&Uri::try_from("https://[::1]").unwrap(),
489+
&Status::Ok(StatusCode::OK),
490+
&exclude
491+
));
492+
}
409493
}

lychee-bin/src/options.rs

+39-6
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ use clap::builder::PossibleValuesParser;
66
use clap::{arg, builder::TypedValueParser, Parser};
77
use const_format::{concatcp, formatcp};
88
use lychee_lib::{
9-
AcceptSelector, Base, BasicAuthSelector, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
10-
DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
9+
Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS,
10+
DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
1111
};
1212
use secrecy::{ExposeSecret, SecretString};
1313
use serde::Deserialize;
@@ -145,7 +145,8 @@ default_function! {
145145
retry_wait_time: usize = DEFAULT_RETRY_WAIT_TIME_SECS;
146146
method: String = DEFAULT_METHOD.to_string();
147147
verbosity: Verbosity = Verbosity::default();
148-
accept_selector: AcceptSelector = AcceptSelector::default();
148+
cache_exclude_selector: StatusCodeExcluder = StatusCodeExcluder::new();
149+
accept_selector: StatusCodeSelector = StatusCodeSelector::default();
149150
}
150151

151152
// Macro for merging configuration values
@@ -231,6 +232,26 @@ pub(crate) struct Config {
231232
#[serde(with = "humantime_serde")]
232233
pub(crate) max_cache_age: Duration,
233234

235+
/// A list of status codes that will be excluded from the cache
236+
#[arg(
237+
long,
238+
default_value_t,
239+
long_help = "A list of status codes that will be ignored from the cache
240+
241+
The following range syntax is supported: [start]..[=]end|code. Some valid
242+
examples are:
243+
244+
- 429
245+
- 500..=599
246+
- ..500
247+
248+
Use \"lychee --cache-exclude-status '429, 500..502' <inputs>...\" to provide a comma-separated
249+
list of excluded status codes. This example will not cache results with a status code of 429, 500
250+
and 501 (the upper bound of 500..502 is exclusive)."
251+
)]
252+
#[serde(default = "cache_exclude_selector")]
253+
pub(crate) cache_exclude_status: StatusCodeExcluder,
254+
234255
/// Don't perform any link checking.
235256
/// Instead, dump all the links extracted from inputs that would be checked
236257
#[arg(long)]
@@ -394,7 +415,7 @@ separated list of accepted status codes. This example will accept 200, 201,
394415
202, 203, 204, 429, and 500 as valid status codes."
395416
)]
396417
#[serde(default = "accept_selector")]
397-
pub(crate) accept: AcceptSelector,
418+
pub(crate) accept: StatusCodeSelector,
398419

399420
/// Enable the checking of fragments in links.
400421
#[arg(long)]
@@ -509,6 +530,7 @@ impl Config {
509530
max_retries: DEFAULT_MAX_RETRIES;
510531
max_concurrency: DEFAULT_MAX_CONCURRENCY;
511532
max_cache_age: humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap();
533+
cache_exclude_status: StatusCodeExcluder::default();
512534
threads: None;
513535
user_agent: DEFAULT_USER_AGENT;
514536
insecure: false;
@@ -538,7 +560,7 @@ impl Config {
538560
require_https: false;
539561
cookie_jar: None;
540562
include_fragments: false;
541-
accept: AcceptSelector::default();
563+
accept: StatusCodeSelector::default();
542564
}
543565

544566
if self
@@ -564,7 +586,7 @@ mod tests {
564586
#[test]
565587
fn test_accept_status_codes() {
566588
let toml = Config {
567-
accept: AcceptSelector::from_str("200..=204, 429, 500").unwrap(),
589+
accept: StatusCodeSelector::from_str("200..=204, 429, 500").unwrap(),
568590
..Default::default()
569591
};
570592

@@ -577,4 +599,15 @@ mod tests {
577599
assert!(cli.accept.contains(204));
578600
assert!(!cli.accept.contains(205));
579601
}
602+
603+
#[test]
604+
fn test_default() {
605+
let cli = Config::default();
606+
607+
assert_eq!(
608+
cli.accept,
609+
StatusCodeSelector::from_str("100..=103,200..=299").expect("no error")
610+
);
611+
assert_eq!(cli.cache_exclude_status, StatusCodeExcluder::new());
612+
}
580613
}

lychee-bin/tests/cli.rs

+59
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,65 @@ mod cli {
895895
Ok(())
896896
}
897897

898+
#[tokio::test]
899+
async fn test_lycheecache_exclude_custom_status_codes() -> Result<()> {
900+
let base_path = fixtures_path().join("cache");
901+
let cache_file = base_path.join(LYCHEE_CACHE_FILE);
902+
903+
// Unconditionally remove cache file if it exists
904+
let _ = fs::remove_file(&cache_file);
905+
906+
let mock_server_ok = mock_server!(StatusCode::OK);
907+
let mock_server_no_content = mock_server!(StatusCode::NO_CONTENT);
908+
let mock_server_too_many_requests = mock_server!(StatusCode::TOO_MANY_REQUESTS);
909+
910+
let dir = tempfile::tempdir()?;
911+
let mut file = File::create(dir.path().join("c.md"))?;
912+
913+
writeln!(file, "{}", mock_server_ok.uri().as_str())?;
914+
writeln!(file, "{}", mock_server_no_content.uri().as_str())?;
915+
writeln!(file, "{}", mock_server_too_many_requests.uri().as_str())?;
916+
917+
let mut cmd = main_command();
918+
let test_cmd = cmd
919+
.current_dir(&base_path)
920+
.arg(dir.path().join("c.md"))
921+
.arg("--verbose")
922+
.arg("--no-progress")
923+
.arg("--cache")
924+
.arg("--cache-exclude-status")
925+
.arg("204,429");
926+
927+
assert!(
928+
!cache_file.exists(),
929+
"cache file should not exist before this test"
930+
);
931+
932+
// run first without cache to generate the cache file
933+
test_cmd
934+
.assert()
935+
.stderr(contains(format!("[200] {}/\n", mock_server_ok.uri())))
936+
.stderr(contains(format!(
937+
"[204] {}/ | OK (204 No Content): No Content\n",
938+
mock_server_no_content.uri()
939+
)))
940+
.stderr(contains(format!(
941+
"[429] {}/ | Failed: Network error: Too Many Requests\n",
942+
mock_server_too_many_requests.uri()
943+
)));
944+
945+
// check content of cache file
946+
let data = fs::read_to_string(&cache_file)?;
947+
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
948+
assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri())));
949+
assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri())));
950+
951+
// clear the cache file
952+
fs::remove_file(&cache_file)?;
953+
954+
Ok(())
955+
}
956+
898957
#[tokio::test]
899958
async fn test_lycheecache_accept_custom_status_codes() -> Result<()> {
900959
let base_path = fixtures_path().join("cache_accept_custom_status_codes");

lychee-lib/src/lib.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,9 @@ pub use crate::{
9595
collector::Collector,
9696
filter::{Excludes, Filter, Includes},
9797
types::{
98-
uri::valid::Uri, AcceptRange, AcceptRangeError, AcceptSelector, Base, BasicAuthCredentials,
98+
uri::valid::Uri, AcceptRange, AcceptRangeError, Base, BasicAuthCredentials,
9999
BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent,
100-
InputSource, Request, Response, ResponseBody, Result, Status,
100+
InputSource, Request, Response, ResponseBody, Result, Status, StatusCodeExcluder,
101+
StatusCodeSelector,
101102
},
102103
};

lychee-lib/src/types/accept/mod.rs

-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
11
mod range;
2-
mod selector;
32

43
pub use range::*;
5-
pub use selector::*;

lychee-lib/src/types/accept/range.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ use thiserror::Error;
77
static RANGE_PATTERN: Lazy<Regex> =
88
Lazy::new(|| Regex::new(r"^([0-9]{3})?\.\.(=?)([0-9]{3})+$|^([0-9]{3})$").unwrap());
99

10-
/// The [`AcceptRangeParseError`] indicates that the parsing process of an
11-
/// [`AcceptRange`] from a string failed due to various underlying reasons.
10+
/// Indicates that the parsing process of an [`AcceptRange`] from a string
11+
/// failed due to various underlying reasons.
1212
#[derive(Debug, Error, PartialEq)]
1313
pub enum AcceptRangeError {
1414
/// The string input didn't contain any range pattern.

0 commit comments

Comments
 (0)