From 98015907f2a3b96c01a267a2d25a1529f415ac43 Mon Sep 17 00:00:00 2001 From: autoantwort <41973254+autoantwort@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:21:13 +0100 Subject: [PATCH] Ignore casing when processing markdown fragments + check for percent-encoded anchors (#1535) We must also check the fragment before it is percent-decoded as required by the HTML standard. Fixes https://github.com/lycheeverse/lychee/issues/1467 --- fixtures/fragments/file.html | 13 +++++++++---- fixtures/fragments/file1.md | 10 ++++++++++ lychee-bin/tests/cli.rs | 8 ++++---- lychee-lib/src/extract/markdown.rs | 5 +++-- lychee-lib/src/utils/fragment_checker.rs | 18 ++++++++++++++---- 5 files changed, 40 insertions(+), 14 deletions(-) diff --git a/fixtures/fragments/file.html b/fixtures/fragments/file.html index db98b3af58..5ff181f737 100644 --- a/fixtures/fragments/file.html +++ b/fixtures/fragments/file.html @@ -5,8 +5,9 @@ For Testing Fragments -
-

+

+

+

To start let's run away. @@ -15,8 +16,12 @@

Word

-
back we go - doesn't exist + back we go
+ back we go upper does not work
+ id with percent encoding
+ back to Upper-ÄÖö
+ back to öüä encoded
+ doesn't exist
diff --git a/fixtures/fragments/file1.md b/fixtures/fragments/file1.md index 7f63b15890..69af62f4f6 100644 --- a/fixtures/fragments/file1.md +++ b/fixtures/fragments/file1.md @@ -44,4 +44,14 @@ Therefore we put the test into a code block for now to prevent false positives. [Link to another file type](empty_file#fragment) +# Ignore casing + +[Link with wrong casing](#IGNORE-CASING) + +# Fünf süße Äpfel + +[Link to umlauts](#fünf-süße-äpfel) +[Link to umlauts wrong case](#fünf-sÜße-Äpfel) +[Link to umlauts with percent encoding](#f%C3%BCnf-s%C3%BC%C3%9Fe-%C3%A4pfel) + ##### Lets wear a hat: être diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 43288b7531..3734094025 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1673,10 +1673,10 @@ mod cli { .stderr(contains( "fixtures/fragments/file1.md#kebab-case-fragment-1", )) - .stdout(contains("15 Total")) - .stdout(contains("12 OK")) - // 3 failures because of missing fragments - .stdout(contains("3 Errors")); + .stdout(contains("21 Total")) + .stdout(contains("17 OK")) + // 4 failures because of missing fragments + .stdout(contains("4 Errors")); } #[test] diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 2dcb66d6a6..4de6c2c6b2 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -190,10 +190,11 @@ impl HeadingIdGenerator { /// Converts text into kebab case #[must_use] fn into_kebab_case(text: &str) -> String { - text.chars() + text.to_lowercase() + .chars() .filter_map(|ch| { if ch.is_alphanumeric() || ch == '_' || ch == '-' { - Some(ch.to_ascii_lowercase()) + Some(ch) } else if ch.is_whitespace() { Some('-') } else { diff --git a/lychee-lib/src/utils/fragment_checker.rs b/lychee-lib/src/utils/fragment_checker.rs index 064d550ac3..1cc6c772c8 100644 --- a/lychee-lib/src/utils/fragment_checker.rs +++ b/lychee-lib/src/utils/fragment_checker.rs @@ -47,21 +47,31 @@ impl FragmentChecker { let Some(fragment) = 
url.fragment() else { return Ok(true); }; - let fragment = percent_decode_str(fragment).decode_utf8()?; + let mut fragment_decoded = percent_decode_str(fragment).decode_utf8()?; let url_without_frag = Self::remove_fragment(url.clone()); - let extractor = match FileType::from(path) { + let file_type = FileType::from(path); + let extractor = match file_type { FileType::Markdown => extract_markdown_fragments, FileType::Html => extract_html_fragments, FileType::Plaintext => return Ok(true), }; + if file_type == FileType::Markdown { + fragment_decoded = fragment_decoded.to_lowercase().into(); + } match self.cache.lock().await.entry(url_without_frag) { Entry::Vacant(entry) => { let content = fs::read_to_string(path).await?; let file_frags = extractor(&content); - Ok(entry.insert(file_frags).contains(&fragment as &str)) + let contains_fragment = + file_frags.contains(fragment) || file_frags.contains(&fragment_decoded as &str); + entry.insert(file_frags); + Ok(contains_fragment) + } + Entry::Occupied(entry) => { + Ok(entry.get().contains(fragment) + || entry.get().contains(&fragment_decoded as &str)) } - Entry::Occupied(entry) => Ok(entry.get().contains(&fragment as &str)), } }