From 11eee83d92b2beaad855397f86d11a1946dcb06b Mon Sep 17 00:00:00 2001
From: John SJ Anderson
Date: Tue, 13 Aug 2024 16:43:40 -0700
Subject: [PATCH] Anchor regexes and escape potential metachars [#14]

Also add note that the underlying code in the linkchecker uses
`re.match()` to evaluate the regular expression, so there's already an
implicit start-of-string anchor in effect, and something like
`r'google'` will _NOT_ match a URL that merely contains `google`
(e.g. `https://google.com`).
---
 src/conf.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/conf.py b/src/conf.py
index 601235ae..8d8dea85 100644
--- a/src/conf.py
+++ b/src/conf.py
@@ -88,25 +88,31 @@
 
 # -- Linkchecking ------------------------------------------------------------
 
+
+## NOTE: for both sets of regular expressions that follow, the
+## underlying linkchecker code uses `re.match()` to apply them to URLs
+## — so there's already an implicit "only at the beginning of a
+## string" matching happening, and something like a plain `r'google'`
+## regular expression will _NOT_ match all google.com URLs.
 linkcheck_ignore = [
     # we have links to localhost for explanatory purposes; obviously
     # they will never work in the linkchecker
-    r'http://127.0.0.1:\d+',
-    r'http://localhost:\d+',
+    r'^http://127\.0\.0\.1:\d+',
+    r'^http://localhost:\d+',
     # these URLs block the client the linkchecker uses
-    r'https://www.pnas.org/doi/10.1073/pnas.1507071112',
-    r'https://www.ncbi.nlm.nih.gov/books/NBK25501',
+    r'^https://www\.pnas\.org/doi/10\.1073/pnas\.1507071112',
+    r'^https://www\.ncbi\.nlm\.nih\.gov/books/NBK25501',
     # we specifically use this as an example of a link that _won't_ work
-    r'https://nextstrain.org/ncov/gisaid/21L/global/6m/2024-01-10',
+    r'^https://nextstrain\.org/ncov/gisaid/21L/global/6m/2024-01-10',
 ]
 linkcheck_anchors_ignore_for_url = [
     # colorbrewer uses pseudo-anchors, ala Github. Converting the `#`
     # to `?` loads the same page, but it also appends the query
     # string as a pseudo-anchor, so the URL ends up looking very ugly
     # and potentially misleading. Let's just ignore the anchor...
-    r'https://colorbrewer2.org',
+    r'^https://colorbrewer2\.org',
     # Github uses anchor-looking links for highlighting lines but
     # handles the actual resolution with Javascript, so skip anchor
     # checks for Github URLs:
-    r'https://github.com',
+    r'^https://github\.com',
 ]