Merge branch 'master' into issue-11324-prep/linkcheck-tests-http-1.1
jayaddison authored Jul 22, 2023
2 parents 94644fd + bef7fc2 commit 835152a
Showing 13 changed files with 268 additions and 151 deletions.
10 changes: 6 additions & 4 deletions doc/faq.rst
@@ -40,7 +40,7 @@ How do I...
For many more extensions and other contributed stuff, see the sphinx-contrib_
repository.

.. _sphinx-contrib: https://bitbucket.org/birkenfeld/sphinx-contrib/
.. _sphinx-contrib: https://github.com/sphinx-contrib/

.. _usingwith:

@@ -63,7 +63,8 @@ Doxygen

SCons
Glenn Hutchings has written a SCons build script to build Sphinx
documentation; it is hosted here: https://bitbucket.org/zondo/sphinx-scons
documentation; it is hosted here:
https://bitbucket-archive.softwareheritage.org/projects/zo/zondo/sphinx-scons.html

PyPI
Jannis Leidel wrote a `setuptools command
@@ -77,8 +78,9 @@ GitHub Pages
GitHub Pages on building HTML document automatically.

MediaWiki
See https://bitbucket.org/kevindunn/sphinx-wiki/wiki/Home, a project by
Kevin Dunn.
See `sphinx-wiki`_, a project by Kevin Dunn.

.. _sphinx-wiki: https://bitbucket-archive.softwareheritage.org/projects/ke/kevindunn/sphinx-wiki.html

Google Analytics
You can use a custom ``layout.html`` template, like this:
11 changes: 10 additions & 1 deletion doc/man/sphinx-build.rst
@@ -201,10 +201,19 @@ Options
references. See the config value :confval:`nitpick_ignore` for a way to
exclude some references as "known missing".

.. option:: -N
.. option:: -N, --no-color

Do not emit colored output.

.. versionchanged:: 1.6
Add ``--no-color`` long option.

.. option:: --color

Emit colored output. Auto-detected by default.

.. versionadded:: 1.6

.. option:: -v

Increase verbosity (loglevel). This option can be given up to three times
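
As a quick illustration of the long-form options documented above, a build could be driven from a script like this (a sketch only; the ``docs`` source and ``docs/_build/html`` output paths are assumptions, not part of this change)::

    import subprocess

    # Invoke sphinx-build with the long-form --no-color option; the builder
    # name and the source/output directories are placeholder values.
    subprocess.run(
        ["sphinx-build", "--no-color", "-b", "html", "docs", "docs/_build/html"],
        check=True,
    )
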
2 changes: 1 addition & 1 deletion doc/usage/restructuredtext/domains.rst
@@ -2179,5 +2179,5 @@ Jinja_, Operation_, and Scala_.
.. _MATLAB: https://pypi.org/project/sphinxcontrib-matlabdomain/
.. _Operation: https://pypi.org/project/sphinxcontrib-operationdomain/
.. _PHP: https://pypi.org/project/sphinxcontrib-phpdomain/
.. _Ruby: https://bitbucket.org/birkenfeld/sphinx-contrib/src/default/rubydomain
.. _Ruby: https://github.com/sphinx-contrib/rubydomain
.. _Scala: https://pypi.org/project/sphinxcontrib-scaladomain/
6 changes: 1 addition & 5 deletions sphinx/builders/__init__.py
@@ -496,11 +496,7 @@ def read_doc(self, docname: str, *, _cache: bool = True) -> None:
doctree = publisher.document

# store time of reading, for outdated files detection
# (Some filesystems have coarse timestamp resolution;
# therefore time.time() can be older than filesystem's timestamp.
# For example, FAT32 has 2sec timestamp resolution.)
self.env.all_docs[docname] = max(time.time(),
path.getmtime(self.env.doc2path(docname)))
self.env.all_docs[docname] = time.time_ns() // 1_000

# cleanup
self.env.temp_data.clear()
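
The hunk above drops the filesystem-timestamp workaround and records the read time as integer microseconds since the epoch. A standalone sketch of the idea (not Sphinx's code; the outdated-check comparison is an assumption based on the surrounding context)::

    import os
    import tempfile
    import time

    # Record when a source file was read, as integer microseconds since the
    # epoch, mirroring `time.time_ns() // 1_000` in the hunk above.
    with tempfile.NamedTemporaryFile(suffix=".rst", delete=False) as source:
        source.write(b"Example document\n")

    read_time = time.time_ns() // 1_000

    # A document can then be treated as outdated if its file was modified
    # after it was last read (assumed comparison, also in microseconds).
    mtime = int(os.path.getmtime(source.name) * 1_000_000)
    print("outdated:", mtime > read_time)

    os.unlink(source.name)
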
6 changes: 3 additions & 3 deletions sphinx/builders/_epub_base.py
@@ -65,9 +65,9 @@
'.svg': 'image/svg+xml',
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.otf': 'application/x-font-otf',
'.ttf': 'application/x-font-ttf',
'.woff': 'application/font-woff',
'.otf': 'font/otf',
'.ttf': 'font/ttf',
'.woff': 'font/woff',
}

VECTOR_GRAPHICS_EXTENSIONS = ('.svg',)
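
The replacement values are the registered ``font/*`` media types. A toy lookup showing how such a mapping is typically consumed (hypothetical helper, not the builder's code)::

    from os import path

    MEDIA_TYPES = {'.otf': 'font/otf', '.ttf': 'font/ttf', '.woff': 'font/woff'}

    def media_type(filename: str, default: str = 'application/octet-stream') -> str:
        # Map a file's extension to its EPUB media type, with a generic fallback.
        return MEDIA_TYPES.get(path.splitext(filename)[1].lower(), default)

    print(media_type('fonts/OpenSans.woff'))  # -> font/woff
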
8 changes: 5 additions & 3 deletions sphinx/builders/gettext.py
@@ -4,7 +4,7 @@

from codecs import open
from collections import defaultdict
from datetime import datetime, timedelta, tzinfo
from datetime import datetime, timedelta, timezone, tzinfo
from os import getenv, path, walk
from time import time
from typing import Any, Generator, Iterable
@@ -163,8 +163,10 @@ def write_doc(self, docname: str, doctree: nodes.document) -> None:

# determine tzoffset once to remain unaffected by DST change during build
timestamp = time()
tzdelta = datetime.fromtimestamp(timestamp) - \
datetime.utcfromtimestamp(timestamp)
local_time = datetime.fromtimestamp(timestamp)
utc_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)
tzdelta = local_time - utc_time.replace(tzinfo=None)

# set timestamp from SOURCE_DATE_EPOCH if set
# see https://reproducible-builds.org/specs/source-date-epoch/
source_date_epoch = getenv('SOURCE_DATE_EPOCH')
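
The rewritten offset calculation avoids the deprecated ``datetime.utcfromtimestamp`` while keeping the same behaviour: the offset is computed once, from a single timestamp, so a DST change during the build cannot shift later datestamps. A standalone sketch of the same computation::

    from datetime import datetime, timezone
    from time import time

    timestamp = time()
    local_time = datetime.fromtimestamp(timestamp)
    utc_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)

    # Naive local time minus naive UTC time for the same instant gives the
    # local UTC offset as a timedelta.
    tzdelta = local_time - utc_time.replace(tzinfo=None)
    print("local UTC offset:", tzdelta)
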
184 changes: 101 additions & 83 deletions sphinx/builders/linkcheck.py
@@ -13,12 +13,12 @@
from os import path
from queue import PriorityQueue, Queue
from threading import Thread
from typing import Any, Generator, NamedTuple, Tuple, Union, cast
from urllib.parse import unquote, urlparse, urlunparse
from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast
from urllib.parse import unquote, urlparse, urlsplit, urlunparse

from docutils import nodes
from requests import Response
from requests.exceptions import ConnectionError, HTTPError, TooManyRedirects
from requests.exceptions import ConnectionError, HTTPError, SSLError, TooManyRedirects

from sphinx.application import Sphinx
from sphinx.builders.dummy import DummyBuilder
@@ -72,7 +72,7 @@ class RateLimit(NamedTuple):


class AnchorCheckParser(HTMLParser):
"""Specialized HTML parser that looks for a specific anchor."""
"""Specialised HTML parser that looks for a specific anchor."""

def __init__(self, search_anchor: str) -> None:
super().__init__()
@@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None:
break


def check_anchor(response: requests.requests.Response, anchor: str) -> bool:
"""Reads HTML data from a response object `response` searching for `anchor`.
Returns True if anchor was found, False otherwise.
"""
parser = AnchorCheckParser(anchor)
def contains_anchor(response: Response, anchor: str) -> bool:
"""Determine if an anchor is contained within an HTTP response."""

parser = AnchorCheckParser(unquote(anchor))
# Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
@@ -271,7 +270,7 @@ def run(self) -> None:
kwargs['timeout'] = self.config.linkcheck_timeout

def get_request_headers() -> dict[str, str]:
url = urlparse(uri)
                url = urlsplit(uri)
candidates = [f"{url.scheme}://{url.netloc}",
f"{url.scheme}://{url.netloc}/",
uri,
@@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]:
return {}

def check_uri() -> tuple[str, str, int]:
# split off anchor
if '#' in uri:
req_url, anchor = uri.split('#', 1)
for rex in self.anchors_ignore:
if rex.match(anchor):
anchor = None
break
else:
req_url = uri
anchor = None
req_url, delimiter, anchor = uri.partition('#')
for rex in self.anchors_ignore if delimiter and anchor else []:
if rex.match(anchor):
anchor = ''
break

# handle non-ASCII URIs
try:
@@ -313,71 +307,87 @@ def check_uri() -> tuple[str, str, int]:
# update request headers for the URL
kwargs['headers'] = get_request_headers()

try:
if anchor and self.config.linkcheck_anchors:
# Read the whole document and see if #anchor exists
with requests.get(req_url, stream=True, config=self.config, auth=auth_info,
**kwargs) as response:
response.raise_for_status()
found = check_anchor(response, unquote(anchor))

if not found:
raise Exception(__("Anchor '%s' not found") % anchor)
else:
try:
# try a HEAD request first, which should be easier on
# the server and the network
with requests.head(req_url, allow_redirects=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
# Linkcheck HTTP request logic:
#
# - Attempt HTTP HEAD before HTTP GET unless page content is required.
# - Follow server-issued HTTP redirects.
# - Respect server-issued HTTP 429 back-offs.
error_message = None
status_code = -1
response_url = retry_after = ''
for retrieval_method, retrieval_kwargs in _retrieval_methods(
self.config.linkcheck_anchors, anchor,
):
try:
with retrieval_method(url=req_url, auth=auth_info, config=self.config,
**retrieval_kwargs, **kwargs) as response:
if response.ok and anchor and not contains_anchor(response, anchor):
raise Exception(__(f'Anchor {anchor!r} not found'))

# Copy data we need from the (closed) response
status_code = response.status_code
redirect_status_code = response.history[-1].status_code if response.history else None # NoQA: E501
retry_after = response.headers.get('Retry-After')
response_url = f'{response.url}'
response.raise_for_status()
del response
break

except SSLError as err:
# SSL failure; report that the link is broken.
return 'broken', str(err), 0

except (ConnectionError, TooManyRedirects) as err:
# Servers drop the connection on HEAD requests, causing
# ConnectionError.
except (ConnectionError, HTTPError, TooManyRedirects) as err:
if isinstance(err, HTTPError) and err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
with requests.get(req_url, stream=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
else:
error_message = str(err)
continue

except HTTPError as err:
error_message = str(err)

# Unauthorised: the reference probably exists
if status_code == 401:
return 'working', 'unauthorized', 0

# Rate limiting; back-off if allowed, or report failure otherwise
if status_code == 429:
if next_check := self.limit_rate(response_url, retry_after):
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', error_message, 0

# Don't claim success/failure during server-side outages
if status_code == 503:
return 'ignored', 'service unavailable', 0

# For most HTTP failures, continue attempting alternate retrieval methods
continue

except Exception as err:
# Unhandled exception (intermittent or permanent); report that
# the link is broken.
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0

else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
# All available retrieval methods have been exhausted; report
# that the link is broken.
return 'broken', error_message, 0

# Success; clear rate limits for the origin
netloc = urlsplit(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass

if ((response_url.rstrip('/') == req_url.rstrip('/'))
or allowed_redirect(req_url, response_url)):
return 'working', '', 0
elif redirect_status_code is not None:
return 'redirected', response_url, redirect_status_code
else:
new_url = response.url
if anchor:
new_url += '#' + anchor

if allowed_redirect(req_url, new_url):
return 'working', '', 0
elif response.history:
# history contains any redirects, get last
code = response.history[-1].status_code
return 'redirected', new_url, code
else:
return 'redirected', new_url, 0
return 'redirected', response_url, 0

def allowed_redirect(url: str, new_url: str) -> bool:
return any(
@@ -428,7 +438,7 @@ def check(docname: str) -> tuple[str, str, int]:

if uri is None:
break
netloc = urlparse(uri).netloc
netloc = urlsplit(uri).netloc
try:
# Refresh rate limit.
# When there are many links in the queue, workers are all stuck waiting
@@ -451,9 +461,8 @@ def check(docname: str) -> tuple[str, str, int]:
self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def limit_rate(self, response: Response) -> float | None:
def limit_rate(self, response_url: str, retry_after: str) -> float | None:
next_check = None
retry_after = response.headers.get("Retry-After")
if retry_after:
try:
# Integer: time to wait before next attempt.
@@ -471,7 +480,7 @@ def limit_rate(self, response: Response) -> float | None:
delay = (until - datetime.now(timezone.utc)).total_seconds()
else:
next_check = time.time() + delay
netloc = urlparse(response.url).netloc
netloc = urlsplit(response_url).netloc
if next_check is None:
max_delay = self.config.linkcheck_rate_limit_timeout
try:
@@ -490,6 +499,15 @@ def limit_rate(self, response: Response) -> float | None:
return next_check

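
``limit_rate`` now receives the response URL and the raw ``Retry-After`` header value rather than the whole response object. That header can hold either a number of seconds or an HTTP date; a minimal standalone parser for both forms (an illustration under those assumptions, not the builder's exact code) could look like::

    from __future__ import annotations

    import time
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    def parse_retry_after(retry_after: str) -> float | None:
        """Return a POSIX timestamp for the next permitted attempt, or None."""
        try:
            # Integer form: number of seconds to wait before the next attempt.
            delay = float(retry_after)
        except ValueError:
            try:
                # HTTP-date form: an absolute point in time.
                until = parsedate_to_datetime(retry_after)
            except (TypeError, ValueError):
                return None
            if until.tzinfo is None:
                until = until.replace(tzinfo=timezone.utc)
            delay = (until - datetime.now(timezone.utc)).total_seconds()
        return time.time() + delay

    print(parse_retry_after("120"))
    print(parse_retry_after("Fri, 31 Dec 2027 23:59:59 GMT"))
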

def _retrieval_methods(
linkcheck_anchors: bool,
anchor: str,
) -> Iterator[tuple[Callable, dict[str, bool]]]:
if not linkcheck_anchors or not anchor:
yield requests.head, {'allow_redirects': True}
yield requests.get, {'stream': True}


class HyperlinkCollector(SphinxPostTransform):
builders = ('linkcheck',)
default_priority = 800
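
Taken together, ``_retrieval_methods`` and the retry loop in ``check_uri`` try a cheap ``HEAD`` request first (unless the response body is needed to search for an anchor) and fall back to a streamed ``GET`` if that fails. A simplified, self-contained version of that fallback using plain ``requests`` (illustrative only; it omits Sphinx's configuration, authentication, anchor and rate-limit handling)::

    from __future__ import annotations

    from typing import Callable, Iterator

    import requests

    def _retrieval_methods(check_anchors: bool, anchor: str) -> Iterator[tuple[Callable, dict]]:
        # HEAD is attempted first unless the page body must be fetched to
        # look for an anchor; GET is always available as a fallback.
        if not check_anchors or not anchor:
            yield requests.head, {'allow_redirects': True}
        yield requests.get, {'stream': True}

    def check(url: str, anchor: str = '') -> str:
        error = 'no retrieval method succeeded'
        for method, kwargs in _retrieval_methods(check_anchors=True, anchor=anchor):
            try:
                with method(url, timeout=30, **kwargs) as response:
                    response.raise_for_status()
                return 'working'
            except requests.RequestException as err:
                error = str(err)
                continue  # try the next retrieval method, if any
        return f'broken: {error}'

    print(check('https://example.com/'))
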