Merge branch 'master' into issue-11324-prep/linkcheck-tests-http-1.1
jayaddison authored Jul 22, 2023
2 parents 94644fd + bef7fc2 commit 835152a
Showing 13 changed files with 268 additions and 151 deletions.
10 changes: 6 additions & 4 deletions doc/faq.rst
@@ -40,7 +40,7 @@ How do I...
For many more extensions and other contributed stuff, see the sphinx-contrib_
repository.

.. _sphinx-contrib: https://bitbucket.org/birkenfeld/sphinx-contrib/
.. _sphinx-contrib: https://github.com/sphinx-contrib/

.. _usingwith:

@@ -63,7 +63,8 @@ Doxygen

SCons
Glenn Hutchings has written a SCons build script to build Sphinx
documentation; it is hosted here: https://bitbucket.org/zondo/sphinx-scons
documentation; it is hosted here:
https://bitbucket-archive.softwareheritage.org/projects/zo/zondo/sphinx-scons.html

PyPI
Jannis Leidel wrote a `setuptools command
@@ -77,8 +78,9 @@ GitHub Pages
GitHub Pages on building HTML document automatically.

MediaWiki
See https://bitbucket.org/kevindunn/sphinx-wiki/wiki/Home, a project by
Kevin Dunn.
See `sphinx-wiki`_, a project by Kevin Dunn.

.. _sphinx-wiki: https://bitbucket-archive.softwareheritage.org/projects/ke/kevindunn/sphinx-wiki.html

Google Analytics
You can use a custom ``layout.html`` template, like this:
11 changes: 10 additions & 1 deletion doc/man/sphinx-build.rst
@@ -201,10 +201,19 @@ Options
references. See the config value :confval:`nitpick_ignore` for a way to
exclude some references as "known missing".

.. option:: -N
.. option:: -N, --no-color

Do not emit colored output.

.. versionchanged:: 1.6
Add ``--no-color`` long option.

.. option:: --color

Emit colored output. Auto-detected by default.

.. versionadded:: 1.6

.. option:: -v

Increase verbosity (loglevel). This option can be given up to three times
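
As a quick illustration of the long-form options documented above, a build could be driven from a script like this (a sketch only; the ``docs`` source and ``docs/_build/html`` output paths are assumptions, not part of this change)::

    import subprocess

    # Invoke sphinx-build with the long-form --no-color option; the builder
    # name and the source/output directories are placeholder values.
    subprocess.run(
        ["sphinx-build", "--no-color", "-b", "html", "docs", "docs/_build/html"],
        check=True,
    )
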
2 changes: 1 addition & 1 deletion doc/usage/restructuredtext/domains.rst
@@ -2179,5 +2179,5 @@ Jinja_, Operation_, and Scala_.
.. _MATLAB: https://pypi.org/project/sphinxcontrib-matlabdomain/
.. _Operation: https://pypi.org/project/sphinxcontrib-operationdomain/
.. _PHP: https://pypi.org/project/sphinxcontrib-phpdomain/
.. _Ruby: https://bitbucket.org/birkenfeld/sphinx-contrib/src/default/rubydomain
.. _Ruby: https://github.com/sphinx-contrib/rubydomain
.. _Scala: https://pypi.org/project/sphinxcontrib-scaladomain/
6 changes: 1 addition & 5 deletions sphinx/builders/__init__.py
@@ -496,11 +496,7 @@ def read_doc(self, docname: str, *, _cache: bool = True) -> None:
doctree = publisher.document

# store time of reading, for outdated files detection
# (Some filesystems have coarse timestamp resolution;
# therefore time.time() can be older than filesystem's timestamp.
# For example, FAT32 has 2sec timestamp resolution.)
self.env.all_docs[docname] = max(time.time(),
path.getmtime(self.env.doc2path(docname)))
self.env.all_docs[docname] = time.time_ns() // 1_000

# cleanup
self.env.temp_data.clear()
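
The hunk above drops the filesystem-timestamp workaround and records the read time as integer microseconds since the epoch. A standalone sketch of the idea (not Sphinx's code; the outdated-check comparison is an assumption based on the surrounding context)::

    import os
    import tempfile
    import time

    # Record when a source file was read, as integer microseconds since the
    # epoch, mirroring `time.time_ns() // 1_000` in the hunk above.
    with tempfile.NamedTemporaryFile(suffix=".rst", delete=False) as source:
        source.write(b"Example document\n")

    read_time = time.time_ns() // 1_000

    # A document can then be treated as outdated if its file was modified
    # after it was last read (assumed comparison, also in microseconds).
    mtime = int(os.path.getmtime(source.name) * 1_000_000)
    print("outdated:", mtime > read_time)

    os.unlink(source.name)
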
6 changes: 3 additions & 3 deletions sphinx/builders/_epub_base.py
@@ -65,9 +65,9 @@
'.svg': 'image/svg+xml',
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.otf': 'application/x-font-otf',
'.ttf': 'application/x-font-ttf',
'.woff': 'application/font-woff',
'.otf': 'font/otf',
'.ttf': 'font/ttf',
'.woff': 'font/woff',
}

VECTOR_GRAPHICS_EXTENSIONS = ('.svg',)
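
The replacement values are the registered ``font/*`` media types. A toy lookup showing how such a mapping is typically consumed (hypothetical helper, not the builder's code)::

    from os import path

    MEDIA_TYPES = {'.otf': 'font/otf', '.ttf': 'font/ttf', '.woff': 'font/woff'}

    def media_type(filename: str, default: str = 'application/octet-stream') -> str:
        # Map a file's extension to its EPUB media type, with a generic fallback.
        return MEDIA_TYPES.get(path.splitext(filename)[1].lower(), default)

    print(media_type('fonts/OpenSans.woff'))  # -> font/woff
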
8 changes: 5 additions & 3 deletions sphinx/builders/gettext.py
@@ -4,7 +4,7 @@

from codecs import open
from collections import defaultdict
from datetime import datetime, timedelta, tzinfo
from datetime import datetime, timedelta, timezone, tzinfo
from os import getenv, path, walk
from time import time
from typing import Any, Generator, Iterable
@@ -163,8 +163,10 @@ def write_doc(self, docname: str, doctree: nodes.document) -> None:

# determine tzoffset once to remain unaffected by DST change during build
timestamp = time()
tzdelta = datetime.fromtimestamp(timestamp) - \
datetime.utcfromtimestamp(timestamp)
local_time = datetime.fromtimestamp(timestamp)
utc_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)
tzdelta = local_time - utc_time.replace(tzinfo=None)

# set timestamp from SOURCE_DATE_EPOCH if set
# see https://reproducible-builds.org/specs/source-date-epoch/
source_date_epoch = getenv('SOURCE_DATE_EPOCH')
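
The rewritten offset calculation avoids the deprecated ``datetime.utcfromtimestamp`` while keeping the same behaviour: the offset is computed once, from a single timestamp, so a DST change during the build cannot shift later datestamps. A standalone sketch of the same computation::

    from datetime import datetime, timezone
    from time import time

    timestamp = time()
    local_time = datetime.fromtimestamp(timestamp)
    utc_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)

    # Naive local time minus naive UTC time for the same instant gives the
    # local UTC offset as a timedelta.
    tzdelta = local_time - utc_time.replace(tzinfo=None)
    print("local UTC offset:", tzdelta)
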
184 changes: 101 additions & 83 deletions sphinx/builders/linkcheck.py
@@ -13,12 +13,12 @@
from os import path
from queue import PriorityQueue, Queue
from threading import Thread
from typing import Any, Generator, NamedTuple, Tuple, Union, cast
from urllib.parse import unquote, urlparse, urlunparse
from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast
from urllib.parse import unquote, urlparse, urlsplit, urlunparse

from docutils import nodes
from requests import Response
from requests.exceptions import ConnectionError, HTTPError, TooManyRedirects
from requests.exceptions import ConnectionError, HTTPError, SSLError, TooManyRedirects

from sphinx.application import Sphinx
from sphinx.builders.dummy import DummyBuilder
@@ -72,7 +72,7 @@ class RateLimit(NamedTuple):


class AnchorCheckParser(HTMLParser):
"""Specialized HTML parser that looks for a specific anchor."""
"""Specialised HTML parser that looks for a specific anchor."""

def __init__(self, search_anchor: str) -> None:
super().__init__()
@@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None:
break


def check_anchor(response: requests.requests.Response, anchor: str) -> bool:
"""Reads HTML data from a response object `response` searching for `anchor`.
Returns True if anchor was found, False otherwise.
"""
parser = AnchorCheckParser(anchor)
def contains_anchor(response: Response, anchor: str) -> bool:
"""Determine if an anchor is contained within an HTTP response."""

parser = AnchorCheckParser(unquote(anchor))
# Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
@@ -271,7 +270,7 @@ def run(self) -> None:
kwargs['timeout'] = self.config.linkcheck_timeout

def get_request_headers() -> dict[str, str]:
url = urlparse(uri)
                url = urlsplit(uri)
candidates = [f"{url.scheme}://{url.netloc}",
f"{url.scheme}://{url.netloc}/",
uri,
@@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]:
return {}

def check_uri() -> tuple[str, str, int]:
# split off anchor
if '#' in uri:
req_url, anchor = uri.split('#', 1)
for rex in self.anchors_ignore:
if rex.match(anchor):
anchor = None
break
else:
req_url = uri
anchor = None
req_url, delimiter, anchor = uri.partition('#')
for rex in self.anchors_ignore if delimiter and anchor else []:
if rex.match(anchor):
anchor = ''
break

# handle non-ASCII URIs
try:
@@ -313,71 +307,87 @@ def check_uri() -> tuple[str, str, int]:
# update request headers for the URL
kwargs['headers'] = get_request_headers()

try:
if anchor and self.config.linkcheck_anchors:
# Read the whole document and see if #anchor exists
with requests.get(req_url, stream=True, config=self.config, auth=auth_info,
**kwargs) as response:
response.raise_for_status()
found = check_anchor(response, unquote(anchor))

if not found:
raise Exception(__("Anchor '%s' not found") % anchor)
else:
try:
# try a HEAD request first, which should be easier on
# the server and the network
with requests.head(req_url, allow_redirects=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
# Linkcheck HTTP request logic:
#
# - Attempt HTTP HEAD before HTTP GET unless page content is required.
# - Follow server-issued HTTP redirects.
# - Respect server-issued HTTP 429 back-offs.
error_message = None
status_code = -1
response_url = retry_after = ''
for retrieval_method, retrieval_kwargs in _retrieval_methods(
self.config.linkcheck_anchors, anchor,
):
try:
with retrieval_method(url=req_url, auth=auth_info, config=self.config,
**retrieval_kwargs, **kwargs) as response:
if response.ok and anchor and not contains_anchor(response, anchor):
raise Exception(__(f'Anchor {anchor!r} not found'))

# Copy data we need from the (closed) response
status_code = response.status_code
redirect_status_code = response.history[-1].status_code if response.history else None # NoQA: E501
retry_after = response.headers.get('Retry-After')
response_url = f'{response.url}'
response.raise_for_status()
del response
break

except SSLError as err:
# SSL failure; report that the link is broken.
return 'broken', str(err), 0

except (ConnectionError, TooManyRedirects) as err:
# Servers drop the connection on HEAD requests, causing
# ConnectionError.
except (ConnectionError, HTTPError, TooManyRedirects) as err:
if isinstance(err, HTTPError) and err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
with requests.get(req_url, stream=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
else:
error_message = str(err)
continue

except HTTPError as err:
error_message = str(err)

# Unauthorised: the reference probably exists
if status_code == 401:
return 'working', 'unauthorized', 0

# Rate limiting; back-off if allowed, or report failure otherwise
if status_code == 429:
if next_check := self.limit_rate(response_url, retry_after):
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', error_message, 0

# Don't claim success/failure during server-side outages
if status_code == 503:
return 'ignored', 'service unavailable', 0

# For most HTTP failures, continue attempting alternate retrieval methods
continue

except Exception as err:
# Unhandled exception (intermittent or permanent); report that
# the link is broken.
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0

else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
# All available retrieval methods have been exhausted; report
# that the link is broken.
return 'broken', error_message, 0

# Success; clear rate limits for the origin
netloc = urlsplit(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass

if ((response_url.rstrip('/') == req_url.rstrip('/'))
or allowed_redirect(req_url, response_url)):
return 'working', '', 0
elif redirect_status_code is not None:
return 'redirected', response_url, redirect_status_code
else:
new_url = response.url
if anchor:
new_url += '#' + anchor

if allowed_redirect(req_url, new_url):
return 'working', '', 0
elif response.history:
# history contains any redirects, get last
code = response.history[-1].status_code
return 'redirected', new_url, code
else:
return 'redirected', new_url, 0
return 'redirected', response_url, 0

def allowed_redirect(url: str, new_url: str) -> bool:
return any(
@@ -428,7 +438,7 @@ def check(docname: str) -> tuple[str, str, int]:

if uri is None:
break
netloc = urlparse(uri).netloc
netloc = urlsplit(uri).netloc
try:
# Refresh rate limit.
# When there are many links in the queue, workers are all stuck waiting
@@ -451,9 +461,8 @@ def check(docname: str) -> tuple[str, str, int]:
self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def limit_rate(self, response: Response) -> float | None:
def limit_rate(self, response_url: str, retry_after: str) -> float | None:
next_check = None
retry_after = response.headers.get("Retry-After")
if retry_after:
try:
# Integer: time to wait before next attempt.
@@ -471,7 +480,7 @@ def limit_rate(self, response: Response) -> float | None:
delay = (until - datetime.now(timezone.utc)).total_seconds()
else:
next_check = time.time() + delay
netloc = urlparse(response.url).netloc
netloc = urlsplit(response_url).netloc
if next_check is None:
max_delay = self.config.linkcheck_rate_limit_timeout
try:
@@ -490,6 +499,15 @@ def limit_rate(self, response: Response) -> float | None:
return next_check

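
``limit_rate`` now receives the response URL and the raw ``Retry-After`` header value rather than the whole response object. That header can hold either a number of seconds or an HTTP date; a minimal standalone parser for both forms (an illustration under those assumptions, not the builder's exact code) could look like::

    from __future__ import annotations

    import time
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    def parse_retry_after(retry_after: str) -> float | None:
        """Return a POSIX timestamp for the next permitted attempt, or None."""
        try:
            # Integer form: number of seconds to wait before the next attempt.
            delay = float(retry_after)
        except ValueError:
            try:
                # HTTP-date form: an absolute point in time.
                until = parsedate_to_datetime(retry_after)
            except (TypeError, ValueError):
                return None
            if until.tzinfo is None:
                until = until.replace(tzinfo=timezone.utc)
            delay = (until - datetime.now(timezone.utc)).total_seconds()
        return time.time() + delay

    print(parse_retry_after("120"))
    print(parse_retry_after("Fri, 31 Dec 2027 23:59:59 GMT"))
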

def _retrieval_methods(
linkcheck_anchors: bool,
anchor: str,
) -> Iterator[tuple[Callable, dict[str, bool]]]:
if not linkcheck_anchors or not anchor:
yield requests.head, {'allow_redirects': True}
yield requests.get, {'stream': True}


class HyperlinkCollector(SphinxPostTransform):
builders = ('linkcheck',)
default_priority = 800
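
Taken together, ``_retrieval_methods`` and the retry loop in ``check_uri`` try a cheap ``HEAD`` request first (unless the response body is needed to search for an anchor) and fall back to a streamed ``GET`` if that fails. A simplified, self-contained version of that fallback using plain ``requests`` (illustrative only; it omits Sphinx's configuration, authentication, anchor and rate-limit handling)::

    from __future__ import annotations

    from typing import Callable, Iterator

    import requests

    def _retrieval_methods(check_anchors: bool, anchor: str) -> Iterator[tuple[Callable, dict]]:
        # HEAD is attempted first unless the page body must be fetched to
        # look for an anchor; GET is always available as a fallback.
        if not check_anchors or not anchor:
            yield requests.head, {'allow_redirects': True}
        yield requests.get, {'stream': True}

    def check(url: str, anchor: str = '') -> str:
        error = 'no retrieval method succeeded'
        for method, kwargs in _retrieval_methods(check_anchors=True, anchor=anchor):
            try:
                with method(url, timeout=30, **kwargs) as response:
                    response.raise_for_status()
                return 'working'
            except requests.RequestException as err:
                error = str(err)
                continue  # try the next retrieval method, if any
        return f'broken: {error}'

    print(check('https://example.com/'))
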