linkcheck: Fix race condition that could lead to checking the availability of the same URL twice

Until now, linkcheck has scanned all references and images from documents and
checked them in parallel.  As a result, the same URL could be checked twice
(or more) because of a race condition.

This change collects the URLs via a post-transform instead, and removes
duplicated URLs before checking their availability.

refs: #4303
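
In short: each post-transform run records the hyperlinks it finds into a
mapping keyed by URI, so a URI that occurs in several documents (or several
times in one document) is queued for checking only once. A minimal sketch of
the idea, with simplified types rather than the builder's exact code:

    from typing import Dict, NamedTuple

    CHECK_IMMEDIATELY = 0.0

    class Hyperlink(NamedTuple):
        next_check: float
        uri: str
        docname: str
        lineno: int

    def collect(hyperlinks: Dict[str, Hyperlink], link: Hyperlink) -> None:
        # First sighting wins: later occurrences of the same URI are dropped,
        # so each URL is handed to the worker threads at most once.
        if link.uri not in hyperlinks:
            hyperlinks[link.uri] = link

Because deduplication happens in a single thread before anything is queued,
the parallel workers can no longer receive duplicate work items.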
Takeshi KOMIYA
2021-01-19 00:23:25 +09:00
parent a7b6b6bb7f
commit cead0f6ddf
6 changed files with 64 additions and 74 deletions

CHANGES

@@ -10,6 +10,7 @@ Incompatible changes
Deprecated
----------

* ``sphinx.builders.linkcheck.node_line_or_0()``
* ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
* ``sphinx.ext.autodoc.directive.DocumenterBridge.reporter``
* ``sphinx.ext.autodoc.importer.get_module_members()``
@@ -57,6 +58,8 @@ Bugs fixed
  + or ^) are used as keystrokes
* #8629: html: A type warning for html_use_opensearch is shown twice
* #8665: html theme: Could not override globaltoc_maxdepth in theme.conf
* #4304: linkcheck: Fix race condition that could lead to checking the
  availability of the same URL twice
* #8094: texinfo: image files on the different directory with document are not
  copied
* #8671: :confval:`highlight_options` is not working

doc/extdev/deprecated.rst

@@ -26,6 +26,11 @@ The following is a list of deprecated interfaces.
     - (will be) Removed
     - Alternatives

   * - ``sphinx.builders.linkcheck.node_line_or_0()``
     - 3.5
     - 5.0
     - ``sphinx.util.nodes.get_node_line()``

   * - ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
     - 3.5
     - 5.0
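
For third-party code that still calls the deprecated helper, the listed
alternative is ``sphinx.util.nodes.get_node_line()``. A hedged migration
sketch (keep the ``or 0`` fallback only if the call site still needs a
plain int):

    from docutils.nodes import Element
    from sphinx.util.nodes import get_node_line

    def node_line_or_0(node: Element) -> int:
        # get_node_line() searches the node and its ancestors for a source
        # line and may return None; fall back to 0 to preserve the
        # always-an-int behaviour of the deprecated helper.
        return get_node_line(node) or 0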

sphinx/builders/linkcheck.py

@@ -14,11 +14,12 @@ import re
import socket
import threading
import time
import warnings
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, cast
from urllib.parse import unquote, urlparse

from docutils import nodes
@@ -28,7 +29,9 @@ from requests.exceptions import HTTPError, TooManyRedirects

from sphinx.application import Sphinx
from sphinx.builders import Builder
from sphinx.deprecation import RemovedInSphinx40Warning
from sphinx.locale import __
from sphinx.transforms.post_transforms import SphinxPostTransform
from sphinx.util import encode_uri, logging, requests
from sphinx.util.console import darkgray, darkgreen, purple, red, turquoise  # type: ignore
from sphinx.util.nodes import get_node_line
@@ -37,6 +40,10 @@ logger = logging.getLogger(__name__)

uri_re = re.compile('([a-z]+:)?//')  # matches to foo:// and // (a protocol relative URL)

Hyperlink = NamedTuple('Hyperlink', (('next_check', float),
                                     ('uri', Optional[str]),
                                     ('docname', Optional[str]),
                                     ('lineno', Optional[int])))
RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))

DEFAULT_REQUEST_HEADERS = {
@@ -52,6 +59,8 @@ def node_line_or_0(node: Element) -> int:
    PriorityQueue items must be comparable. The line number is part of the
    tuple used by the PriorityQueue, keep an homogeneous type for comparison.
    """
    warnings.warn('node_line_or_0() is deprecated.',
                  RemovedInSphinx40Warning, stacklevel=2)
    return get_node_line(node) or 0
@@ -98,6 +107,7 @@ class CheckExternalLinksBuilder(Builder):
                '%(outdir)s/output.txt')

    def init(self) -> None:
        self.hyperlinks = {}  # type: Dict[str, Hyperlink]
        self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
        self.anchors_ignore = [re.compile(x)
                               for x in self.app.config.linkcheck_anchors_ignore]
@@ -406,35 +416,7 @@ class CheckExternalLinksBuilder(Builder):
            return

    def write_doc(self, docname: str, doctree: Node) -> None:
        logger.info('')
        n = 0

        # reference nodes
        for refnode in doctree.traverse(nodes.reference):
            if 'refuri' not in refnode:
                continue
            uri = refnode['refuri']
            lineno = node_line_or_0(refnode)
            uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
            self.wqueue.put(uri_info, False)
            n += 1

        # image nodes
        for imgnode in doctree.traverse(nodes.image):
            uri = imgnode['candidates'].get('?')
            if uri and '://' in uri:
                lineno = node_line_or_0(imgnode)
                uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
                self.wqueue.put(uri_info, False)
                n += 1

        done = 0
        while done < n:
            self.process_result(self.rqueue.get())
            done += 1

        if self.broken:
            self.app.statuscode = 1
        pass
    def write_entry(self, what: str, docname: str, filename: str, line: int,
                    uri: str) -> None:
@@ -447,14 +429,58 @@
            output.write('\n')

    def finish(self) -> None:
        logger.info('')

        n = 0
        for hyperlink in self.hyperlinks.values():
            self.wqueue.put(hyperlink, False)
            n += 1

        done = 0
        while done < n:
            self.process_result(self.rqueue.get())
            done += 1

        if self.broken:
            self.app.statuscode = 1

        self.wqueue.join()
        # Shutdown threads.
        for worker in self.workers:
            self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)


class HyperlinkCollector(SphinxPostTransform):
    builders = ('linkcheck',)
    default_priority = 800

    def run(self, **kwargs: Any) -> None:
        builder = cast(CheckExternalLinksBuilder, self.app.builder)
        hyperlinks = builder.hyperlinks

        # reference nodes
        for refnode in self.document.traverse(nodes.reference):
            if 'refuri' not in refnode:
                continue
            uri = refnode['refuri']
            lineno = get_node_line(refnode)
            uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno)
            if uri not in hyperlinks:
                hyperlinks[uri] = uri_info

        # image nodes
        for imgnode in self.document.traverse(nodes.image):
            uri = imgnode['candidates'].get('?')
            if uri and '://' in uri:
                lineno = get_node_line(imgnode)
                uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno)
                if uri not in hyperlinks:
                    hyperlinks[uri] = uri_info


def setup(app: Sphinx) -> Dict[str, Any]:
    app.add_builder(CheckExternalLinksBuilder)
    app.add_post_transform(HyperlinkCollector)

    app.add_config_value('linkcheck_ignore', [], None)
    app.add_config_value('linkcheck_auth', [], None)
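
A note on why ``Hyperlink`` is a NamedTuple with ``next_check`` first: the
worker queue is a PriorityQueue, and tuples compare field by field, so
pending checks come out ordered by due time. On a tie, comparison falls
through to the later fields, which is why the old code coerced ``lineno`` to
an integer via ``node_line_or_0()``. A toy illustration (not from the commit):

    from queue import PriorityQueue

    wqueue: PriorityQueue = PriorityQueue()
    wqueue.put((5.0, 'https://example.com/b'))
    wqueue.put((1.0, 'https://example.com/a'))

    # Tuples compare element by element: the smallest next_check wins, and
    # equal first fields make Python compare the second field, which must
    # therefore have a homogeneous, comparable type.
    assert wqueue.get() == (1.0, 'https://example.com/a')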

tests/roots/test-linkcheck-localserver-two-links/conf.py

@@ -1 +0,0 @@
exclude_patterns = ['_build']

tests/roots/test-linkcheck-localserver-two-links/index.rst

@@ -1,6 +0,0 @@
.. image:: http://localhost:7777/
   :target: http://localhost:7777/

`weblate.org`_

.. _weblate.org: http://localhost:7777/

tests/test_build_linkcheck.py

@@ -573,40 +573,3 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):
    checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
    next_check = checker.limit_rate(FakeResponse())
    assert next_check is None


@pytest.mark.sphinx(
    'linkcheck', testroot='linkcheck-localserver-two-links', freshenv=True,
)
def test_priorityqueue_items_are_comparable(app):
    with http_server(OKHandler):
        app.builder.build_all()
    content = (app.outdir / 'output.json').read_text()
    rows = [json.loads(x) for x in sorted(content.splitlines())]
    assert rows == [
        {
            'filename': 'index.rst',
            # Should not be None.
            'lineno': 0,
            'status': 'working',
            'code': 0,
            'uri': 'http://localhost:7777/',
            'info': '',
        },
        {
            'filename': 'index.rst',
            'lineno': 0,
            'status': 'working',
            'code': 0,
            'uri': 'http://localhost:7777/',
            'info': '',
        },
        {
            'filename': 'index.rst',
            'lineno': 4,
            'status': 'working',
            'code': 0,
            'uri': 'http://localhost:7777/',
            'info': '',
        }
    ]
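
The deleted test pinned down the pre-fix behaviour: the removed index.rst
points at http://localhost:7777/ three times (image, image target, and
reference), and linkcheck emitted three result rows for it. A hypothetical
post-fix check (sketch only; ``count_checks`` is an illustrative helper, not
part of this commit) would expect one row per URI instead:

    import json

    def count_checks(output_json: str, uri: str) -> int:
        # linkcheck's output.json holds one JSON object per line.
        rows = [json.loads(line) for line in output_json.splitlines()]
        return sum(1 for row in rows if row['uri'] == uri)

    # After deduplication, a page referencing the same URL three times
    # should yield a single result row:
    #     count_checks(content, 'http://localhost:7777/') == 1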