linkcheck: Fix race condition that could lead to checking the availability of the same URL twice

Until now, linkcheck has scanned all references and images from documents and
checked them in parallel.  As a result, the same URL could be checked twice
(or more) because of a race condition.

This change collects the URLs via a post-transform instead, and removes
duplicated URLs before checking their availability.

refs: #4303
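
In short: each post-transform run records the hyperlinks it finds into a
mapping keyed by URI, so a URI that occurs in several documents (or several
times in one document) is queued for checking only once. A minimal sketch of
the idea, with simplified types rather than the builder's exact code:

    from typing import Dict, NamedTuple

    CHECK_IMMEDIATELY = 0.0

    class Hyperlink(NamedTuple):
        next_check: float
        uri: str
        docname: str
        lineno: int

    def collect(hyperlinks: Dict[str, Hyperlink], link: Hyperlink) -> None:
        # First sighting wins: later occurrences of the same URI are dropped,
        # so each URL is handed to the worker threads at most once.
        if link.uri not in hyperlinks:
            hyperlinks[link.uri] = link

Because deduplication happens in a single thread before anything is queued,
the parallel workers can no longer receive duplicate work items.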
Takeshi KOMIYA
2021-01-19 00:23:25 +09:00
parent a7b6b6bb7f
commit cead0f6ddf
6 changed files with 64 additions and 74 deletions

CHANGES

@@ -10,6 +10,7 @@ Incompatible changes
Deprecated
----------

* ``sphinx.builders.linkcheck.node_line_or_0()``
* ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
* ``sphinx.ext.autodoc.directive.DocumenterBridge.reporter``
* ``sphinx.ext.autodoc.importer.get_module_members()``
@@ -57,6 +58,8 @@ Bugs fixed
  + or ^) are used as keystrokes
* #8629: html: A type warning for html_use_opensearch is shown twice
* #8665: html theme: Could not override globaltoc_maxdepth in theme.conf
* #4304: linkcheck: Fix race condition that could lead to checking the
  availability of the same URL twice
* #8094: texinfo: image files on the different directory with document are not
  copied
* #8671: :confval:`highlight_options` is not working

doc/extdev/deprecated.rst

@@ -26,6 +26,11 @@ The following is a list of deprecated interfaces.
     - (will be) Removed
     - Alternatives

   * - ``sphinx.builders.linkcheck.node_line_or_0()``
     - 3.5
     - 5.0
     - ``sphinx.util.nodes.get_node_line()``

   * - ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
     - 3.5
     - 5.0
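
For third-party code that still calls the deprecated helper, the listed
alternative is ``sphinx.util.nodes.get_node_line()``. A hedged migration
sketch (keep the ``or 0`` fallback only if the call site still needs a
plain int):

    from docutils.nodes import Element
    from sphinx.util.nodes import get_node_line

    def node_line_or_0(node: Element) -> int:
        # get_node_line() searches the node and its ancestors for a source
        # line and may return None; fall back to 0 to preserve the
        # always-an-int behaviour of the deprecated helper.
        return get_node_line(node) or 0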

sphinx/builders/linkcheck.py

@@ -14,11 +14,12 @@ import re
import socket
import threading
import time
import warnings
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, cast
from urllib.parse import unquote, urlparse

from docutils import nodes
@@ -28,7 +29,9 @@ from requests.exceptions import HTTPError, TooManyRedirects

from sphinx.application import Sphinx
from sphinx.builders import Builder
from sphinx.deprecation import RemovedInSphinx40Warning
from sphinx.locale import __
from sphinx.transforms.post_transforms import SphinxPostTransform
from sphinx.util import encode_uri, logging, requests
from sphinx.util.console import darkgray, darkgreen, purple, red, turquoise  # type: ignore
from sphinx.util.nodes import get_node_line
@@ -37,6 +40,10 @@ logger = logging.getLogger(__name__)

uri_re = re.compile('([a-z]+:)?//')  # matches to foo:// and // (a protocol relative URL)

Hyperlink = NamedTuple('Hyperlink', (('next_check', float),
                                     ('uri', Optional[str]),
                                     ('docname', Optional[str]),
                                     ('lineno', Optional[int])))
RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))

DEFAULT_REQUEST_HEADERS = {
@@ -52,6 +59,8 @@ def node_line_or_0(node: Element) -> int:
    PriorityQueue items must be comparable. The line number is part of the
    tuple used by the PriorityQueue, keep an homogeneous type for comparison.
    """
    warnings.warn('node_line_or_0() is deprecated.',
                  RemovedInSphinx40Warning, stacklevel=2)
    return get_node_line(node) or 0
@@ -98,6 +107,7 @@ class CheckExternalLinksBuilder(Builder):
                '%(outdir)s/output.txt')

    def init(self) -> None:
        self.hyperlinks = {}  # type: Dict[str, Hyperlink]
        self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
        self.anchors_ignore = [re.compile(x)
                               for x in self.app.config.linkcheck_anchors_ignore]
@@ -406,35 +416,7 @@ class CheckExternalLinksBuilder(Builder):
            return

    def write_doc(self, docname: str, doctree: Node) -> None:
        logger.info('')
        n = 0

        # reference nodes
        for refnode in doctree.traverse(nodes.reference):
            if 'refuri' not in refnode:
                continue
            uri = refnode['refuri']
            lineno = node_line_or_0(refnode)
            uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
            self.wqueue.put(uri_info, False)
            n += 1

        # image nodes
        for imgnode in doctree.traverse(nodes.image):
            uri = imgnode['candidates'].get('?')
            if uri and '://' in uri:
                lineno = node_line_or_0(imgnode)
                uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
                self.wqueue.put(uri_info, False)
                n += 1

        done = 0
        while done < n:
            self.process_result(self.rqueue.get())
            done += 1

        if self.broken:
            self.app.statuscode = 1
        pass
    def write_entry(self, what: str, docname: str, filename: str, line: int,
                    uri: str) -> None:
@@ -447,14 +429,58 @@
            output.write('\n')

    def finish(self) -> None:
        logger.info('')

        n = 0
        for hyperlink in self.hyperlinks.values():
            self.wqueue.put(hyperlink, False)
            n += 1

        done = 0
        while done < n:
            self.process_result(self.rqueue.get())
            done += 1

        if self.broken:
            self.app.statuscode = 1

        self.wqueue.join()
        # Shutdown threads.
        for worker in self.workers:
            self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)


class HyperlinkCollector(SphinxPostTransform):
    builders = ('linkcheck',)
    default_priority = 800

    def run(self, **kwargs: Any) -> None:
        builder = cast(CheckExternalLinksBuilder, self.app.builder)
        hyperlinks = builder.hyperlinks

        # reference nodes
        for refnode in self.document.traverse(nodes.reference):
            if 'refuri' not in refnode:
                continue
            uri = refnode['refuri']
            lineno = get_node_line(refnode)
            uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno)
            if uri not in hyperlinks:
                hyperlinks[uri] = uri_info

        # image nodes
        for imgnode in self.document.traverse(nodes.image):
            uri = imgnode['candidates'].get('?')
            if uri and '://' in uri:
                lineno = get_node_line(imgnode)
                uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno)
                if uri not in hyperlinks:
                    hyperlinks[uri] = uri_info


def setup(app: Sphinx) -> Dict[str, Any]:
    app.add_builder(CheckExternalLinksBuilder)
    app.add_post_transform(HyperlinkCollector)

    app.add_config_value('linkcheck_ignore', [], None)
    app.add_config_value('linkcheck_auth', [], None)
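
A note on why ``Hyperlink`` is a NamedTuple with ``next_check`` first: the
worker queue is a PriorityQueue, and tuples compare field by field, so
pending checks come out ordered by due time. On a tie, comparison falls
through to the later fields, which is why the old code coerced ``lineno`` to
an integer via ``node_line_or_0()``. A toy illustration (not from the commit):

    from queue import PriorityQueue

    wqueue: PriorityQueue = PriorityQueue()
    wqueue.put((5.0, 'https://example.com/b'))
    wqueue.put((1.0, 'https://example.com/a'))

    # Tuples compare element by element: the smallest next_check wins, and
    # equal first fields make Python compare the second field, which must
    # therefore have a homogeneous, comparable type.
    assert wqueue.get() == (1.0, 'https://example.com/a')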

tests/roots/test-linkcheck-localserver-two-links/conf.py

@@ -1 +0,0 @@
exclude_patterns = ['_build']

tests/roots/test-linkcheck-localserver-two-links/index.rst

@@ -1,6 +0,0 @@
.. image:: http://localhost:7777/
   :target: http://localhost:7777/

`weblate.org`_

.. _weblate.org: http://localhost:7777/

tests/test_build_linkcheck.py

@@ -573,40 +573,3 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):
    checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
    next_check = checker.limit_rate(FakeResponse())
    assert next_check is None


@pytest.mark.sphinx(
    'linkcheck', testroot='linkcheck-localserver-two-links', freshenv=True,
)
def test_priorityqueue_items_are_comparable(app):
    with http_server(OKHandler):
        app.builder.build_all()
    content = (app.outdir / 'output.json').read_text()
    rows = [json.loads(x) for x in sorted(content.splitlines())]
    assert rows == [
        {
            'filename': 'index.rst',
            # Should not be None.
            'lineno': 0,
            'status': 'working',
            'code': 0,
            'uri': 'http://localhost:7777/',
            'info': '',
        },
        {
            'filename': 'index.rst',
            'lineno': 0,
            'status': 'working',
            'code': 0,
            'uri': 'http://localhost:7777/',
            'info': '',
        },
        {
            'filename': 'index.rst',
            'lineno': 4,
            'status': 'working',
            'code': 0,
            'uri': 'http://localhost:7777/',
            'info': '',
        }
    ]
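
The deleted test pinned down the pre-fix behaviour: the removed index.rst
points at http://localhost:7777/ three times (image, image target, and
reference), and linkcheck emitted three result rows for it. A hypothetical
post-fix check (sketch only; ``count_checks`` is an illustrative helper, not
part of this commit) would expect one row per URI instead:

    import json

    def count_checks(output_json: str, uri: str) -> int:
        # linkcheck's output.json holds one JSON object per line.
        rows = [json.loads(line) for line in output_json.splitlines()]
        return sum(1 for row in rows if row['uri'] == uri)

    # After deduplication, a page referencing the same URL three times
    # should yield a single result row:
    #     count_checks(content, 'http://localhost:7777/') == 1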