refactor: linkcheck: Separate thread manager feature from builder class

To reduce the complexity of the linkcheck builder, this extracts
the thread management feature from the builder class into a new class,
HyperlinkAvailabilityChecker.
This commit is contained in:
Takeshi KOMIYA 2021-02-04 02:06:38 +09:00
parent 30bc4d450a
commit 5c223d20d6
4 changed files with 127 additions and 49 deletions

View File

@ -23,7 +23,10 @@ Deprecated
* ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.broken`` * ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.broken``
* ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.good`` * ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.good``
* ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.redirected`` * ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.redirected``
* ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.rqueue``
* ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.to_ignore`` * ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.to_ignore``
* ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.workers``
* ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.wqueue``
* ``sphinx.builders.linkcheck.node_line_or_0()`` * ``sphinx.builders.linkcheck.node_line_or_0()``
* ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()`` * ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
* ``sphinx.ext.autodoc.directive.DocumenterBridge.reporter`` * ``sphinx.ext.autodoc.directive.DocumenterBridge.reporter``

View File

@ -52,11 +52,26 @@ The following is a list of deprecated interfaces.
- 5.0 - 5.0
- N/A - N/A
* - ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.rqueue``
- 3.5
- 5.0
- N/A
* - ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.to_ignore`` * - ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.to_ignore``
- 3.5 - 3.5
- 5.0 - 5.0
- N/A - N/A
* - ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.workers``
- 3.5
- 5.0
- N/A
* - ``sphinx.builders.linkcheck.CheckExternalLinksBuilder.wqueue``
- 3.5
- 5.0
- N/A
* - ``sphinx.builders.linkcheck.node_line_or_0()`` * - ``sphinx.builders.linkcheck.node_line_or_0()``
- 3.5 - 3.5
- 5.0 - 5.0

View File

@ -19,7 +19,7 @@ from email.utils import parsedate_to_datetime
from html.parser import HTMLParser from html.parser import HTMLParser
from os import path from os import path
from threading import Thread from threading import Thread
from typing import Any, Dict, List, NamedTuple, Optional, Pattern, Set, Tuple, cast from typing import Any, Dict, Generator, List, NamedTuple, Optional, Pattern, Set, Tuple, cast
from urllib.parse import unquote, urlparse from urllib.parse import unquote, urlparse
from docutils import nodes from docutils import nodes
@ -121,17 +121,8 @@ class CheckExternalLinksBuilder(DummyBuilder):
socket.setdefaulttimeout(5.0) socket.setdefaulttimeout(5.0)
# create queues and worker threads # create queues and worker threads
self.rate_limits = {} # type: Dict[str, RateLimit] self._wqueue = queue.PriorityQueue() # type: queue.PriorityQueue
self.wqueue = queue.PriorityQueue() # type: queue.PriorityQueue self._rqueue = queue.Queue() # type: queue.Queue
self.rqueue = queue.Queue() # type: queue.Queue
self.workers = [] # type: List[Thread]
for i in range(self.config.linkcheck_workers):
thread = HyperlinkAvailabilityCheckWorker(self)
thread.start()
self.workers.append(thread)
def is_ignored_uri(self, uri: str) -> bool:
return any(pat.match(uri) for pat in self.to_ignore)
@property @property
def anchors_ignore(self) -> List[Pattern]: def anchors_ignore(self) -> List[Pattern]:
@ -202,7 +193,31 @@ class CheckExternalLinksBuilder(DummyBuilder):
RemovedInSphinx50Warning, RemovedInSphinx50Warning,
stacklevel=2, stacklevel=2,
) )
return HyperlinkAvailabilityCheckWorker(self).limit_rate(response) return HyperlinkAvailabilityCheckWorker(self, None, None, {}).limit_rate(response)
@property
def rqueue(self) -> queue.Queue:
    """Deprecated accessor for the builder's result queue.

    ``rqueue`` used to be a plain instance attribute, so it is exposed as a
    read-only property (taking no arguments) to keep attribute-style access
    such as ``builder.rqueue.get()`` working while the deprecation warning
    is emitted.

    :returns: the queue stored in ``self._rqueue``
    """
    warnings.warn(
        "%s.%s is deprecated." % (self.__class__.__name__, "rqueue"),
        RemovedInSphinx50Warning,
        stacklevel=2,  # attribute the warning to the code reading the attribute
    )
    return self._rqueue
@property
def workers(self) -> List[Thread]:
    """Deprecated accessor for the builder's worker threads.

    ``workers`` used to be a plain instance attribute, so it is exposed as a
    read-only property (taking no arguments) to keep attribute-style access
    working while the deprecation warning is emitted.

    :returns: always an empty list; this accessor does not track any threads
    """
    warnings.warn(
        "%s.%s is deprecated." % (self.__class__.__name__, "workers"),
        RemovedInSphinx50Warning,
        stacklevel=2,  # attribute the warning to the code reading the attribute
    )
    return []
@property
def wqueue(self) -> queue.Queue:
    """Deprecated accessor for the builder's work queue.

    ``wqueue`` used to be a plain instance attribute, so it is exposed as a
    read-only property (taking no arguments) to keep attribute-style access
    such as ``builder.wqueue.put(...)`` working while the deprecation warning
    is emitted.

    :returns: the queue stored in ``self._wqueue``
    """
    warnings.warn(
        "%s.%s is deprecated." % (self.__class__.__name__, "wqueue"),
        RemovedInSphinx50Warning,
        stacklevel=2,  # attribute the warning to the code reading the attribute
    )
    return self._wqueue
def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None: def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
uri, docname, lineno, status, info, code = result uri, docname, lineno, status, info, code = result
@ -268,49 +283,84 @@ class CheckExternalLinksBuilder(DummyBuilder):
self.json_outfile.write('\n') self.json_outfile.write('\n')
def finish(self) -> None: def finish(self) -> None:
checker = HyperlinkAvailabilityChecker(self)
logger.info('') logger.info('')
with open(path.join(self.outdir, 'output.txt'), 'w') as self.txt_outfile,\ with open(path.join(self.outdir, 'output.txt'), 'w') as self.txt_outfile,\
open(path.join(self.outdir, 'output.json'), 'w') as self.json_outfile: open(path.join(self.outdir, 'output.json'), 'w') as self.json_outfile:
for result in checker.check(self.hyperlinks):
self.process_result(result)
if self._broken:
self.app.statuscode = 1
class HyperlinkAvailabilityChecker:
def __init__(self, builder: CheckExternalLinksBuilder) -> None:
    """Set up the checker state on top of the given linkcheck builder.

    :param builder: the linkcheck builder; its ``config`` supplies the
                    settings, and its ``_rqueue`` / ``_wqueue`` are reused
                    as this checker's result and work queues.
    """
    self.builder = builder
    self.config = builder.config
    self.rate_limits = {}  # type: Dict[str, RateLimit]
    self.workers = []  # type: List[Thread]
    # Compile the ``linkcheck_ignore`` patterns once, up front.
    self.to_ignore = [re.compile(x) for x in self.config.linkcheck_ignore]

    # ``builder.config`` is dereferenced above, so a builder is always
    # present at this point.  The former ``if builder: ... else: ...``
    # fallback that created private queues could never run and was dropped.
    self.rqueue = builder._rqueue
    self.wqueue = builder._wqueue
def invoke_threads(self) -> None:
    """Start ``linkcheck_workers`` worker threads and remember them.

    Every worker is given the builder, both queues and the rate-limit
    mapping held by this checker.
    """
    worker_count = self.config.linkcheck_workers
    for _ in range(worker_count):
        worker = HyperlinkAvailabilityCheckWorker(
            self.builder, self.rqueue, self.wqueue, self.rate_limits)
        worker.start()
        self.workers.append(worker)
def shutdown_threads(self) -> None:
    """Wait for the work queue to drain, then enqueue one stop item per worker."""
    self.wqueue.join()
    # Presumably a sentinel telling a worker to exit; the worker's handling
    # of it is not defined in this method.
    stop_item = (CHECK_IMMEDIATELY, None, None, None)
    for _ in self.workers:
        self.wqueue.put(stop_item, False)
def check(self, hyperlinks: Dict[str, Hyperlink]) -> Generator[CheckResult, None, None]:
self.invoke_threads()
total_links = 0 total_links = 0
for hyperlink in self.hyperlinks.values(): for hyperlink in hyperlinks.values():
if self.is_ignored_uri(hyperlink.uri): if self.is_ignored_uri(hyperlink.uri):
self.process_result( yield CheckResult(hyperlink.uri, hyperlink.docname, hyperlink.lineno,
CheckResult(hyperlink.uri, hyperlink.docname, hyperlink.lineno, 'ignored', '', 0)
'ignored', '', 0))
else: else:
self.wqueue.put(hyperlink, False) self.wqueue.put(hyperlink, False)
total_links += 1 total_links += 1
done = 0 done = 0
while done < total_links: while done < total_links:
self.process_result(self.rqueue.get()) yield self.rqueue.get()
done += 1 done += 1
if self._broken: self.shutdown_threads()
self.app.statuscode = 1
self.wqueue.join() def is_ignored_uri(self, uri: str) -> bool:
# Shutdown threads. return any(pat.match(uri) for pat in self.to_ignore)
for worker in self.workers:
self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
class HyperlinkAvailabilityCheckWorker(Thread): class HyperlinkAvailabilityCheckWorker(Thread):
"""A worker class for checking the availability of hyperlinks.""" """A worker class for checking the availability of hyperlinks."""
def __init__(self, builder: CheckExternalLinksBuilder) -> None: def __init__(self, builder: CheckExternalLinksBuilder, rqueue: queue.Queue,
wqueue: queue.Queue, rate_limits: Dict[str, RateLimit]) -> None:
self.config = builder.config self.config = builder.config
self.env = builder.env self.env = builder.env
self.rate_limits = builder.rate_limits self.rate_limits = rate_limits
self.rqueue = builder.rqueue self.rqueue = rqueue
self.wqueue = builder.wqueue self.wqueue = wqueue
self.anchors_ignore = [re.compile(x) self.anchors_ignore = [re.compile(x)
for x in self.config.linkcheck_anchors_ignore] for x in self.config.linkcheck_anchors_ignore]
self.auth = [(re.compile(pattern), auth_info) for pattern, auth_info self.auth = [(re.compile(pattern), auth_info) for pattern, auth_info
in self.config.linkcheck_auth] in self.config.linkcheck_auth]
self.to_ignore = [re.compile(x) for x in self.config.linkcheck_ignore]
self._good = builder._good self._good = builder._good
self._broken = builder._broken self._broken = builder._broken

View File

@ -21,7 +21,7 @@ from unittest import mock
import pytest import pytest
import requests import requests
from sphinx.builders.linkcheck import (CheckExternalLinksBuilder, from sphinx.builders.linkcheck import (CheckExternalLinksBuilder, HyperlinkAvailabilityChecker,
HyperlinkAvailabilityCheckWorker, RateLimit) HyperlinkAvailabilityCheckWorker, RateLimit)
from sphinx.util.console import strip_colors from sphinx.util.console import strip_colors
@ -536,10 +536,12 @@ class FakeResponse:
def test_limit_rate_default_sleep(app): def test_limit_rate_default_sleep(app):
checker = CheckExternalLinksBuilder(app) builder = CheckExternalLinksBuilder(app)
checker.init() builder.init()
checker = HyperlinkAvailabilityChecker(builder)
checker.rate_limits = {} checker.rate_limits = {}
worker = HyperlinkAvailabilityCheckWorker(checker) worker = HyperlinkAvailabilityCheckWorker(builder, checker.rqueue, checker.wqueue,
checker.rate_limits)
with mock.patch('time.time', return_value=0.0): with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check == 60.0 assert next_check == 60.0
@ -547,40 +549,48 @@ def test_limit_rate_default_sleep(app):
def test_limit_rate_user_max_delay(app): def test_limit_rate_user_max_delay(app):
app.config.linkcheck_rate_limit_timeout = 0.0 app.config.linkcheck_rate_limit_timeout = 0.0
checker = CheckExternalLinksBuilder(app) builder = CheckExternalLinksBuilder(app)
checker.init() builder.init()
checker = HyperlinkAvailabilityChecker(builder)
checker.rate_limits = {} checker.rate_limits = {}
worker = HyperlinkAvailabilityCheckWorker(checker) worker = HyperlinkAvailabilityCheckWorker(builder, checker.rqueue, checker.wqueue,
checker.rate_limits)
next_check = worker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check is None assert next_check is None
def test_limit_rate_doubles_previous_wait_time(app): def test_limit_rate_doubles_previous_wait_time(app):
checker = CheckExternalLinksBuilder(app) builder = CheckExternalLinksBuilder(app)
checker.init() builder.init()
checker = HyperlinkAvailabilityChecker(builder)
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)} checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(checker) worker = HyperlinkAvailabilityCheckWorker(builder, checker.rqueue, checker.wqueue,
checker.rate_limits)
with mock.patch('time.time', return_value=0.0): with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check == 120.0 assert next_check == 120.0
def test_limit_rate_clips_wait_time_to_max_time(app): def test_limit_rate_clips_wait_time_to_max_time(app):
checker = CheckExternalLinksBuilder(app)
checker.init()
app.config.linkcheck_rate_limit_timeout = 90.0 app.config.linkcheck_rate_limit_timeout = 90.0
builder = CheckExternalLinksBuilder(app)
builder.init()
checker = HyperlinkAvailabilityChecker(builder)
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)} checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(checker) worker = HyperlinkAvailabilityCheckWorker(builder, checker.rqueue, checker.wqueue,
checker.rate_limits)
with mock.patch('time.time', return_value=0.0): with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check == 90.0 assert next_check == 90.0
def test_limit_rate_bails_out_after_waiting_max_time(app): def test_limit_rate_bails_out_after_waiting_max_time(app):
checker = CheckExternalLinksBuilder(app)
checker.init()
app.config.linkcheck_rate_limit_timeout = 90.0 app.config.linkcheck_rate_limit_timeout = 90.0
builder = CheckExternalLinksBuilder(app)
builder.init()
checker = HyperlinkAvailabilityChecker(builder)
checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)} checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(checker) worker = HyperlinkAvailabilityCheckWorker(builder, checker.rqueue, checker.wqueue,
checker.rate_limits)
next_check = worker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check is None assert next_check is None