refactor: linkcheck: Separate worker feature from builder class

To reduce the complexity of the linkcheck builder, this separates
the worker feature from the builder class.
This commit is contained in:
Takeshi KOMIYA 2021-01-31 17:45:38 +09:00
parent a39b5f08e9
commit f02fb7a8cc
2 changed files with 148 additions and 100 deletions

View File

@ -12,13 +12,13 @@ import json
import queue import queue
import re import re
import socket import socket
import threading
import time import time
import warnings import warnings
from datetime import datetime, timezone from datetime import datetime, timezone
from email.utils import parsedate_to_datetime from email.utils import parsedate_to_datetime
from html.parser import HTMLParser from html.parser import HTMLParser
from os import path from os import path
from threading import Thread
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, cast from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, cast
from urllib.parse import unquote, urlparse from urllib.parse import unquote, urlparse
@ -129,9 +129,9 @@ class CheckExternalLinksBuilder(DummyBuilder):
self.rate_limits = {} # type: Dict[str, RateLimit] self.rate_limits = {} # type: Dict[str, RateLimit]
self.wqueue = queue.PriorityQueue() # type: queue.PriorityQueue self.wqueue = queue.PriorityQueue() # type: queue.PriorityQueue
self.rqueue = queue.Queue() # type: queue.Queue self.rqueue = queue.Queue() # type: queue.Queue
self.workers = [] # type: List[threading.Thread] self.workers = [] # type: List[Thread]
for i in range(self.config.linkcheck_workers): for i in range(self.config.linkcheck_workers):
thread = threading.Thread(target=self.check_thread, daemon=True) thread = HyperlinkAvailabilityCheckWorker(self)
thread.start() thread.start()
self.workers.append(thread) self.workers.append(thread)
@ -166,6 +166,134 @@ class CheckExternalLinksBuilder(DummyBuilder):
return self._redirected return self._redirected
def check_thread(self) -> None:
    """Deprecated no-op.

    Emits a ``RemovedInSphinx50Warning`` attributed to the caller and
    performs no other work.
    """
    message = "%s.%s is deprecated." % (self.__class__.__name__, "check_thread")
    # stacklevel=2 points the warning at the code that called this method.
    warnings.warn(message, RemovedInSphinx50Warning, stacklevel=2)
def limit_rate(self, response: Response) -> Optional[float]:
    """Deprecated shim that delegates to the worker implementation.

    Emits a ``RemovedInSphinx50Warning`` attributed to the caller, then
    builds a ``HyperlinkAvailabilityCheckWorker`` from this builder and
    returns whatever its ``limit_rate(response)`` returns.
    """
    message = "%s.%s is deprecated." % (self.__class__.__name__, "limit_rate")
    warnings.warn(message, RemovedInSphinx50Warning, stacklevel=2)

    # The worker is only constructed here, never started.
    worker = HyperlinkAvailabilityCheckWorker(self)
    return worker.limit_rate(response)
def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
    """Report a single link-check result.

    *result* unpacks to ``(uri, docname, lineno, status, info, code)``.
    Every handled status is recorded through ``write_linkstat``; the
    ``local``, ``broken`` and ``redirected`` statuses are additionally
    recorded through ``write_entry``.

    :raises ValueError: if *status* is not one of the handled values.
    """
    uri, docname, lineno, status, info, code = result

    filename = self.env.doc2path(docname, None)
    linkstat = {
        'filename': filename,
        'lineno': lineno,
        'status': status,
        'code': code,
        'uri': uri,
        'info': info,
    }

    # These two cases are recorded without any log output.
    if status == 'unchecked' or (status == 'working' and info == 'old'):
        self.write_linkstat(linkstat)
        return

    if lineno:
        logger.info('(%16s: line %4d) ', docname, lineno, nonl=True)

    if status == 'ignored':
        message = darkgray('-ignored- ') + uri
        if info:
            message += ': ' + info
        logger.info(message)
    elif status == 'local':
        logger.info(darkgray('-local- ') + uri)
        self.write_entry('local', docname, filename, lineno, uri)
    elif status == 'working':
        logger.info(darkgreen('ok ') + uri + info)
    elif status == 'broken':
        if self.app.quiet or self.app.warningiserror:
            logger.warning(__('broken link: %s (%s)'), uri, info,
                           location=(filename, lineno))
        else:
            logger.info(red('broken ') + uri + red(' - ' + info))
        self.write_entry('broken', docname, filename, lineno, uri + ': ' + info)
    elif status == 'redirected':
        # Known redirect codes map to (description, colour function).
        known_redirects = {
            301: ('permanently', purple),
            302: ('with Found', purple),
            303: ('with See Other', purple),
            307: ('temporarily', turquoise),
            308: ('permanently', purple),
        }
        text, color = known_redirects.get(code, ('with unknown code', purple))
        linkstat['text'] = text
        logger.info(color('redirect ') + uri + color(' - ' + text + ' to ' + info))
        self.write_entry('redirected ' + text, docname, filename,
                         lineno, uri + ' to ' + info)
    else:
        raise ValueError("Unknown status %s." % status)

    # Reached by every handled status above; unknown statuses raised already.
    self.write_linkstat(linkstat)
def write_entry(self, what: str, docname: str, filename: str, line: int,
                uri: str) -> None:
    """Append one ``filename:line: [what] uri`` line to ``self.txt_outfile``.

    *docname* is accepted but not used by this method.
    """
    record = "%s:%s: [%s] %s\n" % (filename, line, what, uri)
    self.txt_outfile.write(record)
def write_linkstat(self, data: dict) -> None:
    """Write *data* to ``self.json_outfile`` as one JSON document per line."""
    serialized = json.dumps(data)
    self.json_outfile.write(serialized + '\n')
def finish(self) -> None:
    """Run the collected hyperlinks through the workers and report results.

    Opens ``output.txt`` and ``output.json`` under ``self.outdir``, reports
    ignored URIs immediately, queues the remaining hyperlinks on
    ``self.wqueue`` and processes one item from ``self.rqueue`` per queued
    hyperlink.  Sets ``self.app.statuscode`` to 1 when ``self._broken`` is
    truthy, then waits for the work queue and queues one shutdown item per
    entry in ``self.workers``.
    """
    logger.info('')

    txt_path = path.join(self.outdir, 'output.txt')
    json_path = path.join(self.outdir, 'output.json')
    with open(txt_path, 'w') as self.txt_outfile, \
            open(json_path, 'w') as self.json_outfile:
        total_links = 0
        for hyperlink in self.hyperlinks.values():
            if not self.is_ignored_uri(hyperlink.uri):
                self.wqueue.put(hyperlink, False)
                total_links += 1
            else:
                ignored = CheckResult(hyperlink.uri, hyperlink.docname,
                                      hyperlink.lineno, 'ignored', '', 0)
                self.process_result(ignored)

        # Exactly one result is consumed for every hyperlink queued above.
        for _ in range(total_links):
            self.process_result(self.rqueue.get())

    if self._broken:
        self.app.statuscode = 1

    self.wqueue.join()
    # Shutdown threads.
    for _ in self.workers:
        self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
class HyperlinkAvailabilityCheckWorker(Thread):
"""A worker class for checking the availability of hyperlinks."""
def __init__(self, builder: CheckExternalLinksBuilder) -> None:
    """Create a daemon worker thread bound to *builder*'s shared state.

    The worker keeps references to the builder's application, config,
    environment, ignore lists, auth settings, rate-limit table, work and
    result queues, and the ``_good`` / ``_broken`` / ``_redirected``
    collections.  These are references, not copies.

    :param builder: the linkcheck builder whose attributes are shared.
    """
    # The threading docs require the base constructor to run before anything
    # else is done to the thread object, so initialise Thread first.
    super().__init__(daemon=True)

    self.app = builder.app
    self.anchors_ignore = builder.anchors_ignore
    self.auth = builder.auth
    self.config = builder.config
    self.env = builder.env
    self.rate_limits = builder.rate_limits
    self.rqueue = builder.rqueue
    self.to_ignore = builder.to_ignore
    self.wqueue = builder.wqueue

    self._good = builder._good
    self._broken = builder._broken
    self._redirected = builder._redirected
def run(self) -> None:
kwargs = {} kwargs = {}
if self.config.linkcheck_timeout: if self.config.linkcheck_timeout:
kwargs['timeout'] = self.config.linkcheck_timeout kwargs['timeout'] = self.config.linkcheck_timeout
@ -378,97 +506,6 @@ class CheckExternalLinksBuilder(DummyBuilder):
self.rate_limits[netloc] = RateLimit(delay, next_check) self.rate_limits[netloc] = RateLimit(delay, next_check)
return next_check return next_check
def process_result(self, result: CheckResult) -> None:
    """Report a single link-check result.

    *result* unpacks to ``(uri, docname, lineno, status, info, code)``.
    Every handled status is recorded through ``write_linkstat``; the
    ``local``, ``broken`` and ``redirected`` statuses are additionally
    recorded through ``write_entry``.

    :raises ValueError: if *status* is not one of the handled values.
    """
    uri, docname, lineno, status, info, code = result

    filename = self.env.doc2path(docname, None)
    linkstat = {
        'filename': filename,
        'lineno': lineno,
        'status': status,
        'code': code,
        'uri': uri,
        'info': info,
    }

    # These two cases are recorded without any log output.
    if status == 'unchecked' or (status == 'working' and info == 'old'):
        self.write_linkstat(linkstat)
        return

    if lineno:
        logger.info('(%16s: line %4d) ', docname, lineno, nonl=True)

    if status == 'ignored':
        message = darkgray('-ignored- ') + uri
        if info:
            message += ': ' + info
        logger.info(message)
    elif status == 'local':
        logger.info(darkgray('-local- ') + uri)
        self.write_entry('local', docname, filename, lineno, uri)
    elif status == 'working':
        logger.info(darkgreen('ok ') + uri + info)
    elif status == 'broken':
        if self.app.quiet or self.app.warningiserror:
            logger.warning(__('broken link: %s (%s)'), uri, info,
                           location=(filename, lineno))
        else:
            logger.info(red('broken ') + uri + red(' - ' + info))
        self.write_entry('broken', docname, filename, lineno, uri + ': ' + info)
    elif status == 'redirected':
        # Known redirect codes map to (description, colour function).
        known_redirects = {
            301: ('permanently', purple),
            302: ('with Found', purple),
            303: ('with See Other', purple),
            307: ('temporarily', turquoise),
            308: ('permanently', purple),
        }
        text, color = known_redirects.get(code, ('with unknown code', purple))
        linkstat['text'] = text
        logger.info(color('redirect ') + uri + color(' - ' + text + ' to ' + info))
        self.write_entry('redirected ' + text, docname, filename,
                         lineno, uri + ' to ' + info)
    else:
        raise ValueError("Unknown status %s." % status)

    # Reached by every handled status above; unknown statuses raised already.
    self.write_linkstat(linkstat)
def write_entry(self, what: str, docname: str, filename: str, line: int,
                uri: str) -> None:
    """Append one ``filename:line: [what] uri`` line to ``self.txt_outfile``.

    *docname* is accepted but not used by this method.
    """
    record = "%s:%s: [%s] %s\n" % (filename, line, what, uri)
    self.txt_outfile.write(record)
def write_linkstat(self, data: dict) -> None:
    """Write *data* to ``self.json_outfile`` as one JSON document per line."""
    serialized = json.dumps(data)
    self.json_outfile.write(serialized + '\n')
def finish(self) -> None:
    """Run the collected hyperlinks through the workers and report results.

    Opens ``output.txt`` and ``output.json`` under ``self.outdir``, reports
    ignored URIs immediately, queues the remaining hyperlinks on
    ``self.wqueue`` and processes one item from ``self.rqueue`` per queued
    hyperlink.  Sets ``self.app.statuscode`` to 1 when ``self._broken`` is
    truthy, then waits for the work queue and queues one shutdown item per
    entry in ``self.workers``.
    """
    logger.info('')

    txt_path = path.join(self.outdir, 'output.txt')
    json_path = path.join(self.outdir, 'output.json')
    with open(txt_path, 'w') as self.txt_outfile, \
            open(json_path, 'w') as self.json_outfile:
        total_links = 0
        for hyperlink in self.hyperlinks.values():
            if not self.is_ignored_uri(hyperlink.uri):
                self.wqueue.put(hyperlink, False)
                total_links += 1
            else:
                ignored = CheckResult(hyperlink.uri, hyperlink.docname,
                                      hyperlink.lineno, 'ignored', '', 0)
                self.process_result(ignored)

        # Exactly one result is consumed for every hyperlink queued above.
        for _ in range(total_links):
            self.process_result(self.rqueue.get())

    if self._broken:
        self.app.statuscode = 1

    self.wqueue.join()
    # Shutdown threads.
    for _ in self.workers:
        self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
class HyperlinkCollector(SphinxPostTransform): class HyperlinkCollector(SphinxPostTransform):
builders = ('linkcheck',) builders = ('linkcheck',)

View File

@ -21,7 +21,8 @@ from unittest import mock
import pytest import pytest
import requests import requests
from sphinx.builders.linkcheck import CheckExternalLinksBuilder, RateLimit from sphinx.builders.linkcheck import (CheckExternalLinksBuilder,
HyperlinkAvailabilityCheckWorker, RateLimit)
from sphinx.util.console import strip_colors from sphinx.util.console import strip_colors
from .utils import CERT_FILE, http_server, https_server from .utils import CERT_FILE, http_server, https_server
@ -536,40 +537,50 @@ class FakeResponse:
def test_limit_rate_default_sleep(app): def test_limit_rate_default_sleep(app):
checker = CheckExternalLinksBuilder(app) checker = CheckExternalLinksBuilder(app)
checker.init()
checker.rate_limits = {} checker.rate_limits = {}
worker = HyperlinkAvailabilityCheckWorker(checker)
with mock.patch('time.time', return_value=0.0): with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check == 60.0 assert next_check == 60.0
def test_limit_rate_user_max_delay(app): def test_limit_rate_user_max_delay(app):
app.config.linkcheck_rate_limit_timeout = 0.0 app.config.linkcheck_rate_limit_timeout = 0.0
checker = CheckExternalLinksBuilder(app) checker = CheckExternalLinksBuilder(app)
checker.init()
checker.rate_limits = {} checker.rate_limits = {}
next_check = checker.limit_rate(FakeResponse()) worker = HyperlinkAvailabilityCheckWorker(checker)
next_check = worker.limit_rate(FakeResponse())
assert next_check is None assert next_check is None
def test_limit_rate_doubles_previous_wait_time(app): def test_limit_rate_doubles_previous_wait_time(app):
checker = CheckExternalLinksBuilder(app) checker = CheckExternalLinksBuilder(app)
checker.init()
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)} checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(checker)
with mock.patch('time.time', return_value=0.0): with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check == 120.0 assert next_check == 120.0
def test_limit_rate_clips_wait_time_to_max_time(app): def test_limit_rate_clips_wait_time_to_max_time(app):
checker = CheckExternalLinksBuilder(app) checker = CheckExternalLinksBuilder(app)
checker.init()
app.config.linkcheck_rate_limit_timeout = 90.0 app.config.linkcheck_rate_limit_timeout = 90.0
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)} checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(checker)
with mock.patch('time.time', return_value=0.0): with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse()) next_check = worker.limit_rate(FakeResponse())
assert next_check == 90.0 assert next_check == 90.0
def test_limit_rate_bails_out_after_waiting_max_time(app): def test_limit_rate_bails_out_after_waiting_max_time(app):
checker = CheckExternalLinksBuilder(app) checker = CheckExternalLinksBuilder(app)
checker.init()
app.config.linkcheck_rate_limit_timeout = 90.0 app.config.linkcheck_rate_limit_timeout = 90.0
checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)} checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
next_check = checker.limit_rate(FakeResponse()) worker = HyperlinkAvailabilityCheckWorker(checker)
next_check = worker.limit_rate(FakeResponse())
assert next_check is None assert next_check is None