refactor: linkcheck: Separate worker feature from builder class

To reduce the complexity of the linkcheck builder, this separates
the worker feature from the builder class.
This commit is contained in:
Takeshi KOMIYA 2021-01-31 17:45:38 +09:00
parent a39b5f08e9
commit f02fb7a8cc
2 changed files with 148 additions and 100 deletions

View File

@ -12,13 +12,13 @@ import json
import queue
import re
import socket
import threading
import time
import warnings
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from threading import Thread
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, cast
from urllib.parse import unquote, urlparse
@ -129,9 +129,9 @@ class CheckExternalLinksBuilder(DummyBuilder):
self.rate_limits = {} # type: Dict[str, RateLimit]
self.wqueue = queue.PriorityQueue() # type: queue.PriorityQueue
self.rqueue = queue.Queue() # type: queue.Queue
self.workers = [] # type: List[threading.Thread]
self.workers = [] # type: List[Thread]
for i in range(self.config.linkcheck_workers):
thread = threading.Thread(target=self.check_thread, daemon=True)
thread = HyperlinkAvailabilityCheckWorker(self)
thread.start()
self.workers.append(thread)
@ -166,6 +166,134 @@ class CheckExternalLinksBuilder(DummyBuilder):
return self._redirected
def check_thread(self) -> None:
warnings.warn(
"%s.%s is deprecated." % (self.__class__.__name__, "check_thread"),
RemovedInSphinx50Warning,
stacklevel=2,
)
# do nothing.
def limit_rate(self, response: Response) -> Optional[float]:
warnings.warn(
"%s.%s is deprecated." % (self.__class__.__name__, "limit_rate"),
RemovedInSphinx50Warning,
stacklevel=2,
)
return HyperlinkAvailabilityCheckWorker(self).limit_rate(response)
def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
uri, docname, lineno, status, info, code = result
filename = self.env.doc2path(docname, None)
linkstat = dict(filename=filename, lineno=lineno,
status=status, code=code, uri=uri,
info=info)
if status == 'unchecked':
self.write_linkstat(linkstat)
return
if status == 'working' and info == 'old':
self.write_linkstat(linkstat)
return
if lineno:
logger.info('(%16s: line %4d) ', docname, lineno, nonl=True)
if status == 'ignored':
if info:
logger.info(darkgray('-ignored- ') + uri + ': ' + info)
else:
logger.info(darkgray('-ignored- ') + uri)
self.write_linkstat(linkstat)
elif status == 'local':
logger.info(darkgray('-local- ') + uri)
self.write_entry('local', docname, filename, lineno, uri)
self.write_linkstat(linkstat)
elif status == 'working':
logger.info(darkgreen('ok ') + uri + info)
self.write_linkstat(linkstat)
elif status == 'broken':
if self.app.quiet or self.app.warningiserror:
logger.warning(__('broken link: %s (%s)'), uri, info,
location=(filename, lineno))
else:
logger.info(red('broken ') + uri + red(' - ' + info))
self.write_entry('broken', docname, filename, lineno, uri + ': ' + info)
self.write_linkstat(linkstat)
elif status == 'redirected':
try:
text, color = {
301: ('permanently', purple),
302: ('with Found', purple),
303: ('with See Other', purple),
307: ('temporarily', turquoise),
308: ('permanently', purple),
}[code]
except KeyError:
text, color = ('with unknown code', purple)
linkstat['text'] = text
logger.info(color('redirect ') + uri + color(' - ' + text + ' to ' + info))
self.write_entry('redirected ' + text, docname, filename,
lineno, uri + ' to ' + info)
self.write_linkstat(linkstat)
else:
raise ValueError("Unknown status %s." % status)
def write_entry(self, what: str, docname: str, filename: str, line: int,
uri: str) -> None:
self.txt_outfile.write("%s:%s: [%s] %s\n" % (filename, line, what, uri))
def write_linkstat(self, data: dict) -> None:
self.json_outfile.write(json.dumps(data))
self.json_outfile.write('\n')
def finish(self) -> None:
logger.info('')
with open(path.join(self.outdir, 'output.txt'), 'w') as self.txt_outfile,\
open(path.join(self.outdir, 'output.json'), 'w') as self.json_outfile:
total_links = 0
for hyperlink in self.hyperlinks.values():
if self.is_ignored_uri(hyperlink.uri):
self.process_result(
CheckResult(hyperlink.uri, hyperlink.docname, hyperlink.lineno,
'ignored', '', 0))
else:
self.wqueue.put(hyperlink, False)
total_links += 1
done = 0
while done < total_links:
self.process_result(self.rqueue.get())
done += 1
if self._broken:
self.app.statuscode = 1
self.wqueue.join()
# Shutdown threads.
for worker in self.workers:
self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
class HyperlinkAvailabilityCheckWorker(Thread):
"""A worker class for checing the availability of hyperlinks."""
def __init__(self, builder: CheckExternalLinksBuilder) -> None:
self.app = builder.app
self.anchors_ignore = builder.anchors_ignore
self.auth = builder.auth
self.config = builder.config
self.env = builder.env
self.rate_limits = builder.rate_limits
self.rqueue = builder.rqueue
self.to_ignore = builder.to_ignore
self.wqueue = builder.wqueue
self._good = builder._good
self._broken = builder._broken
self._redirected = builder._redirected
super().__init__(daemon=True)
def run(self) -> None:
kwargs = {}
if self.config.linkcheck_timeout:
kwargs['timeout'] = self.config.linkcheck_timeout
@ -378,97 +506,6 @@ class CheckExternalLinksBuilder(DummyBuilder):
self.rate_limits[netloc] = RateLimit(delay, next_check)
return next_check
def process_result(self, result: CheckResult) -> None:
uri, docname, lineno, status, info, code = result
filename = self.env.doc2path(docname, None)
linkstat = dict(filename=filename, lineno=lineno,
status=status, code=code, uri=uri,
info=info)
if status == 'unchecked':
self.write_linkstat(linkstat)
return
if status == 'working' and info == 'old':
self.write_linkstat(linkstat)
return
if lineno:
logger.info('(%16s: line %4d) ', docname, lineno, nonl=True)
if status == 'ignored':
if info:
logger.info(darkgray('-ignored- ') + uri + ': ' + info)
else:
logger.info(darkgray('-ignored- ') + uri)
self.write_linkstat(linkstat)
elif status == 'local':
logger.info(darkgray('-local- ') + uri)
self.write_entry('local', docname, filename, lineno, uri)
self.write_linkstat(linkstat)
elif status == 'working':
logger.info(darkgreen('ok ') + uri + info)
self.write_linkstat(linkstat)
elif status == 'broken':
if self.app.quiet or self.app.warningiserror:
logger.warning(__('broken link: %s (%s)'), uri, info,
location=(filename, lineno))
else:
logger.info(red('broken ') + uri + red(' - ' + info))
self.write_entry('broken', docname, filename, lineno, uri + ': ' + info)
self.write_linkstat(linkstat)
elif status == 'redirected':
try:
text, color = {
301: ('permanently', purple),
302: ('with Found', purple),
303: ('with See Other', purple),
307: ('temporarily', turquoise),
308: ('permanently', purple),
}[code]
except KeyError:
text, color = ('with unknown code', purple)
linkstat['text'] = text
logger.info(color('redirect ') + uri + color(' - ' + text + ' to ' + info))
self.write_entry('redirected ' + text, docname, filename,
lineno, uri + ' to ' + info)
self.write_linkstat(linkstat)
else:
raise ValueError("Unknown status %s." % status)
def write_entry(self, what: str, docname: str, filename: str, line: int,
uri: str) -> None:
self.txt_outfile.write("%s:%s: [%s] %s\n" % (filename, line, what, uri))
def write_linkstat(self, data: dict) -> None:
self.json_outfile.write(json.dumps(data))
self.json_outfile.write('\n')
def finish(self) -> None:
logger.info('')
with open(path.join(self.outdir, 'output.txt'), 'w') as self.txt_outfile,\
open(path.join(self.outdir, 'output.json'), 'w') as self.json_outfile:
total_links = 0
for hyperlink in self.hyperlinks.values():
if self.is_ignored_uri(hyperlink.uri):
self.process_result(
CheckResult(hyperlink.uri, hyperlink.docname, hyperlink.lineno,
'ignored', '', 0))
else:
self.wqueue.put(hyperlink, False)
total_links += 1
done = 0
while done < total_links:
self.process_result(self.rqueue.get())
done += 1
if self._broken:
self.app.statuscode = 1
self.wqueue.join()
# Shutdown threads.
for worker in self.workers:
self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
class HyperlinkCollector(SphinxPostTransform):
builders = ('linkcheck',)

View File

@ -21,7 +21,8 @@ from unittest import mock
import pytest
import requests
from sphinx.builders.linkcheck import CheckExternalLinksBuilder, RateLimit
from sphinx.builders.linkcheck import (CheckExternalLinksBuilder,
HyperlinkAvailabilityCheckWorker, RateLimit)
from sphinx.util.console import strip_colors
from .utils import CERT_FILE, http_server, https_server
@ -536,40 +537,50 @@ class FakeResponse:
def test_limit_rate_default_sleep(app):
checker = CheckExternalLinksBuilder(app)
checker.init()
checker.rate_limits = {}
worker = HyperlinkAvailabilityCheckWorker(checker)
with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse())
assert next_check == 60.0
def test_limit_rate_user_max_delay(app):
app.config.linkcheck_rate_limit_timeout = 0.0
checker = CheckExternalLinksBuilder(app)
checker.init()
checker.rate_limits = {}
next_check = checker.limit_rate(FakeResponse())
worker = HyperlinkAvailabilityCheckWorker(checker)
next_check = worker.limit_rate(FakeResponse())
assert next_check is None
def test_limit_rate_doubles_previous_wait_time(app):
checker = CheckExternalLinksBuilder(app)
checker.init()
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(checker)
with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse())
assert next_check == 120.0
def test_limit_rate_clips_wait_time_to_max_time(app):
checker = CheckExternalLinksBuilder(app)
checker.init()
app.config.linkcheck_rate_limit_timeout = 90.0
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(checker)
with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse())
assert next_check == 90.0
def test_limit_rate_bails_out_after_waiting_max_time(app):
checker = CheckExternalLinksBuilder(app)
checker.init()
app.config.linkcheck_rate_limit_timeout = 90.0
checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
next_check = checker.limit_rate(FakeResponse())
worker = HyperlinkAvailabilityCheckWorker(checker)
next_check = worker.limit_rate(FakeResponse())
assert next_check is None