From 725f74f5ebfa9abb1d1540bf441823cc4973ef59 Mon Sep 17 00:00:00 2001 From: Takeshi KOMIYA Date: Sun, 7 Feb 2021 00:25:22 +0900 Subject: [PATCH] refactor: linkcheck: Remove next_check from Hyperlink object To separate hyperlink itself and checking (intermediate) state, this removes `next_check` attribute from Hyperlink object and add a new named-tuple `CheckRequest`. It was rejected idea on implementing rate-limiting first (see #8467). But it's better way to separate large builder module into small components and make whole of linkcheck builder simple. After this change, `Hyperlink` object represents a hyperlink on the document. It has a URI and location info (docname and lineno). --- sphinx/builders/linkcheck.py | 39 +++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 6b3dbbe12..bd81bcee6 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -19,7 +19,8 @@ from email.utils import parsedate_to_datetime from html.parser import HTMLParser from os import path from threading import Thread -from typing import Any, Dict, Generator, List, NamedTuple, Optional, Pattern, Set, Tuple, cast +from typing import (Any, Dict, Generator, List, NamedTuple, Optional, Pattern, Set, Tuple, + Union, cast) from urllib.parse import unquote, urlparse from docutils import nodes @@ -42,10 +43,11 @@ logger = logging.getLogger(__name__) uri_re = re.compile('([a-z]+:)?//') # matches to foo:// and // (a protocol relative URL) -Hyperlink = NamedTuple('Hyperlink', (('next_check', float), - ('uri', Optional[str]), - ('docname', Optional[str]), +Hyperlink = NamedTuple('Hyperlink', (('uri', str), + ('docname', str), ('lineno', Optional[int]))) +CheckRequest = NamedTuple('CheckRequest', (('next_check', float), + ('hyperlink', Optional[Hyperlink]))) CheckResult = NamedTuple('CheckResult', (('uri', str), ('docname', str), ('lineno', int), @@ -54,6 +56,9 @@ CheckResult = NamedTuple('CheckResult', (('uri', str), ('code', int))) RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float))) +# Tuple is old styled CheckRequest +CheckRequestType = Union[CheckRequest, Tuple[float, str, str, int]] + DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8', } @@ -123,7 +128,7 @@ class CheckExternalLinksBuilder(DummyBuilder): socket.setdefaulttimeout(5.0) # create queues and worker threads - self._wqueue = queue.PriorityQueue() # type: queue.PriorityQueue + self._wqueue = queue.PriorityQueue() # type: queue.PriorityQueue[CheckRequestType] self._rqueue = queue.Queue() # type: queue.Queue @property @@ -331,7 +336,7 @@ class HyperlinkAvailabilityChecker: def shutdown_threads(self) -> None: self.wqueue.join() for worker in self.workers: - self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False) + self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, None), False) def check(self, hyperlinks: Dict[str, Hyperlink]) -> Generator[CheckResult, None, None]: self.invoke_threads() @@ -342,7 +347,7 @@ class HyperlinkAvailabilityChecker: yield CheckResult(hyperlink.uri, hyperlink.docname, hyperlink.lineno, 'ignored', '', 0) else: - self.wqueue.put(hyperlink, False) + self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False) total_links += 1 done = 0 @@ -471,7 +476,7 @@ class HyperlinkAvailabilityCheckWorker(Thread): elif err.response.status_code == 429: next_check = self.limit_rate(err.response) if next_check is not None: - self.wqueue.put((next_check, uri, docname, lineno), False) + self.wqueue.put(CheckRequest(next_check, hyperlink), False) return 'rate-limited', '', 0 return 'broken', str(err), 0 elif err.response.status_code == 503: @@ -538,7 +543,17 @@ class HyperlinkAvailabilityCheckWorker(Thread): return (status, info, code) while True: - next_check, uri, docname, lineno = self.wqueue.get() + check_request = self.wqueue.get() + try: + next_check, hyperlink = check_request + if hyperlink is None: + break + + uri, docname, lineno = hyperlink + except ValueError: + # old styled check_request (will be deprecated in Sphinx-5.0) + next_check, uri, docname, lineno = check_request + if uri is None: break netloc = urlparse(uri).netloc @@ -554,7 +569,7 @@ class HyperlinkAvailabilityCheckWorker(Thread): # Sleep before putting message back in the queue to avoid # waking up other threads. time.sleep(QUEUE_POLL_SECS) - self.wqueue.put((next_check, uri, docname, lineno), False) + self.wqueue.put(CheckRequest(next_check, hyperlink), False) self.wqueue.task_done() continue status, info, code = check(docname) @@ -617,7 +632,7 @@ class HyperlinkCollector(SphinxPostTransform): continue uri = refnode['refuri'] lineno = get_node_line(refnode) - uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno) + uri_info = Hyperlink(uri, self.env.docname, lineno) if uri not in hyperlinks: hyperlinks[uri] = uri_info @@ -626,7 +641,7 @@ class HyperlinkCollector(SphinxPostTransform): uri = imgnode['candidates'].get('?') if uri and '://' in uri: lineno = get_node_line(imgnode) - uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno) + uri_info = Hyperlink(uri, self.env.docname, lineno) if uri not in hyperlinks: hyperlinks[uri] = uri_info