mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Fix #6629: linkcheck: Handle rate-limiting
Follow the Retry-After header if present, otherwise use an exponential back-off.
This commit is contained in:
parent
c848d4dd37
commit
6b90a63f08
2
CHANGES
2
CHANGES
@ -39,6 +39,8 @@ Features added
|
||||
* #6914: Add a new event :event:`warn-missing-reference` to custom warning
|
||||
messages when failed to resolve a cross-reference
|
||||
* #6914: Emit a detailed warning when failed to resolve a ``:ref:`` reference
|
||||
* #6629: linkcheck: The builder now handles rate limits. See
|
||||
:confval:`linkcheck_retry_on_rate_limit` for details.
|
||||
|
||||
Bugs fixed
|
||||
----------
|
||||
|
@ -110,7 +110,10 @@ texinfo_documents = [
|
||||
1),
|
||||
]
|
||||
|
||||
intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}
|
||||
intersphinx_mapping = {
|
||||
'python': ('https://docs.python.org/3/', None),
|
||||
'requests': ('https://requests.readthedocs.io/en/master', None),
|
||||
}
|
||||
|
||||
# Sphinx document translation with sphinx gettext feature uses these settings:
|
||||
locale_dirs = ['locale/']
|
||||
|
@ -442,6 +442,10 @@ name is ``rinoh``. Refer to the `rinohtype manual`_ for details.
|
||||
|
||||
Since Sphinx-1.5, the linkcheck builder comes to use requests module.
|
||||
|
||||
.. versionchanged:: 3.4
|
||||
|
||||
The linkcheck builder retries links when servers apply rate limits.
|
||||
|
||||
.. module:: sphinx.builders.xml
|
||||
.. class:: XMLBuilder
|
||||
|
||||
|
@ -2525,6 +2525,23 @@ Options for the linkcheck builder
|
||||
|
||||
.. versionadded:: 2.3
|
||||
|
||||
.. confval:: linkcheck_rate_limit_timeout
|
||||
|
||||
The ``linkcheck`` builder may issue a large number of requests to the same
|
||||
site over a short period of time. This setting controls the builder behavior
|
||||
when servers indicate that requests are rate-limited.
|
||||
|
||||
If a server indicates when to retry (using the `Retry-After`_ header),
|
||||
``linkcheck`` always follows the server indication.
|
||||
|
||||
Otherwise, ``linkcheck`` waits for a minute before to retry and keeps
|
||||
doubling the wait time between attempts until it succeeds or exceeds the
|
||||
``linkcheck_rate_limit_timeout``. By default, the timeout is 5 minutes.
|
||||
|
||||
.. _Retry-After: https://tools.ietf.org/html/rfc7231#section-7.1.3
|
||||
|
||||
.. versionadded:: 3.4
|
||||
|
||||
|
||||
Options for the XML builder
|
||||
---------------------------
|
||||
|
@ -13,13 +13,17 @@ import queue
|
||||
import re
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
from html.parser import HTMLParser
|
||||
from os import path
|
||||
from typing import Any, Dict, List, Set, Tuple
|
||||
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
|
||||
from urllib.parse import unquote, urlparse
|
||||
|
||||
from docutils import nodes
|
||||
from docutils.nodes import Node
|
||||
from requests import Response
|
||||
from requests.exceptions import HTTPError, TooManyRedirects
|
||||
|
||||
from sphinx.application import Sphinx
|
||||
@ -33,10 +37,14 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
uri_re = re.compile('([a-z]+:)?//') # matches to foo:// and // (a protocol relative URL)
|
||||
|
||||
RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))
|
||||
|
||||
DEFAULT_REQUEST_HEADERS = {
|
||||
'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
|
||||
}
|
||||
CHECK_IMMEDIATELY = 0
|
||||
QUEUE_POLL_SECS = 1
|
||||
DEFAULT_DELAY = 60.0
|
||||
|
||||
|
||||
class AnchorCheckParser(HTMLParser):
|
||||
@ -98,7 +106,8 @@ class CheckExternalLinksBuilder(Builder):
|
||||
open(path.join(self.outdir, 'output.json'), 'w').close()
|
||||
|
||||
# create queues and worker threads
|
||||
self.wqueue = queue.Queue() # type: queue.Queue
|
||||
self.rate_limits = {} # type: Dict[str, RateLimit]
|
||||
self.wqueue = queue.PriorityQueue() # type: queue.PriorityQueue
|
||||
self.rqueue = queue.Queue() # type: queue.Queue
|
||||
self.workers = [] # type: List[threading.Thread]
|
||||
for i in range(self.app.config.linkcheck_workers):
|
||||
@ -172,16 +181,25 @@ class CheckExternalLinksBuilder(Builder):
|
||||
config=self.app.config, auth=auth_info,
|
||||
**kwargs)
|
||||
response.raise_for_status()
|
||||
except (HTTPError, TooManyRedirects):
|
||||
except (HTTPError, TooManyRedirects) as err:
|
||||
if isinstance(err, HTTPError) and err.response.status_code == 429:
|
||||
raise
|
||||
# retry with GET request if that fails, some servers
|
||||
# don't like HEAD requests.
|
||||
response = requests.get(req_url, stream=True, config=self.app.config,
|
||||
response = requests.get(req_url, stream=True,
|
||||
config=self.app.config,
|
||||
auth=auth_info, **kwargs)
|
||||
response.raise_for_status()
|
||||
except HTTPError as err:
|
||||
if err.response.status_code == 401:
|
||||
# We'll take "Unauthorized" as working.
|
||||
return 'working', ' - unauthorized', 0
|
||||
elif err.response.status_code == 429:
|
||||
next_check = self.limit_rate(err.response)
|
||||
if next_check is not None:
|
||||
self.wqueue.put((next_check, uri, docname, lineno), False)
|
||||
return 'rate-limited', '', 0
|
||||
return 'broken', str(err), 0
|
||||
elif err.response.status_code == 503:
|
||||
# We'll take "Service Unavailable" as ignored.
|
||||
return 'ignored', str(err), 0
|
||||
@ -189,6 +207,12 @@ class CheckExternalLinksBuilder(Builder):
|
||||
return 'broken', str(err), 0
|
||||
except Exception as err:
|
||||
return 'broken', str(err), 0
|
||||
else:
|
||||
netloc = urlparse(req_url).netloc
|
||||
try:
|
||||
del self.rate_limits[netloc]
|
||||
except KeyError:
|
||||
pass
|
||||
if response.url.rstrip('/') == req_url.rstrip('/'):
|
||||
return 'working', '', 0
|
||||
else:
|
||||
@ -247,11 +271,69 @@ class CheckExternalLinksBuilder(Builder):
|
||||
return (status, info, code)
|
||||
|
||||
while True:
|
||||
uri, docname, lineno = self.wqueue.get()
|
||||
next_check, uri, docname, lineno = self.wqueue.get()
|
||||
if uri is None:
|
||||
break
|
||||
netloc = urlparse(uri).netloc
|
||||
try:
|
||||
# Refresh rate limit.
|
||||
# When there are many links in the queue, workers are all stuck waiting
|
||||
# for responses, but the builder keeps queuing. Links in the queue may
|
||||
# have been queued before rate limits were discovered.
|
||||
next_check = self.rate_limits[netloc].next_check
|
||||
except KeyError:
|
||||
pass
|
||||
if next_check > time.time():
|
||||
# Sleep before putting message back in the queue to avoid
|
||||
# waking up other threads.
|
||||
time.sleep(QUEUE_POLL_SECS)
|
||||
self.wqueue.put((next_check, uri, docname, lineno), False)
|
||||
self.wqueue.task_done()
|
||||
continue
|
||||
status, info, code = check(docname)
|
||||
self.rqueue.put((uri, docname, lineno, status, info, code))
|
||||
if status == 'rate-limited':
|
||||
logger.info(darkgray('-rate limited- ') + uri + darkgray(' | sleeping...'))
|
||||
else:
|
||||
self.rqueue.put((uri, docname, lineno, status, info, code))
|
||||
self.wqueue.task_done()
|
||||
|
||||
def limit_rate(self, response: Response) -> Optional[float]:
|
||||
next_check = None
|
||||
retry_after = response.headers.get("Retry-After")
|
||||
if retry_after:
|
||||
try:
|
||||
# Integer: time to wait before next attempt.
|
||||
delay = float(retry_after)
|
||||
except ValueError:
|
||||
try:
|
||||
# An HTTP-date: time of next attempt.
|
||||
until = parsedate_to_datetime(retry_after)
|
||||
except (TypeError, ValueError):
|
||||
# TypeError: Invalid date format.
|
||||
# ValueError: Invalid date, e.g. Oct 52th.
|
||||
pass
|
||||
else:
|
||||
next_check = datetime.timestamp(until)
|
||||
delay = (until - datetime.now(timezone.utc)).total_seconds()
|
||||
else:
|
||||
next_check = time.time() + delay
|
||||
netloc = urlparse(response.url).netloc
|
||||
if next_check is None:
|
||||
max_delay = self.app.config.linkcheck_rate_limit_timeout
|
||||
try:
|
||||
rate_limit = self.rate_limits[netloc]
|
||||
except KeyError:
|
||||
delay = DEFAULT_DELAY
|
||||
else:
|
||||
last_wait_time = rate_limit.delay
|
||||
delay = 2.0 * last_wait_time
|
||||
if delay > max_delay and last_wait_time < max_delay:
|
||||
delay = max_delay
|
||||
if delay > max_delay:
|
||||
return None
|
||||
next_check = time.time() + delay
|
||||
self.rate_limits[netloc] = RateLimit(delay, next_check)
|
||||
return next_check
|
||||
|
||||
def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
|
||||
uri, docname, lineno, status, info, code = result
|
||||
@ -325,7 +407,8 @@ class CheckExternalLinksBuilder(Builder):
|
||||
continue
|
||||
uri = refnode['refuri']
|
||||
lineno = get_node_line(refnode)
|
||||
self.wqueue.put((uri, docname, lineno), False)
|
||||
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
|
||||
self.wqueue.put(uri_info, False)
|
||||
n += 1
|
||||
|
||||
# image nodes
|
||||
@ -333,7 +416,8 @@ class CheckExternalLinksBuilder(Builder):
|
||||
uri = imgnode['candidates'].get('?')
|
||||
if uri and '://' in uri:
|
||||
lineno = get_node_line(imgnode)
|
||||
self.wqueue.put((uri, docname, lineno), False)
|
||||
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
|
||||
self.wqueue.put(uri_info, False)
|
||||
n += 1
|
||||
|
||||
done = 0
|
||||
@ -355,8 +439,10 @@ class CheckExternalLinksBuilder(Builder):
|
||||
output.write('\n')
|
||||
|
||||
def finish(self) -> None:
|
||||
self.wqueue.join()
|
||||
# Shutdown threads.
|
||||
for worker in self.workers:
|
||||
self.wqueue.put((None, None, None), False)
|
||||
self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
|
||||
|
||||
|
||||
def setup(app: Sphinx) -> Dict[str, Any]:
|
||||
@ -372,6 +458,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
|
||||
# Anchors starting with ! are ignored since they are
|
||||
# commonly used for dynamic pages
|
||||
app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
|
||||
app.add_config_value('linkcheck_rate_limit_timeout', 300.0, None)
|
||||
|
||||
return {
|
||||
'version': 'builtin',
|
||||
|
@ -10,13 +10,24 @@
|
||||
|
||||
import http.server
|
||||
import json
|
||||
import re
|
||||
import textwrap
|
||||
import time
|
||||
import wsgiref.handlers
|
||||
from datetime import datetime
|
||||
from typing import Dict
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from sphinx.builders.linkcheck import CheckExternalLinksBuilder, RateLimit
|
||||
from sphinx.util.console import strip_colors
|
||||
|
||||
from .utils import CERT_FILE, http_server, https_server, modify_env
|
||||
|
||||
ts_re = re.compile(r".*\[(?P<ts>.*)\].*")
|
||||
|
||||
|
||||
@pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True)
|
||||
def test_defaults(app):
|
||||
@ -410,3 +421,154 @@ def test_TooManyRedirects_on_HEAD(app):
|
||||
"uri": "http://localhost:7777/",
|
||||
"info": "",
|
||||
}
|
||||
|
||||
|
||||
def make_retry_after_handler(responses):
|
||||
class RetryAfterHandler(http.server.BaseHTTPRequestHandler):
|
||||
def do_HEAD(self):
|
||||
status, retry_after = responses.pop(0)
|
||||
self.send_response(status)
|
||||
if retry_after:
|
||||
self.send_header('Retry-After', retry_after)
|
||||
self.end_headers()
|
||||
|
||||
def log_date_time_string(self):
|
||||
"""Strip date and time from logged messages for assertions."""
|
||||
return ""
|
||||
|
||||
return RetryAfterHandler
|
||||
|
||||
|
||||
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
|
||||
def test_too_many_requests_retry_after_int_delay(app, capsys, status):
|
||||
with http_server(make_retry_after_handler([(429, "0"), (200, None)])), \
|
||||
mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0), \
|
||||
mock.patch("sphinx.builders.linkcheck.QUEUE_POLL_SECS", 0.01):
|
||||
app.builder.build_all()
|
||||
content = (app.outdir / 'output.json').read_text()
|
||||
assert json.loads(content) == {
|
||||
"filename": "index.rst",
|
||||
"lineno": 1,
|
||||
"status": "working",
|
||||
"code": 0,
|
||||
"uri": "http://localhost:7777/",
|
||||
"info": "",
|
||||
}
|
||||
rate_limit_log = "-rate limited- http://localhost:7777/ | sleeping...\n"
|
||||
assert rate_limit_log in strip_colors(status.getvalue())
|
||||
_stdout, stderr = capsys.readouterr()
|
||||
assert stderr == textwrap.dedent(
|
||||
"""\
|
||||
127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
|
||||
127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
|
||||
def test_too_many_requests_retry_after_HTTP_date(app, capsys):
|
||||
now = datetime.now().timetuple()
|
||||
retry_after = wsgiref.handlers.format_date_time(time.mktime(now))
|
||||
with http_server(make_retry_after_handler([(429, retry_after), (200, None)])):
|
||||
app.builder.build_all()
|
||||
content = (app.outdir / 'output.json').read_text()
|
||||
assert json.loads(content) == {
|
||||
"filename": "index.rst",
|
||||
"lineno": 1,
|
||||
"status": "working",
|
||||
"code": 0,
|
||||
"uri": "http://localhost:7777/",
|
||||
"info": "",
|
||||
}
|
||||
_stdout, stderr = capsys.readouterr()
|
||||
assert stderr == textwrap.dedent(
|
||||
"""\
|
||||
127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
|
||||
127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
|
||||
def test_too_many_requests_retry_after_without_header(app, capsys):
|
||||
with http_server(make_retry_after_handler([(429, None), (200, None)])),\
|
||||
mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0):
|
||||
app.builder.build_all()
|
||||
content = (app.outdir / 'output.json').read_text()
|
||||
assert json.loads(content) == {
|
||||
"filename": "index.rst",
|
||||
"lineno": 1,
|
||||
"status": "working",
|
||||
"code": 0,
|
||||
"uri": "http://localhost:7777/",
|
||||
"info": "",
|
||||
}
|
||||
_stdout, stderr = capsys.readouterr()
|
||||
assert stderr == textwrap.dedent(
|
||||
"""\
|
||||
127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
|
||||
127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
|
||||
def test_too_many_requests_user_timeout(app, capsys):
|
||||
app.config.linkcheck_rate_limit_timeout = 0.0
|
||||
with http_server(make_retry_after_handler([(429, None)])):
|
||||
app.builder.build_all()
|
||||
content = (app.outdir / 'output.json').read_text()
|
||||
assert json.loads(content) == {
|
||||
"filename": "index.rst",
|
||||
"lineno": 1,
|
||||
"status": "broken",
|
||||
"code": 0,
|
||||
"uri": "http://localhost:7777/",
|
||||
"info": "429 Client Error: Too Many Requests for url: http://localhost:7777/",
|
||||
}
|
||||
|
||||
|
||||
class FakeResponse:
|
||||
headers = {} # type: Dict[str, str]
|
||||
url = "http://localhost/"
|
||||
|
||||
|
||||
def test_limit_rate_default_sleep(app):
|
||||
checker = CheckExternalLinksBuilder(app)
|
||||
checker.rate_limits = {}
|
||||
with mock.patch('time.time', return_value=0.0):
|
||||
next_check = checker.limit_rate(FakeResponse())
|
||||
assert next_check == 60.0
|
||||
|
||||
|
||||
def test_limit_rate_user_max_delay(app):
|
||||
app.config.linkcheck_rate_limit_timeout = 0.0
|
||||
checker = CheckExternalLinksBuilder(app)
|
||||
checker.rate_limits = {}
|
||||
next_check = checker.limit_rate(FakeResponse())
|
||||
assert next_check is None
|
||||
|
||||
|
||||
def test_limit_rate_doubles_previous_wait_time(app):
|
||||
checker = CheckExternalLinksBuilder(app)
|
||||
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
|
||||
with mock.patch('time.time', return_value=0.0):
|
||||
next_check = checker.limit_rate(FakeResponse())
|
||||
assert next_check == 120.0
|
||||
|
||||
|
||||
def test_limit_rate_clips_wait_time_to_max_time(app):
|
||||
checker = CheckExternalLinksBuilder(app)
|
||||
app.config.linkcheck_rate_limit_timeout = 90.0
|
||||
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
|
||||
with mock.patch('time.time', return_value=0.0):
|
||||
next_check = checker.limit_rate(FakeResponse())
|
||||
assert next_check == 90.0
|
||||
|
||||
|
||||
def test_limit_rate_bails_out_after_waiting_max_time(app):
|
||||
checker = CheckExternalLinksBuilder(app)
|
||||
app.config.linkcheck_rate_limit_timeout = 90.0
|
||||
checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
|
||||
next_check = checker.limit_rate(FakeResponse())
|
||||
assert next_check is None
|
||||
|
Loading…
Reference in New Issue
Block a user