Fix #6629: linkcheck: Handle rate-limiting

Follow the Retry-After header if present, otherwise use an exponential
back-off.
François Freitag 2020-10-04 21:40:14 +02:00
parent c848d4dd37
commit 6b90a63f08
6 changed files with 285 additions and 10 deletions

CHANGES

@@ -39,6 +39,8 @@ Features added
* #6914: Add a new event :event:`warn-missing-reference` to customize warning
  messages when a cross-reference fails to resolve
* #6914: Emit a detailed warning when failing to resolve a ``:ref:`` reference
* #6629: linkcheck: The builder now handles rate limits. See
  :confval:`linkcheck_rate_limit_timeout` for details.
Bugs fixed
----------

doc/conf.py

@@ -110,7 +110,10 @@ texinfo_documents = [
1),
]
intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'requests': ('https://requests.readthedocs.io/en/master', None),
}
# Sphinx document translation with sphinx gettext feature uses these settings:
locale_dirs = ['locale/']

doc/usage/builders/index.rst

@@ -442,6 +442,10 @@ name is ``rinoh``. Refer to the `rinohtype manual`_ for details.
Since Sphinx 1.5, the linkcheck builder uses the ``requests`` module.
.. versionchanged:: 3.4
The linkcheck builder retries links when servers apply rate limits.
.. module:: sphinx.builders.xml
.. class:: XMLBuilder

doc/usage/configuration.rst

@@ -2525,6 +2525,23 @@ Options for the linkcheck builder
.. versionadded:: 2.3
.. confval:: linkcheck_rate_limit_timeout
The ``linkcheck`` builder may issue a large number of requests to the same
site over a short period of time. This setting controls the builder's
behavior when servers indicate that requests are rate-limited.
If a server indicates when to retry (using the `Retry-After`_ header),
``linkcheck`` always follows that indication.
Otherwise, ``linkcheck`` waits a minute before retrying and keeps doubling
the wait time between attempts until it either succeeds or exceeds the
``linkcheck_rate_limit_timeout``. By default, the timeout is 5 minutes.
.. _Retry-After: https://tools.ietf.org/html/rfc7231#section-7.1.3
.. versionadded:: 3.4
Options for the XML builder
---------------------------

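For reference, a standalone sketch of the back-off schedule described above,
assuming the default five-minute timeout (this loop is illustrative, not code
from the commit)::

    DEFAULT_DELAY = 60.0   # first wait, per the documentation above
    max_delay = 300.0      # linkcheck_rate_limit_timeout default

    delay, schedule = DEFAULT_DELAY, []
    while delay <= max_delay:
        schedule.append(delay)
        last_wait_time, delay = delay, 2.0 * delay
        if delay > max_delay and last_wait_time < max_delay:
            delay = max_delay  # clip the final attempt to the timeout
    print(schedule)  # [60.0, 120.0, 240.0, 300.0] -- then the link is given up on
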
sphinx/builders/linkcheck.py

@@ -13,13 +13,17 @@ import queue
import re
import socket
import threading
import time
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, Set, Tuple
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
from urllib.parse import unquote, urlparse
from docutils import nodes
from docutils.nodes import Node
from requests import Response
from requests.exceptions import HTTPError, TooManyRedirects
from sphinx.application import Sphinx
@@ -33,10 +37,14 @@ logger = logging.getLogger(__name__)
uri_re = re.compile('([a-z]+:)?//') # matches foo:// and // (a protocol-relative URL)
RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
}
CHECK_IMMEDIATELY = 0
QUEUE_POLL_SECS = 1
DEFAULT_DELAY = 60.0
class AnchorCheckParser(HTMLParser):
@@ -98,7 +106,8 @@ class CheckExternalLinksBuilder(Builder):
open(path.join(self.outdir, 'output.json'), 'w').close()
# create queues and worker threads
self.wqueue = queue.Queue() # type: queue.Queue
self.rate_limits = {} # type: Dict[str, RateLimit]
self.wqueue = queue.PriorityQueue() # type: queue.PriorityQueue
self.rqueue = queue.Queue() # type: queue.Queue
self.workers = [] # type: List[threading.Thread]
for i in range(self.app.config.linkcheck_workers):
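Switching to a priority queue keyed on the check time means deferred links
naturally sort behind links that are ready. A minimal sketch of that ordering
(URLs and delays are illustrative)::

    import queue

    wqueue = queue.PriorityQueue()
    wqueue.put((120.0, "https://busy.example/", "index", 1))  # deferred until t=120
    wqueue.put((0, "https://ready.example/", "index", 2))     # CHECK_IMMEDIATELY

    # The entry with the smallest next_check is dequeued first.
    assert wqueue.get()[1] == "https://ready.example/"
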
@@ -172,16 +181,25 @@ class CheckExternalLinksBuilder(Builder):
config=self.app.config, auth=auth_info,
**kwargs)
response.raise_for_status()
except (HTTPError, TooManyRedirects):
except (HTTPError, TooManyRedirects) as err:
if isinstance(err, HTTPError) and err.response.status_code == 429:
raise
# retry with a GET request if that fails; some servers
# don't like HEAD requests.
response = requests.get(req_url, stream=True, config=self.app.config,
response = requests.get(req_url, stream=True,
config=self.app.config,
auth=auth_info, **kwargs)
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put((next_check, uri, docname, lineno), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
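Outside the builder, the same 429 detection reduces to a few lines of
``requests``; a minimal sketch, assuming a local server such as the one the
tests below spin up (the URL is illustrative)::

    import requests

    try:
        response = requests.head("http://localhost:7777/")
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        if err.response.status_code == 429:
            # The server is rate-limiting us; it may say when to come back.
            print("retry after:", err.response.headers.get("Retry-After"))
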
@@ -189,6 +207,12 @@ class CheckExternalLinksBuilder(Builder):
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0
else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
return 'working', '', 0
else:
@@ -247,11 +271,69 @@ class CheckExternalLinksBuilder(Builder):
return (status, info, code)
while True:
uri, docname, lineno = self.wqueue.get()
next_check, uri, docname, lineno = self.wqueue.get()
if uri is None:
break
netloc = urlparse(uri).netloc
try:
# Refresh rate limit.
# When there are many links in the queue, workers are all stuck waiting
# for responses, but the builder keeps queuing. Links in the queue may
# have been queued before rate limits were discovered.
next_check = self.rate_limits[netloc].next_check
except KeyError:
pass
if next_check > time.time():
# Sleep before putting message back in the queue to avoid
# waking up other threads.
time.sleep(QUEUE_POLL_SECS)
self.wqueue.put((next_check, uri, docname, lineno), False)
self.wqueue.task_done()
continue
status, info, code = check(docname)
self.rqueue.put((uri, docname, lineno, status, info, code))
if status == 'rate-limited':
logger.info(darkgray('-rate limited- ') + uri + darkgray(' | sleeping...'))
else:
self.rqueue.put((uri, docname, lineno, status, info, code))
self.wqueue.task_done()
def limit_rate(self, response: Response) -> Optional[float]:
next_check = None
retry_after = response.headers.get("Retry-After")
if retry_after:
try:
# Integer: time to wait before next attempt.
delay = float(retry_after)
except ValueError:
try:
# An HTTP-date: time of next attempt.
until = parsedate_to_datetime(retry_after)
except (TypeError, ValueError):
# TypeError: Invalid date format.
# ValueError: Invalid date, e.g. Oct 52th.
pass
else:
next_check = datetime.timestamp(until)
delay = (until - datetime.now(timezone.utc)).total_seconds()
else:
next_check = time.time() + delay
netloc = urlparse(response.url).netloc
if next_check is None:
max_delay = self.app.config.linkcheck_rate_limit_timeout
try:
rate_limit = self.rate_limits[netloc]
except KeyError:
delay = DEFAULT_DELAY
else:
last_wait_time = rate_limit.delay
delay = 2.0 * last_wait_time
if delay > max_delay and last_wait_time < max_delay:
delay = max_delay
if delay > max_delay:
return None
next_check = time.time() + delay
self.rate_limits[netloc] = RateLimit(delay, next_check)
return next_check
def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
uri, docname, lineno, status, info, code = result
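For reference, the two ``Retry-After`` forms that ``limit_rate`` accepts can
be parsed with nothing but the stdlib imports added above; a minimal sketch
with illustrative header values::

    import time
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    # Form 1 -- a delay in seconds:
    delay = float("120")
    next_check = time.time() + delay

    # Form 2 -- an HTTP-date giving the time of the next attempt:
    until = parsedate_to_datetime("Sun, 04 Oct 2020 19:40:14 GMT")
    next_check = datetime.timestamp(until)
    delay = (until - datetime.now(timezone.utc)).total_seconds()
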
@@ -325,7 +407,8 @@ class CheckExternalLinksBuilder(Builder):
continue
uri = refnode['refuri']
lineno = get_node_line(refnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
self.wqueue.put(uri_info, False)
n += 1
# image nodes
@@ -333,7 +416,8 @@ class CheckExternalLinksBuilder(Builder):
uri = imgnode['candidates'].get('?')
if uri and '://' in uri:
lineno = get_node_line(imgnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
self.wqueue.put(uri_info, False)
n += 1
done = 0
@@ -355,8 +439,10 @@ class CheckExternalLinksBuilder(Builder):
output.write('\n')
def finish(self) -> None:
self.wqueue.join()
# Shutdown threads.
for worker in self.workers:
self.wqueue.put((None, None, None), False)
self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
def setup(app: Sphinx) -> Dict[str, Any]:
@@ -372,6 +458,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
# Anchors starting with ! are ignored since they are
# commonly used for dynamic pages
app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
app.add_config_value('linkcheck_rate_limit_timeout', 300.0, None)
return {
'version': 'builtin',

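With the confval registered in ``setup()``, projects can tune the back-off
window from their ``conf.py``; for example (the value is illustrative)::

    # conf.py
    linkcheck_rate_limit_timeout = 60.0  # give up on a rate-limited link after a minute
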
tests/test_build_linkcheck.py

@@ -10,13 +10,24 @@
import http.server
import json
import re
import textwrap
import time
import wsgiref.handlers
from datetime import datetime
from typing import Dict
from unittest import mock
import pytest
import requests
from sphinx.builders.linkcheck import CheckExternalLinksBuilder, RateLimit
from sphinx.util.console import strip_colors
from .utils import CERT_FILE, http_server, https_server, modify_env
ts_re = re.compile(r".*\[(?P<ts>.*)\].*")
@pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True)
def test_defaults(app):
@@ -410,3 +421,154 @@ def test_TooManyRedirects_on_HEAD(app):
"uri": "http://localhost:7777/",
"info": "",
}
def make_retry_after_handler(responses):
class RetryAfterHandler(http.server.BaseHTTPRequestHandler):
def do_HEAD(self):
status, retry_after = responses.pop(0)
self.send_response(status)
if retry_after:
self.send_header('Retry-After', retry_after)
self.end_headers()
def log_date_time_string(self):
"""Strip date and time from logged messages for assertions."""
return ""
return RetryAfterHandler
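The handler factory also works outside the test suite with the stdlib server,
which is handy for poking at the builder by hand; a quick sketch (port and
response sequence are illustrative)::

    from http.server import HTTPServer

    handler = make_retry_after_handler([(429, "1"), (200, None)])
    server = HTTPServer(("localhost", 7777), handler)
    server.handle_request()  # first HEAD -> 429 with "Retry-After: 1"
    server.handle_request()  # second HEAD -> 200
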
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_retry_after_int_delay(app, capsys, status):
with http_server(make_retry_after_handler([(429, "0"), (200, None)])), \
mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0), \
mock.patch("sphinx.builders.linkcheck.QUEUE_POLL_SECS", 0.01):
app.builder.build_all()
content = (app.outdir / 'output.json').read_text()
assert json.loads(content) == {
"filename": "index.rst",
"lineno": 1,
"status": "working",
"code": 0,
"uri": "http://localhost:7777/",
"info": "",
}
rate_limit_log = "-rate limited- http://localhost:7777/ | sleeping...\n"
assert rate_limit_log in strip_colors(status.getvalue())
_stdout, stderr = capsys.readouterr()
assert stderr == textwrap.dedent(
"""\
127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
"""
)
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_retry_after_HTTP_date(app, capsys):
now = datetime.now().timetuple()
retry_after = wsgiref.handlers.format_date_time(time.mktime(now))
with http_server(make_retry_after_handler([(429, retry_after), (200, None)])):
app.builder.build_all()
content = (app.outdir / 'output.json').read_text()
assert json.loads(content) == {
"filename": "index.rst",
"lineno": 1,
"status": "working",
"code": 0,
"uri": "http://localhost:7777/",
"info": "",
}
_stdout, stderr = capsys.readouterr()
assert stderr == textwrap.dedent(
"""\
127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
"""
)
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_retry_after_without_header(app, capsys):
with http_server(make_retry_after_handler([(429, None), (200, None)])),\
mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0):
app.builder.build_all()
content = (app.outdir / 'output.json').read_text()
assert json.loads(content) == {
"filename": "index.rst",
"lineno": 1,
"status": "working",
"code": 0,
"uri": "http://localhost:7777/",
"info": "",
}
_stdout, stderr = capsys.readouterr()
assert stderr == textwrap.dedent(
"""\
127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
"""
)
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_user_timeout(app, capsys):
app.config.linkcheck_rate_limit_timeout = 0.0
with http_server(make_retry_after_handler([(429, None)])):
app.builder.build_all()
content = (app.outdir / 'output.json').read_text()
assert json.loads(content) == {
"filename": "index.rst",
"lineno": 1,
"status": "broken",
"code": 0,
"uri": "http://localhost:7777/",
"info": "429 Client Error: Too Many Requests for url: http://localhost:7777/",
}
class FakeResponse:
headers = {} # type: Dict[str, str]
url = "http://localhost/"
def test_limit_rate_default_sleep(app):
checker = CheckExternalLinksBuilder(app)
checker.rate_limits = {}
with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse())
assert next_check == 60.0
def test_limit_rate_user_max_delay(app):
app.config.linkcheck_rate_limit_timeout = 0.0
checker = CheckExternalLinksBuilder(app)
checker.rate_limits = {}
next_check = checker.limit_rate(FakeResponse())
assert next_check is None
def test_limit_rate_doubles_previous_wait_time(app):
checker = CheckExternalLinksBuilder(app)
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse())
assert next_check == 120.0
def test_limit_rate_clips_wait_time_to_max_time(app):
checker = CheckExternalLinksBuilder(app)
app.config.linkcheck_rate_limit_timeout = 90.0
checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
with mock.patch('time.time', return_value=0.0):
next_check = checker.limit_rate(FakeResponse())
assert next_check == 90.0
def test_limit_rate_bails_out_after_waiting_max_time(app):
checker = CheckExternalLinksBuilder(app)
app.config.linkcheck_rate_limit_timeout = 90.0
checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
next_check = checker.limit_rate(FakeResponse())
assert next_check is None