Merge pull request #8467 from francoisfreitag/rate-limit

Fix #6629: linkcheck: Handle rate-limiting
Takeshi KOMIYA 2020-11-26 02:10:47 +09:00 committed by GitHub
commit 5a0e1231ad
6 changed files with 285 additions and 10 deletions

CHANGES

@@ -39,6 +39,8 @@ Features added
* #6914: Add a new event :event:`warn-missing-reference` to customize warning
  messages when a cross-reference fails to resolve
* #6914: Emit a detailed warning when a ``:ref:`` reference fails to resolve
* #6629: linkcheck: The builder now handles rate limits. See
  :confval:`linkcheck_rate_limit_timeout` for details.

Bugs fixed
----------

doc/conf.py

@@ -110,7 +110,10 @@ texinfo_documents = [
     1),
]

intersphinx_mapping = {
    'python': ('https://docs.python.org/3/', None),
    'requests': ('https://requests.readthedocs.io/en/master', None),
}

# Sphinx document translation with sphinx gettext feature uses these settings:
locale_dirs = ['locale/']

doc/usage/builders/index.rst

@@ -442,6 +442,10 @@ name is ``rinoh``. Refer to the `rinohtype manual`_ for details.

   Since Sphinx 1.5, the linkcheck builder uses the requests module.

   .. versionchanged:: 3.4

      The linkcheck builder retries links when servers apply rate limits.

.. module:: sphinx.builders.xml
.. class:: XMLBuilder

doc/usage/configuration.rst

@@ -2525,6 +2525,23 @@ Options for the linkcheck builder

   .. versionadded:: 2.3

.. confval:: linkcheck_rate_limit_timeout

   The ``linkcheck`` builder may issue a large number of requests to the same
   site over a short period of time. This setting controls the builder's
   behavior when servers indicate that requests are rate-limited.

   If a server indicates when to retry (using the `Retry-After`_ header),
   ``linkcheck`` always follows the server's indication.

   Otherwise, ``linkcheck`` waits for a minute before retrying and keeps
   doubling the wait time between attempts until it succeeds or exceeds the
   ``linkcheck_rate_limit_timeout``. By default, the timeout is 5 minutes.

   .. _Retry-After: https://tools.ietf.org/html/rfc7231#section-7.1.3

   .. versionadded:: 3.4
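For illustration, the retry schedule described above can be written as a small
standalone function. This is an editor's sketch of the documented behavior, not
code from the patch; ``max_delay`` stands in for
:confval:`linkcheck_rate_limit_timeout`, and ``60.0`` mirrors the builder's
default first delay:

    from typing import Optional

    def next_delay(last_delay: Optional[float], max_delay: float = 300.0) -> Optional[float]:
        """Next wait time for a rate-limited host, following the rule above."""
        if last_delay is None:
            return 60.0  # first retry waits one minute
        delay = 2.0 * last_delay  # keep doubling the wait time
        if delay > max_delay and last_delay < max_delay:
            return max_delay  # clip to the timeout for one final attempt
        if delay > max_delay:
            return None  # timeout exceeded: the link is reported as broken
        return delay

    # With the default 5-minute timeout: 60.0, 120.0, 240.0, 300.0, then give up.

A project that wants to give slow servers more room can raise the timeout in
``conf.py``, e.g. ``linkcheck_rate_limit_timeout = 600.0``.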
Options for the XML builder
---------------------------

sphinx/builders/linkcheck.py

@@ -13,13 +13,17 @@ import queue
import re
import socket
import threading
import time
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
from urllib.parse import unquote, urlparse

from docutils import nodes
from docutils.nodes import Node
from requests import Response
from requests.exceptions import HTTPError, TooManyRedirects

from sphinx.application import Sphinx
@@ -33,10 +37,14 @@ logger = logging.getLogger(__name__)

uri_re = re.compile('([a-z]+:)?//')  # matches foo:// and // (a protocol-relative URL)

RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
}
CHECK_IMMEDIATELY = 0
QUEUE_POLL_SECS = 1
DEFAULT_DELAY = 60.0


class AnchorCheckParser(HTMLParser):
@@ -98,7 +106,8 @@ class CheckExternalLinksBuilder(Builder):
        open(path.join(self.outdir, 'output.json'), 'w').close()

        # create queues and worker threads
        self.rate_limits = {}  # type: Dict[str, RateLimit]
        # The priority queue orders items by their next-check time, so
        # rate-limited URLs are only retried after their delay has elapsed.
        self.wqueue = queue.PriorityQueue()  # type: queue.PriorityQueue
        self.rqueue = queue.Queue()  # type: queue.Queue
        self.workers = []  # type: List[threading.Thread]
        for i in range(self.app.config.linkcheck_workers):
@@ -172,16 +181,25 @@ class CheckExternalLinksBuilder(Builder):
                                         config=self.app.config, auth=auth_info,
                                         **kwargs)
                response.raise_for_status()
            except (HTTPError, TooManyRedirects) as err:
                # Let a 429 propagate: it is handled by the rate-limit logic below.
                if isinstance(err, HTTPError) and err.response.status_code == 429:
                    raise
                # retry with a GET request if that fails; some servers
                # don't like HEAD requests.
                response = requests.get(req_url, stream=True,
                                        config=self.app.config,
                                        auth=auth_info, **kwargs)
                response.raise_for_status()
        except HTTPError as err:
            if err.response.status_code == 401:
                # We'll take "Unauthorized" as working.
                return 'working', ' - unauthorized', 0
            elif err.response.status_code == 429:
                # Too Many Requests: schedule a retry, or report the link as
                # broken once the backoff exceeds linkcheck_rate_limit_timeout.
                next_check = self.limit_rate(err.response)
                if next_check is not None:
                    self.wqueue.put((next_check, uri, docname, lineno), False)
                    return 'rate-limited', '', 0
                return 'broken', str(err), 0
            elif err.response.status_code == 503:
                # We'll take "Service Unavailable" as ignored.
                return 'ignored', str(err), 0
@@ -189,6 +207,12 @@ class CheckExternalLinksBuilder(Builder):
                return 'broken', str(err), 0
        except Exception as err:
            return 'broken', str(err), 0
        else:
            # The request succeeded: drop any rate limit recorded for this host.
            netloc = urlparse(req_url).netloc
            try:
                del self.rate_limits[netloc]
            except KeyError:
                pass
        if response.url.rstrip('/') == req_url.rstrip('/'):
            return 'working', '', 0
        else:
@@ -247,11 +271,69 @@ class CheckExternalLinksBuilder(Builder):
            return (status, info, code)

        while True:
            next_check, uri, docname, lineno = self.wqueue.get()
            if uri is None:
                break
            netloc = urlparse(uri).netloc
            try:
                # Refresh rate limit.
                # When there are many links in the queue, workers are all stuck waiting
                # for responses, but the builder keeps queuing. Links in the queue may
                # have been queued before rate limits were discovered.
                next_check = self.rate_limits[netloc].next_check
            except KeyError:
                pass
            if next_check > time.time():
                # Sleep before putting the message back in the queue to avoid
                # waking up other threads.
                time.sleep(QUEUE_POLL_SECS)
                self.wqueue.put((next_check, uri, docname, lineno), False)
                self.wqueue.task_done()
                continue
            status, info, code = check(docname)
            if status == 'rate-limited':
                logger.info(darkgray('-rate limited- ') + uri + darkgray(' | sleeping...'))
            else:
                self.rqueue.put((uri, docname, lineno, status, info, code))
            self.wqueue.task_done()
    def limit_rate(self, response: Response) -> Optional[float]:
        next_check = None
        retry_after = response.headers.get("Retry-After")
        if retry_after:
            try:
                # Integer: number of seconds to wait before the next attempt.
                delay = float(retry_after)
            except ValueError:
                try:
                    # An HTTP-date: the time of the next attempt.
                    until = parsedate_to_datetime(retry_after)
                except (TypeError, ValueError):
                    # TypeError: invalid date format.
                    # ValueError: invalid date, e.g. Oct 52nd.
                    pass
                else:
                    next_check = datetime.timestamp(until)
                    delay = (until - datetime.now(timezone.utc)).total_seconds()
            else:
                next_check = time.time() + delay
        netloc = urlparse(response.url).netloc
        if next_check is None:
            # No (usable) Retry-After header: fall back to exponential backoff.
            max_delay = self.app.config.linkcheck_rate_limit_timeout
            try:
                rate_limit = self.rate_limits[netloc]
            except KeyError:
                delay = DEFAULT_DELAY
            else:
                last_wait_time = rate_limit.delay
                delay = 2.0 * last_wait_time
                if delay > max_delay and last_wait_time < max_delay:
                    delay = max_delay
                if delay > max_delay:
                    return None
            next_check = time.time() + delay
        self.rate_limits[netloc] = RateLimit(delay, next_check)
        return next_check

    def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
        uri, docname, lineno, status, info, code = result

@@ -325,7 +407,8 @@ class CheckExternalLinksBuilder(Builder):
                continue
            uri = refnode['refuri']
            lineno = get_node_line(refnode)
            uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
            self.wqueue.put(uri_info, False)
            n += 1

        # image nodes
@@ -333,7 +416,8 @@ class CheckExternalLinksBuilder(Builder):
            uri = imgnode['candidates'].get('?')
            if uri and '://' in uri:
                lineno = get_node_line(imgnode)
                uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
                self.wqueue.put(uri_info, False)
                n += 1

        done = 0
@@ -355,8 +439,10 @@ class CheckExternalLinksBuilder(Builder):
                output.write('\n')

    def finish(self) -> None:
        self.wqueue.join()
        # Shut down the worker threads.
        for worker in self.workers:
            self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)

def setup(app: Sphinx) -> Dict[str, Any]:

@@ -372,6 +458,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
    # Anchors starting with ! are ignored since they are
    # commonly used for dynamic pages
    app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
    app.add_config_value('linkcheck_rate_limit_timeout', 300.0, None)

    return {
        'version': 'builtin',
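Aside (editor's illustration, not part of the patch): the two ``Retry-After``
formats that ``limit_rate()`` accepts can be exercised in isolation with the
standard library alone. ``retry_after_seconds`` below is a hypothetical helper,
not a Sphinx function:

    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    def retry_after_seconds(value: str) -> float:
        """Return the delay in seconds encoded by a Retry-After header value."""
        try:
            return float(value)  # delta-seconds form, e.g. "120"
        except ValueError:
            # HTTP-date form, e.g. "Wed, 21 Oct 2015 07:28:00 GMT"
            until = parsedate_to_datetime(value)
            return (until - datetime.now(timezone.utc)).total_seconds()

    print(retry_after_seconds("120"))  # 120.0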

tests/test_build_linkcheck.py

@@ -10,13 +10,24 @@

import http.server
import json
import re
import textwrap
import time
import wsgiref.handlers
from datetime import datetime
from typing import Dict
from unittest import mock

import pytest
import requests

from sphinx.builders.linkcheck import CheckExternalLinksBuilder, RateLimit
from sphinx.util.console import strip_colors

from .utils import CERT_FILE, http_server, https_server, modify_env

ts_re = re.compile(r".*\[(?P<ts>.*)\].*")

@pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True)
def test_defaults(app):
@@ -410,3 +421,154 @@ def test_TooManyRedirects_on_HEAD(app):
        "uri": "http://localhost:7777/",
        "info": "",
    }


def make_retry_after_handler(responses):
    class RetryAfterHandler(http.server.BaseHTTPRequestHandler):
        def do_HEAD(self):
            status, retry_after = responses.pop(0)
            self.send_response(status)
            if retry_after:
                self.send_header('Retry-After', retry_after)
            self.end_headers()

        def log_date_time_string(self):
            """Strip date and time from logged messages for assertions."""
            return ""

    return RetryAfterHandler


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_retry_after_int_delay(app, capsys, status):
    with http_server(make_retry_after_handler([(429, "0"), (200, None)])), \
            mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0), \
            mock.patch("sphinx.builders.linkcheck.QUEUE_POLL_SECS", 0.01):
        app.builder.build_all()
    content = (app.outdir / 'output.json').read_text()
    assert json.loads(content) == {
        "filename": "index.rst",
        "lineno": 1,
        "status": "working",
        "code": 0,
        "uri": "http://localhost:7777/",
        "info": "",
    }
    rate_limit_log = "-rate limited- http://localhost:7777/ | sleeping...\n"
    assert rate_limit_log in strip_colors(status.getvalue())
    _stdout, stderr = capsys.readouterr()
    assert stderr == textwrap.dedent(
        """\
        127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
        127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
        """
    )


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_retry_after_HTTP_date(app, capsys):
    now = datetime.now().timetuple()
    retry_after = wsgiref.handlers.format_date_time(time.mktime(now))
    with http_server(make_retry_after_handler([(429, retry_after), (200, None)])):
        app.builder.build_all()
    content = (app.outdir / 'output.json').read_text()
    assert json.loads(content) == {
        "filename": "index.rst",
        "lineno": 1,
        "status": "working",
        "code": 0,
        "uri": "http://localhost:7777/",
        "info": "",
    }
    _stdout, stderr = capsys.readouterr()
    assert stderr == textwrap.dedent(
        """\
        127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
        127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
        """
    )


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_retry_after_without_header(app, capsys):
    with http_server(make_retry_after_handler([(429, None), (200, None)])), \
            mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0):
        app.builder.build_all()
    content = (app.outdir / 'output.json').read_text()
    assert json.loads(content) == {
        "filename": "index.rst",
        "lineno": 1,
        "status": "working",
        "code": 0,
        "uri": "http://localhost:7777/",
        "info": "",
    }
    _stdout, stderr = capsys.readouterr()
    assert stderr == textwrap.dedent(
        """\
        127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
        127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
        """
    )


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
def test_too_many_requests_user_timeout(app, capsys):
    app.config.linkcheck_rate_limit_timeout = 0.0
    with http_server(make_retry_after_handler([(429, None)])):
        app.builder.build_all()
    content = (app.outdir / 'output.json').read_text()
    assert json.loads(content) == {
        "filename": "index.rst",
        "lineno": 1,
        "status": "broken",
        "code": 0,
        "uri": "http://localhost:7777/",
        "info": "429 Client Error: Too Many Requests for url: http://localhost:7777/",
    }


class FakeResponse:
    headers = {}  # type: Dict[str, str]
    url = "http://localhost/"


def test_limit_rate_default_sleep(app):
    checker = CheckExternalLinksBuilder(app)
    checker.rate_limits = {}
    with mock.patch('time.time', return_value=0.0):
        next_check = checker.limit_rate(FakeResponse())
    assert next_check == 60.0


def test_limit_rate_user_max_delay(app):
    app.config.linkcheck_rate_limit_timeout = 0.0
    checker = CheckExternalLinksBuilder(app)
    checker.rate_limits = {}
    next_check = checker.limit_rate(FakeResponse())
    assert next_check is None


def test_limit_rate_doubles_previous_wait_time(app):
    checker = CheckExternalLinksBuilder(app)
    checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
    with mock.patch('time.time', return_value=0.0):
        next_check = checker.limit_rate(FakeResponse())
    assert next_check == 120.0


def test_limit_rate_clips_wait_time_to_max_time(app):
    checker = CheckExternalLinksBuilder(app)
    app.config.linkcheck_rate_limit_timeout = 90.0
    checker.rate_limits = {"localhost": RateLimit(60.0, 0.0)}
    with mock.patch('time.time', return_value=0.0):
        next_check = checker.limit_rate(FakeResponse())
    assert next_check == 90.0


def test_limit_rate_bails_out_after_waiting_max_time(app):
    checker = CheckExternalLinksBuilder(app)
    app.config.linkcheck_rate_limit_timeout = 90.0
    checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
    next_check = checker.limit_rate(FakeResponse())
    assert next_check is None