linkcheck: support ignored-URIs for redirects (#13127)

Co-authored-by: Adam Turner <9087854+aa-turner@users.noreply.github.com>
This commit is contained in:
James Addison 2025-01-05 01:20:15 +00:00 committed by GitHub
parent 182f621cad
commit 872d270f10
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 99 additions and 4 deletions

View File

@ -43,6 +43,8 @@ Bugs fixed
Patch by Jean-François B.
* #13096: HTML Search: check that query terms exist as properties in
term indices before accessing them.
* #11233: linkcheck: match redirect URIs against :confval:`linkcheck_ignore` by
overriding the session-level ``requests.Session.get_redirect_target`` method.
Testing
-------

View File

@ -3709,6 +3709,9 @@ and which failures and redirects it ignores.
A list of regular expressions that match URIs that should not be checked
when doing a ``linkcheck`` build.
Server-issued redirects that match :confval:`ignored URIs <linkcheck_ignore>`
will not be followed.
Example:
.. code-block:: python

View File

@ -398,7 +398,9 @@ class HyperlinkAvailabilityCheckWorker(Thread):
self.tls_verify = config.tls_verify
self.tls_cacerts = config.tls_cacerts
self._session = requests._Session()
self._session = requests._Session(
_ignored_redirects=tuple(map(re.compile, config.linkcheck_ignore))
)
super().__init__(daemon=True)
@ -570,6 +572,14 @@ class HyperlinkAvailabilityCheckWorker(Thread):
error_message = str(err)
continue
except requests._IgnoredRedirection as err:
# A redirection to an ignored URI was attempted; report it appropriately
return (
_Status.IGNORED,
f'ignored redirect: {err.destination}',
err.status_code,
)
except HTTPError as err:
error_message = str(err)

View File

@ -3,20 +3,34 @@
from __future__ import annotations
import warnings
from typing import Any
from urllib.parse import urlsplit
from typing import TYPE_CHECKING
from urllib.parse import urljoin, urlsplit
import requests
from urllib3.exceptions import InsecureRequestWarning
import sphinx
if TYPE_CHECKING:
import re
from collections.abc import Sequence
from typing import Any
# Browser-like identification string with the running Sphinx version appended.
_USER_AGENT = (
    'Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0 '
    f'Sphinx/{sphinx.__version__}'
)
class _IgnoredRedirection(Exception):
"""Sphinx-internal exception raised when an HTTP redirect is ignored"""
def __init__(self, destination: str, status_code: int) -> None:
self.destination = destination
self.status_code = status_code
def _get_tls_cacert(url: str, certs: str | dict[str, str] | None) -> str | bool:
"""Get additional CA cert for a specific URL."""
if not certs:
@ -50,6 +64,23 @@ def head(url: str, **kwargs: Any) -> requests.Response:
class _Session(requests.Session):
_ignored_redirects: Sequence[re.Pattern[str]]
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """Create the session, remembering which redirect targets to ignore.

    The private ``_ignored_redirects`` keyword (default: an empty tuple) is
    removed from *kwargs* before the remaining arguments are handed to the
    parent initialiser.
    """
    ignored_patterns = kwargs.pop('_ignored_redirects', ())
    self._ignored_redirects = ignored_patterns
    super().__init__(*args, **kwargs)
def get_redirect_target(self, resp: requests.Response) -> str | None:
    """Override ``requests.Session.get_redirect_target``.

    When *resp* is a redirect, its ``Location`` header is resolved against
    the response URL and tested with ``re.Pattern.match`` against each of
    the session's ignored-redirect patterns.

    :raises _IgnoredRedirection: If the resolved destination matches any
        ignored pattern; the redirect is then not followed.
    :returns: Otherwise, whatever the parent implementation returns.
    """
    if not resp.is_redirect:
        return super().get_redirect_target(resp)

    # Resolve relative ``Location`` values before matching the patterns.
    target = urljoin(resp.url, resp.headers['location'])
    for pattern in self._ignored_redirects:
        if pattern.match(target):
            raise _IgnoredRedirection(
                destination=target, status_code=resp.status_code
            )
    return super().get_redirect_target(resp)
def request( # type: ignore[override]
self,
method: str,

View File

@ -926,7 +926,7 @@ class InfiniteRedirectOnHeadHandler(BaseHTTPRequestHandler):
def do_HEAD(self):
self.send_response(302, 'Found')
self.send_header('Location', '/')
self.send_header('Location', '/redirected')
self.send_header('Content-Length', '0')
self.end_headers()
@ -966,6 +966,55 @@ def test_TooManyRedirects_on_HEAD(app, monkeypatch):
}
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver')
def test_ignore_local_redirection(app):
    """A same-server redirect to an ignored URI is reported as ``ignored``."""
    with serve_application(app, InfiniteRedirectOnHeadHandler) as address:
        # Ignore exactly the URI that the handler redirects HEAD requests to.
        app.config.linkcheck_ignore = [f'http://{address}/redirected']
        app.build()

    report_path = app.outdir / 'output.json'
    with open(report_path, encoding='utf-8') as report:
        result = json.load(report)

    expected = {
        'code': 302,
        'status': 'ignored',
        'filename': 'index.rst',
        'lineno': 1,
        'uri': f'http://{address}/',
        'info': f'ignored redirect: http://{address}/redirected',
    }
    assert result == expected
class RemoteDomainRedirectHandler(InfiniteRedirectOnHeadHandler):
    """Test handler whose GET responses redirect ``/`` on to another domain.

    ``/`` redirects to ``/local``, and ``/local`` redirects to
    ``http://example.test/migrated``.
    """

    protocol_version = 'HTTP/1.1'

    def do_GET(self):
        """Reply to GET with a 301 whose target depends on the request path."""
        # 301 is "Moved Permanently"; "Found" is the reason phrase for 302.
        self.send_response(301, 'Moved Permanently')
        if self.path == '/':
            self.send_header('Location', '/local')
        elif self.path == '/local':
            self.send_header('Location', 'http://example.test/migrated')
        # Any other path receives a 301 without a ``Location`` header.
        self.send_header('Content-Length', '0')
        self.end_headers()
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver')
def test_ignore_remote_redirection(app):
    """A redirect chain ending at an ignored remote URI is reported as ``ignored``."""
    with serve_application(app, RemoteDomainRedirectHandler) as address:
        # The pattern matches the final, off-site hop of the redirect chain.
        app.config.linkcheck_ignore = ['http://example.test']
        app.build()

    report_path = app.outdir / 'output.json'
    with open(report_path, encoding='utf-8') as report:
        result = json.load(report)

    expected = {
        'code': 301,
        'status': 'ignored',
        'filename': 'index.rst',
        'lineno': 1,
        'uri': f'http://{address}/',
        'info': 'ignored redirect: http://example.test/migrated',
    }
    assert result == expected
def make_retry_after_handler(
responses: list[tuple[int, str | None]],
) -> type[BaseHTTPRequestHandler]: