linkcheck: Store the original (unquoted) anchor (#12206)

This commit is contained in:
James Addison 2024-04-24 19:07:31 +01:00 committed by GitHub
parent 6d6feb240f
commit 2008aa8c78
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 7 additions and 4 deletions

View File

@@ -13,7 +13,7 @@ from os import path
from queue import PriorityQueue, Queue
from threading import Thread
from typing import TYPE_CHECKING, NamedTuple, cast
-from urllib.parse import unquote, urlparse, urlsplit, urlunparse
+from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse
from docutils import nodes
from requests.exceptions import ConnectionError, HTTPError, SSLError, TooManyRedirects
@@ -409,6 +409,7 @@ class HyperlinkAvailabilityCheckWorker(Thread):
if rex.match(req_url):
anchor = ''
break
+anchor = unquote(anchor)
# handle non-ASCII URIs
try:
@@ -446,7 +447,7 @@ class HyperlinkAvailabilityCheckWorker(Thread):
) as response:
if (self.check_anchors and response.ok and anchor
and not contains_anchor(response, anchor)):
-raise Exception(__(f'Anchor {anchor!r} not found'))
+raise Exception(__(f'Anchor {quote(anchor)!r} not found'))
# Copy data we need from the (closed) response
status_code = response.status_code
@@ -592,7 +593,7 @@ def _get_request_headers(
def contains_anchor(response: Response, anchor: str) -> bool:
"""Determine if an anchor is contained within an HTTP response."""
-parser = AnchorCheckParser(unquote(anchor))
+parser = AnchorCheckParser(anchor)
# Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):

View File

@@ -1,5 +1,6 @@
* `Example valid url, no anchor <http://localhost:7777/valid>`_
* `Example valid url, valid anchor <http://localhost:7777/valid#valid-anchor>`_
+* `Example valid url, valid quotable anchor <http://localhost:7777/valid#py:module::urllib.parse>`_
* `Example valid url, invalid anchor <http://localhost:7777/valid#invalid-anchor>`_
* `Example ignored url, no anchor <http://localhost:7777/ignored>`_
* `Example ignored url, invalid anchor <http://localhost:7777/ignored#invalid-anchor>`_

View File

@@ -295,7 +295,7 @@ def test_anchors_ignored_for_url(app):
attrs = ('filename', 'lineno', 'status', 'code', 'uri', 'info')
data = [json.loads(x) for x in content.splitlines()]
-assert len(data) == 7
+assert len(data) == 8
assert all(all(attr in row for attr in attrs) for row in data)
# rows may be unsorted due to network latency or
@@ -304,6 +304,7 @@ def test_anchors_ignored_for_url(app):
assert rows[f'http://{address}/valid']['status'] == 'working'
assert rows[f'http://{address}/valid#valid-anchor']['status'] == 'working'
+assert rows['http://localhost:7777/valid#py:module::urllib.parse']['status'] == 'broken'
assert rows[f'http://{address}/valid#invalid-anchor'] == {
'status': 'broken',
'info': "Anchor 'invalid-anchor' not found",