linkcheck: Store the original (unquoted) anchor (#12206)

This commit is contained in:
James Addison 2024-04-24 19:07:31 +01:00 committed by GitHub
parent 6d6feb240f
commit 2008aa8c78
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 7 additions and 4 deletions

View File

@@ -13,7 +13,7 @@ from os import path
from queue import PriorityQueue, Queue from queue import PriorityQueue, Queue
from threading import Thread from threading import Thread
from typing import TYPE_CHECKING, NamedTuple, cast from typing import TYPE_CHECKING, NamedTuple, cast
from urllib.parse import unquote, urlparse, urlsplit, urlunparse from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse
from docutils import nodes from docutils import nodes
from requests.exceptions import ConnectionError, HTTPError, SSLError, TooManyRedirects from requests.exceptions import ConnectionError, HTTPError, SSLError, TooManyRedirects
@@ -409,6 +409,7 @@ class HyperlinkAvailabilityCheckWorker(Thread):
if rex.match(req_url): if rex.match(req_url):
anchor = '' anchor = ''
break break
anchor = unquote(anchor)
# handle non-ASCII URIs # handle non-ASCII URIs
try: try:
@@ -446,7 +447,7 @@ class HyperlinkAvailabilityCheckWorker(Thread):
) as response: ) as response:
if (self.check_anchors and response.ok and anchor if (self.check_anchors and response.ok and anchor
and not contains_anchor(response, anchor)): and not contains_anchor(response, anchor)):
raise Exception(__(f'Anchor {anchor!r} not found')) raise Exception(__(f'Anchor {quote(anchor)!r} not found'))
# Copy data we need from the (closed) response # Copy data we need from the (closed) response
status_code = response.status_code status_code = response.status_code
@@ -592,7 +593,7 @@ def _get_request_headers(
def contains_anchor(response: Response, anchor: str) -> bool: def contains_anchor(response: Response, anchor: str) -> bool:
"""Determine if an anchor is contained within an HTTP response.""" """Determine if an anchor is contained within an HTTP response."""
parser = AnchorCheckParser(unquote(anchor)) parser = AnchorCheckParser(anchor)
# Read file in chunks. If we find a matching anchor, we break # Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing. # the loop early in hopes not to have to download the whole thing.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True): for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):

View File

@@ -1,5 +1,6 @@
* `Example valid url, no anchor <http://localhost:7777/valid>`_ * `Example valid url, no anchor <http://localhost:7777/valid>`_
* `Example valid url, valid anchor <http://localhost:7777/valid#valid-anchor>`_ * `Example valid url, valid anchor <http://localhost:7777/valid#valid-anchor>`_
* `Example valid url, valid quotable anchor <http://localhost:7777/valid#py:module::urllib.parse>`_
* `Example valid url, invalid anchor <http://localhost:7777/valid#invalid-anchor>`_ * `Example valid url, invalid anchor <http://localhost:7777/valid#invalid-anchor>`_
* `Example ignored url, no anchor <http://localhost:7777/ignored>`_ * `Example ignored url, no anchor <http://localhost:7777/ignored>`_
* `Example ignored url, invalid anchor <http://localhost:7777/ignored#invalid-anchor>`_ * `Example ignored url, invalid anchor <http://localhost:7777/ignored#invalid-anchor>`_

View File

@@ -295,7 +295,7 @@ def test_anchors_ignored_for_url(app):
attrs = ('filename', 'lineno', 'status', 'code', 'uri', 'info') attrs = ('filename', 'lineno', 'status', 'code', 'uri', 'info')
data = [json.loads(x) for x in content.splitlines()] data = [json.loads(x) for x in content.splitlines()]
assert len(data) == 7 assert len(data) == 8
assert all(all(attr in row for attr in attrs) for row in data) assert all(all(attr in row for attr in attrs) for row in data)
# rows may be unsorted due to network latency or # rows may be unsorted due to network latency or
@@ -304,6 +304,7 @@ def test_anchors_ignored_for_url(app):
assert rows[f'http://{address}/valid']['status'] == 'working' assert rows[f'http://{address}/valid']['status'] == 'working'
assert rows[f'http://{address}/valid#valid-anchor']['status'] == 'working' assert rows[f'http://{address}/valid#valid-anchor']['status'] == 'working'
assert rows['http://localhost:7777/valid#py:module::urllib.parse']['status'] == 'broken'
assert rows[f'http://{address}/valid#invalid-anchor'] == { assert rows[f'http://{address}/valid#invalid-anchor'] == {
'status': 'broken', 'status': 'broken',
'info': "Anchor 'invalid-anchor' not found", 'info': "Anchor 'invalid-anchor' not found",