linkcheck: Ignore URLs that respond with non-Unicode content (#12197)

This commit is contained in:
James Addison
2024-07-14 03:48:38 +01:00
committed by GitHub
parent f0d8e2ef5e
commit 35e7bfc347
3 changed files with 101 additions and 7 deletions

View File

@@ -154,6 +154,8 @@ Bugs fixed
Previously, each domain used language-specific nesting rules,
which removed control from document authors.
Patch by Jakob Lykke Andersen and Adam Turner.
* #11041: linkcheck: Ignore URLs that respond with non-Unicode content.
Patch by James Addison.
Testing
-------

View File

@@ -471,9 +471,13 @@ class HyperlinkAvailabilityCheckWorker(Thread):
_user_agent=self.user_agent,
_tls_info=(self.tls_verify, self.tls_cacerts),
) as response:
if (self.check_anchors and response.ok and anchor
and not contains_anchor(response, anchor)):
raise Exception(__("Anchor '%s' not found") % quote(anchor))
if anchor and self.check_anchors and response.ok:
try:
found = contains_anchor(response, anchor)
except UnicodeDecodeError:
return 'ignored', 'unable to decode response content', 0
if not found:
return 'broken', __("Anchor '%s' not found") % quote(anchor), 0
# Copy data we need from the (closed) response
status_code = response.status_code

View File

@@ -36,7 +36,7 @@ from tests.utils import CERT_FILE, serve_application
ts_re = re.compile(r".*\[(?P<ts>.*)\].*")
if TYPE_CHECKING:
from collections.abc import Callable
from collections.abc import Callable, Iterable
from io import StringIO
from sphinx.application import Sphinx
@@ -274,6 +274,43 @@ def test_anchors_ignored(app: Sphinx) -> None:
class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1'
def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
    """Yield the byte-strings of an HTTP chunked-transfer encoding of *content*.

    *content* is encoded as UTF-8 once and cut into chunks of at most
    *max_chunk_size* bytes; a multi-byte character may therefore straddle
    two chunks.  Every chunk is emitted as four byte-strings (hexadecimal
    size, CRLF, payload, CRLF), and a zero-length chunk closes the stream.

    :param content: the text to transmit.
    :param max_chunk_size: upper bound, in bytes, for each chunk's payload.
    :raises ValueError: when iteration starts, if *max_chunk_size* is not
        positive (a zero-sized chunk would be read as the end-of-stream marker).
    """
    if max_chunk_size < 1:
        raise ValueError('max_chunk_size must be a positive integer')

    def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
        """Encode a bytestring into a format suitable for HTTP chunked-transfer.

        https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding
        """
        yield f'{len(chunk):X}'.encode('ascii')
        yield b'\r\n'
        yield chunk
        yield b'\r\n'

    # Encode once and slice: no chunk (including the last one) can exceed
    # the limit, and no bytes object is rebuilt per character.
    payload = content.encode('utf-8')
    for start in range(0, len(payload), max_chunk_size):
        yield from _encode_chunk(payload[start:start + max_chunk_size])

    # Emit a final empty chunk to close the stream
    yield from _encode_chunk(b'')
def _send_chunked(self, content: str) -> bool:
    """Write *content* to the client as a chunked-transfer body.

    The text is framed by ``self._chunk_content`` using 20-byte chunks and
    each piece is written to ``self.wfile``.

    :returns: ``True`` once every piece has been written; ``False`` as soon
        as a write fails with a broken pipe or connection reset (the error
        text is passed to ``self.log_message``).
    """
    pieces = self._chunk_content(content, max_chunk_size=20)
    for piece in pieces:
        try:
            self.wfile.write(piece)
        except (BrokenPipeError, ConnectionResetError) as err:
            # The peer went away; record it and stop sending.
            self.log_message(str(err))
            break
    else:
        return True
    return False
def do_HEAD(self):
if self.path in {'/valid', '/ignored'}:
self.send_response(200, "OK")
@@ -282,11 +319,18 @@ class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
self.end_headers()
def do_GET(self):
# Answer a GET request: choose the status line and body text from the
# request path, then send the body using chunked transfer-encoding.
# NOTE(review): indentation and +/- diff markers are missing in this view;
# the `self.do_HEAD()` and `self.wfile.write(...)` lines look like the
# pre-change implementation shown next to its replacement -- confirm
# against the real file before relying on this span.
self.do_HEAD()
if self.path == '/valid':
self.wfile.write(b"<h1 id='valid-anchor'>valid anchor</h1>\n")
self.send_response(200, 'OK')
content = "<h1 id='valid-anchor'>valid anchor</h1>\n"
elif self.path == '/ignored':
self.wfile.write(b"no anchor but page exists\n")
self.send_response(200, 'OK')
content = 'no anchor but page exists\n'
else:
# Any other path is answered with a 404 and a short body.
self.send_response(404, 'Not Found')
content = 'not found\n'
# Declare chunked framing before the headers are finalised, then stream the body.
self.send_header('Transfer-Encoding', 'chunked')
self.end_headers()
self._send_chunked(content)
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True)
@@ -349,6 +393,50 @@ def test_raises_for_invalid_status(app: Sphinx) -> None:
)
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
def test_incomplete_html_anchor(app):
    """An anchor inside an HTML fragment (no enclosing document) is reported as working."""
    # A bare fragment: the anchor element is present, but there is no
    # surrounding <html>/<body> structure.
    body = b'this is <div id="anchor">not</div> a valid HTML document'

    class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler):
        protocol_version = 'HTTP/1.1'

        def do_GET(self):
            self.send_response(200, 'OK')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)

    with serve_application(app, IncompleteHTMLDocumentHandler):
        app.build()

    # The JSON report holds one record per line; exactly one link is expected.
    report = (app.outdir / 'output.json').read_text(encoding='utf8')
    assert len(report.splitlines()) == 1

    record = json.loads(report)
    assert record['status'] == 'working'
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
def test_decoding_error_anchor_ignored(app):
    """A response body that cannot be decoded yields an 'ignored' link status."""
    # 0x80 cannot begin a UTF-8 sequence, so these bytes are not valid UTF-8.
    body = b'\x80\x00\x80\x00'

    class NonASCIIHandler(BaseHTTPRequestHandler):
        protocol_version = 'HTTP/1.1'

        def do_GET(self):
            self.send_response(200, 'OK')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)

    with serve_application(app, NonASCIIHandler):
        app.build()

    # The JSON report holds one record per line; exactly one link is expected.
    report = (app.outdir / 'output.json').read_text(encoding='utf8')
    assert len(report.splitlines()) == 1

    record = json.loads(report)
    assert record['status'] == 'ignored'
def custom_handler(valid_credentials=(), success_criteria=lambda _: True):
"""
Returns an HTTP request handler that authenticates the client and then determines