From 35e7bfc347f845deff50787f0cd0340ea2ea0a5d Mon Sep 17 00:00:00 2001 From: James Addison <55152140+jayaddison@users.noreply.github.com> Date: Sun, 14 Jul 2024 03:48:38 +0100 Subject: [PATCH] linkcheck: Ignore URLs that respond with non-Unicode content (#12197) --- CHANGES.rst | 2 + sphinx/builders/linkcheck.py | 10 ++- tests/test_builders/test_build_linkcheck.py | 96 ++++++++++++++++++++- 3 files changed, 101 insertions(+), 7 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index d75bde8eb..1f9c6a17d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -154,6 +154,8 @@ Bugs fixed Previously, each domain used language-specific nesting rules, which removed control from document authors. Patch by Jakob Lykke Andersen and Adam Turner. +* #11041: linkcheck: Ignore URLs that respond with non-Unicode content. + Patch by James Addison. Testing ------- diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index c58140352..d50cfa219 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -471,9 +471,13 @@ class HyperlinkAvailabilityCheckWorker(Thread): _user_agent=self.user_agent, _tls_info=(self.tls_verify, self.tls_cacerts), ) as response: - if (self.check_anchors and response.ok and anchor - and not contains_anchor(response, anchor)): - raise Exception(__("Anchor '%s' not found") % quote(anchor)) + if anchor and self.check_anchors and response.ok: + try: + found = contains_anchor(response, anchor) + except UnicodeDecodeError: + return 'ignored', 'unable to decode response content', 0 + if not found: + return 'broken', __("Anchor '%s' not found") % quote(anchor), 0 # Copy data we need from the (closed) response status_code = response.status_code diff --git a/tests/test_builders/test_build_linkcheck.py b/tests/test_builders/test_build_linkcheck.py index 171988000..0787661ac 100644 --- a/tests/test_builders/test_build_linkcheck.py +++ b/tests/test_builders/test_build_linkcheck.py @@ -36,7 +36,7 @@ from tests.utils import CERT_FILE, serve_application ts_re = re.compile(r".*\[(?P.*)\].*") if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Iterable from io import StringIO from sphinx.application import Sphinx @@ -274,6 +274,43 @@ def test_anchors_ignored(app: Sphinx) -> None: class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]: + + def _encode_chunk(chunk: bytes) -> Iterable[bytes]: + """Encode a bytestring into a format suitable for HTTP chunked-transfer. + + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding + """ + yield f'{len(chunk):X}'.encode('ascii') + yield b'\r\n' + yield chunk + yield b'\r\n' + + buffer = b'' + for char in content: + buffer += char.encode('utf-8') + if len(buffer) >= max_chunk_size: + chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:] + yield from _encode_chunk(chunk) + + # Flush remaining bytes, if any + if buffer: + yield from _encode_chunk(buffer) + + # Emit a final empty chunk to close the stream + yield from _encode_chunk(b'') + + def _send_chunked(self, content: str) -> bool: + for chunk in self._chunk_content(content, max_chunk_size=20): + try: + self.wfile.write(chunk) + except (BrokenPipeError, ConnectionResetError) as e: + self.log_message(str(e)) + return False + return True + def do_HEAD(self): if self.path in {'/valid', '/ignored'}: self.send_response(200, "OK") @@ -282,11 +319,18 @@ class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler): self.end_headers() def do_GET(self): - self.do_HEAD() if self.path == '/valid': - self.wfile.write(b"

valid anchor

\n") + self.send_response(200, 'OK') + content = "

valid anchor

\n" elif self.path == '/ignored': - self.wfile.write(b"no anchor but page exists\n") + self.send_response(200, 'OK') + content = 'no anchor but page exists\n' + else: + self.send_response(404, 'Not Found') + content = 'not found\n' + self.send_header('Transfer-Encoding', 'chunked') + self.end_headers() + self._send_chunked(content) @pytest.mark.sphinx('linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True) @@ -349,6 +393,50 @@ def test_raises_for_invalid_status(app: Sphinx) -> None: ) +@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True) +def test_incomplete_html_anchor(app): + class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def do_GET(self): + content = b'this is
not
a valid HTML document' + self.send_response(200, 'OK') + self.send_header('Content-Length', str(len(content))) + self.end_headers() + self.wfile.write(content) + + with serve_application(app, IncompleteHTMLDocumentHandler): + app.build() + + content = (app.outdir / 'output.json').read_text(encoding='utf8') + assert len(content.splitlines()) == 1 + + row = json.loads(content) + assert row['status'] == 'working' + + +@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True) +def test_decoding_error_anchor_ignored(app): + class NonASCIIHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def do_GET(self): + content = b'\x80\x00\x80\x00' # non-ASCII byte-string + self.send_response(200, 'OK') + self.send_header('Content-Length', str(len(content))) + self.end_headers() + self.wfile.write(content) + + with serve_application(app, NonASCIIHandler): + app.build() + + content = (app.outdir / 'output.json').read_text(encoding='utf8') + assert len(content.splitlines()) == 1 + + row = json.loads(content) + assert row['status'] == 'ignored' + + def custom_handler(valid_credentials=(), success_criteria=lambda _: True): """ Returns an HTTP request handler that authenticates the client and then determines