linkcheck: Ignore URLs that respond with non-Unicode content (#12197)

2025-02-25 18:55:22 -06:00 · 2024-07-14 03:48:38 +01:00
parent f0d8e2ef5e
commit 35e7bfc347
3 changed files with 101 additions and 7 deletions
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -154,6 +154,8 @@ Bugs fixed
  Previously, each domain used language-specific nesting rules,
  which removed control from document authors.
  Patch by Jakob Lykke Andersen and Adam Turner.
+* #11041: linkcheck: Ignore URLs that respond with non-Unicode content.
+  Patch by James Addison.

 Testing
 -------
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -471,9 +471,13 @@ class HyperlinkAvailabilityCheckWorker(Thread):
                    _user_agent=self.user_agent,
                    _tls_info=(self.tls_verify, self.tls_cacerts),
                ) as response:
-                    if (self.check_anchors and response.ok and anchor
-                            and not contains_anchor(response, anchor)):
-                        raise Exception(__("Anchor '%s' not found") % quote(anchor))
+                    if anchor and self.check_anchors and response.ok:
+                        try:
+                            found = contains_anchor(response, anchor)
+                        except UnicodeDecodeError:
+                            return 'ignored', 'unable to decode response content', 0
+                        if not found:
+                            return 'broken', __("Anchor '%s' not found") % quote(anchor), 0

                # Copy data we need from the (closed) response
                status_code = response.status_code
--- a/tests/test_builders/test_build_linkcheck.py
+++ b/tests/test_builders/test_build_linkcheck.py
@@ -36,7 +36,7 @@ from tests.utils import CERT_FILE, serve_application
 ts_re = re.compile(r".*\[(?P<ts>.*)\].*")

 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Iterable
    from io import StringIO

    from sphinx.application import Sphinx
@@ -274,6 +274,43 @@ def test_anchors_ignored(app: Sphinx) -> None:


 class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
+    protocol_version = 'HTTP/1.1'
+
+    def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
+
+        def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
+            """Encode a bytestring into a format suitable for HTTP chunked-transfer.
+
+            https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding
+            """
+            yield f'{len(chunk):X}'.encode('ascii')
+            yield b'\r\n'
+            yield chunk
+            yield b'\r\n'
+
+        buffer = b''
+        for char in content:
+            buffer += char.encode('utf-8')
+            if len(buffer) >= max_chunk_size:
+                chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
+                yield from _encode_chunk(chunk)
+
+        # Flush remaining bytes, if any
+        if buffer:
+            yield from _encode_chunk(buffer)
+
+        # Emit a final empty chunk to close the stream
+        yield from _encode_chunk(b'')
+
+    def _send_chunked(self, content: str) -> bool:
+        for chunk in self._chunk_content(content, max_chunk_size=20):
+            try:
+                self.wfile.write(chunk)
+            except (BrokenPipeError, ConnectionResetError) as e:
+                self.log_message(str(e))
+                return False
+        return True
+
    def do_HEAD(self):
        if self.path in {'/valid', '/ignored'}:
            self.send_response(200, "OK")
@@ -282,11 +319,18 @@ class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
        self.end_headers()

    def do_GET(self):
-        self.do_HEAD()
        if self.path == '/valid':
-            self.wfile.write(b"<h1 id='valid-anchor'>valid anchor</h1>\n")
+            self.send_response(200, 'OK')
+            content = "<h1 id='valid-anchor'>valid anchor</h1>\n"
        elif self.path == '/ignored':
-            self.wfile.write(b"no anchor but page exists\n")
+            self.send_response(200, 'OK')
+            content = 'no anchor but page exists\n'
+        else:
+            self.send_response(404, 'Not Found')
+            content = 'not found\n'
+        self.send_header('Transfer-Encoding', 'chunked')
+        self.end_headers()
+        self._send_chunked(content)


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True)
@@ -349,6 +393,50 @@ def test_raises_for_invalid_status(app: Sphinx) -> None:
    )


+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_incomplete_html_anchor(app):
+    class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'this is <div id="anchor">not</div> a valid HTML document'
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, IncompleteHTMLDocumentHandler):
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'working'
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_decoding_error_anchor_ignored(app):
+    class NonASCIIHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'\x80\x00\x80\x00'  # non-ASCII byte-string
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, NonASCIIHandler):
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'ignored'
+
+
 def custom_handler(valid_credentials=(), success_criteria=lambda _: True):
    """
    Returns an HTTP request handler that authenticates the client and then determines