From 35e7bfc347f845deff50787f0cd0340ea2ea0a5d Mon Sep 17 00:00:00 2001
From: James Addison <55152140+jayaddison@users.noreply.github.com>
Date: Sun, 14 Jul 2024 03:48:38 +0100
Subject: [PATCH] linkcheck: Ignore URLs that respond with non-Unicode content
 (#12197)

---
 CHANGES.rst                                 |  2 +
 sphinx/builders/linkcheck.py                | 10 ++-
 tests/test_builders/test_build_linkcheck.py | 96 ++++++++++++++++++++-
 3 files changed, 101 insertions(+), 7 deletions(-)
diff --git a/CHANGES.rst b/CHANGES.rst
index d75bde8eb..1f9c6a17d 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -154,6 +154,8 @@ Bugs fixed
   Previously, each domain used language-specific nesting rules,
   which removed control from document authors.
   Patch by Jakob Lykke Andersen and Adam Turner.
+* #11041: linkcheck: Ignore URLs that respond with non-Unicode content.
+  Patch by James Addison.
 
 Testing
 -------
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index c58140352..d50cfa219 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -471,9 +471,13 @@ class HyperlinkAvailabilityCheckWorker(Thread):
                     _user_agent=self.user_agent,
                     _tls_info=(self.tls_verify, self.tls_cacerts),
                 ) as response:
-                    if (self.check_anchors and response.ok and anchor
-                            and not contains_anchor(response, anchor)):
-                        raise Exception(__("Anchor '%s' not found") % quote(anchor))
+                    if anchor and self.check_anchors and response.ok:
+                        try:
+                            found = contains_anchor(response, anchor)
+                        except UnicodeDecodeError:
+                            return 'ignored', 'unable to decode response content', 0
+                        if not found:
+                            return 'broken', __("Anchor '%s' not found") % quote(anchor), 0
 
                 # Copy data we need from the (closed) response
                 status_code = response.status_code
diff --git a/tests/test_builders/test_build_linkcheck.py b/tests/test_builders/test_build_linkcheck.py
index 171988000..0787661ac 100644
--- a/tests/test_builders/test_build_linkcheck.py
+++ b/tests/test_builders/test_build_linkcheck.py
@@ -36,7 +36,7 @@ from tests.utils import CERT_FILE, serve_application
 ts_re = re.compile(r".*\[(?P<ts>.*)\].*")
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Iterable
     from io import StringIO
 
     from sphinx.application import Sphinx
@@ -274,6 +274,43 @@ def test_anchors_ignored(app: Sphinx) -> None:
 
 
 class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
+    protocol_version = 'HTTP/1.1'
+
+    def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
+        # Yield ``content``, UTF-8 encoded, as the pieces of an HTTP chunked message body.
+        def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
+            """Encode a bytestring into a format suitable for HTTP chunked-transfer.
+
+            https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding
+            """
+            yield f'{len(chunk):X}'.encode('ascii')  # chunk size in hexadecimal
+            yield b'\r\n'
+            yield chunk
+            yield b'\r\n'
+
+        buffer = b''
+        for char in content:
+            buffer += char.encode('utf-8')
+            if len(buffer) >= max_chunk_size:  # cut by bytes, not characters
+                chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
+                yield from _encode_chunk(chunk)
+
+        # Flush remaining bytes, if any
+        if buffer:
+            yield from _encode_chunk(buffer)
+
+        # Emit a final empty chunk to close the stream
+        yield from _encode_chunk(b'')
+
+    def _send_chunked(self, content: str) -> bool:  # False if the connection broke
+        for chunk in self._chunk_content(content, max_chunk_size=20):
+            try:
+                self.wfile.write(chunk)
+            except (BrokenPipeError, ConnectionResetError) as e:
+                self.log_message('%s', e)  # the first argument is a %-format string
+                return False
+        return True
+
     def do_HEAD(self):
         if self.path in {'/valid', '/ignored'}:
             self.send_response(200, "OK")
@@ -282,11 +319,18 @@ class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
         self.end_headers()
 
     def do_GET(self):
-        self.do_HEAD()
         if self.path == '/valid':
-            self.wfile.write(b"<h1 id='valid-anchor'>valid anchor</h1>\n")
+            self.send_response(200, 'OK')
+            content = "<h1 id='valid-anchor'>valid anchor</h1>\n"
         elif self.path == '/ignored':
-            self.wfile.write(b"no anchor but page exists\n")
+            self.send_response(200, 'OK')
+            content = 'no anchor but page exists\n'
+        else:  # any other path
+            self.send_response(404, 'Not Found')
+            content = 'not found\n'
+        self.send_header('Transfer-Encoding', 'chunked')  # body is sent in chunks
+        self.end_headers()
+        self._send_chunked(content)  # the returned success flag is not used
 
 
 @pytest.mark.sphinx('linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True)
@@ -349,6 +393,50 @@ def test_raises_for_invalid_status(app: Sphinx) -> None:
     )
 
 
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_incomplete_html_anchor(app: Sphinx) -> None:
+    class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'this is <div id="anchor">not</div> a valid HTML document'
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, IncompleteHTMLDocumentHandler):
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+    # The body is only an HTML fragment, but the check is still expected to pass.
+    row = json.loads(content)
+    assert row['status'] == 'working'
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_decoding_error_anchor_ignored(app: Sphinx) -> None:
+    class NonASCIIHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'\x80\x00\x80\x00'  # lone 0x80 bytes: not valid ASCII or UTF-8
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, NonASCIIHandler):
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+    # Undecodable content must be reported as ignored, not as a broken link.
+    row = json.loads(content)
+    assert row['status'] == 'ignored'
+
+
 def custom_handler(valid_credentials=(), success_criteria=lambda _: True):
     """
     Returns an HTTP request handler that authenticates the client and then determines