Mirror of https://github.com/sphinx-doc/sphinx.git, synced 2025-02-25 18:55:22 -06:00
linkcheck: Ignore URLs that respond with non-Unicode content (#12197)
@@ -154,6 +154,8 @@ Bugs fixed
   Previously, each domain used language-specific nesting rules,
   which removed control from document authors.
   Patch by Jakob Lykke Andersen and Adam Turner.
+* #11041: linkcheck: Ignore URLs that respond with non-Unicode content.
+  Patch by James Addison.

 Testing
 -------
@@ -471,9 +471,13 @@ class HyperlinkAvailabilityCheckWorker(Thread):
                     _user_agent=self.user_agent,
                     _tls_info=(self.tls_verify, self.tls_cacerts),
                 ) as response:
-                    if (self.check_anchors and response.ok and anchor
-                            and not contains_anchor(response, anchor)):
-                        raise Exception(__("Anchor '%s' not found") % quote(anchor))
+                    if anchor and self.check_anchors and response.ok:
+                        try:
+                            found = contains_anchor(response, anchor)
+                        except UnicodeDecodeError:
+                            return 'ignored', 'unable to decode response content', 0
+                        if not found:
+                            return 'broken', __("Anchor '%s' not found") % quote(anchor), 0

                 # Copy data we need from the (closed) response
                 status_code = response.status_code
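Note on the change above: the new branch turns a decoding failure during the anchor scan into an 'ignored' result rather than letting it escape as an unhandled exception. A minimal, standalone illustration of the failure mode (not taken from the patch; the byte string is merely an example of content that is not valid UTF-8):

    payload = b'\x80\x00\x80\x00'  # bytes that cannot be decoded as UTF-8
    try:
        payload.decode('utf-8')    # roughly what happens while the body is scanned for the anchor
    except UnicodeDecodeError as exc:
        # linkcheck now maps this case to ('ignored', 'unable to decode response content', 0)
        print(f'decoding failed: {exc}')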
@@ -36,7 +36,7 @@ from tests.utils import CERT_FILE, serve_application
 ts_re = re.compile(r".*\[(?P<ts>.*)\].*")

 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Iterable
     from io import StringIO

     from sphinx.application import Sphinx
@@ -274,6 +274,43 @@ def test_anchors_ignored(app: Sphinx) -> None:


 class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
+    protocol_version = 'HTTP/1.1'
+
+    def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
+
+        def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
+            """Encode a bytestring into a format suitable for HTTP chunked-transfer.
+
+            https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding
+            """
+            yield f'{len(chunk):X}'.encode('ascii')
+            yield b'\r\n'
+            yield chunk
+            yield b'\r\n'
+
+        buffer = b''
+        for char in content:
+            buffer += char.encode('utf-8')
+            if len(buffer) >= max_chunk_size:
+                chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
+                yield from _encode_chunk(chunk)
+
+        # Flush remaining bytes, if any
+        if buffer:
+            yield from _encode_chunk(buffer)
+
+        # Emit a final empty chunk to close the stream
+        yield from _encode_chunk(b'')
+
+    def _send_chunked(self, content: str) -> bool:
+        for chunk in self._chunk_content(content, max_chunk_size=20):
+            try:
+                self.wfile.write(chunk)
+            except (BrokenPipeError, ConnectionResetError) as e:
+                self.log_message(str(e))
+                return False
+        return True
+
     def do_HEAD(self):
         if self.path in {'/valid', '/ignored'}:
             self.send_response(200, "OK")
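For orientation: HTTP chunked transfer-coding frames each chunk as a hexadecimal length, CRLF, the payload, and CRLF, and a zero-length chunk terminates the stream, which is exactly what _encode_chunk above emits. A standalone sketch (illustrative helper, not part of the test suite) of the frames produced for one of the handler's bodies:

    def chunk_frames(body: bytes, size: int) -> list[bytes]:
        # Split the body into fixed-size chunks and frame each as
        # hex-length CRLF payload CRLF; the final zero-length frame ends the stream.
        chunks = [body[i:i + size] for i in range(0, len(body), size)] + [b'']
        return [b'%X\r\n%s\r\n' % (len(chunk), chunk) for chunk in chunks]

    print(chunk_frames(b'no anchor but page exists\n', 20))
    # [b'14\r\nno anchor but page e\r\n', b'6\r\nxists\n\r\n', b'0\r\n\r\n']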
@@ -282,11 +319,18 @@ class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
         self.end_headers()

     def do_GET(self):
-        self.do_HEAD()
         if self.path == '/valid':
-            self.wfile.write(b"<h1 id='valid-anchor'>valid anchor</h1>\n")
+            self.send_response(200, 'OK')
+            content = "<h1 id='valid-anchor'>valid anchor</h1>\n"
         elif self.path == '/ignored':
-            self.wfile.write(b"no anchor but page exists\n")
+            self.send_response(200, 'OK')
+            content = 'no anchor but page exists\n'
+        else:
+            self.send_response(404, 'Not Found')
+            content = 'not found\n'
+        self.send_header('Transfer-Encoding', 'chunked')
+        self.end_headers()
+        self._send_chunked(content)


 @pytest.mark.sphinx('linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True)
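On the client side the chunked framing is transparent: requests (which linkcheck uses) reassembles the chunks before the anchor check sees the body. A hedged sketch against the handler above, assuming it is serving locally (the port is a placeholder):

    import requests

    PORT = 7777  # placeholder: whatever port the test server is bound to
    resp = requests.get(f'http://localhost:{PORT}/valid', timeout=5)
    assert resp.headers.get('Transfer-Encoding') == 'chunked'
    assert "id='valid-anchor'" in resp.text  # chunks are reassembled transparently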
@@ -349,6 +393,50 @@ def test_raises_for_invalid_status(app: Sphinx) -> None:
     )


+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_incomplete_html_anchor(app):
+    class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'this is <div id="anchor">not</div> a valid HTML document'
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, IncompleteHTMLDocumentHandler):
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'working'
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_decoding_error_anchor_ignored(app):
+    class NonASCIIHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'\x80\x00\x80\x00'  # non-ASCII byte-string
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, NonASCIIHandler):
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'ignored'
+
+
 def custom_handler(valid_credentials=(), success_criteria=lambda _: True):
     """
     Returns an HTTP request handler that authenticates the client and then determines
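For reference, each line of linkcheck's output.json holds one JSON object per checked hyperlink, and the two new tests only inspect its 'status' field. An abridged, illustrative row for the non-Unicode case (real rows carry further fields such as uri, filename and lineno):

    import json

    # Abridged/illustrative: only the fields the tests above inspect.
    line = '{"status": "ignored", "info": "unable to decode response content"}'
    row = json.loads(line)
    assert row['status'] == 'ignored'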