linkcheck: don't check anchors starting with "!".

Add changelog entry for #1099.
This commit is contained in:
Georg Brandl 2016-02-13 16:06:55 +01:00
parent 5e8642983b
commit 222edf59e7
2 changed files with 18 additions and 12 deletions

View File

@ -53,7 +53,9 @@ Features added
* #1779: Add EPUB 3 builder
* #1751: Add :confval:`todo_link_only` to avoid file path and line indication on
:rst:dir:`todolist`. Thanks to Francesco Montesano.
* #2199: Use ``imagesize`` package to obtain size of images
* #2199: Use ``imagesize`` package to obtain size of images.
* #1099: Add configurable retries to the linkcheck builder. Thanks to Alex Gaynor.
Also don't check anchors starting with ``!``.
Bugs fixed
----------

View File

@ -75,11 +75,11 @@ class AnchorCheckParser(HTMLParser):
self.found = True
def check_anchor(f, hash):
"""Reads HTML data from a filelike object 'f' searching for anchor 'hash'.
def check_anchor(f, anchor):
"""Reads HTML data from a filelike object 'f' searching for *anchor*.
Returns True if anchor was found, False otherwise.
"""
parser = AnchorCheckParser(hash)
parser = AnchorCheckParser(anchor)
try:
# Read file in chunks of 8192 bytes. If we find a matching anchor, we
# break the loop early in hopes not to have to download the whole thing.
@ -140,10 +140,10 @@ class CheckExternalLinksBuilder(Builder):
def check_uri():
# split off anchor
if '#' in uri:
req_url, hash = uri.split('#', 1)
req_url, anchor = uri.split('#', 1)
else:
req_url = uri
hash = None
anchor = None
# handle non-ASCII URIs
try:
@ -152,8 +152,11 @@ class CheckExternalLinksBuilder(Builder):
req_url = encode_uri(req_url)
try:
if hash and self.app.config.linkcheck_anchors:
# Read the whole document and see if #hash exists
if anchor and self.app.config.linkcheck_anchors and \
not anchor.startswith('!'):
# Read the whole document and see if #anchor exists
# (Anchors starting with ! are ignored since they are
# commonly used for dynamic pages)
req = Request(req_url)
f = opener.open(req, **kwargs)
encoding = 'utf-8'
@ -161,11 +164,12 @@ class CheckExternalLinksBuilder(Builder):
encoding = f.headers.get_content_charset() or encoding
else:
encoding = get_content_charset(f) or encoding
found = check_anchor(TextIOWrapper(f, encoding), unquote(hash))
found = check_anchor(TextIOWrapper(f, encoding),
unquote(anchor))
f.close()
if not found:
raise Exception("Anchor '%s' not found" % hash)
raise Exception("Anchor '%s' not found" % anchor)
else:
try:
# try a HEAD request, which should be easier on
@ -193,8 +197,8 @@ class CheckExternalLinksBuilder(Builder):
return 'working', '', 0
else:
new_url = f.url
if hash:
new_url += '#' + hash
if anchor:
new_url += '#' + anchor
code = getattr(req, 'redirect_code', 0)
return 'redirected', new_url, code