Allow skipping anchor checking by regex

To avoid needing to turn off anchor checking across the entire
documentation, allow skipping based on matching the anchor against a
regex.

Some sites/pages use JavaScript to perform anchor assignment in a
webpage, which would require rendering the page to determine whether
the anchor exists. Allow fine-grained control of whether the anchor is
checked based on pattern matching, until such time as the retrieved
URLs can be passed through an engine for deeper checking on the HTML
doctree.
This commit is contained in:
Darragh Bailey 2016-03-23 16:38:46 +00:00
parent 78c0d6b46c
commit 22765990f0
4 changed files with 44 additions and 7 deletions

View File

@ -2083,6 +2083,17 @@ Options for the linkcheck builder
.. versionadded:: 1.2
.. confval:: linkcheck_anchors_ignore
A list of regular expressions that match anchors for which validity checking
should be skipped. Each pattern is matched against the anchor, i.e. the part
of the URI after the first ``#``. This allows skipping anchors that are used
to control dynamic pages, or just specific anchors within a page, where
JavaScript is used to add anchors dynamically or where the fragment is used
to trigger an internal REST request. Default is ``["^!"]``.
.. versionadded:: 1.5
Options for the XML builder
---------------------------

View File

@ -82,6 +82,8 @@ class CheckExternalLinksBuilder(Builder):
def init(self):
self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
self.anchors_ignore = [re.compile(x)
for x in self.app.config.linkcheck_anchors_ignore]
self.good = set()
self.broken = {}
self.redirected = {}
@ -112,6 +114,10 @@ class CheckExternalLinksBuilder(Builder):
# split off anchor
if '#' in uri:
req_url, anchor = uri.split('#', 1)
for rex in self.anchors_ignore:
if rex.match(anchor):
anchor = None
break
else:
req_url = uri
anchor = None
@ -123,11 +129,8 @@ class CheckExternalLinksBuilder(Builder):
req_url = encode_uri(req_url)
try:
if anchor and self.app.config.linkcheck_anchors and \
not anchor.startswith('!'):
if anchor and self.app.config.linkcheck_anchors:
# Read the whole document and see if #anchor exists
# (Anchors starting with ! are ignored since they are
# commonly used for dynamic pages)
response = requests.get(req_url, stream=True, headers=self.headers,
**kwargs)
found = check_anchor(response, unquote(anchor))
@ -294,3 +297,6 @@ def setup(app):
app.add_config_value('linkcheck_timeout', None, None, [int])
app.add_config_value('linkcheck_workers', 5, None)
app.add_config_value('linkcheck_anchors', True, None)
# Anchors starting with ! are ignored since they are
# commonly used for dynamic pages
app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)

View File

@ -2,3 +2,10 @@ This is from CPython documentation.
* Also, if there is a `default namespace <https://www.w3.org/TR/2006/REC-xml-names-20060816/#defaulting>`__, that full URI gets prepended to all of the non-prefixed tags.
* The `SSMEDIAN <https://help.gnome.org/users/gnumeric/stable/gnumeric.html#gnumeric-function-SSMEDIAN>`_ function in the Gnome Gnumeric spreadsheet.
Some additional anchors to exercise ignore code
* `Example Bar invalid <http://example.com/#!bar>`_
* `Example Bar invalid <http://example.com#!bar>`_ tests that default ignore anchor of #! does not need to be prefixed with /
* `Example Bar invalid <http://example.com/#top>`_

View File

@ -14,12 +14,25 @@ from util import with_app
@with_app('linkcheck', testroot='linkcheck', freshenv=True)
def test_all(app, status, warning):
def test_defaults(app, status, warning):
app.builder.build_all()
assert (app.outdir / 'output.txt').exists()
content = (app.outdir / 'output.txt').text()
# expect all ok
assert not content
print(content)
# looking for #top should fail
assert "Anchor 'top' not found" in content
assert len(content.splitlines()) == 1
@with_app('linkcheck', testroot='linkcheck', freshenv=True,
confoverrides={'linkcheck_anchors_ignore': ["^!", "^top$"]})
def test_anchors_ignored(app, status, warning):
app.builder.build_all()
assert (app.outdir / 'output.txt').exists()
content = (app.outdir / 'output.txt').text()
# expect all ok when excluding #top
assert not content