Merge pull request #2495 from electrofelix/linkcheck-skip-anchors

Allow skipping anchor checking using regex
This commit is contained in:
Takeshi KOMIYA 2016-11-04 23:53:46 +09:00 committed by GitHub
commit a503849ac1
4 changed files with 44 additions and 7 deletions

View File

@ -2106,6 +2106,17 @@ Options for the linkcheck builder
.. versionadded:: 1.2 .. versionadded:: 1.2
.. confval:: linkcheck_anchors_ignore
A list of regular expressions that are matched against the anchor part of
a link (the text after ``#``); anchors that match are not checked for
validity. This allows skipping anchor checks for entire sites where
anchors are used to control dynamic pages, or just for specific anchors
within a page, where JavaScript adds anchors dynamically or uses the
fragment to trigger an internal REST request. Default is ``["^!"]``.
.. versionadded:: 1.5
Options for the XML builder Options for the XML builder
--------------------------- ---------------------------

View File

@ -82,6 +82,8 @@ class CheckExternalLinksBuilder(Builder):
def init(self): def init(self):
self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore] self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
self.anchors_ignore = [re.compile(x)
for x in self.app.config.linkcheck_anchors_ignore]
self.good = set() self.good = set()
self.broken = {} self.broken = {}
self.redirected = {} self.redirected = {}
@ -112,6 +114,10 @@ class CheckExternalLinksBuilder(Builder):
# split off anchor # split off anchor
if '#' in uri: if '#' in uri:
req_url, anchor = uri.split('#', 1) req_url, anchor = uri.split('#', 1)
for rex in self.anchors_ignore:
if rex.match(anchor):
anchor = None
break
else: else:
req_url = uri req_url = uri
anchor = None anchor = None
@ -123,11 +129,8 @@ class CheckExternalLinksBuilder(Builder):
req_url = encode_uri(req_url) req_url = encode_uri(req_url)
try: try:
if anchor and self.app.config.linkcheck_anchors and \ if anchor and self.app.config.linkcheck_anchors:
not anchor.startswith('!'):
# Read the whole document and see if #anchor exists # Read the whole document and see if #anchor exists
# (Anchors starting with ! are ignored since they are
# commonly used for dynamic pages)
response = requests.get(req_url, stream=True, headers=self.headers, response = requests.get(req_url, stream=True, headers=self.headers,
**kwargs) **kwargs)
found = check_anchor(response, unquote(anchor)) found = check_anchor(response, unquote(anchor))
@ -294,3 +297,6 @@ def setup(app):
app.add_config_value('linkcheck_timeout', None, None, [int]) app.add_config_value('linkcheck_timeout', None, None, [int])
app.add_config_value('linkcheck_workers', 5, None) app.add_config_value('linkcheck_workers', 5, None)
app.add_config_value('linkcheck_anchors', True, None) app.add_config_value('linkcheck_anchors', True, None)
# Anchors starting with ! are ignored since they are
# commonly used for dynamic pages
app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)

View File

@ -2,3 +2,10 @@ This is from CPython documentation.
* Also, if there is a `default namespace <https://www.w3.org/TR/2006/REC-xml-names-20060816/#defaulting>`__, that full URI gets prepended to all of the non-prefixed tags. * Also, if there is a `default namespace <https://www.w3.org/TR/2006/REC-xml-names-20060816/#defaulting>`__, that full URI gets prepended to all of the non-prefixed tags.
* The `SSMEDIAN <https://help.gnome.org/users/gnumeric/stable/gnumeric.html#gnumeric-function-SSMEDIAN>`_ function in the Gnome Gnumeric spreadsheet. * The `SSMEDIAN <https://help.gnome.org/users/gnumeric/stable/gnumeric.html#gnumeric-function-SSMEDIAN>`_ function in the Gnome Gnumeric spreadsheet.
Some additional anchors to exercise ignore code
* `Example Bar invalid <http://example.com/#!bar>`_
* `Example Bar invalid <http://example.com#!bar>`_ tests that default ignore anchor of #! does not need to be prefixed with /
* `Example Bar invalid <http://example.com/#top>`_

View File

@ -14,12 +14,25 @@ from util import with_app
@with_app('linkcheck', testroot='linkcheck', freshenv=True) @with_app('linkcheck', testroot='linkcheck', freshenv=True)
def test_all(app, status, warning): def test_defaults(app, status, warning):
app.builder.build_all() app.builder.build_all()
assert (app.outdir / 'output.txt').exists() assert (app.outdir / 'output.txt').exists()
content = (app.outdir / 'output.txt').text() content = (app.outdir / 'output.txt').text()
# expect all ok print(content)
assert not content # looking for #top should fail
assert "Anchor 'top' not found" in content
assert len(content.splitlines()) == 1
@with_app('linkcheck', testroot='linkcheck', freshenv=True,
          confoverrides={'linkcheck_anchors_ignore': ["^!", "^top$"]})
def test_anchors_ignored(app, status, warning):
    """Build with ``#top`` added to the ignored anchors and expect no report.

    The configuration override keeps the ``^!`` pattern and adds ``^top$``,
    so the build is expected to leave ``output.txt`` empty.
    """
    app.builder.build_all()

    report = app.outdir / 'output.txt'
    assert report.exists()

    # expect all ok when excluding #top
    report_text = report.text()
    assert not report_text