Allow skipping anchor checking by regex

To avoid needing to turn off anchor checking across the entire documentation, allow skipping based on matching the anchor against a regex. Some sites/pages use JavaScript to perform anchor assignment in a webpage, which would require rendering the page to determine whether the anchor exists. Allow fine-grained control of whether the anchor is checked based on pattern matching, until such time as the retrieved URLs can be passed through an engine for deeper checking of the HTML doctree.
2025-02-25 18:55:22 -06:00 · 2016-03-23 16:38:46 +00:00 · 2016-03-23 16:38:46 +00:00 · 22765990f0
commit 22765990f0
parent 78c0d6b46c
4 changed files with 44 additions and 7 deletions
--- a/doc/config.rst
+++ b/doc/config.rst
@ -2083,6 +2083,17 @@ Options for the linkcheck builder

   .. versionadded:: 1.2

+.. confval:: linkcheck_anchors_ignore
+
+   A list of regular expressions that match anchors which should be skipped
+   when checking the validity of anchors in links. This allows skipping
+   anchors that are used to control dynamic pages, anchors that JavaScript
+   adds to a page dynamically, or fragments that are used to trigger an
+   internal REST request. Each pattern is matched against the start of the
+   anchor text (the part after ``#``). Default is ``["^!"]``.
+
+   .. versionadded:: 1.5
+

 Options for the XML builder
 ---------------------------
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@ -82,6 +82,8 @@ class CheckExternalLinksBuilder(Builder):

    def init(self):
        self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
+        self.anchors_ignore = [re.compile(x)
+                               for x in self.app.config.linkcheck_anchors_ignore]
        self.good = set()
        self.broken = {}
        self.redirected = {}
@ -112,6 +114,10 @@ class CheckExternalLinksBuilder(Builder):
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
+                for rex in self.anchors_ignore:
+                    if rex.match(anchor):
+                        anchor = None
+                        break
            else:
                req_url = uri
                anchor = None
@ -123,11 +129,8 @@ class CheckExternalLinksBuilder(Builder):
                req_url = encode_uri(req_url)

            try:
-                if anchor and self.app.config.linkcheck_anchors and \
-                   not anchor.startswith('!'):
+                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
-                    # (Anchors starting with ! are ignored since they are
-                    # commonly used for dynamic pages)
                    response = requests.get(req_url, stream=True, headers=self.headers,
                                            **kwargs)
                    found = check_anchor(response, unquote(anchor))
@ -294,3 +297,6 @@ def setup(app):
    app.add_config_value('linkcheck_timeout', None, None, [int])
    app.add_config_value('linkcheck_workers', 5, None)
    app.add_config_value('linkcheck_anchors', True, None)
+    # Anchors starting with ! are ignored since they are
+    # commonly used for dynamic pages
+    app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
--- a/tests/roots/test-linkcheck/links.txt
+++ b/tests/roots/test-linkcheck/links.txt
@ -2,3 +2,10 @@ This is from CPython documentation.

 * Also, if there is a `default namespace <https://www.w3.org/TR/2006/REC-xml-names-20060816/#defaulting>`__, that full URI gets prepended to all of the non-prefixed tags.
 * The `SSMEDIAN <https://help.gnome.org/users/gnumeric/stable/gnumeric.html#gnumeric-function-SSMEDIAN>`_ function in the Gnome Gnumeric spreadsheet.
+
+
+Some additional anchors to exercise ignore code
+
+* `Example Bar invalid <http://example.com/#!bar>`_
+* `Example Bar invalid <http://example.com#!bar>`_ tests that default ignore anchor of #! does not need to be prefixed with /
+* `Example Bar invalid <http://example.com/#top>`_
--- a/tests/test_build_linkcheck.py
+++ b/tests/test_build_linkcheck.py
@ -14,12 +14,25 @@ from util import with_app


@with_app('linkcheck', testroot='linkcheck', freshenv=True)
-def test_all(app, status, warning):
+def test_defaults(app, status, warning):
    app.builder.build_all()

    assert (app.outdir / 'output.txt').exists()
    content = (app.outdir / 'output.txt').text()

-    # expect all ok
-    assert not content
+    print(content)
+    # looking for #top should fail
+    assert "Anchor 'top' not found" in content
+    assert len(content.splitlines()) == 1

+
+@with_app('linkcheck', testroot='linkcheck', freshenv=True,
+          confoverrides={'linkcheck_anchors_ignore': ["^!", "^top$"]})
+def test_anchors_ignored(app, status, warning):
+    app.builder.build_all()
+
+    assert (app.outdir / 'output.txt').exists()
+    content = (app.outdir / 'output.txt').text()
+
+    # expect all ok when excluding #top
+    assert not content