From 22765990f0acf7d4d7a6c535a526a8d16cdf185d Mon Sep 17 00:00:00 2001
From: Darragh Bailey <dbailey@hpe.com>
Date: Wed, 23 Mar 2016 16:38:46 +0000
Subject: [PATCH] Allow skipping anchor checking by regex

To avoid needing to turn off anchor checking across the entire
documentation, allow skipping it based on matching the anchor against
a regex.

Some sites/pages use JavaScript to perform anchor assignment in a
webpage, which would require rendering the page to determine whether
the anchor exists. Allow fine-grained control of whether the anchor is
checked based on pattern matching, until such time as the retrieved
URLs can be passed through an engine for deeper checking on the HTML
doctree.
---
 doc/config.rst                       | 11 +++++++++++
 sphinx/builders/linkcheck.py         | 14 ++++++++++----
 tests/roots/test-linkcheck/links.txt |  7 +++++++
 tests/test_build_linkcheck.py        | 19 ++++++++++++++++---
 4 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/doc/config.rst b/doc/config.rst
index 0ae1c65e4..93e29fc42 100644
--- a/doc/config.rst
+++ b/doc/config.rst
@@ -2083,6 +2083,17 @@ Options for the linkcheck builder
 
    .. versionadded:: 1.2
 
+.. confval:: linkcheck_anchors_ignore
+
+   A list of regular expressions that match anchors (the part of a URI after
+   ``#``) whose validity should not be checked. Patterns are matched against
+   the start of the anchor. This allows skipping anchors that are used to
+   control dynamic pages, that JavaScript adds to a page dynamically, or that
+   are used to trigger an internal REST request. Default is
+   ``["^!"]``.
+
+   .. versionadded:: 1.5
+
 
 Options for the XML builder
 ---------------------------
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index e53cabb62..f49f4f9a3 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -82,6 +82,8 @@ class CheckExternalLinksBuilder(Builder):
 
     def init(self):
         self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
+        self.anchors_ignore = [re.compile(x)
+                               for x in self.app.config.linkcheck_anchors_ignore]
         self.good = set()
         self.broken = {}
         self.redirected = {}
@@ -112,6 +114,10 @@ class CheckExternalLinksBuilder(Builder):
             # split off anchor
             if '#' in uri:
                 req_url, anchor = uri.split('#', 1)
+                for rex in self.anchors_ignore:
+                    if rex.match(anchor):
+                        anchor = None
+                        break
             else:
                 req_url = uri
                 anchor = None
@@ -123,11 +129,8 @@ class CheckExternalLinksBuilder(Builder):
                 req_url = encode_uri(req_url)
 
             try:
-                if anchor and self.app.config.linkcheck_anchors and \
-                   not anchor.startswith('!'):
+                if anchor and self.app.config.linkcheck_anchors:
                     # Read the whole document and see if #anchor exists
-                    # (Anchors starting with ! are ignored since they are
-                    # commonly used for dynamic pages)
                     response = requests.get(req_url, stream=True, headers=self.headers,
                                             **kwargs)
                     found = check_anchor(response, unquote(anchor))
@@ -294,3 +297,6 @@ def setup(app):
     app.add_config_value('linkcheck_timeout', None, None, [int])
     app.add_config_value('linkcheck_workers', 5, None)
     app.add_config_value('linkcheck_anchors', True, None)
+    # Anchors starting with ! are ignored since they are
+    # commonly used for dynamic pages
+    app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
diff --git a/tests/roots/test-linkcheck/links.txt b/tests/roots/test-linkcheck/links.txt
index c3ec7235e..ef3607970 100644
--- a/tests/roots/test-linkcheck/links.txt
+++ b/tests/roots/test-linkcheck/links.txt
@@ -2,3 +2,10 @@ This is from CPython documentation.
 
 * Also, if there is a `default namespace <https://www.w3.org/TR/2006/REC-xml-names-20060816/#defaulting>`__, that full URI gets prepended to all of the non-prefixed tags.
 * The `SSMEDIAN <https://help.gnome.org/users/gnumeric/stable/gnumeric.html#gnumeric-function-SSMEDIAN>`_ function in the Gnome Gnumeric spreadsheet.
+
+
+Some additional anchors to exercise ignore code
+
+* `Example Bar invalid <http://example.com/#!bar>`_
+* `Example Bar invalid <http://example.com#!bar>`_ tests that default ignore anchor of #! does not need to be prefixed with /
+* `Example Bar invalid <http://example.com/#top>`_
diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py
index 700642901..1d75135af 100644
--- a/tests/test_build_linkcheck.py
+++ b/tests/test_build_linkcheck.py
@@ -14,12 +14,25 @@ from util import with_app
 
 
 @with_app('linkcheck', testroot='linkcheck', freshenv=True)
-def test_all(app, status, warning):
+def test_defaults(app, status, warning):
     app.builder.build_all()
 
     assert (app.outdir / 'output.txt').exists()
     content = (app.outdir / 'output.txt').text()
 
-    # expect all ok
-    assert not content
+    print(content)
+    # looking for #top should fail
+    assert "Anchor 'top' not found" in content
+    assert len(content.splitlines()) == 1
 
+
+@with_app('linkcheck', testroot='linkcheck', freshenv=True,
+          confoverrides={'linkcheck_anchors_ignore': ["^!", "^top$"]})
+def test_anchors_ignored(app, status, warning):
+    app.builder.build_all()
+
+    assert (app.outdir / 'output.txt').exists()
+    content = (app.outdir / 'output.txt').text()
+
+    # expect all ok when excluding #top
+    assert not content