#472: linkcheck builder: Check links in parallel, use HTTP HEAD requests and allow configuring the timeout.

New config values: :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
This commit is contained in:
Georg Brandl
2011-01-04 11:27:42 +01:00
parent 3642b521ed
commit 51852c0e87
4 changed files with 127 additions and 75 deletions

View File

@@ -24,6 +24,10 @@ Release 1.1 (in development)
* #443: Allow referencing external graphviz files.
* #472: linkcheck builder: Check links in parallel, use HTTP HEAD
requests and allow configuring the timeout. New config values:
:confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
* #221: Add Swedish locale.
* Added ``inline`` option to graphviz directives, and fixed the

View File

@@ -1132,6 +1132,21 @@ Options for the linkcheck builder
.. versionadded:: 1.1
.. confval:: linkcheck_timeout
A timeout value, in seconds, for the linkcheck builder. **Only works in
Python 2.6 and higher.** The default is to use Python's global socket
timeout.
.. versionadded:: 1.1
.. confval:: linkcheck_workers
The number of worker threads to use when checking links. Default is 5
threads.
.. versionadded:: 1.1
.. rubric:: Footnotes

View File

@@ -10,9 +10,12 @@
"""
import re
import sys
import Queue
import socket
import threading
from os import path
from urllib2 import build_opener, HTTPError
from urllib2 import build_opener, Request
from docutils import nodes
@@ -24,6 +27,12 @@ opener = build_opener()
# Pretend to be a browser: some servers reject the default urllib2
# User-Agent (see the Wikipedia note in resolve() below).
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
class HeadRequest(Request):
    """A urllib2 Request whose HTTP method is HEAD instead of GET.

    Fetching only the headers is enough to decide whether a link works
    and is much cheaper than downloading the whole body.
    """

    def get_method(self):
        # urllib2 asks this hook for the verb to send on the wire.
        return 'HEAD'
class CheckExternalLinksBuilder(Builder):
"""
Checks for broken external links.
@@ -40,6 +49,83 @@ class CheckExternalLinksBuilder(Builder):
# create output file
open(path.join(self.outdir, 'output.txt'), 'w').close()
# create queues and worker threads
self.wqueue = Queue.Queue()
self.rqueue = Queue.Queue()
self.workers = []
for i in range(self.app.config.linkcheck_workers):
thread = threading.Thread(target=self.check_thread)
thread.setDaemon(True)
thread.start()
self.workers.append(thread)
def check_thread(self):
    """Worker-thread loop for link checking.

    Pulls ``(uri, docname, lineno)`` tuples off ``self.wqueue``, checks
    each URI, and pushes ``(uri, docname, lineno, status, info)`` onto
    ``self.rqueue``.  Exits when a ``(None, None, None)`` sentinel is
    received (sent by finish()).
    """
    kwargs = {}
    # urlopen() only grew a ``timeout`` keyword in Python 2.6, so pass
    # it conditionally; otherwise the global socket timeout applies.
    if sys.version_info > (2, 5) and self.app.config.linkcheck_timeout:
        kwargs['timeout'] = self.app.config.linkcheck_timeout

    def check():
        # NOTE: ``uri`` is read from the enclosing while-loop below;
        # returns a (status, info) pair.
        # check for various conditions without bothering the network
        if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
            return 'unchecked', ''
        elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
            return 'local', ''
        elif uri in self.good:
            # cached result from an earlier check of the same URI
            return 'working', ''
        elif uri in self.broken:
            return 'broken', self.broken[uri]
        elif uri in self.redirected:
            return 'redirected', self.redirected[uri]
        for rex in self.to_ignore:
            if rex.match(uri):
                return 'ignored', ''
        # need to actually check the URI
        try:
            # HEAD request: headers suffice to detect broken/redirected links
            f = opener.open(HeadRequest(uri), **kwargs)
            f.close()
        except Exception, err:
            self.broken[uri] = str(err)
            return 'broken', str(err)
        # a trailing-slash-only difference is not counted as a redirect
        if f.url.rstrip('/') == uri.rstrip('/'):
            self.good.add(uri)
            return 'working', 'new'
        else:
            self.redirected[uri] = f.url
            return 'redirected', f.url

    while True:
        uri, docname, lineno = self.wqueue.get()
        if uri is None:
            # sentinel from finish(): shut this worker down
            break
        status, info = check()
        self.rqueue.put((uri, docname, lineno, status, info))
def process_result(self, result):
    """Log one ``(uri, docname, lineno, status, info)`` tuple coming
    back from a worker thread and record it in ``output.txt``."""
    uri, docname, lineno, status, info = result
    # Nothing to report: unchecked links, or cache hits of known-good ones.
    if status == 'unchecked' or (status == 'working' and info != 'new'):
        return
    if lineno:
        self.info('(line %3d) ' % lineno, nonl=1)
    if status == 'ignored':
        self.info('%s - %s' % (uri, darkgray('ignored')))
    elif status == 'local':
        self.info('%s - %s' % (uri, darkgray('local')))
        self.write_entry('local', docname, lineno, uri)
    elif status == 'working':
        self.info('%s - %s' % (uri, darkgreen('working')))
    elif status == 'broken':
        self.info('%s - %s%s' % (uri, red('broken: '), info))
        self.write_entry('broken', docname, lineno, '%s: %s' % (uri, info))
        # In quiet mode a broken link must still surface as a warning.
        if self.app.quiet:
            self.warn('broken link: %s' % uri,
                      '%s:%s' % (self.env.doc2path(docname), lineno))
    elif status == 'redirected':
        self.info('%s - %s to %s' % (uri, purple('redirected'), info))
        self.write_entry('redirected', docname, lineno,
                         '%s to %s' % (uri, info))
def get_target_uri(self, docname, typ=None):
    # The linkcheck builder produces no output documents, so there is
    # no meaningful target URI to resolve cross-references to.
    return ''
@@ -51,65 +137,25 @@ class CheckExternalLinksBuilder(Builder):
def write_doc(self, docname, doctree):
    """Queue every external reference in *doctree* for checking and
    process the results as the worker threads report them back.

    Sets ``self.app.statuscode = 1`` if any broken link was found.
    """
    # The diff rendering of this span interleaved the removed old
    # ``check`` method with the new queue-based body; this is the
    # coherent new implementation.
    self.info()
    n = 0
    for node in doctree.traverse(nodes.reference):
        if 'refuri' not in node:
            continue
        uri = node['refuri']
        # Strip any fragment: only the document itself can be fetched.
        if '#' in uri:
            uri = uri.split('#')[0]
        # Walk up the tree to the nearest ancestor carrying line info.
        lineno = None
        while lineno is None:
            node = node.parent
            if node is None:
                break
            lineno = node.line
        self.wqueue.put((uri, docname, lineno), False)
        n += 1
    # Collect exactly as many results as URIs were queued, so output
    # for this document is complete before moving to the next one.
    done = 0
    while done < n:
        self.process_result(self.rqueue.get())
        done += 1
    if self.broken:
        self.app.statuscode = 1
@@ -120,21 +166,6 @@ class CheckExternalLinksBuilder(Builder):
line, what, uri))
output.close()
def resolve(self, uri):
    """Fetch *uri* with a GET request and classify the outcome.

    Returns a ``(status, info)`` pair:
    ``(0, 0)`` working, ``(1, new_url)`` redirected,
    ``(2, error_string)`` broken.
    """
    try:
        f = opener.open(uri)
        f.close()
    except HTTPError, err:
        #if err.code == 403 and uri.startswith('http://en.wikipedia.org/'):
        #    # Wikipedia blocks requests from urllib User-Agent
        #    return (0, 0)
        return (2, str(err))
    except Exception, err:
        # any other failure (DNS, timeout, malformed URL) is "broken" too
        return (2, str(err))
    # a trailing-slash-only difference is not treated as a redirect
    if f.url.rstrip('/') == uri.rstrip('/'):
        return (0, 0)
    else:
        return (1, f.url)
def finish(self):
    """Shut down the worker threads.

    Sends one ``(None, None, None)`` sentinel per worker; each
    ``check_thread`` loop exits when it receives one.
    """
    # Bug fix: a leftover bare ``return`` preceded this loop, making it
    # unreachable so the daemon workers were never told to stop.
    for worker in self.workers:
        self.wqueue.put((None, None, None), False)

View File

@@ -168,6 +168,8 @@ class Config(object):
# linkcheck options
linkcheck_ignore = ([], None),
linkcheck_timeout = (None, None),
linkcheck_workers = (5, None),
)
def __init__(self, dirname, filename, overrides, tags):