mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
#472: linkcheck builder: Check links in parallel, use HTTP HEAD requests and allow configuring the timeout.
New config values: :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
This commit is contained in:
4
CHANGES
4
CHANGES
@@ -24,6 +24,10 @@ Release 1.1 (in development)
|
||||
|
||||
* #443: Allow referencing external graphviz files.
|
||||
|
||||
* #472: linkcheck builder: Check links in parallel, use HTTP HEAD
  requests and allow configuring the timeout.  New config values:
  :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
|
||||
|
||||
* #221: Add Swedish locale.
|
||||
|
||||
* Added ``inline`` option to graphviz directives, and fixed the
|
||||
|
||||
@@ -1132,6 +1132,21 @@ Options for the linkcheck builder
|
||||
|
||||
.. versionadded:: 1.1
|
||||
|
||||
.. confval:: linkcheck_timeout

   A timeout value, in seconds, for the linkcheck builder.  **Only works in
   Python 2.6 and higher.**  The default is to use Python's global socket
   timeout.

   .. versionadded:: 1.1
|
||||
|
||||
.. confval:: linkcheck_workers

   The number of worker threads to use when checking links.  Default is 5
   threads.

   .. versionadded:: 1.1
|
||||
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
|
||||
@@ -10,9 +10,12 @@
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import Queue
|
||||
import socket
|
||||
import threading
|
||||
from os import path
|
||||
from urllib2 import build_opener, HTTPError
|
||||
from urllib2 import build_opener, Request
|
||||
|
||||
from docutils import nodes
|
||||
|
||||
@@ -24,6 +27,12 @@ opener = build_opener()
|
||||
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
|
||||
|
||||
|
||||
class HeadRequest(Request):
    """A ``urllib2.Request`` variant whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the HTTP verb for this request, which is always ``'HEAD'``."""
        return 'HEAD'
|
||||
|
||||
|
||||
class CheckExternalLinksBuilder(Builder):
|
||||
"""
|
||||
Checks for broken external links.
|
||||
@@ -40,6 +49,83 @@ class CheckExternalLinksBuilder(Builder):
|
||||
# create output file
|
||||
open(path.join(self.outdir, 'output.txt'), 'w').close()
|
||||
|
||||
# create queues and worker threads
|
||||
self.wqueue = Queue.Queue()
|
||||
self.rqueue = Queue.Queue()
|
||||
self.workers = []
|
||||
for i in range(self.app.config.linkcheck_workers):
|
||||
thread = threading.Thread(target=self.check_thread)
|
||||
thread.setDaemon(True)
|
||||
thread.start()
|
||||
self.workers.append(thread)
|
||||
|
||||
def check_thread(self):
    """Worker loop: take URIs from ``self.wqueue``, report on ``self.rqueue``.

    Each work item is a ``(uri, docname, lineno)`` tuple.  For every item a
    ``(uri, docname, lineno, status, info)`` tuple is put on the result
    queue, where *status* is one of ``'unchecked'``, ``'local'``,
    ``'working'``, ``'broken'``, ``'redirected'`` or ``'ignored'``.  The
    loop ends when an item whose URI is ``None`` is received.
    """
    kwargs = {}
    # The ``timeout`` keyword is only passed on Python 2.6 and higher.  The
    # test has to be ``>= (2, 6)``: a 2.5.x ``version_info`` such as
    # ``(2, 5, 4, 'final', 0)`` also compares greater than ``(2, 5)``.
    if sys.version_info >= (2, 6) and self.app.config.linkcheck_timeout:
        kwargs['timeout'] = self.app.config.linkcheck_timeout

    def check(uri):
        """Return ``(status, info)`` for a single URI."""
        # check for various conditions without bothering the network
        if not uri or uri.startswith('mailto:') or uri.startswith('ftp:'):
            return 'unchecked', ''
        if not (uri.startswith('http:') or uri.startswith('https:')):
            return 'local', ''
        # results remembered from earlier checks; cached "working" hits
        # carry an empty info string, fresh ones carry 'new' (see below)
        if uri in self.good:
            return 'working', ''
        if uri in self.broken:
            return 'broken', self.broken[uri]
        if uri in self.redirected:
            return 'redirected', self.redirected[uri]
        for rex in self.to_ignore:
            if rex.match(uri):
                return 'ignored', ''

        # need to actually check the URI
        try:
            f = opener.open(HeadRequest(uri), **kwargs)
            f.close()
        except Exception:
            # sys.exc_info() instead of an ``except ... , err`` binding:
            # this spelling is valid on every Python version.
            message = str(sys.exc_info()[1])
            self.broken[uri] = message
            return 'broken', message

        # a final URL that differs (ignoring trailing slashes) is a redirect
        if f.url.rstrip('/') == uri.rstrip('/'):
            self.good.add(uri)
            return 'working', 'new'
        self.redirected[uri] = f.url
        return 'redirected', f.url

    while True:
        uri, docname, lineno = self.wqueue.get()
        if uri is None:
            # sentinel item: stop this worker
            break
        status, info = check(uri)
        self.rqueue.put((uri, docname, lineno, status, info))
|
||||
|
||||
def process_result(self, result):
    """Report one ``(uri, docname, lineno, status, info)`` result tuple.

    ``'unchecked'`` results, and ``'working'`` results whose info is not
    ``'new'``, produce no output.  Everything else is echoed through
    ``self.info``; local, broken and redirected links are also recorded
    with ``self.write_entry``, and broken links raise a warning when the
    application runs in quiet mode.
    """
    uri, docname, lineno, status, info = result

    # silent cases: nothing to print, nothing to record
    if status == 'unchecked' or (status == 'working' and info != 'new'):
        return

    if lineno:
        self.info('(line %3d) ' % lineno, nonl=1)

    if status == 'ignored':
        self.info(uri + ' - ' + darkgray('ignored'))
        return

    if status == 'local':
        self.info(uri + ' - ' + darkgray('local'))
        self.write_entry('local', docname, lineno, uri)
        return

    if status == 'working':
        self.info(uri + ' - ' + darkgreen('working'))
        return

    if status == 'broken':
        self.info(uri + ' - ' + red('broken: ') + info)
        self.write_entry('broken', docname, lineno, uri + ': ' + info)
        if self.app.quiet:
            location = '%s:%s' % (self.env.doc2path(docname), lineno)
            self.warn('broken link: %s' % uri, location)
        return

    if status == 'redirected':
        self.info(uri + ' - ' + purple('redirected') + ' to ' + info)
        self.write_entry('redirected', docname, lineno, uri + ' to ' + info)
|
||||
|
||||
def get_target_uri(self, docname, typ=None):
    """Return the target URI for *docname*; this builder always returns ``''``."""
    return ''
|
||||
|
||||
@@ -51,65 +137,25 @@ class CheckExternalLinksBuilder(Builder):
|
||||
|
||||
def write_doc(self, docname, doctree):
|
||||
self.info()
|
||||
n = 0
|
||||
for node in doctree.traverse(nodes.reference):
|
||||
try:
|
||||
self.check(node, docname)
|
||||
except KeyError:
|
||||
if 'refuri' not in node:
|
||||
continue
|
||||
|
||||
def check(self, node, docname):
|
||||
uri = node['refuri']
|
||||
|
||||
if '#' in uri:
|
||||
uri = uri.split('#')[0]
|
||||
|
||||
if uri in self.good:
|
||||
return
|
||||
|
||||
lineno = None
|
||||
while lineno is None:
|
||||
node = node.parent
|
||||
if node is None:
|
||||
break
|
||||
lineno = node.line
|
||||
|
||||
if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
|
||||
return
|
||||
|
||||
if lineno:
|
||||
self.info('(line %3d) ' % lineno, nonl=1)
|
||||
for rex in self.to_ignore:
|
||||
if rex.match(uri):
|
||||
self.info(uri + ' - ' + darkgray('ignored'))
|
||||
return
|
||||
if uri[0:5] == 'http:' or uri[0:6] == 'https:':
|
||||
self.info(uri, nonl=1)
|
||||
|
||||
if uri in self.broken:
|
||||
(r, s) = self.broken[uri]
|
||||
elif uri in self.redirected:
|
||||
(r, s) = self.redirected[uri]
|
||||
else:
|
||||
(r, s) = self.resolve(uri)
|
||||
|
||||
if r == 0:
|
||||
self.info(' - ' + darkgreen('working'))
|
||||
self.good.add(uri)
|
||||
elif r == 2:
|
||||
self.info(' - ' + red('broken: ') + s)
|
||||
self.write_entry('broken', docname, lineno, uri + ': ' + s)
|
||||
self.broken[uri] = (r, s)
|
||||
if self.app.quiet:
|
||||
self.warn('broken link: %s' % uri,
|
||||
'%s:%s' % (self.env.doc2path(docname), lineno))
|
||||
else:
|
||||
self.info(' - ' + purple('redirected') + ' to ' + s)
|
||||
self.write_entry('redirected', docname,
|
||||
lineno, uri + ' to ' + s)
|
||||
self.redirected[uri] = (r, s)
|
||||
else:
|
||||
self.info(uri + ' - ' + darkgray('local'))
|
||||
self.write_entry('local', docname, lineno, uri)
|
||||
uri = node['refuri']
|
||||
if '#' in uri:
|
||||
uri = uri.split('#')[0]
|
||||
lineno = None
|
||||
while lineno is None:
|
||||
node = node.parent
|
||||
if node is None:
|
||||
break
|
||||
lineno = node.line
|
||||
self.wqueue.put((uri, docname, lineno), False)
|
||||
n += 1
|
||||
done = 0
|
||||
while done < n:
|
||||
self.process_result(self.rqueue.get())
|
||||
done += 1
|
||||
|
||||
if self.broken:
|
||||
self.app.statuscode = 1
|
||||
@@ -120,21 +166,6 @@ class CheckExternalLinksBuilder(Builder):
|
||||
line, what, uri))
|
||||
output.close()
|
||||
|
||||
def resolve(self, uri):
    """Open *uri* and classify the outcome as a ``(code, detail)`` pair.

    * ``(0, 0)`` -- the final URL equals *uri* (ignoring trailing slashes);
    * ``(1, final_url)`` -- the request ended up at a different URL;
    * ``(2, message)`` -- opening the URI raised; *message* is ``str(err)``.
    """
    try:
        response = opener.open(uri)
        response.close()
    except Exception:
        # HTTPError is an Exception subclass and was handled identically,
        # so a single handler covers both cases.
        return (2, str(sys.exc_info()[1]))

    final_url = response.url
    if final_url.rstrip('/') != uri.rstrip('/'):
        return (1, final_url)
    return (0, 0)
|
||||
|
||||
def finish(self):
    """Tell every worker thread to exit.

    One ``(None, None, None)`` item is queued on ``self.wqueue`` per entry
    in ``self.workers``; ``check_thread`` leaves its loop when it receives
    an item whose URI is ``None``.
    """
    # A bare ``return`` used to sit in front of this loop (the old no-op
    # body), which made the shutdown code unreachable.
    for _ in self.workers:
        # block=False: the put must never stall the build on shutdown
        self.wqueue.put((None, None, None), False)
|
||||
|
||||
@@ -168,6 +168,8 @@ class Config(object):
|
||||
|
||||
# linkcheck options
|
||||
linkcheck_ignore = ([], None),
|
||||
linkcheck_timeout = (None, None),
|
||||
linkcheck_workers = (5, None),
|
||||
)
|
||||
|
||||
def __init__(self, dirname, filename, overrides, tags):
|
||||
|
||||
Reference in New Issue
Block a user