From e886f2c5aedecd14c0af735b723c1bfba20d6092 Mon Sep 17 00:00:00 2001
From: Georg Brandl
Date: Fri, 14 Mar 2008 23:47:30 +0000
Subject: [PATCH] Move link checker to its own file. Use different user-agent
 to enable Wikipedia lookup.

---
 sphinx/builder.py   | 107 +-------------------------------------
 sphinx/linkcheck.py | 124 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+), 105 deletions(-)
 create mode 100644 sphinx/linkcheck.py

diff --git a/sphinx/builder.py b/sphinx/builder.py
index 85afcb3aa..0e94b93ea 100644
--- a/sphinx/builder.py
+++ b/sphinx/builder.py
@@ -5,7 +5,7 @@
 
     Builder classes for different output formats.
 
-    :copyright: 2007-2008 by Georg Brandl, Thomas Lamb.
+    :copyright: 2007-2008 by Georg Brandl.
     :license: BSD.
 """
 
@@ -13,11 +13,9 @@ import os
 import time
 import codecs
 import shutil
-import socket
 import cPickle as pickle
 from os import path
 from cgi import escape
-from urllib2 import urlopen, HTTPError
 
 from docutils import nodes
 from docutils.io import StringOutput, FileOutput, DocTreeInput
@@ -891,108 +889,7 @@ class ChangesBuilder(Builder):
         pass
 
 
-class CheckExternalLinksBuilder(Builder):
-    """
-    Checks for broken external links.
-    """
-    name = 'linkcheck'
-
-    def init(self):
-        self.good = set()
-        self.broken = {}
-        self.redirected = {}
-        # set a timeout for non-responding servers
-        socket.setdefaulttimeout(5.0)
-        # create output file
-        open(path.join(self.outdir, 'output.txt'), 'w').close()
-
-    def get_target_uri(self, docname, typ=None):
-        return ''
-
-    def get_outdated_docs(self):
-        return self.env.all_docs
-
-    def prepare_writing(self, docnames):
-        return
-
-    def write_doc(self, docname, doctree):
-        self.info()
-        for node in doctree.traverse(nodes.reference):
-            try:
-                self.check(node, docname)
-            except KeyError:
-                continue
-
-    def check(self, node, docname):
-        uri = node['refuri']
-
-        if '#' in uri:
-            uri = uri.split('#')[0]
-
-        if uri in self.good:
-            return
-
-        if uri[0:5] == 'http:' or uri[0:6] == 'https:':
-            self.info(uri, nonl=1)
-            lineno = None
-            while lineno is None and node:
-                node = node.parent
-                lineno = node.line
-
-            if uri in self.broken:
-                (r, s) = self.broken[uri]
-            elif uri in self.redirected:
-                (r, s) = self.redirected[uri]
-            else:
-                (r, s) = self.resolve(uri)
-
-            if r == 0:
-                self.info(' - ' + darkgreen('working'))
-                self.good.add(uri)
-            elif r == 2:
-                self.info(' - ' + red('broken: ') + s)
-                self.broken[uri] = (r, s)
-                self.write_entry('broken', docname, lineno, uri + ': ' + s)
-            else:
-                self.info(' - ' + purple('redirected') + ' to ' + s)
-                self.redirected[uri] = (r, s)
-                self.write_entry('redirected', docname, lineno, uri + ' to ' + s)
-
-        elif len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
-            return
-        else:
-            self.info(uri + ' - ' + red('malformed!'))
-            self.write_entry('malformed', docname, lineno, uri)
-
-        return
-
-    def write_entry(self, what, docname, line, uri):
-        output = open(path.join(self.outdir, 'output.txt'), 'a')
-        output.write("%s:%s [%s] %s\n" % (self.env.doc2path(docname, None),
-                                          line, what, uri))
-        output.close()
-
-    def resolve(self, uri):
-        try:
-            f = urlopen(uri)
-            f.close()
-        except HTTPError, err:
-            if err.code == 403 and uri.startswith('http://en.wikipedia.org/'):
-                # Wikipedia blocks requests from urllib User-Agent
-                return (0, 0)
-            return (2, str(err))
-        except Exception, err:
-            return (2, str(err))
-        if f.url.rstrip('/') == uri.rstrip('/'):
-            return (0, 0)
-        else:
-            return (1, f.url)
-
-    def finish(self):
-        return
-
-
-
+from sphinx.linkcheck import CheckExternalLinksBuilder
 
 builtin_builders = {
     'html': StandaloneHTMLBuilder,
diff --git a/sphinx/linkcheck.py b/sphinx/linkcheck.py
new file mode 100644
index 000000000..572f8d4fa
--- /dev/null
+++ b/sphinx/linkcheck.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+"""
+    sphinx.linkcheck
+    ~~~~~~~~~~~~~~~~
+
+    The CheckExternalLinksBuilder class.
+
+    :copyright: 2008 by Georg Brandl, Thomas Lamb.
+    :license: BSD.
+"""
+
+import socket
+from os import path
+from urllib2 import build_opener, HTTPError
+
+from docutils import nodes
+
+from sphinx.builder import Builder
+from sphinx.util.console import bold, purple, red, darkgreen
+
+# create an opener that will simulate a browser user-agent
+opener = build_opener()
+opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+
+
+class CheckExternalLinksBuilder(Builder):
+    """
+    Checks for broken external links.
+    """
+    name = 'linkcheck'
+
+    def init(self):
+        self.good = set()
+        self.broken = {}
+        self.redirected = {}
+        # set a timeout for non-responding servers
+        socket.setdefaulttimeout(5.0)
+        # create output file
+        open(path.join(self.outdir, 'output.txt'), 'w').close()
+
+    def get_target_uri(self, docname, typ=None):
+        return ''
+
+    def get_outdated_docs(self):
+        return self.env.all_docs
+
+    def prepare_writing(self, docnames):
+        return
+
+    def write_doc(self, docname, doctree):
+        self.info()
+        for node in doctree.traverse(nodes.reference):
+            try:
+                self.check(node, docname)
+            except KeyError:
+                continue
+
+    def check(self, node, docname):
+        uri = node['refuri']
+
+        if '#' in uri:
+            uri = uri.split('#')[0]
+
+        if uri in self.good:
+            return
+
+        if uri[0:5] == 'http:' or uri[0:6] == 'https:':
+            self.info(uri, nonl=1)
+            lineno = None
+            while lineno is None and node:
+                node = node.parent
+                lineno = node.line
+
+            if uri in self.broken:
+                (r, s) = self.broken[uri]
+            elif uri in self.redirected:
+                (r, s) = self.redirected[uri]
+            else:
+                (r, s) = self.resolve(uri)
+
+            if r == 0:
+                self.info(' - ' + darkgreen('working'))
+                self.good.add(uri)
+            elif r == 2:
+                self.info(' - ' + red('broken: ') + s)
+                self.broken[uri] = (r, s)
+                self.write_entry('broken', docname, lineno, uri + ': ' + s)
+            else:
+                self.info(' - ' + purple('redirected') + ' to ' + s)
+                self.redirected[uri] = (r, s)
+                self.write_entry('redirected', docname, lineno, uri + ' to ' + s)
+
+        elif len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
+            return
+        else:
+            self.info(uri + ' - ' + red('malformed!'))
+            self.write_entry('malformed', docname, lineno, uri)
+
+        return
+
+    def write_entry(self, what, docname, line, uri):
+        output = open(path.join(self.outdir, 'output.txt'), 'a')
+        output.write("%s:%s [%s] %s\n" % (self.env.doc2path(docname, None),
+                                          line, what, uri))
+        output.close()
+
+    def resolve(self, uri):
+        try:
+            f = opener.open(uri)
+            f.close()
+        except HTTPError, err:
+            #if err.code == 403 and uri.startswith('http://en.wikipedia.org/'):
+            #    # Wikipedia blocks requests from urllib User-Agent
+            #    return (0, 0)
+            return (2, str(err))
+        except Exception, err:
+            return (2, str(err))
+        if f.url.rstrip('/') == uri.rstrip('/'):
+            return (0, 0)
+        else:
+            return (1, f.url)
+
+    def finish(self):
+        return