Move link checker to its own file. Use different user-agent to enable Wikipedia lookup.

This commit is contained in:
Georg Brandl 2008-03-14 23:47:30 +00:00
parent f2d713c577
commit e886f2c5ae
2 changed files with 126 additions and 105 deletions

View File

@@ -5,7 +5,7 @@
Builder classes for different output formats.
:copyright: 2007-2008 by Georg Brandl, Thomas Lamb.
:copyright: 2007-2008 by Georg Brandl.
:license: BSD.
"""
@@ -13,11 +13,9 @@ import os
import time
import codecs
import shutil
import socket
import cPickle as pickle
from os import path
from cgi import escape
from urllib2 import urlopen, HTTPError
from docutils import nodes
from docutils.io import StringOutput, FileOutput, DocTreeInput
@@ -891,108 +889,7 @@ class ChangesBuilder(Builder):
pass
class CheckExternalLinksBuilder(Builder):
    """
    Checks for broken external links.

    Writes nothing but a report: each broken/redirected/malformed link is
    printed to the console and appended to ``output.txt`` in the output
    directory as ``<sourcefile>:<line> [<kind>] <uri>``.
    """
    name = 'linkcheck'

    def init(self):
        # URIs that have already resolved successfully; skipped on reuse
        self.good = set()
        # uri -> (status, info) caches so every URI is fetched only once
        self.broken = {}
        self.redirected = {}
        # set a timeout for non-responding servers
        socket.setdefaulttimeout(5.0)
        # create output file (truncate any previous run's report)
        open(path.join(self.outdir, 'output.txt'), 'w').close()

    def get_target_uri(self, docname, typ=None):
        # this builder renders no output documents, so there are no targets
        return ''

    def get_outdated_docs(self):
        # re-check links in every document on each run
        return self.env.all_docs

    def prepare_writing(self, docnames):
        return

    def write_doc(self, docname, doctree):
        self.info()
        for node in doctree.traverse(nodes.reference):
            try:
                self.check(node, docname)
            except KeyError:
                # reference node without a 'refuri' attribute
                continue

    def check(self, node, docname):
        # Check one reference node; raises KeyError (caught by the caller)
        # when the node has no 'refuri'.
        uri = node['refuri']
        # only the document part of the URI can be fetched, drop fragment
        if '#' in uri:
            uri = uri.split('#')[0]
        if uri in self.good:
            return
        if uri[0:5] == 'http:' or uri[0:6] == 'https:':
            self.info(uri, nonl=1)
            # walk up to the nearest ancestor that knows its source line
            # NOTE(review): if no ancestor carries a line number,
            # node.parent eventually becomes None and the next iteration's
            # node.line raises AttributeError -- confirm against docutils
            lineno = None
            while lineno is None and node:
                node = node.parent
                lineno = node.line
            # reuse cached results; status r is 0 working / 1 redirected /
            # 2 broken, s is the error message or redirect target
            if uri in self.broken:
                (r, s) = self.broken[uri]
            elif uri in self.redirected:
                (r, s) = self.redirected[uri]
            else:
                (r, s) = self.resolve(uri)
            if r == 0:
                self.info(' - ' + darkgreen('working'))
                self.good.add(uri)
            elif r == 2:
                self.info(' - ' + red('broken: ') + s)
                self.broken[uri] = (r, s)
                self.write_entry('broken', docname, lineno, uri + ': ' + s)
            else:
                self.info(' - ' + purple('redirected') + ' to ' + s)
                self.redirected[uri] = (r, s)
                self.write_entry('redirected', docname, lineno, uri + ' to ' + s)
        elif len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
            # empty, mailto: and ftp: URIs are not checked
            return
        else:
            self.info(uri + ' - ' + red('malformed!'))
            # NOTE(review): 'lineno' is only assigned in the http/https
            # branch above; this call raises NameError for a malformed URI
            self.write_entry('malformed', docname, lineno, uri)
        return

    def write_entry(self, what, docname, line, uri):
        # append one "<sourcefile>:<line> [<kind>] <uri>" report line
        output = open(path.join(self.outdir, 'output.txt'), 'a')
        output.write("%s:%s [%s] %s\n" % (self.env.doc2path(docname, None),
                                          line, what, uri))
        output.close()

    def resolve(self, uri):
        # Fetch *uri*; returns (0, 0) if working, (1, new_url) if
        # redirected, (2, error_message) if broken.
        try:
            f = urlopen(uri)
            f.close()
        except HTTPError, err:
            if err.code == 403 and uri.startswith('http://en.wikipedia.org/'):
                # Wikipedia blocks requests from urllib User-Agent
                return (0, 0)
            return (2, str(err))
        except Exception, err:
            return (2, str(err))
        # treat a URL that differs only by a trailing slash as not redirected
        if f.url.rstrip('/') == uri.rstrip('/'):
            return (0, 0)
        else:
            return (1, f.url)

    def finish(self):
        return
from sphinx.linkcheck import CheckExternalLinksBuilder
builtin_builders = {
'html': StandaloneHTMLBuilder,

124
sphinx/linkcheck.py Normal file
View File

@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-
"""
sphinx.linkcheck
~~~~~~~~~~~~~~~~
The CheckExternalLinksBuilder class.
:copyright: 2008 by Georg Brandl, Thomas Lamb.
:license: BSD.
"""
import socket
from os import path
from urllib2 import build_opener, HTTPError
from docutils import nodes
from sphinx.builder import Builder
from sphinx.util.console import bold, purple, red, darkgreen
# create an opener that will simulate a browser user-agent
# (some sites reject urllib2's default User-Agent -- e.g. Wikipedia
# answers it with 403 Forbidden -- so pretend to be Mozilla)
opener = build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
class CheckExternalLinksBuilder(Builder):
"""
Checks for broken external links.
"""
name = 'linkcheck'
def init(self):
self.good = set()
self.broken = {}
self.redirected = {}
# set a timeout for non-responding servers
socket.setdefaulttimeout(5.0)
# create output file
open(path.join(self.outdir, 'output.txt'), 'w').close()
def get_target_uri(self, docname, typ=None):
return ''
def get_outdated_docs(self):
return self.env.all_docs
def prepare_writing(self, docnames):
return
def write_doc(self, docname, doctree):
self.info()
for node in doctree.traverse(nodes.reference):
try:
self.check(node, docname)
except KeyError:
continue
def check(self, node, docname):
uri = node['refuri']
if '#' in uri:
uri = uri.split('#')[0]
if uri in self.good:
return
if uri[0:5] == 'http:' or uri[0:6] == 'https:':
self.info(uri, nonl=1)
lineno = None
while lineno is None and node:
node = node.parent
lineno = node.line
if uri in self.broken:
(r, s) = self.broken[uri]
elif uri in self.redirected:
(r, s) = self.redirected[uri]
else:
(r, s) = self.resolve(uri)
if r == 0:
self.info(' - ' + darkgreen('working'))
self.good.add(uri)
elif r == 2:
self.info(' - ' + red('broken: ') + s)
self.broken[uri] = (r, s)
self.write_entry('broken', docname, lineno, uri + ': ' + s)
else:
self.info(' - ' + purple('redirected') + ' to ' + s)
self.redirected[uri] = (r, s)
self.write_entry('redirected', docname, lineno, uri + ' to ' + s)
elif len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
return
else:
self.info(uri + ' - ' + red('malformed!'))
self.write_entry('malformed', docname, lineno, uri)
return
def write_entry(self, what, docname, line, uri):
output = open(path.join(self.outdir, 'output.txt'), 'a')
output.write("%s:%s [%s] %s\n" % (self.env.doc2path(docname, None),
line, what, uri))
output.close()
def resolve(self, uri):
try:
f = opener.open(uri)
f.close()
except HTTPError, err:
#if err.code == 403 and uri.startswith('http://en.wikipedia.org/'):
# # Wikipedia blocks requests from urllib User-Agent
# return (0, 0)
return (2, str(err))
except Exception, err:
return (2, str(err))
if f.url.rstrip('/') == uri.rstrip('/'):
return (0, 0)
else:
return (1, f.url)
def finish(self):
return