Closes #1292: In the linkchecker, retry HEAD requests when denied by HTTP 405.

Also make the redirect code apparent and tweak the output a bit to be
more obvious.
This commit is contained in:
Georg Brandl
2014-01-12 23:52:16 +01:00
parent 5d23ef5a8c
commit 5406bff1cd
2 changed files with 66 additions and 28 deletions

View File

@@ -93,6 +93,10 @@ Bugs fixed
* #1249: Fix duplicate LaTeX page numbering for manual documents. * #1249: Fix duplicate LaTeX page numbering for manual documents.
* #1292: In the linkchecker, retry HEAD requests when denied by HTTP 405.
Also make the redirect code apparent and tweak the output a bit to be
more obvious.
Documentation Documentation
------------- -------------

View File

@@ -15,17 +15,30 @@ import Queue
import socket import socket
import threading import threading
from os import path from os import path
from urllib2 import build_opener, unquote, Request from urllib2 import build_opener, unquote, Request, \
HTTPError, HTTPRedirectHandler
from HTMLParser import HTMLParser, HTMLParseError from HTMLParser import HTMLParser, HTMLParseError
from docutils import nodes from docutils import nodes
from sphinx.builders import Builder from sphinx.builders import Builder
from sphinx.util.console import purple, red, darkgreen, darkgray from sphinx.util.console import purple, red, darkgreen, darkgray, \
darkred, turquoise
class RedirectHandler(HTTPRedirectHandler):
    """Redirect handler that remembers which HTTP status triggered a redirect.

    The status code is stored as ``redirect_code`` on the request that was
    redirected, so the code issuing the request can inspect it afterwards.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up request and tag *req* with the redirect code.

        The arguments are passed unchanged to
        ``HTTPRedirectHandler.redirect_request``, and whatever it returns is
        returned from here.  If the base implementation raises, *req* is
        left untouched.
        """
        follow_up = HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)
        # Record the code on the request being redirected (not on the
        # follow-up request), and only after the base class accepted it.
        req.redirect_code = code
        return follow_up
# create an opener that will simulate a browser user-agent # create an opener that will simulate a browser user-agent
opener = build_opener() opener = build_opener(RedirectHandler)
opener.addheaders = [('User-agent', 'Mozilla/5.0')] opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) '
'Gecko/20100101 Firefox/25.0')]
class HeadRequest(Request): class HeadRequest(Request):
@@ -104,18 +117,18 @@ class CheckExternalLinksBuilder(Builder):
# check for various conditions without bothering the network # check for various conditions without bothering the network
if len(uri) == 0 or uri[0] == '#' or \ if len(uri) == 0 or uri[0] == '#' or \
uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:': uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
return 'unchecked', '' return 'unchecked', '', 0
elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'): elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
return 'local', '' return 'local', '', 0
elif uri in self.good: elif uri in self.good:
return 'working', '' return 'working', '', 0
elif uri in self.broken: elif uri in self.broken:
return 'broken', self.broken[uri] return 'broken', self.broken[uri], 0
elif uri in self.redirected: elif uri in self.redirected:
return 'redirected', self.redirected[uri] return 'redirected', self.redirected[uri][0], self.redirected[uri][1]
for rex in self.to_ignore: for rex in self.to_ignore:
if rex.match(uri): if rex.match(uri):
return 'ignored', '' return 'ignored', '', 0
if '#' in uri: if '#' in uri:
req_url, hash = uri.split('#', 1) req_url, hash = uri.split('#', 1)
@@ -127,61 +140,82 @@ class CheckExternalLinksBuilder(Builder):
try: try:
if hash and self.app.config.linkcheck_anchors: if hash and self.app.config.linkcheck_anchors:
# Read the whole document and see if #hash exists # Read the whole document and see if #hash exists
f = opener.open(Request(req_url), **kwargs) req = Request(req_url)
f = opener.open(req, **kwargs)
found = check_anchor(f, unquote(hash)) found = check_anchor(f, unquote(hash))
f.close() f.close()
if not found: if not found:
raise Exception("Anchor '%s' not found" % hash) raise Exception("Anchor '%s' not found" % hash)
else: else:
f = opener.open(HeadRequest(req_url), **kwargs) try:
# try a HEAD request, which should be easier on
# the server and the network
req = HeadRequest(req_url)
f = opener.open(req, **kwargs)
f.close()
except HTTPError, err:
if err.code != 405:
raise
# retry with GET if that fails, some servers
# don't like HEAD requests and reply with 405
req = Request(req_url)
f = opener.open(req, **kwargs)
f.close() f.close()
except Exception, err: except Exception, err:
self.broken[uri] = str(err) self.broken[uri] = str(err)
return 'broken', str(err) return 'broken', str(err), 0
if f.url.rstrip('/') == req_url.rstrip('/'): if f.url.rstrip('/') == req_url.rstrip('/'):
self.good.add(uri) self.good.add(uri)
return 'working', 'new' return 'working', 'new', 0
else: else:
new_url = f.url new_url = f.url
if hash: if hash:
new_url += '#' + hash new_url += '#' + hash
code = getattr(req, 'redirect_code', 0)
self.redirected[uri] = new_url self.redirected[uri] = (new_url, code)
return 'redirected', new_url return 'redirected', new_url, code
while True: while True:
uri, docname, lineno = self.wqueue.get() uri, docname, lineno = self.wqueue.get()
if uri is None: if uri is None:
break break
status, info = check() status, info, code = check()
self.rqueue.put((uri, docname, lineno, status, info)) self.rqueue.put((uri, docname, lineno, status, info, code))
def process_result(self, result):
    """Report the outcome of one link check and record it in the output.

    *result* is the 6-tuple ``(uri, docname, lineno, status, info, code)``
    that the checker threads put on the result queue.  ``status`` is one of
    'unchecked', 'working', 'ignored', 'local', 'broken' or 'redirected'.
    ``info`` is the error text for broken links and the target URL for
    redirects; ``code`` is the HTTP redirect status, or 0 when no code was
    recorded.
    """
    uri, docname, lineno, status, info, code = result
    if status == 'unchecked':
        return
    if status == 'working' and info != 'new':
        # Only links verified in this call carry info == 'new'; a URI that
        # was already known to be good is not reported again.
        return
    if lineno:
        self.info('(line %4d) ' % lineno, nonl=1)
    if status == 'ignored':
        self.info(darkgray('-ignored- ') + uri)
    elif status == 'local':
        self.info(darkgray('-local- ') + uri)
        self.write_entry('local', docname, lineno, uri)
    elif status == 'working':
        self.info(darkgreen('ok ') + uri)
    elif status == 'broken':
        self.info(red('broken ') + uri + red(' - ' + info))
        self.write_entry('broken', docname, lineno, uri + ': ' + info)
        if self.app.quiet:
            # In quiet mode the info line above is not shown, so emit a
            # warning to keep the broken link visible.
            self.warn('broken link: %s' % uri,
                      '%s:%s' % (self.env.doc2path(docname), lineno))
    elif status == 'redirected':
        unknown = ('with unknown code', purple)
        # Use .get() with a fallback: a redirect status that is missing
        # from this table must not abort reporting with a KeyError.
        text, color = {
            301: ('permanently', darkred),
            302: ('with Found', purple),
            303: ('with See Other', purple),
            307: ('temporarily', turquoise),
            0: unknown,
        }.get(code, unknown)
        self.write_entry('redirected ' + text, docname, lineno,
                         uri + ' to ' + info)
        self.info(color('redirect ') + uri + color(' - ' + text + ' to ' + info))
def get_target_uri(self, docname, typ=None): def get_target_uri(self, docname, typ=None):
return '' return ''