From e0e9d2a7faaaa247900718c4185a0ebe9042903d Mon Sep 17 00:00:00 2001 From: Marti Raudsepp Date: Tue, 28 Feb 2012 20:12:59 +0200 Subject: [PATCH 1/2] Add #anchor checking to 'linkcheck' builder. This requires us to download the document and parse its HTML. --- doc/config.rst | 8 ++++ sphinx/builders/linkcheck.py | 73 +++++++++++++++++++++++++++++++----- sphinx/config.py | 1 + 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/doc/config.rst b/doc/config.rst index 80457a6a6..4368a3060 100644 --- a/doc/config.rst +++ b/doc/config.rst @@ -1318,6 +1318,14 @@ Options for the linkcheck builder .. versionadded:: 1.1 +.. confval:: linkcheck_anchors + + True or false, whether to check the existence of #anchor in links. Since + this requires downloading the whole document, it's considerably slower + when enabled. Default is ``True``. + + .. versionadded:: 1.2 + .. rubric:: Footnotes diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index ad15b55de..25d34acac 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -15,7 +15,8 @@ import Queue import socket import threading from os import path -from urllib2 import build_opener, Request +from urllib2 import build_opener, unquote, Request +from HTMLParser import HTMLParser, HTMLParseError from docutils import nodes @@ -33,6 +34,42 @@ class HeadRequest(Request): return 'HEAD' +class AnchorCheckParser(HTMLParser): + def __init__(self, search_anchor): + HTMLParser.__init__(self) + + self.search_anchor = search_anchor + self.found = False + + def handle_starttag(self, tag, attrs): + for key, value in attrs: + if key in ('id', 'name') and value == self.search_anchor: + self.found = True + +def check_anchor(f, hash): + """Reads HTML data from a filelike object 'f' searching for anchor 'hash'. + + Returns True if anchor was found, False otherwise""" + + parser = AnchorCheckParser(hash) + + try: + # Read file in chunks of 8192 bytes. 
If we find a matching anchor, we + # break the loop early in hopes not to have to download the whole thing + + chunk = f.read(8192) + while chunk and not parser.found: + parser.feed(chunk) + chunk = f.read(8192) + + parser.close() + except HTMLParseError: + # HTMLParser is usually pretty good with sloppy HTML, but it tends to + # choke on EOF. But we're done then anyway. + pass + + return parser.found + class CheckExternalLinksBuilder(Builder): """ Checks for broken external links. @@ -66,7 +103,7 @@ class CheckExternalLinksBuilder(Builder): def check(): # check for various conditions without bothering the network - if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:': + if len(uri) == 0 or uri[0] == '#' or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:': return 'unchecked', '' elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'): return 'local', '' @@ -80,19 +117,39 @@ class CheckExternalLinksBuilder(Builder): if rex.match(uri): return 'ignored', '' + if '#' in uri: + req_url, hash = uri.split('#', 1) + else: + req_url = uri + hash = None + # need to actually check the URI try: - f = opener.open(HeadRequest(uri), **kwargs) - f.close() + if hash and self.app.config.linkcheck_anchors: + # Read the whole document and see if #hash exists + f = opener.open(Request(req_url), **kwargs) + found = check_anchor(f, unquote(hash)) + f.close() + + if not found: + raise Exception("Anchor '%s' not found" % hash) + else: + f = opener.open(HeadRequest(req_url), **kwargs) + f.close() + except Exception, err: self.broken[uri] = str(err) return 'broken', str(err) - if f.url.rstrip('/') == uri.rstrip('/'): + if f.url.rstrip('/') == req_url.rstrip('/'): self.good.add(uri) return 'working', 'new' else: - self.redirected[uri] = f.url - return 'redirected', f.url + new_url = f.url + if hash: + new_url += '#' + hash + + self.redirected[uri] = new_url + return 'redirected', new_url while True: uri, docname, lineno = self.wqueue.get() @@ -142,8 +199,6 @@ class 
CheckExternalLinksBuilder(Builder): if 'refuri' not in node: continue uri = node['refuri'] - if '#' in uri: - uri = uri.split('#')[0] lineno = None while lineno is None: node = node.parent diff --git a/sphinx/config.py b/sphinx/config.py index 767bf0882..17b961aeb 100644 --- a/sphinx/config.py +++ b/sphinx/config.py @@ -179,6 +179,7 @@ class Config(object): linkcheck_ignore = ([], None), linkcheck_timeout = (None, None), linkcheck_workers = (5, None), + linkcheck_anchors = (True, None), # gettext options gettext_compact = (True, 'gettext'), From fd70920456dcd4323118ce229e79edca33c32256 Mon Sep 17 00:00:00 2001 From: Marti Raudsepp Date: Tue, 28 Feb 2012 20:34:57 +0200 Subject: [PATCH 2/2] Fix docutils links found by linkcheck (with the new anchor checking feature) --- doc/rest.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/rest.rst b/doc/rest.rst index db832ed82..ccf51c66f 100644 --- a/doc/rest.rst +++ b/doc/rest.rst @@ -265,8 +265,9 @@ Docutils supports the following directives: * Admonitions: :dudir:`attention`, :dudir:`caution`, :dudir:`danger`, :dudir:`error`, :dudir:`hint`, :dudir:`important`, :dudir:`note`, - :dudir:`tip`, :dudir:`warning` and the generic :dudir:`admonition`. - (Most themes style only "note" and "warning" specially.) + :dudir:`tip`, :dudir:`warning` and the generic + :dudir:`admonition <admonitions>`. (Most themes style only "note" and + "warning" specially.) 
* Images: @@ -285,7 +286,7 @@ Docutils supports the following directives: - :dudir:`epigraph` (a block quote with optional attribution line) - :dudir:`highlights`, :dudir:`pull-quote` (block quotes with their own class attribute) - - :dudir:`compound` (a compound paragraph) + - :dudir:`compound <compound-paragraph>` (a compound paragraph) * Special tables: @@ -295,7 +296,7 @@ Docutils supports the following directives: * Special directives: - - :dudir:`raw` (include raw target-format markup) + - :dudir:`raw <raw-data-pass-through>` (include raw target-format markup) - :dudir:`include` (include reStructuredText from another file) -- in Sphinx, when given an absolute include file path, this directive takes it as relative to the source directory @@ -304,7 +305,7 @@ Docutils supports the following directives: * HTML specifics: - :dudir:`meta` (generation of HTML ``<meta>`` tags) - - :dudir:`title` (override document title) + - :dudir:`title <metadata-document-title>` (override document title) * Influencing markup: @@ -472,9 +473,8 @@ There are some problems one commonly runs into while authoring reST documents: * **Separation of inline markup:** As said above, inline markup spans must be separated from the surrounding text by non-word characters, you have to use a - backslash-escaped space to get around that. See `the reference - `_ - for the details. + backslash-escaped space to get around that. See + :duref:`the reference <inline-markup>` for the details. * **No nested inline markup:** Something like ``*see :func:`foo`*`` is not possible.