sphinx/tests/etree13/HTMLTreeBuilder.py

#
# ElementTree
# $Id$
#
# a simple tree builder, for HTML input
#
# history:
# 2002-04-06 fl   created
# 2002-04-07 fl   ignore IMG and HR end tags
# 2002-04-07 fl   added support for 1.5.2 and later
# 2003-04-13 fl   added HTMLTreeBuilder alias
# 2004-12-02 fl   don't feed non-ASCII charrefs/entities as 8-bit strings
# 2004-12-05 fl   don't feed non-ASCII CDATA as 8-bit strings
#
# Copyright (c) 1999-2004 by Fredrik Lundh.  All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

##
# Tools to build element trees from HTML files.
##

import htmlentitydefs
import re, string, sys
import mimetools, StringIO

import ElementTree

# Tags that handle_starttag / handle_endtag are allowed to close
# implicitly when the document leaves them open.
AUTOCLOSE = ("p", "li", "tr", "th", "td", "head", "body")
# Tags that are ended immediately after their start tag; explicit end
# tags for these are ignored.
IGNOREEND = ("img", "hr", "meta", "link", "br")

# is_not_ascii(text) returns a match object if text contains a
# character outside the 7-bit ASCII range, and None otherwise.
if sys.version[:3] != "1.5":
    # NOTE(review): the pattern is built through eval(), presumably so
    # that the u"" literal is never seen by the 1.5.2 compiler -- confirm
    is_not_ascii = re.compile(eval(r'u"[\u0080-\uffff]"')).search
else:
    is_not_ascii = re.compile(r"[\x80-\xff]").search # 1.5.2

# Use the HTMLParser module when it can be imported; otherwise define a
# minimal stand-in class with the same name on top of sgmllib.
try:
    from HTMLParser import HTMLParser
except ImportError:
    from sgmllib import SGMLParser
    # hack to use sgmllib's SGMLParser to emulate 2.2's HTMLParser
    class HTMLParser(SGMLParser):
        # the following only works as long as this class doesn't
        # provide any do, start, or end handlers
        #
        # Both hooks simply forward to the HTMLParser-style handler
        # names, which subclasses are expected to define.
        def unknown_starttag(self, tag, attrs):
            self.handle_starttag(tag, attrs)
        def unknown_endtag(self, tag):
            self.handle_endtag(tag)

##
# ElementTree builder for HTML source code.  This builder converts an
# HTML document or fragment to an ElementTree.
# <p>
# The parser is relatively picky, and requires balanced tags for most
# elements.  However, elements belonging to the following group are
# automatically closed: P, LI, TR, TH, TD, HEAD, and BODY.  In
# addition, the parser automatically inserts end tags immediately after
# the start tag, and ignores any end tags for the following group:
# IMG, HR, META, LINK, and BR.
#
# @keyparam builder Optional builder object.  If omitted, the parser
#     uses the standard <b>elementtree</b> builder.
# @keyparam encoding Optional character encoding, if known.  If omitted,
#     the parser defaults to ISO-8859-1.  In either case, a META
#     content-type tag that carries a charset parameter replaces the
#     encoding from that point on.  Note that if your document uses a
#     non-ASCII compatible encoding, you must decode the document
#     before parsing.
#
# @see elementtree.ElementTree

class HTMLTreeBuilder(HTMLParser):

    # FIXME: shouldn't this class be named Parser, not Builder?

    def __init__(self, builder=None, encoding=None):
        # names of the elements that are currently open, innermost last
        self.__stack = []
        if builder is None:
            builder = ElementTree.TreeBuilder()
        self.__builder = builder
        # used by handle_data to decode non-ASCII 8-bit strings; may be
        # replaced later by a META content-type directive
        self.encoding = encoding or "iso-8859-1"
        HTMLParser.__init__(self)

    ##
    # Flushes parser buffers, and returns the root element.
    #
    # @return The result of the builder's close() method (an Element
    #     instance for the standard builder).

    def close(self):
        HTMLParser.close(self)
        return self.__builder.close()

    ##
    # (Internal) Handles start tags.
    #
    # @param tag The tag name.
    # @param attrs A sequence of (name, value) pairs.

    def handle_starttag(self, tag, attrs):
        if tag == "meta":
            # look for encoding directives
            http_equiv = content = None
            for k, v in attrs:
                if k == "http-equiv":
                    # a valueless attribute has v set to None; don't
                    # try to lowercase that
                    http_equiv = v and v.lower()
                elif k == "content":
                    content = v
            if http_equiv == "content-type" and content:
                # use mimetools to parse the http header
                header = mimetools.Message(
                    StringIO.StringIO("%s: %s\n\n" % (http_equiv, content))
                    )
                encoding = header.getparam("charset")
                if encoding:
                    self.encoding = encoding
        if tag in AUTOCLOSE:
            # a repeated auto-closable tag (e.g. <li> directly inside an
            # open <li>) implicitly ends the previous one
            if self.__stack and self.__stack[-1] == tag:
                self.handle_endtag(tag)
        self.__stack.append(tag)
        attrib = {}
        if attrs:
            for k, v in attrs:
                attrib[k.lower()] = v
        self.__builder.start(tag, attrib)
        if tag in IGNOREEND:
            # these elements are ended right away; handle_endtag
            # ignores their explicit end tags
            self.__stack.pop()
            self.__builder.end(tag)

    ##
    # (Internal) Handles end tags.  Auto-closable elements that are
    # still open inside the element being ended are closed first,
    # innermost first.
    #
    # @param tag The tag name.
    # @exception IndexError If there is no open element left to end.

    def handle_endtag(self, tag):
        if tag in IGNOREEND:
            return
        lasttag = self.__stack.pop()
        # Close the open auto-closable elements in stack order, so that
        # "<table><tr><td>x</table>" ends td, then tr, then table.
        while lasttag != tag and lasttag in AUTOCLOSE:
            self.__builder.end(lasttag)
            lasttag = self.__stack.pop()
        # if lasttag still differs from tag here, the document is not
        # balanced; reporting that is left to the builder
        self.__builder.end(tag)

    ##
    # (Internal) Handles character references.
    #
    # @param char The reference without the leading "&#", either
    #     decimal digits or "x"/"X" followed by hexadecimal digits.

    def handle_charref(self, char):
        if char[:1] in ("x", "X"):
            char = int(char[1:], 16)
        else:
            char = int(char)
        if 0 <= char < 128:
            self.__builder.data(chr(char))
        else:
            self.__builder.data(unichr(char))

    ##
    # (Internal) Handles entity references.
    #
    # @param name The entity name, without "&" and ";".

    def handle_entityref(self, name):
        entity = htmlentitydefs.entitydefs.get(name)
        if entity:
            # the table value is either a single character or a
            # character reference of the form "&#NNN;"
            if len(entity) == 1:
                entity = ord(entity)
            else:
                entity = int(entity[2:-1])
            if 0 <= entity < 128:
                self.__builder.data(chr(entity))
            else:
                self.__builder.data(unichr(entity))
        else:
            self.unknown_entityref(name)

    ##
    # (Internal) Handles character data.
    #
    # @param data The text to pass on to the builder.

    def handle_data(self, data):
        if isinstance(data, type('')) and is_not_ascii(data):
            # convert to unicode, but only if necessary; characters
            # that cannot be decoded with self.encoding are dropped
            data = unicode(data, self.encoding, "ignore")
        self.__builder.data(data)

    ##
    # (Hook) Handles unknown entity references.  The default action
    # is to ignore unknown entities.
    #
    # @param name The entity name, without "&" and ";".

    def unknown_entityref(self, name):
        pass # ignore by default; override if necessary

##
# An alias for the <b>HTMLTreeBuilder</b> class.  Note that this name
# refers to the HTML parser class defined in this module, not to the
# <b>ElementTree.TreeBuilder</b> object that the parser uses as its
# default builder.

TreeBuilder = HTMLTreeBuilder

##
# Builds an element tree from an HTML document or document fragment,
# using a fresh <b>HTMLTreeBuilder</b> as the parser.
#
# @param source A filename or file object containing HTML data.
# @param encoding Optional character encoding, if known.  It is passed
#     on to the <b>HTMLTreeBuilder</b> constructor unchanged.
# @return An ElementTree instance

def parse(source, encoding=None):
    html_parser = HTMLTreeBuilder(encoding=encoding)
    tree = ElementTree.parse(source, html_parser)
    return tree

if __name__ == "__main__":
    # Command-line use: parse the HTML file named by the first
    # argument and dump the resulting tree with ElementTree.dump.
    import sys
    source = open(sys.argv[1])
    try:
        ElementTree.dump(parse(source))
    finally:
        # close the input file explicitly rather than leaving it to
        # interpreter shutdown
        source.close()
Merged revisions 65283,65303,65316-65317,65372-65375,65377,65380,65483-65485,65494 via svnmerge from svn+ssh://pythondev@svn.python.org/doctools/branches/0.4.x ........ r65283 \| georg.brandl \| 2008-07-29 10:07:26 +0000 (Tue, 29 Jul 2008) \| 2 lines Update ez_setup.py. ........ r65303 \| benjamin.peterson \| 2008-07-30 12:35:34 +0000 (Wed, 30 Jul 2008) \| 1 line add a with_testapp decorator for test functions that passes the TestApp instance in a cleans up after it ........ r65316 \| benjamin.peterson \| 2008-07-30 23:12:07 +0000 (Wed, 30 Jul 2008) \| 1 line make the app for test_markup global to the module ........ r65317 \| benjamin.peterson \| 2008-07-30 23:31:29 +0000 (Wed, 30 Jul 2008) \| 1 line make TestApp.cleanup more aggressive ........ r65372 \| georg.brandl \| 2008-08-01 19:11:22 +0000 (Fri, 01 Aug 2008) \| 2 lines Add more tests, fix a few bugs in image handling. ........ r65373 \| georg.brandl \| 2008-08-01 19:28:33 +0000 (Fri, 01 Aug 2008) \| 2 lines Fix oversight. ........ r65374 \| benjamin.peterson \| 2008-08-01 19:36:32 +0000 (Fri, 01 Aug 2008) \| 1 line fix one broken test ........ r65375 \| georg.brandl \| 2008-08-01 19:41:11 +0000 (Fri, 01 Aug 2008) \| 2 lines Fix the handling of non-ASCII input in quickstart. ........ r65377 \| georg.brandl \| 2008-08-01 19:48:24 +0000 (Fri, 01 Aug 2008) \| 2 lines Allow REs in markup checks. ........ r65380 \| georg.brandl \| 2008-08-01 20:31:18 +0000 (Fri, 01 Aug 2008) \| 2 lines Don't rely on mtimes being different for changed files. ........ r65483 \| georg.brandl \| 2008-08-04 09:01:40 +0000 (Mon, 04 Aug 2008) \| 4 lines Add an "encoding" option to literalinclude. Add tests for include directives. ........ r65484 \| georg.brandl \| 2008-08-04 09:11:17 +0000 (Mon, 04 Aug 2008) \| 2 lines Add changelog entry. ........ r65485 \| georg.brandl \| 2008-08-04 09:21:58 +0000 (Mon, 04 Aug 2008) \| 2 lines Fix markup. ........ 
r65494 \| georg.brandl \| 2008-08-04 16:34:59 +0000 (Mon, 04 Aug 2008) \| 2 lines Correctly use HTML file suffix in templates. ........ 2008-08-04 12:01:15 -05:00			`#`
			`# ElementTree`
			`# $Id$`
			`#`
			`# a simple tree builder, for HTML input`
			`#`
			`# history:`
			`# 2002-04-06 fl created`
			`# 2002-04-07 fl ignore IMG and HR end tags`
			`# 2002-04-07 fl added support for 1.5.2 and later`
			`# 2003-04-13 fl added HTMLTreeBuilder alias`
			`# 2004-12-02 fl don't feed non-ASCII charrefs/entities as 8-bit strings`
			`# 2004-12-05 fl don't feed non-ASCII CDATA as 8-bit strings`
			`#`
			`# Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved.`
			`#`
			`# fredrik@pythonware.com`
			`# http://www.pythonware.com`
			`#`
			`# --------------------------------------------------------------------`
			`# The ElementTree toolkit is`
			`#`
			`# Copyright (c) 1999-2007 by Fredrik Lundh`
			`#`
			`# By obtaining, using, and/or copying this software and/or its`
			`# associated documentation, you agree that you have read, understood,`
			`# and will comply with the following terms and conditions:`
			`#`
			`# Permission to use, copy, modify, and distribute this software and`
			`# its associated documentation for any purpose and without fee is`
			`# hereby granted, provided that the above copyright notice appears in`
			`# all copies, and that both that copyright notice and this permission`
			`# notice appear in supporting documentation, and that the name of`
			`# Secret Labs AB or the author not be used in advertising or publicity`
			`# pertaining to distribution of the software without specific, written`
			`# prior permission.`
			`#`
			`# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD`
			`# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-`
			`# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR`
			`# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY`
			`# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,`
			`# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS`
			`# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE`
			`# OF THIS SOFTWARE.`
			`# --------------------------------------------------------------------`

			`##`
			`# Tools to build element trees from HTML files.`
			`##`

			`import htmlentitydefs`
			`import re, string, sys`
			`import mimetools, StringIO`

			`import ElementTree`

			`AUTOCLOSE = "p", "li", "tr", "th", "td", "head", "body"`
			`IGNOREEND = "img", "hr", "meta", "link", "br"`

			`if sys.version[:3] == "1.5":`
			`is_not_ascii = re.compile(r"[\x80-\xff]").search # 1.5.2`
			`else:`
			`is_not_ascii = re.compile(eval(r'u"[\u0080-\uffff]"')).search`

			`try:`
			`from HTMLParser import HTMLParser`
			`except ImportError:`
			`from sgmllib import SGMLParser`
			`# hack to use sgmllib's SGMLParser to emulate 2.2's HTMLParser`
			`class HTMLParser(SGMLParser):`
			`# the following only works as long as this class doesn't`
			`# provide any do, start, or end handlers`
			`def unknown_starttag(self, tag, attrs):`
			`self.handle_starttag(tag, attrs)`
			`def unknown_endtag(self, tag):`
			`self.handle_endtag(tag)`

			`##`
			`# ElementTree builder for HTML source code. This builder converts an`
			`# HTML document or fragment to an ElementTree.`
			`# <p>`
			`# The parser is relatively picky, and requires balanced tags for most`
			`# elements. However, elements belonging to the following group are`
			`# automatically closed: P, LI, TR, TH, and TD. In addition, the`
			`# parser automatically inserts end tags immediately after the start`
			`# tag, and ignores any end tags for the following group: IMG, HR,`
			`# META, and LINK.`
			`#`
			`# @keyparam builder Optional builder object. If omitted, the parser`
			`# uses the standard <b>elementtree</b> builder.`
			`# @keyparam encoding Optional character encoding, if known. If omitted,`
			`# the parser looks for META tags inside the document. If no tags`
			`# are found, the parser defaults to ISO-8859-1. Note that if your`
			`# document uses a non-ASCII compatible encoding, you must decode`
			`# the document before parsing.`
			`#`
			`# @see elementtree.ElementTree`

			`class HTMLTreeBuilder(HTMLParser):`

			`# FIXME: shouldn't this class be named Parser, not Builder?`

			`def __init__(self, builder=None, encoding=None):`
			`self.__stack = []`
			`if builder is None:`
			`builder = ElementTree.TreeBuilder()`
			`self.__builder = builder`
			`self.encoding = encoding or "iso-8859-1"`
			`HTMLParser.__init__(self)`

			`##`
			`# Flushes parser buffers, and return the root element.`
			`#`
			`# @return An Element instance.`

			`def close(self):`
			`HTMLParser.close(self)`
			`return self.__builder.close()`

			`##`
			`# (Internal) Handles start tags.`

			`def handle_starttag(self, tag, attrs):`
			`if tag == "meta":`
			`# look for encoding directives`
			`http_equiv = content = None`
			`for k, v in attrs:`
			`if k == "http-equiv":`
			`http_equiv = string.lower(v)`
			`elif k == "content":`
			`content = v`
			`if http_equiv == "content-type" and content:`
			`# use mimetools to parse the http header`
			`header = mimetools.Message(`
			`StringIO.StringIO("%s: %s\n\n" % (http_equiv, content))`
			`)`
			`encoding = header.getparam("charset")`
			`if encoding:`
			`self.encoding = encoding`
			`if tag in AUTOCLOSE:`
			`if self.__stack and self.__stack[-1] == tag:`
			`self.handle_endtag(tag)`
			`self.__stack.append(tag)`
			`attrib = {}`
			`if attrs:`
			`for k, v in attrs:`
			`attrib[string.lower(k)] = v`
			`self.__builder.start(tag, attrib)`
			`if tag in IGNOREEND:`
			`self.__stack.pop()`
			`self.__builder.end(tag)`

			`##`
			`# (Internal) Handles end tags.`

			`def handle_endtag(self, tag):`
			`if tag in IGNOREEND:`
			`return`
			`lasttag = self.__stack.pop()`
			`if tag != lasttag and lasttag in AUTOCLOSE:`
			`self.handle_endtag(lasttag)`
			`self.__builder.end(tag)`

			`##`
			`# (Internal) Handles character references.`

			`def handle_charref(self, char):`
			`if char[:1] == "x":`
			`char = int(char[1:], 16)`
			`else:`
			`char = int(char)`
			`if 0 <= char < 128:`
			`self.__builder.data(chr(char))`
			`else:`
			`self.__builder.data(unichr(char))`

			`##`
			`# (Internal) Handles entity references.`

			`def handle_entityref(self, name):`
			`entity = htmlentitydefs.entitydefs.get(name)`
			`if entity:`
			`if len(entity) == 1:`
			`entity = ord(entity)`
			`else:`
			`entity = int(entity[2:-1])`
			`if 0 <= entity < 128:`
			`self.__builder.data(chr(entity))`
			`else:`
			`self.__builder.data(unichr(entity))`
			`else:`
			`self.unknown_entityref(name)`

			`##`
			`# (Internal) Handles character data.`

			`def handle_data(self, data):`
			`if isinstance(data, type('')) and is_not_ascii(data):`
			`# convert to unicode, but only if necessary`
			`data = unicode(data, self.encoding, "ignore")`
			`self.__builder.data(data)`

			`##`
			`# (Hook) Handles unknown entity references. The default action`
			`# is to ignore unknown entities.`

			`def unknown_entityref(self, name):`
			`pass # ignore by default; override if necessary`

			`##`
			`# An alias for the <b>HTMLTreeBuilder</b> class.`

			`TreeBuilder = HTMLTreeBuilder`

			`##`
			`# Parse an HTML document or document fragment.`
			`#`
			`# @param source A filename or file object containing HTML data.`
			`# @param encoding Optional character encoding, if known. If omitted,`
			`# the parser looks for META tags inside the document. If no tags`
			`# are found, the parser defaults to ISO-8859-1.`
			`# @return An ElementTree instance`

			`def parse(source, encoding=None):`
			`return ElementTree.parse(source, HTMLTreeBuilder(encoding=encoding))`

			`if __name__ == "__main__":`
			`import sys`
			`ElementTree.dump(parse(open(sys.argv[1])))`