Source code for zope.tal.htmltalparser

##############################################################################
#
# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""
Parse HTML and compile to :class:`~.TALInterpreter` intermediate code, using
a :class:`~.TALGenerator`.
"""
from html.parser import HTMLParser

from zope.tal.taldefs import ZOPE_I18N_NS
from zope.tal.taldefs import ZOPE_METAL_NS
from zope.tal.taldefs import ZOPE_TAL_NS
from zope.tal.taldefs import I18NError
from zope.tal.taldefs import METALError
from zope.tal.taldefs import TALError
from zope.tal.talgenerator import TALGenerator


[docs]class HTMLParseError(Exception):
    # Python 3.5 removed this class, but we need it as a base class
    # so here's a copy taken from Python 3.4

    def __init__(self, msg, position=(None, None)):
        Exception.__init__(self)
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


_html_parser_extras = {}
if 'convert_charrefs' in HTMLParser.__init__.__code__.co_names:
    _html_parser_extras['convert_charrefs'] = False  # pragma: NO COVER py34

#: List of Boolean attributes in HTML that may be given in
#: minimized form (e.g. ``<img ismap>`` rather than ``<img ismap="">``)
#: From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
BOOLEAN_HTML_ATTRS = frozenset([
    "compact", "nowrap", "ismap", "declare", "noshade", "checked",
    "disabled", "readonly", "multiple", "selected", "noresize",
    "defer"
])

#: List of HTML tags with an empty content model; these are
#: rendered in minimized form, e.g. ``<img />``.
#: From http://www.w3.org/TR/xhtml1/#dtds
EMPTY_HTML_TAGS = frozenset([
    "base", "meta", "link", "hr", "br", "param", "img", "area",
    "input", "col", "basefont", "isindex", "frame",
])

#: List of HTML elements that close open paragraph-level elements
#: and are themselves paragraph-level.
PARA_LEVEL_HTML_TAGS = frozenset([
    "h1", "h2", "h3", "h4", "h5", "h6", "p",
])

#: Tags that automatically close other tags.
BLOCK_CLOSING_TAG_MAP = {
    "tr": frozenset(["tr", "td", "th"]),
    "td": frozenset(["td", "th"]),
    "th": frozenset(["td", "th"]),
    "li": frozenset(["li"]),
    "dd": frozenset(["dd", "dt"]),
    "dt": frozenset(["dd", "dt"]),
}

#: List of HTML tags that denote larger sections than paragraphs.
BLOCK_LEVEL_HTML_TAGS = frozenset([
    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
    "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div", "nav",
])

#: Section level HTML tags
SECTION_LEVEL_HTML_TAGS = PARA_LEVEL_HTML_TAGS.union(BLOCK_LEVEL_HTML_TAGS)

TIGHTEN_IMPLICIT_CLOSE_TAGS = PARA_LEVEL_HTML_TAGS.union(BLOCK_CLOSING_TAG_MAP)


[docs]class NestingError(HTMLParseError):
    """Exception raised when elements aren't properly nested."""

    def __init__(self, tagstack, endtag, position=(None, None)):
        self.endtag = endtag
        if tagstack:
            if len(tagstack) == 1:
                msg = ('Open tag <%s> does not match close tag </%s>'
                       % (tagstack[0], endtag))
            else:
                msg = ('Open tags <%s> do not match close tag </%s>'
                       % ('>, <'.join(tagstack), endtag))
        else:
            msg = 'No tags are open to match </%s>' % endtag
        HTMLParseError.__init__(self, msg, position)


[docs]class EmptyTagError(NestingError):
    """Exception raised when empty elements have an end tag."""

    def __init__(self, tag, position=(None, None)):
        self.tag = tag
        msg = 'Close tag </%s> should be removed' % tag
        HTMLParseError.__init__(self, msg, position)


[docs]class OpenTagError(NestingError):
    """Exception raised when a tag is not allowed in another tag."""

    def __init__(self, tagstack, tag, position=(None, None)):
        self.tag = tag
        msg = 'Tag <{}> is not allowed in <{}>'.format(tag, tagstack[-1])
        HTMLParseError.__init__(self, msg, position)


[docs]class HTMLTALParser(HTMLParser):
    """
    Parser for HTML.

    After you call either :meth:`parseFile` and :meth:`parseString`
    you can retrieve the compiled program using :meth:`getCode`.
    """

    # External API

    def __init__(self, gen=None):
        """
        :keyword TALGenerator gen: The configured (with an expression compiler)
            code generator to use. If one is not given, a default will be used.
        """
        HTMLParser.__init__(self, **_html_parser_extras)
        if gen is None:
            gen = TALGenerator(xml=0)
        self.gen = gen
        self.tagstack = []
        self.nsstack = []
        self.nsdict = {
            'tal': ZOPE_TAL_NS,
            'metal': ZOPE_METAL_NS,
            'i18n': ZOPE_I18N_NS,
        }

[docs]    def parseFile(self, file):
        """Parse data in the given file."""
        with open(file) as f:
            data = f.read()

        try:
            self.parseString(data)
        except TALError as e:
            e.setFile(file)
            raise

[docs]    def parseString(self, data):
        """Parse data in the given string."""
        self.feed(data)
        self.close()
        while self.tagstack:
            self.implied_endtag(self.tagstack[-1], 2)
        assert self.nsstack == [], self.nsstack

[docs]    def getCode(self):
        """
        After parsing, this returns ``(program, macros)``.
        """
        return self.gen.getCode()

    # Overriding HTMLParser methods

    def handle_starttag(self, tag, attrs):
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict, i18ndict \
            = self.process_ns(tag, attrs)
        if tag in EMPTY_HTML_TAGS and "content" in taldict:
            raise TALError(
                "empty HTML tags cannot use tal:content: %s" % repr(tag),
                self.getpos())
        # Support for inline Python code.
        if tag == 'script':
            type_attr = [a for a in attrlist if a[0] == "type"]
            if type_attr and type_attr[0][1].startswith('text/server-'):
                attrlist.remove(type_attr[0])
                taldict = {'script': type_attr[0][1], 'omit-tag': ''}
        self.tagstack.append(tag)
        self.gen.emitStartElement(tag, attrlist, taldict, metaldict, i18ndict,
                                  self.getpos())
        if tag in EMPTY_HTML_TAGS:
            self.implied_endtag(tag, -1)

    def handle_startendtag(self, tag, attrs):
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict, i18ndict \
            = self.process_ns(tag, attrs)
        if "content" in taldict:
            if tag in EMPTY_HTML_TAGS:
                raise TALError(
                    "empty HTML tags cannot use tal:content: %s" % repr(tag),
                    self.getpos())
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      i18ndict, self.getpos())
            self.gen.emitEndElement(tag, implied=-1, position=self.getpos())
        else:
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      i18ndict, self.getpos(), isend=1)
        self.pop_xmlns()

    def handle_endtag(self, tag):
        if self.tagstack and self.tagstack[-1] == 'script' and tag != 'script':
            self.handle_data('</%s>' % tag)
            return
        if tag in EMPTY_HTML_TAGS:
            # </img> etc. in the source is an error
            raise EmptyTagError(tag, self.getpos())
        self.close_enclosed_tags(tag)
        self.gen.emitEndElement(tag, position=self.getpos())
        self.pop_xmlns()
        self.tagstack.pop()

    def close_para_tags(self, tag):
        if tag in EMPTY_HTML_TAGS:
            return
        close_to = -1
        if tag in BLOCK_CLOSING_TAG_MAP:
            blocks_to_close = BLOCK_CLOSING_TAG_MAP[tag]
            for i, t in enumerate(self.tagstack):
                if t in blocks_to_close:
                    if close_to == -1:
                        close_to = i
                elif t in BLOCK_LEVEL_HTML_TAGS:
                    close_to = -1
        elif tag in SECTION_LEVEL_HTML_TAGS:
            for i in range(len(self.tagstack) - 1, -1, -1):
                closetag = self.tagstack[i]
                if closetag in BLOCK_LEVEL_HTML_TAGS:
                    break
                elif closetag in PARA_LEVEL_HTML_TAGS:
                    if closetag != "p":
                        raise OpenTagError(self.tagstack, tag, self.getpos())
                    close_to = i
        if close_to >= 0:
            while len(self.tagstack) > close_to:
                self.implied_endtag(self.tagstack[-1], 1)

    def close_enclosed_tags(self, tag):
        if tag not in self.tagstack:
            raise NestingError(self.tagstack, tag, self.getpos())
        while tag != self.tagstack[-1]:
            self.implied_endtag(self.tagstack[-1], 1)
        assert self.tagstack[-1] == tag

    def implied_endtag(self, tag, implied):
        assert tag == self.tagstack[-1]
        assert implied in (-1, 1, 2)
        isend = (implied < 0)
        if tag in TIGHTEN_IMPLICIT_CLOSE_TAGS:
            # Pick out trailing whitespace from the program, and
            # insert the close tag before the whitespace.
            white = self.gen.unEmitWhitespace()
        else:
            white = None
        self.gen.emitEndElement(tag, isend=isend, implied=implied,
                                position=self.getpos())
        if white:
            self.gen.emitRawText(white)
        self.tagstack.pop()
        self.pop_xmlns()

    def handle_charref(self, name):
        self.gen.emitRawText("&#%s;" % name)

    def handle_entityref(self, name):
        self.gen.emitRawText("&%s;" % name)

    def handle_data(self, data):
        self.gen.emitRawText(data)

    def handle_comment(self, data):
        self.gen.emitRawText("<!--%s-->" % data)

    def handle_decl(self, data):
        self.gen.emitRawText("<!%s>" % data)

    def handle_pi(self, data):
        self.gen.emitRawText("<?%s>" % data)

    # Internal thingies

    def scan_xmlns(self, attrs):
        nsnew = {}
        for key, value in attrs:
            if key.startswith("xmlns:"):
                nsnew[key[6:]] = value
        self.nsstack.append(self.nsdict)
        if nsnew:
            self.nsdict = self.nsdict.copy()
            self.nsdict.update(nsnew)

    def pop_xmlns(self):
        self.nsdict = self.nsstack.pop()

    _namespaces = {
        ZOPE_TAL_NS: "tal",
        ZOPE_METAL_NS: "metal",
        ZOPE_I18N_NS: "i18n",
    }

    def fixname(self, name):
        if ':' in name:
            prefix, suffix = name.split(':', 1)
            if prefix == 'xmlns':
                nsuri = self.nsdict.get(suffix)
                if nsuri in self._namespaces:
                    return name, name, prefix
            else:
                nsuri = self.nsdict.get(prefix)
                if nsuri in self._namespaces:
                    return name, suffix, self._namespaces[nsuri]
        return name, name, 0

    def process_ns(self, name, attrs):
        attrlist = []
        taldict = {}
        metaldict = {}
        i18ndict = {}
        name, namebase, namens = self.fixname(name)
        for item in attrs:
            key, value = item
            key, keybase, keyns = self.fixname(key)
            ns = keyns or namens  # default to tag namespace
            if ns and ns != 'unknown':
                item = (key, value, ns)
            if ns == 'tal':
                if keybase in taldict:
                    raise TALError("duplicate TAL attribute " +
                                   repr(keybase), self.getpos())
                taldict[keybase] = value
            elif ns == 'metal':
                if keybase in metaldict:
                    raise METALError("duplicate METAL attribute " +
                                     repr(keybase), self.getpos())
                metaldict[keybase] = value
            elif ns == 'i18n':
                if keybase in i18ndict:
                    raise I18NError("duplicate i18n attribute " +
                                    repr(keybase), self.getpos())
                i18ndict[keybase] = value
            attrlist.append(item)
        if namens in ('metal', 'tal', 'i18n'):
            taldict['tal tag'] = namens
        return name, attrlist, taldict, metaldict, i18ndict