| from __future__ import absolute_import, division, unicode_literals |
| |
| from xml.dom import Node |
| from ..constants import namespaces, voidElements, spaceCharacters |
| |
# Names exported by ``from <this module> import *``.
__all__ = [
    "DOCUMENT",
    "DOCTYPE",
    "TEXT",
    "ELEMENT",
    "COMMENT",
    "ENTITY",
    "UNKNOWN",
    "TreeWalker",
    "NonRecursiveTreeWalker",
]

# Node-type tags compared against by the walkers; they reuse the ``nodeType``
# constants defined on ``xml.dom.Node``.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
# String marker (not an ``xml.dom`` constant) for unrecognised node types.
UNKNOWN = "<#UNKNOWN#>"

# Join the imported collection of space characters into a single string so it
# can be handed directly to ``str.lstrip`` / ``str.rstrip``.
spaceCharacters = "".join(spaceCharacters)
| |
| |
class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    This base class only provides the token factories; iteration itself must
    be implemented by a subclass.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        """Iterates over the tokens of the tree

        :raises NotImplementedError: always; subclasses must override this

        """
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token

        This is a generator: it always yields the EmptyTag token and, when
        ``hasChildren`` is true, a SerializeError token right after it.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializeError because
            this tag shouldn't have children

        :returns: generator of an EmptyTag token, optionally followed by a
            SerializeError token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text(' '))
            [{u'data': ' ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        # Peel off the leading run of space characters; whatever lstrip
        # removed is exactly the prefix of that length.
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        # Now split the remainder into its non-space core and trailing run.
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: value stored under the token's ``name`` key

        :arg publicId: value stored under the token's ``publicId`` key

        :arg systemId: value stored under the token's ``systemId`` key

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types

        :arg nodeType: the unrecognised node type; may be a string or any
            other object (e.g. an integer ``nodeType`` constant)

        :returns: SerializeError token naming the node type

        """
        # Format rather than concatenate: node types are frequently integers
        # and ``str + int`` would raise TypeError instead of reporting the
        # problem. The one-tuple keeps tuple-valued node types from being
        # unpacked as multiple format arguments.
        return self.error("Unknown node type: %s" % (nodeType,))
| |
| |
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses ``self.tree`` with an explicit loop

    Subclasses provide four navigation primitives (node details, first
    child, next sibling, parent); ``__iter__`` uses them to emit tokens in
    document order without recursion.

    """
    def getNodeDetails(self, node):
        """Describes ``node`` as a sequence whose first item is its type

        As consumed by ``__iter__``, the items after the type are:

        * ``DOCTYPE``: the positional arguments for ``doctype``
        * ``TEXT``: the positional arguments for ``text``
        * ``ELEMENT``: ``namespace, name, attributes, hasChildren``
        * ``COMMENT``: the comment data
        * ``ENTITY``: the entity name
        * ``DOCUMENT``: nothing that is read
        * anything else: the value handed to ``unknown``

        :arg node: the node to describe

        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node``, or ``None`` if there is none

        :arg node: the node whose children are wanted

        """
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the sibling following ``node``, or ``None`` if it is last

        :arg node: the node whose sibling is wanted

        """
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node``

        :arg node: the node whose parent is wanted

        """
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for ``self.tree`` in document order"""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind, rest = info[0], info[1:]
            descend = False

            if kind == DOCTYPE:
                yield self.doctype(*rest)

            elif kind == TEXT:
                for tok in self.text(*rest):
                    yield tok

            elif kind == ELEMENT:
                namespace, name, attributes, descend = rest
                isVoid = ((not namespace or namespace == namespaces["html"]) and
                          name in voidElements)
                if isVoid:
                    for tok in self.emptyTag(namespace, name, attributes,
                                             descend):
                        yield tok
                    # Never walk into a void element, whatever the tree says.
                    descend = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif kind == COMMENT:
                yield self.comment(rest[0])

            elif kind == ENTITY:
                yield self.entity(rest[0])

            elif kind == DOCUMENT:
                descend = True

            else:
                yield self.unknown(rest[0])

            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # Nothing below this node: climb upwards, closing every non-void
            # element on the way, until a next sibling is found or the root
            # of the walk has been closed.
            while node is not None:
                info = self.getNodeDetails(node)
                kind, rest = info[0], info[1:]
                if kind == ELEMENT:
                    namespace, name, attributes, _hasChildren = rest
                    isVoid = ((not namespace or namespace == namespaces["html"]) and
                              name in voidElements)
                    if not isVoid:
                        yield self.endTag(namespace, name)
                if node is self.tree:
                    node = None
                    break
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)