| """A collection of modules for building different kinds of trees from HTML |
| documents. |
| |
| To create a treebuilder for a new type of tree, you need to |
| implement several things: |
| |
| 1. A set of classes for various types of elements: Document, Doctype, Comment, |
| Element. These must implement the interface of ``treebuilders.base.Node`` |
| (although comment nodes have a different signature for their constructor, |
| see ``treebuilders.etree.Comment``). Textual content may also be implemented |
| as another node type, or not, as your tree implementation requires. |
| |
| 2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits |
| from ``treebuilders.base.TreeBuilder``. This has 4 required attributes: |
| |
| * ``documentClass`` - the class to use for the bottommost node of a document |
| * ``elementClass`` - the class to use for HTML Elements |
| * ``commentClass`` - the class to use for comments |
| * ``doctypeClass`` - the class to use for doctypes |
| |
| It also has one required method: |
| |
| * ``getDocument`` - Returns the root node of the complete document tree |
| |
| 3. If you wish to run the unit tests, you must also create a ``testSerializer`` |
| method on your treebuilder which accepts a node and returns a string |
| containing the node and its children serialized according to the format used in |
| the unit tests. |
| |
| """ |
| |
| from __future__ import absolute_import, division, unicode_literals |
| |
| from .._utils import default_etree |
| |
# Module-level cache of TreeBuilder classes, keyed by lower-cased tree type
# name. getTreeBuilder() only stores the "lxml" builder here; the "dom" and
# "etree" builders are cached by their own submodules.
treeBuilderCache = {}
| |
| |
def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of trees with built-in support

    :arg treeType: the name of the tree type required (case-insensitive). Supported
        values are:

        * "dom" - A generic builder for DOM implementations, defaulting to a
          xml.dom.minidom based implementation.
        * "etree" - A generic builder for tree implementations exposing an
          ElementTree-like interface, defaulting to xml.etree.cElementTree if
          available and xml.etree.ElementTree if not.
        * "lxml" - An etree-based builder for lxml.etree, handling limitations
          of lxml's implementation.

    :arg implementation: (Currently applies to the "etree" and "dom" tree
        types). A module implementing the tree type e.g. xml.etree.ElementTree
        or xml.etree.cElementTree. Not used for "lxml".

    :arg kwargs: Any additional options; they are forwarded to
        ``dom.getDomModule`` / ``etree.getETreeModule`` for the "dom" and
        "etree" tree types and are not used for "lxml".

    :returns: the ``TreeBuilder`` class for the requested tree type

    :raises ValueError: if ``treeType`` is not one of the supported values

    Example:

    >>> from html5lib.treebuilders import getTreeBuilder
    >>> builder = getTreeBuilder('etree')

    """

    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            # Imported lazily so the submodule is only loaded when requested
            from . import dom
            # Come up with a sane default (pref. from the stdlib)
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "lxml":
            from . import etree_lxml
            # The lxml builder takes no implementation/kwargs, so the class
            # itself can be cached under its tree type name
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)