# Source code for xml4h.impls.xml_etree_elementtree

import re
import copy

import six

from xml4h.impls.interface import XmlImplAdapter
from xml4h import nodes, exceptions

# Import the pure-Python ElementTree implementation, if possible
try:
    import xml.etree.ElementTree as PythonET
    # Re-import non-C ElementTree with a definitive name, for cases where we
    # must explicitly use non-C-based elements of ElementTree.
    import xml.etree.ElementTree as BaseET
except ImportError:
    pass

# Import the C-based ElementTree implementation, if possible
try:
    import xml.etree.cElementTree as cET
except ImportError:
    pass


class ElementTreeAdapter(XmlImplAdapter):
    """
    Adapter to the
    `ElementTree <http://docs.python.org/2/library/xml.etree.elementtree.html>`_
    XML library.

    This code *must* work with either the base ElementTree pure python
    implementation or the C-based cElementTree implementation, since it is
    reused in the `cElementTree` class defined below.
    """

    ET = PythonET  # Use the pure-Python implementation

    SUPPORTED_FEATURES = {
        'xpath': True,
        }

[docs]    @classmethod
    def is_available(cls):
        # Is vital piece of ElementTree module available at all?
        try:
            cls.ET.Element
        except:
            return False
        # We only support ElementTree version 1.3+
        from distutils.version import StrictVersion
        return StrictVersion(BaseET.VERSION) >= StrictVersion('1.3')

    @classmethod
    def parse_string(cls, xml_str, ignore_whitespace_text_nodes=True):
        """
        Parse XML held in a text string.

        The string is wrapped in an in-memory text stream which is then
        handed to :meth:`parse_file`; the result of that call is returned.
        """
        text_stream = six.StringIO(xml_str)
        return cls.parse_file(
            text_stream,
            ignore_whitespace_text_nodes=ignore_whitespace_text_nodes)

    @classmethod
    def parse_bytes(cls, xml_bytes, ignore_whitespace_text_nodes=True):
        """
        Parse XML held in a byte string.

        The bytes are wrapped in an in-memory binary stream which is then
        handed to :meth:`parse_file`; the result of that call is returned.
        """
        byte_stream = six.BytesIO(xml_bytes)
        return cls.parse_file(
            byte_stream,
            ignore_whitespace_text_nodes=ignore_whitespace_text_nodes)

    @classmethod
    def parse_file(cls, xml_file_path, ignore_whitespace_text_nodes=True):
        """
        Parse an XML source and return the wrapped document.

        ``xml_file_path`` is passed straight to ``iterparse``; the sibling
        ``parse_string`` and ``parse_bytes`` methods pass in-memory stream
        objects through this same argument.

        Namespace declarations are re-added as explicit ``xmlns`` /
        ``xmlns:prefix`` attributes on the element they precede, using
        iterative parsing per:
        effbot.org/zone/element-namespaces.htm#preserving-existing-namespace-attributes
        """
        root_elem = None
        pending_ns = []
        parse_events = cls.ET.iterparse(xml_file_path, ('start', 'start-ns'))
        for event, payload in parse_events:
            if event == 'start-ns':
                # Payload is a (prefix, uri) pair; hold it until the element
                # that declares it is started
                pending_ns.append(payload)
                continue
            if event != 'start':
                continue
            # The first started element is the document root
            if root_elem is None:
                root_elem = payload
            # Attach every namespace declared since the previous element
            for ns_prefix, ns_uri in pending_ns:
                attr_name = ('xmlns:%s' % ns_prefix) if ns_prefix else 'xmlns'
                payload.set(attr_name, ns_uri)
            # Declarations have been consumed by this element
            pending_ns = []

        document = cls.wrap_document(cls.ET.ElementTree(root_elem))
        if ignore_whitespace_text_nodes:
            cls.ignore_whitespace_text_nodes(document)
        return document

    @classmethod
    def new_impl_document(cls, root_tagname, ns_uri=None, **kwargs):
        """
        Create and return a new ElementTree document with a single root
        element.

        The root element's tag is built as ``{ns_uri}root_tagname``. When no
        ``ns_uri`` is given, ``nodes.Node.XMLNS_URI`` is used in its place.
        Extra keyword arguments are accepted but not used here.
        """
        if ns_uri is None:
            ns_uri = nodes.Node.XMLNS_URI
        root_elem = cls.ET.Element('{%s}%s' % (ns_uri, root_tagname))
        return cls.ET.ElementTree(root_elem)

    # This method is called by interface super-class's __init__
[docs]    def clear_caches(self):
        self.CACHED_ANCESTRY_DICT = {}

    def _lookup_node_parent(self, node):
        """
        Return the parent of the given node, based on an internal dictionary
        mapping of child nodes to the child's parent required since
        ElementTree doesn't make info about node ancestry/parentage available.
        """
        # Basic caching of our internal ancestry dict to help performance
        if not node in self.CACHED_ANCESTRY_DICT:
            # Given node isn't in cached ancestry dictionary, rebuild this now
            ancestry_dict = dict(
                (c, p) for p in self._impl_document.getiterator() for c in p)
            self.CACHED_ANCESTRY_DICT = ancestry_dict
        return self.CACHED_ANCESTRY_DICT[node]

    def _is_node_an_element(self, node):
        """
        Return True if the given node is an ElementTree Element, a fact that
        can be tricky to determine if the cElementTree implementation is
        used.
        """
        # Try the simplest approach first, works for plain old ElementTree
        if isinstance(node, BaseET.Element):
            return True
        # For cElementTree we need to be more cunning (or find a better way)
        if hasattr(node, 'makeelement') \
                and isinstance(node.tag, six.string_types):
            return True

    def map_node_to_class(self, node):
        """
        Return the ``nodes`` wrapper class that corresponds to the given
        implementation node.

        Raises ``exceptions.Xml4hImplementationBug`` when the node matches
        none of the recognised kinds.
        """
        # Documents are checked first: the remaining checks read `node.tag`
        if isinstance(node, BaseET.ElementTree):
            return nodes.Document
        tag = node.tag
        if tag == BaseET.ProcessingInstruction:
            return nodes.ProcessingInstruction
        if tag == BaseET.Comment:
            return nodes.Comment
        if isinstance(node, ETAttribute):
            return nodes.Attribute
        if isinstance(node, ElementTreeText):
            return nodes.CDATA if node.is_cdata else nodes.Text
        if self._is_node_an_element(node):
            return nodes.Element
        raise exceptions.Xml4hImplementationBug(
            'Unrecognized type for implementation node: %s' % node)

    def get_impl_root(self, node):
        """
        Return the root element of this adapter's document.

        The ``node`` argument is not consulted.
        """
        document = self._impl_document
        return document.getroot()

    # Document implementation methods

    def new_impl_element(self, tagname, ns_uri=None, parent=None):
        """
        Create and return a new, unattached element.

        Without ``ns_uri`` the tag name is used exactly as given. With
        ``ns_uri`` any ``prefix:`` part of the tag name is dropped and the
        element tag becomes ``{ns_uri}local``. ``parent`` is not used.
        """
        if ns_uri is None:
            return self.ET.Element(tagname)
        if ':' in tagname:
            tagname = tagname.split(':')[1]
        return self.ET.Element('{%s}%s' % (ns_uri, tagname))

    def new_impl_text(self, text):
        """
        Return a new, parentless ``ElementTreeText`` holding the given text.
        """
        return ElementTreeText(text, parent=None, is_cdata=False)

    def new_impl_comment(self, text):
        """
        Return a new comment node with the given text, built with the
        adapter's ElementTree module.
        """
        make_comment = self.ET.Comment
        return make_comment(text)

    def new_impl_instruction(self, target, data):
        """
        Return a new processing instruction node for the given target and
        data, built with the adapter's ElementTree module.
        """
        make_instruction = self.ET.ProcessingInstruction
        return make_instruction(target, data)

    def new_impl_cdata(self, text):
        """
        Return a new, parentless ``ElementTreeText`` flagged as CDATA.
        """
        return ElementTreeText(text, parent=None, is_cdata=True)

[docs]    def find_node_elements(self, node, name='*', ns_uri='*'):
        # TODO Any proper way to find namespaced elements by name?
        name_match_nodes = node.getiterator()
        # Filter nodes by name and ns_uri if necessary
        results = []
        for n in name_match_nodes:
            # Ignore the current node
            if n == node:
                continue
            # Ignore non-Elements
            if not isinstance(n.tag, six.string_types):
                continue
            if ns_uri != '*' and self.get_node_namespace_uri(n) != ns_uri:
                continue
            if name != '*' and self.get_node_local_name(n) != name:
                continue
            results.append(n)
        return results
    find_node_elements.__doc__ = XmlImplAdapter.find_node_elements.__doc__

[docs]    def xpath_on_node(self, node, xpath, **kwargs):
        """
        Return result of performing the given XPath query on the given node.

        All known namespace prefix-to-URI mappings in the document are
        automatically included in the XPath invocation.

        If an empty/default namespace (i.e. None) is defined, this is
        converted to the prefix name '_' so it can be used despite empty
        namespace prefixes being unsupported by XPath.
        """
        namespaces_dict = {}
        if 'namespaces' in kwargs:
            namespaces_dict.update(kwargs['namespaces'])
        # Empty namespace prefix is not supported, convert to '_' prefix
        if None in namespaces_dict:
            default_ns_uri = namespaces_dict.pop(None)
            namespaces_dict['_'] = default_ns_uri
        # If no default namespace URI defined, use root's namespace (if any)
        if not '_' in namespaces_dict:
            root = self.get_impl_root(node)
            qname, ns_uri, prefix, local_name = self._unpack_name(
                root.tag, root)
            if ns_uri:
                namespaces_dict['_'] = ns_uri
        # Include XMLNS namespace if it's not already defined
        if not 'xmlns' in namespaces_dict:
            namespaces_dict['xmlns'] = nodes.Node.XMLNS_URI
        return node.findall(xpath, namespaces_dict)

    # Node implementation methods

    def get_node_namespace_uri(self, node):
        """
        Return the namespace URI of the given node, or None if it has none.

        The URI is taken from a ``{uri}local`` style tag when present, from
        an attribute wrapper's own ``namespace_uri``, or otherwise derived by
        unpacking an element's tag name.
        """
        tag = node.tag
        # Comment and processing instruction nodes have a non-string tag (it
        # is compared against BaseET.Comment / BaseET.ProcessingInstruction
        # elsewhere in this class). Substring tests on such a tag would raise
        # TypeError, and these nodes have no namespace.
        if not isinstance(tag, six.string_types):
            return None
        if '}' in tag:
            # Strip the leading '{' from the '{uri' part before the '}'
            return tag.split('}')[0][1:]
        elif isinstance(node, ETAttribute):
            return node.namespace_uri
        elif self._is_node_an_element(node):
            qname, ns_uri = self._unpack_name(tag, node)[:2]
            return ns_uri
        return None

    def set_node_namespace_uri(self, node, ns_uri):
        """
        Rewrite the node's tag as ``{ns_uri}local``, where ``local`` is the
        local name obtained by unpacking the node's current tag.
        """
        local_name = self._unpack_name(node.tag, node)[3]
        node.tag = '{%s}%s' % (ns_uri, local_name)

    def get_node_parent(self, node):
        """
        Return the parent of the given implementation node.

        A document has no parent (None). Nodes offering ``getparent()`` are
        asked directly. The root element's parent is the document itself, and
        any other node is resolved through the cached ancestry lookup.
        """
        # Root document has no parent
        if isinstance(node, BaseET.ElementTree):
            return None
        if hasattr(node, 'getparent'):
            return node.getparent()
        # Return ElementTree as root element's parent
        if node == self.get_impl_root(node):
            return self._impl_document
        return self._lookup_node_parent(node)

    def get_node_children(self, node):
        """
        Return a list of the given node's children.

        A document's only child is its root element. For an element the list
        holds its child elements, preceded by an ``ElementTreeText`` wrapper
        when the element has text. Any other kind of node has no children.
        """
        if isinstance(node, BaseET.ElementTree):
            return [node.getroot()]
        # Only element-like nodes have children. Elements are recognised by
        # type as well as by the legacy `getchildren` attribute, because that
        # method no longer exists from Python 3.9 and testing for it alone
        # would make every element look childless.
        if not (isinstance(node, BaseET.Element)
                or hasattr(node, 'getchildren')):
            return []
        # Iterating an element yields its direct children
        children = list(node)
        # Hack to treat text attribute as child text nodes
        if node.text is not None:
            children.insert(0, ElementTreeText(node.text, parent=node))
        return children

    def get_node_name(self, node):
        """
        Return the name of the given node.

        Comments are named ``#comment``. A processing instruction is named by
        its target. Any other node is named ``prefix:local`` when it has a
        name prefix, else just by its local name.
        """
        if node.tag == BaseET.Comment:
            return '#comment'
        elif node.tag == BaseET.ProcessingInstruction:
            # The text holds the target, optionally followed by a space and
            # the data. Split on the first space only so data that itself
            # contains spaces (or is absent) doesn't break the lookup.
            return node.text.split(' ', 1)[0]
        prefix = self.get_node_name_prefix(node)
        local_name = self.get_node_local_name(node)
        if prefix is not None:
            return '%s:%s' % (prefix, local_name)
        return local_name

    def get_node_local_name(self, node):
        """
        Return the node's tag with any ``{...}`` namespace part removed.
        """
        tag = node.tag
        return re.sub('{.*}', '', tag)

    def get_node_name_prefix(self, node):
        """
        Return the namespace prefix to use in the node's name, or None when
        the name should carry no prefix.
        """
        # Ignore non-elements
        if not isinstance(node.tag, six.string_types):
            return None
        # Believe nodes that have their own prefix (likely only ETAttribute)
        own_prefix = getattr(node, 'prefix', None)
        if own_prefix:
            return own_prefix
        # Derive prefix by unpacking node name
        ns_uri, derived_prefix = self._unpack_name(node.tag, node)[1:3]
        if not derived_prefix:
            return None
        # Don't add unnecessary excess namespace prefixes for elements with
        # a local default namespace declaration, nor for the default ns
        if node.attrib.get('xmlns') == ns_uri or derived_prefix == 'xmlns':
            return None
        return derived_prefix

    def get_node_value(self, node):
        """
        Return the value of the given node.

        For a processing instruction this is the data following the target
        (an empty string when there is none). Nodes with a ``value``
        attribute return it; comments and all other nodes return their text.
        """
        if node.tag == BaseET.ProcessingInstruction:
            # The text holds the target, optionally followed by a space and
            # the data. Everything after the first space is the data, which
            # may itself contain spaces.
            return node.text.partition(' ')[2]
        elif node.tag == BaseET.Comment:
            return node.text
        elif hasattr(node, 'value'):
            return node.value
        else:
            return node.text

    def set_node_value(self, node, value):
        """
        Assign the node's ``value`` attribute when it has one, otherwise set
        the node's text through :meth:`set_node_text`.
        """
        if not hasattr(node, 'value'):
            self.set_node_text(node, value)
            return
        node.value = value

    def get_node_text(self, node):
        """
        Return the node's ``text`` attribute unchanged.
        """
        text = node.text
        return text

    def set_node_text(self, node, text):
        """
        Assign the given text to the node's ``text`` attribute.
        """
        setattr(node, 'text', text)

    def get_node_attributes(self, element, ns_uri=None):
        """
        Return a list of ``ETAttribute`` wrappers, one per distinct qualified
        name among the element's attributes.

        The ``ns_uri`` argument is currently not used to filter the result.
        """
        # TODO: Filter by ns_uri
        wrappers_by_qname = {}
        for attr_name, attr_value in list(element.attrib.items()):
            qname, attr_ns_uri, prefix, local_name = self._unpack_name(
                attr_name, element)
            wrappers_by_qname[qname] = ETAttribute(
                qname, attr_ns_uri, prefix, local_name, attr_value, element)
        return list(wrappers_by_qname.values())

    def has_node_attribute(self, element, name, ns_uri=None):
        """
        Return True if the element has an attribute whose qualified name
        equals ``name``.
        """
        attributes = self.get_node_attributes(element, ns_uri)
        return any(attr.qname == name for attr in attributes)

    def get_node_attribute_node(self, element, name, ns_uri=None):
        """
        Return the first attribute wrapper of the element whose qualified
        name equals ``name``, or None when there is no such attribute.
        """
        matching = (
            attr for attr in self.get_node_attributes(element, ns_uri)
            if attr.qname == name)
        return next(matching, None)

    def get_node_attribute_value(self, element, name, ns_uri=None):
        """
        Return the value of the element's attribute with the given name, or
        None when there is no such attribute.

        When ``ns_uri`` is given, the name looked up is ``prefix:name`` using
        the prefix found for that URI.
        """
        wanted_qname = name
        if ns_uri is not None:
            prefix = self.lookup_ns_prefix_for_uri(element, ns_uri)
            wanted_qname = '%s:%s' % (prefix, name)
        for attr in self.get_node_attributes(element, ns_uri):
            if attr.qname == wanted_qname:
                return attr.value
        return None

    def set_node_attribute_value(self, element, name, value, ns_uri=None):
        """
        Set an attribute on the element.

        A ``prefix:local`` name is split, and when no ``ns_uri`` is given the
        prefix is used to look one up. With a namespace URI the attribute is
        stored under ``{ns_uri}local``, except that an ``xmlns`` attribute in
        the XMLNS namespace is stored under the plain name ``xmlns``.
        """
        prefix = None
        if ':' in name:
            prefix, name = name.split(':')
        if ns_uri is None and prefix is not None:
            ns_uri = self.lookup_ns_uri_by_attr_name(element, prefix)
        if ns_uri is not None:
            name = '{%s}%s' % (ns_uri, name)
        xmlns_marker = '{%s}' % nodes.Node.XMLNS_URI
        if name.startswith(xmlns_marker) and name.split('}')[1] == 'xmlns':
            # Hack to remove namespace URI from 'xmlns' attributes so
            # the name is just a simple string
            name = 'xmlns'
        element.attrib[name] = value

    def remove_node_attribute(self, element, name, ns_uri=None):
        """
        Remove the named attribute from the element, if it is present.

        With ``ns_uri`` the attribute key is ``{ns_uri}name``. Otherwise a
        ``prefix:local`` name (prefix other than ``xmlns``) is resolved to a
        key through the matching ``xmlns:prefix`` declaration.
        """
        if ns_uri is not None:
            name = '{%s}%s' % (ns_uri, name)
        elif ':' in name:
            prefix, local_name = name.split(':')
            if prefix != 'xmlns':
                ns_uri = self.lookup_ns_uri_by_attr_name(
                    element, 'xmlns:%s' % prefix)
                name = '{%s}%s' % (ns_uri, local_name)
        # Absent attributes are ignored
        element.attrib.pop(name, None)

    def add_node_child(self, parent, child, before_sibling=None):
        """
        Add a child node to the given parent.

        Text children are merged into the parent's ``text`` attribute and
        None is returned. Other children are inserted before
        ``before_sibling`` when it is given (or at the end when it is not
        found among the parent's children), else appended; the child is
        returned. In both cases the child-to-parent cache is updated.
        """
        if isinstance(child, ElementTreeText):
            # Add text values directly to parent's 'text' attribute
            if parent.text is not None:
                parent.text = parent.text + child.text
            else:
                parent.text = child.text
            self.CACHED_ANCESTRY_DICT[child] = parent
            return None
        if before_sibling is not None:
            # Iterate the parent directly to visit its children; the
            # `getchildren()` method no longer exists from Python 3.9.
            siblings = list(parent)
            offset = len(siblings)
            for index, sibling in enumerate(siblings):
                if sibling == before_sibling:
                    offset = index
                    break
            parent.insert(offset, child)
        else:
            parent.append(child)
        self.CACHED_ANCESTRY_DICT[child] = parent
        return child

    def import_node(self, parent, node, original_parent=None, clone=False):
        """
        Add a copy of ``node`` to ``parent``; unless ``clone`` is set, also
        remove the original node from where it came from.

        A non-text original is removed from ``original_parent``. A text
        original has its content removed from the text of its own parent.
        """
        source_node = node
        # We always clone for (c)ElementTree adapter so we can remove original
        # if necessary
        self.add_node_child(parent, self.clone_node(node))
        if clone:
            return
        if not isinstance(source_node, ElementTreeText):
            original_parent.remove(source_node)
            return
        # Hack to remove text node content from original parent by manually
        # deleting matching text content
        text_parent = self.get_node_parent(source_node)
        if text_parent.text == source_node.text:
            # Must set to None if there would be no remaining text,
            # otherwise parent element won't realise it's empty
            text_parent.text = None
        else:
            text_parent.text = text_parent.text.replace(
                source_node.text, '', 1)

    def clone_node(self, node, deep=True):
        """
        Return a copy of the node: a deep copy by default, or a shallow copy
        when ``deep`` is false.
        """
        copier = copy.deepcopy if deep else copy.copy
        return copier(node)

    def remove_node_child(self, parent, child, destroy_node=True):
        """
        Remove ``child`` from ``parent``.

        For a text child, the text of the child's own recorded parent is
        cleared and nothing is returned. Otherwise the child is removed from
        ``parent``; it is then cleared and None returned when
        ``destroy_node`` is set, else the removed child is returned.
        """
        if isinstance(child, ElementTreeText):
            child._parent.text = None
            return
        parent.remove(child)
        if not destroy_node:
            return child
        child.clear()
        return None

    def lookup_ns_uri_by_attr_name(self, node, name):
        """
        Search the node and then each of its ancestors for an attribute with
        the given name, returning the first value found, or None when the
        document level is reached without a match.
        """
        current = node
        while True:
            if current is None or isinstance(current, BaseET.ElementTree):
                return None
            uri = self.get_node_attribute_value(current, name)
            if uri is not None:
                return uri
            current = self.get_node_parent(current)

    def lookup_ns_prefix_for_uri(self, node, uri):
        """
        Return the namespace prefix to use for the given namespace URI, or
        None if no prefix is known.

        The XMLNS URI always maps to ``xmlns``. Otherwise ElementTree's global
        registry is consulted, and when that yields nothing (or an ``ns<N>``
        style prefix) the node and its ancestors are searched for a namespace
        declaration attribute whose value is the URI.
        """
        if uri == nodes.Node.XMLNS_URI:
            return 'xmlns'
        result = None
        # Lookup namespace URI in ET's awful global namespace/prefix registry
        if hasattr(BaseET, '_namespace_map') and uri in BaseET._namespace_map:
            result = BaseET._namespace_map[uri]
            if result == '':
                result = None
        # Raw string so that `\d` reaches the regex engine as written rather
        # than being treated as an (invalid) string escape sequence.
        if result is None or re.match(r'ns\d', result):
            # We either have no namespace prefix in the global mapping, in
            # which case we will try looking for a matching xmlns attribute,
            # or we have a namespace prefix that was probably assigned
            # automatically by ElementTree and we'd rather use a
            # human-assigned prefix if available.
            curr_node = node
            while self._is_node_an_element(curr_node):
                for n, v in list(curr_node.attrib.items()):
                    if v == uri:
                        if n.startswith('xmlns:'):
                            return n.split(':')[1]
                        elif n.startswith('{%s}' % nodes.Node.XMLNS_URI):
                            return n.split('}')[1]
                curr_node = self.get_node_parent(curr_node)
        return result

    def _unpack_name(self, name, node):
        """
        Break a tag or attribute name into its parts.

        Returns a ``(qname, ns_uri, prefix, local_name)`` tuple. ``name`` may
        be the literal ``xmlns``, a ``{uri}local`` name, a ``prefix:local``
        name, or a plain name. When no prefix can be determined, both the
        qualified and local names are the name exactly as given.
        """
        xmlns_uri = nodes.Node.XMLNS_URI
        ns_uri = prefix = local_name = None
        if name == 'xmlns':
            # Namespace URI of 'xmlns' is a constant
            ns_uri = xmlns_uri
        elif '}' in name:
            # Namespace URI is contained in {}, find URI's defined prefix
            ns_uri, local_name = name.split('}')
            ns_uri = ns_uri[1:]
            prefix = self.lookup_ns_prefix_for_uri(node, ns_uri)
        elif ':' in name:
            # Namespace prefix is before ':', find prefix's defined URI
            prefix, local_name = name.split(':')
            if prefix == 'xmlns':
                # All 'xmlns' attributes are in XMLNS URI by definition
                ns_uri = xmlns_uri
            else:
                ns_uri = self.lookup_ns_uri_by_attr_name(node, prefix)
        # Catch case where a prefix other than 'xmlns' points at XMLNS URI
        if name != 'xmlns' and ns_uri == xmlns_uri:
            prefix = 'xmlns'
        # Construct fully-qualified name from prefix + local names
        if prefix is None:
            qname = local_name = name
        else:
            qname = '%s:%s' % (prefix, local_name)
        return (qname, ns_uri, prefix, local_name)


class ElementTreeText(object):
    """
    Simple node-like holder for a piece of text content, optionally flagged
    as CDATA, together with the parent it was created for (if any).
    """

    def __init__(self, text, parent=None, is_cdata=False):
        self._text, self._parent, self._is_cdata = text, parent, is_cdata

    def getparent(self):
        """Return the parent recorded at construction time (may be None)."""
        return self._parent

    @property
    def is_cdata(self):
        """The CDATA flag given at construction time."""
        return self._is_cdata

    @property
    def value(self):
        """The text content held by this node."""
        return self._text

    text = value  # Alias

    @property
    def prefix(self):
        """Text nodes never have a namespace prefix."""
        return None

    @property
    def tag(self):
        """Pseudo tag name: ``#cdata-section`` for CDATA, else ``#text``."""
        return "#cdata-section" if self.is_cdata else "#text"


class ETAttribute(object):
    """
    Read-only record describing one attribute of an element: its qualified
    name, namespace URI, prefix, local name and value, plus the owning
    element.
    """

    def __init__(self, qname, ns_uri, prefix, local_name, value, element):
        self._qname = qname
        self._ns_uri = ns_uri
        self._prefix = prefix
        self._local_name = local_name
        self._value = value
        self._element = element

    def getroottree(self):
        """Delegate to the owning element's ``getroottree()``."""
        owner = self._element
        return owner.getroottree()

    @property
    def qname(self):
        """Qualified name given at construction time."""
        return self._qname

    @property
    def namespace_uri(self):
        """Namespace URI given at construction time."""
        return self._ns_uri

    @property
    def prefix(self):
        """Namespace prefix given at construction time."""
        return self._prefix

    @property
    def local_name(self):
        """Local name given at construction time."""
        return self._local_name

    @property
    def value(self):
        """Attribute value given at construction time."""
        return self._value

    name = tag = local_name  # Alias


class cElementTreeAdapter(ElementTreeAdapter):
    """
    Adapter to the C-based implementation of the
    `ElementTree <http://docs.python.org/2/library/xml.etree.elementtree.html>`_
    XML library.
    """

    # Use the C-based implementation. The import of `cET` at the top of this
    # module is guarded and may have failed (the cElementTree module does not
    # exist from Python 3.9), leaving the name unbound. Look it up
    # defensively so that defining this class cannot raise NameError; with
    # no implementation `ET` is None and `is_available` reports False.
    ET = globals().get('cET')

    @classmethod
    def is_available(cls):
        """
        Return True if the C-based implementation is usable.

        This requires the base adapter's availability check to pass and the
        C implementation to report a version of 1.0.6 or later.
        """
        if not super(cElementTreeAdapter, cls).is_available():
            return False
        # We only support cElementTree version 1.0.6+. Compare the leading
        # dotted-number part of the version string as a tuple of ints rather
        # than relying on `distutils`, which is no longer part of the
        # standard library from Python 3.12.
        match = re.match(r'\d+(?:\.\d+)*', str(cls.ET.VERSION))
        if match is None:
            return False
        version = tuple(int(part) for part in match.group(0).split('.'))
        return version >= (1, 0, 6)