slixmpp/sleekxmpp/xmlstream/tostring.py

# -*- coding: utf-8 -*-
"""
    sleekxmpp.xmlstream.tostring
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    This module converts XML objects into Unicode strings and
    intelligently includes namespaces only when necessary to
    keep the output readable.

    Part of SleekXMPP: The Sleek XMPP Library

    :copyright: (c) 2011 Nathanael C. Fritz
    :license: MIT, see LICENSE for more details
"""

from __future__ import unicode_literals

import sys

# The ``types`` module is only needed on Python 2, where escape() compares
# the type of its input against ``types.UnicodeType`` before decoding it.
if sys.version_info < (3, 0):
    import types


# Namespace URI bound to the reserved ``xml`` prefix. tostring() writes
# attributes in this namespace as ``xml:<name>`` without declaring a prefix.
XML_NS = 'http://www.w3.org/XML/1998/namespace'


def tostring(xml=None, xmlns='', stream=None, outbuffer='',
             top_level=False, open_only=False, namespaces=None):
    """Serialize an XML object to a Unicode string.

    If an outer xmlns is provided using ``xmlns``, then the current element's
    namespace will not be included if it matches the outer namespace. An
    exception is made for elements that have an attached stream, and appear
    at the stream root: for those (``top_level=True``) the declaration is
    also skipped when the namespace equals the stream's ``default_ns`` or
    ``stream_ns``.

    Text and tail content is escaped with :func:`escape`, honouring the
    stream's ``use_cdata`` setting. Attribute values are always escaped
    with character entities, because CDATA sections are not valid inside
    an attribute value.

    :param XML xml: The XML object to serialize.
    :param string xmlns: Optional namespace of an element wrapping the XML
                         object.
    :param stream: The XML stream that generated the XML object. Its
                   ``default_ns``, ``stream_ns``, ``use_cdata`` and
                   ``namespace_map`` attributes are consulted.
    :param string outbuffer: Optional buffer for storing serializations
                             during recursive calls. It is prepended to
                             the result.
    :param bool top_level: Indicates that the element is the outermost
                           element.
    :param bool open_only: Only emit the opening tag (with its
                           attributes), ignoring text, children and tail.
    :param set namespaces: Track which namespaces are in active use so
                           that new ones can be declared when needed.

    :type xml: :py:class:`~xml.etree.ElementTree.Element`
    :type stream: :class:`~sleekxmpp.xmlstream.xmlstream.XMLStream`

    :rtype: Unicode string
    """
    # Add previous results to the start of the output.
    output = [outbuffer]

    # Split the "{namespace}name" tag into namespace and local name.
    head, brace, rest = xml.tag.partition('}')
    if brace:
        tag_xmlns = head[1:]
        tag_name = rest
    else:
        tag_xmlns = ''
        tag_name = xml.tag

    default_ns = ''
    stream_ns = ''
    use_cdata = False

    if stream:
        default_ns = stream.default_ns
        stream_ns = stream.stream_ns
        use_cdata = stream.use_cdata

    # Decide whether the element needs its own xmlns declaration. An empty
    # namespace is never declared, so xmlns="" does not appear in output.
    namespace = ''
    if tag_xmlns:
        if top_level:
            declare = tag_xmlns not in (default_ns, xmlns, stream_ns)
        else:
            declare = tag_xmlns != xmlns
        if declare:
            namespace = ' xmlns="%s"' % tag_xmlns

    # Use the stream's prefix for the tag when one is mapped.
    if stream and tag_xmlns in stream.namespace_map:
        mapped_namespace = stream.namespace_map[tag_xmlns]
        if mapped_namespace:
            tag_name = "%s:%s" % (mapped_namespace, tag_name)
    output.append("<%s" % tag_name)
    output.append(namespace)

    # Output escaped attribute values.
    new_namespaces = set()
    for attrib, value in xml.attrib.items():
        # Attribute values may not contain CDATA sections (a literal '<' is
        # forbidden there), so always use entity escaping for them.
        value = escape(value, False)
        attrib_head, attrib_brace, attrib_name = attrib.partition('}')
        if not attrib_brace:
            output.append(' %s="%s"' % (attrib, value))
            continue

        attrib_ns = attrib_head[1:]
        if attrib_ns == XML_NS:
            # The xml: prefix is predefined and needs no declaration.
            output.append(' xml:%s="%s"' % (attrib_name, value))
        elif stream and attrib_ns in stream.namespace_map:
            mapped_ns = stream.namespace_map[attrib_ns]
            if mapped_ns:
                if namespaces is None:
                    namespaces = set()
                if attrib_ns not in namespaces:
                    # First use of this prefix in the current context:
                    # declare it and remember that we introduced it.
                    namespaces.add(attrib_ns)
                    new_namespaces.add(attrib_ns)
                    output.append(' xmlns:%s="%s"' % (
                        mapped_ns, attrib_ns))
                output.append(' %s:%s="%s"' % (
                    mapped_ns, attrib_name, value))
        # NOTE(review): attributes in a namespace that has no usable prefix
        # in stream.namespace_map (or when no stream is given) are omitted
        # from the output. This matches the previous behaviour; confirm it
        # is intended.

    if open_only:
        # Only output the opening tag, regardless of content.
        output.append(">")
        return ''.join(output)

    if len(xml) or xml.text:
        # The element has text and/or child elements.
        output.append(">")
        if xml.text:
            output.append(escape(xml.text, use_cdata))
        for child in xml:
            output.append(tostring(child, tag_xmlns, stream,
                                   namespaces=namespaces))
        output.append("</%s>" % tag_name)
    else:
        # Empty element.
        output.append(" />")

    if xml.tail:
        # If there is additional text after the element.
        output.append(escape(xml.tail, use_cdata))

    if new_namespaces:
        # Remove namespaces introduced in this context. This is necessary
        # because the namespaces object continues to be shared with other
        # contexts.
        namespaces.difference_update(new_namespaces)
    return ''.join(output)


def escape(text, use_cdata=False):
    """Make text safe for inclusion in serialized XML.

    By default each of ``&``, ``<``, ``>``, ``'`` and ``"`` is replaced
    with its predefined character entity. When ``use_cdata`` is true the
    text is instead wrapped in CDATA sections, but only if it contains at
    least one of those characters; otherwise it is returned unchanged.

    :param string text: The XML text to convert.
    :param bool use_cdata: Wrap the text in CDATA sections instead of
                           substituting character entities.
    :rtype: Unicode string
    """
    # Python 2 only: decode byte strings so the result is always Unicode.
    if sys.version_info < (3, 0) and type(text) != types.UnicodeType:
        text = unicode(text, 'utf-8', 'ignore')

    entity_for = {'&': '&amp;',
                  '<': '&lt;',
                  '>': '&gt;',
                  "'": '&apos;',
                  '"': '&quot;'}

    if use_cdata:
        if not any(char in entity_for for char in text):
            # Nothing special in the text, so no wrapping is required.
            return text
        # A CDATA section cannot contain "]]>", so the text is cut at every
        # occurrence and the pieces are rejoined with that sequence split
        # across two extra CDATA sections ("]" and "]>").
        sections = ["<![CDATA[%s]]>" % piece for piece in text.split("]]>")]
        return "<![CDATA[]]]><![CDATA[]>]]>".join(sections)

    return ''.join(entity_for.get(char, char) for char in text)
Update docs for tostring 2011-11-22 23:25:02 +00:00			`# -- coding: utf-8 --`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`"""`
Update docs for tostring 2011-11-22 23:25:02 +00:00			`sleekxmpp.xmlstream.tostring`
			`~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
Modified sleekxmpp.xmlstream.tostring to import ToString class based on Python version. The package sleekxmpp.xmlstream.tostring26 remains for now until stanzabase is updated, but is no longer needed. 2010-08-04 18:41:37 +00:00
Update docs for tostring 2011-11-22 23:25:02 +00:00			`This module converts XML objects into Unicode strings and`
			`intelligently includes namespaces only when necessary to`
			`keep the output readable.`

			`Part of SleekXMPP: The Sleek XMPP Library`

			`:copyright: (c) 2011 Nathanael C. Fritz`
			`:license: MIT, see LICENSE for more details`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`"""`

Add initial support for xml:lang for streams and stanza plugins. Remaining items are suitable default actions for language supporting interfaces. 2012-06-05 23:54:26 +00:00			`from __future__ import unicode_literals`

Clean up and unify tostring once and for all. Packaging for Python3 just got easier. 2011-08-04 18:41:36 +00:00			`import sys`

			`if sys.version_info < (3, 0):`
			`import types`

Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00
Add initial support for xml:lang for streams and stanza plugins. Remaining items are suitable default actions for language supporting interfaces. 2012-06-05 23:54:26 +00:00			`XML_NS = 'http://www.w3.org/XML/1998/namespace'`


Update tostring to inject xmlns definitions when needed. 2013-01-24 10:43:46 +00:00			`def tostring(xml=None, xmlns='', stream=None, outbuffer='',`
			`top_level=False, open_only=False, namespaces=None):`
Update docs for tostring 2011-11-22 23:25:02 +00:00			`"""Serialize an XML object to a Unicode string.`

Simplify stringifying XML 2012-09-25 03:59:51 +00:00			If an outer xmlns is provided using ``xmlns``, then the current element's
			`namespace will not be included if it matches the outer namespace. An`
			`exception is made for elements that have an attached stream, and appear`
			`at the stream root.`
Update docs for tostring 2011-11-22 23:25:02 +00:00
Update tostring docs, plus more doc cleanup 2011-11-23 00:25:33 +00:00			`:param XML xml: The XML object to serialize.`
Update docs for tostring 2011-11-22 23:25:02 +00:00			`:param string xmlns: Optional namespace of an element wrapping the XML`
			`object.`
			`:param stream: The XML stream that generated the XML object.`
			`:param string outbuffer: Optional buffer for storing serializations`
			`during recursive calls.`
			`:param bool top_level: Indicates that the element is the outermost`
			`element.`
Update tostring to inject xmlns definitions when needed. 2013-01-24 10:43:46 +00:00			`:param set namespaces: Track which namespaces are in active use so`
			`that new ones can be declared when needed.`
Update tostring docs, plus more doc cleanup 2011-11-23 00:25:33 +00:00
			:type xml: :py:class:`~xml.etree.ElementTree.Element`
Update docs for tostring 2011-11-22 23:25:02 +00:00			:type stream: :class:`~sleekxmpp.xmlstream.xmlstream.XMLStream`

			`:rtype: Unicode string`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`"""`
			`# Add previous results to the start of the output.`
			`output = [outbuffer]`

			`# Extract the element's tag name.`
			`tag_name = xml.tag.split('}', 1)[-1]`

			`# Extract the element's namespace if it is defined.`
			`if '}' in xml.tag:`
			`tag_xmlns = xml.tag.split('}', 1)[0][1:]`
			`else:`
			`tag_xmlns = ''`

Update tostring methods. Will now always show top-level namespace, unless it is the same as the stream's default namespace. Also added the XMPP stream namespace to the namespace map as 'stream'. 2011-01-27 23:05:05 +00:00			`default_ns = ''`
			`stream_ns = ''`
Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00			`use_cdata = False`
Update tostring to inject xmlns definitions when needed. 2013-01-24 10:43:46 +00:00
Update tostring methods. Will now always show top-level namespace, unless it is the same as the stream's default namespace. Also added the XMPP stream namespace to the namespace map as 'stream'. 2011-01-27 23:05:05 +00:00			`if stream:`
			`default_ns = stream.default_ns`
			`stream_ns = stream.stream_ns`
Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00			`use_cdata = stream.use_cdata`
Update tostring methods. Will now always show top-level namespace, unless it is the same as the stream's default namespace. Also added the XMPP stream namespace to the namespace map as 'stream'. 2011-01-27 23:05:05 +00:00
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`# Output the tag name and derived namespace of the element.`
			`namespace = ''`
Prevent xmlns="" in stream output. This was causing problems for HTML-IM because the HTML is parsed without a namespaced context. While xmlns="" technically can be valid, it's usually wrong, so this will work for now until the HTML-IM parsing is fixed. 2012-10-16 05:22:07 +00:00			`if tag_xmlns:`
			`if top_level and tag_xmlns not in [default_ns, xmlns, stream_ns] \`
			`or not top_level and tag_xmlns != xmlns:`
			`namespace = ' xmlns="%s"' % tag_xmlns`
Update tostring methods. Will now always show top-level namespace, unless it is the same as the stream's default namespace. Also added the XMPP stream namespace to the namespace map as 'stream'. 2011-01-27 23:05:05 +00:00			`if stream and tag_xmlns in stream.namespace_map:`
			`mapped_namespace = stream.namespace_map[tag_xmlns]`
			`if mapped_namespace:`
			`tag_name = "%s:%s" % (mapped_namespace, tag_name)`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`output.append("<%s" % tag_name)`
			`output.append(namespace)`

			`# Output escaped attribute values.`
Update tostring to inject xmlns definitions when needed. 2013-01-24 10:43:46 +00:00			`new_namespaces = set()`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`for attrib, value in xml.attrib.items():`
Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00			`value = escape(value, use_cdata)`
Add support for using xml:lang values. Support is only for adding literal XML content to stanzas. Full support for things like multiple message bodies with different xml:lang values is still in the works. 2010-12-08 04:07:40 +00:00			`if '}' not in attrib:`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`output.append(' %s="%s"' % (attrib, value))`
Add support for using xml:lang values. Support is only for adding literal XML content to stanzas. Full support for things like multiple message bodies with different xml:lang values is still in the works. 2010-12-08 04:07:40 +00:00			`else:`
			`attrib_ns = attrib.split('}')[0][1:]`
			`attrib = attrib.split('}')[1]`
Get tests to pass again. Re-add old gmail_notify plugin for now. 2013-01-26 23:10:06 +00:00			`if attrib_ns == XML_NS:`
			`output.append(' xml:%s="%s"' % (attrib, value))`
			`elif stream and attrib_ns in stream.namespace_map:`
Add support for using xml:lang values. Support is only for adding literal XML content to stanzas. Full support for things like multiple message bodies with different xml:lang values is still in the works. 2010-12-08 04:07:40 +00:00			`mapped_ns = stream.namespace_map[attrib_ns]`
			`if mapped_ns:`
Update tostring to inject xmlns definitions when needed. 2013-01-24 10:43:46 +00:00			`if namespaces is None:`
			`namespaces = set()`
			`if attrib_ns not in namespaces:`
			`namespaces.add(attrib_ns)`
			`new_namespaces.add(attrib_ns)`
			`output.append(' xmlns:%s="%s"' % (`
			`mapped_ns, attrib_ns))`
			`output.append(' %s:%s="%s"' % (`
			`mapped_ns, attrib, value))`
Add initial support for xml:lang for streams and stanza plugins. Remaining items are suitable default actions for language supporting interfaces. 2012-06-05 23:54:26 +00:00
			`if open_only:`
			`# Only output the opening tag, regardless of content.`
			`output.append(">")`
			`return ''.join(output)`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00
			`if len(xml) or xml.text:`
			`# If there are additional child elements to serialize.`
			`output.append(">")`
			`if xml.text:`
Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00			`output.append(escape(xml.text, use_cdata))`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`if len(xml):`
Remove usage of deprecated getchildren() method. 2012-06-19 16:47:31 +00:00			`for child in xml:`
Update tostring to inject xmlns definitions when needed. 2013-01-24 10:43:46 +00:00			`output.append(tostring(child, tag_xmlns, stream,`
			`namespaces=namespaces))`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`output.append("</%s>" % tag_name)`
			`elif xml.text:`
			`# If we only have text content.`
Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00			`output.append(">%s</%s>" % (escape(xml.text, use_cdata), tag_name))`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`else:`
			`# Empty element.`
			`output.append(" />")`
			`if xml.tail:`
			`# If there is additional text after the element.`
Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00			`output.append(escape(xml.tail, use_cdata))`
Update tostring to inject xmlns definitions when needed. 2013-01-24 10:43:46 +00:00			`for ns in new_namespaces:`
			`# Remove namespaces introduced in this context. This is necessary`
			`# because the namespaces object continues to be shared with other`
			`# contexts.`
			`namespaces.remove(ns)`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`return ''.join(output)`


Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00			`def escape(text, use_cdata=False):`
Update docs for tostring 2011-11-22 23:25:02 +00:00			`"""Convert special characters in XML to escape sequences.`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00
Update docs for tostring 2011-11-22 23:25:02 +00:00			`:param string text: The XML text to convert.`
Update tostring docs, plus more doc cleanup 2011-11-23 00:25:33 +00:00			`:rtype: Unicode string`
Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`"""`
Clean up and unify tostring once and for all. Packaging for Python3 just got easier. 2011-08-04 18:41:36 +00:00			`if sys.version_info < (3, 0):`
			`if type(text) != types.UnicodeType:`
			`text = unicode(text, 'utf-8', 'ignore')`

Replaced the ToString class with a tostring function. The sleekxmpp.xmlstream.tostring and sleekxmpp.xmlstream.tostring26 packages have been merged to sleekxmpp.xmlstream.tostring. The __init__.py file will import the appropriate tostring function depending on the Python version. The setup.py file has been updated with the package changes. ElementBase is now a direct descendent of object and does not subclass ToString. Stanza objects now return their XML contents for __repr__. 2010-08-06 00:26:41 +00:00			`escapes = {'&': '&amp;',`
			`'<': '&lt;',`
			`'>': '&gt;',`
			`"'": '&apos;',`
			`'"': '&quot;'}`
Add support for using CDATA for escaping. CDATA escaping is disabled by default, but may be enabled by setting: self.use_cdata = True Closes issue #114 2012-07-24 10:25:55 +00:00
			`if not use_cdata:`
			`text = list(text)`
			`for i, c in enumerate(text):`
			`text[i] = escapes.get(c, c)`
			`return ''.join(text)`
			`else:`
			`escape_needed = False`
			`for c in text:`
			`if c in escapes:`
			`escape_needed = True`
			`break`
			`if escape_needed:`
			`escaped = map(lambda x : "<![CDATA[%s]]>" % x, text.split("]]>"))`
			`return "<![CDATA[]]]><![CDATA[]>]]>".join(escaped)`
			`return text`