# markdown/html4.py # # Add html4 serialization to older versions of Elementree # Taken from ElementTree 1.3 preview with slight modifications # # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. # # fredrik@pythonware.com # http://www.pythonware.com # # -------------------------------------------------------------------- # The ElementTree toolkit is # # Copyright (c) 1999-2007 by Fredrik Lundh # # By obtaining, using, and/or copying this software and/or its # associated documentation, you agree that you have read, understood, # and will comply with the following terms and conditions: # # Permission to use, copy, modify, and distribute this software and # its associated documentation for any purpose and without fee is # hereby granted, provided that the above copyright notice appears in # all copies, and that both that copyright notice and this permission # notice appear in supporting documentation, and that the name of # Secret Labs AB or the author not be used in advertising or publicity # pertaining to distribution of the software without specific, written # prior permission. # # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE # OF THIS SOFTWARE. # -------------------------------------------------------------------- import markdown ElementTree = markdown.etree.ElementTree QName = markdown.etree.QName Comment = markdown.etree.Comment PI = markdown.etree.PI ProcessingInstruction = markdown.etree.ProcessingInstruction HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", "img", "input", "isindex", "link", "meta" "param") try: HTML_EMPTY = set(HTML_EMPTY) except NameError: pass _namespace_map = { # "well-known" namespace prefixes "http://www.w3.org/XML/1998/namespace": "xml", "http://www.w3.org/1999/xhtml": "html", "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", "http://schemas.xmlsoap.org/wsdl/": "wsdl", # xml schema "http://www.w3.org/2001/XMLSchema": "xs", "http://www.w3.org/2001/XMLSchema-instance": "xsi", # dublic core "http://purl.org/dc/elements/1.1/": "dc", } def _raise_serialization_error(text): raise TypeError( "cannot serialize %r (type %s)" % (text, type(text).__name__) ) def _encode(text, encoding): try: return text.encode(encoding, "xmlcharrefreplace") except (TypeError, AttributeError): _raise_serialization_error(text) def _escape_cdata(text, encoding): # escape character data try: # it's worth avoiding do-nothing calls for strings that are # shorter than 500 character, or so. assume that's, by far, # the most common case in most applications. if "&" in text: text = text.replace("&", "&") if "<" in text: text = text.replace("<", "<") if ">" in text: text = text.replace(">", ">") return text.encode(encoding, "xmlcharrefreplace") except (TypeError, AttributeError): _raise_serialization_error(text) def _escape_attrib(text, encoding): # escape attribute value try: if "&" in text: text = text.replace("&", "&") if "<" in text: text = text.replace("<", "<") if ">" in text: text = text.replace(">", ">") if "\"" in text: text = text.replace("\"", """) if "\n" in text: text = text.replace("\n", " ") return text.encode(encoding, "xmlcharrefreplace") except (TypeError, AttributeError): _raise_serialization_error(text) def _escape_attrib_html(text, encoding): # escape attribute value try: if "&" in text: text = text.replace("&", "&") if ">" in text: text = text.replace(">", ">") if "\"" in text: text = text.replace("\"", """) return text.encode(encoding, "xmlcharrefreplace") except (TypeError, AttributeError): _raise_serialization_error(text) def _serialize_html(write, elem, encoding, qnames, namespaces): tag = elem.tag text = elem.text if tag is Comment: write("" % _escape_cdata(text, encoding)) elif tag is ProcessingInstruction: write("" % _escape_cdata(text, encoding)) else: tag = qnames[tag] if tag is None: if text: write(_escape_cdata(text, encoding)) for e in elem: _serialize_html(write, e, encoding, qnames, None) else: write("<" + tag) items = elem.items() if items or namespaces: items.sort() # lexical order for k, v in items: if isinstance(k, QName): k = k.text if isinstance(v, QName): v = qnames[v.text] else: v = _escape_attrib_html(v, encoding) # FIXME: handle boolean attributes write(" %s=\"%s\"" % (qnames[k], v)) if namespaces: items = namespaces.items() items.sort(key=lambda x: x[1]) # sort on prefix for v, k in items: if k: k = ":" + k write(" xmlns%s=\"%s\"" % ( k.encode(encoding), _escape_attrib(v, encoding) )) write(">") tag = tag.lower() if text: if tag == "script" or tag == "style": write(_encode(text, encoding)) else: write(_escape_cdata(text, encoding)) for e in elem: _serialize_html(write, e, encoding, qnames, None) if tag not in HTML_EMPTY: write("") if elem.tail: write(_escape_cdata(elem.tail, encoding)) def write_html(root, f, # keyword arguments encoding="us-ascii", default_namespace=None): assert root is not None if not hasattr(f, "write"): f = open(f, "wb") write = f.write if not encoding: encoding = "us-ascii" qnames, namespaces = _namespaces( root, encoding, default_namespace ) _serialize_html( write, root, encoding, qnames, namespaces ) # -------------------------------------------------------------------- # serialization support def _namespaces(elem, encoding, default_namespace=None): # identify namespaces used in this tree # maps qnames to *encoded* prefix:local names qnames = {None: None} # maps uri:s to prefixes namespaces = {} if default_namespace: namespaces[default_namespace] = "" def encode(text): return text.encode(encoding) def add_qname(qname): # calculate serialized qname representation try: if qname[:1] == "{": uri, tag = qname[1:].split("}", 1) prefix = namespaces.get(uri) if prefix is None: prefix = _namespace_map.get(uri) if prefix is None: prefix = "ns%d" % len(namespaces) if prefix != "xml": namespaces[uri] = prefix if prefix: qnames[qname] = encode("%s:%s" % (prefix, tag)) else: qnames[qname] = encode(tag) # default element else: if default_namespace: # FIXME: can this be handled in XML 1.0? raise ValueError( "cannot use non-qualified names with " "default_namespace option" ) qnames[qname] = encode(qname) except TypeError: _raise_serialization_error(qname) # populate qname and namespaces table try: iterate = elem.iter except AttributeError: iterate = elem.getiterator # cET compatibility for elem in iterate(): tag = elem.tag if isinstance(tag, QName) and tag.text not in qnames: add_qname(tag.text) elif isinstance(tag, basestring): if tag not in qnames: add_qname(tag) elif tag is not None and tag is not Comment and tag is not PI: _raise_serialization_error(tag) for key, value in elem.items(): if isinstance(key, QName): key = key.text if key not in qnames: add_qname(key) if isinstance(value, QName) and value.text not in qnames: add_qname(value.text) text = elem.text if isinstance(text, QName) and text.text not in qnames: add_qname(text.text) return qnames, namespaces def to_html_string(element, encoding=None): class dummy: pass data = [] file = dummy() file.write = data.append write_html(ElementTree(element).getroot(),file,encoding) return "".join(data)