# NOTE(review): patch22 is not referenced by name anywhere in this module;
# presumably it is imported for its side effects, so the original import
# order is preserved -- confirm before reordering.
import patch22
import lazydom
from htmlentitydefs import name2codepoint, codepoint2name
import re
from xml.dom import minidom, Node
from sgmllib import SGMLParser


def _decode_reference(ref):
    """Return the character named by an entity/character reference.

    `ref` is the text between '&' and ';' (e.g. 'amp', '#8364', '#x20AC').
    Returns a unicode character, or None when the reference is unknown or
    its code point cannot be represented by unichr().
    """
    try:
        if ref in name2codepoint:
            return unichr(name2codepoint[ref])
        if re.match(r'#\d+$', ref):
            return unichr(int(ref[1:]))
        if re.match(r'#[xX][0-9a-fA-F]+$', ref):
            return unichr(int(ref[2:], 16))
    except (ValueError, OverflowError):
        # unichr() rejects code points outside its range; the caller then
        # leaves such a reference in the text untouched.
        pass
    return None


def unescape(data):
    """Convert html entitydefs into unicode characters.

    Named references known to htmlentitydefs, decimal references (&#NNN;)
    and hexadecimal references (&#xHHH;) are replaced by the character
    they denote. Anything else that merely looks like a reference is kept
    verbatim, including its '&' and ';'.
    """
    # With a capturing group, re.split() puts the reference names at the
    # odd indexes and the surrounding literal text at the even indexes.
    chunks = re.split(r'&(#?\w+);', data)
    for i in range(1, len(chunks), 2):
        char = _decode_reference(chunks[i])
        if char is None:
            # The split consumed the delimiters; put them back so that
            # unknown references are not silently mangled.
            chunks[i] = '&%s;' % chunks[i]
        else:
            chunks[i] = char
    return "".join(chunks)


class stripHtml(SGMLParser):
    """Remove all tags from the data.

    The markup is parsed while the object is constructed; the collected
    character data is then available in the `result` attribute.
    """

    def __init__(self, data):
        SGMLParser.__init__(self)
        self.result = ''
        self.feed(data)
        self.close()

    def handle_data(self, data):
        "append the character data reported by the parser to the result"
        if data:
            self.result += data


def escape(data, entity=1):
    """Escape all entities and non-7bit ASCII characters.

    data   -- the string to escape
    entity -- when true (the default), 7-bit characters that have a named
              entity are replaced as well; when false, every character
              below 128 is passed through unchanged

    Characters of 128 and above are always replaced: by a named entity
    when codepoint2name has one, otherwise by a decimal reference.
    """
    pieces = []
    for char in data:
        n = ord(char)
        if n < 128 and not entity:
            pieces.append(char)
        elif n in codepoint2name:
            pieces.append('&%s;' % codepoint2name[n])
        elif n >= 128:
            pieces.append('&#%d;' % n)
        else:
            pieces.append(char)
    # data[:0] is an empty string of the same type as the input, so the
    # joined result keeps that type even when there is nothing to join.
    return data[:0].join(pieces)


def parse(stream):
    "parse a stream into atomef (wraps the document element of the parsed XML)"
    return atomef(minidom.parse(stream).documentElement)


def _text_content(element):
    "concatenate the values of the text and CDATA children of a DOM element"
    return ''.join([node.nodeValue for node in element.childNodes
                    if node.nodeType == Node.TEXT_NODE
                    or node.nodeType == Node.CDATA_SECTION_NODE])


class atomef(lazydom.lazydom):
    """lazydom wrapper with helpers to render element content.

    NOTE(review): self['@mode'] / self['@type'] and self.list come from
    lazydom, which is not visible here; presumably they are attribute
    lookups and the list of wrapped DOM elements -- confirm in lazydom.
    """

    def __init__(self, element=None):
        lazydom.lazydom.__init__(self, element)

    def toHtml(self):
        """Return the content of the first element as an HTML string.

        For mode 'escaped' the text/CDATA children are used, otherwise the
        serialized XML of all children. Content whose type is not
        'text/html' is escaped completely; 'text/html' content keeps its
        7-bit characters and only has the others escaped.
        """
        if self['@mode'] == 'escaped':
            data = _text_content(self.list[0])
        else:
            data = ''.join([node.toxml()
                            for node in self.list[0].childNodes])
        if self['@type'] != 'text/html':
            data = escape(data, 1)
        else:
            data = escape(data, 0)
        return data

    def toString(self):
        """Return the content of the first element as plain text.

        The text/CDATA children are concatenated, unescaped when the mode
        is 'escaped', and stripped of tags when the type is 'text/html'.
        """
        data = _text_content(self.list[0])
        if self['@mode'] == 'escaped':
            data = unescape(data)
        if self['@type'] == 'text/html':
            data = stripHtml(data).result
        return data


if __name__ == '__main__':
    # The source has no encoding declaration, so the demo input is kept
    # ASCII-only: non-ASCII text is written as entity references, which
    # unescape() turns into the intended characters.
    print(unescape('<>'))
    print(escape(unescape('Se&ntilde;or Gonz&aacute;lez')))
    print(stripHtml(unescape('not <b>very</b> nice')).result)
    print(escape(unescape('&euro;')))