import patch22
import lazydom
from htmlentitydefs import name2codepoint, codepoint2name
import re
from xml.dom import minidom,Node
from sgmllib import SGMLParser
def unescape(data):
    """Convert HTML entity references in data into unicode characters.

    Named references found in name2codepoint, decimal references
    (&#NNN;) and hexadecimal references (&#xHH;) are replaced by the
    corresponding character.  Any other reference, or a numeric one whose
    code point unichr() rejects, is left in the text exactly as written.

    Returns the chunks re-joined into one string.
    """
    # re.split with a single capture group alternates plain text (even
    # indexes) with the captured reference names (odd indexes).
    chunks = re.split(r'&(#?\w+);', data)
    for i in range(1, len(chunks), 2):
        ref = chunks[i]
        # The split consumed the '&' and ';', so an unresolved reference
        # must be rebuilt or it would silently lose its delimiters.
        original = '&%s;' % ref
        try:
            if ref in name2codepoint:
                chunks[i] = unichr(name2codepoint[ref])
            elif re.match(r'#\d+$', ref):
                chunks[i] = unichr(int(ref[1:]))
            elif re.match(r'#[xX][0-9a-fA-F]+$', ref):
                chunks[i] = unichr(int(ref[2:], 16))
            else:
                chunks[i] = original
        except (ValueError, OverflowError):
            # code point outside the range unichr() supports
            chunks[i] = original
    return "".join(chunks)
class stripHtml(SGMLParser):
"remove all tags from the data"
def __init__(self, data):
SGMLParser.__init__(self)
self.result=''
self.feed(data)
self.close()
def handle_data(self, data):
if data: self.result+=data
def escape(data, entity=1):
    """Escape entities and non-7bit ASCII characters in data.

    Every character whose code point is a key of codepoint2name becomes
    the named reference (&name;); any other character at or above 128
    becomes a decimal numeric reference (&#NNN;).

    entity -- when true (the default) ASCII characters that have a named
              entity are escaped as well; when false every character
              below 128 is passed through unchanged, so existing markup
              survives.

    Returns a string of the same type as data.
    """
    pieces = []
    for ch in data:
        n = ord(ch)
        if n < 128 and not entity:
            pieces.append(ch)
        elif n in codepoint2name:
            pieces.append('&%s;' % codepoint2name[n])
        elif n >= 128:
            # numeric character reference; needs the '&#' prefix to be
            # recognised as a reference
            pieces.append('&#%d;' % n)
        else:
            pieces.append(ch)
    # join on an empty slice of the input so the result keeps its type
    return data[:0].join(pieces)
def parse(stream):
    """Parse stream with minidom and wrap the document element in atomef.

    stream is handed to minidom.parse unchanged.
    """
    document = minidom.parse(stream)
    root = document.documentElement
    return atomef(root)
def _text_content(element):
    "join the values of the text and CDATA children of a DOM element"
    return ''.join([n.nodeValue for n in element.childNodes
                    if n.nodeType in (Node.TEXT_NODE,
                                      Node.CDATA_SECTION_NODE)])


class atomef(lazydom.lazydom):
    """lazydom.lazydom subclass adding HTML and plain-text renderings.

    Both renderings read the '@mode' and '@type' items of the instance
    (presumably attributes of the wrapped element -- verify against
    lazydom) and the child nodes of self.list[0].
    """

    def __init__(self, element=None):
        lazydom.lazydom.__init__(self, element)

    def toHtml(self):
        """Return the content of the first element as 7-bit HTML.

        When '@mode' is 'escaped' the text/CDATA children are used as
        they are; otherwise the children are serialised with toxml().
        The result is then passed through escape(): fully when '@type'
        is not 'text/html', and for non-ASCII characters only when it is.
        """
        if self['@mode'] == 'escaped':
            data = _text_content(self.list[0])
        else:
            data = ''.join([n.toxml() for n in self.list[0].childNodes])
        # NOTE(review): the escaping step is taken to apply to both modes
        # (mirroring toString); confirm against the original layout.
        if self['@type'] != 'text/html':
            data = escape(data, 1)
        else:
            data = escape(data, 0)
        return data

    def toString(self):
        """Return the content of the first element as plain text.

        The text/CDATA children are joined, run through unescape() when
        '@mode' is 'escaped', and stripped of tags with stripHtml when
        '@type' is 'text/html'.
        """
        data = _text_content(self.list[0])
        if self['@mode'] == 'escaped':
            data = unescape(data)
        if self['@type'] == 'text/html':
            data = stripHtml(data).result
        return data
if __name__ == '__main__':
    # Small demonstration of the helpers when the module is run directly.
    print(unescape('<>'))
    # Written with entity references so the file stays pure ASCII: a raw
    # non-ASCII literal needs an encoding declaration in Python 2 and, as
    # a byte string, would be escaped byte by byte instead of per
    # character.
    print(escape(unescape('Se&ntilde;or Gonz&aacute;lez')))
    print(stripHtml(unescape('not <b>very</b> nice')).result)
    print(escape(unescape('')))