#!/usr/bin/python
"""Extract title, content, author and date fields from a submitted entry.

The entry may use Atom or RSS element names and may arrive wrapped in a
SOAP envelope, an http:POST element or an rdf:RDF element.
"""

import urllib
from rfc822 import parseaddr

from post import sanitize
import atomef
from lazydom import namespace

atom = namespace('http://purl.org/atom/ns#')
dc = namespace('http://purl.org/dc/elements/1.1/')
content = namespace('http://purl.org/rss/1.0/modules/content/')
http = namespace('http://schemas.xmlsoap.org/wsdl/http/')
rdf = namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
rss = namespace('http://purl.org/rss/1.0/')
soap = namespace('http://schemas.xmlsoap.org/soap/envelope/')
wfw = namespace('http://wellformedweb.org/CommentAPI/')
xhtml = namespace('http://www.w3.org/1999/xhtml')

# Element names accepted by validate(); any other element is rejected.
oktags = ['a', 'b', 'br', 'img', 'p', 'strong', 'em', 'ul', 'li', 'blockquote']


class DisallowedTag(Exception):
    """Raised by validate() for an element whose name is not in oktags."""


class DisallowedAttr(Exception):
    """Raised by validate() for an element carrying a style attribute."""


class NotUnderstood(Exception):
    """Raised by parse() for a mustUnderstand SOAP header it cannot handle."""


def validate(xml):
    """Recursively reject markup that uses a tag or attribute we disallow.

    Background:
    http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely

    xml -- a lazydom element; every descendant element is checked.

    Raises DisallowedTag for an element not listed in oktags and
    DisallowedAttr for any element that has a style attribute.
    """
    for element in xml[:]:
        tag = element.name[1]
        if tag not in oktags:
            raise DisallowedTag("Disallowed tag: " + tag)
        # Compare case-insensitively.  The previous check upper-cased the
        # attribute names and then looked for lowercase 'style', so it could
        # never match and style attributes were silently let through.
        for attr in element.node.attributes.keys():
            if attr.lower() == 'style':
                raise DisallowedAttr("Disallowed attribute: style")
        validate(element)


def _preview_requested(entry):
    """Return True if a mustUnderstand wfw:preview SOAP header is present.

    Raises NotUnderstood for any other header flagged mustUnderstand='1'.
    """
    preview = False
    for header in entry[soap.Header][:]:
        if header[soap._mustUnderstand] == '1':
            if header.name == wfw.preview:
                preview = True
            else:
                raise NotUnderstood("Header %s not understood"
                                    % str(header.node.nodeName))
    return preview


def _unwrap_container(body):
    """Replace body by its sole div/span child when that child is a bare wrapper.

    The wrapper qualifies only if it has no attributes (or just xmlns).  The
    attributes of body (mode, type, ...) are copied onto the child first so
    later lookups such as body["@mode"] still see them.
    """
    wrapper = body[:]
    if len(wrapper) == 1 and wrapper.name[1] in ['div', 'span']:
        if wrapper.node.attributes.keys() in [[], ['xmlns']]:
            child = wrapper.node
            for ((ns, qname), value) in body.node.attributes.itemsNS():
                child.setAttributeNS(ns, qname, value)
            return wrapper
    return body


def _fetch_page_title(url):
    """Best-effort: return the title of the page at url, or None on failure."""
    try:
        from pingback import parser
        stream = urllib.urlopen(url)
        try:
            page = parser()
            page.feed(stream.read())
        finally:
            # Close the handle even if reading or parsing blows up.
            stream.close()
        return page.title
    except Exception:
        # Deliberately best-effort: network or parse trouble just means the
        # caller falls back to another name.
        return None


def parse(entry):
    """Parse a submitted entry and return its fields as a dictionary.

    entry -- whatever atomef.parse() accepts.

    Returns locals(): the keys are this function's local variable names, so
    those names are part of the interface and must not be renamed.  Always
    present: title, content, email, link, name, issued (and parseDate, as a
    side effect of the local import).  Present only when applicable:
    doPreview, addSoap, addRdf.

    Raises NotUnderstood for an unsupported mustUnderstand SOAP header, and
    DisallowedTag / DisallowedAttr (via validate) for unacceptable markup.
    """
    entry = atomef.parse(entry)

    # soap Headers (the flags below look unused but are returned via locals())
    if _preview_requested(entry):
        doPreview = True

    if entry[soap.Body]:
        entry = entry[soap.Body][:]
        addSoap = True
    if entry.name == http.POST:
        entry = entry[:]

    # rdf Marker
    if entry.name == rdf.RDF:
        entry = entry[:]
        addRdf = True

    # title
    title = entry[atom.title] or entry['title'] or entry[rss.title]

    # content
    # 'content' becomes a local further down, so the module-level namespace
    # of the same name has to be fetched explicitly before that happens.
    content = globals()['content']
    body = entry[atom.content] or entry[atom.summary]
    body = body or entry[xhtml.body] or entry[content.encoded]
    body = body or entry[rss.description] or entry['description']

    # RSS description / content:encoded carry escaped HTML
    if body and body.name[1] in ['description', 'encoded']:
        body.node.setAttribute('mode', 'escaped')
        body.node.setAttribute('type', 'text/html')

    if body and body.name == xhtml.body:
        body.node.setAttribute('type', 'text/html')

    # NOTE(review): from here on body is used without a truthiness check;
    # presumably an empty lazydom result tolerates these calls -- confirm.
    body = _unwrap_container(body)

    validate(body)
    content = body.toHtml()
    if body["@mode"] == 'escaped':
        content = sanitize(content)
    del body

    # determine email
    email = entry[atom.author][atom.email] or entry['email']

    # determine link
    link = entry[atom.link] or entry[atom.author][atom.url]
    link = link or entry[rss.link] or entry['link'] or entry[rdf._about]
    if not link and email:
        link = "mailto:" + str(email)

    # determine name
    name = entry[atom.author][atom.name] or entry[dc.creator] or entry['author']
    if name.toString().find('@') > 0:
        (name, email) = parseaddr(name.toString())
    if not name and link and str(link).startswith('http://'):
        # Fall back to the title of the linked page (fragment stripped).
        name = _fetch_page_title(str(link).split('#')[0])
    if not name:
        name = "anonymous"

    # issued
    # Imported locally on purpose: it keeps 'parseDate' in the returned
    # mapping exactly as before.
    from template import parseDate
    issued = parseDate(str(entry[atom.issued] or entry[dc.date]))

    del entry
    return locals()