#!/usr/bin/python

import commands, cgi, os, re, sys, time
from xml.sax.saxutils import escape
from glob import glob

from config import directory
import cache, nonce
from atomef import unescape

# Parsed request parameters, plus the charset declared in the request's
# content-type header (utf-8 when the header does not name one).
fs = cgi.FieldStorage()
charset=cgi.parse_header(fs.headers['content-type'])[1].get('charset','utf-8')


def param(key):
    """Return the form field `key` as a unicode string.

    A missing or empty field yields an empty string.  The value is decoded
    with the request charset; if that fails for any reason the value is
    stringified and decoded as iso-8859-1 instead.
    """
    value = ''
    # fs.list is checked first: the lookup is only attempted when the
    # storage actually holds a field list.
    if fs.list and fs.has_key(key):
        value = fs.getvalue(key) or ''
    try:
        return unicode(value, charset)
    except Exception:
        # Deliberately broad (the charset is client supplied, so the codec
        # lookup or the decode itself can fail in many ways), but unlike a
        # bare "except:" it lets SystemExit/KeyboardInterrupt propagate.
        return str(value).decode('iso-8859-1')


def xmlchars(text):
    """Replace each character with ordinal >= 128 by a numeric character
    reference (&#NNN;); all other characters are returned unchanged."""
    pieces = []
    for ch in text:
        code = ord(ch)
        if code >= 128:
            pieces.append('&#%d;' % code)
        else:
            pieces.append(ch)
    # text[:0] is an empty string of the same type as the input, so the
    # joined result keeps that type.
    return text[:0].join(pieces)


# Code points 128-159 as used by windows-1252, mapped to the Unicode code
# points of the characters they represent.
cp1252 = {
    128: 8364,  # euro sign
    130: 8218,  # single low-9 quotation mark
    131:  402,  # latin small letter f with hook
    132: 8222,  # double low-9 quotation mark
    133: 8230,  # horizontal ellipsis
    134: 8224,  # dagger
    135: 8225,  # double dagger
    136:  710,  # modifier letter circumflex accent
    137: 8240,  # per mille sign
    138:  352,  # latin capital letter s with caron
    139: 8249,  # single left-pointing angle quotation mark
    140:  338,  # latin capital ligature oe
    142:  381,  # latin capital letter z with caron
    145: 8216,  # left single quotation mark
    146: 8217,  # right single quotation mark
    147: 8220,  # left double quotation mark
    148: 8221,  # right double quotation mark
    149: 8226,  # bullet
    150: 8211,  # en dash
    151: 8212,  # em dash
    152:  732,  # small tilde
    153: 8482,  # trade mark sign
    154:  353,  # latin small letter s with caron
    155: 8250,  # single right-pointing angle quotation mark
    156:  339,  # latin small ligature oe
    158:  382,  # latin small letter z with caron
    159:  376}  # latin capital letter y with diaeresis

#########################################################################
#        Permit only selected HTML constructs, escape the rest          #
#########################################################################

# NOTE(review): only the opening statements of sanitize() are on this line
# of the file; the definition continues on the following lines.  The string
# literals below look like they lost markup/entities in extraction --
# confirm against the upstream source before relying on them.
def sanitize(body):
    if not body: return body
    original = body

    def hyperlink(match):
        href=re.sub("&","&",match.group(1))
        return '%s' 
% (href, match.group(2)) # code (literal) support chunks=re.split('{{{((?:.|\n)*?)}}}',body) if len(chunks)>1: work=chunks[:] for i in range(1,len(chunks),2): if chunks[i].find('\n')>=0: chunks[i] = '
' + escape(chunks[i].strip()) + '
' work[i]="\n\n{{{%d}}}\n\n" % i else: chunks[i] = '' + escape(chunks[i]) + '' work[i]="{{{%d}}}" % i body=''.join(work) else: # naked urls become hypertext links body=re.sub('(^|[\\s.:;?\\-\\]<\\(])' + '(http://[-\\w;/?:@&=+$.!~*\'()%,#]+[\\w/])' + '(?=$|[\\s.:;?\\-\\[\\]>\\)])', '\\1[link]',body) # html characters used in text become escaped body=escape(body) # canonicalize line control characters body=re.sub('\r\n?','\n', body.strip()) # support NCR characters body=re.sub('&#x([\da-zA-Z]+);', lambda n: "&#x%s;" % n.group(1), body) body=re.sub('&#(\d+);', lambda n: "&#%s;" % n.group(1), body) # map windows-1252 to unicode for i in range(len(body)-1,-1,-1): if ord(body[i]) in cp1252: body=body[0:i] + "&#" + str(cp1252[ord(body[i])]) + ";" + body[i+1:] # remove control characters body=re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]"). \ sub(lambda c: u'' % hex(ord(c.group(0)))[2:].rjust(4,'0').upper(),body) # passthru , , , ,
,
,

body=re.sub('<a href="([^"]*)">([^&]*)</a>', hyperlink, body) body=re.sub('<a href=\'([^\']*)\'>([^&]*)</a>', hyperlink, body) body=re.sub('<abbr title="([^"]*)">([^&]*)</abbr>', '\\2', body) body=re.sub('<acronym title="([^"]*)">([^&]*)</acronym>', '\\2', body) body=re.sub('<em>([^&]*)</em>', '\\1', body) body=re.sub('<i>([^&]*)</i>', '\\1', body) body=re.sub('<b>([^&]*)</b>', '\\1', body) body=re.sub('<strong>([^&]*)</strong>', '\\1', body) body=re.sub('<blockquote>([^~]*?)</blockquote>', '\n\n

\\1

\n\n', body) body=re.sub('<br\s*/?>\n?','\n',body) body=re.sub('</?p>','\n\n',body).strip() # typographic support for mdash, curly quotes body=re.sub(r'(\s)--(\s)', r'\1—\2', body) body=re.sub(r'(\w)\'(\w)', r'\1’\2', body) body=re.sub(r"(^|\s)'([^<]*?)'(\s|[,;.\]]|$)", r'\1‘\2’\3', body) body=re.sub(r'(^|\s)"([^<]*?)"(\s|[,;.\]]|$)', r'\1“\2”\3', body) # wiki like support: _em_, -del-, *b*, [url title] body=re.sub(r'\b_(\w.*?)_\b', r'\1', body) body=re.sub(r'\*(\w.*?\w)\*', r'\1', body) body=re.sub(r'\[(\w+:\S+\.gif) (.*?)\]', r'\2', body) body=re.sub(r'\[(\w+:\S+\.jpg) (.*?)\]', r'\2', body) body=re.sub(r'\[(\w+:\S+\.png) (.*?)\]', r'\2', body) body=re.sub(r'\[(\w+:\S+) (.*?)\]', hyperlink, body).strip() # cvs urls body=re.sub(r'/checkout/', r'/*checkout*/', body) body=re.sub(r'/checkout/', r'/_checkout_/', body) # email style quotes (lines beginning with '>') body=re.sub('^>(.*)\n\n+>', r'>\1\n>\n>', body) quotes=re.compile(r'^( *>.*(?:\n *>.*)*)',re.M).split(body) for i in range(1, len(quotes), 2): (html,depth)=('',0) qline=re.compile('^ *(>(?: |>)*)(.*)',re.M); for indent,line in qline.findall(quotes[i])+[('','')]: indent=len(indent.replace(' ','').replace('>','>')) while indent>depth: (depth,html)=(depth+1,html+'

') while indent

') html+=line+"\r" quotes[i]=html.strip() body=''.join(quotes) # unordered lists: consecutive lines starting with spaces and an asterisk lists=re.compile(r'^( *\*.*(?:\n *\*.*)*)\n*',re.M).split(body) for i in range(1, len(lists), 2): (html,stack)=('', ['']) for indent,line in re.findall(r'( +)\* +(.*)', lists[i]) + [('','')]: if indent>stack[-1]: (stack,html)=(stack+[indent],html+'\n