#!/usr/bin/python import commands, cgi, os, re, sys, time from xml.sax.saxutils import escape from glob import glob from config import directory import cache, nonce from atomef import unescape fs = cgi.FieldStorage() charset=cgi.parse_header(fs.headers['content-type'])[1].get('charset','utf-8') def param(key): value=(fs.list and fs.has_key(key) and fs.getvalue(key)) or '' try: return unicode(value,charset) except: if type(value) == unicode: return value else: return str(value).decode('iso-8859-1') def xmlchars(text): # convert high bit characters to numeric equivalents for i in range(len(text)-1,-1,-1): if ord(text[i])>=128: text = '%s%d;%s' % (text[:i], ord(text[i]), text[i+1:]) return text cp1252 = { 128: 8364, # euro sign 130: 8218, # single low-9 quotation mark 131: 402, # latin small letter f with hook 132: 8222, # double low-9 quotation mark 133: 8230, # horizontal ellipsis 134: 8224, # dagger 135: 8225, # double dagger 136: 710, # modifier letter circumflex accent 137: 8240, # per mille sign 138: 352, # latin capital letter s with caron 139: 8249, # single left-pointing angle quotation mark 140: 338, # latin capital ligature oe 142: 381, # latin capital letter z with caron 145: 8216, # left single quotation mark 146: 8217, # right single quotation mark 147: 8220, # left double quotation mark 148: 8221, # right double quotation mark 149: 8226, # bullet 150: 8211, # en dash 151: 8212, # em dash 152: 732, # small tilde 153: 8482, # trade mark sign 154: 353, # latin small letter s with caron 155: 8250, # single right-pointing angle quotation mark 156: 339, # latin small ligature oe 158: 382, # latin small letter z with caron 159: 376} # latin capital letter y with diaeresis ######################################################################### # Permit only selected HTML constructs, escape the rest # ######################################################################### def sanitize(body): if not body: return body original = body def hyperlink(match): 
# NOTE(review): this copy of the script appears to have been damaged in
# extraction -- statements are run together on a few physical lines,
# indentation is gone, and several string literals look like they have lost
# markup (format strings whose placeholder count does not match the argument
# tuple, literals split across lines).  The code is left untouched here;
# restore it from the upstream source rather than by hand.  TODO confirm.
#
# hyperlink() is the re.sub callback used further down for the <a href=...>
# and [url title] rewrites: group(1) is the link target, group(2) the text.
# As written, the "&" -> "&" substitution is a no-op and '%s' is given a
# two-item tuple -- presumably entity/markup text is missing from both.
href=re.sub("&","&",match.group(1)) return '%s' % (href, match.group(2)) # code (literal) support chunks=re.split('{{{((?:.|\n)*?)}}}',body) if len(chunks)>1: work=chunks[:] for i in range(1,len(chunks),2): if chunks[i].find('\n')>=0: chunks[i] = '
' + escape(chunks[i].strip()) + '' work[i]="\n\n{{{%d}}}\n\n" % i else: chunks[i] = '
' + escape(chunks[i]) + ''
# the working copy only carries a numbered {{{i}}} placeholder; the escaped
# literal kept in chunks[i] is substituted back in under "reinsert literals"
work[i]="{{{%d}}}" % i
body=''.join(work)
else:
# naked urls become hypertext links
body=re.sub('(^|[\\s.:;?\\-\\]<\\(])' +
'(http://[-\\w;/?:@&=+$.!~*\'()%,#]+[\\w/])' +
'(?=$|[\\s.:;?\\-\\[\\]>\\)])',
'\\1[link]',body)
# html characters used in text become escaped
body=escape(body)
# canonicalize line control characters
body=re.sub('\r\n?','\n', body.strip())
# support NCR characters
# NOTE(review): as written, both substitutions replace a matched reference
# with just its digits plus ';', dropping the '&#x' / '&#' prefix --
# presumably entity text was lost from the patterns and replacements; confirm.
body=re.sub('&#x([\da-zA-Z]+);', lambda n: "%s;" % n.group(1), body)
body=re.sub('&#(\d+);', lambda n: "%s;" % n.group(1), body)
# map windows-1252 to unicode
# the scan runs from the last index down to 0, so splicing a longer
# replacement in at position i does not shift any index still to be visited
for i in range(len(body)-1,-1,-1):
if ord(body[i]) in cp1252:
body=body[0:i] + "" + str(cp1252[ord(body[i])]) + ";" + body[i+1:]
# remove control characters
# NOTE(review): the replacement literal below has no %s placeholder for the
# zero-padded hex code that is passed to it -- part of the literal looks lost.
body=re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]"). \
sub(lambda c: u'�' %
hex(ord(c.group(0)))[2:].rjust(4,'0').upper(),body)
# passthru , , , , ,
,body=re.sub('<a href="([^"]*)">([^&]*)</a>', hyperlink, body) body=re.sub('<a href=\'([^\']*)\'>([^&]*)</a>', hyperlink, body) body=re.sub('<abbr title="([^"]*)">([^&]*)</abbr>', '\\2', body) body=re.sub('<acronym title="([^"]*)">([^&]*)</acronym>', '\\2', body) body=re.sub('<em>([^&]*)</em>', '\\1', body) body=re.sub('<i>([^&]*)</i>', '\\1', body) body=re.sub('<b>([^&]*)</b>', '\\1', body) body=re.sub('<strong>([^&]*)</strong>', '\\1', body) body=re.sub('<blockquote>([^~]*?)</blockquote>', '\n\n
\n\n', body) body=re.sub('<br\s*/?>\n?','\n',body) body=re.sub('</?p>','\n\n',body).strip() # typographic support for mdash, curly quotes body=re.sub(r'(\s)--(\s)', r'\1—\2', body) body=re.sub(r'(\w)\'(\w)', r'\1’\2', body) body=re.sub(r"(^|\s)'([^<]*?)'(\s|[,;.\]]|$)", r'\1‘\2’\3', body) body=re.sub(r'(^|\s)"([^<]*?)"(\s|[,;.\]]|$)', r'\1“\2”\3', body) # wiki like support: _em_, -del-, *b*, [url title] body=re.sub(r'\b_(\w.*?)_\b', r'\1', body) body=re.sub(r'\*(\w.*?\w)\*', r'\1', body) body=re.sub(r'\[(\w+:\S+\.gif) (.*?)\]', r'\\1
', body) body=re.sub(r'\[(\w+:\S+\.jpg) (.*?)\]', r'
', body) body=re.sub(r'\[(\w+:\S+\.png) (.*?)\]', r'
', body) body=re.sub(r'\[(\w+:\S+) (.*?)\]', hyperlink, body).strip() # cvs urls body=re.sub(r'/checkout/', r'/*checkout*/', body) body=re.sub(r'/checkout/', r'/_checkout_/', body) # email style quotes (lines beginning with '>') body=re.sub('^>(.*)\n\n+>', r'>\1\n>\n>', body) quotes=re.compile(r'^( *>.*(?:\n *>.*)*)',re.M).split(body) for i in range(1, len(quotes), 2): (html,depth)=('',0) qline=re.compile('^ *(>(?: |>)*)(.*)',re.M); for indent,line in qline.findall(quotes[i])+[('','')]: indent=len(indent.replace(' ','').replace('>','>')) while indent>depth: (depth,html)=(depth+1,html+'
') html+=line+"\r" quotes[i]=html.strip() body=''.join(quotes) # unordered lists: consecutive lines starting with spaces and an asterisk lists=re.compile(r'^( *\*.*(?:\n *\*.*)*)\n*',re.M).split(body) for i in range(1, len(lists), 2): (html,stack)=('', ['']) for indent,line in re.findall(r'( +)\* +(.*)', lists[i]) + [('','')]: if indent>stack[-1]: (stack,html)=(stack+[indent],html+'\n') while indent
\r') while indent
\r') if line: html += ' - '+line+'
\r' lists[i]=html+"\n\n" body = ''.join(lists) # white space stanzas=re.split('\n\n+', body.strip()) if len(stanzas)>1: for i in range(0,len(stanzas)): if not stanzas[i].startswith('') and \ not stanzas[i].startswith('
%s' % stanzas[i] body = '\r'.join(stanzas) body=re.sub('\n','', body) body=re.sub('\n','
\n', body) body=re.compile('(
.*?
)?',re.M).sub(r'\1',body) body=re.sub(' +', ' ', body) # reinsert literals if len(chunks)>1: for i in range(1,len(chunks),2): body=re.sub("\{\{\{%d\}\}\}
" % i, chunks[i].replace('\\','\\\\'), body) body=body.replace("{{{%d}}}" % i, chunks[i]) # renormalize linefeeds body=re.sub('\r\n*', '\n', body).strip() from atomize import html2xml body = html2xml(body) try: from xml.dom import minidom if type(body) == unicode: minidom.parseString("%s" % body.encode('utf-8')) else: minidom.parseString("%s" % body) except: body = escape(original).replace('\n','
') return body ######################################################################### # Determine if this entry has already been found # ######################################################################### def existingBacklink(entry,link): link=xmlchars(link) for filename in glob('%s%s-*.cmt' % (directory.data,entry)): file = open(filename) try: for line in file: if line.find(link)>0: return True finally: file.close() return False ######################################################################### # Determine if a comment is a duplicate # ######################################################################### def duplicateComment(entry,title,body): (title,body)=(xmlchars(title),xmlchars(body)) for filename in glob('%s%s-*.cmt' % (directory.data,entry)): file = open(filename) try: if file.readline().strip()==title and file.read()==body: return True finally: file.close() return False ######################################################################### # Write a comment out to disk # ######################################################################### def writeComment(entry,title,body,decache=True): # Throttle comments from any one individual import spamthrottle if spamthrottle.spammer(body): message="POST limit exceeded" nonce.generate(time.time()+86400, repr([message, entry, title, body])) raise Exception(message) if not title: title=open(directory.data+entry+'.txt').readline().strip() cmtid = time.mktime(time.gmtime()) while (os.path.exists('%s%s-%d.cmt' % (directory.data,entry,cmtid))): cmtid+=1 file='%s%s-%d.cmt' % (directory.data,entry,cmtid) out=open(file, 'w') out.write("%s\n%s" % (xmlchars(title), xmlchars(body))) out.close() os.chmod(file, 0664) if decache: cache.clear(entry) notify(file) return file def notify(file): try: import xnotify xnotify.send(os.path.basename(file)) # PuSH comment feed try: import urllib, urllib2 hub = 'http://pubsubhubbub.appspot.com/' feed = 'http://intertwingly.net/blog/comments.atom' data = 
urllib.urlencode((('hub.mode','publish'), ('hub.url',feed))) response = urllib2.urlopen(hub, data) except urllib2.HTTPError, e: if not hasattr(e, 'code') or e.code != 204: raise except: import traceback, nonce, time tb = ''.join(apply(traceback.format_exception, sys.exc_info())) nonce.generate(time.time()+86400, tb) ######################################################################### # Decache and index in the background # ######################################################################### def cleanup(posted): import daemonize, fcntl, search # complete the http request log=open(directory.web+"post.log","w") daemonize.daemonize(stdout=log.name) for file, entry in posted: cache.clear(entry) notify(file) fcntl.flock(log.fileno(),fcntl.LOCK_EX) search.index() fcntl.flock(log.fileno(),fcntl.LOCK_UN) log.close() ######################################################################### # Handle both Trackbacks and Comments # ######################################################################### def post(url=None): posted=[] name = escape(param('name') or param('blog_name') or 'anonymous') addr = param('url') email = param('email') title = sanitize(unescape(param('title'))) body = sanitize(param('comment') or param('excerpt')); entry = param('parent') or param('tb_id') if addr: if addr.find(':')<0 and not addr.startswith('='): if addr.find('.')>=0 and url[0].isalnum(): if not addr.find('/')>=0: url = url + '/' addr = "http://" + addr if url.endswith('.tb'): entry=url.split('.')[0] host=os.environ['REMOTE_ADDR'] match = re.search(r' domain name pointer (\S+)\.\s*$', commands.getoutput('host ' + host)) if match: host=match.group(1) host=escape(host) moderate = False preview = param('preview') if param('nonce') and nonce.unique(param('nonce'), time.time()+86400, body): preview = moderate = 1 if param('comment'): from spamthrottle import captcha_valid if not param('captcha'): preview = 1 elif not captcha_valid(param('captcha')): preview = moderate = 1 # wierd 
spammers if name == 'ninki': preview = moderate = 1 # if body.find('http://maxi')>=0: preview = moderate = 1 # if body.find('http://bardak.com.ru/')>=0: preview = moderate = 1 # if body.find('http://funny-gift-idea')>=0: preview = moderate = 1 # if body.find('http://advertisersworld.com')>=0: preview = moderate = 1 # if body.find('ugly.as/')>=0: preview = moderate = 1 if title.startswith('<a href="'): preview = moderate = 1 if param('excerpt'): try: if open(directory.data+entry+'.txt').readline().strip().startswith(body.strip('.')): preview = moderate = 1 if name == 'anonymous': preview = moderate = 1 except: preview = moderate = 1 if not moderate: import community moderate = not community.find(host, addr) if not moderate: body = body.replace('\ufffd' % hex(ord(c.group(0)))[2:].rjust(4,'0').title(),comment) if moderate: warning = ['moderate'] else: warning = warn(entry, host, addr, comment) if param('warn'): warning += param('warn').split() # spamlog = open(directory.spamlog + host,'w') # spamlog.write(str(warning)) # spamlog.close() from entry import post parent={'title':title, 'description':body, 'comment':comment, 'entry':entry, 'link':post(entry).link(), 'id':'', 'mtime':time.localtime(), 'warning':warning, 'host':host, 'name':name, 'url':addr, 'email':email} data={'channel':channel, 'Items':[], 'children':[], 'parent':parent, 'age':0} from config import service, __dict__ as config print template(searchList=[data, config]) if param('nonce') and not nonce.unique(param('nonce')): nonce.remove(param('nonce')) elif (body or param('blog_name')) and entry: message = None # address tweaks: handle omitted http, default to email if present addr = re.sub('^(\\w+(\\.\\w+)+)', 'http://\\1', addr) if "http://".startswith(addr): addr="" if email and not addr: addr='mailto:'+email # add who the post was from, possibly annotated with IP info if param('comment'): body = re.sub(r'\s+$','',body) if body[-4:] <> '': body += '\n
\n
' body += "\nPosted by" # Verify nonce if param('nonce') and not nonce.unique(param('nonce')): nonce.remove(param('nonce')) else: message="Invalid nonce" else: body=re.sub('&(#?\w+;)','&\\1',body) # if len(body)>252: body=body[:body.rfind(' ',0,252)][:252] body += "...\n" if addr: body += "[more]" % escape(addr) body += "
Trackback from" try: from urllib import urlopen urlopen(addr).read().index('intertwingly') except: message = 'spam' # if blogname is a URL, set it as the address addr = None if re.match('^\w+\\.[\w\\.]+$',name): addr = "http://%s/" % name if re.match('^http://',name): addr=name if duplicateComment(entry, title, body): message = 'Duplicate comment' name = xmlchars(name) # openid? redirect = None if addr and param('comment'): import identity redirect=identity.lookup(addr, {'name': param('name'), 'url': param('url'), 'email': param('email'), 'title': param('title'), 'comment': param('comment'), 'parent': entry}) if redirect: print 'Status: 302 Found\r\nLocation: %s\r\n\r\nRedirecting' % redirect else: # add who the post was from, possibly annotated with link info if addr: id = "%s" % (nofollow, escape(addr), host, name) else: id = "%s" % (host, name) body += " " + id # do the post if not message: file = writeComment(entry, title, body, decache=False) from entry import post from config import channel link = post(entry).link() redirect = channel.link + link + '?#c%d' % os.stat(file).st_mtime if os.path.exists(os.path.join(directory.cache,link)): os.remove(os.path.join(directory.cache,link)) posted.append((file, entry)) # produce response if param('comment') and not message: if redirect: print 'Status: 302 Found\r\nLocation: %s\r\n\r\nRedirecting'%redirect else: from entry import post from get import format response=format(post(entry).link()) print response cache.save('index.html', response) else: from template.tb import tb as template print template(searchList=[{'error':0,'message':message}]) elif entry and re.search(r'\bxml\b', fs.type): # from soap import soap # soap(entry, fs.file) from config import channel try: from entryparser import parse comment=parse(fs.file) comment['url'] = comment['link'] if 'doPreview' in comment: from spellcheck import spellcheck comment['content']=spellcheck(str(comment['content'])) comment['link'] = '%s.html#commentForm' % entry 
# as far as the flattened text shows, the 'doPreview' branch only spell-checks
# the content and points filename at the entry's own .txt file; writeComment()
# is called only in the else branch that follows
filename=directory.data+entry+'.txt' else: name=escape(str(comment['name'])) # add who the post was from, possibly annotated with link info if 'link' in comment: id = ("%s" % (escape(str(comment['link'])), host, name)) else: id = "%s" % (host, name) comment['content'] += "Message from %s" % id filename = writeComment(entry, escape(str(comment['title'])), str(comment['content']), decache=False) posted.append((filename, entry)) mtime=int(os.stat(filename).st_mtime) comment['modified']=time.localtime(mtime) comment['link'] = '%s.html#c%d' % (entry,mtime) if not comment['title']: comment['title']=open(filename).readline().strip() # reply from template.atomapi import atomapi as reply print reply(searchList=[{'channel': channel, 'entry':comment}]) except: import traceback tb = ''.join(apply(traceback.format_exception, sys.exc_info())) nonce.generate(time.time()+86400, tb) from template.soapfault import soapfault as reply print reply(searchList=[]) else: data='' if fs.file: data += "===\n" + fs.file.read() + "\n" + "===\n" + "\n" data += "fstype: " + fs.type + "\n" data += "fshead: " + str(fs.headers) + "\n" try: for key in fs.keys(): data += key + ": " + fs.getvalue(key) + "\n" except: pass for (key,value) in os.environ.items(): data += key + ": " + value + "\n" nonce.generate(time.time()+86400, data) print "Content-type: text/plain\r\n\r\n" if posted: cleanup(posted)