#!/usr/bin/python """Ping all traceback-eligable or pingback-elibable servers associated with hrefs found in a given blog entry """ import re, sgmllib, sys, urllib, xmlrpclib from xml.sax import parseString, SAXParseException from xml.sax.handler import ContentHandler try: import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py timeoutsocket.setDefaultSocketTimeout(10) except ImportError: pass def excerpt(filename, title, body): """ filename,title,body => url,args Excerpt the body and urlencode the trackback arguments. """ splitExcerpt=re.compile('(?:
(.*?)
\s*)?(.*)',re.S) (excerpt,body) = splitExcerpt.match(body).groups() if excerpt: body=excerpt body = re.sub('\n',' ',body) body = re.sub(' ',' ',body) body = re.sub('^(

\s*)?[\w\s\.]+<\/a>:\s*','',body) body = re.sub('.*?<\/em>\.?\s*','',body) body = re.sub('<.*?>','',body) body = body[:252] url = 'http://www.intertwingly.net/blog/' + filename url = re.sub('.txt$','.html',url) arg = {} arg['url'] = url arg['blog_name'] = 'Sam Ruby' arg['title'] = title.encode('utf-8') arg['excerpt'] = body.encode('utf-8') return url, urllib.urlencode(arg) class link(sgmllib.SGMLParser): """ source -> list of trackbacks, list of pingbacks Parse a given html page, and retrieve the trackbacks associated with pages referenced via href found. """ def __init__(self, name, title, body): sgmllib.SGMLParser.__init__(self) self.trackbacks = [] self.pingbacks = [] self.title = title (self.url,self.args) = excerpt(name,title,body) self.feed(body) def start_a(self, attrs): attrs = dict(attrs) if attrs.has_key('href'): try: href = attrs['href'] trackbacks,pingbacks = backrefs(href) for trackback in trackbacks: if trackback not in self.trackbacks: self.trackbacks.append(trackback) for pingback in pingbacks: if pingback not in self.pingbacks: self.pingbacks.append(pingback) except: pass tb_re=re.compile('()') pb_re=re.compile('') def backrefs(href): """ href -> ([trackbacks],[pingbacks]) Parse a given html page, and retrieve the rdf:about, X-Pingback header, or pingback link information associated with a given href. At most one is returned (in the above priority). """ base = href.split("#")[0] file = urllib.urlopen(base) info = file.info() data = file.read().replace('\n',' ') file.close() trackback = [] pingback = pb_re.findall(data)[:1] for x in tb_re.findall(data): try: parseString(x, rdf()) except SAXParseException: pass if info.has_key("X-Pingback"): pingback=[info["X-Pingback"]] if rdf.ids.has_key(href): trackback = [rdf.ids[href]] if not trackback and not pingback and href.find("#")>0: if rdf.ids.has_key(base): trackback = [rdf.ids[base]] if trackback: pingback=[] if pingback: pingback=[(href, pingback[0])] return (trackback, pingback) class rdf(ContentHandler): """ xml -> dictionary of {dc:identifier => trackback:ping|rdf:about} Parse a given html page, and retrieve the rdf:about information associated with a given href. """ ids = {} def startElement(self, name, attrs): if name == 'rdf:Description': attrs=dict(attrs) if attrs.has_key('dc:identifer'): attrs['dc:identifier'] = attrs['dc:identifer'] if attrs.has_key('dc:identifier'): if attrs.has_key('trackback:ping'): self.ids[attrs['dc:identifier']] = attrs['trackback:ping'] elif attrs.has_key('about'): self.ids[attrs['dc:identifier']] = attrs['about'] elif attrs.has_key('rdf:about'): self.ids[attrs['dc:identifier']] = attrs['rdf:about'] def trackback(parser): """ parser -> None Ping all trackbacks encountered with the url, title, blog_name, and excerpt. """ for url in parser.trackbacks: try: print "*** Trackback " + url if url.find('?tb_id=') >= 0: file=urllib.urlopen(url + "&" + parser.args) else: import urllib2 request=urllib2.Request(url, parser.args) request.add_header("User-agent", "http://www.intertwingly.net/code/autoping.py") request.add_header("Content-type", "application/x-www-form-urlencoded; charset=utf-8") file=urllib2.urlopen(request) print file.read() file.close() except: pass def pingback(parser): """ parser -> None Ping all pingbacks encountered with the source and targets """ for target,server in parser.pingbacks: try: print "" print "*** Pingback " + server server=xmlrpclib.Server(server) # print server.pingback.ping(parser.url,target) except: pass if __name__ == '__main__': for name in sys.argv[1:]: try: sys.path.insert(0,"/home/rubys/mombo") from entry import post entry = post(name.replace(".txt",".html")) parser = link(entry.link(), entry.title(), entry.body()) trackback(parser) pingback(parser) except: pass