#!/usr/bin/python
"""Ping all trackback-eligible or pingback-eligible servers associated with
hrefs found in a given blog entry
"""
import re, sgmllib, sys, urllib, xmlrpclib
from xml.sax import parseString, SAXParseException
from xml.sax.handler import ContentHandler
# Put an upper bound on every socket operation so that one unresponsive
# server cannot stall the whole run.
try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    # The optional third-party module is not installed: fall back to the
    # interpreter's own process-wide socket timeout where it is available,
    # instead of running with no timeout at all.
    import socket
    if hasattr(socket, 'setdefaulttimeout'):
        socket.setdefaulttimeout(10)
def excerpt(filename, title, body):
    """ filename,title,body => url,args

    Excerpt the body and urlencode the trackback arguments.

    filename -- entry path relative to the blog root; a trailing ".txt"
                is turned into ".html" in the returned url
    title    -- entry title (text); sent UTF-8 encoded as "title"
    body     -- entry HTML; reduced to a tag-free excerpt of at most 252
                characters and sent UTF-8 encoded as "excerpt"

    Returns the tuple (url, args), where args is the urlencoded form data
    holding url, blog_name, title and excerpt.
    """
    # urlencode lives in different places on Python 2 and Python 3; look it
    # up locally so this helper can be exercised under either interpreter.
    try:
        urlencode = urllib.urlencode
    except AttributeError:
        from urllib.parse import urlencode

    # NOTE(review): this pattern looks as if literal markup (an excerpt
    # wrapper element?) was stripped from it during extraction.  It is kept
    # exactly as it appears in the source, with the line breaks inside the
    # literal spelled as \n so that the literal is valid -- confirm against
    # the upstream copy.
    splitExcerpt = re.compile(r'(?:\n(.*?)\n\s*)?(.*)', re.S)
    (summary, body) = splitExcerpt.match(body).groups()
    if summary:
        body = summary

    body = re.sub('\n', ' ', body)
    # NOTE(review): as written this replaces a space with a space (a no-op);
    # presumably the pattern was an entity such as &nbsp; -- confirm.
    body = re.sub(' ', ' ', body)
    # NOTE(review): the next two patterns appear to be missing their opening
    # tags; they are kept verbatim (now as raw strings).
    body = re.sub(r'^(\s*)?[\w\s\.]+<\/a>:\s*', '', body)
    body = re.sub(r'.*?<\/em>\.?\s*', '', body)
    # Drop whatever markup is left, then cap the excerpt length.
    body = re.sub(r'<.*?>', '', body)
    body = body[:252]

    url = 'http://www.intertwingly.net/blog/' + filename
    # The dot is escaped so that only a real ".txt" suffix is rewritten
    # (the unescaped form also turned e.g. "mytxt" into "m.html").
    url = re.sub(r'\.txt$', '.html', url)

    arg = {}
    arg['url'] = url
    arg['blog_name'] = 'Sam Ruby'
    arg['title'] = title.encode('utf-8')
    arg['excerpt'] = body.encode('utf-8')
    return url, urlencode(arg)
class link(sgmllib.SGMLParser):
    """ source -> list of trackbacks, list of pingbacks

    Parse a given html page, and retrieve the trackbacks associated with
    pages referenced via href found.

    After construction:
      trackbacks -- unique trackback values returned by backrefs(), in
                    first-seen order
      pingbacks  -- unique pingback values returned by backrefs(), in
                    first-seen order
      url, args  -- the entry url and urlencoded arguments from excerpt()
    """

    def __init__(self, name, title, body):
        """ Compute the ping arguments, then scan body for anchors. """
        sgmllib.SGMLParser.__init__(self)
        self.trackbacks = []
        self.pingbacks = []
        self.title = title
        (self.url, self.args) = excerpt(name, title, body)
        # Feeding the body makes the parser call start_a for each <a> tag.
        self.feed(body)

    def start_a(self, attrs):
        """ Collect the back-references advertised by an anchor's target. """
        href = dict(attrs).get('href')
        if href is None:
            return
        try:
            trackbacks, pingbacks = backrefs(href)
        except Exception:
            # Best effort: a link that cannot be fetched or parsed simply
            # contributes nothing.  Unlike a bare "except:", this lets
            # KeyboardInterrupt and SystemExit through.
            return
        for found in trackbacks:
            if found not in self.trackbacks:
                self.trackbacks.append(found)
        for found in pingbacks:
            if found not in self.pingbacks:
                self.pingbacks.append(found)
# Patterns that backrefs() applies to the text of a fetched page.
# NOTE(review): both literals look truncated.  As written, '()' matches the
# empty string at every position and ' ' matches a single space; presumably
# markup inside the literals (an embedded RDF block for tb_re, a pingback
# link element for pb_re) was lost when this file was extracted -- restore
# them from the upstream copy before relying on this module.
# tb_re: every match is handed to the rdf SAX handler by backrefs().
tb_re=re.compile('()')
# pb_re: backrefs() uses the first match as the pingback server address.
pb_re=re.compile(' ')
def backrefs(href):
    """ href -> ([trackbacks],[pingbacks])

    Parse a given html page, and retrieve the rdf:about, X-Pingback header,
    or pingback link information associated with a given href.  At most
    one is returned (in the above priority).

    href -- link target; any "#fragment" is dropped before fetching

    Returns (trackback, pingback): trackback is an empty or one-element
    list taken from rdf.ids, pingback is an empty or one-element list
    holding an (href, server) tuple.  Errors raised while fetching the
    page propagate to the caller.
    """
    base = href.split("#")[0]
    page = urllib.urlopen(base)
    try:
        info = page.info()
        data = page.read().replace('\n', ' ')
    finally:
        # Release the connection even when reading the response fails.
        page.close()

    trackback = []
    # Pingback advertised in the page text; only the first match counts.
    pingback = pb_re.findall(data)[:1]

    # Each candidate fragment is parsed for its side effect of filling
    # rdf.ids; fragments that are not well-formed XML are skipped.
    for fragment in tb_re.findall(data):
        try:
            parseString(fragment, rdf())
        except SAXParseException:
            pass

    # An X-Pingback response header overrides what was found in the text.
    if "X-Pingback" in info:
        pingback = [info["X-Pingback"]]

    # rdf.ids is class-level, so identifiers collected from earlier pages
    # are consulted here as well.
    if href in rdf.ids:
        trackback = [rdf.ids[href]]
    if not trackback and not pingback and href.find("#") > 0:
        # Nothing is registered for the full href: try the fragment-less url.
        if base in rdf.ids:
            trackback = [rdf.ids[base]]

    # A trackback takes priority over any pingback.
    if trackback:
        pingback = []
    if pingback:
        pingback = [(href, pingback[0])]
    return (trackback, pingback)
class rdf(ContentHandler):
    """ xml -> dictionary of {dc:identifier => trackback:ping|rdf:about}

    Parse a given html page, and retrieve the rdf:about information associated
    with a given href.
    """

    # Class-level, hence shared by all handler instances; backrefs() reads
    # the collected entries as rdf.ids.
    ids = {}

    def startElement(self, name, attrs):
        """ Record the ping address of an rdf:Description element.

        name  -- qualified element name reported by the SAX parser
        attrs -- the element's attributes (anything dict() accepts)

        Elements other than rdf:Description, and descriptions without an
        identifier, are ignored.
        """
        if name != 'rdf:Description':
            return
        attrs = dict(attrs)
        # Accept the misspelt attribute name too; when both spellings are
        # present the misspelt one wins, as before.
        if 'dc:identifer' in attrs:
            attrs['dc:identifier'] = attrs['dc:identifer']
        if 'dc:identifier' not in attrs:
            return
        # The first attribute present, in this priority order, is stored.
        for key in ('trackback:ping', 'about', 'rdf:about'):
            if key in attrs:
                self.ids[attrs['dc:identifier']] = attrs[key]
                break
def trackback(parser):
    """ parser -> None

    Ping all trackbacks encountered with the url, title, blog_name, and
    excerpt.

    parser -- object providing .trackbacks (ping urls) and .args (the
              urlencoded ping arguments)

    Each ping is announced on stdout together with the server's reply.  A
    failing ping is reported on stderr and does not stop the remaining ones.
    """
    for url in parser.trackbacks:
        try:
            print("*** Trackback " + url)
            if url.find('?tb_id=') >= 0:
                # Ping urls carrying a ?tb_id= query get the arguments
                # appended to that query string.
                reply = urllib.urlopen(url + "&" + parser.args)
            else:
                # Everything else gets the arguments as a form POST.
                import urllib2
                request = urllib2.Request(url, parser.args)
                request.add_header("User-agent",
                    "http://www.intertwingly.net/code/autoping.py")
                request.add_header("Content-type",
                    "application/x-www-form-urlencoded; charset=utf-8")
                reply = urllib2.urlopen(request)
            try:
                print(reply.read())
            finally:
                # Close the response even if reading it fails.
                reply.close()
        except Exception:
            # Best effort: report and carry on with the next server.  %r keeps
            # the message printable whatever the url or error text contains.
            sys.stderr.write("*** Trackback failed for %r: %r\n"
                             % (url, sys.exc_info()[1]))
def pingback(parser):
    """ parser -> None

    Ping all pingbacks encountered with the source and targets

    parser -- object providing .pingbacks, a sequence of (target, server)
              pairs, and .url, the source url

    NOTE: the ping call itself is commented out in this file, so at present
    each server is only announced on stdout and an XML-RPC proxy is built
    for it.  A failure is reported on stderr and does not stop the loop.
    """
    for target, endpoint in parser.pingbacks:
        try:
            print("")
            print("*** Pingback " + endpoint)
            server = xmlrpclib.Server(endpoint)
            # print(server.pingback.ping(parser.url, target))
        except Exception:
            # Best effort: report and carry on with the next server.
            sys.stderr.write("*** Pingback failed for %r: %r\n"
                             % (endpoint, sys.exc_info()[1]))
# Script entry point: every command-line argument names one blog entry.
if __name__ == '__main__':
    # Register the directory that the `entry` module is imported from once,
    # rather than prepending it again for every argument.
    sys.path.insert(0, "/home/rubys/mombo")
    for name in sys.argv[1:]:
        try:
            from entry import post
            entry = post(name.replace(".txt", ".html"))
            parser = link(entry.link(), entry.title(), entry.body())
            trackback(parser)
            pingback(parser)
        except Exception:
            # Best effort per entry: report the problem and move on to the
            # next argument instead of failing silently.
            sys.stderr.write("*** autoping failed for %r: %r\n"
                             % (name, sys.exc_info()[1]))