#!/usr/bin/python2.4
import sys
# Progress tracing is switched on by passing the bare word 'trace' on the
# command line.  The word is removed from sys.argv again so that the
# remaining arguments can be consumed as-is further down.
trace = 'trace' in sys.argv
if trace:
    sys.argv.remove('trace')
    print("+++ begin trace")
    sys.stdout.flush()
# import patch22
import patch25
from xml.dom import minidom
import os, re, sys, sgmllib, time, urllib, urlparse
from xml.sax.saxutils import escape
from atomef import unescape
atomns = 'http://www.w3.org/2005/Atom'
import technorati
from config import directory
from post import writeComment, existingBacklink, sanitize
from entry import post
try:
import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
pass
# Module-level setup; the statements below run once, in this order.

# Make modules that live in the current working directory importable.
if os.getcwd() not in sys.path: sys.path.insert(0, os.getcwd())
# State file in the user's home directory; it is rewritten at the very end
# of the run with the bookmark for the next scan.
newrefs=os.path.join(os.path.expanduser('~'),'.mltfo')
# NOTE(review): no other use of this name is visible in this part of the
# file -- confirm whether it is still needed.
newlastupdatetechno=os.path.join(os.path.expanduser('~'),'.techno')
# Switch to the configured log directory (presumably so the access logs can
# be opened by relative name -- confirm against how `scan` is built).
os.chdir(directory.log)
# Final path components that html.start_a accepts as pointing at a feed
# when an <a href> stays inside the page's own directory.
common_feed_names = ['atom.xml', 'rss.xml', 'index.xml', 'index.rdf', '?flav=rss', 'backend.php', 'GetRss?', 'blogger_rss.xml', 'rss']
# parse an HTML page, looking for feeds
class html(sgmllib.SGMLParser):
    """Download an HTML page and record its feed URL and title.

    All the work happens in the constructor; afterwards read:

    feedurl -- the first feed reference met while parsing, or None.  A
               reference is either a <link rel="alternate"> whose type ends
               in "xml", or an <a> whose href stays in the page's own
               directory and ends in one of common_feed_names.
    title   -- character data collected from <title>; "" if none was seen.

    Fetching and parsing are best effort: any failure simply leaves the
    attributes at whatever was gathered up to that point.
    """

    def __init__(self, url):
        """Fetch and parse `url`; an empty url skips the fetch entirely."""
        url = url or ''
        self.feedurl = None
        # Directory part of the page URL, lower-cased for comparison with
        # anchors.  The search for '/' starts at offset 8 so that the
        # slashes of "http://" / "https://" are never picked.
        self.urlbase = url[:url.rfind('/', 8) + 1].lower()
        self.intitle = False
        self.title = ""
        sgmllib.SGMLParser.__init__(self)
        if not url:
            return
        try:
            page = urllib.URLopener().open(url)
            try:
                self.feed(page.read())
            finally:
                page.close()
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            # deliberate best effort: unreachable or unparsable pages just
            # yield no feed and no title
            pass

    def start_link(self, attrs):
        """Accept <link rel="alternate" type="...xml" href="..."> as a feed."""
        attrs = dict([(name.lower(), value) for (name, value) in attrs])
        # link types and media types are case-insensitive in HTML
        if 'alternate' not in attrs.get('rel', '').lower().split():
            return
        if not attrs.get('type', '').lower().endswith('xml'):
            return
        if 'href' in attrs and not self.feedurl:
            self.feedurl = attrs['href']

    def start_a(self, attrs):
        """Accept an anchor to a commonly named feed file beside the page."""
        if self.feedurl:
            return
        attrs = dict([(name.lower(), value) for (name, value) in attrs])
        href = attrs.get('href')
        if not href:
            return
        # Compare case-insensitively, but keep the href exactly as written:
        # URL paths are case-sensitive, so a lower-cased copy may not resolve.
        folded = href.lower()
        cut = folded.rfind('/', 8) + 1
        if folded[:cut] != self.urlbase:
            return
        if folded[cut:] in [name.lower() for name in common_feed_names]:
            self.feedurl = href

    def do_title(self, attrs):
        """Start collecting title text unless some was collected already."""
        if self.title == "":
            self.intitle = True

    def unknown_starttag(self, tag, attrs):
        """Any tag without a handler of its own ends title collection."""
        self.intitle = False

    def unknown_endtag(self, tag):
        """Any end tag without a handler (</title> included) ends collection."""
        self.intitle = False

    def handle_charref(self, ref):
        """Keep numeric character references inside the title intact."""
        # `ref` arrives without its "&#" prefix and ";" suffix; put both
        # back so the reference is not reduced to bare digits.
        if self.intitle:
            self.title = self.title + ("&#%s;" % ref)

    def handle_data(self, text):
        """Append character data to the title while inside <title>."""
        if self.intitle:
            self.title = self.title + text
# return the text associated with a given DOM node
def text(element, tag):
    """Return the value of the first `tag` element found under `element`.

    The element is looked up by plain tag name first and in the Atom
    namespace second; "" is returned when neither lookup finds anything.
    How the value is produced depends on the element's attributes:

    * mode="xml", or a type containing "xhtml": the inner markup
    * a type containing "html": the character data, unescaped
    * content/summary with no type, or type "plain": the inner markup
    * anything else: the character data of the direct children
    """
    matches = element.getElementsByTagName(tag)
    if not matches:
        matches = element.getElementsByTagNameNS(atomns, tag)
    if not matches:
        return ""

    node = matches[0]
    attrs = dict(node.attributes)

    def chardata():
        # children without a `data` attribute (i.e. elements) contribute ''
        return "".join([getattr(child, 'data', '') for child in node.childNodes])

    if 'mode' in attrs and attrs['mode'].value == 'xml':
        return innerxml(element, node.namespaceURI, tag)

    kind = None
    if 'type' in attrs:
        kind = attrs['type'].value
    if kind is not None:
        if 'xhtml' in kind:
            return innerxml(element, node.namespaceURI, tag)
        if 'html' in kind:
            return unescape(chardata())

    if tag in ['content', 'summary'] and (kind is None or kind in ['xhtml', 'plain']):
        return innerxml(element, node.namespaceURI, tag)
    return chardata()
# return the innerxml associated with a given DOM node
def innerxml(element, ns, tag):
    """Return the serialized markup inside the first `ns`:`tag` element.

    Returns "" when no such element exists under `element`.  If the
    element's only child is a <div>, the contents of that div are returned
    instead, so a wrapping div is not part of the result.
    """
    found = element.getElementsByTagNameNS(ns, tag)
    if not found:
        return ""

    node = found[0]
    children = node.childNodes
    if len(children) == 1 and children[0].nodeName == 'div':
        node = children[0]

    # serialize the node, then cut away its own start and end tags
    markup = node.toxml()
    first = markup.find('>') + 1
    last = markup.rfind('<')
    return markup[first:last]
# Pull the pieces of one feed entry/item out of its DOM node.  The caller
# unpacks the result as (title, alternate, summary, content); the return
# statement itself is not visible in this copy (see the NOTE(review) near
# the end of this block).
#   entry -- DOM element of the entry/item
#   base  -- URL against which relative links are resolved; an xml:base
#            attribute on the entry is applied on top of it
def extract(entry, base=''):
attrs=dict(entry.attributes)
if 'xml:base' in attrs:
if base:
base=urlparse.urljoin(base,attrs['xml:base'].value)
else:
# NOTE(review): this branch stores the attribute node itself, while the
# branch above uses its .value -- looks like a missing ".value"; confirm.
base = attrs['xml:base']
title=text(entry,'title')
# ref: text of the entry's <link> element; alternate: the entry's own URL
ref=''
alternate = None
# an rdf:about attribute on the entry is taken as its URL
if 'rdf:about' in dict(entry.attributes):
alternate=dict(entry.attributes)['rdf:about'].value
# otherwise a <guid> serves, provided isPermaLink is absent ('') or 'true'
if not alternate:
guid=entry.getElementsByTagName('guid')[:1]
if guid and guid[0].getAttribute('isPermaLink') in ('','true'):
alternate=text(entry,'guid')
# otherwise the href of the first <link> (in the entry's own namespace) whose
# rel is absent or 'alternate' and whose type is absent or mentions 'html'
for link in entry.getElementsByTagNameNS(entry.namespaceURI, 'link'):
attrs=dict(link.attributes)
# print [(key, attrs[key].value) for key in attrs.keys()]
if (not 'rel' in attrs) or attrs['rel'].value=='alternate':
if (not 'type' in attrs) or attrs['type'].value.find('html')>=0:
if 'href' in attrs: alternate=alternate or attrs['href'].value
ref = ref or text(entry,'link')
# last resort: the text of the <link> element
alternate = alternate or ref
if alternate and base: alternate=urlparse.urljoin(base,alternate)
# site-specific rewrites of the entry URL follow
# blogdex: swap the route.asp redirector for track.asp
if alternate and alternate.startswith("http://blogdex.net/route.asp?"):
alternate=alternate.replace("/route.asp?","/track.asp?")
# del.icio.us feeds other than inboxes and the /rss feed: when the link
# mentions "intertwingly", keep it in ref and replace alternate with the
# del.icio.us/url/<md5 hex digest of the link> address
# NOTE(review): alternate can still be None here, in which case
# alternate.find would raise -- confirm whether that can happen in practice.
if base.startswith("http://del.icio.us/") and base.find("/inbox/")<0:
if base<>'http://del.icio.us/rss' and alternate.find("intertwingly")>=0:
ref=alternate
from md5 import md5
alternate="http://del.icio.us/url/%s" % md5(alternate).hexdigest()
# reddit: turn goto redirects into the address of the comments page
if alternate and alternate.startswith("http://programming.reddit.com/goto?rss=true&id="):
id = alternate.split('=',2)[2]
alternate = 'http://programming.reddit.com/info/%s/comments' % id
if alternate and alternate.startswith("http://reddit.com/goto?rss=true&id="):
# only the part of the id after the last '_' is kept
id = alternate.split('=',2)[2].split('_')[-1]
alternate = 'http://programming.reddit.com/info/%s/comments' % id
summary=text(entry,'summary') or text(entry,'description')
# content falls back through an xhtml <body>, content:encoded, then summary
content=(text(entry,'content') or
innerxml(entry,'http://www.w3.org/1999/xhtml','body') or
text(entry,'content:encoded') or summary)
# append the link text so that it is part of the content the caller scans
if ref: content = content + " " + ref
if base and base.startswith("http://archipelago.phrasewise.com"):
# NOTE(review): the next line is damaged in this copy of the file: a string
# literal opens after re.compile( and the text then jumps straight into a
# time.strftime comparison.  The rest of extract() and the beginning of the
# log-scan setup (presumably where `scan`, `start` and `lastTR` are defined)
# cannot be recovered from here -- restore them from a clean copy.
match=re.compile('\[(.*)\] .* time.strftime('%Y%m%d'):
scan.append((time.strftime('%Y%m%d'),'00:00:00'))
# logfile pattern
pattern = re.compile(r'(.*?) .*? \[(.*?)\] "(.*?)" (.*?) (\S+) "(.*?)" "(.*?)"')
#                     ip          timestamp  url    status size  referer browser

# Referers containing any of these are never followed up.  They are matched
# with find() > 0, i.e. anywhere except at the very start of the referer.
ignored_referers = [
    '/search?',
    'intertwingly.net',
    'bloglines.com/myblogs_display',
    'thauvin.net',
    'diveintomark.blogspot.com',
    'bolli.homeip.net',
    'fozbaca.org/blagg',
    'feeds.diveintomark.org',
    '/aggsome.cgi/',
    '20six.co.uk',
    '/mediajunkie.com/',
    '/treesalive.com/',
    'automated.adsensemoney.net',
    'dcostanet.net/rss',
]

# request paths of the form /blog/<digits>.<something> carry the entry id
blog_entry = re.compile(r'/blog\/(\d+)\.')

# scan referers for references to specific blog entries
if trace: print("+++ collect referers")
# referers given on the command line are always included
referers = sys.argv[1:]
for (logfile, bookmark) in scan:
    # prefer the compressed log; skip the day when neither form exists
    if os.path.exists(logfile + ".log.gz"):
        import gzip
        log = gzip.open(logfile + ".log.gz")
    elif os.path.exists(logfile + ".log"):
        log = open(logfile + ".log")
    else:
        continue
    if trace: print("... " + logfile)
    # request path -> entry id, so each path is resolved once per log
    entrycache = {}
    try:
        for line in log:
            # skip lines stamped before the bookmark: the 8 characters after
            # the first ':' are compared as text (presumably HH:MM:SS)
            cursor = line.find(':')
            if bookmark > line[cursor+1:cursor+9]: continue
            # cheap pre-filter before running the regular expression
            if line.find(' 200 ') < 0: continue
            fields = pattern.search(line)
            if not fields: continue  # not in the expected log format
            (ip, ts, url, status, size, refer, browser) = fields.groups()
            if status != '200': continue
            if refer == '-': continue
            if refer.startswith('.'): continue
            if [part for part in ignored_referers if refer.find(part) > 0]:
                continue

            # the request field is "METHOD path PROTOCOL"; keep the bare path
            request = url.split(' ')
            if len(request) < 2: continue  # malformed request field
            url = request[1].split('#')[0].split('?')[0]

            if url in entrycache:
                entry = entrycache[url]
            else:
                numbered = blog_entry.match(url)
                if numbered:
                    entry = numbered.group(1)
                else:
                    entry = post(url).id()
                if not entry or not entry.isdigit(): continue
                entrycache[url] = entry

            refer = refer.split('#')[0]
            refer = refer.split('?')[0]
            if refer[0:2] == '//': refer = 'http:' + refer
            if trace: print("??? %s %s" % (entry, refer))
            # keep each http referer once, and only when the entry has no
            # backlink from it yet
            if refer[0:7] == 'http://':
                if not refer in referers:
                    if not existingBacklink(entry, refer):
                        referers.append(refer)
    finally:
        log.close()
# scan each unique referer for feeds
if trace: print("+++ fetch referers")
os.chdir(directory.data)
feeds = []

# referers that are never fetched: these are matched at the start ...
skipped_prefixes = [
    'http://www.google.',
    'http://search.msn.com',
    'http://search.live.com',
    'http://xmlns.com/foaf',
]
# ... and these anywhere after the first character (find() > 0)
skipped_fragments = [
    '8z21-7pie.blogspot.com',
    'correctserver.com',
    'getfirefoxbrowsers.com',
    'javablogs.com/Jump',
]

for refer in referers:
    if [prefix for prefix in skipped_prefixes if refer.startswith(prefix)]:
        continue
    if [part for part in skipped_fragments if refer.find(part) > 0]:
        continue
    if trace: print(refer)
    try:
        feedurl = html(refer).feedurl
        # no feed advertised by the page: nothing to record.  This has to
        # be checked before any string method is called on feedurl.
        if not feedurl: continue
        if feedurl.startswith('feed://'): feedurl = "http" + feedurl[4:]
        if feedurl.find('ken.coar.org/blog/index.rss') > 0:
            feedurl += '?words=all&sanitise=false'
        if feedurl.find('rr.bloghackers.net') > 0: continue
        # a page advertising this blog's own feed is reported, not skipped
        if feedurl.find('intertwingly.net/blog/index.rss') > 0: print(refer)
        # resolve relative urls
        feedurl = urlparse.urljoin(refer, feedurl)
        if not feedurl in feeds: feeds.append(feedurl)
        # for this one site the main feed is queued next to the comments feed
        if feedurl == 'http://weblog.philringnalda.com/comments/feed/':
            feedurl = 'http://weblog.philringnalda.com/feed/'
            if not feedurl in feeds: feeds.append(feedurl)
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # best effort: one bad referer must not stop the scan, but say so
        # when tracing instead of dropping the error without a word
        if trace:
            import traceback
            print(traceback.format_exc())
# add in technorati found links since last scan
if trace: print("+++ fetch technorati")
try:
    items = technorati.getCosmos('http://www.intertwingly.net')['inbound']
except (KeyboardInterrupt, SystemExit):
    raise
except:
    # best effort: if the lookup fails, carry on without technorati results
    items = []

for item in items:
    # only links created after the previous scan that carry an rssurl key
    if not (item['linkcreated'] > lastTR and item['weblog'].has_key('rssurl')):
        continue
    rssurl = item['weblog']['rssurl']
    # the key may be present with an empty value, so test before calling
    # string methods on it
    if rssurl and rssurl.find('thauvin.net') > 0: continue
    # nothing supplied: look for a feed on the weblog's front page instead
    if not rssurl: rssurl = html(item['weblog']['url'] + '/').feedurl
    if not rssurl: continue
    if rssurl.find('/mediajunkie.com/') > 0: continue
    if trace: print(rssurl)
    if not rssurl in feeds: feeds.append(rssurl)

# remember the newest link seen; this is written out with the bookmark
lastTR = max([seen['linkcreated'] for seen in items] + [lastTR])
# scan the unique rss feeds encountered
if trace: print("+++ fetch feeds")
it_href = re.compile('intertwingly.net/blog/([-a-zA-Z0-9/]+)')
slides_href = re.compile('intertwingly.net/slides/([0-9]+/[-a-zA-Z0-9]+)')
stories_href = re.compile('intertwingly.net/stories/([0-9]+/[0-9]+/[0-9]+/[-.a-zA-Z0-9]+)')

# link targets that are not entry numbers but whose entry number is known
known_targets = {
    '2005/xmlconf': "2117",
    '2004/devcon': "1868",
    '2005/08/09/rails_example.rb': "2046",
    '2005/etcon': "1926",
    '2005/fosssl/keynote.html': "2060",
    '2005/rs': "1945",
    '2006/07/30/expatparser.rb': "2361",
    '2006/AtomInASeaOfRSS2': "2381",
    '2006/AtomInASeaOfRSS': "2230",
    '2006/etcon': "2183",
    '2006/npuc': "2355",
    '2007/05/02/msft.html4': "2558",
    '2007/05/02/msft.html': "2558",
    '2007/09/11/toucan.html': "2662",
}

# used to strip markup from excerpts
any_tag = re.compile(u'<.*?>', re.S)

for url in feeds:
    # feed urls lifted out of markup may still carry an escaped ampersand
    url = url.replace('&amp;', '&')
    if trace: print(url)
    try:
        response = urllib.urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()
        if not data: continue
        feed = minidom.parseString(data)
        # name of the linking site: the feed title, else the linked page's
        blog = text(feed, 'title') or html(text(feed, 'link')).title
        attrs = dict(feed.documentElement.attributes)
        if 'xml:base' in attrs:
            base = urlparse.urljoin(url, attrs['xml:base'].value)
        else:
            base = url
        # Atom entries if there are any, otherwise RSS items
        entries = feed.getElementsByTagNameNS(atomns, 'entry')
        entries = entries or feed.getElementsByTagName('item')
        for entry in entries:
            try:
                (title, alternate, summary, content) = extract(entry, base)
                if not alternate: continue
                if text(entry, 'id').startswith('tag:planet.intertwingly.net'): continue

                # Escape the entry link exactly once.  Doing this inside
                # the target loop would escape it again for every further
                # target the same entry links to.
                link = escape(alternate)
                if link.find('intertwingly.net/blog') > 0: continue
                if link.find('javablogs.com/Jump') > 0: continue

                targets = (it_href.findall(content) +
                           slides_href.findall(content) +
                           stories_href.findall(content))
                for target in targets:
                    if target.endswith('/'): target = target[:-1]
                    # anything that is not already an entry number is mapped
                    # through the table above or resolved via post()
                    if not target.isdigit():
                        if target in known_targets:
                            target = known_targets[target]
                        else:
                            target = post(target).id()
                    if not target: continue
                    if trace: print(target)
                    if target.startswith('index'): continue
                    if target.startswith('comments'): continue
                    if existingBacklink(target, link): continue

                    # remove html and truncate excerpt description to 250 chars
                    edesc = any_tag.sub(' ', summary or content)
                    edesc = re.sub(r'\s+', ' ', edesc)
                    if len(edesc) > 250: edesc = edesc[:edesc.rfind(' ', 0, 250)][:250]

                    # an Atom <source> title takes precedence over the feed title
                    source = entry.getElementsByTagNameNS(atomns, 'source')
                    if source: source = text(source[0], 'title')

                    if not os.path.exists(directory.data + target + ".txt"): continue

                    # write out the excerpt
                    if link.startswith("http://") or link.startswith("https://"):
                        # The template has to consume all three values; with
                        # only two placeholders the % operator raises
                        # TypeError on every call.
                        # NOTE(review): the anchor markup is a reconstruction
                        # -- confirm it against the deployed comment format.
                        writeComment(target, escape(title.strip()),
                            '%s...\n Excerpt from <a href="%s">%s</a> \n' %
                            (sanitize(unescape(edesc)), link, escape(source or blog)))
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # report the failing entry and move on to the next one
                import traceback
                print(traceback.format_exc())
                print(url)
                try:
                    # these may be unset if extract() itself failed
                    print(title)
                    print(alternate)
                    print(content)
                except Exception:
                    pass
                print("")
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # report the failing feed and move on to the next one
        import traceback
        print(traceback.format_exc())
        print(url)
        print("")
# mark the point at which the next scan is to start
if trace: print("+++ write bookmark")
# one line: the log bookmark immediately followed by the newest technorati
# link timestamp
bookmark_file = open(newrefs, 'w')
try:
    bookmark_file.write(start + lastTR + '\n')
finally:
    # close even if the write fails, so no half-open handle is left behind
    bookmark_file.close()