"""Rank the links most cited by recently cached feed entries.

Usage: run with the cache directory as the first command-line argument.

Every file in the cache directory that was modified within the past week
is parsed as an Atom entry.  Each entry casts a time-weighted vote for
every link it contains, each feed is counted once per link, and the ten
highest scoring links are written to standard output as a JSON array of
``[link, title, [[weight, entry, entry_title], ...]]`` items.

This is Python 2 code (sgmllib, urlparse); the json module requires
Python 2.6 or later.
"""

import glob
import json
import os
import re
import sgmllib
import sys
import time
import urllib
import urlparse
from xml.sax.saxutils import escape

import libxml2

# make the planet checkout importable before importing anything from it
sys.path.insert(0, '/home/rubys/bzr/venus')
import planet
from planet import feedparser
from planet.spider import filename

now = time.time()
week = 7 * 86400
week_ago = now - week

cache = os.path.join(sys.argv[1], '*')

# canonical link -> list of (weight, entry, feed, title) votes
all_links = {}


def canonicalize(url):
    """Return url with a lower-cased scheme and host and a non-empty path."""
    parts = list(urlparse.urlparse(url))
    parts[0] = parts[0].lower()
    parts[1] = parts[1].lower()
    if not parts[2]:
        parts[2] = '/'
    return urlparse.urlunparse(parts)


def _extract_entry(doc, xp):
    """Return (entry, title, feed, links) for a parsed entry, or None.

    None means the entry cannot vote: it has no usable alternate link or
    its source feed cannot be identified.  Everything returned is a plain
    Python value, so it stays valid after the document is freed.
    """
    # determine the entry
    alternate = xp.xpathEval("//atom:link[@rel='alternate']")
    if not alternate:
        return None
    href = alternate[0].prop("href")
    if not href:
        # an alternate link without an href cannot be canonicalized
        return None
    entry = canonicalize(href)

    # determine the title, crudely stripping markup from html titles
    title = xp.xpathEval("//atom:title")
    if title:
        if title[0].prop('type') == 'html':
            title = re.sub('<.*?>', '', title[0].content)
        else:
            title = title[0].content
    title = str(title or 'Untitled')

    # determine the feed: prefer the source id, then its self link, then
    # any source link that has an href
    feed = xp.xpathEval("//atom:source/atom:id")
    if feed:
        feed = feed[0].content
    else:
        feed = xp.xpathEval("//atom:source/atom:link[@href and @rel='self']")
        if not feed:
            feed = xp.xpathEval("//atom:source/atom:link[@href]")
        if not feed:
            return None
        feed = feed[0].prop('href')

    # identify the unique links, ignoring those that describe the source
    links = set()
    for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
        if node.parent.name == 'source':
            continue
        links.add(canonicalize(node.prop('href')))

    return str(entry), title, str(feed), links


def _read_entry(name):
    """Parse the cache file name and return _extract_entry()'s result.

    libxml2 objects are not garbage collected, so both the XPath context
    and the document are released explicitly on every path out.
    """
    doc = libxml2.parseFile(name)
    try:
        xp = doc.xpathNewContext()
        try:
            xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
            return _extract_entry(doc, xp)
        finally:
            xp.xpathFreeContext()
    finally:
        doc.freeDoc()


for name in glob.glob(cache):
    # ensure that this is a file modified within the past week
    if os.path.isdir(name):
        continue
    mtime = os.stat(name).st_mtime
    if mtime < week_ago:
        continue

    found = _read_entry(name)
    if found is None:
        continue
    entry, title, feed, entry_links = found

    # add the votes: the weight falls off quadratically with age, from 1.0
    # for a brand new entry down to 0.0 for one that is a week old
    vote = (1.0 - (now - mtime)**2 / week**2, entry, feed, title)
    for link in entry_links:
        all_links.setdefault(link, []).append(vote)

# tally the votes: each feed counts once per link, using the lowest weight
# among that feed's votes (capped at 1)
weighted_links = []
for link, votes in all_links.items():
    site = {}
    for weight, entry, feed, title in votes:
        site[feed] = min(site.get(feed, 1), weight)
    weighted_links.append((sum(site.values()), link))
weighted_links.sort(reverse=True)


# determine the title for a given url
class html(sgmllib.SGMLParser):
    """Fetch url and expose the best available title as self.title.

    The page <title> text is collected first.  If the page advertises a
    feed through an alternate link whose type ends in 'xml', that feed is
    parsed and the title of the entry whose link equals url replaces it.
    When nothing is found, the last path segment of url is used.
    """

    def __init__(self, url):
        sgmllib.SGMLParser.__init__(self)
        self.title = ""
        self.feedurl = ""
        self.intitle = False
        # base for resolving a relative autodiscovery href
        self.base_url = url

        # fetching and parsing the page is best effort: any failure just
        # leaves whatever was collected so far
        try:
            page = urllib.urlopen(url)
            try:
                self.feed(page.read())
            finally:
                page.close()
        except Exception:
            pass

        # if there is a feed, look for an entry that matches, and take
        # that title
        if self.feedurl:
            data = feedparser.parse(self.feedurl)
            for entry in data.entries:
                if entry.has_key('link') and entry.link == url:
                    if entry.has_key('title_detail'):
                        self.title = entry.title_detail.value
                        if entry.title_detail.type == 'text/plain':
                            self.title = escape(self.title)
                        break

        # fallback is the basename of the URI
        if not self.title:
            self.title = url.split('/')[-1]

    # parse out the first autodiscovery link
    def start_link(self, attrs):
        if self.feedurl:
            return
        attrs = dict([(key.lower(), value) for key, value in attrs])
        if 'rel' not in attrs:
            return
        rels = attrs['rel'].split(' ')
        if 'alternate' not in rels:
            return
        if 'type' not in attrs or not attrs['type'].endswith('xml'):
            return
        if 'href' in attrs:
            # the href may be relative to the page that was fetched
            self.feedurl = urlparse.urljoin(self.base_url, attrs['href'])

    # parse the page title
    def start_title(self, attributes):
        if not self.title:
            self.intitle = True

    def end_title(self):
        self.intitle = False

    def handle_data(self, text):
        if self.intitle:
            self.title += text


# convert a string to a json string
def toj(value):
    """Return value encoded as a JSON string literal.

    Byte strings are taken to be UTF-8; undecodable bytes (for example
    from a page title in another encoding) are replaced rather than
    aborting the run.
    """
    if isinstance(value, str):
        value = value.decode('utf-8', 'replace')
    return json.dumps(value)


# collect the results
results = []
for total, link in weighted_links[:10]:
    votes = all_links[link]
    votes.sort(reverse=True)

    links = []
    for weight, entry, feed, title in votes:
        links.append(' [%f, %s, %s]' %
                     (weight, toj(entry), toj(title.decode('utf-8').strip())))

    cache_file = filename(sys.argv[1], link)
    title = None

    # when possible, take the title from the cache
    if os.path.exists(cache_file):
        entries = feedparser.parse(cache_file).entries
        if entries and entries[0].has_key('title_detail'):
            title = entries[0].title_detail.value
            if entries[0].title_detail.type == 'text/plain':
                title = escape(title)

    # otherwise, parse the html
    if not title:
        title = html(link).title

    results.append(' [%s, %s, [\n' % (toj(link), toj(title.strip())) +
                   ',\n'.join(links) + ']]')

# the space before the closing bracket is part of the established output
sys.stdout.write('[\n' + ',\n\n'.join(results) + ' \n]\n')