from xml.sax import make_parser, handler from xml.dom import XHTML_NAMESPACE from xml.sax.handler import ContentHandler Atom_NAMESPACE = u'http://www.w3.org/2005/Atom' textConstructs = [u'title', u'summary', u'content'] # extract id, text, and some links from an Atom entry. Assumes a highly # normalized entry (e.g., no type="html"). Note: links aren't currently used. class AtomParser(ContentHandler): def __init__(self): self.text = [] self.id = u'' self.links = [] self.intext = self.inid = False def startElementNS(self, (ns,name), local_name, attrs): if ns==Atom_NAMESPACE: if name in textConstructs: self.intext = True if name == u'id' and not self.id: self.inid = True elif ns==XHTML_NAMESPACE: if name == u'a': attrs = dict(attrs) if u'href' in attrs: self.links.append(attrs[u'href']) def endElementNS(self, (ns,name), local_name): if self.intext and self.text and not self.text[-1].isspace(): self.text.append(u' ') if ns==Atom_NAMESPACE: if name in textConstructs: self.intext = False if name == u'id': self.inid = False def characters(self, string): if self.inid: self.id += string if self.intext and not string.isspace(): self.text.append(string) # initialize database import apsw, os connection=apsw.Connection('blog.db') cursor = connection.cursor() if not list(cursor.execute("select * from SQLite_Master where name='post'")): # new database: create tables and index cursor.execute('create table post(name, mtime integer, id)') cursor.execute('create virtual table post_text using fts3(text)') cursor.execute('create index post_name on post (name)') # for performance cursor.execute('pragma synchronous = off') # create a SAX parser parser = make_parser() parser.setFeature(handler.feature_namespaces, True) # parse each blog entry transaction = 0 os.chdir('/var/www/atom') from glob import glob for name in glob('*.atom'): try: # get filesystem and db modification times mtime = int(os.stat(name).st_mtime) prev = list(cursor.execute('select mtime,rowid from post where name=?', (name,))) if prev and prev[0][0] == mtime: continue # unchanged if not transaction: cursor.execute('begin') transaction += 1 if prev: # cleanup cursor.execute(''' delete from post where rowid= ?; delete from post_text where rowid=?''', (prev[0][1], prev[0][1])) # parse post, insert into database gatherer = AtomParser() parser.setContentHandler(gatherer) parser.parse(open(name)) cursor.execute(''' insert into post (name, mtime, id) values(?, ?, ?); insert into post_text (rowid, text) values (last_insert_rowid(), ?); ''', (name, mtime, gatherer.id, u''.join(gatherer.text))) # flush every 50 entries if transaction > 50: cursor.execute('commit') transaction = 0 except: import traceback, sys print ''.join(apply(traceback.format_exception, sys.exc_info())) # complete any open transaction if transaction: cursor.execute('commit') # print a list of blog entries that contain the word 'venus' for (id,) in cursor.execute('''select id from post join post_text on post.rowid = post_text.rowid where post_text match ? order by name''', (u'venus',)): print id # close the database connection.close()