#!/usr/bin/python2.4
"""Consistency checks over the cached xhtml pages, atom entries and queries.

Run as a script to execute every check in turn; pass -q to silence the
progress messages.
"""

from glob import glob
from StringIO import StringIO
import os

import libxml2

from config import directory

# When true, each check announces itself on stdout before it runs.
verbose = True

# NOTE: every print below is handed one pre-formatted string in parentheses.
# That is still an ordinary Python 2 print statement and writes the same text
# as the comma-separated form, while also parsing on newer interpreters.


def _read_file(path):
    """Return the whole contents of path, closing the handle even on error."""
    handle = open(path)
    try:
        return handle.read()
    finally:
        handle.close()


def checkXhtml():
    """Run all cached xhtml pages through a validating parser.

    Pages are collected from directory.cache ('*.html' plus the
    '200*/*/*/*/index.html' tree).  Checking stops at the first page that
    fails, after printing its name.  Every page that passes has the access
    and modification times it had before being read put back.
    """
    if verbose:
        print("... checking xhtml")

    pages = (glob(directory.cache + '*.html') +
             glob(directory.cache + '200*/*/*/*/index.html'))
    for xhtml in pages:
        stats = os.stat(xhtml)
        data = _read_file(xhtml)

        # startswith() instead of data[0]: an empty page is handed to the
        # parser rather than raising IndexError here.
        if data.startswith('<'):
            # Drop everything up to and including the first '>'.
            # NOTE(review): the original prepended an empty string literal at
            # this point; a replacement prolog (e.g. a DOCTYPE) may have been
            # intended there -- confirm against the upstream copy.
            data = data[data.find('>') + 1:]

        buf = libxml2.inputBuffer(StringIO(data))
        reader = buf.newTextReader(xhtml.split('/')[-1])
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)

        # Read() yields 1 while nodes remain and 0 at a clean end of
        # document; any other value is treated as a failure.
        ret = reader.Read()
        while ret == 1:
            ret = reader.Read()
        if ret != 0:
            print("Error parsing and validating %s" % (xhtml))
            break

        os.utime(xhtml, (stats.st_atime, stats.st_mtime))


def checkWellFormed():
    """Print out a list of entries which have mode='escaped'.

    Every '*.atom' file under directory.atom is parsed; a file containing an
    atom:content element with mode='escaped' is reported on stdout.  A file
    that libxml2 cannot parse makes parseFile raise, which is not caught.
    """
    if verbose:
        print("... checking well formedness")

    for atom in glob(directory.atom + '*.atom'):
        doc = libxml2.parseFile(atom)
        try:
            ctxt = doc.xpathNewContext()
            try:
                ctxt.xpathRegisterNs("atom", "http://purl.org/atom/ns#")
                if ctxt.xpathEval("//atom:content[@mode='escaped']"):
                    print("*** escaped %s" % atom)
            finally:
                # The bindings do not release XPath contexts on their own.
                ctxt.xpathFreeContext()
        finally:
            doc.freeDoc()


def checkQueryCache():
    """Reexecute all cached queries and validate the cached results.

    For each '*.query' file under directory.querycache:

    * if the matching '.list' file is missing, the query file is deleted;
    * otherwise the query is run again through search.search and the sorted
      result is compared with the sorted cached list.  On a mismatch every
      differing entry is printed, prefixed with '+' when it is absent from
      the fresh result and '-' when it is absent from the cached list.  On a
      match the original access/modify times of both files are restored.
    """
    from search import search

    for query_path in glob(directory.querycache + "*.query"):
        list_path = os.path.splitext(query_path)[0] + '.list'
        if not os.path.exists(list_path):
            # A query without a cached result list: discard it.
            os.remove(query_path)
            continue

        stats = (os.stat(query_path), os.stat(list_path))
        query = _read_file(query_path)
        if verbose:
            print("... query %s" % query)

        # retrieve the cached results
        cache = _read_file(list_path).split('\n')
        if cache and not cache[0]:
            del cache[0]
        cache.sort()

        # reexecute the query
        # NOTE(review): the cached list is removed first, presumably so that
        # search() cannot answer from it -- confirm against search.
        os.remove(list_path)
        actual = sorted(search(query))

        if actual != cache:
            # Sets give constant-time membership tests for the report below.
            fresh = set(actual)
            cached = set(cache)
            for entry in sorted(fresh | cached):
                if entry not in fresh:
                    print("+ %s %s" % (entry, query))
                if entry not in cached:
                    print("- %s %s" % (entry, query))
        else:
            # restore access and modify times on cache and list files
            # (assumes search() has written the list file again -- TODO
            # confirm; os.utime raises if it is missing)
            os.utime(query_path, (stats[0].st_atime, stats[0].st_mtime))
            os.utime(list_path, (stats[1].st_atime, stats[1].st_mtime))


if __name__ == "__main__":
    import sys
    verbose = '-q' not in sys.argv[1:]
    checkXhtml()
    checkWellFormed()
    checkQueryCache()