"""Lazy, array-like traversal of XML documents parsed with xml.dom.minidom.

Typical use::

    atom = namespace('http://purl.org/atom/ns#')
    feed = parse(stream)
    for name in feed[atom.entry][atom.author][atom.name]:
        print(name)
"""

from __future__ import generators  # no-op on Python >= 2.3; kept for old interpreters

from xml.dom import minidom, Node

# Small Python 2 / Python 3 compatibility layer for the type checks below.
try:
    _text_type = unicode
    _string_types = (str, unicode)
    _integer_types = (int, long)
except NameError:  # Python 3: str is already the text type, long is gone
    _text_type = str
    _string_types = (str,)
    _integer_types = (int,)


def _qualified(index):
    """Return index as a (namespace-uri, local-name) pair.

    A bare string is treated as a name in no namespace; anything else is
    assumed to already be such a pair and is returned unchanged.
    """
    if isinstance(index, _string_types):
        return (None, index)
    return index


def parse(stream):
    """Parse a stream into a lazydom wrapping the document's root element.

    stream is handed unchanged to minidom.parse.
    """
    return lazydom(minidom.parse(stream).documentElement)


class namespace(object):
    """Helper class: combines a namespace URI with local names.

    Attribute access builds (uri, local-name) pairs suitable for indexing a
    lazydom.  Underscores become hyphens, and a leading underscore marks an
    XML attribute: ``ns.content_type`` is ``(uri, 'content-type')`` and
    ``ns._href`` is ``(uri, '@href')``.

    ``uri`` itself is a real instance attribute, so a local name spelled
    "uri" cannot be produced through attribute access.
    """

    def __init__(self, uri):
        self.uri = uri

    def __getattr__(self, name):
        # Special-method probes (copy, pickle, hasattr, ...) must see a
        # missing attribute rather than a fabricated (uri, name) pair.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        name = name.replace('_', '-')
        if name.startswith('-'):
            # Strings are immutable, so build a new one instead of
            # assigning to name[0].
            name = '@' + name[1:]
        return (self.uri, name)


class lazydom(object):
    """Simplify traversal of DOMs.  Evaluation is done lazily.

    The current implementation never modifies the underlying DOM.

    The basic metaphor is that everything is an array, where the indexes are
    either an integer, a slice, a tag name or an attribute name (a local name
    prefixed with '@').  Names are either plain strings (no namespace) or
    (namespace-uri, local-name) pairs such as those built by ``namespace``.

    As find and __getitem__ are applied to each node in the array, zero
    length arrays are handled naturally: looking anything up in an empty
    lazydom simply yields another empty lazydom.

    Attributes:
        list: the wrapped DOM nodes, in document order of discovery.
        element: the first wrapped node, or None when empty.
        name: (namespaceURI, localName) of the first node, or None when empty.
    """

    def __init__(self, element=None):
        self.list = []
        self.element = None
        self.name = None
        if element is not None:
            # Go through append() so element/name are set consistently.
            self.append(element)

    def append(self, element):
        """Add a DOM node; the first node added defines element and name."""
        self.list.append(element)
        if self.element is None:
            self.element = element
            self.name = (element.namespaceURI, element.localName)

    def new(self, element=None):
        """Create a new instance of this (sub-)class."""
        return self.__class__(element)

    def find(self, index):
        """Return all descendant elements named index, at any depth.

        Nodes that cannot have element descendants (e.g. attribute nodes)
        contribute nothing.
        """
        uri, local = _qualified(index)
        result = self.new()
        for element in self.list:
            search = getattr(element, 'getElementsByTagNameNS', None)
            if search is None:
                continue
            for match in search(uri, local):
                result.append(match)
        return result

    def __getitem__(self, index):
        """Select by position, slice, child element name or '@attribute'.

        Raises IndexError for an out-of-range integer, like a list.
        """
        result = self.new()

        if isinstance(index, _integer_types):
            result.append(self.list[index])
            return result

        if isinstance(index, slice):
            for element in self.list[index]:
                result.append(element)
            return result

        uri, local = _qualified(index)
        if local.startswith('@'):
            # attribute
            wanted = (uri, _text_type(local[1:]))
            for element in self.list:
                attrs = element.attributes
                if attrs is None:
                    # Non-element nodes carry no attributes.
                    continue
                for i in range(attrs.length):
                    attr = attrs.item(i)
                    if (attr.namespaceURI, attr.localName) == wanted:
                        result.append(attr)
        else:
            # direct child element
            wanted = (uri, local)
            for element in self.list:
                for child in element.childNodes:
                    if (child.nodeType == Node.ELEMENT_NODE and
                            (child.namespaceURI, child.localName) == wanted):
                        result.append(child)
        return result

    def __len__(self):
        return len(self.list)

    def __iter__(self):
        """Yield each wrapped node as its own single-item lazydom."""
        for element in self.list:
            yield self.new(element)

    def __str__(self):
        """Text of the first node: its value, else its serialised children.

        An empty lazydom renders as the empty string.
        """
        if not self.list:
            return ''
        first = self.list[0]
        if first.nodeValue:
            return first.nodeValue
        return ''.join([node.toxml() for node in first.childNodes])

    # Comparisons are made on the string form, so a lazydom can be compared
    # directly against plain strings.  Python 3 ignores __cmp__, hence the
    # explicit rich comparison methods.

    def __eq__(self, other):
        return str(self) == str(other)

    def __ne__(self, other):
        return str(self) != str(other)

    def __lt__(self, other):
        return str(self) < str(other)

    def __le__(self, other):
        return str(self) <= str(other)

    def __gt__(self, other):
        return str(self) > str(other)

    def __ge__(self, other):
        return str(self) >= str(other)

    def __cmp__(self, other):
        """Three-way comparison on the string form (Python 2 protocol)."""
        mine, theirs = str(self), str(other)
        return (mine > theirs) - (mine < theirs)

    # Defining __eq__ would otherwise disable hashing on Python 3; keep the
    # identity-based hash the class has always had.
    __hash__ = object.__hash__


def _main():
    """Demo: fetch a sample Atom feed and exercise the traversal API."""
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    bbfeed = 'http://new.blogger.com/pyra/temporary/atom-prototype?blogID=40471'
    atom = namespace('http://purl.org/atom/ns#')

    stream = urlopen(bbfeed)
    try:
        feed = parse(stream)
    finally:
        stream.close()

    # iteration and traversal
    for name in feed[atom.entry][atom.author][atom.name]:
        print(name)
    print('')

    # descendant search
    for issued in feed.find(atom.issued):
        print(issued)
    print('')

    # attribute access and comparison
    print(feed[atom.generator]['@name'])
    print(feed[atom.generator]['@name'] == 'Blogger')


if __name__ == '__main__':
    _main()