#!/usr/bin/python2.5
"""Compare three ways of finding feed <link> elements in one HTML page.

The page is fetched once and then scanned three times:

1. with the stock ``sgmllib`` parser,
2. with ``sgmllib`` after replacing some of its module-level regular
   expressions (the feedparser monkey patch),
3. with ``html5lib`` building an ElementTree.

Each pass prints the attributes of every <link> whose ``type`` is listed
in ``feedtypes``, so the three outputs can be compared by eye.

This is a Python 2 script (``sgmllib`` and ``urllib.urlopen`` do not exist
in Python 3).  Every ``print`` is written as a single parenthesised
expression so that it behaves exactly like the Python 2 print statement.
"""
import re
import sgmllib
import urllib
import xml.etree.ElementTree

# Fetch the page once; all three parsers are fed the same bytes.
data = urllib.urlopen('http://www.pocketsoap.com/weblog/').read()

# MIME types that identify a <link> as pointing at a feed.
feedtypes = ['application/atom+xml', 'application/rss+xml']


class linkparser(sgmllib.SGMLParser):
    """SGML parser that prints every <link> whose type is a feed type."""

    def start_link(self, attrs):
        """Handle an opening <link> tag.

        attrs is the list of (name, value) pairs supplied by SGMLParser.
        """
        attributes = dict(attrs)
        if attributes.get('type', None) in feedtypes:
            # Same output as ``print attributes, '\n'``: the dict, a space,
            # then two newlines.
            print("%s \n" % (attributes,))


print("sgmllib without monkey patch")
linkparser().feed(data)

# --- feedparser monkey patch ------------------------------------------------
# Replace module-level regular expressions that sgmllib uses while scanning.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# NOTE(review): the text between "sgmllib.special = re.compile('" and the
# tail of the end-bracket pattern was missing from this file (it looks like
# it was stripped as if it were an HTML tag).  The pattern below, the class
# header and the opening "([^'\"<>" of the end-bracket pattern are
# reconstructed from feedparser's sgmllib patch -- confirm against the
# feedparser revision this script was written for.  Any other statements
# that originally sat in the lost span are not restored here.
sgmllib.special = re.compile('<!')


class EndBracketMatch:
    """Stand-in for the compiled ``sgmllib.endbracket`` pattern.

    Only ``search()`` and ``start()`` are provided; ``search()`` returns this
    same object so that it can also play the role of the match result.
    """

    endbracket = re.compile(
        r'''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')

    def search(self, string, index=0):
        """Match the pattern at ``index``; return self on success, else None."""
        self.match = self.endbracket.match(string, index)
        if self.match:
            return self

    def start(self, n):
        """Return the END offset of group ``n`` of the last match.

        The pattern matches everything up to the closing bracket, so the end
        of the match is the position where the bracket itself starts.
        """
        return self.match.end(n)


sgmllib.endbracket = EndBracketMatch()

print("sgmllib with nightly feedparser monkey patch")
linkparser().feed(data)

# --- html5lib ---------------------------------------------------------------
print("html5lib")
# Imported here rather than at the top so that the two sgmllib passes above
# still run (and print) even when html5lib is not installed.
from html5lib import html5parser, treebuilders

etree = treebuilders.getTreeBuilder('etree', xml.etree.ElementTree)
tree = html5parser.HTMLParser(False, etree).parse(data)
for link in tree.findall('head/link'):
    if link.attrib.get('type') in feedtypes:
        print(link.attrib)