--- BeautifulSoup.py 2006-06-06 22:17:09.000000000 -0400 +++ /home/rubys/bzr/BeautifulSoup-3.0.3/BeautifulSoup.py 2006-09-20 11:51:44.000000000 -0400 @@ -56,7 +56,18 @@ import types import re import sgmllib -from htmlentitydefs import name2codepoint + +try: + from htmlentitydefs import name2codepoint +except: + import htmlentitydefs + name2codepoint={} + for (name,codepoint) in htmlentitydefs.entitydefs.iteritems(): + if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1])) + name2codepoint[name]=ord(codepoint) + +# python 2.2 support +if not hasattr(__builtins__, 'basestring'): basestring=str # This RE makes Beautiful Soup able to parse XML with namespaces. sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') @@ -518,8 +529,12 @@ def _convertEntities(self, match): x = match.group(1) if x in name2codepoint: - return unichr(name2codepoint[x]) - elif "&" + x + ";" in self.XML_ENTITIES_TO_CHARS: + c = unichr(name2codepoint[x]) + if c in self.XML_ENTITIES_TO_CHARS.values(): + return '&%s;' % x + else: + return c + elif x in self.XML_ENTITIES_TO_CHARS: return '&%s;' % x else: return '&%s;' % x @@ -809,7 +824,8 @@ def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False - if matchAgainst == True and type(matchAgainst) == types.BooleanType: + if matchAgainst == True and (not hasattr(types, 'BooleanType') or + type(matchAgainst) == types.BooleanType): result = markup != None elif callable(matchAgainst): result = matchAgainst(markup) @@ -857,7 +873,7 @@ """Convenience method that works with all 2.x versions of Python to determine whether or not something is stringlike.""" try: - return isinstance(s, unicode) or isintance(s, basestring) + return isinstance(s, unicode) or isinstance(s, basestring) except NameError: return isinstance(s, str) @@ -1272,6 +1288,12 @@ j = i + len(toHandle) return j + def convert_charref(self, name): + return '&#%s;' % name + + def convert_entityref(self, name): + return '&%s;' % name + class BeautifulSoup(BeautifulStoneSoup): """This parser knows the following facts about HTML: @@ -1642,6 +1664,8 @@ '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' + if not data: return u'' + # strip Byte Order Mark (if present) if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ and (data[2:4] != '\x00\x00'):