root/BeautifulSoup.py

Revision 17, 69.9 kB (checked in by ploum, 2 years ago)

#13 : launchpadweb searches incorrect results
#28 : results are not scrollable

Added BeautifulSoup? 3.0.3 from http://www.crummy.com/software/BeautifulSoup/
Thanks a lot to authors and contributors to BeautifulSoup?

Line 
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-structured XML/HTML document yields a well-behaved data
11structure. An ill-structured XML/HTML document yields a
12correspondingly ill-behaved data structure. If your document is only
13locally well-structured, you can use this library to find and process
14the well-structured part of it.
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21  http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23  by stock Python.
24  http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27   
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29   language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32   or invalid. This class has web browser-like heuristics for
33   obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
38Parser.
39
40For more than you ever wanted to know about Beautiful Soup, see the
41documentation:
42http://www.crummy.com/software/BeautifulSoup/documentation.html
43"""
44from __future__ import generators
45
46__author__ = "Leonard Richardson (crummy.com)"
47__contributors__ = ["Sam Ruby (intertwingly.net)",
48                    "the unwitting Mark Pilgrim (diveintomark.org)",
49                    "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
50__version__ = "3.0.3"
51__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
52__license__ = "PSF"
53
54from sgmllib import SGMLParser, SGMLParseError
55import codecs
56import types
57import re
58import sgmllib
59from htmlentitydefs import name2codepoint
60
61# This RE makes Beautiful Soup able to parse XML with namespaces.
62sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
63
64# This RE makes Beautiful Soup capable of recognizing numeric character
65# references that use hexadecimal.
66sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
67
68DEFAULT_OUTPUT_ENCODING = "utf-8"
69
70# First, the classes that represent markup elements.
71
72class PageElement:
73    """Contains the navigational information for some part of the page
74    (either a tag or a piece of text)"""
75
76    def setup(self, parent=None, previous=None):
77        """Sets up the initial relations between this element and
78        other elements."""       
79        self.parent = parent
80        self.previous = previous
81        self.next = None
82        self.previousSibling = None
83        self.nextSibling = None
84        if self.parent and self.parent.contents:
85            self.previousSibling = self.parent.contents[-1]
86            self.previousSibling.nextSibling = self
87
88    def replaceWith(self, replaceWith):       
89        oldParent = self.parent
90        myIndex = self.parent.contents.index(self)
91        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
92            # We're replacing this element with one of its siblings.
93            index = self.parent.contents.index(replaceWith)
94            if index and index < myIndex:
95                # Furthermore, it comes before this element. That
96                # means that when we extract it, the index of this
97                # element will change.
98                myIndex = myIndex - 1
99        self.extract()       
100        oldParent.insert(myIndex, replaceWith)
101       
102    def extract(self):
103        """Destructively rips this element out of the tree."""       
104        if self.parent:
105            try:
106                self.parent.contents.remove(self)
107            except ValueError:
108                pass
109
110        #Find the two elements that would be next to each other if
111        #this element (and any children) hadn't been parsed. Connect
112        #the two.       
113        lastChild = self._lastRecursiveChild()
114        nextElement = lastChild.next
115
116        if self.previous:
117            self.previous.next = nextElement
118        if nextElement:
119            nextElement.previous = self.previous
120        self.previous = None
121        lastChild.next = None
122
123        self.parent = None       
124        if self.previousSibling:
125            self.previousSibling.nextSibling = self.nextSibling
126        if self.nextSibling:
127            self.nextSibling.previousSibling = self.previousSibling
128        self.previousSibling = self.nextSibling = None       
129
130    def _lastRecursiveChild(self):
131        "Finds the last element beneath this object to be parsed."
132        lastChild = self
133        while hasattr(lastChild, 'contents') and lastChild.contents:
134            lastChild = lastChild.contents[-1]
135        return lastChild
136
137    def insert(self, position, newChild):
138        if (isinstance(newChild, basestring)
139            or isinstance(newChild, unicode)) \
140            and not isinstance(newChild, NavigableString):
141            newChild = NavigableString(newChild)       
142
143        position =  min(position, len(self.contents))
144        if hasattr(newChild, 'parent') and newChild.parent != None:
145            # We're 'inserting' an element that's already one
146            # of this object's children.
147            if newChild.parent == self:
148                index = self.find(newChild)
149                if index and index < position:
150                    # Furthermore we're moving it further down the
151                    # list of this object's children. That means that
152                    # when we extract this element, our target index
153                    # will jump down one.
154                    position = position - 1
155            newChild.extract()
156           
157        newChild.parent = self
158        previousChild = None
159        if position == 0:
160            newChild.previousSibling = None
161            newChild.previous = self
162        else:
163            previousChild = self.contents[position-1]
164            newChild.previousSibling = previousChild
165            newChild.previousSibling.nextSibling = newChild
166            newChild.previous = previousChild._lastRecursiveChild()
167        if newChild.previous:
168            newChild.previous.next = newChild       
169
170        newChildsLastElement = newChild._lastRecursiveChild()
171
172        if position >= len(self.contents):
173            newChild.nextSibling = None
174           
175            parent = self
176            parentsNextSibling = None
177            while not parentsNextSibling:
178                parentsNextSibling = parent.nextSibling
179                parent = parent.parent
180                if not parent: # This is the last element in the document.
181                    break
182            if parentsNextSibling:
183                newChildsLastElement.next = parentsNextSibling
184            else:
185                newChildsLastElement.next = None
186        else:
187            nextChild = self.contents[position]           
188            newChild.nextSibling = nextChild           
189            if newChild.nextSibling:
190                newChild.nextSibling.previousSibling = newChild
191            newChildsLastElement.next = nextChild
192
193        if newChildsLastElement.next:
194            newChildsLastElement.next.previous = newChildsLastElement
195        self.contents.insert(position, newChild)
196
197    def findNext(self, name=None, attrs={}, text=None, **kwargs):
198        """Returns the first item that matches the given criteria and
199        appears after this Tag in the document."""
200        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
201
202    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
203                    **kwargs):
204        """Returns all items that match the given criteria and appear
205        before after Tag in the document."""
206        return self._findAll(name, attrs, text, limit, self.nextGenerator)
207
208    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
209        """Returns the closest sibling to this Tag that matches the
210        given criteria and appears after this Tag in the document."""
211        return self._findOne(self.findNextSiblings, name, attrs, text,
212                             **kwargs)
213
214    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
215                         **kwargs):
216        """Returns the siblings of this Tag that match the given
217        criteria and appear after this Tag in the document."""
218        return self._findAll(name, attrs, text, limit,
219                             self.nextSiblingGenerator, **kwargs)
220    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
221
222    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
223        """Returns the first item that matches the given criteria and
224        appears before this Tag in the document."""
225        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
226
227    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
228                        **kwargs):
229        """Returns all items that match the given criteria and appear
230        before this Tag in the document."""
231        return self._findAll(name, attrs, text, limit, self.previousGenerator,
232                           **kwargs)
233    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
234
235    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
236        """Returns the closest sibling to this Tag that matches the
237        given criteria and appears before this Tag in the document."""
238        return self._findOne(self.findPreviousSiblings, name, attrs, text,
239                             **kwargs)
240
241    def findPreviousSiblings(self, name=None, attrs={}, text=None,
242                             limit=None, **kwargs):
243        """Returns the siblings of this Tag that match the given
244        criteria and appear before this Tag in the document."""
245        return self._findAll(name, attrs, text, limit,
246                             self.previousSiblingGenerator, **kwargs)
247    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
248
249    def findParent(self, name=None, attrs={}, **kwargs):
250        """Returns the closest parent of this Tag that matches the given
251        criteria."""
252        # NOTE: We can't use _findOne because findParents takes a different
253        # set of arguments.
254        r = None
255        l = self.findParents(name, attrs, 1)
256        if l:
257            r = l[0]
258        return r
259
260    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
261        """Returns the parents of this Tag that match the given
262        criteria."""
263
264        return self._findAll(name, attrs, None, limit, self.parentGenerator,
265                             **kwargs)
266    fetchParents = findParents # Compatibility with pre-3.x
267
268    #These methods do the real heavy lifting.
269
270    def _findOne(self, method, name, attrs, text, **kwargs):
271        r = None
272        l = method(name, attrs, text, 1, **kwargs)
273        if l:
274            r = l[0]
275        return r
276   
277    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
278        "Iterates over a generator looking for things that match."
279
280        if isinstance(name, SoupStrainer):
281            strainer = name
282        else:
283            # Build a SoupStrainer
284            strainer = SoupStrainer(name, attrs, text, **kwargs)
285        results = ResultSet(strainer)
286        g = generator()
287        while True:
288            try:
289                i = g.next()
290            except StopIteration:
291                break
292            if i:
293                found = strainer.search(i)
294                if found:
295                    results.append(found)
296                    if limit and len(results) >= limit:
297                        break
298        return results
299
300    #These Generators can be used to navigate starting from both
301    #NavigableStrings and Tags.               
302    def nextGenerator(self):
303        i = self
304        while i:
305            i = i.next
306            yield i
307
308    def nextSiblingGenerator(self):
309        i = self
310        while i:
311            i = i.nextSibling
312            yield i
313
314    def previousGenerator(self):
315        i = self
316        while i:
317            i = i.previous
318            yield i
319
320    def previousSiblingGenerator(self):
321        i = self
322        while i:
323            i = i.previousSibling
324            yield i
325
326    def parentGenerator(self):
327        i = self
328        while i:
329            i = i.parent
330            yield i
331
332    # Utility methods
333    def substituteEncoding(self, str, encoding=None):
334        encoding = encoding or "utf-8"
335        return str.replace("%SOUP-ENCODING%", encoding)   
336
337    def toEncoding(self, s, encoding=None):
338        """Encodes an object to a string in some encoding, or to Unicode.
339        ."""
340        if isinstance(s, unicode):
341            if encoding:
342                s = s.encode(encoding)
343        elif isinstance(s, str):
344            if encoding:
345                s = s.encode(encoding)
346            else:
347                s = unicode(s)
348        else:
349            if encoding:
350                s  = self.toEncoding(str(s), encoding)
351            else:
352                s = unicode(s)
353        return s
354
355class NavigableString(unicode, PageElement):
356
357    def __getattr__(self, attr):
358        """text.string gives you text. This is for backwards
359        compatibility for Navigable*String, but for CData* it lets you
360        get the string without the CData wrapper."""
361        if attr == 'string':
362            return self
363        else:
364            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
365
366    def __unicode__(self):
367        return __str__(self, None)
368
369    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
370        if encoding:
371            return self.encode(encoding)
372        else:
373            return self
374       
375class CData(NavigableString):
376
377    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
378        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
379
380class ProcessingInstruction(NavigableString):
381    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
382        output = self
383        if "%SOUP-ENCODING%" in output:
384            output = self.substituteEncoding(output, encoding)
385        return "<?%s?>" % self.toEncoding(output, encoding)
386
387class Comment(NavigableString):
388    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
389        return "<!--%s-->" % NavigableString.__str__(self, encoding)   
390
391class Declaration(NavigableString):
392    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
393        return "<!%s>" % NavigableString.__str__(self, encoding)       
394
395class Tag(PageElement):
396    """Represents a found HTML tag with its attributes and contents."""
397
398    XML_ENTITIES_TO_CHARS = { 'apos' : "'",
399                              "quot" : '"',
400                              "amp" : "&",
401                              "lt" : "<",
402                              "gt" : ">"
403                              }
404    # An RE for finding ampersands that aren't the start of of a
405    # numeric entity.
406    BARE_AMPERSAND = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
407
408    def __init__(self, parser, name, attrs=None, parent=None,
409                 previous=None):
410        "Basic constructor."
411
412        # We don't actually store the parser object: that lets extracted
413        # chunks be garbage-collected
414        self.parserClass = parser.__class__
415        self.isSelfClosing = parser.isSelfClosingTag(name)
416        self.convertHTMLEntities = parser.convertHTMLEntities
417        self.name = name
418        if attrs == None:
419            attrs = []
420        self.attrs = attrs
421        self.contents = []
422        self.setup(parent, previous)
423        self.hidden = False
424        self.containsSubstitutions = False
425
426    def get(self, key, default=None):
427        """Returns the value of the 'key' attribute for the tag, or
428        the value given for 'default' if it doesn't have that
429        attribute."""
430        return self._getAttrMap().get(key, default)   
431
432    def has_key(self, key):
433        return self._getAttrMap().has_key(key)
434
435    def __getitem__(self, key):
436        """tag[key] returns the value of the 'key' attribute for the tag,
437        and throws an exception if it's not there."""
438        return self._getAttrMap()[key]
439
440    def __iter__(self):
441        "Iterating over a tag iterates over its contents."
442        return iter(self.contents)
443
444    def __len__(self):
445        "The length of a tag is the length of its list of contents."
446        return len(self.contents)
447
448    def __contains__(self, x):
449        return x in self.contents
450
451    def __nonzero__(self):
452        "A tag is non-None even if it has no contents."
453        return True
454
455    def __setitem__(self, key, value):       
456        """Setting tag[key] sets the value of the 'key' attribute for the
457        tag."""
458        self._getAttrMap()
459        self.attrMap[key] = value
460        found = False
461        for i in range(0, len(self.attrs)):
462            if self.attrs[i][0] == key:
463                self.attrs[i] = (key, value)
464                found = True
465        if not found:
466            self.attrs.append((key, value))
467        self._getAttrMap()[key] = value
468
469    def __delitem__(self, key):
470        "Deleting tag[key] deletes all 'key' attributes for the tag."
471        for item in self.attrs:
472            if item[0] == key:
473                self.attrs.remove(item)
474                #We don't break because bad HTML can define the same
475                #attribute multiple times.
476            self._getAttrMap()
477            if self.attrMap.has_key(key):
478                del self.attrMap[key]
479
480    def __call__(self, *args, **kwargs):
481        """Calling a tag like a function is the same as calling its
482        findAll() method. Eg. tag('a') returns a list of all the A tags
483        found within this tag."""
484        return apply(self.findAll, args, kwargs)
485
486    def __getattr__(self, tag):
487        #print "Getattr %s.%s" % (self.__class__, tag)
488        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
489            return self.find(tag[:-3])
490        elif tag.find('__') != 0:
491            return self.find(tag)
492
493    def __eq__(self, other):
494        """Returns true iff this tag has the same name, the same attributes,
495        and the same contents (recursively) as the given tag.
496
497        NOTE: right now this will return false if two tags have the
498        same attributes in a different order. Should this be fixed?"""
499        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
500            return False
501        for i in range(0, len(self.contents)):
502            if self.contents[i] != other.contents[i]:
503                return False
504        return True
505
506    def __ne__(self, other):
507        """Returns true iff this tag is not identical to the other tag,
508        as defined in __eq__."""
509        return not self == other
510
511    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
512        """Renders this tag as a string."""
513        return self.__str__(encoding)
514
515    def __unicode__(self):
516        return self.__str__(None)
517
518    def _convertEntities(self, match):
519        x = match.group(1)
520        if x in name2codepoint:
521            return unichr(name2codepoint[x])           
522        elif "&" + x + ";" in self.XML_ENTITIES_TO_CHARS:
523            return '&%s;' % x
524        else:
525            return '&amp;%s;' % x
526
527    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
528                prettyPrint=False, indentLevel=0):
529        """Returns a string or Unicode representation of this tag and
530        its contents. To get Unicode, pass None for encoding.
531
532        NOTE: since Python's HTML parser consumes whitespace, this
533        method is not certain to reproduce the whitespace present in
534        the original string."""
535
536        encodedName = self.toEncoding(self.name, encoding)
537
538        attrs = []
539        if self.attrs:
540            for key, val in self.attrs:
541                fmt = '%s="%s"'
542                if isString(val):                   
543                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
544                        val = self.substituteEncoding(val, encoding)
545
546                    # The attribute value either:
547                    #
548                    # * Contains no embedded double quotes or single quotes.
549                    #   No problem: we enclose it in double quotes.
550                    # * Contains embedded single quotes. No problem:
551                    #   double quotes work here too.
552                    # * Contains embedded double quotes. No problem:
553                    #   we enclose it in single quotes.
554                    # * Embeds both single _and_ double quotes. This
555                    #   can't happen naturally, but it can happen if
556                    #   you modify an attribute value after parsing
557                    #   the document. Now we have a bit of a
558                    #   problem. We solve it by enclosing the
559                    #   attribute in single quotes, and escaping any
560                    #   embedded single quotes to XML entities.
561                    if '"' in val:
562                        # This can't happen naturally, but it can happen
563                        # if you modify an attribute value after parsing.
564                        if "'" in val:
565                            val = val.replace('"', "&quot;")
566                        else:
567                            fmt = "%s='%s'"
568
569                    # Optionally convert any HTML entities
570                    if self.convertHTMLEntities:
571                        val = re.sub("&(\w+);", self._convertEntities, val)
572
573                    # Now we're okay w/r/t quotes. But the attribute
574                    # value might also contain angle brackets, or
575                    # ampersands that aren't part of entities. We need
576                    # to escape those to XML entities too.
577                    val = val.replace("<", "&lt;").replace(">", "&gt;")
578                    val = self.BARE_AMPERSAND.sub("&amp;", val)
579
580                                     
581                attrs.append(fmt % (self.toEncoding(key, encoding),
582                                    self.toEncoding(val, encoding)))
583        close = ''
584        closeTag = ''
585        if self.isSelfClosing:
586            close = ' /'
587        else:
588            closeTag = '</%s>' % encodedName
589
590        indentTag, indentContents = 0, 0
591        if prettyPrint:
592            indentTag = indentLevel
593            space = (' ' * (indentTag-1))
594            indentContents = indentTag + 1
595        contents = self.renderContents(encoding, prettyPrint, indentContents)
596        if self.hidden:
597            s = contents
598        else:
599            s = []
600            attributeString = ''
601            if attrs:
602                attributeString = ' ' + ' '.join(attrs)           
603            if prettyPrint:
604                s.append(space)
605            s.append('<%s%s%s>' % (encodedName, attributeString, close))
606            if prettyPrint:
607                s.append("\n")
608            s.append(contents)
609            if prettyPrint and contents and contents[-1] != "\n":
610                s.append("\n")
611            if prettyPrint and closeTag:
612                s.append(space)
613            s.append(closeTag)
614            if prettyPrint and closeTag and self.nextSibling:
615                s.append("\n")
616            s = ''.join(s)
617        return s
618
619    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
620        return self.__str__(encoding, True)
621
622    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
623                       prettyPrint=False, indentLevel=0):
624        """Renders the contents of this tag as a string in the given
625        encoding. If encoding is None, returns a Unicode string.."""
626        s=[]
627        for c in self:
628            text = None
629            if isinstance(c, NavigableString):
630                text = c.__str__(encoding)
631            elif isinstance(c, Tag):
632                s.append(c.__str__(encoding, prettyPrint, indentLevel))
633            if text and prettyPrint:
634                text = text.strip()             
635            if text:
636                if prettyPrint:
637                    s.append(" " * (indentLevel-1))
638                s.append(text)
639                if prettyPrint:
640                    s.append("\n")
641        return ''.join(s)   
642
643    #Soup methods
644
645    def find(self, name=None, attrs={}, recursive=True, text=None,
646             **kwargs):
647        """Return only the first child of this Tag matching the given
648        criteria."""
649        r = None
650        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
651        if l:
652            r = l[0]
653        return r
654    findChild = find
655
656    def findAll(self, name=None, attrs={}, recursive=True, text=None,
657                limit=None, **kwargs):
658        """Extracts a list of Tag objects that match the given
659        criteria.  You can specify the name of the Tag and any
660        attributes you want the Tag to have.
661
662        The value of a key-value pair in the 'attrs' map can be a
663        string, a list of strings, a regular expression object, or a
664        callable that takes a string and returns whether or not the
665        string matches for some custom definition of 'matches'. The
666        same is true of the tag name."""
667        generator = self.recursiveChildGenerator
668        if not recursive:
669            generator = self.childGenerator
670        return self._findAll(name, attrs, text, limit, generator, **kwargs)
671    findChildren = findAll
672
673    # Pre-3.x compatibility methods
674    first = find
675    fetch = findAll
676   
677    def fetchText(self, text=None, recursive=True, limit=None):
678        return self.findAll(text=text, recursive=recursive, limit=limit)
679
680    def firstText(self, text=None, recursive=True):
681        return self.find(text=text, recursive=recursive)
682   
683    #Utility methods
684
685    def append(self, tag):
686        """Appends the given tag to the contents of this tag."""
687        self.contents.append(tag)
688
689    #Private methods
690
691    def _getAttrMap(self):
692        """Initializes a map representation of this tag's attributes,
693        if not already initialized."""
694        if not getattr(self, 'attrMap'):
695            self.attrMap = {}
696            for (key, value) in self.attrs:
697                self.attrMap[key] = value
698        return self.attrMap
699
700    #Generator methods
701    def childGenerator(self):
702        for i in range(0, len(self.contents)):
703            yield self.contents[i]
704        raise StopIteration
705   
706    def recursiveChildGenerator(self):
707        stack = [(self, 0)]
708        while stack:
709            tag, start = stack.pop()
710            if isinstance(tag, Tag):           
711                for i in range(start, len(tag.contents)):
712                    a = tag.contents[i]
713                    yield a
714                    if isinstance(a, Tag) and tag.contents:
715                        if i < len(tag.contents) - 1:
716                            stack.append((tag, i+1))
717                        stack.append((a, 0))
718                        break
719        raise StopIteration
720
721# Next, a couple classes to represent queries and their results.
722class SoupStrainer:
723    """Encapsulates a number of ways of matching a markup element (tag or
724    text)."""
725
726    def __init__(self, name=None, attrs={}, text=None, **kwargs):
727        self.name = name
728        if isString(attrs):
729            kwargs['class'] = attrs
730            attrs = None
731        if kwargs:
732            if attrs:
733                attrs = attrs.copy()
734                attrs.update(kwargs)
735            else:
736                attrs = kwargs
737        self.attrs = attrs
738        self.text = text
739
740    def __str__(self):
741        if self.text:
742            return self.text
743        else:
744            return "%s|%s" % (self.name, self.attrs)
745   
746    def searchTag(self, markupName=None, markupAttrs={}):
747        found = None
748        markup = None
749        if isinstance(markupName, Tag):
750            markup = markupName
751            markupAttrs = markup
752        callFunctionWithTagData = callable(self.name) \
753                                and not isinstance(markupName, Tag)
754
755        if (not self.name) \
756               or callFunctionWithTagData \
757               or (markup and self._matches(markup, self.name)) \
758               or (not markup and self._matches(markupName, self.name)):
759            if callFunctionWithTagData:
760                match = self.name(markupName, markupAttrs)
761            else:
762                match = True           
763                markupAttrMap = None
764                for attr, matchAgainst in self.attrs.items():
765                    if not markupAttrMap:
766                         if hasattr(markupAttrs, 'get'):
767                            markupAttrMap = markupAttrs
768                         else:
769                            markupAttrMap = {}
770                            for k,v in markupAttrs:
771                                markupAttrMap[k] = v
772                    attrValue = markupAttrMap.get(attr)
773                    if not self._matches(attrValue, matchAgainst):
774                        match = False
775                        break
776            if match:
777                if markup:
778                    found = markup
779                else:
780                    found = markupName
781        return found
782
783    def search(self, markup):
784        #print 'looking for %s in %s' % (self, markup)
785        found = None
786        # If given a list of items, scan it for a text element that
787        # matches.       
788        if isList(markup) and not isinstance(markup, Tag):
789            for element in markup:
790                if isinstance(element, NavigableString) \
791                       and self.search(element):
792                    found = element
793                    break
794        # If it's a Tag, make sure its name or attributes match.
795        # Don't bother with Tags if we're searching for text.
796        elif isinstance(markup, Tag):
797            if not self.text:
798                found = self.searchTag(markup)
799        # If it's text, make sure the text matches.
800        elif isinstance(markup, NavigableString) or \
801                 isString(markup):
802            if self._matches(markup, self.text):
803                found = markup
804        else:
805            raise Exception, "I don't know how to match against a %s" \
806                  % markup.__class__
807        return found
808       
809    def _matches(self, markup, matchAgainst):   
810        #print "Matching %s against %s" % (markup, matchAgainst)
811        result = False
812        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
813            result = markup != None
814        elif callable(matchAgainst):
815            result = matchAgainst(markup)
816        else:
817            #Custom match methods take the tag as an argument, but all
818            #other ways of matching match the tag name as a string.
819            if isinstance(markup, Tag):
820                markup = markup.name
821            if markup and not isString(markup):
822                markup = unicode(markup)
823            #Now we know that chunk is either a string, or None.
824            if hasattr(matchAgainst, 'match'):
825                # It's a regexp object.
826                result = markup and matchAgainst.search(markup)
827            elif isList(matchAgainst):
828                result = markup in matchAgainst
829            elif hasattr(matchAgainst, 'items'):
830                result = markup.has_key(matchAgainst)
831            elif matchAgainst and isString(markup):
832                if isinstance(markup, unicode):
833                    matchAgainst = unicode(matchAgainst)
834                else:
835                    matchAgainst = str(matchAgainst)
836
837            if not result:
838                result = matchAgainst == markup
839        return result
840
841class ResultSet(list):
842    """A ResultSet is just a list that keeps track of the SoupStrainer
843    that created it."""
844    def __init__(self, source):
845        list.__init__([])
846        self.source = source
847
848# Now, some helper functions.
849
850def isList(l):
851    """Convenience method that works with all 2.x versions of Python
852    to determine whether or not something is listlike."""
853    return hasattr(l, '__iter__') \
854           or (type(l) in (types.ListType, types.TupleType))
855
856def isString(s):
857    """Convenience method that works with all 2.x versions of Python
858    to determine whether or not something is stringlike."""
859    try:
860        return isinstance(s, unicode) or isintance(s, basestring)
861    except NameError:
862        return isinstance(s, str)
863
864def buildTagMap(default, *args):
865    """Turns a list of maps, lists, or scalars into a single map.
866    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
867    NESTING_RESET_TAGS maps out of lists and partial maps."""
868    built = {}
869    for portion in args:
870        if hasattr(portion, 'items'):
871            #It's a map. Merge it.
872            for k,v in portion.items():
873                built[k] = v
874        elif isList(portion):
875            #It's a list. Map each item to the default.
876            for k in portion:
877                built[k] = default
878        else:
879            #It's a scalar. Map it to the default.
880            built[portion] = default
881    return built
882
883# Now, the parser classes.
884
885class BeautifulStoneSoup(Tag, SGMLParser):
886
887    """This class contains the basic parser and search code. It defines
888    a parser that knows nothing about tag behavior except for the
889    following:
890   
891      You can't close a tag without closing all the tags it encloses.
892      That is, "<foo><bar></foo>" actually means
893      "<foo><bar></bar></foo>".
894
895    [Another possible explanation is "<foo><bar /></foo>", but since
896    this class defines no SELF_CLOSING_TAGS, it will never use that
897    explanation.]
898
899    This class is useful for parsing XML or made-up markup languages,
900    or when BeautifulSoup makes an assumption counter to what you were
901    expecting."""
902
903    SELF_CLOSING_TAGS = {}
904    NESTABLE_TAGS = {}
905    RESET_NESTING_TAGS = {}
906    QUOTE_TAGS = {}
907
908    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
909                       lambda x: x.group(1) + ' />'),
910                      (re.compile('<!\s+([^<>]*)>'),
911                       lambda x: '<!' + x.group(1) + '>')
912                      ]
913
914    ROOT_TAG_NAME = u'[document]'
915
916    HTML_ENTITIES = "html"
917    XML_ENTITIES = "xml"
918    ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]
919
920    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
921                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
922                 convertEntities=None, selfClosingTags=None):
923        """The Soup object is initialized as the 'root tag', and the
924        provided markup (which can be a string or a file-like object)
925        is fed into the underlying parser.
926
927        sgmllib will process most bad HTML, and the BeautifulSoup
928        class has some tricks for dealing with some HTML that kills
929        sgmllib, but Beautiful Soup can nonetheless choke or lose data
930        if your data uses self-closing tags or declarations
931        incorrectly.
932
933        By default, Beautiful Soup uses regexes to sanitize input,
934        avoiding the vast majority of these problems. If the problems
935        don't apply to you, pass in False for markupMassage, and
936        you'll get better performance.
937
938        The default parser massage techniques fix the two most common
939        instances of invalid HTML that choke sgmllib:
940
941         <br/> (No space between name of closing tag and tag close)
942         <! --Comment--> (Extraneous whitespace in declaration)
943
944        You can pass in a custom list of (RE object, replace method)
945        tuples to get Beautiful Soup to scrub your input the way you
946        want."""
947
948        self.parseOnlyThese = parseOnlyThese
949        self.fromEncoding = fromEncoding
950        self.smartQuotesTo = smartQuotesTo
951
952        if convertEntities:
953            # It doesn't make sense to convert encoded characters to
954            # entities even while you're converting entities to Unicode.
955            # Just convert it all to Unicode.
956            self.smartQuotesTo = None
957
958        if isList(convertEntities):
959            self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities
960            self.convertXMLEntities = self.XML_ENTITIES in convertEntities
961        else:
962            self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities
963            self.convertXMLEntities = self.XML_ENTITIES == convertEntities
964
965        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
966        SGMLParser.__init__(self)
967           
968        if hasattr(markup, 'read'):        # It's a file-type object.
969            markup = markup.read()
970        self.markup = markup
971        self.markupMassage = markupMassage
972        try:
973            self._feed()
974        except StopParsing:
975            pass
976        self.markup = None                 # The markup can now be GCed
977
978    def _feed(self, inDocumentEncoding=None):
979        # Convert the document to Unicode.
980        markup = self.markup
981        if isinstance(markup, unicode):
982            if not hasattr(self, 'originalEncoding'):
983                self.originalEncoding = None
984        else:
985            dammit = UnicodeDammit\
986                     (markup, [self.fromEncoding, inDocumentEncoding],
987                      smartQuotesTo=self.smartQuotesTo)
988            markup = dammit.unicode
989            self.originalEncoding = dammit.originalEncoding
990        if markup:
991            if self.markupMassage:
992                if not isList(self.markupMassage):
993                    self.markupMassage = self.MARKUP_MASSAGE           
994                for fix, m in self.markupMassage:
995                    markup = fix.sub(m, markup)
996        self.reset()
997
998        SGMLParser.feed(self, markup or "")
999        SGMLParser.close(self)
1000        # Close out any unfinished strings and close all the open tags.
1001        self.endData()
1002        while self.currentTag.name != self.ROOT_TAG_NAME:
1003            self.popTag()
1004
1005    def __getattr__(self, methodName):
1006        """This method routes method call requests to either the SGMLParser
1007        superclass or the Tag superclass, depending on the method name."""
1008        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1009
1010        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1011               or methodName.find('do_') == 0:
1012            return SGMLParser.__getattr__(self, methodName)
1013        elif methodName.find('__') != 0:
1014            return Tag.__getattr__(self, methodName)
1015        else:
1016            raise AttributeError
1017
1018    def isSelfClosingTag(self, name):
1019        """Returns true iff the given string is the name of a
1020        self-closing tag according to this parser."""
1021        return self.SELF_CLOSING_TAGS.has_key(name) \
1022               or self.instanceSelfClosingTags.has_key(name)
1023           
1024    def reset(self):
1025        Tag.__init__(self, self, self.ROOT_TAG_NAME)
1026        self.hidden = 1
1027        SGMLParser.reset(self)
1028        self.currentData = []
1029        self.currentTag = None
1030        self.tagStack = []
1031        self.quoteStack = []
1032        self.pushTag(self)
1033   
1034    def popTag(self):
1035        tag = self.tagStack.pop()
1036        # Tags with just one string-owning child get the child as a
1037        # 'string' property, so that soup.tag.string is shorthand for
1038        # soup.tag.contents[0]
1039        if len(self.currentTag.contents) == 1 and \
1040           isinstance(self.currentTag.contents[0], NavigableString):
1041            self.currentTag.string = self.currentTag.contents[0]
1042
1043        #print "Pop", tag.name
1044        if self.tagStack:
1045            self.currentTag = self.tagStack[-1]
1046        return self.currentTag
1047
1048    def pushTag(self, tag):
1049        #print "Push", tag.name
1050        if self.currentTag:
1051            self.currentTag.append(tag)
1052        self.tagStack.append(tag)
1053        self.currentTag = self.tagStack[-1]
1054
1055    def endData(self, containerClass=NavigableString):
1056        if self.currentData:
1057            currentData = ''.join(self.currentData)
1058            if currentData.endswith('<') and self.convertHTMLEntities:
1059                currentData = currentData[:-1] + '&lt;'
1060            if not currentData.strip():
1061                if '\n' in currentData:
1062                    currentData = '\n'
1063                else:
1064                    currentData = ' '
1065            self.currentData = []
1066            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1067                   (not self.parseOnlyThese.text or \
1068                    not self.parseOnlyThese.search(currentData)):
1069                return
1070            o = containerClass(currentData)
1071            o.setup(self.currentTag, self.previous)
1072            if self.previous:
1073                self.previous.next = o
1074            self.previous = o
1075            self.currentTag.contents.append(o)
1076
1077
1078    def _popToTag(self, name, inclusivePop=True):
1079        """Pops the tag stack up to and including the most recent
1080        instance of the given tag. If inclusivePop is false, pops the tag
1081        stack up to but *not* including the most recent instqance of
1082        the given tag."""
1083        #print "Popping to %s" % name
1084        if name == self.ROOT_TAG_NAME:
1085            return           
1086
1087        numPops = 0
1088        mostRecentTag = None
1089        for i in range(len(self.tagStack)-1, 0, -1):
1090            if name == self.tagStack[i].name:
1091                numPops = len(self.tagStack)-i
1092                break
1093        if not inclusivePop:
1094            numPops = numPops - 1
1095
1096        for i in range(0, numPops):
1097            mostRecentTag = self.popTag()
1098        return mostRecentTag   
1099
1100    def _smartPop(self, name):
1101
1102        """We need to pop up to the previous tag of this type, unless
1103        one of this tag's nesting reset triggers comes between this
1104        tag and the previous tag of this type, OR unless this tag is a
1105        generic nesting trigger and another generic nesting trigger
1106        comes between this tag and the previous tag of this type.
1107
1108        Examples:
1109         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1110         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1111         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1112         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1113
1114         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1115         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1116         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1117        """
1118
1119        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1120        isNestable = nestingResetTriggers != None
1121        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1122        popTo = None
1123        inclusive = True
1124        for i in range(len(self.tagStack)-1, 0, -1):
1125            p = self.tagStack[i]
1126            if (not p or p.name == name) and not isNestable:
1127                #Non-nestable tags get popped to the top or to their
1128                #last occurance.
1129                popTo = name
1130                break
1131            if (nestingResetTriggers != None
1132                and p.name in nestingResetTriggers) \
1133                or (nestingResetTriggers == None and isResetNesting
1134                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1135               
1136                #If we encounter one of the nesting reset triggers
1137                #peculiar to this tag, or we encounter another tag
1138                #that causes nesting to reset, pop up to but not
1139                #including that tag.
1140                popTo = p.name
1141                inclusive = False
1142                break
1143            p = p.parent
1144        if popTo:
1145            self._popToTag(popTo, inclusive)
1146
1147    def unknown_starttag(self, name, attrs, selfClosing=0):
1148        #print "Start tag %s: %s" % (name, attrs)
1149        if self.quoteStack:
1150            #This is not a real tag.
1151            #print "<%s> is not real!" % name
1152            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1153            self.currentData.append('<%s%s>' % (name, attrs))
1154            return       
1155        self.endData()
1156
1157        if not self.isSelfClosingTag(name) and not selfClosing:
1158            self._smartPop(name)
1159
1160        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1161               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1162            return
1163
1164        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1165        if self.previous:
1166            self.previous.next = tag
1167        self.previous = tag
1168        self.pushTag(tag)
1169        if selfClosing or self.isSelfClosingTag(name):
1170            self.popTag()               
1171        if name in self.QUOTE_TAGS:
1172            #print "Beginning quote (%s)" % name
1173            self.quoteStack.append(name)
1174            self.literal = 1
1175        return tag
1176
1177    def unknown_endtag(self, name):
1178        #print "End tag %s" % name
1179        if self.quoteStack and self.quoteStack[-1] != name:
1180            #This is not a real end tag.
1181            #print "</%s> is not real!" % name
1182            self.currentData.append('</%s>' % name)
1183            return
1184        self.endData()
1185        self._popToTag(name)
1186        if self.quoteStack and self.quoteStack[-1] == name:
1187            self.quoteStack.pop()
1188            self.literal = (len(self.quoteStack) > 0)
1189
1190    def handle_data(self, data):
1191        if self.convertHTMLEntities:
1192            if data[0] == '&':
1193                data = self.BARE_AMPERSAND.sub("&amp;",data)
1194            else:
1195                data = data.replace('&','&amp;') \
1196                           .replace('<','&lt;') \
1197                           .replace('>','&gt;')
1198        self.currentData.append(data)
1199
1200    def _toStringSubclass(self, text, subclass):
1201        """Adds a certain piece of text to the tree as a NavigableString
1202        subclass."""
1203        self.endData()
1204        self.handle_data(text)
1205        self.endData(subclass)
1206
1207    def handle_pi(self, text):
1208        """Handle a processing instruction as a ProcessingInstruction
1209        object, possibly one with a %SOUP-ENCODING% slot into which an
1210        encoding will be plugged later."""
1211        if text[:3] == "xml":
1212            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1213        self._toStringSubclass(text, ProcessingInstruction)
1214
1215    def handle_comment(self, text):
1216        "Handle comments as Comment objects."
1217        self._toStringSubclass(text, Comment)
1218
1219    def handle_charref(self, ref):
1220        "Handle character references as data."
1221    Â