Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: py-scraping/mechanize/_beautifulsoup.py@ 122

Last change on this file since 122 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 39.9 KB

Line
1	"""Beautiful Soup
2	Elixir and Tonic
3	"The Screen-Scraper's Friend"
4	v2.1.1
5	http://www.crummy.com/software/BeautifulSoup/
6
7	Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
8	into a tree representation. It provides methods and Pythonic idioms
9	that make it easy to search and modify the tree.
10
11	A well-formed XML/HTML document will yield a well-formed data
12	structure. An ill-formed XML/HTML document will yield a
13	correspondingly ill-formed data structure. If your document is only
14	locally well-formed, you can use this library to find and process the
15	well-formed part of it. The BeautifulSoup class has heuristics for
16	obtaining a sensible parse tree in the face of common HTML errors.
17
18	Beautiful Soup has no external dependencies. It works with Python 2.2
19	and up.
20
21	Beautiful Soup defines classes for four different parsing strategies:
22
23	* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
24	language that kind of looks like XML.
25
26	* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
27	or invalid.
28
29	* ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
30	that trips up BeautifulSoup.
31
32	* BeautifulSOAP, for making it easier to parse XML documents that use
33	lots of subelements containing a single string, where you'd prefer
34	they put that string into an attribute (such as SOAP messages).
35
36	You can subclass BeautifulStoneSoup or BeautifulSoup to create a
37	parsing strategy specific to an XML schema or a particular bizarre
38	HTML document. Typically your subclass would just override
39	SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
40	""" #"
41	from __future__ import generators
42
43	__author__ = "Leonard Richardson (leonardr@segfault.org)"
44	__version__ = "2.1.1"
45	__date__ = "$Date: 2004/10/18 00:14:20 $"
46	__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
47	__license__ = "PSF"
48
49	from sgmllib import SGMLParser, SGMLParseError
50	import types
51	import re
52	import sgmllib
53
# Replace sgmllib's tag-name pattern with one that also accepts ':'
# inside a name, so namespaced XML tags such as <ns:tag> are parsed.
sgmllib.tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*')
56
class NullType(object):

    """Similar to NoneType with a corresponding singleton instance
    'Null' that, unlike None, accepts any message and returns itself.

    Calling Null, reading any attribute of it or indexing it all give
    back Null; setting attributes or items is silently ignored. Null
    has length zero, iterates as an empty sequence and contains
    nothing.

    Examples:
    >>> Null("send", "a", "message")("and one more",
    ...      "and what you get still") is Null
    True
    >>> Null() is Null
    True
    """

    def __new__(cls):
        # There is only ever one instance: constructing the type hands
        # back the module-level singleton.
        return Null

    def __call__(self, *args, **kwargs):
        # Accept any positional and keyword arguments, including none.
        return Null

##    def __getstate__(self, *args): return Null

    def __getattr__(self, attr):
        return Null

    def __getitem__(self, item):
        return Null

    def __setattr__(self, attr, value):
        pass

    def __setitem__(self, item, value):
        pass

    def __len__(self):
        return 0

    # __getitem__ never raises IndexError, so the fallback
    # sequence-iteration protocol would make ``for x in Null: pass``
    # loop forever; an explicit empty iterator avoids that.
    def __iter__(self):
        return iter([])

    def __contains__(self, item):
        return False

    def __repr__(self):
        return "Null"

# Built through object.__new__ because NullType.__new__ itself returns
# this very name.
Null = object.__new__(NullType)
82
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=Null, previous=Null):
        """Records this element's parent and predecessor, clears its
        forward and sibling links, and, when the parent already has
        contents, links this element to the parent's last child as its
        next sibling."""
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            lastChild = self.parent.contents[-1]
            self.previousSibling = lastChild
            lastChild.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears after this element in the document, or Null."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        after this element in the document."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Returns the closest following sibling of this element that
        matches the given criteria, or Null."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Returns the siblings of this element that match the given
        criteria and appear after it in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears before this element in the document, or Null."""
        return self._first(self.fetchPrevious, name, attrs, text)
    firstPrevious = findPrevious

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        before this element in the document."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Returns the closest preceding sibling of this element that
        matches the given criteria, or Null."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Returns the siblings of this element that match the given
        criteria and appear before it in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Returns the closest parent of this element that matches the
        given criteria, or Null."""
        matches = self.fetchParents(name, attrs, 1)
        if matches:
            return matches[0]
        return Null
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Returns the parents of this element that match the given
        criteria."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    #These methods do the real heavy lifting.

    def _first(self, method, name, attrs, text):
        "Runs a fetch method with a limit of 1; returns the hit or Null."
        matches = method(name, attrs, text, 1)
        if matches:
            return matches[0]
        return Null

    def _fetch(self, name, attrs, text, limit, generator):
        "Iterates over a generator looking for things that match."
        if not hasattr(attrs, 'items'):
            # A non-map 'attrs' is shorthand for a match on 'class'.
            attrs = {'class' : attrs}

        results = []
        for candidate in generator():
            found = None
            if isinstance(candidate, Tag):
                # Tags are matched by name and attributes, and only
                # when no text criterion was given.
                if not text and (not name
                                 or self._matches(candidate, name)):
                    for attr, matchAgainst in attrs.items():
                        if not self._matches(candidate.get(attr),
                                             matchAgainst):
                            break
                    else:
                        found = candidate
            elif text and self._matches(candidate, text):
                found = candidate
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
        return results

    #Generators that can be used to navigate starting from both
    #NavigableTexts and Tags.

    def _followLinks(self, linkName):
        """Yields each element reached by repeatedly reading the
        'linkName' attribute, starting from this element. The value
        that ends the walk (the first false one) is yielded too."""
        element = self
        while element:
            element = getattr(element, linkName)
            yield element

    def nextGenerator(self):
        return self._followLinks('next')

    def nextSiblingGenerator(self):
        return self._followLinks('nextSibling')

    def previousGenerator(self):
        return self._followLinks('previous')

    def previousSiblingGenerator(self):
        return self._followLinks('previousSibling')

    def parentGenerator(self):
        return self._followLinks('parent')

    def _matches(self, chunk, howToMatch):
        """Tells whether 'chunk' satisfies 'howToMatch', which may be a
        callable, a regular expression object, a list, a map or a
        plain string."""
        # If given a list of items, return true if the list contains a
        # text element that matches.
        if isList(chunk) and not isinstance(chunk, Tag):
            for piece in chunk:
                if isinstance(piece, NavigableText) \
                   and self._matches(piece, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        #Custom match methods take the tag as an argument, but all other
        #ways of matching match the tag name as a string
        if isinstance(chunk, Tag):
            chunk = chunk.name
        #From here on chunk is treated as a string
        if not isinstance(chunk, basestring):
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # It's a regexp object.
            return howToMatch.search(chunk)
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return howToMatch.has_key(chunk)
        #It's just a string
        return str(howToMatch) == chunk
262
class NavigableText(PageElement):

    """A piece of text that takes part in document navigation."""

    def __getattr__(self, attr):
        "For backwards compatibility, text.string gives you text"
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self
271
class NavigableString(str, NavigableText):
    """A str that is also a NavigableText."""
274
class NavigableUnicodeString(unicode, NavigableText):
    """A unicode string that is also a NavigableText."""
277
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def __init__(self, name, attrs=None, parent=Null, previous=Null):
        """Basic constructor.

        'attrs' is a list of (key, value) pairs; None means no
        attributes."""
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "Membership is tested against the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()[key] = value
        found = False
        # No break: every occurrence of a repeated attribute is updated.
        for i, item in enumerate(self.attrs):
            if item[0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so
        # every occurrence is dropped. The list is rebuilt in place
        # rather than removing items while iterating over it, which
        # would skip an occurrence directly following a removed one.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        fetch() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.fetch(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.fooTag and tag.foo both return the first 'foo' tag found
        within this tag (Null if there is none). Names starting with a
        double underscore are never treated as tag names and raise
        AttributeError."""
        if len(tag) > 3 and tag.rfind('Tag') == len(tag) - 3:
            return self.first(tag[:-3])
        elif tag.find('__') != 0:
            return self.first(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') \
           or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') \
           or self.name != other.name \
           or self.attrs != other.attrs \
           or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self):
        """Renders this tag as a string."""
        return str(self)

    def __unicode__(self):
        return self.__str__(1)

    def __str__(self, needUnicode=None, showStructureIndent=None):
        """Returns a string or Unicode representation of this tag and
        its contents.

        If 'needUnicode' is true the result is converted to Unicode;
        if it is False a Unicode result is converted to a plain
        string; if it is None the result is returned as built.

        If 'showStructureIndent' is not None, the closing tag is
        preceded by a newline and that many spaces; the opening tag is
        too, when the value is non-zero. Contents are rendered one
        level deeper unless this tag is hidden.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                attrs.append('%s="%s"' % (key, val))
        close = ''
        closeTag = ''
        if self.isSelfClosing():
            close = ' /'
        else:
            closeTag = '</%s>' % self.name
        indentIncrement = None
        # Always bound, so an indent level of 0 cannot hit an undefined
        # name when the closing tag is emitted below.
        space = ''
        if showStructureIndent is not None:
            indentIncrement = showStructureIndent
            if not self.hidden:
                indentIncrement += 1
            space = '\n%s' % (' ' * showStructureIndent)
        contents = self.renderContents(indentIncrement,
                                       needUnicode=needUnicode)
        if self.hidden:
            # A hidden tag contributes only its contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if showStructureIndent:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            s.append(contents)
            if closeTag and showStructureIndent is not None:
                s.append(space)
            s.append(closeTag)
            s = ''.join(s)
        isUnicode = type(s) == types.UnicodeType
        if needUnicode and not isUnicode:
            s = unicode(s)
        elif isUnicode and needUnicode == False:
            s = str(s)
        return s

    def prettify(self, needUnicode=None):
        "Renders this tag with structure indentation switched on."
        return self.__str__(needUnicode, showStructureIndent=True)

    def renderContents(self, showStructureIndent=None, needUnicode=None):
        """Renders the contents of this tag as a (possibly Unicode)
        string."""
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableUnicodeString) \
               or type(c) == types.UnicodeType:
                text = unicode(c)
            elif isinstance(c, Tag):
                s.append(c.__str__(needUnicode, showStructureIndent))
            elif needUnicode:
                text = unicode(c)
            else:
                text = str(c)
            if text:
                if showStructureIndent is not None:
                    # When indenting, drop one trailing newline from
                    # each piece of text.
                    if text[-1] == '\n':
                        text = text[:-1]
                s.append(text)
        return ''.join(s)

    #Soup methods

    def firstText(self, text, recursive=True):
        """Convenience method to retrieve the first piece of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.first(recursive=recursive, text=text)

    def fetchText(self, text, recursive=True, limit=None):
        """Convenience method to retrieve all pieces of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.fetch(recursive=recursive, text=text, limit=limit)

    def first(self, name=None, attrs={}, recursive=True, text=None):
        """Return only the first child of this
        Tag matching the given criteria, or Null if nothing matches."""
        r = Null
        l = self.fetch(name, attrs, recursive, text, 1)
        if l:
            r = l[0]
        return r
    findChild = first

    def fetch(self, name=None, attrs={}, recursive=True, text=None,
              limit=None):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With 'recursive' false only direct children are examined.
        'limit', when given, caps the number of results."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._fetch(name, attrs, text, limit, generator)
    fetchChildren = fetch

    #Utility methods

    def isSelfClosing(self):
        """Returns true iff this is a self-closing tag as defined in the HTML
        standard.

        TODO: This is specific to BeautifulSoup and its subclasses, but it's
        used by __str__"""
        return self.name in BeautifulSoup.SELF_CLOSING_TAGS

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized, and returns it."""
        # Read the instance dictionary directly: a plain attribute
        # lookup for a missing 'attrMap' would fall into __getattr__
        # and search the document for a child tag of that name.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]
        # Falling off the end finishes the generator.

    def recursiveChildGenerator(self):
        """Yields every descendant of this tag in document order, using
        an explicit stack instead of recursion."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Remember where to resume in this tag, then
                        # descend into the child just yielded.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i + 1))
                        stack.append((a, 0))
                        break
        # Falling off the end finishes the generator.
540
541
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike: anything that
    has an __iter__ attribute, or is a list or a tuple."""
    if hasattr(l, '__iter__'):
        return True
    return isinstance(l, (list, tuple))
547
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
    of lists and partial maps.

    Maps are merged in as they are; each list item and each scalar
    becomes a key mapped to 'default'. Later arguments override
    earlier ones on duplicate keys."""
    tagMap = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for key, value in portion.items():
                tagMap[key] = value
            continue
        if isList(portion):
            #It's a list. Map each item to the default.
            tagMap.update(dict.fromkeys(portion, default))
            continue
        #It's a scalar. Map it to the default.
        tagMap[portion] = default
    return tagMap
566
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and fetch code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    #As a public service we will by default silently replace MS smart quotes
    #and similar characters with their HTML or ASCII equivalents.
    #Every value is an entity reference or plain ASCII: the substitution
    #runs before parsing, so a raw '<' or '>' here would be read as markup.
    MS_CHARS = { '\x80' : '&euro;',
                 '\x81' : ' ',
                 '\x82' : '&sbquo;',
                 '\x83' : '&fnof;',
                 '\x84' : '&bdquo;',
                 '\x85' : '&hellip;',
                 '\x86' : '&dagger;',
                 '\x87' : '&Dagger;',
                 '\x88' : '&caret;',
                 '\x89' : '%',
                 '\x8A' : '&Scaron;',
                 '\x8B' : '&lt;',
                 '\x8C' : '&OElig;',
                 '\x8D' : '?',
                 '\x8E' : 'Z',
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : '&lsquo;',
                 '\x92' : '&rsquo;',
                 '\x93' : '&ldquo;',
                 '\x94' : '&rdquo;',
                 '\x95' : '&bull;',
                 '\x96' : '&ndash;',
                 '\x97' : '&mdash;',
                 '\x98' : '&tilde;',
                 '\x99' : '&trade;',
                 '\x9a' : '&scaron;',
                 '\x9b' : '&gt;',
                 '\x9c' : '&oelig;',
                 '\x9d' : '?',
                 '\x9e' : 'z',
                 '\x9f' : '&Yuml;', }

    #Each entry is (compiled pattern, replacement function); feed()
    #applies them in order to the text before it is parsed.
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>'),
                      (re.compile("([\x80-\x9f])"),
                       lambda x: BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
                      ]

    ROOT_TAG_NAME = '[document]'

    def __init__(self, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        """Initialize this as the 'root tag' and feed in any text to
        the parser.

        'text' may be a string or an object with a read() method.

        NOTE about avoidParserProblems: sgmllib will process most bad
        HTML, and BeautifulSoup has tricks for dealing with some HTML
        that kills sgmllib, but Beautiful Soup can nonetheless choke
        or lose data if your data uses self-closing tags or
        declarations incorrectly. By default, Beautiful Soup sanitizes
        its input to avoid the vast majority of these problems. The
        problems are relatively rare, even in bad HTML, so feel free
        to pass in False to avoidParserProblems if they don't apply to
        you, and you'll get better performance. The only reason I have
        this turned on by default is so I don't get so many tech
        support questions.

        The two most common instances of invalid HTML that will choke
        sgmllib are fixed by the default parser massage techniques:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want.

        If 'initialTextIsEverything' is true, done() is called once
        the text has been fed in."""
        Tag.__init__(self, self.ROOT_TAG_NAME)
        # Any true value that is not itself a list selects the default
        # massage list.
        if avoidParserProblems \
           and not isList(avoidParserProblems):
            avoidParserProblems = self.PARSER_MASSAGE
        self.avoidParserProblems = avoidParserProblems
        SGMLParser.__init__(self)
        self.quoteStack = []
        self.hidden = 1
        self.reset()
        if hasattr(text, 'read'):
            #It's a file-type object.
            text = text.read()
        if text:
            self.feed(text)
        if initialTextIsEverything:
            self.done()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def feed(self, text):
        """Applies the configured massage substitutions to 'text', if
        any, then hands the result to SGMLParser.feed."""
        if self.avoidParserProblems:
            for fix, m in self.avoidParserProblems:
                text = fix.sub(m, text)
        SGMLParser.feed(self, text)

    def done(self):
        """Called when you're done parsing, so that the unclosed tags can be
        correctly processed."""
        self.endData() #NEW
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Resets the underlying SGMLParser and the tag stack, leaving
        this object as the only tag on the stack."""
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def popTag(self):
        """Pops the innermost open tag off the stack and returns the
        tag that is current afterwards."""
        self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableText):
            self.currentTag.string = self.currentTag.contents[0]

        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Appends 'tag' to the current tag's contents (if there is a
        current tag) and makes it the new current tag."""
        if self.currentTag:
            self.currentTag.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self):
        """Turns the text collected so far into a navigable string
        attached to the current tag, then clears the collected text."""
        currentData = ''.join(self.currentData)
        if currentData:
            # A whitespace-only run collapses to a single newline if it
            # held one, otherwise to a single space.
            if not currentData.strip():
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            c = NavigableString
            if type(currentData) == types.UnicodeType:
                c = NavigableUnicodeString
            o = c(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)
        self.currentData = []

    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but not including the most recent instance of
        the given tag. Returns the result of the last pop, or None if
        nothing was popped."""
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Index 0 is the root and is never a candidate.
        for i in range(len(self.tagStack) - 1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers is not None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack) - 1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.

                popTo = p.name
                inclusive = False
                break
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handles a start tag: creates a Tag, links it after the
        previous element and pushes it on the tag stack. While a quote
        tag is open the tag is passed through as text instead."""
        if self.quoteStack:
            #This is not a real tag.
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()
        if not name in self.SELF_CLOSING_TAGS and not selfClosing:
            self._smartPop(name)
        tag = Tag(name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or name in self.SELF_CLOSING_TAGS:
            self.popTag()
        if name in self.QUOTE_TAGS:
            self.quoteStack.append(name)
            self.literal = 1

    def unknown_endtag(self, name):
        """Handles an end tag: pops the stack up to the matching tag.
        While a quote tag is open, any other end tag is passed through
        as text."""
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        "Collects text until the next call to endData()."
        self.currentData.append(data)

    def handle_pi(self, text):
        "Propagate processing instructions right through."
        self.handle_data("<?%s>" % text)

    def handle_comment(self, text):
        "Propagate comments right through."
        self.handle_data("<!--%s-->" % text)

    def handle_charref(self, ref):
        "Propagate char refs right through."
        self.handle_data('&#%s;' % ref)

    def handle_entityref(self, ref):
        "Propagate entity refs right through."
        self.handle_data('&%s;' % ref)

    def handle_decl(self, data):
        "Propagate DOCTYPEs and the like right through."
        self.handle_data('<!%s>' % data)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as regular data."""
        j = None
        if self.rawdata[i:i + 9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated section: take everything that is left.
                k = len(self.rawdata)
            self.handle_data(self.rawdata[i + 9:k])
            j = k + 3
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
894
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup before writing your own
    subclass."""

    #Tags that are closed as soon as they are opened.
    SELF_CLOSING_TAGS = buildTagMap(
        None,
        ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame',
         'base'])

    #Tags whose content is kept as text rather than parsed as tags.
    QUOTE_TAGS = {'script': None}

    #Inline tags that, per the HTML standard, may contain another tag
    #of the same type, and that are commonly used that way.
    NESTABLE_INLINE_TAGS = [
        'span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']

    #Block tags that, per the HTML standard, may contain another tag
    #of the same type, and that are commonly used that way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions. Each
    #tag maps to the tags that reset its nesting.
    NESTABLE_LIST_TAGS = {
        'ol' : [],
        'ul' : [],
        'li' : ['ul', 'ol'],
        'dl' : [],
        'dd' : ['dl'],
        'dt' : ['dl'],
    }

    #Tables can contain other tables, but there are restrictions. Each
    #tag maps to the tags that reset its nesting.
    NESTABLE_TABLE_TAGS = {
        'table' : [],
        'tr' : ['table', 'tbody', 'tfoot', 'thead'],
        'td' : ['tr'],
        'th' : ['tr'],
    }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(
        None,
        NESTABLE_BLOCK_TAGS,
        'noscript',
        NON_NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    #Every nestable tag, mapped to its list of nesting reset triggers.
    NESTABLE_TAGS = buildTagMap(
        [],
        NESTABLE_INLINE_TAGS,
        NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)
985
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
    """HTML parser for markup that is valid but bizarre.

    The BeautifulSoup class is oriented towards skipping over common
    HTML errors like unclosed tags. However, sometimes it makes errors
    of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close (eg.) a 'b'
    tag than to actually use nested 'b' tags, and the BeautifulSoup
    class handles the common case. This class handles the
    not-so-common case: where you can't believe someone wrote what
    they did, but it's valid HTML and BeautifulSoup screwed up by
    assuming it wouldn't be.

    If this doesn't do what you need, try subclassing this class or
    BeautifulSoup, and providing your own list of NESTABLE_TAGS."""

    # Inline tags that this class additionally treats as nestable.
    # Each tag is listed exactly once; the original list repeated
    # 'strong' and 'big'.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
        ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
         'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    # Block tags that this class additionally treats as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Everything BeautifulSoup already considers nestable, plus the two
    # extra groups above; [] is the default handed to buildTagMap.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1024
class BeautifulSOAP(BeautifulStoneSoup):
    """Parser that mirrors single-string child tags as parent attributes.

    Whenever a tag whose only child is one string gets popped, that
    string is also stored on the tag's parent as an attribute named
    after the tag. For example:

     <foo><bar>baz</bar></foo>
      =>
     <foo bar="baz"><bar>baz</bar></foo>

    so fooTag['bar'] can be used instead of fooTag.barTag.string.

    This is handy for scraping formats that prefer subelements to
    attributes, such as SOAP messages. Be aware that the parsed input
    is modified, so the modified version should not be printed back
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copy a lone string child up to the parent, then pop the tag.

        An attribute the parent already has is never overwritten. The
        actual popping is always delegated to BeautifulStoneSoup.
        """
        stack = self.tagStack
        if len(stack) > 1:
            child, parent = stack[-1], stack[-2]
            # Called before parent.attrMap is read below.
            parent._getAttrMap()
            holdsLoneString = (
                isinstance(child, Tag)
                and len(child.contents) == 1
                and isinstance(child.contents[0], NavigableText))
            if holdsLoneString and not parent.attrMap.has_key(child.name):
                parent[child.name] = child.contents[0]
        BeautifulStoneSoup.popTag(self)
1055
1056	#Enterprise class names! It has come to our attention that some people
1057	#think the names of the Beautiful Soup parser classes are too silly
1058	#and "unprofessional" for use in enterprise screen-scraping. We feel
1059	#your pain! For such-minded folk, the Beautiful Soup Consortium And
1060	#All-Night Kosher Bakery recommends renaming this file to
1061	#"RobustParser.py" (or, in cases of extreme enterprisitude,
1062	#"RobustParserBeanInterface.class") and using the following
1063	#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup; adds nothing."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup; adds nothing."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup; adds nothing."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP; adds nothing."""
1072
1073	###
1074
1075
#By default, act as an HTML pretty-printer: read markup from standard
#input and write the prettified parse tree to standard output.
if __name__ == '__main__':
    import sys
    # Parse with BeautifulSoup, the HTML parser, so the HTML heuristics
    # (self-closing tags, implicit tag closing) are applied. The
    # previous code used BeautifulStoneSoup, the XML/SGML parser,
    # which contradicted the stated purpose of this entry point.
    soup = BeautifulSoup(sys.stdin.read())
    # Parenthesised single-argument form behaves the same as the print
    # statement on Python 2 and is also valid on Python 3.
    print(soup.prettify())

Note: See TracBrowser for help on using the repository browser.

Download in other formats: