Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: py-scraping/mechanize/_beautifulsoup.py@ 200

Last change on this file since 200 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 39.9 KB

Rev	Line
[106]	1	"""Beautiful Soup
	2	Elixir and Tonic
	3	"The Screen-Scraper's Friend"
	4	v2.1.1
	5	http://www.crummy.com/software/BeautifulSoup/
	6
	7	Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
	8	into a tree representation. It provides methods and Pythonic idioms
	9	that make it easy to search and modify the tree.
	10
	11	A well-formed XML/HTML document will yield a well-formed data
	12	structure. An ill-formed XML/HTML document will yield a
	13	correspondingly ill-formed data structure. If your document is only
	14	locally well-formed, you can use this library to find and process the
	15	well-formed part of it. The BeautifulSoup class has heuristics for
	16	obtaining a sensible parse tree in the face of common HTML errors.
	17
	18	Beautiful Soup has no external dependencies. It works with Python 2.2
	19	and up.
	20
	21	Beautiful Soup defines classes for four different parsing strategies:
	22
	23	* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
	24	language that kind of looks like XML.
	25
	26	* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
	27	or invalid.
	28
	29	* ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
	30	that trips up BeautifulSoup.
	31
	32	* BeautifulSOAP, for making it easier to parse XML documents that use
	33	lots of subelements containing a single string, where you'd prefer
	34	they put that string into an attribute (such as SOAP messages).
	35
	36	You can subclass BeautifulStoneSoup or BeautifulSoup to create a
	37	parsing strategy specific to an XML schema or a particular bizarre
	38	HTML document. Typically your subclass would just override
	39	SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
	40	""" #"
	41	from __future__ import generators
	42
	43	__author__ = "Leonard Richardson (leonardr@segfault.org)"
	44	__version__ = "2.1.1"
	45	__date__ = "$Date: 2004/10/18 00:14:20 $"
	46	__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
	47	__license__ = "PSF"
	48
	49	from sgmllib import SGMLParser, SGMLParseError
	50	import types
	51	import re
	52	import sgmllib
	53
#This code makes Beautiful Soup able to parse XML with namespaces:
#the replacement pattern accepts ':' (as well as '-', '_' and '.')
#after the first letter of a tag name. Note that this rebinds the
#module-level attribute sgmllib.tagfind, not a per-parser setting.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
	56
	57	class NullType(object):
	58
	59	"""Similar to NoneType with a corresponding singleton instance
	60	'Null' that, unlike None, accepts any message and returns itself.
	61
	62	Examples:
	63	>>> Null("send", "a", "message")("and one more",
	64	... "and what you get still") is Null
	65	True
	66	"""
	67
	68	def __new__(cls): return Null
	69	def __call__(self, args, *kwargs): return Null
	70	## def __getstate__(self, *args): return Null
	71	def __getattr__(self, attr): return Null
	72	def __getitem__(self, item): return Null
	73	def __setattr__(self, attr, value): pass
	74	def __setitem__(self, item, value): pass
	75	def __len__(self): return 0
	76	# FIXME: is this a python bug? otherwise ``for x in Null: pass``
	77	# never terminates...
	78	def __iter__(self): return iter([])
	79	def __contains__(self, item): return False
	80	def __repr__(self): return "Null"
	81	Null = object.__new__(NullType)
	82
	83	class PageElement:
	84	"""Contains the navigational information for some part of the page
	85	(either a tag or a piece of text)"""
	86
	87	def setup(self, parent=Null, previous=Null):
	88	"""Sets up the initial relations between this element and
	89	other elements."""
	90	self.parent = parent
	91	self.previous = previous
	92	self.next = Null
	93	self.previousSibling = Null
	94	self.nextSibling = Null
	95	if self.parent and self.parent.contents:
	96	self.previousSibling = self.parent.contents[-1]
	97	self.previousSibling.nextSibling = self
	98
	99	def findNext(self, name=None, attrs={}, text=None):
	100	"""Returns the first item that matches the given criteria and
	101	appears after this Tag in the document."""
	102	return self._first(self.fetchNext, name, attrs, text)
	103	firstNext = findNext
	104
	105	def fetchNext(self, name=None, attrs={}, text=None, limit=None):
	106	"""Returns all items that match the given criteria and appear
	107	before after Tag in the document."""
	108	return self._fetch(name, attrs, text, limit, self.nextGenerator)
	109
	110	def findNextSibling(self, name=None, attrs={}, text=None):
	111	"""Returns the closest sibling to this Tag that matches the
	112	given criteria and appears after this Tag in the document."""
	113	return self._first(self.fetchNextSiblings, name, attrs, text)
	114	firstNextSibling = findNextSibling
	115
	116	def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
	117	"""Returns the siblings of this Tag that match the given
	118	criteria and appear after this Tag in the document."""
	119	return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)
	120
	121	def findPrevious(self, name=None, attrs={}, text=None):
	122	"""Returns the first item that matches the given criteria and
	123	appears before this Tag in the document."""
	124	return self._first(self.fetchPrevious, name, attrs, text)
	125
	126	def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
	127	"""Returns all items that match the given criteria and appear
	128	before this Tag in the document."""
	129	return self._fetch(name, attrs, text, limit, self.previousGenerator)
	130	firstPrevious = findPrevious
	131
	132	def findPreviousSibling(self, name=None, attrs={}, text=None):
	133	"""Returns the closest sibling to this Tag that matches the
	134	given criteria and appears before this Tag in the document."""
	135	return self._first(self.fetchPreviousSiblings, name, attrs, text)
	136	firstPreviousSibling = findPreviousSibling
	137
	138	def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
	139	limit=None):
	140	"""Returns the siblings of this Tag that match the given
	141	criteria and appear before this Tag in the document."""
	142	return self._fetch(name, attrs, text, limit,
	143	self.previousSiblingGenerator)
	144
	145	def findParent(self, name=None, attrs={}):
	146	"""Returns the closest parent of this Tag that matches the given
	147	criteria."""
	148	r = Null
	149	l = self.fetchParents(name, attrs, 1)
	150	if l:
	151	r = l[0]
	152	return r
	153	firstParent = findParent
	154
	155	def fetchParents(self, name=None, attrs={}, limit=None):
	156	"""Returns the parents of this Tag that match the given
	157	criteria."""
	158	return self._fetch(name, attrs, None, limit, self.parentGenerator)
	159
	160	#These methods do the real heavy lifting.
	161
	162	def _first(self, method, name, attrs, text):
	163	r = Null
	164	l = method(name, attrs, text, 1)
	165	if l:
	166	r = l[0]
	167	return r
	168
	169	def _fetch(self, name, attrs, text, limit, generator):
	170	"Iterates over a generator looking for things that match."
	171	if not hasattr(attrs, 'items'):
	172	attrs = {'class' : attrs}
	173
	174	results = []
	175	g = generator()
	176	while True:
	177	try:
	178	i = g.next()
	179	except StopIteration:
	180	break
	181	found = None
	182	if isinstance(i, Tag):
	183	if not text:
	184	if not name or self._matches(i, name):
	185	match = True
	186	for attr, matchAgainst in attrs.items():
	187	check = i.get(attr)
	188	if not self._matches(check, matchAgainst):
	189	match = False
	190	break
	191	if match:
	192	found = i
	193	elif text:
	194	if self._matches(i, text):
	195	found = i
	196	if found:
	197	results.append(found)
	198	if limit and len(results) >= limit:
	199	break
	200	return results
	201
	202	#Generators that can be used to navigate starting from both
	203	#NavigableTexts and Tags.
	204	def nextGenerator(self):
	205	i = self
	206	while i:
	207	i = i.next
	208	yield i
	209
	210	def nextSiblingGenerator(self):
	211	i = self
	212	while i:
	213	i = i.nextSibling
	214	yield i
	215
	216	def previousGenerator(self):
	217	i = self
	218	while i:
	219	i = i.previous
	220	yield i
	221
	222	def previousSiblingGenerator(self):
	223	i = self
	224	while i:
	225	i = i.previousSibling
	226	yield i
	227
	228	def parentGenerator(self):
	229	i = self
	230	while i:
	231	i = i.parent
	232	yield i
	233
	234	def _matches(self, chunk, howToMatch):
	235	#print 'looking for %s in %s' % (howToMatch, chunk)
	236	#
	237	# If given a list of items, return true if the list contains a
	238	# text element that matches.
	239	if isList(chunk) and not isinstance(chunk, Tag):
	240	for tag in chunk:
	241	if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
	242	return True
	243	return False
	244	if callable(howToMatch):
	245	return howToMatch(chunk)
	246	if isinstance(chunk, Tag):
	247	#Custom match methods take the tag as an argument, but all other
	248	#ways of matching match the tag name as a string
	249	chunk = chunk.name
	250	#Now we know that chunk is a string
	251	if not isinstance(chunk, basestring):
	252	chunk = str(chunk)
	253	if hasattr(howToMatch, 'match'):
	254	# It's a regexp object.
	255	return howToMatch.search(chunk)
	256	if isList(howToMatch):
	257	return chunk in howToMatch
	258	if hasattr(howToMatch, 'items'):
	259	return howToMatch.has_key(chunk)
	260	#It's just a string
	261	return str(howToMatch) == chunk
	262
	263	class NavigableText(PageElement):
	264
	265	def __getattr__(self, attr):
	266	"For backwards compatibility, text.string gives you text"
	267	if attr == 'string':
	268	return self
	269	else:
	270	raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
	271
class NavigableString(str, NavigableText):
    # A str that also inherits the NavigableText / PageElement
    # navigation members; it adds nothing of its own.
    pass
	274
class NavigableUnicodeString(unicode, NavigableText):
    # A unicode string that also inherits the NavigableText /
    # PageElement navigation members; it adds nothing of its own.
    pass
	277
	278	class Tag(PageElement):
	279
	280	"""Represents a found HTML tag with its attributes and contents."""
	281
	282	def __init__(self, name, attrs=None, parent=Null, previous=Null):
	283	"Basic constructor."
	284	self.name = name
	285	if attrs == None:
	286	attrs = []
	287	self.attrs = attrs
	288	self.contents = []
	289	self.setup(parent, previous)
	290	self.hidden = False
	291
	292	def get(self, key, default=None):
	293	"""Returns the value of the 'key' attribute for the tag, or
	294	the value given for 'default' if it doesn't have that
	295	attribute."""
	296	return self._getAttrMap().get(key, default)
	297
	298	def __getitem__(self, key):
	299	"""tag[key] returns the value of the 'key' attribute for the tag,
	300	and throws an exception if it's not there."""
	301	return self._getAttrMap()[key]
	302
	303	def __iter__(self):
	304	"Iterating over a tag iterates over its contents."
	305	return iter(self.contents)
	306
	307	def __len__(self):
	308	"The length of a tag is the length of its list of contents."
	309	return len(self.contents)
	310
	311	def __contains__(self, x):
	312	return x in self.contents
	313
	314	def __nonzero__(self):
	315	"A tag is non-None even if it has no contents."
	316	return True
	317
	318	def __setitem__(self, key, value):
	319	"""Setting tag[key] sets the value of the 'key' attribute for the
	320	tag."""
	321	self._getAttrMap()
	322	self.attrMap[key] = value
	323	found = False
	324	for i in range(0, len(self.attrs)):
	325	if self.attrs[i][0] == key:
	326	self.attrs[i] = (key, value)
	327	found = True
	328	if not found:
	329	self.attrs.append((key, value))
	330	self._getAttrMap()[key] = value
	331
	332	def __delitem__(self, key):
	333	"Deleting tag[key] deletes all 'key' attributes for the tag."
	334	for item in self.attrs:
	335	if item[0] == key:
	336	self.attrs.remove(item)
	337	#We don't break because bad HTML can define the same
	338	#attribute multiple times.
	339	self._getAttrMap()
	340	if self.attrMap.has_key(key):
	341	del self.attrMap[key]
	342
	343	def __call__(self, args, *kwargs):
	344	"""Calling a tag like a function is the same as calling its
	345	fetch() method. Eg. tag('a') returns a list of all the A tags
	346	found within this tag."""
	347	return apply(self.fetch, args, kwargs)
	348
	349	def __getattr__(self, tag):
	350	if len(tag) > 3 and tag.rfind('Tag') == len(tag) - 3:
	351	return self.first(tag[:-3])
	352	elif tag.find('__') != 0:
	353	return self.first(tag)
	354
	355	def __eq__(self, other):
	356	"""Returns true iff this tag has the same name, the same attributes,
	357	and the same contents (recursively) as the given tag.
	358
	359	NOTE: right now this will return false if two tags have the
	360	same attributes in a different order. Should this be fixed?"""
	361	if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
	362	return False
	363	for i in range(0, len(self.contents)):
	364	if self.contents[i] != other.contents[i]:
	365	return False
	366	return True
	367
	368	def __ne__(self, other):
	369	"""Returns true iff this tag is not identical to the other tag,
	370	as defined in __eq__."""
	371	return not self == other
	372
	373	def __repr__(self):
	374	"""Renders this tag as a string."""
	375	return str(self)
	376
	377	def __unicode__(self):
	378	return self.__str__(1)
	379
	380	def __str__(self, needUnicode=None, showStructureIndent=None):
	381	"""Returns a string or Unicode representation of this tag and
	382	its contents.
	383
	384	NOTE: since Python's HTML parser consumes whitespace, this
	385	method is not certain to reproduce the whitespace present in
	386	the original string."""
	387
	388	attrs = []
	389	if self.attrs:
	390	for key, val in self.attrs:
	391	attrs.append('%s="%s"' % (key, val))
	392	close = ''
	393	closeTag = ''
	394	if self.isSelfClosing():
	395	close = ' /'
	396	else:
	397	closeTag = '</%s>' % self.name
	398	indentIncrement = None
	399	if showStructureIndent != None:
	400	indentIncrement = showStructureIndent
	401	if not self.hidden:
	402	indentIncrement += 1
	403	contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
	404	if showStructureIndent:
	405	space = '\n%s' % (' ' * showStructureIndent)
	406	if self.hidden:
	407	s = contents
	408	else:
	409	s = []
	410	attributeString = ''
	411	if attrs:
	412	attributeString = ' ' + ' '.join(attrs)
	413	if showStructureIndent:
	414	s.append(space)
	415	s.append('<%s%s%s>' % (self.name, attributeString, close))
	416	s.append(contents)
	417	if closeTag and showStructureIndent != None:
	418	s.append(space)
	419	s.append(closeTag)
	420	s = ''.join(s)
	421	isUnicode = type(s) == types.UnicodeType
	422	if needUnicode and not isUnicode:
	423	s = unicode(s)
	424	elif isUnicode and needUnicode == False:
	425	s = str(s)
	426	return s
	427
	428	def prettify(self, needUnicode=None):
	429	return self.__str__(needUnicode, showStructureIndent=True)
	430
	431	def renderContents(self, showStructureIndent=None, needUnicode=None):
	432	"""Renders the contents of this tag as a (possibly Unicode)
	433	string."""
	434	s = []
	435	for c in self:
	436	text = None
	437	if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
	438	text = unicode(c)
	439	elif isinstance(c, Tag):
	440	s.append(c.__str__(needUnicode, showStructureIndent))
	441	elif needUnicode:
	442	text = unicode(c)
	443	else:
	444	text = str(c)
	445	if text:
	446	if showStructureIndent != None:
	447	if text[-1] == '\n':
	448	text = text[:-1]
	449	s.append(text)
	450	return ''.join(s)
	451
	452	#Soup methods
	453
	454	def firstText(self, text, recursive=True):
	455	"""Convenience method to retrieve the first piece of text matching the
	456	given criteria. 'text' can be a string, a regular expression object,
	457	a callable that takes a string and returns whether or not the
	458	string 'matches', etc."""
	459	return self.first(recursive=recursive, text=text)
	460
	461	def fetchText(self, text, recursive=True, limit=None):
	462	"""Convenience method to retrieve all pieces of text matching the
	463	given criteria. 'text' can be a string, a regular expression object,
	464	a callable that takes a string and returns whether or not the
	465	string 'matches', etc."""
	466	return self.fetch(recursive=recursive, text=text, limit=limit)
	467
	468	def first(self, name=None, attrs={}, recursive=True, text=None):
	469	"""Return only the first child of this
	470	Tag matching the given criteria."""
	471	r = Null
	472	l = self.fetch(name, attrs, recursive, text, 1)
	473	if l:
	474	r = l[0]
	475	return r
	476	findChild = first
	477
	478	def fetch(self, name=None, attrs={}, recursive=True, text=None,
	479	limit=None):
	480	"""Extracts a list of Tag objects that match the given
	481	criteria. You can specify the name of the Tag and any
	482	attributes you want the Tag to have.
	483
	484	The value of a key-value pair in the 'attrs' map can be a
	485	string, a list of strings, a regular expression object, or a
	486	callable that takes a string and returns whether or not the
	487	string matches for some custom definition of 'matches'. The
	488	same is true of the tag name."""
	489	generator = self.recursiveChildGenerator
	490	if not recursive:
	491	generator = self.childGenerator
	492	return self._fetch(name, attrs, text, limit, generator)
	493	fetchChildren = fetch
	494
	495	#Utility methods
	496
	497	def isSelfClosing(self):
	498	"""Returns true iff this is a self-closing tag as defined in the HTML
	499	standard.
	500
	501	TODO: This is specific to BeautifulSoup and its subclasses, but it's
	502	used by __str__"""
	503	return self.name in BeautifulSoup.SELF_CLOSING_TAGS
	504
	505	def append(self, tag):
	506	"""Appends the given tag to the contents of this tag."""
	507	self.contents.append(tag)
	508
	509	#Private methods
	510
	511	def _getAttrMap(self):
	512	"""Initializes a map representation of this tag's attributes,
	513	if not already initialized."""
	514	if not getattr(self, 'attrMap'):
	515	self.attrMap = {}
	516	for (key, value) in self.attrs:
	517	self.attrMap[key] = value
	518	return self.attrMap
	519
	520	#Generator methods
	521	def childGenerator(self):
	522	for i in range(0, len(self.contents)):
	523	yield self.contents[i]
	524	raise StopIteration
	525
	526	def recursiveChildGenerator(self):
	527	stack = [(self, 0)]
	528	while stack:
	529	tag, start = stack.pop()
	530	if isinstance(tag, Tag):
	531	for i in range(start, len(tag.contents)):
	532	a = tag.contents[i]
	533	yield a
	534	if isinstance(a, Tag) and tag.contents:
	535	if i < len(tag.contents) - 1:
	536	stack.append((tag, i + 1))
	537	stack.append((a, 0))
	538	break
	539	raise StopIteration
	540
	541
	542	def isList(l):
	543	"""Convenience method that works with all 2.x versions of Python
	544	to determine whether or not something is listlike."""
	545	return hasattr(l, '__iter__') \
	546	or (type(l) in (types.ListType, types.TupleType))
	547
	548	def buildTagMap(default, *args):
	549	"""Turns a list of maps, lists, or scalars into a single map.
	550	Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
	551	of lists and partial maps."""
	552	built = {}
	553	for portion in args:
	554	if hasattr(portion, 'items'):
	555	#It's a map. Merge it.
	556	for k, v in portion.items():
	557	built[k] = v
	558	elif isList(portion):
	559	#It's a list. Map each item to the default.
	560	for k in portion:
	561	built[k] = default
	562	else:
	563	#It's a scalar. Map it to the default.
	564	built[portion] = default
	565	return built
	566
	567	class BeautifulStoneSoup(Tag, SGMLParser):
	568
	569	"""This class contains the basic parser and fetch code. It defines
	570	a parser that knows nothing about tag behavior except for the
	571	following:
	572
	573	You can't close a tag without closing all the tags it encloses.
	574	That is, "<foo><bar></foo>" actually means
	575	"<foo><bar></bar></foo>".
	576
	577	[Another possible explanation is "<foo><bar /></foo>", but since
	578	this class defines no SELF_CLOSING_TAGS, it will never use that
	579	explanation.]
	580
	581	This class is useful for parsing XML or made-up markup languages,
	582	or when BeautifulSoup makes an assumption counter to what you were
	583	expecting."""
	584
	585	SELF_CLOSING_TAGS = {}
	586	NESTABLE_TAGS = {}
	587	RESET_NESTING_TAGS = {}
	588	QUOTE_TAGS = {}
	589
	590	#As a public service we will by default silently replace MS smart quotes
	591	#and similar characters with their HTML or ASCII equivalents.
	592	MS_CHARS = { '\x80' : '€',
	593	'\x81' : ' ',
	594	'\x82' : '&sbquo;',
	595	'\x83' : '&fnof;',
	596	'\x84' : '&bdquo;',
	597	'\x85' : '…',
	598	'\x86' : '&dagger;',
	599	'\x87' : '&Dagger;',
	600	'\x88' : '&caret;',
	601	'\x89' : '%',
	602	'\x8A' : '&Scaron;',
	603	'\x8B' : '<',
	604	'\x8C' : '&OElig;',
	605	'\x8D' : '?',
	606	'\x8E' : 'Z',
	607	'\x8F' : '?',
	608	'\x90' : '?',
	609	'\x91' : '‘',
	610	'\x92' : '’',
	611	'\x93' : '“',
	612	'\x94' : '”',
	613	'\x95' : '•',
	614	'\x96' : '–',
	615	'\x97' : '—',
	616	'\x98' : '&tilde;',
	617	'\x99' : '™',
	618	'\x9a' : '&scaron;',
	619	'\x9b' : '>',
	620	'\x9c' : '&oelig;',
	621	'\x9d' : '?',
	622	'\x9e' : 'z',
	623	'\x9f' : '&Yuml;', }
	624
	625	PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
	626	lambda(x):x.group(1) + ' />'),
	627	(re.compile('<!\s+([^<>]*)>'),
	628	lambda(x):'<!' + x.group(1) + '>'),
	629	(re.compile("([\x80-\x9f])"),
	630	lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
	631	]
	632
	633	ROOT_TAG_NAME = '[document]'
	634
	635	def __init__(self, text=None, avoidParserProblems=True,
	636	initialTextIsEverything=True):
	637	"""Initialize this as the 'root tag' and feed in any text to
	638	the parser.
	639
	640	NOTE about avoidParserProblems: sgmllib will process most bad
	641	HTML, and BeautifulSoup has tricks for dealing with some HTML
	642	that kills sgmllib, but Beautiful Soup can nonetheless choke
	643	or lose data if your data uses self-closing tags or
	644	declarations incorrectly. By default, Beautiful Soup sanitizes
	645	its input to avoid the vast majority of these problems. The
	646	problems are relatively rare, even in bad HTML, so feel free
	647	to pass in False to avoidParserProblems if they don't apply to
	648	you, and you'll get better performance. The only reason I have
	649	this turned on by default is so I don't get so many tech
	650	support questions.
	651
	652	The two most common instances of invalid HTML that will choke
	653	sgmllib are fixed by the default parser massage techniques:
	654
	655	<br/> (No space between name of closing tag and tag close)
	656	<! --Comment--> (Extraneous whitespace in declaration)
	657
	658	You can pass in a custom list of (RE object, replace method)
	659	tuples to get Beautiful Soup to scrub your input the way you
	660	want."""
	661	Tag.__init__(self, self.ROOT_TAG_NAME)
	662	if avoidParserProblems \
	663	and not isList(avoidParserProblems):
	664	avoidParserProblems = self.PARSER_MASSAGE
	665	self.avoidParserProblems = avoidParserProblems
	666	SGMLParser.__init__(self)
	667	self.quoteStack = []
	668	self.hidden = 1
	669	self.reset()
	670	if hasattr(text, 'read'):
	671	#It's a file-type object.
	672	text = text.read()
	673	if text:
	674	self.feed(text)
	675	if initialTextIsEverything:
	676	self.done()
	677
	678	def __getattr__(self, methodName):
	679	"""This method routes method call requests to either the SGMLParser
	680	superclass or the Tag superclass, depending on the method name."""
	681	if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
	682	or methodName.find('do_') == 0:
	683	return SGMLParser.__getattr__(self, methodName)
	684	elif methodName.find('__') != 0:
	685	return Tag.__getattr__(self, methodName)
	686	else:
	687	raise AttributeError
	688
	689	def feed(self, text):
	690	if self.avoidParserProblems:
	691	for fix, m in self.avoidParserProblems:
	692	text = fix.sub(m, text)
	693	SGMLParser.feed(self, text)
	694
	695	def done(self):
	696	"""Called when you're done parsing, so that the unclosed tags can be
	697	correctly processed."""
	698	self.endData() #NEW
	699	while self.currentTag.name != self.ROOT_TAG_NAME:
	700	self.popTag()
	701
	702	def reset(self):
	703	SGMLParser.reset(self)
	704	self.currentData = []
	705	self.currentTag = None
	706	self.tagStack = []
	707	self.pushTag(self)
	708
	709	def popTag(self):
	710	tag = self.tagStack.pop()
	711	# Tags with just one string-owning child get the child as a
	712	# 'string' property, so that soup.tag.string is shorthand for
	713	# soup.tag.contents[0]
	714	if len(self.currentTag.contents) == 1 and \
	715	isinstance(self.currentTag.contents[0], NavigableText):
	716	self.currentTag.string = self.currentTag.contents[0]
	717
	718	#print "Pop", tag.name
	719	if self.tagStack:
	720	self.currentTag = self.tagStack[-1]
	721	return self.currentTag
	722
	723	def pushTag(self, tag):
	724	#print "Push", tag.name
	725	if self.currentTag:
	726	self.currentTag.append(tag)
	727	self.tagStack.append(tag)
	728	self.currentTag = self.tagStack[-1]
	729
	730	def endData(self):
	731	currentData = ''.join(self.currentData)
	732	if currentData:
	733	if not currentData.strip():
	734	if '\n' in currentData:
	735	currentData = '\n'
	736	else:
	737	currentData = ' '
	738	c = NavigableString
	739	if type(currentData) == types.UnicodeType:
	740	c = NavigableUnicodeString
	741	o = c(currentData)
	742	o.setup(self.currentTag, self.previous)
	743	if self.previous:
	744	self.previous.next = o
	745	self.previous = o
	746	self.currentTag.contents.append(o)
	747	self.currentData = []
	748
	749	def _popToTag(self, name, inclusivePop=True):
	750	"""Pops the tag stack up to and including the most recent
	751	instance of the given tag. If inclusivePop is false, pops the tag
	752	stack up to but not including the most recent instqance of
	753	the given tag."""
	754	if name == self.ROOT_TAG_NAME:
	755	return
	756
	757	numPops = 0
	758	mostRecentTag = None
	759	for i in range(len(self.tagStack) - 1, 0, -1):
	760	if name == self.tagStack[i].name:
	761	numPops = len(self.tagStack) - i
	762	break
	763	if not inclusivePop:
	764	numPops = numPops - 1
	765
	766	for i in range(0, numPops):
	767	mostRecentTag = self.popTag()
	768	return mostRecentTag
	769
	770	def _smartPop(self, name):
	771
	772	"""We need to pop up to the previous tag of this type, unless
	773	one of this tag's nesting reset triggers comes between this
	774	tag and the previous tag of this type, OR unless this tag is a
	775	generic nesting trigger and another generic nesting trigger
	776	comes between this tag and the previous tag of this type.
	777
	778	Examples:
	779	<p>Foo<b>Bar<p> should pop to 'p', not 'b'.
	780	<p>Foo<table>Bar<p> should pop to 'table', not 'p'.
	781	<p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
	782	<p>Foo<b>Bar<p> should pop to 'p', not 'b'.
	783
	784	<li><ul><li> <li> should pop to 'ul', not the first 'li'.
	785	<tr><table><tr> <tr> should pop to 'table', not the first 'tr'
	786	<td><tr><td> <td> should pop to 'tr', not the first 'td'
	787	"""
	788
	789	nestingResetTriggers = self.NESTABLE_TAGS.get(name)
	790	isNestable = nestingResetTriggers != None
	791	isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
	792	popTo = None
	793	inclusive = True
	794	for i in range(len(self.tagStack) - 1, 0, -1):
	795	p = self.tagStack[i]
	796	if (not p or p.name == name) and not isNestable:
	797	#Non-nestable tags get popped to the top or to their
	798	#last occurance.
	799	popTo = name
	800	break
	801	if (nestingResetTriggers != None
	802	and p.name in nestingResetTriggers) \
	803	or (nestingResetTriggers == None and isResetNesting
	804	and self.RESET_NESTING_TAGS.has_key(p.name)):
	805
	806	#If we encounter one of the nesting reset triggers
	807	#peculiar to this tag, or we encounter another tag
	808	#that causes nesting to reset, pop up to but not
	809	#including that tag.
	810
	811	popTo = p.name
	812	inclusive = False
	813	break
	814	p = p.parent
	815	if popTo:
	816	self._popToTag(popTo, inclusive)
	817
	818	def unknown_starttag(self, name, attrs, selfClosing=0):
	819	#print "Start tag %s" % name
	820	if self.quoteStack:
	821	#This is not a real tag.
	822	#print "<%s> is not real!" % name
	823	attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
	824	self.handle_data('<%s%s>' % (name, attrs))
	825	return
	826	self.endData()
	827	if not name in self.SELF_CLOSING_TAGS and not selfClosing:
	828	self._smartPop(name)
	829	tag = Tag(name, attrs, self.currentTag, self.previous)
	830	if self.previous:
	831	self.previous.next = tag
	832	self.previous = tag
	833	self.pushTag(tag)
	834	if selfClosing or name in self.SELF_CLOSING_TAGS:
	835	self.popTag()
	836	if name in self.QUOTE_TAGS:
	837	#print "Beginning quote (%s)" % name
	838	self.quoteStack.append(name)
	839	self.literal = 1
	840
	841	def unknown_endtag(self, name):
	842	if self.quoteStack and self.quoteStack[-1] != name:
	843	#This is not a real end tag.
	844	#print "</%s> is not real!" % name
	845	self.handle_data('</%s>' % name)
	846	return
	847	self.endData()
	848	self._popToTag(name)
	849	if self.quoteStack and self.quoteStack[-1] == name:
	850	self.quoteStack.pop()
	851	self.literal = (len(self.quoteStack) > 0)
	852
	853	def handle_data(self, data):
	854	self.currentData.append(data)
	855
	856	def handle_pi(self, text):
	857	"Propagate processing instructions right through."
	858	self.handle_data("<?%s>" % text)
	859
	860	def handle_comment(self, text):
	861	"Propagate comments right through."
	862	self.handle_data("<!--%s-->" % text)
	863
	864	def handle_charref(self, ref):
	865	"Propagate char refs right through."
	866	self.handle_data('&#%s;' % ref)
	867
	868	def handle_entityref(self, ref):
	869	"Propagate entity refs right through."
	870	self.handle_data('&%s;' % ref)
	871
	872	def handle_decl(self, data):
	873	"Propagate DOCTYPEs and the like right through."
	874	self.handle_data('<!%s>' % data)
	875
	876	def parse_declaration(self, i):
	877	"""Treat a bogus SGML declaration as raw data. Treat a CDATA
	878	declaration as regular data."""
	879	j = None
	880	if self.rawdata[i:i + 9] == '<![CDATA[':
	881	k = self.rawdata.find(']]>', i)
	882	if k == -1:
	883	k = len(self.rawdata)
	884	self.handle_data(self.rawdata[i + 9:k])
	885	j = k + 3
	886	else:
	887	try:
	888	j = SGMLParser.parse_declaration(self, i)
	889	except SGMLParseError:
	890	toHandle = self.rawdata[i:]
	891	self.handle_data(toHandle)
	892	j = i + len(toHandle)
	893	return j
	894
	895	class BeautifulSoup(BeautifulStoneSoup):
	896
	897	"""This parser knows the following facts about HTML:
	898
	899	* Some tags have no closing tag and should be interpreted as being
	900	closed as soon as they are encountered.
	901
	902	* The text inside some tags (ie. 'script') may contain tags which
	903	are not really part of the document and which should be parsed
	904	as text, not tags. If you want to parse the text as tags, you can
	905	always fetch it and parse it explicitly.
	906
	907	* Tag nesting rules:
	908
	909	Most tags can't be nested at all. For instance, the occurance of
	910	a <p> tag should implicitly close the previous <p> tag.
	911
	912	<p>Para1<p>Para2
	913	should be transformed into:
	914	<p>Para1</p><p>Para2
	915
	916	Some tags can be nested arbitrarily. For instance, the occurance
	917	of a <blockquote> tag should _not_ implicitly close the previous
	918	<blockquote> tag.
	919
	920	Alice said: <blockquote>Bob said: <blockquote>Blah
	921	should NOT be transformed into:
	922	Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
	923
	924	Some tags can be nested, but the nesting is reset by the
	925	interposition of other tags. For instance, a <tr> tag should
	926	implicitly close the previous <tr> tag within the same <table>,
	927	but not close a <tr> tag in another table.
	928
	929	<table><tr>Blah<tr>Blah
	930	should be transformed into:
	931	<table><tr>Blah</tr><tr>Blah
	932	but,
	933	<tr>Blah<table><tr>Blah
	934	should NOT be transformed into
	935	<tr>Blah<table></tr><tr>Blah
	936
	937	Differing assumptions about tag nesting rules are a major source
	938	of problems with the BeautifulSoup class. If BeautifulSoup is not
	939	treating as nestable a tag your page author treats as nestable,
	940	try ICantBelieveItsBeautifulSoup before writing your own
	941	subclass."""
	942
	943	SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta',
	944	'spacer', 'link', 'frame', 'base'])
	945
	946	QUOTE_TAGS = {'script': None}
	947
	948	#According to the HTML standard, each of these inline tags can
	949	#contain another tag of the same type. Furthermore, it's common
	950	#to actually use these tags this way.
	951	NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
	952	'center']
	953
	954	#According to the HTML standard, these block tags can contain
	955	#another tag of the same type. Furthermore, it's common
	956	#to actually use these tags this way.
	957	NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
	958
	959	#Lists can contain other lists, but there are restrictions.
	960	NESTABLE_LIST_TAGS = { 'ol' : [],
	961	'ul' : [],
	962	'li' : ['ul', 'ol'],
	963	'dl' : [],
	964	'dd' : ['dl'],
	965	'dt' : ['dl'] }
	966
	967	#Tables can contain other tables, but there are restrictions.
	968	NESTABLE_TABLE_TAGS = {'table' : [],
	969	'tr' : ['table', 'tbody', 'tfoot', 'thead'],
	970	'td' : ['tr'],
	971	'th' : ['tr'],
	972	}
	973
	974	NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
	975
	976	#If one of these tags is encountered, all tags up to the next tag of
	977	#this type are popped.
	978	RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
	979	NON_NESTABLE_BLOCK_TAGS,
	980	NESTABLE_LIST_TAGS,
	981	NESTABLE_TABLE_TAGS)
	982
	983	NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
	984	NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
	985
	986	class ICantBelieveItsBeautifulSoup(BeautifulSoup):
	987
	988	"""The BeautifulSoup class is oriented towards skipping over
	989	common HTML errors like unclosed tags. However, sometimes it makes
	990	errors of its own. For instance, consider this fragment:
	991
	992	<b>Foo<b>Bar</b></b>
	993
	994	This is perfectly valid (if bizarre) HTML. However, the
	995	BeautifulSoup class will implicitly close the first b tag when it
	996	encounters the second 'b'. It will think the author wrote
	997	"<b>Foo<b>Bar", and didn't close the first 'b' tag, because
	998	there's no real-world reason to bold something that's already
	999	bold. When it encounters '</b></b>' it will close two more 'b'
	1000	tags, for a grand total of three tags closed instead of two. This
	1001	can throw off the rest of your document structure. The same is
	1002	true of a number of other tags, listed below.
	1003
	1004	It's much more common for someone to forget to close (eg.) a 'b'
	1005	tag than to actually use nested 'b' tags, and the BeautifulSoup
	1006	class handles the common case. This class handles the
	1007	not-co-common case: where you can't believe someone wrote what
	1008	they did, but it's valid HTML and BeautifulSoup screwed up by
	1009	assuming it wouldn't be.
	1010
	1011	If this doesn't do what you need, try subclassing this class or
	1012	BeautifulSoup, and providing your own list of NESTABLE_TAGS."""
	1013
	1014	I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
	1015	['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
	1016	'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
	1017	'big']
	1018
	1019	I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
	1020
	1021	NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
	1022	I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
	1023	I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
	1024
	1025	class BeautifulSOAP(BeautifulStoneSoup):
	1026	"""This class will push a tag with only a single string child into
	1027	the tag's parent as an attribute. The attribute's name is the tag
	1028	name, and the value is the string child. An example should give
	1029	the flavor of the change:
	1030
	1031	<foo><bar>baz</bar></foo>
	1032	=>
	1033	<foo bar="baz"><bar>baz</bar></foo>
	1034
	1035	You can then access fooTag['bar'] instead of fooTag.barTag.string.
	1036
	1037	This is, of course, useful for scraping structures that tend to
	1038	use subelements instead of attributes, such as SOAP messages. Note
	1039	that it modifies its input, so don't print the modified version
	1040	out.
	1041
	1042	I'm not sure how many people really want to use this class; let me
	1043	know if you do. Mainly I like the name."""
	1044
	1045	def popTag(self):
	1046	if len(self.tagStack) > 1:
	1047	tag = self.tagStack[-1]
	1048	parent = self.tagStack[-2]
	1049	parent._getAttrMap()
	1050	if (isinstance(tag, Tag) and len(tag.contents) == 1 and
	1051	isinstance(tag.contents[0], NavigableText) and
	1052	not parent.attrMap.has_key(tag.name)):
	1053	parent[tag.name] = tag.contents[0]
	1054	BeautifulStoneSoup.popTag(self)
	1055
	1056	#Enterprise class names! It has come to our attention that some people
	1057	#think the names of the Beautiful Soup parser classes are too silly
	1058	#and "unprofessional" for use in enterprise screen-scraping. We feel
	1059	#your pain! For such-minded folk, the Beautiful Soup Consortium And
	1060	#All-Night Kosher Bakery recommends renaming this file to
	1061	#"RobustParser.py" (or, in cases of extreme enterprisitude,
	1062	#"RobustParserBeanInterface.class") and using the following
	1063	#enterprise-friendly class aliases:
	1064	class RobustXMLParser(BeautifulStoneSoup):
	1065	pass
	1066	class RobustHTMLParser(BeautifulSoup):
	1067	pass
	1068	class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
	1069	pass
	1070	class SimplifyingSOAPParser(BeautifulSOAP):
	1071	pass
	1072
	1073	###
	1074
	1075
	1076	#By default, act as an HTML pretty-printer.
	1077	if __name__ == '__main__':
	1078	import sys
	1079	soup = BeautifulStoneSoup(sys.stdin.read())
	1080	print soup.prettify()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: