source: py-scraping/mechanize/_pullparser.py@ 148

Last change on this file since 148 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 14.0 KB
RevLine 
[106]1"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
2
3Examples
4
5This program extracts all links from a document. It will print one
6line for each link, containing the URL and the textual description
7between the <A>...</A> tags:
8
9import pullparser, sys
10f = file(sys.argv[1])
11p = pullparser.PullParser(f)
12for token in p.tags("a"):
13 if token.type == "endtag": continue
14 url = dict(token.attrs).get("href", "-")
15 text = p.get_compressed_text(endat=("endtag", "a"))
16 print "%s\t%s" % (url, text)
17
18This program extracts the <TITLE> from the document:
19
20import pullparser, sys
21f = file(sys.argv[1])
22p = pullparser.PullParser(f)
23if p.get_tag("title"):
24 title = p.get_compressed_text()
25 print "Title: %s" % title
26
27
28Copyright 2003-2006 John J. Lee <jjl@pobox.com>
29Copyright 1998-2001 Gisle Aas (original libwww-perl code)
30
31This code is free software; you can redistribute it and/or modify it
32under the terms of the BSD or ZPL 2.1 licenses.
33
34"""
35
36import re, htmlentitydefs
37import sgmllib, HTMLParser
38from xml.sax import saxutils
39
40from _html import unescape, unescape_charref
41
42
class NoMoreTokensError(Exception):
    """Raised by the pull methods when the token stream is exhausted."""
44
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type, self.data, self.attrs = type, data, attrs

    def __iter__(self):
        # Yield the same triple that equality comparison is based on.
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        # Unpack first: comparing against a sequence of the wrong length
        # raises, just as the tuple unpacking always has here.
        type, data, attrs = other
        return (self.type == type and
                self.data == data and
                self.attrs == attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, %r, %r)" % (
            self.__class__.__name__, self.type, self.data, self.attrs)

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ... [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        # Attribute string is only non-empty for opening tags; quoteattr
        # supplies the surrounding quotes (single quotes if value contains ").
        attrs = ""
        if self.attrs is not None:
            attrs = "".join([" %s=%s" % (name, saxutils.quoteattr(value))
                             for name, value in self.attrs])
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        if self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        if self.type == "data":
            return self.data
        # Remaining token types wrap .data in a fixed template.
        simple = {
            "endtag": "</%s>",
            "charref": "&#%s;",
            "entityref": "&%s;",
            "comment": "<!--%s-->",
            "decl": "<!%s>",
            "pi": "<?%s>",
            }
        if self.type in simple:
            return simple[self.type] % self.data
        assert False
141
142
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until `exception` is raised.

    The named exception ends the iteration silently (it is not propagated).
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # PEP 479: 'raise StopIteration' inside a generator body becomes
            # a RuntimeError on Python 3.7+; a bare return ends the generator
            # correctly on both Python 2 and 3.
            return
149
150
class _AbstractParser:
    # Shared pull-API implementation.  Subclasses mix this in with an
    # event-driven parser base (HTMLParser.HTMLParser or sgmllib.SGMLParser)
    # that supplies .feed(); the handle_* callbacks below queue Token objects
    # for the pull methods (get_token, get_tag, get_text) to consume.
    chunk = 1024  # bytes read from fh each time the token queue runs dry
    compress_re = re.compile(r"\s+")  # whitespace runs, for get_compressed_text()
    def __init__(self, fh, textify=None,
                 encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
         read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
         to represent opening tags as text (defaults to
         {"img": "alt", "applet": "alt"})
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
         definitions (a sensible default is used).  This is used to unescape
         entities in .get_text() (and .get_compressed_text()) and attribute
         values.  If the encoding can not represent the character, the entity
         reference is left unescaped.  Note that entity references (both
         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
         unescaped in attribute values and the return value of .get_text(), but
         not in data outside of tags.  Instead, entity references outside of
         tags are represented as tokens.  This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value is
        used to specify which tag attribute to obtain the text from.  textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg."alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO
        if textify is None:
            # Build the default per-instance: a mutable default argument
            # would be shared by every parser, so a caller mutating
            # parser.textify would silently affect all later instances.
            textify = {"img": "alt", "applet": "alt"}
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Iterate over tag Tokens, optionally restricted to element names."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Iterate over Tokens, optionally restricted to the given types."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        # Python 2 iterator protocol: map NoMoreTokensError to StopIteration.
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()
    __next__ = next  # Python 3 spelling of the iterator protocol

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            # Queue is empty: read more input and let the event-driven base
            # class's .feed() refill it via the handle_* callbacks.
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ["starttag", "endtag", "startendtag"]:
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
        returned text); endtag is a tuple (type, name) where type is
        "starttag", "endtag" or "startendtag", and name is the element name of
        the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the class docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs, self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    # Opening tags may contribute text via the textify map:
                    # either a callable on the Token, or the value of a named
                    # attribute plus "[TAGNAME]".
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                # The tag ends the text run; push it back so the caller can
                # still retrieve it with .get_tag()/.get_token().
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Event-driven parser callbacks: each one just queues a Token for the
    # pull methods above.
    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity and character references in one attribute value."""
        return unescape(name, self._entitydefs, self.encoding)
    def unescape_attrs(self, attrs):
        """Return a copy of the (name, value) attrs list with values unescaped."""
        escaped_attrs = []
        for key, val in attrs:
            escaped_attrs.append((key, self.unescape_attr(val)))
        return escaped_attrs
364
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on the strict stdlib HTMLParser.HTMLParser."""

    def __init__(self, *args, **kwds):
        # Initialise the event-driven base first, then the pull machinery.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        """Unescape using the entitydefs passed into the constructor, not
        HTMLParser.HTMLParser's entitydefs."""
        return self.unescape_attr(name)
373
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on the more forgiving sgmllib.SGMLParser."""

    def __init__(self, *args, **kwds):
        # Initialise the event-driven base first, then the pull machinery.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        # Attribute values are unescaped here (cf. PullParser, which
        # overrides unescape() instead).
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
383
384
def _test():
    """Run this module's doctests (the examples embedded in docstrings)."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)


if __name__ == "__main__":
    _test()
Note: See TracBrowser for help on using the repository browser.