"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.

Examples

This program extracts all links from a document.  It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
for token in p.tags("a"):
    if token.type == "endtag": continue
    url = dict(token.attrs).get("href", "-")
    text = p.get_compressed_text(endat=("endtag", "a"))
    print "%s\t%s" % (url, text)

This program extracts the <TITLE> from the document:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
if p.get_tag("title"):
    title = p.get_compressed_text()
    print "Title: %s" % title


Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses.

"""
35 |
|
---|
36 | import re, htmlentitydefs
|
---|
37 | import sgmllib, HTMLParser
|
---|
38 | from xml.sax import saxutils
|
---|
39 |
|
---|
40 | from _html import unescape, unescape_charref
|
---|
41 |
|
---|
42 |
|
---|
class NoMoreTokensError(Exception):
    """Raised by the get_* methods once the underlying input is exhausted."""
44 |
|
---|
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs
    def __iter__(self):
        # Supports tuple-style unpacking: type, data, attrs = token
        return iter((self.type, self.data, self.attrs))
    def __eq__(self, other):
        # Compare against any 3-sequence, including another Token (which
        # unpacks via __iter__).
        # Bug fix: previously `other` was unpacked unconditionally, so
        # comparing a Token with anything that is not a 3-sequence (e.g.
        # ``tok == 42`` or ``tok == None``) raised TypeError/ValueError
        # instead of evaluating to False.
        try:
            type, data, attrs = other
        except (TypeError, ValueError):
            return False
        return (self.type == type and
                self.data == data and
                self.attrs == attrs)
    def __ne__(self, other): return not self.__eq__(other)
    def __repr__(self):
        # e.g. Token('starttag', 'a', [('href', '...')])
        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__ + "(%s)" % args

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        if self.attrs is not None:
            # quoteattr picks quoting that keeps embedded quotes intact.
            attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
                             k, v in self.attrs])
        else:
            attrs = ""
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        elif self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        elif self.type == "endtag":
            return "</%s>" % self.data
        elif self.type == "charref":
            return "&#%s;" % self.data
        elif self.type == "entityref":
            return "&%s;" % self.data
        elif self.type == "data":
            return self.data
        elif self.type == "comment":
            return "<!--%s-->" % self.data
        elif self.type == "decl":
            return "<!%s>" % self.data
        elif self.type == "pi":
            return "<?%s>" % self.data
        # Unknown token type: programming error in whoever built the Token.
        assert False
141 |
|
---|
142 |
|
---|
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until `exception` is raised.

    The exception terminates the iteration and is swallowed.
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # Bug fix: this used to be ``raise StopIteration``, which under
            # PEP 479 (Python 3.7+) is converted to RuntimeError when raised
            # inside a generator.  A bare return ends the generator cleanly
            # in both Python 2 and Python 3.
            return
149 |
|
---|
150 |
|
---|
151 | class _AbstractParser:
|
---|
152 | chunk = 1024
|
---|
153 | compress_re = re.compile(r"\s+")
|
---|
154 | def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
|
---|
155 | encoding="ascii", entitydefs=None):
|
---|
156 | """
|
---|
157 | fh: file-like object (only a .read() method is required) from which to
|
---|
158 | read HTML to be parsed
|
---|
159 | textify: mapping used by .get_text() and .get_compressed_text() methods
|
---|
160 | to represent opening tags as text
|
---|
161 | encoding: encoding used to encode numeric character references by
|
---|
162 | .get_text() and .get_compressed_text() ("ascii" by default)
|
---|
163 |
|
---|
164 | entitydefs: mapping like {"amp": "&", ...} containing HTML entity
|
---|
165 | definitions (a sensible default is used). This is used to unescape
|
---|
166 | entities in .get_text() (and .get_compressed_text()) and attribute
|
---|
167 | values. If the encoding can not represent the character, the entity
|
---|
168 | reference is left unescaped. Note that entity references (both
|
---|
169 | numeric - e.g. { or ઼ - and non-numeric - e.g. &) are
|
---|
170 | unescaped in attribute values and the return value of .get_text(), but
|
---|
171 | not in data outside of tags. Instead, entity references outside of
|
---|
172 | tags are represented as tokens. This is a bit odd, it's true :-/
|
---|
173 |
|
---|
174 | If the element name of an opening tag matches a key in the textify
|
---|
175 | mapping then that tag is converted to text. The corresponding value is
|
---|
176 | used to specify which tag attribute to obtain the text from. textify
|
---|
177 | maps from element names to either:
|
---|
178 |
|
---|
179 | - an HTML attribute name, in which case the HTML attribute value is
|
---|
180 | used as its text value along with the element name in square
|
---|
181 | brackets (eg."alt text goes here[IMG]", or, if the alt attribute
|
---|
182 | were missing, just "[IMG]")
|
---|
183 | - a callable object (eg. a function) which takes a Token and returns
|
---|
184 | the string to be used as its text value
|
---|
185 |
|
---|
186 | If textify has no key for an element name, nothing is substituted for
|
---|
187 | the opening tag.
|
---|
188 |
|
---|
189 | Public attributes:
|
---|
190 |
|
---|
191 | encoding and textify: see above
|
---|
192 |
|
---|
193 | """
|
---|
194 | self._fh = fh
|
---|
195 | self._tokenstack = [] # FIFO
|
---|
196 | self.textify = textify
|
---|
197 | self.encoding = encoding
|
---|
198 | if entitydefs is None:
|
---|
199 | entitydefs = htmlentitydefs.name2codepoint
|
---|
200 | self._entitydefs = entitydefs
|
---|
201 |
|
---|
202 | def __iter__(self): return self
|
---|
203 |
|
---|
204 | def tags(self, *names):
|
---|
205 | return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
|
---|
206 |
|
---|
207 | def tokens(self, *tokentypes):
|
---|
208 | return iter_until_exception(self.get_token, NoMoreTokensError,
|
---|
209 | *tokentypes)
|
---|
210 |
|
---|
211 | def next(self):
|
---|
212 | try:
|
---|
213 | return self.get_token()
|
---|
214 | except NoMoreTokensError:
|
---|
215 | raise StopIteration()
|
---|
216 |
|
---|
217 | def get_token(self, *tokentypes):
|
---|
218 | """Pop the next Token object from the stack of parsed tokens.
|
---|
219 |
|
---|
220 | If arguments are given, they are taken to be token types in which the
|
---|
221 | caller is interested: tokens representing other elements will be
|
---|
222 | skipped. Element names must be given in lower case.
|
---|
223 |
|
---|
224 | Raises NoMoreTokensError.
|
---|
225 |
|
---|
226 | """
|
---|
227 | while 1:
|
---|
228 | while self._tokenstack:
|
---|
229 | token = self._tokenstack.pop(0)
|
---|
230 | if tokentypes:
|
---|
231 | if token.type in tokentypes:
|
---|
232 | return token
|
---|
233 | else:
|
---|
234 | return token
|
---|
235 | data = self._fh.read(self.chunk)
|
---|
236 | if not data:
|
---|
237 | raise NoMoreTokensError()
|
---|
238 | self.feed(data)
|
---|
239 |
|
---|
240 | def unget_token(self, token):
|
---|
241 | """Push a Token back onto the stack."""
|
---|
242 | self._tokenstack.insert(0, token)
|
---|
243 |
|
---|
244 | def get_tag(self, *names):
|
---|
245 | """Return the next Token that represents an opening or closing tag.
|
---|
246 |
|
---|
247 | If arguments are given, they are taken to be element names in which the
|
---|
248 | caller is interested: tags representing other elements will be skipped.
|
---|
249 | Element names must be given in lower case.
|
---|
250 |
|
---|
251 | Raises NoMoreTokensError.
|
---|
252 |
|
---|
253 | """
|
---|
254 | while 1:
|
---|
255 | tok = self.get_token()
|
---|
256 | if tok.type not in ["starttag", "endtag", "startendtag"]:
|
---|
257 | continue
|
---|
258 | if names:
|
---|
259 | if tok.data in names:
|
---|
260 | return tok
|
---|
261 | else:
|
---|
262 | return tok
|
---|
263 |
|
---|
264 | def get_text(self, endat=None):
|
---|
265 | """Get some text.
|
---|
266 |
|
---|
267 | endat: stop reading text at this tag (the tag is included in the
|
---|
268 | returned text); endtag is a tuple (type, name) where type is
|
---|
269 | "starttag", "endtag" or "startendtag", and name is the element name of
|
---|
270 | the tag (element names must be given in lower case)
|
---|
271 |
|
---|
272 | If endat is not given, .get_text() will stop at the next opening or
|
---|
273 | closing tag, or when there are no more tokens (no exception is raised).
|
---|
274 | Note that .get_text() includes the text representation (if any) of the
|
---|
275 | opening tag, but pushes the opening tag back onto the stack. As a
|
---|
276 | result, if you want to call .get_text() again, you need to call
|
---|
277 | .get_tag() first (unless you want an empty string returned when you
|
---|
278 | next call .get_text()).
|
---|
279 |
|
---|
280 | Entity references are translated using the value of the entitydefs
|
---|
281 | constructor argument (a mapping from names to characters like that
|
---|
282 | provided by the standard module htmlentitydefs). Named entity
|
---|
283 | references that are not in this mapping are left unchanged.
|
---|
284 |
|
---|
285 | The textify attribute is used to translate opening tags into text: see
|
---|
286 | the class docstring.
|
---|
287 |
|
---|
288 | """
|
---|
289 | text = []
|
---|
290 | tok = None
|
---|
291 | while 1:
|
---|
292 | try:
|
---|
293 | tok = self.get_token()
|
---|
294 | except NoMoreTokensError:
|
---|
295 | # unget last token (not the one we just failed to get)
|
---|
296 | if tok: self.unget_token(tok)
|
---|
297 | break
|
---|
298 | if tok.type == "data":
|
---|
299 | text.append(tok.data)
|
---|
300 | elif tok.type == "entityref":
|
---|
301 | t = unescape("&%s;" % tok.data, self._entitydefs, self.encoding)
|
---|
302 | text.append(t)
|
---|
303 | elif tok.type == "charref":
|
---|
304 | t = unescape_charref(tok.data, self.encoding)
|
---|
305 | text.append(t)
|
---|
306 | elif tok.type in ["starttag", "endtag", "startendtag"]:
|
---|
307 | tag_name = tok.data
|
---|
308 | if tok.type in ["starttag", "startendtag"]:
|
---|
309 | alt = self.textify.get(tag_name)
|
---|
310 | if alt is not None:
|
---|
311 | if callable(alt):
|
---|
312 | text.append(alt(tok))
|
---|
313 | elif tok.attrs is not None:
|
---|
314 | for k, v in tok.attrs:
|
---|
315 | if k == alt:
|
---|
316 | text.append(v)
|
---|
317 | text.append("[%s]" % tag_name.upper())
|
---|
318 | if endat is None or endat == (tok.type, tag_name):
|
---|
319 | self.unget_token(tok)
|
---|
320 | break
|
---|
321 | return "".join(text)
|
---|
322 |
|
---|
323 | def get_compressed_text(self, *args, **kwds):
|
---|
324 | """
|
---|
325 | As .get_text(), but collapses each group of contiguous whitespace to a
|
---|
326 | single space character, and removes all initial and trailing
|
---|
327 | whitespace.
|
---|
328 |
|
---|
329 | """
|
---|
330 | text = self.get_text(*args, **kwds)
|
---|
331 | text = text.strip()
|
---|
332 | return self.compress_re.sub(" ", text)
|
---|
333 |
|
---|
334 | def handle_startendtag(self, tag, attrs):
|
---|
335 | self._tokenstack.append(Token("startendtag", tag, attrs))
|
---|
336 | def handle_starttag(self, tag, attrs):
|
---|
337 | self._tokenstack.append(Token("starttag", tag, attrs))
|
---|
338 | def handle_endtag(self, tag):
|
---|
339 | self._tokenstack.append(Token("endtag", tag))
|
---|
340 | def handle_charref(self, name):
|
---|
341 | self._tokenstack.append(Token("charref", name))
|
---|
342 | def handle_entityref(self, name):
|
---|
343 | self._tokenstack.append(Token("entityref", name))
|
---|
344 | def handle_data(self, data):
|
---|
345 | self._tokenstack.append(Token("data", data))
|
---|
346 | def handle_comment(self, data):
|
---|
347 | self._tokenstack.append(Token("comment", data))
|
---|
348 | def handle_decl(self, decl):
|
---|
349 | self._tokenstack.append(Token("decl", decl))
|
---|
350 | def unknown_decl(self, data):
|
---|
351 | # XXX should this call self.error instead?
|
---|
352 | #self.error("unknown declaration: " + `data`)
|
---|
353 | self._tokenstack.append(Token("decl", data))
|
---|
354 | def handle_pi(self, data):
|
---|
355 | self._tokenstack.append(Token("pi", data))
|
---|
356 |
|
---|
357 | def unescape_attr(self, name):
|
---|
358 | return unescape(name, self._entitydefs, self.encoding)
|
---|
359 | def unescape_attrs(self, attrs):
|
---|
360 | escaped_attrs = []
|
---|
361 | for key, val in attrs:
|
---|
362 | escaped_attrs.append((key, self.unescape_attr(val)))
|
---|
363 | return escaped_attrs
|
---|
364 |
|
---|
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on the strict HTMLParser.HTMLParser machinery."""

    def __init__(self, *args, **kwargs):
        # Set up the concrete parser first, then the shared pull-API state.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwargs)

    def unescape(self, name):
        # Override HTMLParser.HTMLParser.unescape so that attribute values
        # are unescaped using the entitydefs mapping handed to our
        # constructor rather than HTMLParser.HTMLParser's own entitydefs.
        return self.unescape_attr(name)
373 |
|
---|
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on sgmllib.SGMLParser, which tolerates more
    malformed HTML than HTMLParser.HTMLParser does."""

    def __init__(self, *args, **kwargs):
        # Set up the concrete parser first, then the shared pull-API state.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwargs)

    def unknown_starttag(self, tag, attrs):
        # SGMLParser reports unrecognised tags here; queue a Token with
        # entity references in the attribute values unescaped.
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
383 |
|
---|
384 |
|
---|
def _test():
    """Run this module's doctests (the examples embedded in docstrings)."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)
388 |
|
---|
if __name__ == "__main__":
    # Running the module as a script executes its doctest suite.
    _test()