source: py-scraping/mechanize/_pullparser.py@ 163

Last change on this file since 163 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 14.0 KB
Line 
"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.

Examples

This program extracts all links from a document.  It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
for token in p.tags("a"):
    if token.type == "endtag": continue
    url = dict(token.attrs).get("href", "-")
    text = p.get_compressed_text(endat=("endtag", "a"))
    print "%s\t%s" % (url, text)

This program extracts the <TITLE> from the document:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
if p.get_tag("title"):
    title = p.get_compressed_text()
    print "Title: %s" % title


Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses.

"""
35
36import re, htmlentitydefs
37import sgmllib, HTMLParser
38from xml.sax import saxutils
39
40from _html import unescape, unescape_charref
41
42
class NoMoreTokensError(Exception):
    """Raised when the input is exhausted and no parsed tokens remain."""
44
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True
    >>> t == None
    False
    >>> t != 42
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        # The parameter is called "type" (shadowing the builtin) because it is
        # part of the public signature; it is only stored, never called.
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        """Iterate over (type, data, attrs), so a Token unpacks like a tuple."""
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        """Compare against another Token or any 3-item iterable.

        Objects that cannot be unpacked into exactly three items (None,
        numbers, sequences of another length, ...) compare unequal instead
        of raising.
        """
        try:
            other_type, other_data, other_attrs = other
        except (TypeError, ValueError):
            # TypeError: not iterable; ValueError: wrong number of items.
            return False
        if (self.type == other_type and
            self.data == other_data and
            self.attrs == other_attrs):
            return True
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__ + "(%s)" % args

    def __str__(self):
        """Return the token rendered as HTML source text.

        Raises AssertionError if .type is not one of the documented token
        types.

        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        # Attributes are serialised up front (for every token type, as
        # before); quoteattr picks a quoting style that suits each value.
        if self.attrs is not None:
            attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
                             k, v in self.attrs])
        else:
            attrs = ""
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        elif self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        elif self.type == "endtag":
            return "</%s>" % self.data
        elif self.type == "charref":
            return "&#%s;" % self.data
        elif self.type == "entityref":
            return "&%s;" % self.data
        elif self.type == "data":
            return self.data
        elif self.type == "comment":
            return "<!--%s-->" % self.data
        elif self.type == "decl":
            return "<!%s>" % self.data
        elif self.type == "pi":
            return "<?%s>" % self.data
        # Raised explicitly (same exception type as the old "assert False")
        # so the check is not stripped when Python runs with -O.
        raise AssertionError("unknown token type: %r" % (self.type,))
141
142
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield successive results of fn(*args, **kwds) until it raises.

    fn: callable invoked once per item produced
    exception: exception class (or tuple of classes) that signals the end of
     iteration; any other exception propagates to the consumer
    args, kwds: passed unchanged to every call of fn

    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # End the generator with a plain return.  An explicit
            # "raise StopIteration" inside a generator is turned into
            # RuntimeError by PEP 479; returning behaves the same everywhere.
            return
149
150
class _AbstractParser:
    """Token-queue machinery shared by the pull parsers in this module.

    Not usable on its own: get_token() calls self.feed(), which this class
    does not define.  It is meant to be combined with a parser base class
    that provides feed() and invokes the handle_*() callbacks defined below,
    each of which appends a Token to the internal queue.

    """
    chunk = 1024  # size argument passed to fh.read() on each refill
    compress_re = re.compile(r"\s+")

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
         read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
         to represent opening tags as text; defaults to
         {"img": "alt", "applet": "alt"} (a fresh mapping for every parser)
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
         definitions (a sensible default is used).  This is used to unescape
         entities in .get_text() (and .get_compressed_text()) and attribute
         values.  If the encoding can not represent the character, the entity
         reference is left unescaped.  Note that entity references (both
         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
         unescaped in attribute values and the return value of .get_text(), but
         not in data outside of tags.  Instead, entity references outside of
         tags are represented as tokens.  This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value is
        used to specify which tag attribute to obtain the text from.  textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg."alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        if textify is None:
            # Build the default per instance.  A dict literal in the signature
            # would be shared by every parser, so editing one parser's public
            # .textify mapping would silently change all the others.
            textify = {"img": "alt", "applet": "alt"}
        self._fh = fh
        self._tokenstack = []  # FIFO
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Return an iterator over .get_tag(*names) results until exhausted."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Return an iterator over .get_token(*tokentypes) until exhausted."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        """Iterator protocol: return the next token or raise StopIteration."""
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens of other types are discarded.

        Raises NoMoreTokensError.

        """
        while 1:
            # Drain what has already been parsed before reading more input.
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            # feed() comes from the concrete parser base class; it fills
            # self._tokenstack through the handle_*() callbacks.
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ("starttag", "endtag", "startendtag"):
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
         returned text); endat is a tuple (type, name) where type is
         "starttag", "endtag" or "startendtag", and name is the element name of
         the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the constructor docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs, self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ("starttag", "endtag", "startendtag"):
                tag_name = tok.data
                if tok.type in ("starttag", "startendtag"):
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            # alt names an attribute: emit its value(s), then
                            # the element name in brackets, e.g. "logo[IMG]".
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    # Leave the terminating tag on the stack for the caller.
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Parser callbacks: each one records what was seen as a Token at the
    # tail of the queue.
    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity references in one attribute value."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return a new list of (name, value) pairs with values unescaped."""
        return [(key, self.unescape_attr(val)) for key, val in attrs]
364
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser that uses HTMLParser.HTMLParser to tokenize the input."""
    def __init__(self, *args, **kwds):
        # Initialise the HTMLParser base first, then the token-queue state;
        # every constructor argument is forwarded to _AbstractParser.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)
373
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser that uses sgmllib.SGMLParser to tokenize the input."""
    def __init__(self, *args, **kwds):
        # Initialise the SGMLParser base first, then the token-queue state;
        # every constructor argument is forwarded to _AbstractParser.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unknown_starttag(self, tag, attrs):
        # Attribute values are unescaped here, with the entitydefs and
        # encoding given to the constructor, before the token is queued.
        attrs = self.unescape_attrs(attrs)
        self._tokenstack.append(Token("starttag", tag, attrs))
    def unknown_endtag(self, tag):
        # Closing tags carry no attributes, so Token.attrs stays None.
        self._tokenstack.append(Token("endtag", tag))
383
384
def _test():
    """Run the doctests of the _pullparser module and return the result."""
    import doctest
    import _pullparser
    outcome = doctest.testmod(_pullparser)
    return outcome
388
# Run the module's doctests when this file is executed as a script.
if __name__ == "__main__":
    _test()
Note: See TracBrowser for help on using the repository browser.