"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.

Examples

This program extracts all links from a document.  It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
for token in p.tags("a"):
    if token.type == "endtag": continue
    url = dict(token.attrs).get("href", "-")
    text = p.get_compressed_text(endat=("endtag", "a"))
    print "%s\t%s" % (url, text)

This program extracts the <TITLE> from the document:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
if p.get_tag("title"):
    title = p.get_compressed_text()
    print "Title: %s" % title


Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses.

"""
35 |
|
---|
36 | import re, htmlentitydefs
|
---|
37 | import sgmllib, HTMLParser
|
---|
38 | from xml.sax import saxutils
|
---|
39 |
|
---|
40 | from _html import unescape, unescape_charref
|
---|
41 |
|
---|
42 |
|
---|
class NoMoreTokensError(Exception):
    """Raised by the get_* methods once the underlying input is exhausted."""
44 |
|
---|
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs
    def __iter__(self):
        # Supports tuple-style unpacking: type, data, attrs = token
        return iter((self.type, self.data, self.attrs))
    def __eq__(self, other):
        # Compare against any 3-sequence, including another Token (which
        # unpacks via __iter__).
        # Bug fix: previously `other` was unpacked unconditionally, so
        # comparing a Token with anything that is not a 3-sequence (e.g.
        # ``tok == 42`` or ``tok == None``) raised TypeError/ValueError
        # instead of evaluating to False.
        try:
            type, data, attrs = other
        except (TypeError, ValueError):
            return False
        return (self.type == type and
                self.data == data and
                self.attrs == attrs)
    def __ne__(self, other): return not self.__eq__(other)
    def __repr__(self):
        # e.g. Token('starttag', 'a', [('href', '...')])
        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__ + "(%s)" % args

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        if self.attrs is not None:
            # quoteattr picks quoting that keeps embedded quotes intact.
            attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
                             k, v in self.attrs])
        else:
            attrs = ""
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        elif self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        elif self.type == "endtag":
            return "</%s>" % self.data
        elif self.type == "charref":
            return "&#%s;" % self.data
        elif self.type == "entityref":
            return "&%s;" % self.data
        elif self.type == "data":
            return self.data
        elif self.type == "comment":
            return "<!--%s-->" % self.data
        elif self.type == "decl":
            return "<!%s>" % self.data
        elif self.type == "pi":
            return "<?%s>" % self.data
        # Unknown token type: programming error in whoever built the Token.
        assert False
141 |
|
---|
142 |
|
---|
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until `exception` is raised.

    The exception terminates the iteration and is swallowed.
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # Bug fix: this used to be ``raise StopIteration``, which under
            # PEP 479 (Python 3.7+) is converted to RuntimeError when raised
            # inside a generator.  A bare return ends the generator cleanly
            # in both Python 2 and Python 3.
            return
149 |
|
---|
150 |
|
---|
151 | class _AbstractParser:
|
---|
152 | chunk = 1024
|
---|
153 | compress_re = re.compile(r"\s+")
|
---|
154 | def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
|
---|
155 | encoding="ascii", entitydefs=None):
|
---|
156 | """
|
---|
157 | fh: file-like object (only a .read() method is required) from which to
|
---|
158 | read HTML to be parsed
|
---|
159 | textify: mapping used by .get_text() and .get_compressed_text() methods
|
---|
160 | to represent opening tags as text
|
---|
161 | encoding: encoding used to encode numeric character references by
|
---|
162 | .get_text() and .get_compressed_text() ("ascii" by default)
|
---|
163 |
|
---|
164 | entitydefs: mapping like {"amp": "&", ...} containing HTML entity
|
---|
165 | definitions (a sensible default is used). This is used to unescape
|
---|
166 | entities in .get_text() (and .get_compressed_text()) and attribute
|
---|
167 | values. If the encoding can not represent the character, the entity
|
---|
168 | reference is left unescaped. Note that entity references (both
|
---|
169 | numeric - e.g. { or ઼ - and non-numeric - e.g. &) are
|
---|
170 | unescaped in attribute values and the return value of .get_text(), but
|
---|
171 | not in data outside of tags. Instead, entity references outside of
|
---|
172 | tags are represented as tokens. This is a bit odd, it's true :-/
|
---|
173 |
|
---|
174 | If the element name of an opening tag matches a key in the textify
|
---|
175 | mapping then that tag is converted to text. The corresponding value is
|
---|
176 | used to specify which tag attribute to obtain the text from. textify
|
---|
177 | maps from element names to either:
|
---|
178 |
|
---|
179 | - an HTML attribute name, in which case the HTML attribute value is
|
---|
180 | used as its text value along with the element name in square
|
---|
181 | brackets (eg."alt text goes here[IMG]", or, if the alt attribute
|
---|
182 | were missing, just "[IMG]")
|
---|
183 | - a callable object (eg. a function) which takes a Token and returns
|
---|
184 | the string to be used as its text value
|
---|
185 |
|
---|
186 | If textify has no key for an element name, nothing is substituted for
|
---|
187 | the opening tag.
|
---|
188 |
|
---|
189 | Public attributes:
|
---|
190 |
|
---|
191 | encoding and textify: see above
|
---|
192 |
|
---|
193 | """
|
---|
194 | self._fh = fh
|
---|
195 | self._tokenstack = [] # FIFO
|
---|
196 | self.textify = textify
|
---|
197 | self.encoding = encoding
|
---|
198 | if entitydefs is None:
|
---|
199 | entitydefs = htmlentitydefs.name2codepoint
|
---|
200 | self._entitydefs = entitydefs
|
---|
201 |
|
---|
202 | def __iter__(self): return self
|
---|
203 |
|
---|
204 | def tags(self, *names):
|
---|
205 | return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
|
---|
206 |
|
---|
207 | def tokens(self, *tokentypes):
|
---|
208 | return iter_until_exception(self.get_token, NoMoreTokensError,
|
---|
209 | *tokentypes)
|
---|
210 |
|
---|
211 | def next(self):
|
---|
212 | try:
|
---|
213 | return self.get_token()
|
---|
214 | except NoMoreTokensError:
|
---|
215 | raise StopIteration()
|
---|
216 |
|
---|
217 | def get_token(self, *tokentypes):
|
---|
218 | """Pop the next Token object from the stack of parsed tokens.
|
---|
219 |
|
---|
220 | If arguments are given, they are taken to be token types in which the
|
---|
221 | caller is interested: tokens representing other elements will be
|
---|
222 | skipped. Element names must be given in lower case.
|
---|
223 |
|
---|
224 | Raises NoMoreTokensError.
|
---|
225 |
|
---|
226 | """
|
---|
227 | while 1:
|
---|
228 | while self._tokenstack:
|
---|
229 | token = self._tokenstack.pop(0)
|
---|
230 | if tokentypes:
|
---|
231 | if token.type in tokentypes:
|
---|
232 | return token
|
---|
233 | else:
|
---|
234 | return token
|
---|
235 | data = self._fh.read(self.chunk)
|
---|
236 | if not data:
|
---|
237 | raise NoMoreTokensError()
|
---|
238 | self.feed(data)
|
---|
239 |
|
---|
240 | def unget_token(self, token):
|
---|
241 | """Push a Token back onto the stack."""
|
---|
242 | self._tokenstack.insert(0, token)
|
---|
243 |
|
---|
244 | def get_tag(self, *names):
|
---|
245 | """Return the next Token that represents an opening or closing tag.
|
---|
246 |
|
---|
247 | If arguments are given, they are taken to be element names in which the
|
---|
248 | caller is interested: tags representing other elements will be skipped.
|
---|
249 | Element names must be given in lower case.
|
---|
250 |
|
---|
251 | Raises NoMoreTokensError.
|
---|
252 |
|
---|
253 | """
|
---|
254 | while 1:
|
---|
255 | tok = self.get_token()
|
---|
256 | if tok.type not in ["starttag", "endtag", "startendtag"]:
|
---|
257 | continue
|
---|
258 | if names:
|
---|
259 | if tok.data in names:
|
---|
260 | return tok
|
---|
261 | else:
|
---|
262 | return tok
|
---|
263 |
|
---|
264 | def get_text(self, endat=None):
|
---|
265 | """Get some text.
|
---|
266 |
|
---|
267 | endat: stop reading text at this tag (the tag is included in the
|
---|
268 | returned text); endtag is a tuple (type, name) where type is
|
---|
269 | "starttag", "endtag" or "startendtag", and name is the element name of
|
---|
270 | the tag (element names must be given in lower case)
|
---|
271 |
|
---|
272 | If endat is not given, .get_text() will stop at the next opening or
|
---|
273 | closing tag, or when there are no more tokens (no exception is raised).
|
---|
274 | Note that .get_text() includes the text representation (if any) of the
|
---|
275 | opening tag, but pushes the opening tag back onto the stack. As a
|
---|
276 | result, if you want to call .get_text() again, you need to call
|
---|
277 | .get_tag() first (unless you want an empty string returned when you
|
---|
278 | next call .get_text()).
|
---|
279 |
|
---|
280 | Entity references are translated using the value of the entitydefs
|
---|
281 | constructor argument (a mapping from names to characters like that
|
---|
282 | provided by the standard module htmlentitydefs). Named entity
|
---|
283 | references that are not in this mapping are left unchanged.
|
---|
284 |
|
---|
285 | The textify attribute is used to translate opening tags into text: see
|
---|
286 | the class docstring.
|
---|
287 |
|
---|
288 | """
|
---|
289 | text = []
|
---|
290 | tok = None
|
---|
291 | while 1:
|
---|
292 | try:
|
---|
293 | tok = self.get_token()
|
---|
294 | except NoMoreTokensError:
|
---|
295 | # unget last token (not the one we just failed to get)
|
---|
296 | if tok: self.unget_token(tok)
|
---|
297 | break
|
---|
298 | if tok.type == "data":
|
---|
299 | text.append(tok.data)
|
---|
300 | elif tok.type == "entityref":
|
---|
301 | t = unescape("&%s;" % tok.data, self._entitydefs, self.encoding)
|
---|
302 | text.append(t)
|
---|
303 | elif tok.type == "charref":
|
---|
304 | t = unescape_charref(tok.data, self.encoding)
|
---|
305 | text.append(t)
|
---|
306 | elif tok.type in ["starttag", "endtag", "startendtag"]:
|
---|
307 | tag_name = tok.data
|
---|
308 | if tok.type in ["starttag", "startendtag"]:
|
---|
309 | alt = self.textify.get(tag_name)
|
---|
310 | if alt is not None:
|
---|
311 | if callable(alt):
|
---|
312 | text.append(alt(tok))
|
---|
313 | elif tok.attrs is not None:
|
---|
314 | for k, v in tok.attrs:
|
---|
315 | if k == alt:
|
---|
316 | text.append(v)
|
---|
317 | text.append("[%s]" % tag_name.upper())
|
---|
318 | if endat is None or endat == (tok.type, tag_name):
|
---|
319 | self.unget_token(tok)
|
---|
320 | break
|
---|
321 | return "".join(text)
|
---|
322 |
|
---|
323 | def get_compressed_text(self, *args, **kwds):
|
---|
324 | """
|
---|
325 | As .get_text(), but collapses each group of contiguous whitespace to a
|
---|
326 | single space character, and removes all initial and trailing
|
---|
327 | whitespace.
|
---|
328 |
|
---|
329 | """
|
---|
330 | text = self.get_text(*args, **kwds)
|
---|
331 | text = text.strip()
|
---|
332 | return self.compress_re.sub(" ", text)
|
---|
333 |
|
---|
334 | def handle_startendtag(self, tag, attrs):
|
---|
335 | self._tokenstack.append(Token("startendtag", tag, attrs))
|
---|
336 | def handle_starttag(self, tag, attrs):
|
---|
337 | self._tokenstack.append(Token("starttag", tag, attrs))
|
---|
338 | def handle_endtag(self, tag):
|
---|
339 | self._tokenstack.append(Token("endtag", tag))
|
---|
340 | def handle_charref(self, name):
|
---|
341 | self._tokenstack.append(Token("charref", name))
|
---|
342 | def handle_entityref(self, name):
|
---|
343 | self._tokenstack.append(Token("entityref", name))
|
---|
344 | def handle_data(self, data):
|
---|
345 | self._tokenstack.append(Token("data", data))
|
---|
346 | def handle_comment(self, data):
|
---|
347 | self._tokenstack.append(Token("comment", data))
|
---|
348 | def handle_decl(self, decl):
|
---|
349 | self._tokenstack.append(Token("decl", decl))
|
---|
350 | def unknown_decl(self, data):
|
---|
351 | # XXX should this call self.error instead?
|
---|
352 | #self.error("unknown declaration: " + `data`)
|
---|
353 | self._tokenstack.append(Token("decl", data))
|
---|
354 | def handle_pi(self, data):
|
---|
355 | self._tokenstack.append(Token("pi", data))
|
---|
356 |
|
---|
357 | def unescape_attr(self, name):
|
---|
358 | return unescape(name, self._entitydefs, self.encoding)
|
---|
359 | def unescape_attrs(self, attrs):
|
---|
360 | escaped_attrs = []
|
---|
361 | for key, val in attrs:
|
---|
362 | escaped_attrs.append((key, self.unescape_attr(val)))
|
---|
363 | return escaped_attrs
|
---|
364 |
|
---|
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on the strict HTMLParser.HTMLParser machinery."""

    def __init__(self, *args, **kwargs):
        # Set up the concrete parser first, then the shared pull-API state.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwargs)

    def unescape(self, name):
        # Override HTMLParser.HTMLParser.unescape so that attribute values
        # are unescaped using the entitydefs mapping handed to our
        # constructor rather than HTMLParser.HTMLParser's own entitydefs.
        return self.unescape_attr(name)
373 |
|
---|
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on sgmllib.SGMLParser, which tolerates more
    malformed HTML than HTMLParser.HTMLParser does."""

    def __init__(self, *args, **kwargs):
        # Set up the concrete parser first, then the shared pull-API state.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwargs)

    def unknown_starttag(self, tag, attrs):
        # SGMLParser reports unrecognised tags here; queue a Token with
        # entity references in the attribute values unescaped.
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
383 |
|
---|
384 |
|
---|
def _test():
    """Run this module's doctests (the examples embedded in docstrings)."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)
388 |
|
---|
if __name__ == "__main__":
    # Running the module as a script executes its doctest suite.
    _test()