[106] | 1 | """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
|
---|
| 2 |
|
---|
| 3 | Examples
|
---|
| 4 |
|
---|
| 5 | This program extracts all links from a document. It will print one
|
---|
| 6 | line for each link, containing the URL and the textual description
|
---|
| 7 | between the <A>...</A> tags:
|
---|
| 8 |
|
---|
| 9 | import pullparser, sys
|
---|
| 10 | f = file(sys.argv[1])
|
---|
| 11 | p = pullparser.PullParser(f)
|
---|
| 12 | for token in p.tags("a"):
|
---|
| 13 | if token.type == "endtag": continue
|
---|
| 14 | url = dict(token.attrs).get("href", "-")
|
---|
| 15 | text = p.get_compressed_text(endat=("endtag", "a"))
|
---|
| 16 | print "%s\t%s" % (url, text)
|
---|
| 17 |
|
---|
| 18 | This program extracts the <TITLE> from the document:
|
---|
| 19 |
|
---|
| 20 | import pullparser, sys
|
---|
| 21 | f = file(sys.argv[1])
|
---|
| 22 | p = pullparser.PullParser(f)
|
---|
| 23 | if p.get_tag("title"):
|
---|
| 24 | title = p.get_compressed_text()
|
---|
| 25 | print "Title: %s" % title
|
---|
| 26 |
|
---|
| 27 |
|
---|
| 28 | Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
---|
| 29 | Copyright 1998-2001 Gisle Aas (original libwww-perl code)
|
---|
| 30 |
|
---|
| 31 | This code is free software; you can redistribute it and/or modify it
|
---|
| 32 | under the terms of the BSD or ZPL 2.1 licenses.
|
---|
| 33 |
|
---|
| 34 | """
|
---|
| 35 |
|
---|
| 36 | import re, htmlentitydefs
|
---|
| 37 | import sgmllib, HTMLParser
|
---|
| 38 | from xml.sax import saxutils
|
---|
| 39 |
|
---|
| 40 | from _html import unescape, unescape_charref
|
---|
| 41 |
|
---|
| 42 |
|
---|
class NoMoreTokensError(Exception):
    """Raised when the parser has no further tokens to hand out."""
|
---|
| 44 |
|
---|
class Token:
    """An HTML token: tag, declaration, processing instruction, data, etc.

    A Token behaves both as an iterable (so it compares equal to the
    corresponding (type, data, attrs) tuple) and as an object with .type,
    .data and .attrs attributes.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: for a tag, the tag name; otherwise, the relevant data carried by
     the token, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """

    # %-templates for the token types whose text form depends only on .data.
    # Start tags are handled separately in __str__ because they also need
    # their attributes rendered; "data" is special-cased because it is
    # returned verbatim.
    _SIMPLE_TEMPLATES = {
        "endtag": "</%s>",
        "charref": "&#%s;",
        "entityref": "&%s;",
        "comment": "<!--%s-->",
        "decl": "<!%s>",
        "pi": "<?%s>",
        }

    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        # Iterate as the 3-tuple (type, data, attrs), so tuple(token) and
        # three-way unpacking both work.
        yield self.type
        yield self.data
        yield self.attrs

    def __eq__(self, other):
        # Unpack first so that comparison against a wrong-length sequence
        # fails loudly (ValueError), exactly as tuple unpacking would.
        other_type, other_data, other_attrs = other
        return ((self.type, self.data, self.attrs) ==
                (other_type, other_data, other_attrs))

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, %r, %r)" % (
            self.__class__.__name__, self.type, self.data, self.attrs)

    def __str__(self):
        """Render the token as HTML/SGML source text.

        >>> str(Token("starttag", "br"))
        '<br>'
        >>> str(Token("startendtag", "br", [("spam", "eggs")]))
        '<br spam="eggs" />'
        >>> str(Token("endtag", "p"))
        '</p>'
        >>> str(Token("charref", "38"))
        '&#38;'
        >>> str(Token("entityref", "amp"))
        '&amp;'
        >>> str(Token("comment", "spam"))
        '<!--spam-->'
        >>> str(Token("decl", "decl"))
        '<!decl>'
        >>> str(Token("pi", "pi"))
        '<?pi>'
        """
        if self.attrs is None:
            attrs = ""
        else:
            attrs = "".join([" %s=%s" % (name, saxutils.quoteattr(value))
                             for name, value in self.attrs])
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        if self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        if self.type == "data":
            return self.data
        template = self._SIMPLE_TEMPLATES.get(self.type)
        # Unknown token types are a programming error, as in the original.
        assert template is not None
        return template % self.data
|
---|
| 141 |
|
---|
| 142 |
|
---|
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until `exception` is raised.

    The exception is swallowed and iteration simply ends.

    Note: we `return` rather than `raise StopIteration` here.  The two are
    equivalent on Python 2, but PEP 479 (Python 3.7+) turns a StopIteration
    raised inside a generator body into a RuntimeError, so an explicit
    `raise StopIteration` would crash the consumer.
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            return
|
---|
| 149 |
|
---|
| 150 |
|
---|
class _AbstractParser:
    # Number of bytes/characters pulled from the file object per .feed() call.
    chunk = 1024
    # Matches runs of whitespace, for .get_compressed_text().
    compress_re = re.compile(r"\s+")

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
         read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
         to represent opening tags as text; if None (the default), the mapping
         {"img": "alt", "applet": "alt"} is used
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
         definitions (a sensible default is used).  This is used to unescape
         entities in .get_text() (and .get_compressed_text()) and attribute
         values.  If the encoding can not represent the character, the entity
         reference is left unescaped.  Note that entity references (both
         numeric - e.g. &#123; or &#x0abc; - and non-numeric - e.g. &amp;) are
         unescaped in attribute values and the return value of .get_text(), but
         not in data outside of tags.  Instead, entity references outside of
         tags are represented as tokens.  This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value is
        used to specify which tag attribute to obtain the text from.  textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg."alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        if textify is None:
            # Build a fresh dict per instance.  The previous mutable default
            # argument ({"img": "alt", "applet": "alt"} in the signature) was
            # shared between all parsers, so mutating one parser's .textify
            # leaked into every other instance.
            textify = {"img": "alt", "applet": "alt"}
        self._fh = fh
        self._tokenstack = []  # FIFO
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self):
        return self

    def tags(self, *names):
        """Iterate over tag Tokens, as .get_tag() would return them."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Iterate over Tokens, as .get_token() would return them."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        # Python 2 iterator protocol: translate parser exhaustion into
        # StopIteration for `for token in parser` loops.
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens of other types will be skipped.  Token
        types must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            # .feed() is supplied by the concrete parser base class
            # (HTMLParser.HTMLParser or sgmllib.SGMLParser); it calls back
            # into the handle_* methods below, refilling the token stack.
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ["starttag", "endtag", "startendtag"]:
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
        returned text); endat is a tuple (type, name) where type is
        "starttag", "endtag" or "startendtag", and name is the element name of
        the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the class docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs,
                             self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    # Possibly render the opening tag as text, per the
                    # textify protocol described in the class docstring.
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    # Leave the terminating tag for the next caller.
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # The handle_* methods below are the callbacks invoked by the concrete
    # parser base class during .feed(); each simply queues a Token.

    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))

    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))

    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))

    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))

    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))

    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))

    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))

    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))

    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))

    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity and character references in one attribute value."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return attrs with every value passed through .unescape_attr()."""
        escaped_attrs = []
        for key, val in attrs:
            escaped_attrs.append((key, self.unescape_attr(val)))
        return escaped_attrs
|
---|
| 364 |
|
---|
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on the strict stdlib HTMLParser.HTMLParser."""

    def __init__(self, *args, **kwds):
        # Initialise the HTMLParser machinery first, then our own state;
        # all constructor arguments belong to _AbstractParser.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        # Override HTMLParser.HTMLParser.unescape so that the entitydefs
        # passed into our constructor are used, not HTMLParser.HTMLParser's
        # own entitydefs table.
        return self.unescape_attr(name)
|
---|
| 373 |
|
---|
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on the more forgiving sgmllib.SGMLParser."""

    def __init__(self, *args, **kwds):
        # Initialise the SGMLParser machinery first, then our own state;
        # all constructor arguments belong to _AbstractParser.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        # Unlike HTMLParser.HTMLParser, SGMLParser hands us attribute values
        # with entity references still escaped, so unescape them here before
        # queueing the token.
        unescaped = self.unescape_attrs(attrs)
        self._tokenstack.append(Token("starttag", tag, unescaped))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
|
---|
| 383 |
|
---|
| 384 |
|
---|
def _test():
    """Run this module's doctests and return the doctest result tuple."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)
|
---|
| 388 |
|
---|
# When run as a script, exercise this module's doctests.
if __name__ == "__main__":
    _test()
|
---|