source: py-scraping/mechanize/_html.py@ 144

Last change on this file since 144 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 20.8 KB
RevLine 
[106]1"""HTML handling.
2
3Copyright 2003-2006 John J. Lee <jjl@pobox.com>
4
5This code is free software; you can redistribute it and/or modify it under
6the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
7included with the distribution).
8
9"""
10
11import re, copy, htmlentitydefs
12import sgmllib, ClientForm
13
14import _request
15from _headersutil import split_header_words, is_html as _is_html
16import _rfc3986
17
# Fallback character encoding: Factory's default encoding finder returns
# this when no Content-Type header of the response carries a charset.
DEFAULT_ENCODING = "latin-1"

# Matches a run of whitespace; used to collapse such runs to a single
# space in title text and link text.
COMPRESS_RE = re.compile(r"\s+")
21
22
# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError):
    """Raised by this module's factories when parsing a document fails."""
25
26
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    Calling an instance returns a new generator over the items of the
    wrapped iterable.  Items are pulled from the underlying iterator at most
    once and remembered, so every generator obtained from the same instance
    yields the complete sequence from the start -- including generators
    that are consumed in an interleaved fashion.
    """

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        """Return a generator yielding every item, cached ones first."""
        cache = self._cache
        index = 0
        while True:
            # Re-check the cache length on every step: another generator
            # created from this object may have appended items since we last
            # looked.  Walking by position (instead of switching for good to
            # the shared iterator) guarantees we never skip those items.
            if index >= len(cache):
                try:
                    item = next(self._iterator)
                except StopIteration:
                    return
                cache.append(item)
            yield cache[index]
            index += 1
43
44
class EncodingFinder:
    """Determines the character encoding declared by a response.

    default_encoding: value returned by .encoding() when no Content-Type
     header of the response carries a charset parameter.
    """

    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        """Return the first charset found in the Content-Type headers.

        Falls back to the default encoding given to the constructor.
        """
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            words = split_header_words([ct])
            # NOTE(review): split_header_words presumably returns an empty
            # list for an empty header value -- confirm against
            # _headersutil.  Skip such headers rather than fail with
            # IndexError on words[0].
            if not words:
                continue
            for k, v in words[0]:
                if k == "charset":
                    return v
        return self._default_encoding
57
class ResponseTypeFinder:
    """Decides whether a response contains an HTML document.

    allow_xhtml: passed through unchanged to the _is_html() helper.
    """

    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        """Return the verdict of _is_html() for response's headers and URL."""
        # XXX encoding (the argument is accepted but not used yet)
        content_type_headers = response.info().getheaders("content-type")
        return _is_html(
            content_type_headers, response.geturl(), self._allow_xhtml)
66
67
# idea for this argument-processing trick is from Peter Otten
class Args:
    """Exposes the items of a mapping as attributes.

    The mapping is copied into .dictionary; names missing from it are looked
    up on the class, which raises AttributeError for unknown names.
    """

    def __init__(self, args_map):
        self.dictionary = dict(args_map)

    def __getattr__(self, key):
        # only reached when normal attribute lookup has already failed
        values = self.dictionary
        if key in values:
            return values[key]
        return getattr(self.__class__, key)
77
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle the form-parser options into an Args object.

    The returned object's .dictionary holds exactly the four arguments,
    keyed by parameter name.
    """
    return Args({
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        })
85
86
class Link:
    """A link found in a document.

    Attributes:

    base_url: URL the link is relative to
    url: link URL as found in the document
    absolute_url: url joined to base_url
    text: link text (may be None)
    tag: name of the tag the link came from
    attrs: attributes of that tag

    Two links are equal when their url, text, tag and attrs are equal;
    base_url and absolute_url do not take part in comparisons.
    """

    # attributes that take part in comparisons, in comparison order
    _COMPARED_ATTRS = ("url", "text", "tag", "attrs")

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs

    def __eq__(self, other):
        # Explicit rich comparison: __cmp__ alone is ignored by Python 3 and
        # only ever distinguishes "equal" from "not equal" anyway.
        try:
            for name in self._COMPARED_ATTRS:
                if getattr(self, name) != getattr(other, name):
                    return False
        except AttributeError:
            # other is not Link-like
            return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __cmp__(self, other):
        # Kept for backwards compatibility.  Only the result 0 (equal) is
        # meaningful; -1 merely means "different", not "less than".
        if self.__eq__(other):
            return 0
        return -1

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
104
105
class LinksFactory:
    """Extracts links from a response using a pull parser.

    Constructor arguments:

    link_parser_class: parser class, called as
     link_parser_class(response, encoding=encoding); defaults to
     _pullparser.TolerantPullParser
    link_class: class of the objects yielded by .links(), called as
     link_class(base_url, url, text, tag, attrs); defaults to Link
    urltags: mapping from tag name to the name of the attribute holding the
     URL; defaults to a/area href and frame/iframe src
    """

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None
        # initialised here too, so the attribute always exists
        self._base_url = None

    def set_response(self, response, base_url, encoding):
        """Set the response to parse, its base URL and its encoding."""
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document.

        Yields self.link_class instances.  A BASE element with an href
        replaces the base URL for the links that follow it.  Raises
        ParseError if the underlying parser raises sgmllib.SGMLParseError.
        """
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            for token in p.tags(*(list(self.urltags.keys()) + ["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for eg.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                # honour the link_class given to the constructor instead of
                # always instantiating Link
                yield self.link_class(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError as exc:
            raise ParseError(exc)
176
class FormsFactory:

    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        # fill in the defaults for the classes that were not supplied
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        if request_class is None:
            request_class = _request.Request
        self.select_default = select_default
        self.form_parser_class = form_parser_class
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        """Set the response to parse and forget any previous global form."""
        self.global_form = None
        self.encoding = encoding
        self._response = response

    def forms(self):
        """Parse the response and return the forms it contains.

        The first form produced by ClientForm.ParseResponseEx is stored as
        .global_form; the remaining ones are returned.  ClientForm.ParseError
        is re-raised as this module's ParseError.
        """
        import ClientForm
        parse_options = {
            "select_default": self.select_default,
            "form_parser_class": self.form_parser_class,
            "request_class": self.request_class,
            "encoding": self.encoding,
            "_urljoin": _rfc3986.urljoin,
            "_urlparse": _rfc3986.urlsplit,
            "_urlunparse": _rfc3986.urlunsplit,
            }
        try:
            parsed = ClientForm.ParseResponseEx(
                self._response, **parse_options)
        except ClientForm.ParseError as exc:
            raise ParseError(exc)
        self.global_form = parsed[0]
        return parsed[1:]
231
class TitleFactory:
    """Extracts the document title from a response using a pull parser."""

    def __init__(self):
        self._response = None
        self._encoding = None

    def set_response(self, response, encoding):
        """Set the response to parse and its encoding."""
        self._encoding = encoding
        self._response = response

    def _get_title_text(self, parser):
        """Collect the text following the TITLE start tag.

        Reads tokens until the TITLE end tag (or the end of the document),
        decoding entity and character references and keeping any nested tags
        as text, then collapses runs of whitespace to single spaces.
        """
        import _pullparser
        pieces = []
        while True:
            try:
                token = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            kind = token.type
            if kind == "data":
                pieces.append(str(token))
            elif kind == "entityref":
                pieces.append(unescape(
                    "&%s;" % token.data, parser._entitydefs, parser.encoding))
            elif kind == "charref":
                pieces.append(unescape_charref(token.data, parser.encoding))
            elif kind in ("starttag", "endtag", "startendtag"):
                if kind == "endtag" and token.data == "title":
                    break
                pieces.append(str(token))
        return COMPRESS_RE.sub(" ", "".join(pieces).strip())

    def title(self):
        """Return the title text, or None if there is no TITLE element.

        sgmllib.SGMLParseError is re-raised as this module's ParseError.
        """
        import _pullparser
        parser = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                parser.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            return self._get_title_text(parser)
        except sgmllib.SGMLParseError as exc:
            raise ParseError(exc)
278
279
def unescape(data, entities, encoding):
    """Replace entity and character references in data.

    data: text to process; None and text without "&" are returned unchanged
    entities: mapping from entity name to code point
    encoding: encoding used for the replacement characters; if None, the
     replacement characters are inserted as unicode (the same convention as
     unescape_charref)

    References that are unknown, malformed or not representable in encoding
    are left in the text as they are.
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            try:
                return unescape_charref(ent[2:-1], encoding)
            except (ValueError, OverflowError):
                # malformed or out-of-range numeric reference, eg. "&#zz;":
                # leave it alone rather than fail the whole document
                return ent

        codepoint = entities.get(ent[1:-1])
        if codepoint is None:
            # unknown entity name
            return ent
        repl = unichr(codepoint)
        if encoding is None:
            # encoding the character with None would raise TypeError
            return repl
        if not isinstance(repl, str):
            try:
                repl = repl.encode(encoding)
            except UnicodeError:
                repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
302
def unescape_charref(data, encoding):
    """Return the character for a numeric character reference.

    data: the reference without the leading "&#" and trailing ";", eg. "38"
     or "x26"
    encoding: if None, the character is returned as unicode; otherwise it is
     returned encoded in that encoding

    If the reference is malformed, out of range, or cannot be represented in
    encoding, the original reference text ("&#...;") is returned instead.
    """
    name, base = data, 10
    # a hexadecimal reference may be written with either "x" or "X"
    if name[:1] in ("x", "X"):
        name, base = name[1:], 16
    try:
        uc = unichr(int(name, base))
    except (ValueError, OverflowError):
        # not a number in this base, or not a valid code point
        return "&#%s;" % data
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
316
317
# bizarre import gymnastics for bundled BeautifulSoup:
# ClientForm._create_bs_classes is handed the two soup classes from the
# bundled _beautifulsoup module and returns the two form parser classes
# bound here.
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# The replacement pattern accepts an optional "x" prefix followed by
# hexadecimal digits, eg. "&#x41;".  This changes sgmllib.charref for
# every user of sgmllib in the process.
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
326
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that decodes references into a given encoding.

    encoding: passed to unescape() when entity and character references in
     text and attribute values are replaced.
    """

    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    # (raw strings keep "\s" a regex escape rather than a string escape;
    # plain "lambda x" replaces the Python-2-only "lambda(x)" spelling)
    PARSER_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        """Feed the decoded form of character reference ref as data."""
        t = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def handle_entityref(self, ref):
        """Feed the decoded form of entity reference ref as data."""
        t = unescape("&%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def unescape_attrs(self, attrs):
        """Return the (key, value) pairs of attrs with the values unescaped."""
        return [(key, unescape(val, self._entitydefs, self._encoding))
                for key, val in attrs]
354
class RobustLinksFactory:
    """Extracts links from a parsed BeautifulSoup tree.

    Constructor arguments:

    link_parser_class: stored on the instance; defaults to MechanizeBs
    link_class: class of the objects yielded by .links(), called as
     link_class(base_url, url, text, tag, attrs); defaults to Link
    urltags: mapping from tag name to the name of the attribute holding the
     URL; defaults to a/area href and frame/iframe src
    """

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        """Set the parsed document, its base URL and its encoding."""
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Return an iterator that provides links of the document.

        Yields self.link_class instances.  A BASE element with an href
        replaces the base URL for the links that follow it.
        """
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # built once instead of once per node of the tree
        wanted_tags = list(self.urltags.keys()) + ["base"]
        for ch in bs.recursiveChildGenerator():
            if not (isinstance(ch, _beautifulsoup.Tag) and
                    ch.name in wanted_tags):
                continue
            link = ch
            attrs = bs.unescape_attrs(link.attrs)
            attrs_dict = dict(attrs)
            if link.name == "base":
                base_href = attrs_dict.get("href")
                if base_href is not None:
                    base_url = base_href
                continue
            url_attr = self.urltags[link.name]
            url = attrs_dict.get(url_attr)
            if not url:
                # no URL: not a link for our purposes
                continue
            url = _rfc3986.clean_url(url, encoding)
            text = link.fetchText(lambda t: True)
            if not text:
                # follow _pullparser's weird behaviour rigidly
                if link.name == "a":
                    text = ""
                else:
                    text = None
            else:
                text = self.compress_re.sub(" ", " ".join(text).strip())
            # honour the link_class given to the constructor instead of
            # always instantiating Link
            yield self.link_class(base_url, url, text, link.name, attrs)
417
418
class RobustFormsFactory(FormsFactory):
    """FormsFactory whose default form parser class is RobustFormParser.

    Accepts the same constructor arguments as FormsFactory.
    """

    def __init__(self, *args, **kwds):
        options = dict(form_parser_args(*args, **kwds).dictionary)
        # The default must be stored in the mapping that is passed on.
        # Assigning an attribute on the Args object leaves its .dictionary
        # untouched, so FormsFactory would still receive None and fall back
        # to its own default parser.
        if options["form_parser_class"] is None:
            options["form_parser_class"] = RobustFormParser
        FormsFactory.__init__(self, **options)

    def set_response(self, response, encoding):
        """Set the response to parse and forget any previous global form."""
        self._response = response
        self.encoding = encoding
        # as in FormsFactory.set_response: the global form of the previous
        # response must not outlive it
        self.global_form = None
429
430
class RobustTitleFactory:
    """Extracts the document title from a parsed BeautifulSoup tree."""

    def __init__(self):
        self._bs = None
        self._encoding = None

    def set_soup(self, soup, encoding):
        """Set the parsed document and its encoding."""
        self._encoding = encoding
        self._bs = soup

    def title(self):
        """Return the title text, or None if there is no TITLE element.

        The text is the concatenation of str() of each child of the TITLE
        element, with runs of whitespace collapsed to single spaces.
        """
        import _beautifulsoup
        title_tag = self._bs.first("title")
        if title_tag == _beautifulsoup.Null:
            return None
        inner_html = "".join(map(str, title_tag.contents))
        return COMPRESS_RE.sub(" ", inner_html.strip())
447
448
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    The public attributes are computed on first access and then kept until
    the next call to set_response().

    """

    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        forms_factory: object whose .forms() supplies the forms
        links_factory: object whose .links() supplies the links
        title_factory: object whose .title() supplies the title
        encoding_finder: object whose .encoding(response) returns the
         character encoding of the response.  The default finder falls back
         to DEFAULT_ENCODING (currently latin-1, which may change in future)
         if the response does not declare an encoding.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this
         right without resorting to that fallback.
        response_type_finder: object whose .is_html(response, encoding)
         says whether the response is an HTML document

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        """
        self._response = response
        self._forms_genf = None
        self._links_genf = None
        self._get_title = None
        # forget the cached lazy attributes so they get recomputed
        for lazy_name in self.LAZY_ATTRS:
            try:
                delattr(self, lazy_name)
            except AttributeError:
                continue

    def __getattr__(self, name):
        # Only called when normal lookup fails, ie. for a lazy attribute
        # that has not been computed yet (or for a name that does not exist).
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            response_copy = copy.copy(self._response)
            found = self._encoding_finder.encoding(response_copy)
            self.encoding = found
            return found
        if name == "is_html":
            verdict = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            self.is_html = verdict
            return verdict
        if name == "title":
            if self.is_html:
                title = self._title_factory.title()
            else:
                title = None
            self.title = title
            return title
        if name == "global_form":
            # forms() stores .global_form as a side-effect
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is not None:
            return self._forms_genf()
        try:
            self._forms_genf = CachingGeneratorFunction(
                self._forms_factory.forms())
        except:  # XXXX define exception!
            # reset all cached state before letting the error through
            self.set_response(self._response)
            raise
        self.global_form = getattr(
            self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is not None:
            return self._links_genf()
        try:
            self._links_genf = CachingGeneratorFunction(
                self._links_factory.links())
        except:  # XXXX define exception!
            # reset all cached state before letting the error through
            self.set_response(self._response)
            raise
        return self._links_genf()
580
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        """Set response and hand a private copy of it to each sub-factory."""
        Factory.set_response(self, response)
        if response is None:
            return
        forms_copy = copy.copy(response)
        self._forms_factory.set_response(forms_copy, self.encoding)
        links_copy = copy.copy(response)
        self._links_factory.set_response(
            links_copy, response.geturl(), self.encoding)
        title_copy = copy.copy(response)
        self._title_factory.set_response(title_copy, self.encoding)
602
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        # class used to parse the response body; MechanizeBs by default
        if soup_class is None:
            self._soup_class = MechanizeBs
        else:
            self._soup_class = soup_class

    def set_response(self, response):
        """Set response and parse its body once into a shared soup.

        The links and title factories work on the soup; the forms factory
        gets its own copy of the response.
        """
        Factory.set_response(self, response)
        if response is None:
            return
        body = response.read()
        soup = self._soup_class(self.encoding, body)
        forms_copy = copy.copy(response)
        self._forms_factory.set_response(forms_copy, self.encoding)
        self._links_factory.set_soup(
            soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
Note: See TracBrowser for help on using the repository browser.