1 | """HTML handling.
|
---|
2 |
|
---|
3 | Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
---|
4 |
|
---|
5 | This code is free software; you can redistribute it and/or modify it under
|
---|
6 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
---|
7 | included with the distribution).
|
---|
8 |
|
---|
9 | """
|
---|
10 |
|
---|
11 | import re, copy, htmlentitydefs
|
---|
12 | import sgmllib, ClientForm
|
---|
13 |
|
---|
14 | import _request
|
---|
15 | from _headersutil import split_header_words, is_html as _is_html
|
---|
16 | import _rfc3986
|
---|
17 |
|
---|
# Character encoding passed to EncodingFinder as the fallback for responses
# whose Content-Type headers carry no charset parameter.
DEFAULT_ENCODING = "latin-1"

# Matches any run of whitespace; used to collapse link and title text to
# single spaces.
COMPRESS_RE = re.compile(r"\s+")
|
---|
21 |
|
---|
22 |
|
---|
# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError):
    """Error raised by this module when parsing a document fails."""
|
---|
25 |
|
---|
26 |
|
---|
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    Calling the instance returns a fresh generator over every item of the
    wrapped iterable.  Items are pulled from the underlying iterator lazily
    and remembered, so the iterable is consumed at most once no matter how
    many generators are created, and generators may be advanced in any
    interleaved order without losing items.
    """

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        """Return a generator over all items, replaying cached ones first."""
        cache = self._cache
        position = 0
        while True:
            # Replay everything cached beyond our own position.  The length
            # is re-read on every pass because another generator may have
            # appended to the shared cache while this one was suspended.
            while position < len(cache):
                yield cache[position]
                position += 1
            # We have caught up with the cache: pull exactly one new item
            # from the shared iterator, record it, and loop round so that it
            # is yielded from the cache like any other item.
            for item in self._iterator:
                cache.append(item)
                break
            else:
                # underlying iterator is exhausted
                return
|
---|
43 |
|
---|
44 |
|
---|
class EncodingFinder:
    """Find the character encoding of a response from its headers."""

    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        """Return the first charset given in a Content-Type header.

        Falls back to the default encoding passed to the constructor when no
        Content-Type header carries a charset parameter.
        """
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        content_types = response.info().getheaders("content-type")
        for header_value in content_types:
            parameters = split_header_words([header_value])[0]
            for key, value in parameters:
                if key == "charset":
                    return value
        return self._default_encoding
|
---|
57 |
|
---|
class ResponseTypeFinder:
    """Decide whether a response holds an HTML document."""

    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        """Return the verdict of _is_html for the response.

        The decision is based on the Content-Type headers and the URL of the
        response, plus the allow_xhtml flag given to the constructor.
        """
        # XXX encoding is accepted but not used yet
        return _is_html(
            response.info().getheaders("content-type"),
            response.geturl(),
            self._allow_xhtml,
            )
|
---|
66 |
|
---|
67 |
|
---|
68 | # idea for this argument-processing trick is from Peter Otten
|
---|
class Args:
    """Attribute-style view of a mapping of keyword arguments.

    The arguments live in the .dictionary attribute.  Reading an attribute
    looks the name up in that dictionary, and assigning an attribute stores
    the value there too, so that **args.dictionary always reflects any
    changes made through attribute assignment.
    """

    def __init__(self, args_map):
        # Go through __dict__ directly so that __setattr__ is bypassed.
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        # Only called when normal attribute lookup has failed.
        if key == "dictionary":
            # The mapping itself is missing (instance not initialised);
            # looking it up again below would recurse forever.
            raise AttributeError(key)
        try:
            return self.dictionary[key]
        except KeyError:
            # raises AttributeError for unknown names
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        if key == "dictionary":
            # replacing the whole mapping is still allowed
            self.__dict__["dictionary"] = value
        else:
            # write through, so the value is seen by **self.dictionary
            self.dictionary[key] = value
|
---|
77 |
|
---|
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Return an Args object holding the given form-parser arguments.

    Each parameter becomes an entry of the same name in the .dictionary of
    the returned object.
    """
    return Args({
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        })
|
---|
85 |
|
---|
86 |
|
---|
class Link:
    """A link found in a document.

    Attributes:

    base_url: URL that url is resolved against
    url: the URL as passed in
    absolute_url: url joined onto base_url
    text: link text (may be None)
    tag: name of the tag the link came from
    attrs: attributes of that tag

    Two links are equal when their url, text, tag and attrs are equal;
    base_url and absolute_url do not take part in the comparison.
    """

    # names of the attributes that take part in comparisons
    _COMPARED_ATTRS = ("url", "text", "tag", "attrs")

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs

    def __eq__(self, other):
        try:
            for name in self._COMPARED_ATTRS:
                if getattr(self, name) != getattr(other, name):
                    return False
        except AttributeError:
            # other is not link-like
            return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __cmp__(self, other):
        # Kept for callers that use cmp(): 0 when equal, -1 otherwise (no
        # meaningful ordering is defined).
        if self.__eq__(other):
            return 0
        return -1

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
|
---|
104 |
|
---|
105 |
|
---|
class LinksFactory:
    """Extract links from a response using a pull parser.

    link_parser_class: parser class, called as
     link_parser_class(response, encoding=encoding); defaults to
     _pullparser.TolerantPullParser
    link_class: callable used to build each link, called as
     link_class(base_url, url, text, tag, attrs); defaults to Link
    urltags: mapping from tag name to the name of the attribute that holds
     the URL for that tag
    """

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None
        self._base_url = None

    def set_response(self, response, base_url, encoding):
        """Set the response to parse, its base URL and its encoding."""
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document.

        This is a generator, so parsing happens while iterating.  An
        sgmllib.SGMLParseError raised by the parser is re-raised as
        ParseError.
        """
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)
        # BASE is requested too, because it changes the base URL of the
        # links that follow it.
        wanted_tags = list(self.urltags.keys()) + ["base"]

        try:
            for token in p.tags(*wanted_tags):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for eg.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                # honour the link_class given to the constructor
                yield self.link_class(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError as exc:
            raise ParseError(exc)
|
---|
176 |
|
---|
class FormsFactory:

    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        # fill in the defaults for the classes that were not supplied
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        if request_class is None:
            request_class = _request.Request
        self.select_default = select_default
        self.form_parser_class = form_parser_class
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        """Set the response to parse and its encoding; forget global_form."""
        self._response, self.encoding = response, encoding
        self.global_form = None

    def forms(self):
        """Parse the response and return its forms.

        The first item returned by ClientForm.ParseResponseEx is stored as
        .global_form; the remaining items are returned.  Raises ParseError
        if ClientForm raises ClientForm.ParseError.
        """
        import ClientForm
        parse_options = dict(
            select_default=self.select_default,
            form_parser_class=self.form_parser_class,
            request_class=self.request_class,
            encoding=self.encoding,
            _urljoin=_rfc3986.urljoin,
            _urlparse=_rfc3986.urlsplit,
            _urlunparse=_rfc3986.urlunsplit,
            )
        try:
            parsed = ClientForm.ParseResponseEx(
                self._response, **parse_options)
        except ClientForm.ParseError as exc:
            raise ParseError(exc)
        self.global_form = parsed[0]
        return parsed[1:]
|
---|
231 |
|
---|
class TitleFactory:
    """Extract the title of a document using a pull parser."""

    def __init__(self):
        self._response = None
        self._encoding = None

    def set_response(self, response, encoding):
        """Set the response to parse and its encoding."""
        self._response, self._encoding = response, encoding

    def _get_title_text(self, parser):
        """Collect text up to the closing TITLE tag, whitespace-compressed.

        parser must be positioned just after the opening TITLE tag.  Tags
        found inside the title are included in the text as markup.
        """
        import _pullparser
        pieces = []
        while True:
            try:
                token = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            kind = token.type
            if kind == "data":
                pieces.append(str(token))
            elif kind == "entityref":
                pieces.append(unescape(
                    "&%s;" % token.data, parser._entitydefs, parser.encoding))
            elif kind == "charref":
                pieces.append(unescape_charref(token.data, parser.encoding))
            elif kind in ("starttag", "endtag", "startendtag"):
                if kind == "endtag" and token.data == "title":
                    break
                pieces.append(str(token))
        return COMPRESS_RE.sub(" ", "".join(pieces).strip())

    def title(self):
        """Return the document title, or None if there is no TITLE tag.

        Raises ParseError if the parser raises sgmllib.SGMLParseError.
        """
        import _pullparser
        parser = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                parser.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            return self._get_title_text(parser)
        except sgmllib.SGMLParseError as exc:
            raise ParseError(exc)
|
---|
278 |
|
---|
279 |
|
---|
def unescape(data, entities, encoding):
    """Replace character and entity references in data.

    data: string that may contain references such as &amp; or &#38;, or None
    entities: mapping from entity name to code point
    encoding: encoding used for the replacement characters; if None, the
     replacement characters are returned as unicode

    Returns data unchanged if it is None or contains no "&".  A reference
    that cannot be resolved (unknown entity name, malformed or out-of-range
    number, character not representable in encoding) is left in the text
    exactly as it was written.
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            try:
                return unescape_charref(ent[2:-1], encoding)
            except ValueError:
                # The pattern below also matches things like "&#zz;" that
                # are not valid numeric references: keep them verbatim
                # instead of aborting the whole substitution.
                return ent

        codepoint = entities.get(ent[1:-1])
        if codepoint is None:
            # unknown entity name
            return ent
        try:
            char = unichr(codepoint)
        except ValueError:
            # code point out of range
            return ent
        if encoding is None:
            # same convention as unescape_charref
            return char
        try:
            return char.encode(encoding)
        except UnicodeError:
            return ent

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
|
---|
302 |
|
---|
def unescape_charref(data, encoding):
    """Return the character for a numeric character reference.

    data: the reference without its "&#" prefix and ";" suffix: decimal
     digits, or "x" / "X" followed by hexadecimal digits
    encoding: encoding for the result; if None, the character is returned
     as unicode

    If data is not a valid number, is out of range, or the character cannot
    be represented in encoding, the reference is returned in its escaped
    form, "&#<data>;".
    """
    name, base = data, 10
    if name[:1] in ("x", "X"):
        name, base = name[1:], 16
    try:
        codepoint = int(name, base)
        uc = unichr(codepoint)
    except (ValueError, OverflowError):
        # not a number, or not a valid code point: leave the reference as
        # it was written rather than failing the whole parse
        return "&#%s;" % data
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
|
---|
316 |
|
---|
317 |
|
---|
# bizarre import gymnastics for bundled BeautifulSoup
# The form parser classes are built by ClientForm from the two soup classes
# of the bundled _beautifulsoup module.
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# The replacement pattern accepts an optional "x" followed by hex digits, and
# requires one further non-hex character after the reference.  Assigning the
# module attribute changes sgmllib.charref for every user of sgmllib in the
# process, not just for this module.
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
|
---|
326 |
|
---|
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that resolves references with a set encoding.

    Character and entity references met while parsing are passed through
    unescape() with the encoding given to the constructor, and the result
    is handled as ordinary data.
    """

    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [
        (re.compile('(<[^<>]*)/>'),
         lambda match: match.group(1) + ' />'),
        (re.compile(r'<!\s+([^<>]*)>'),
         lambda match: '<!' + match.group(1) + '>'),
        ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        # the encoding must be in place before the base class starts parsing
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        self.handle_data(
            unescape("&#%s;" % ref, self._entitydefs, self._encoding))

    def handle_entityref(self, ref):
        self.handle_data(
            unescape("&%s;" % ref, self._entitydefs, self._encoding))

    def unescape_attrs(self, attrs):
        """Return attrs as a list of (key, value) with values unescaped."""
        return [(key, unescape(val, self._entitydefs, self._encoding))
                for key, val in attrs]
|
---|
354 |
|
---|
class RobustLinksFactory:
    """Extract links from a parsed BeautifulSoup document.

    link_parser_class: stored on the instance; defaults to MechanizeBs
    link_class: callable used to build each link, called as
     link_class(base_url, url, text, tag, attrs); defaults to Link
    urltags: mapping from tag name to the name of the attribute that holds
     the URL for that tag
    """

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        """Set the parsed document, its base URL and its encoding."""
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Return an iterator that provides links of the document."""
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # BASE is looked for too, because it changes the base URL of the
        # links that follow it.  Built once rather than for every node.
        wanted_tags = list(self.urltags.keys()) + ["base"]
        for ch in bs.recursiveChildGenerator():
            if not (isinstance(ch, _beautifulsoup.Tag) and
                    ch.name in wanted_tags):
                continue
            link = ch
            attrs = bs.unescape_attrs(link.attrs)
            attrs_dict = dict(attrs)
            if link.name == "base":
                base_href = attrs_dict.get("href")
                if base_href is not None:
                    base_url = base_href
                continue
            url_attr = self.urltags[link.name]
            url = attrs_dict.get(url_attr)
            if not url:
                # no URL, so not a link for our purposes
                continue
            url = _rfc3986.clean_url(url, encoding)
            text = link.fetchText(lambda t: True)
            if not text:
                # follow _pullparser's weird behaviour rigidly
                if link.name == "a":
                    text = ""
                else:
                    text = None
            else:
                text = self.compress_re.sub(" ", " ".join(text).strip())
            # honour the link_class given to the constructor
            yield self.link_class(base_url, url, text, link.name, attrs)
|
---|
417 |
|
---|
418 |
|
---|
class RobustFormsFactory(FormsFactory):
    """FormsFactory whose default form parser is RobustFormParser.

    Takes the same constructor arguments as FormsFactory; only the default
    for form_parser_class differs.
    """

    def __init__(self, *args, **kwds):
        parsed = form_parser_args(*args, **kwds)
        if parsed.form_parser_class is None:
            # Store the default in the mapping that is actually forwarded
            # below.  Setting it as an attribute of the Args object is not
            # enough: FormsFactory would still receive None and fall back to
            # its own default parser.
            parsed.dictionary["form_parser_class"] = RobustFormParser
        FormsFactory.__init__(self, **parsed.dictionary)

    def set_response(self, response, encoding):
        """Set the response to parse and its encoding.

        Unlike FormsFactory.set_response, .global_form is left untouched.
        """
        self._response = response
        self.encoding = encoding
|
---|
429 |
|
---|
430 |
|
---|
class RobustTitleFactory:
    """Extract the title from a parsed BeautifulSoup document."""

    def __init__(self):
        self._bs = None
        self._encoding = None

    def set_soup(self, soup, encoding):
        """Set the parsed document and its encoding."""
        self._bs, self._encoding = soup, encoding

    def title(self):
        """Return the whitespace-compressed title, or None if there is none.

        The title is the markup of everything inside the first TITLE
        element.
        """
        import _beautifulsoup
        title_tag = self._bs.first("title")
        if title_tag == _beautifulsoup.Null:
            return None
        pieces = [str(node) for node in title_tag.contents]
        return COMPRESS_RE.sub(" ", "".join(pieces).strip())
|
---|
447 |
|
---|
448 |
|
---|
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.  They are
    computed on first access after set_response() and then cached until the
    next call to set_response().

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    # names of the attributes that __getattr__ computes on demand and that
    # set_response() discards
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        forms_factory: object whose .forms() supplies the forms; its
         .request_class attribute is set by set_request_class(), and its
         .global_form attribute, if any, is read after .forms() is called
        links_factory: object whose .links() supplies the links
        title_factory: object whose .title() supplies the page title
        encoding_finder: object whose .encoding(response) returns the
         character encoding of a response.  The default instance falls back
         to DEFAULT_ENCODING (currently latin-1, which may change in future)
         when the response does not specify one.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this
         right without resorting to that default.
        response_type_finder: object whose .is_html(response, encoding)
         tells whether a response is HTML

        Note that the default encoding_finder and response_type_finder
        objects are created once, when the class is defined, and shared by
        all instances that do not supply their own.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        Everything cached for the previous response (the forms and links
        generators and the attributes named in LAZY_ATTRS) is discarded.

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        # NOTE(review): _get_title is not read anywhere in the visible code
        self._get_title = None
        for name in self.LAZY_ATTRS:
            try:
                # removing the instance attribute makes the next access go
                # through __getattr__ again
                delattr(self, name)
            except AttributeError:
                # not computed for the previous response
                pass

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails, i.e. for a lazy
        # attribute that has not been computed since the last set_response().
        # Each branch stores the value on the instance, so that later
        # accesses find it without coming back here.
        if name not in self.LAZY_ATTRS:
            # raises AttributeError for unknown names
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            # forms() assigns self.global_form as a side effect
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                # discard any partly computed state, then let the error
                # propagate
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                # discard any partly computed state, then let the error
                # propagate
                self.set_response(self._response)
                raise
        return self._links_genf()
|
---|
580 |
|
---|
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        forms_factory = FormsFactory()
        links_factory = LinksFactory()
        title_factory = TitleFactory()
        type_finder = ResponseTypeFinder(
            allow_xhtml=i_want_broken_xhtml_support)
        Factory.__init__(
            self,
            forms_factory=forms_factory,
            links_factory=links_factory,
            title_factory=title_factory,
            response_type_finder=type_finder,
            )

    def set_response(self, response):
        """Set response, handing a separate copy to each sub-factory."""
        Factory.set_response(self, response)
        if response is None:
            return
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), self.encoding)
        self._title_factory.set_response(
            copy.copy(response), self.encoding)
|
---|
602 |
|
---|
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        forms_factory = RobustFormsFactory()
        links_factory = RobustLinksFactory()
        title_factory = RobustTitleFactory()
        type_finder = ResponseTypeFinder(
            allow_xhtml=i_want_broken_xhtml_support)
        Factory.__init__(
            self,
            forms_factory=forms_factory,
            links_factory=links_factory,
            title_factory=title_factory,
            response_type_finder=type_finder,
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        """Set response and parse it once into a soup shared by the links
        and title factories; the forms factory gets a copy of the response.
        """
        Factory.set_response(self, response)
        if response is None:
            return
        markup = response.read()
        soup = self._soup_class(self.encoding, markup)
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_soup(
            soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
|
---|