source: py-scraping/mechanize/_http.py@ 146

"""HTTP related handlers.

Note that some other HTTP handlers live in more specific modules: _auth.py,
_gzip.py, etc.


Copyright 2002-2006 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import time, htmlentitydefs, logging, socket, \
       urllib2, urllib, httplib, sgmllib
from urllib2 import URLError, HTTPError, BaseHandler
from cStringIO import StringIO

from _clientcookie import CookieJar
from _headersutil import is_html
from _html import unescape, unescape_charref
from _request import Request
from _response import closeable_response, response_seek_wrapper
import _rfc3986
import _sockettimeout

debug = logging.getLogger("mechanize").debug
debug_robots = logging.getLogger("mechanize.robots").debug

# monkeypatch urllib2.HTTPError to show URL
## def urllib2_str(self):
##     return 'HTTP Error %s: %s (%s)' % (
##         self.code, self.msg, self.geturl())
## urllib2.HTTPError.__str__ = urllib2_str


CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
DEFAULT_ENCODING = 'latin-1'


try:
    socket._fileobject("fake socket", close=True)
except TypeError:
    # python <= 2.4
    create_readline_wrapper = socket._fileobject
else:
    def create_readline_wrapper(fh):
        return socket._fileobject(fh, close=True)

# This adds "refresh" to the list of redirectables and provides a redirection
# algorithm that doesn't go into a loop in the presence of cookies
# (Python 2.4 has this new algorithm, 2.3 doesn't).
class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    # Implementation notes:

    # To avoid the server sending us into an infinite loop, the request
    # object needs to track what URLs we have already seen.  Do this by
    # adding a handler-specific attribute to the Request object.  The value
    # of the dict is used to count the number of times the same URL has
    # been visited.  This is needed because visiting the same URL twice
    # does not necessarily imply a loop, thanks to state introduced by
    # cookies.

    # Always unhandled redirection codes:
    # 300 Multiple Choices: should not handle this here.
    # 304 Not Modified: no need to handle here: only of interest to caches
    #     that do conditional GETs
    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: what was this for in previous versions of the protocol?

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, return None to indicate that an HTTPError should be
        raised.

        """
        if code in (301, 302, 303, "refresh") or \
           (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            # the same.
            # XXX really, refresh redirections should be treated as visits;
            # tricky to fix, so this will wait until a post-stable release
            new = Request(newurl,
                          headers=req.headers,
                          origin_req_host=req.get_origin_req_host(),
                          unverifiable=True,
                          visit=False,
                          )
            new._origin_req = getattr(req, "_origin_req", req)
            return new
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if headers.has_key('location'):
            newurl = headers.getheaders('location')[0]
        elif headers.has_key('uri'):
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

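
# Illustrative sketch (editorial addition, not part of the original module):
# max_repeats and max_redirections are plain class attributes, so they can be
# tuned on an instance before the handler is installed on an OpenerDirector.
# The function below is never called at import time; it assumes that
# mechanize.build_opener accepts handler instances and substitutes them for
# its defaults of the same class, as urllib2.build_opener does.
def _example_redirect_handler():
    handler = HTTPRedirectHandler()
    handler.max_repeats = 2        # allow fewer repeat visits to any one URL
    handler.max_redirections = 5   # and a shorter redirection chain overall
    import mechanize
    opener = mechanize.build_opener(handler)
    return opener.open("http://example.com/")  # hypothetical URL
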
# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
    # only these elements are allowed in or before HEAD of document
    head_elems = ("html", "head",
                  "title", "base",
                  "script", "style", "meta", "link", "object")
    _entitydefs = htmlentitydefs.name2codepoint
    _encoding = DEFAULT_ENCODING

    def __init__(self):
        self.http_equiv = []

    def start_meta(self, attrs):
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = self.unescape_attr_if_required(value)
            elif key == "content":
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()

    def handle_entityref(self, name):
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            escaped_attrs[key] = self.unescape_attr(val)
        return escaped_attrs

    def unknown_entityref(self, ref):
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        self.handle_data("&#%s;" % ref)


try:
    import HTMLParser
except ImportError:
    pass
else:
    class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                    HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'start_' + tag)
            except AttributeError:
                try:
                    method = getattr(self, 'do_' + tag)
                except AttributeError:
                    pass  # unknown tag
                else:
                    method(attrs)
            else:
                method(attrs)

        def handle_endtag(self, tag):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                pass  # unknown tag
            else:
                method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):

    def _not_called(self):
        assert False

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        if tag == "meta":
            method(attrs)

    def unknown_starttag(self, tag, attrs):
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            method()
        else:
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        return self.unescape_attr(name)

def parse_head(fileobj, parser):
    """Return a list of key, value pairs."""
    while 1:
        data = fileobj.read(CHUNK)
        try:
            parser.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            # this should only happen if there is no HTML body, or if
            # CHUNK is big
            break
    return parser.http_equiv

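# Illustrative sketch (editorial addition, not part of the original module):
# parse_head() can be driven with any file-like object; together with
# HeadParser it yields the http-equiv/content pairs found in the document
# HEAD.  The HTML snippet below is made up for the example, and the function
# is never called at import time.
def _example_parse_head():
    html = ('<html><head>'
            '<meta http-equiv="Content-Type"'
            ' content="text/html; charset=utf-8">'
            '</head><body>ignored</body></html>')
    # expected result: [('Content-Type', 'text/html; charset=utf-8')]
    return parse_head(StringIO(html), HeadParser())
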
class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def __init__(self, head_parser_class=HeadParser,
                 i_want_broken_xhtml_support=False,
                 ):
        self.head_parser_class = head_parser_class
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response

    https_response = http_response

class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Public attributes:

    cookiejar: CookieJar instance

    """
    def __init__(self, cookiejar=None):
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

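# Illustrative sketch (editorial addition, not part of the original module):
# passing in your own CookieJar keeps the cookies inspectable and lets
# several openers share one cookie store.  The OpenerDirector calls the two
# hooks around every fetch: http_request() copies matching cookies onto the
# outgoing request, http_response() stores any Set-Cookie headers.
def _example_cookie_processor():
    jar = CookieJar()
    processor = HTTPCookieProcessor(cookiejar=jar)
    # install with e.g. mechanize.build_opener(processor); the cookies
    # received so far can then be listed with:  list(jar)
    return processor
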
try:
    import robotparser
except ImportError:
    pass
else:
    class MechanizeRobotFileParser(robotparser.RobotFileParser):

        def __init__(self, url='', opener=None):
            robotparser.RobotFileParser.__init__(self, url)
            self._opener = opener
            self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT

        def set_opener(self, opener=None):
            import _opener
            if opener is None:
                opener = _opener.OpenerDirector()
            self._opener = opener

        def set_timeout(self, timeout):
            self._timeout = timeout

        def read(self):
            """Reads the robots.txt URL and feeds it to the parser."""
            if self._opener is None:
                self.set_opener()
            req = Request(self.url, unverifiable=True, visit=False,
                          timeout=self._timeout)
            try:
                f = self._opener.open(req)
            except HTTPError, f:
                pass
            except (IOError, socket.error, OSError), exc:
                debug_robots("ignoring error opening %r: %s" %
                             (self.url, exc))
                return
            lines = []
            line = f.readline()
            while line:
                lines.append(line.strip())
                line = f.readline()
            status = f.code
            if status == 401 or status == 403:
                self.disallow_all = True
                debug_robots("disallow all")
            elif status >= 400:
                self.allow_all = True
                debug_robots("allow all")
            elif status == 200 and lines:
                debug_robots("parse lines")
                self.parse(lines)

    class RobotExclusionError(urllib2.HTTPError):
        def __init__(self, request, *args):
            apply(urllib2.HTTPError.__init__, (self,) + args)
            self.request = request

    class HTTPRobotRulesProcessor(BaseHandler):
        # before redirections, after everything else
        handler_order = 800

        try:
            from httplib import HTTPMessage
        except:
            from mimetools import Message
            http_response_class = Message
        else:
            http_response_class = HTTPMessage

        def __init__(self, rfp_class=MechanizeRobotFileParser):
            self.rfp_class = rfp_class
            self.rfp = None
            self._host = None

        def http_request(self, request):
            scheme = request.get_type()
            if scheme not in ["http", "https"]:
                # robots exclusion only applies to HTTP
                return request

            if request.get_selector() == "/robots.txt":
                # /robots.txt is always OK to fetch
                return request

            host = request.get_host()

            # robots.txt requests don't need to be allowed by robots.txt :-)
            origin_req = getattr(request, "_origin_req", None)
            if (origin_req is not None and
                origin_req.get_selector() == "/robots.txt" and
                origin_req.get_host() == host
                ):
                return request

            if host != self._host:
                self.rfp = self.rfp_class()
                try:
                    self.rfp.set_opener(self.parent)
                except AttributeError:
                    debug("%r instance does not support set_opener" %
                          self.rfp.__class__)
                self.rfp.set_url(scheme + "://" + host + "/robots.txt")
                self.rfp.set_timeout(request.timeout)
                self.rfp.read()
                self._host = host

            ua = request.get_header("User-agent", "")
            if self.rfp.can_fetch(ua, request.get_full_url()):
                return request
            else:
                # XXX This should really have raised URLError.  Too late now...
                msg = "request disallowed by robots.txt"
                raise RobotExclusionError(
                    request,
                    request.get_full_url(),
                    403, msg,
                    self.http_response_class(StringIO()), StringIO(msg))

        https_request = http_request

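# Illustrative sketch (editorial addition, not part of the original module):
# when robots.txt disallows a URL, HTTPRobotRulesProcessor raises
# RobotExclusionError (an HTTPError subclass carrying a 403 code), so callers
# can treat it like any other HTTP error or inspect .request for details.
# 'opener' is assumed to be an OpenerDirector with the robots handler
# installed (mechanize does this by default); the function is never called
# at import time.
def _example_handle_robot_exclusion(opener, url):
    try:
        return opener.open(url)
    except RobotExclusionError, exc:
        debug("blocked by robots.txt: %s (%s)" %
              (exc.request.get_full_url(), exc.code))
        return None
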
class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response

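# Illustrative sketch (editorial addition, not part of the original module):
# the processor simply remembers the URL of the last response, so it only
# suits a strictly linear chain of fetches (page A, then a link found on A,
# and so on); mechanize.Browser tracks Referer per page instead.
def _example_referer_processor():
    processor = HTTPRefererProcessor()
    # An OpenerDirector drives it like this for each fetch in the chain:
    #   processor.http_request(request)            adds "Referer" if one is known
    #   processor.http_response(request, response) remembers response.geturl()
    #                                              for the next request
    return processor
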
def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding

def parse_refresh_header(refresh):
    """
    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah

    """

    ii = refresh.find(";")
    if ii != -1:
        pause, newurl_spec = float(refresh[:ii]), refresh[ii + 1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj + 1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            raise ValueError()
    else:
        pause, newurl = float(refresh), None
    return pause, newurl

class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above

    """
    handler_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time
        self._sleep = time.sleep

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = hdrs.getheaders("refresh")[0]
            try:
                pause, newurl = parse_refresh_header(refresh)
            except ValueError:
                debug("bad Refresh header: %r" % refresh)
                return response

            if newurl is None:
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    self._sleep(pause)
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)
            else:
                debug("Refresh header ignored: %r" % refresh)

        return response

    https_response = http_response

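# Illustrative sketch (editorial addition, not part of the original module):
# the two constructor arguments cover the common configurations.  With the
# defaults only "Refresh: 0; url=..." is followed; the variant below follows
# every Refresh header, however long its pause, but never sleeps for it.
def _example_refresh_processor():
    follow_all_immediately = HTTPRefreshProcessor(max_time=None,
                                                  honor_time=False)
    return follow_all_immediately
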
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses.

    The purpose of this handler is to allow other response processors a
    look-in by removing the call to parent.error() from
    AbstractHTTPHandler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
    handler_order = 1000  # after all other processors

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code != 200:
            # hardcoded http is NOT a bug
            response = self.parent.error(
                "http", request, response, code, msg, hdrs)

        return response

    https_response = http_response


class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        # why these error methods took the code, msg, headers args in the first
        # place rather than a response object, I don't know, but to avoid
        # multiple wrapping, we're discarding them

        if isinstance(fp, urllib2.HTTPError):
            response = fp
        else:
            response = urllib2.HTTPError(
                req.get_full_url(), code, msg, hdrs, fp)
        assert code == response.code
        assert msg == response.msg
        assert hdrs == response.hdrs
        raise response

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        scheme, sel = urllib.splittype(request.get_selector())
        sel_host, sel_path = urllib.splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host_port = req.get_host()
        if not host_port:
            raise URLError('no host given')

        try:
            h = http_class(host_port, timeout=req.timeout)
        except TypeError:
            # Python < 2.6, no per-connection timeout support
            h = http_class(host_port)
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            [(name.title(), val) for name, val in headers.items()])
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = create_readline_wrapper(r)

        resp = closeable_response(fp, r.msg, req.get_full_url(),
                                  r.status, r.reason)
        return resp

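# Illustrative sketch (editorial addition, not part of the original module):
# the closeable_response returned by do_open() behaves like the addinfourl
# objects urllib2 returns, so the usual idioms apply.  'response' is assumed
# to come from an opener built on the HTTPHandler defined below.
def _example_read_response(response):
    body = response.read()  # file-like interface
    content_type = response.info().getheader("Content-Type")
    return response.code, content_type, len(body)
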
class HTTPHandler(AbstractHTTPHandler):
    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(httplib, 'HTTPS'):

    class HTTPSConnectionFactory:
        def __init__(self, key_file, cert_file):
            self._key_file = key_file
            self._cert_file = cert_file
        def __call__(self, hostport):
            return httplib.HTTPSConnection(
                hostport,
                key_file=self._key_file, cert_file=self._cert_file)

    class HTTPSHandler(AbstractHTTPHandler):
        def __init__(self, client_cert_manager=None):
            AbstractHTTPHandler.__init__(self)
            self.client_cert_manager = client_cert_manager

        def https_open(self, req):
            if self.client_cert_manager is not None:
                key_file, cert_file = self.client_cert_manager.find_key_cert(
                    req.get_full_url())
                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
            else:
                conn_factory = httplib.HTTPSConnection
            return self.do_open(conn_factory, req)

        https_request = AbstractHTTPHandler.do_request_
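
# Illustrative sketch (editorial addition, not part of the original module):
# HTTPSHandler accepts any object with a find_key_cert(url) method returning
# a (key_file, cert_file) pair.  The manager class and file names below are
# made up for the example, and the functions are never called at import
# time; the opener wiring assumes mechanize.build_opener, which mirrors
# urllib2.build_opener.
class _ExampleFixedCertManager:
    def __init__(self, key_file, cert_file):
        self._pair = (key_file, cert_file)
    def find_key_cert(self, url):
        return self._pair

def _example_https_opener():
    import mechanize
    manager = _ExampleFixedCertManager("client.key", "client.crt")  # placeholders
    return mechanize.build_opener(HTTPSHandler(client_cert_manager=manager))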