"""HTTP related handlers. Note that some other HTTP handlers live in more specific modules: _auth.py, _gzip.py, etc. Copyright 2002-2006 John J Lee This code is free software; you can redistribute it and/or modify it under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt included with the distribution). """ import time, htmlentitydefs, logging, socket, \ urllib2, urllib, httplib, sgmllib from urllib2 import URLError, HTTPError, BaseHandler from cStringIO import StringIO from _clientcookie import CookieJar from _headersutil import is_html from _html import unescape, unescape_charref from _request import Request from _response import closeable_response, response_seek_wrapper import _rfc3986 import _sockettimeout debug = logging.getLogger("mechanize").debug debug_robots = logging.getLogger("mechanize.robots").debug # monkeypatch urllib2.HTTPError to show URL ## def urllib2_str(self): ## return 'HTTP Error %s: %s (%s)' % ( ## self.code, self.msg, self.geturl()) ## urllib2.HTTPError.__str__ = urllib2_str CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes DEFAULT_ENCODING = 'latin-1' try: socket._fileobject("fake socket", close=True) except TypeError: # python <= 2.4 create_readline_wrapper = socket._fileobject else: def create_readline_wrapper(fh): return socket._fileobject(fh, close=True) # This adds "refresh" to the list of redirectables and provides a redirection # algorithm that doesn't go into a loop in the presence of cookies # (Python 2.4 has this new algorithm, 2.3 doesn't). class HTTPRedirectHandler(BaseHandler): # maximum number of redirections to any single URL # this is needed because of the state that cookies introduce max_repeats = 4 # maximum total number of redirections (regardless of URL) before # assuming we're in a loop max_redirections = 10 # Implementation notes: # To avoid the server sending us into an infinite loop, the request # object needs to track what URLs we have already seen. Do this by # adding a handler-specific attribute to the Request object. The value # of the dict is used to count the number of times the same URL has # been visited. This is needed because visiting the same URL twice # does not necessarily imply a loop, thanks to state introduced by # cookies. # Always unhandled redirection codes: # 300 Multiple Choices: should not handle this here. # 304 Not Modified: no need to handle here: only of interest to caches # that do conditional GETs # 305 Use Proxy: probably not worth dealing with here # 306 Unused: what was this for in the previous versions of protocol?? def redirect_request(self, newurl, req, fp, code, msg, headers): """Return a Request or None in response to a redirect. This is called by the http_error_30x methods when a redirection response is received. If a redirection should take place, return a new Request to allow http_error_30x to perform the redirect; otherwise, return None to indicate that an HTTPError should be raised. """ if code in (301, 302, 303, "refresh") or \ (code == 307 and not req.has_data()): # Strictly (according to RFC 2616), 301 or 302 in response to # a POST MUST NOT cause a redirection without confirmation # from the user (of urllib2, in this case). In practice, # essentially all clients do redirect in this case, so we do # the same. # XXX really refresh redirections should be visiting; tricky to # fix, so this will wait until post-stable release new = Request(newurl, headers=req.headers, origin_req_host=req.get_origin_req_host(), unverifiable=True, visit=False, ) new._origin_req = getattr(req, "_origin_req", req) return new else: raise HTTPError(req.get_full_url(), code, msg, headers, fp) def http_error_302(self, req, fp, code, msg, headers): # Some servers (incorrectly) return multiple Location headers # (so probably same goes for URI). Use first header. if headers.has_key('location'): newurl = headers.getheaders('location')[0] elif headers.has_key('uri'): newurl = headers.getheaders('uri')[0] else: return newurl = _rfc3986.clean_url(newurl, "latin-1") newurl = _rfc3986.urljoin(req.get_full_url(), newurl) # XXX Probably want to forget about the state of the current # request, although that might interact poorly with other # handlers that also use handler-specific request attributes new = self.redirect_request(newurl, req, fp, code, msg, headers) if new is None: return # loop detection # .redirect_dict has a key url if url was previously visited. if hasattr(req, 'redirect_dict'): visited = new.redirect_dict = req.redirect_dict if (visited.get(newurl, 0) >= self.max_repeats or len(visited) >= self.max_redirections): raise HTTPError(req.get_full_url(), code, self.inf_msg + msg, headers, fp) else: visited = new.redirect_dict = req.redirect_dict = {} visited[newurl] = visited.get(newurl, 0) + 1 # Don't close the fp until we are sure that we won't use it # with HTTPError. fp.read() fp.close() return self.parent.open(new) http_error_301 = http_error_303 = http_error_307 = http_error_302 http_error_refresh = http_error_302 inf_msg = "The HTTP server returned a redirect error that would " \ "lead to an infinite loop.\n" \ "The last 30x error message was:\n" # XXX would self.reset() work, instead of raising this exception? class EndOfHeadError(Exception): pass class AbstractHeadParser: # only these elements are allowed in or before HEAD of document head_elems = ("html", "head", "title", "base", "script", "style", "meta", "link", "object") _entitydefs = htmlentitydefs.name2codepoint _encoding = DEFAULT_ENCODING def __init__(self): self.http_equiv = [] def start_meta(self, attrs): http_equiv = content = None for key, value in attrs: if key == "http-equiv": http_equiv = self.unescape_attr_if_required(value) elif key == "content": content = self.unescape_attr_if_required(value) if http_equiv is not None and content is not None: self.http_equiv.append((http_equiv, content)) def end_head(self): raise EndOfHeadError() def handle_entityref(self, name): #debug("%s", name) self.handle_data(unescape( '&%s;' % name, self._entitydefs, self._encoding)) def handle_charref(self, name): #debug("%s", name) self.handle_data(unescape_charref(name, self._encoding)) def unescape_attr(self, name): #debug("%s", name) return unescape(name, self._entitydefs, self._encoding) def unescape_attrs(self, attrs): #debug("%s", attrs) escaped_attrs = {} for key, val in attrs.items(): escaped_attrs[key] = self.unescape_attr(val) return escaped_attrs def unknown_entityref(self, ref): self.handle_data("&%s;" % ref) def unknown_charref(self, ref): self.handle_data("&#%s;" % ref) try: import HTMLParser except ImportError: pass else: class XHTMLCompatibleHeadParser(AbstractHeadParser, HTMLParser.HTMLParser): def __init__(self): HTMLParser.HTMLParser.__init__(self) AbstractHeadParser.__init__(self) def handle_starttag(self, tag, attrs): if tag not in self.head_elems: raise EndOfHeadError() try: method = getattr(self, 'start_' + tag) except AttributeError: try: method = getattr(self, 'do_' + tag) except AttributeError: pass # unknown tag else: method(attrs) else: method(attrs) def handle_endtag(self, tag): if tag not in self.head_elems: raise EndOfHeadError() try: method = getattr(self, 'end_' + tag) except AttributeError: pass # unknown tag else: method() def unescape(self, name): # Use the entitydefs passed into constructor, not # HTMLParser.HTMLParser's entitydefs. return self.unescape_attr(name) def unescape_attr_if_required(self, name): return name # HTMLParser.HTMLParser already did it class HeadParser(AbstractHeadParser, sgmllib.SGMLParser): def _not_called(self): assert False def __init__(self): sgmllib.SGMLParser.__init__(self) AbstractHeadParser.__init__(self) def handle_starttag(self, tag, method, attrs): if tag not in self.head_elems: raise EndOfHeadError() if tag == "meta": method(attrs) def unknown_starttag(self, tag, attrs): self.handle_starttag(tag, self._not_called, attrs) def handle_endtag(self, tag, method): if tag in self.head_elems: method() else: raise EndOfHeadError() def unescape_attr_if_required(self, name): return self.unescape_attr(name) def parse_head(fileobj, parser): """Return a list of key, value pairs.""" while 1: data = fileobj.read(CHUNK) try: parser.feed(data) except EndOfHeadError: break if len(data) != CHUNK: # this should only happen if there is no HTML body, or if # CHUNK is big break return parser.http_equiv class HTTPEquivProcessor(BaseHandler): """Append META HTTP-EQUIV headers to regular HTTP headers.""" handler_order = 300 # before handlers that look at HTTP headers def __init__(self, head_parser_class=HeadParser, i_want_broken_xhtml_support=False, ): self.head_parser_class = head_parser_class self._allow_xhtml = i_want_broken_xhtml_support def http_response(self, request, response): if not hasattr(response, "seek"): response = response_seek_wrapper(response) http_message = response.info() url = response.geturl() ct_hdrs = http_message.getheaders("content-type") if is_html(ct_hdrs, url, self._allow_xhtml): try: try: html_headers = parse_head(response, self.head_parser_class()) finally: response.seek(0) except (HTMLParser.HTMLParseError, sgmllib.SGMLParseError): pass else: for hdr, val in html_headers: # add a header http_message.dict[hdr.lower()] = val text = hdr + ": " + val for line in text.split("\n"): http_message.headers.append(line + "\n") return response https_response = http_response class HTTPCookieProcessor(BaseHandler): """Handle HTTP cookies. Public attributes: cookiejar: CookieJar instance """ def __init__(self, cookiejar=None): if cookiejar is None: cookiejar = CookieJar() self.cookiejar = cookiejar def http_request(self, request): self.cookiejar.add_cookie_header(request) return request def http_response(self, request, response): self.cookiejar.extract_cookies(response, request) return response https_request = http_request https_response = http_response try: import robotparser except ImportError: pass else: class MechanizeRobotFileParser(robotparser.RobotFileParser): def __init__(self, url='', opener=None): robotparser.RobotFileParser.__init__(self, url) self._opener = opener self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT def set_opener(self, opener=None): import _opener if opener is None: opener = _opener.OpenerDirector() self._opener = opener def set_timeout(self, timeout): self._timeout = timeout def read(self): """Reads the robots.txt URL and feeds it to the parser.""" if self._opener is None: self.set_opener() req = Request(self.url, unverifiable=True, visit=False, timeout=self._timeout) try: f = self._opener.open(req) except HTTPError, f: pass except (IOError, socket.error, OSError), exc: debug_robots("ignoring error opening %r: %s" % (self.url, exc)) return lines = [] line = f.readline() while line: lines.append(line.strip()) line = f.readline() status = f.code if status == 401 or status == 403: self.disallow_all = True debug_robots("disallow all") elif status >= 400: self.allow_all = True debug_robots("allow all") elif status == 200 and lines: debug_robots("parse lines") self.parse(lines) class RobotExclusionError(urllib2.HTTPError): def __init__(self, request, *args): apply(urllib2.HTTPError.__init__, (self,) + args) self.request = request class HTTPRobotRulesProcessor(BaseHandler): # before redirections, after everything else handler_order = 800 try: from httplib import HTTPMessage except: from mimetools import Message http_response_class = Message else: http_response_class = HTTPMessage def __init__(self, rfp_class=MechanizeRobotFileParser): self.rfp_class = rfp_class self.rfp = None self._host = None def http_request(self, request): scheme = request.get_type() if scheme not in ["http", "https"]: # robots exclusion only applies to HTTP return request if request.get_selector() == "/robots.txt": # /robots.txt is always OK to fetch return request host = request.get_host() # robots.txt requests don't need to be allowed by robots.txt :-) origin_req = getattr(request, "_origin_req", None) if (origin_req is not None and origin_req.get_selector() == "/robots.txt" and origin_req.get_host() == host ): return request if host != self._host: self.rfp = self.rfp_class() try: self.rfp.set_opener(self.parent) except AttributeError: debug("%r instance does not support set_opener" % self.rfp.__class__) self.rfp.set_url(scheme + "://" + host + "/robots.txt") self.rfp.set_timeout(request.timeout) self.rfp.read() self._host = host ua = request.get_header("User-agent", "") if self.rfp.can_fetch(ua, request.get_full_url()): return request else: # XXX This should really have raised URLError. Too late now... msg = "request disallowed by robots.txt" raise RobotExclusionError( request, request.get_full_url(), 403, msg, self.http_response_class(StringIO()), StringIO(msg)) https_request = http_request class HTTPRefererProcessor(BaseHandler): """Add Referer header to requests. This only makes sense if you use each RefererProcessor for a single chain of requests only (so, for example, if you use a single HTTPRefererProcessor to fetch a series of URLs extracted from a single page, this will break). There's a proper implementation of this in mechanize.Browser. """ def __init__(self): self.referer = None def http_request(self, request): if ((self.referer is not None) and not request.has_header("Referer")): request.add_unredirected_header("Referer", self.referer) return request def http_response(self, request, response): self.referer = response.geturl() return response https_request = http_request https_response = http_response def clean_refresh_url(url): # e.g. Firefox 1.5 does (something like) this if ((url.startswith('"') and url.endswith('"')) or (url.startswith("'") and url.endswith("'"))): url = url[1:-1] return _rfc3986.clean_url(url, "latin-1") # XXX encoding def parse_refresh_header(refresh): """ >>> parse_refresh_header("1; url=http://example.com/") (1.0, 'http://example.com/') >>> parse_refresh_header("1; url='http://example.com/'") (1.0, 'http://example.com/') >>> parse_refresh_header("1") (1.0, None) >>> parse_refresh_header("blah") Traceback (most recent call last): ValueError: invalid literal for float(): blah """ ii = refresh.find(";") if ii != -1: pause, newurl_spec = float(refresh[:ii]), refresh[ii + 1:] jj = newurl_spec.find("=") key = None if jj != -1: key, newurl = newurl_spec[:jj], newurl_spec[jj + 1:] newurl = clean_refresh_url(newurl) if key is None or key.strip().lower() != "url": raise ValueError() else: pause, newurl = float(refresh), None return pause, newurl class HTTPRefreshProcessor(BaseHandler): """Perform HTTP Refresh redirections. Note that if a non-200 HTTP code has occurred (for example, a 30x redirect), this processor will do nothing. By default, only zero-time Refresh headers are redirected. Use the max_time attribute / constructor argument to allow Refresh with longer pauses. Use the honor_time attribute / constructor argument to control whether the requested pause is honoured (with a time.sleep()) or skipped in favour of immediate redirection. Public attributes: max_time: see above honor_time: see above """ handler_order = 1000 def __init__(self, max_time=0, honor_time=True): self.max_time = max_time self.honor_time = honor_time self._sleep = time.sleep def http_response(self, request, response): code, msg, hdrs = response.code, response.msg, response.info() if code == 200 and hdrs.has_key("refresh"): refresh = hdrs.getheaders("refresh")[0] try: pause, newurl = parse_refresh_header(refresh) except ValueError: debug("bad Refresh header: %r" % refresh) return response if newurl is None: newurl = response.geturl() if (self.max_time is None) or (pause <= self.max_time): if pause > 1E-3 and self.honor_time: self._sleep(pause) hdrs["location"] = newurl # hardcoded http is NOT a bug response = self.parent.error( "http", request, response, "refresh", msg, hdrs) else: debug("Refresh header ignored: %r" % refresh) return response https_response = http_response class HTTPErrorProcessor(BaseHandler): """Process HTTP error responses. The purpose of this handler is to to allow other response processors a look-in by removing the call to parent.error() from AbstractHTTPHandler. For non-200 error codes, this just passes the job on to the Handler._error_ methods, via the OpenerDirector.error method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an HTTPError if no other handler handles the error. """ handler_order = 1000 # after all other processors def http_response(self, request, response): code, msg, hdrs = response.code, response.msg, response.info() if code != 200: # hardcoded http is NOT a bug response = self.parent.error( "http", request, response, code, msg, hdrs) return response https_response = http_response class HTTPDefaultErrorHandler(BaseHandler): def http_error_default(self, req, fp, code, msg, hdrs): # why these error methods took the code, msg, headers args in the first # place rather than a response object, I don't know, but to avoid # multiple wrapping, we're discarding them if isinstance(fp, urllib2.HTTPError): response = fp else: response = urllib2.HTTPError( req.get_full_url(), code, msg, hdrs, fp) assert code == response.code assert msg == response.msg assert hdrs == response.hdrs raise response class AbstractHTTPHandler(BaseHandler): def __init__(self, debuglevel=0): self._debuglevel = debuglevel def set_http_debuglevel(self, level): self._debuglevel = level def do_request_(self, request): host = request.get_host() if not host: raise URLError('no host given') if request.has_data(): # POST data = request.get_data() if not request.has_header('Content-type'): request.add_unredirected_header( 'Content-type', 'application/x-www-form-urlencoded') if not request.has_header('Content-length'): request.add_unredirected_header( 'Content-length', '%d' % len(data)) scheme, sel = urllib.splittype(request.get_selector()) sel_host, sel_path = urllib.splithost(sel) if not request.has_header('Host'): request.add_unredirected_header('Host', sel_host or host) for name, value in self.parent.addheaders: name = name.capitalize() if not request.has_header(name): request.add_unredirected_header(name, value) return request def do_open(self, http_class, req): """Return an addinfourl object for the request, using http_class. http_class must implement the HTTPConnection API from httplib. The addinfourl return value is a file-like object. It also has methods and attributes including: - info(): return a mimetools.Message object for the headers - geturl(): return the original request URL - code: HTTP status code """ host_port = req.get_host() if not host_port: raise URLError('no host given') try: h = http_class(host_port, timeout=req.timeout) except TypeError: # Python < 2.6, no per-connection timeout support h = http_class(host_port) h.set_debuglevel(self._debuglevel) headers = dict(req.headers) headers.update(req.unredirected_hdrs) # We want to make an HTTP/1.1 request, but the addinfourl # class isn't prepared to deal with a persistent connection. # It will try to read all remaining data from the socket, # which will block while the server waits for the next request. # So make sure the connection gets closed after the (only) # request. headers["Connection"] = "close" headers = dict( [(name.title(), val) for name, val in headers.items()]) try: h.request(req.get_method(), req.get_selector(), req.data, headers) r = h.getresponse() except socket.error, err: # XXX what error? raise URLError(err) # Pick apart the HTTPResponse object to get the addinfourl # object initialized properly. # Wrap the HTTPResponse object in socket's file object adapter # for Windows. That adapter calls recv(), so delegate recv() # to read(). This weird wrapping allows the returned object to # have readline() and readlines() methods. # XXX It might be better to extract the read buffering code # out of socket._fileobject() and into a base class. r.recv = r.read fp = create_readline_wrapper(r) resp = closeable_response(fp, r.msg, req.get_full_url(), r.status, r.reason) return resp class HTTPHandler(AbstractHTTPHandler): def http_open(self, req): return self.do_open(httplib.HTTPConnection, req) http_request = AbstractHTTPHandler.do_request_ if hasattr(httplib, 'HTTPS'): class HTTPSConnectionFactory: def __init__(self, key_file, cert_file): self._key_file = key_file self._cert_file = cert_file def __call__(self, hostport): return httplib.HTTPSConnection( hostport, key_file=self._key_file, cert_file=self._cert_file) class HTTPSHandler(AbstractHTTPHandler): def __init__(self, client_cert_manager=None): AbstractHTTPHandler.__init__(self) self.client_cert_manager = client_cert_manager def https_open(self, req): if self.client_cert_manager is not None: key_file, cert_file = self.client_cert_manager.find_key_cert( req.get_full_url()) conn_factory = HTTPSConnectionFactory(key_file, cert_file) else: conn_factory = httplib.HTTPSConnection return self.do_open(conn_factory, req) https_request = AbstractHTTPHandler.do_request_