[106] | 1 | """Convenient HTTP UserAgent class.
|
---|
| 2 |
|
---|
| 3 | This is a subclass of urllib2.OpenerDirector.
|
---|
| 4 |
|
---|
| 5 |
|
---|
| 6 | Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
---|
| 7 |
|
---|
| 8 | This code is free software; you can redistribute it and/or modify it under
|
---|
| 9 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
---|
| 10 | included with the distribution).
|
---|
| 11 |
|
---|
| 12 | """
|
---|
| 13 |
|
---|
| 14 | import warnings
|
---|
| 15 |
|
---|
| 16 | import _auth
|
---|
| 17 | import _gzip
|
---|
| 18 | import _opener
|
---|
| 19 | import _response
|
---|
| 20 | import _sockettimeout
|
---|
| 21 | import _urllib2
|
---|
| 22 |
|
---|
| 23 |
|
---|
class UserAgentBase(_opener.OpenerDirector):
    """Convenient user-agent class.

    Do not use .add_handler() to add a handler for something already dealt with
    by this code.

    The only reason at present for the distinction between UserAgent and
    UserAgentBase is so that classes that depend on .seek()able responses
    (e.g. mechanize.Browser) can inherit from UserAgentBase.  The subclass
    UserAgent exposes a .set_seekable_responses() method that allows switching
    off the adding of a .seek() method to responses.

    Public attributes:

    addheaders: list of (name, value) pairs specifying headers to send with
     every request, unless they are overridden in the Request instance.

     >>> ua = UserAgentBase()
     >>> ua.addheaders = [
     ...  ("User-agent", "Mozilla/5.0 (compatible)"),
     ...  ("From", "responsible.person@example.com")]

    """

    # Maps a handler name to the class used to build that handler.  Names
    # without a leading underscore are URL schemes; names with a leading
    # underscore are non-scheme handlers.
    handler_classes = {
        # scheme handlers
        "http": _urllib2.HTTPHandler,
        # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
        "ftp": _urllib2.FTPHandler,
        "file": _urllib2.FileHandler,

        # other handlers
        "_unknown": _urllib2.UnknownHandler,
        # HTTP{S,}Handler depend on HTTPErrorProcessor too
        "_http_error": _urllib2.HTTPErrorProcessor,
        "_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
        "_http_default_error": _urllib2.HTTPDefaultErrorHandler,

        # feature handlers
        "_basicauth": _urllib2.HTTPBasicAuthHandler,
        "_digestauth": _urllib2.HTTPDigestAuthHandler,
        "_redirect": _urllib2.HTTPRedirectHandler,
        "_cookies": _urllib2.HTTPCookieProcessor,
        "_refresh": _urllib2.HTTPRefreshProcessor,
        "_equiv": _urllib2.HTTPEquivProcessor,
        "_proxy": _urllib2.ProxyHandler,
        "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
        "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
        "_robots": _urllib2.HTTPRobotRulesProcessor,
        "_gzip": _gzip.HTTPGzipProcessor,  # experimental!

        # debug handlers
        "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
        "_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
    }

    # Names of the handlers that __init__ instantiates by default.
    default_schemes = ["http", "ftp", "file"]
    default_others = [
        "_unknown", "_http_error", "_http_request_upgrade",
        "_http_default_error",
    ]
    default_features = [
        "_redirect", "_cookies",
        "_refresh", "_equiv",
        "_basicauth", "_digestauth",
        "_proxy", "_proxy_basicauth", "_proxy_digestauth",
        "_robots",
    ]
    # HTTPS is only registered when _urllib2 exposes an HTTPSHandler.
    if hasattr(_urllib2, 'HTTPSHandler'):
        handler_classes["https"] = _urllib2.HTTPSHandler
        default_schemes.append("https")

    def __init__(self):
        """Create the opener and install all default handlers."""
        _opener.OpenerDirector.__init__(self)

        # name -> handler instance, for every handler managed by this class
        ua_handlers = self._ua_handlers = {}
        for scheme in (self.default_schemes +
                       self.default_others +
                       self.default_features):
            klass = self.handler_classes[scheme]
            ua_handlers[scheme] = klass()
        for handler in ua_handlers.values():
            self.add_handler(handler)

        # Yuck.
        # Ensure correct default constructor args were passed to
        # HTTPRefreshProcessor and HTTPEquivProcessor.
        if "_refresh" in ua_handlers:
            self.set_handle_refresh(True)
        if "_equiv" in ua_handlers:
            self.set_handle_equiv(True)
        # Ensure default password managers are installed.
        pm = ppm = None
        if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
            pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
        if ("_proxy_basicauth" in ua_handlers or
                "_proxy_digestauth" in ua_handlers):
            ppm = _auth.HTTPProxyPasswordMgr()
        self.set_password_manager(pm)
        self.set_proxy_password_manager(ppm)
        # set default certificate manager
        if "https" in ua_handlers:
            cm = _urllib2.HTTPSClientCertMgr()
            self.set_client_cert_manager(cm)

    def close(self):
        """Close the opener and drop the table of managed handlers."""
        _opener.OpenerDirector.close(self)
        self._ua_handlers = None

    # XXX
##     def set_timeout(self, timeout):
##         self._timeout = timeout
##     def set_http_connection_cache(self, conn_cache):
##         self._http_conn_cache = conn_cache
##     def set_ftp_connection_cache(self, conn_cache):
##         # XXX ATM, FTP has cache as part of handler; should it be separate?
##         self._ftp_conn_cache = conn_cache

    def set_handled_schemes(self, schemes):
        """Set sequence of URL scheme (protocol) strings.

        For example: ua.set_handled_schemes(["http", "ftp"])

        If this fails (with ValueError) because you've passed an unknown
        scheme, the set of handled schemes will not be changed.

        """
        # Validate everything first so a bad scheme leaves the agent untouched.
        wanted = set()
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("not a scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                raise ValueError("unknown scheme '%s'" % scheme)
            wanted.add(scheme)

        # get rid of scheme handlers we don't want
        for scheme, oldhandler in list(self._ua_handlers.items()):
            if scheme.startswith("_"):
                continue  # not a scheme handler
            if scheme not in wanted:
                self._replace_handler(scheme, None)
            elif oldhandler is not None and oldhandler in self.handlers:
                # Only count the scheme as "already got" when its handler is
                # really installed; an entry left over from an earlier
                # removal must be re-created below.
                wanted.discard(scheme)
        # add the scheme handlers that are missing
        for scheme in wanted:
            self._set_handler(scheme, True)

    def set_cookiejar(self, cookiejar):
        """Set a mechanize.CookieJar, or None."""
        self._set_handler("_cookies", obj=cookiejar)

    # XXX could use Greg Stein's httpx for some of this instead?
    # or httplib2??
    def set_proxies(self, proxies):
        """Set a dictionary mapping URL scheme to proxy specification, or None.

        e.g. {"http": "joe:password@myproxy.example.com:3128",
              "ftp": "proxy.example.com"}

        """
        self._set_handler("_proxy", obj=proxies)

    def add_password(self, url, user, password, realm=None):
        """Register credentials for url with the current password manager.

        Requires a password manager to be set (see set_password_manager).

        """
        self._password_manager.add_password(realm, url, user, password)

    def add_proxy_password(self, user, password, hostport=None, realm=None):
        """Register proxy credentials with the current proxy password manager.

        Requires a proxy password manager to be set (see
        set_proxy_password_manager).

        """
        self._proxy_password_manager.add_password(
            realm, hostport, user, password)

    def add_client_certificate(self, url, key_file, cert_file):
        """Add an SSL client certificate, for HTTPS client auth.

        key_file and cert_file must be filenames of the key and certificate
        files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
        12) file to PEM format:

        openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
        openssl pkcs12 -nocerts -in cert.p12 -out key.pem


        Note that client certificate password input is very inflexible ATM.  At
        the moment this seems to be console only, which is presumably the
        default behaviour of libopenssl.  In future mechanize may support
        third-party libraries that (I assume) allow more options here.

        """
        self._client_cert_manager.add_key_cert(url, key_file, cert_file)

    # the following are rarely useful -- use add_password / add_proxy_password
    # instead
    def set_password_manager(self, password_manager):
        """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
        self._password_manager = password_manager
        self._set_handler("_basicauth", obj=password_manager)
        self._set_handler("_digestauth", obj=password_manager)

    def set_proxy_password_manager(self, password_manager):
        """Set a mechanize.HTTPProxyPasswordMgr, or None."""
        self._proxy_password_manager = password_manager
        self._set_handler("_proxy_basicauth", obj=password_manager)
        self._set_handler("_proxy_digestauth", obj=password_manager)

    def set_client_cert_manager(self, cert_manager):
        """Set a mechanize.HTTPClientCertMgr, or None.

        The manager is always remembered on the agent; it is attached to the
        HTTPS handler only when such a handler is registered.

        """
        self._client_cert_manager = cert_manager
        # HTTPS may be unavailable, so don't assume the entry exists.
        handler = self._ua_handlers.get("https")
        if handler is not None:
            handler.client_cert_manager = cert_manager

    # these methods all take a boolean parameter
    def set_handle_robots(self, handle):
        """Set whether to observe rules from robots.txt."""
        self._set_handler("_robots", handle)

    def set_handle_redirect(self, handle):
        """Set whether to handle HTTP 30x redirections."""
        self._set_handler("_redirect", handle)

    def set_handle_refresh(self, handle, max_time=None, honor_time=True):
        """Set whether to handle HTTP Refresh headers."""
        refresh_kwds = {"max_time": max_time, "honor_time": honor_time}
        self._set_handler("_refresh", handle, constructor_kwds=refresh_kwds)

    def set_handle_equiv(self, handle, head_parser_class=None):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        if head_parser_class is not None:
            constructor_kwds = {"head_parser_class": head_parser_class}
        else:
            constructor_kwds = {}
        self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)

    def set_handle_gzip(self, handle):
        """Handle gzip transfer encoding.

        """
        if handle:
            warnings.warn(
                "gzip transfer encoding is experimental!", stacklevel=2)
        self._set_handler("_gzip", handle)

    def set_debug_redirects(self, handle):
        """Log information about HTTP redirects (including refreshes).

        Logging is performed using module logging.  The logger name is
        "mechanize.http_redirects".  To actually print some debug output,
        eg:

        import sys, logging
        logger = logging.getLogger("mechanize.http_redirects")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        Other logger names relevant to this module:

        "mechanize.http_responses"
        "mechanize.cookies" (or "cookielib" if running Python 2.4)

        To turn on everything:

        import sys, logging
        logger = logging.getLogger("mechanize")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        """
        self._set_handler("_debug_redirect", handle)

    def set_debug_responses(self, handle):
        """Log HTTP response bodies.

        See docstring for .set_debug_redirects() for details of logging.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        self._set_handler("_debug_response_body", handle)

    def set_debug_http(self, handle):
        """Print HTTP headers to sys.stdout."""
        level = int(bool(handle))
        for scheme in "http", "https":
            h = self._ua_handlers.get(scheme)
            if h is not None:
                h.set_http_debuglevel(level)

    def _set_handler(self, name, handle=None, obj=None,
                     constructor_args=(), constructor_kwds=None):
        """Install, replace or remove the handler registered under name.

        name: key into handler_classes.
        handle: true to install a fresh handler, false to remove the current
         one.  If None, the handler is installed exactly when obj is not None.
        obj: if not None, passed as the only constructor argument of the
         handler class; constructor_args / constructor_kwds are then unused.
        constructor_args, constructor_kwds: positional / keyword arguments
         for the handler class when obj is None.

        """
        if constructor_kwds is None:
            constructor_kwds = {}
        if handle is None:
            handle = obj is not None
        if handle:
            handler_class = self.handler_classes[name]
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class(
                    *constructor_args, **constructor_kwds)
        else:
            newhandler = None
        self._replace_handler(name, newhandler)

    def _replace_handler(self, name, newhandler=None):
        """Swap the handler registered under name for newhandler.

        The handler currently recorded under name, if any, is removed from
        self.handlers (it is not an error if it is not there).  If newhandler
        is not None it is added to the opener and recorded under name.

        """
        # first, if handler was previously added, remove it
        if name is not None:
            handler = self._ua_handlers.get(name)
            if handler:
                try:
                    self.handlers.remove(handler)
                except ValueError:
                    # already absent from the opener; nothing to remove
                    pass
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)
            self._ua_handlers[name] = newhandler
| 329 |
|
---|
| 330 |
|
---|
class UserAgent(UserAgentBase):
    """UserAgentBase whose responses are only .seek()able on request.

    Seekable responses are off by default; switch them on with
    .set_seekable_responses(True).

    """

    def __init__(self):
        """Create the agent with seekable responses switched off."""
        UserAgentBase.__init__(self)
        self._seekable = False

    def set_seekable_responses(self, handle):
        """Make response objects .seek()able."""
        self._seekable = bool(handle)

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Open fullurl and return the response.

        fullurl, data and timeout are forwarded to UserAgentBase.open().  When
        seekable responses are enabled, the call is routed through
        _opener.wrapped_open() with _response.seek_wrapped_response.

        """
        if self._seekable:
            def bound_open(fullurl, data=None,
                           timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
                return UserAgentBase.open(self, fullurl, data, timeout)
            response = _opener.wrapped_open(
                bound_open, _response.seek_wrapped_response, fullurl, data,
                timeout)
        else:
            # Forward the timeout here too, so it is honoured whether or not
            # seekable responses are enabled.
            response = UserAgentBase.open(self, fullurl, data, timeout)
        return response