source: py-scraping/mechanize/_useragent.py@ 215

Last change on this file since 215 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 13.3 KB
RevLine 
[106]1"""Convenient HTTP UserAgent class.
2
3This is a subclass of urllib2.OpenerDirector.
4
5
6Copyright 2003-2006 John J. Lee <jjl@pobox.com>
7
8This code is free software; you can redistribute it and/or modify it under
9the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
10included with the distribution).
11
12"""
13
14import warnings
15
16import _auth
17import _gzip
18import _opener
19import _response
20import _sockettimeout
21import _urllib2
22
23
class UserAgentBase(_opener.OpenerDirector):
    """Convenient user-agent class.

    Do not use .add_handler() to add a handler for something already dealt
    with by this code.

    The only reason at present for the distinction between UserAgent and
    UserAgentBase is so that classes that depend on .seek()able responses
    (e.g. mechanize.Browser) can inherit from UserAgentBase.  The subclass
    UserAgent exposes a .set_seekable_responses() method that allows
    switching off the adding of a .seek() method to responses.

    Public attributes:

    addheaders: list of (name, value) pairs specifying headers to send with
     every request, unless they are overridden in the Request instance.

     >>> ua = UserAgentBase()
     >>> ua.addheaders = [
     ...  ("User-agent", "Mozilla/5.0 (compatible)"),
     ...  ("From", "responsible.person@example.com")]

    """

    # Maps a handler name to the class used to build that handler.  Names
    # that start with "_" are features / internal handlers; all other names
    # are URL schemes.
    handler_classes = {
        # scheme handlers
        "http": _urllib2.HTTPHandler,
        # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
        "ftp": _urllib2.FTPHandler,
        "file": _urllib2.FileHandler,

        # other handlers
        "_unknown": _urllib2.UnknownHandler,
        # HTTP{S,}Handler depend on HTTPErrorProcessor too
        "_http_error": _urllib2.HTTPErrorProcessor,
        "_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
        "_http_default_error": _urllib2.HTTPDefaultErrorHandler,

        # feature handlers
        "_basicauth": _urllib2.HTTPBasicAuthHandler,
        "_digestauth": _urllib2.HTTPDigestAuthHandler,
        "_redirect": _urllib2.HTTPRedirectHandler,
        "_cookies": _urllib2.HTTPCookieProcessor,
        "_refresh": _urllib2.HTTPRefreshProcessor,
        "_equiv": _urllib2.HTTPEquivProcessor,
        "_proxy": _urllib2.ProxyHandler,
        "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
        "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
        "_robots": _urllib2.HTTPRobotRulesProcessor,
        "_gzip": _gzip.HTTPGzipProcessor,  # experimental!

        # debug handlers
        "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
        "_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
        }

    # Names (keys of handler_classes) instantiated by __init__.
    default_schemes = ["http", "ftp", "file"]
    default_others = ["_unknown", "_http_error", "_http_request_upgrade",
                      "_http_default_error",
                      ]
    default_features = ["_redirect", "_cookies",
                        "_refresh", "_equiv",
                        "_basicauth", "_digestauth",
                        "_proxy", "_proxy_basicauth", "_proxy_digestauth",
                        "_robots",
                        ]
    # HTTPS is only registered when _urllib2 exposes an HTTPSHandler.
    if hasattr(_urllib2, 'HTTPSHandler'):
        handler_classes["https"] = _urllib2.HTTPSHandler
        default_schemes.append("https")

    def __init__(self):
        """Create the opener and install every default handler."""
        _opener.OpenerDirector.__init__(self)

        # name -> handler instance, for every handler managed by this class
        ua_handlers = self._ua_handlers = {}
        for scheme in (self.default_schemes +
                       self.default_others +
                       self.default_features):
            klass = self.handler_classes[scheme]
            ua_handlers[scheme] = klass()
        for handler in ua_handlers.itervalues():
            self.add_handler(handler)

        # Yuck.
        # Ensure correct default constructor args were passed to
        # HTTPRefreshProcessor and HTTPEquivProcessor.
        if "_refresh" in ua_handlers:
            self.set_handle_refresh(True)
        if "_equiv" in ua_handlers:
            self.set_handle_equiv(True)
        # Ensure default password managers are installed.
        pm = ppm = None
        if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
            pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
        if ("_proxy_basicauth" in ua_handlers or
            "_proxy_digestauth" in ua_handlers):
            ppm = _auth.HTTPProxyPasswordMgr()
        self.set_password_manager(pm)
        self.set_proxy_password_manager(ppm)
        # set default certificate manager
        if "https" in ua_handlers:
            cm = _urllib2.HTTPSClientCertMgr()
            self.set_client_cert_manager(cm)

    def close(self):
        """Close the underlying opener and drop the handler registry."""
        _opener.OpenerDirector.close(self)
        self._ua_handlers = None

    # XXX
##     def set_timeout(self, timeout):
##         self._timeout = timeout
##     def set_http_connection_cache(self, conn_cache):
##         self._http_conn_cache = conn_cache
##     def set_ftp_connection_cache(self, conn_cache):
##         # XXX ATM, FTP has cache as part of handler; should it be separate?
##         self._ftp_conn_cache = conn_cache

    def set_handled_schemes(self, schemes):
        """Set sequence of URL scheme (protocol) strings.

        For example: ua.set_handled_schemes(["http", "ftp"])

        If this fails (with ValueError) because you've passed an unknown
        scheme, the set of handled schemes will not be changed.

        """
        # Validate everything up front so that a bad name cannot leave the
        # handlers half-updated.
        want = {}
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("not a scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                raise ValueError("unknown scheme '%s'" % scheme)
            want[scheme] = None

        # get rid of scheme handlers we don't want
        for scheme, oldhandler in list(self._ua_handlers.items()):
            if scheme.startswith("_"):
                continue  # not a scheme handler
            if scheme not in want:
                self._replace_handler(scheme, None)
            elif oldhandler is not None and oldhandler in self.handlers:
                # Only a handler that is still installed counts as "already
                # got it"; an entry left over from an earlier removal must
                # be re-created below.
                del want[scheme]
        # add the scheme handlers that are missing
        for scheme in want.keys():
            self._set_handler(scheme, True)

    def set_cookiejar(self, cookiejar):
        """Set a mechanize.CookieJar, or None."""
        self._set_handler("_cookies", obj=cookiejar)

    # XXX could use Greg Stein's httpx for some of this instead?
    # or httplib2??
    def set_proxies(self, proxies):
        """Set a dictionary mapping URL scheme to proxy specification, or None.

        e.g. {"http": "joe:password@myproxy.example.com:3128",
              "ftp": "proxy.example.com"}

        """
        self._set_handler("_proxy", obj=proxies)

    def add_password(self, url, user, password, realm=None):
        """Register credentials with the current password manager."""
        self._password_manager.add_password(realm, url, user, password)

    def add_proxy_password(self, user, password, hostport=None, realm=None):
        """Register credentials with the current proxy password manager."""
        self._proxy_password_manager.add_password(
            realm, hostport, user, password)

    def add_client_certificate(self, url, key_file, cert_file):
        """Add an SSL client certificate, for HTTPS client auth.

        key_file and cert_file must be filenames of the key and certificate
        files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
        12) file to PEM format:

        openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
        openssl pkcs12 -nocerts -in cert.p12 -out key.pem


        Note that client certificate password input is very inflexible ATM.
        At the moment this seems to be console only, which is presumably the
        default behaviour of libopenssl.  In future mechanize may support
        third-party libraries that (I assume) allow more options here.

        """
        self._client_cert_manager.add_key_cert(url, key_file, cert_file)

    # the following are rarely useful -- use add_password / add_proxy_password
    # instead
    def set_password_manager(self, password_manager):
        """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
        self._password_manager = password_manager
        self._set_handler("_basicauth", obj=password_manager)
        self._set_handler("_digestauth", obj=password_manager)

    def set_proxy_password_manager(self, password_manager):
        """Set a mechanize.HTTPProxyPasswordMgr, or None."""
        self._proxy_password_manager = password_manager
        self._set_handler("_proxy_basicauth", obj=password_manager)
        self._set_handler("_proxy_digestauth", obj=password_manager)

    def set_client_cert_manager(self, cert_manager):
        """Set a mechanize.HTTPSClientCertMgr, or None.

        The manager is attached to the handler registered under "https";
        KeyError is raised if no such entry exists.

        """
        self._client_cert_manager = cert_manager
        handler = self._ua_handlers["https"]
        handler.client_cert_manager = cert_manager

    # these methods all take a boolean parameter
    def set_handle_robots(self, handle):
        """Set whether to observe rules from robots.txt."""
        self._set_handler("_robots", handle)

    def set_handle_redirect(self, handle):
        """Set whether to handle HTTP 30x redirections."""
        self._set_handler("_redirect", handle)

    def set_handle_refresh(self, handle, max_time=None, honor_time=True):
        """Set whether to handle HTTP Refresh headers.

        max_time and honor_time are passed as keyword arguments to the
        constructor of the "_refresh" handler class.

        """
        self._set_handler("_refresh", handle, constructor_kwds=
                          {"max_time": max_time, "honor_time": honor_time})

    def set_handle_equiv(self, handle, head_parser_class=None):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        if head_parser_class is not None:
            constructor_kwds = {"head_parser_class": head_parser_class}
        else:
            constructor_kwds = {}
        self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)

    def set_handle_gzip(self, handle):
        """Handle gzip transfer encoding.

        A warning is issued when this is switched on, because the feature is
        experimental.

        """
        if handle:
            warnings.warn(
                "gzip transfer encoding is experimental!", stacklevel=2)
        self._set_handler("_gzip", handle)

    def set_debug_redirects(self, handle):
        """Log information about HTTP redirects (including refreshes).

        Logging is performed using module logging.  The logger name is
        "mechanize.http_redirects".  To actually print some debug output,
        eg:

        import sys, logging
        logger = logging.getLogger("mechanize.http_redirects")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        Other logger names relevant to this module:

        "mechanize.http_responses"
        "mechanize.cookies" (or "cookielib" if running Python 2.4)

        To turn on everything:

        import sys, logging
        logger = logging.getLogger("mechanize")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        """
        self._set_handler("_debug_redirect", handle)

    def set_debug_responses(self, handle):
        """Log HTTP response bodies.

        See docstring for .set_debug_redirects() for details of logging.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        self._set_handler("_debug_response_body", handle)

    def set_debug_http(self, handle):
        """Print HTTP headers to sys.stdout."""
        level = int(bool(handle))
        for scheme in "http", "https":
            h = self._ua_handlers.get(scheme)
            if h is not None:
                h.set_http_debuglevel(level)

    def _set_handler(self, name, handle=None, obj=None,
                     constructor_args=(), constructor_kwds=None):
        """Install, replace or remove the handler registered under name.

        name: key into handler_classes.
        handle: true to install a handler, false to remove it.  If None, a
         handler is installed exactly when obj is not None.
        obj: if not None, passed as the only constructor argument.
        constructor_args, constructor_kwds: constructor arguments used when
         obj is None.

        """
        # None rather than {} as the default, so that no dict object is
        # shared between calls.
        if constructor_kwds is None:
            constructor_kwds = {}
        if handle is None:
            handle = obj is not None
        if handle:
            handler_class = self.handler_classes[name]
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class(
                    *constructor_args, **constructor_kwds)
        else:
            newhandler = None
        self._replace_handler(name, newhandler)

    def _replace_handler(self, name, newhandler=None):
        """Remove the handler registered under name, then add newhandler.

        When newhandler is None the old handler is only taken out of
        self.handlers; its entry in self._ua_handlers is left in place.

        """
        # first, if handler was previously added, remove it
        if name is not None:
            handler = self._ua_handlers.get(name)
            if handler:
                try:
                    self.handlers.remove(handler)
                except ValueError:
                    # not currently installed; nothing to remove
                    pass
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)
            self._ua_handlers[name] = newhandler
329
330
class UserAgent(UserAgentBase):
    """UserAgentBase with switchable .seek()able responses.

    Responses are not wrapped by default; call
    .set_seekable_responses(True) to turn wrapping on.

    """

    def __init__(self):
        UserAgentBase.__init__(self)
        self._seekable = False

    def set_seekable_responses(self, handle):
        """Make response objects .seek()able."""
        self._seekable = bool(handle)

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Open fullurl and return the response.

        When seekable responses are enabled, the call goes through
        _opener.wrapped_open with _response.seek_wrapped_response.  timeout
        is passed on to UserAgentBase.open in both cases.

        """
        if self._seekable:
            def bound_open(fullurl, data=None,
                           timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
                return UserAgentBase.open(self, fullurl, data, timeout)
            response = _opener.wrapped_open(
                bound_open, _response.seek_wrapped_response, fullurl, data,
                timeout)
        else:
            # Forward timeout here too, so a caller-supplied value is not
            # silently replaced by the default.
            response = UserAgentBase.open(self, fullurl, data, timeout)
        return response
Note: See TracBrowser for help on using the repository browser.