source: py-scraping/mechanize/_auth.py@ 112

Last change on this file since 112 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 18.4 KB
Line 
1"""HTTP Authentication and Proxy support.
2
3All but HTTPProxyPasswordMgr come from Python 2.5.
4
5
6Copyright 2006 John J. Lee <jjl@pobox.com>
7
8This code is free software; you can redistribute it and/or modify it under
9the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
10included with the distribution).
11
12"""
13
14import base64
15import copy
16import os
17import posixpath
18import random
19import re
20import time
21import urlparse
22
# Pick the digest implementation once at import time.  hashlib is preferred;
# when it cannot be imported (it was added in Python 2.5) the legacy md5 and
# sha modules are used instead.  Both branches expose the same two helpers.
try:
    import hashlib
except ImportError:
    import md5
    import sha

    def sha1_digest(bytes):
        """Return the hexadecimal SHA-1 digest of the byte string `bytes`."""
        return sha.new(bytes).hexdigest()

    def md5_digest(bytes):
        """Return the hexadecimal MD5 digest of the byte string `bytes`."""
        return md5.new(bytes).hexdigest()
else:
    def sha1_digest(bytes):
        """Return the hexadecimal SHA-1 digest of the byte string `bytes`."""
        return hashlib.sha1(bytes).hexdigest()

    def md5_digest(bytes):
        """Return the hexadecimal MD5 digest of the byte string `bytes`."""
        return hashlib.md5(bytes).hexdigest()
37
38from urllib2 import BaseHandler, HTTPError, parse_keqv_list, parse_http_list
39from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
40 splitport
41
42
43def _parse_proxy(proxy):
44 """Return (scheme, user, password, host/port) given a URL or an authority.
45
46 If a URL is supplied, it must have an authority (host:port) component.
47 According to RFC 3986, having an authority component means the URL must
48 have two slashes after the scheme:
49
50 >>> _parse_proxy('file:/ftp.example.com/')
51 Traceback (most recent call last):
52 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
53
54 The first three items of the returned tuple may be None.
55
56 Examples of authority parsing:
57
58 >>> _parse_proxy('proxy.example.com')
59 (None, None, None, 'proxy.example.com')
60 >>> _parse_proxy('proxy.example.com:3128')
61 (None, None, None, 'proxy.example.com:3128')
62
63 The authority component may optionally include userinfo (assumed to be
64 username:password):
65
66 >>> _parse_proxy('joe:password@proxy.example.com')
67 (None, 'joe', 'password', 'proxy.example.com')
68 >>> _parse_proxy('joe:password@proxy.example.com:3128')
69 (None, 'joe', 'password', 'proxy.example.com:3128')
70
71 Same examples, but with URLs instead:
72
73 >>> _parse_proxy('http://proxy.example.com/')
74 ('http', None, None, 'proxy.example.com')
75 >>> _parse_proxy('http://proxy.example.com:3128/')
76 ('http', None, None, 'proxy.example.com:3128')
77 >>> _parse_proxy('http://joe:password@proxy.example.com/')
78 ('http', 'joe', 'password', 'proxy.example.com')
79 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
80 ('http', 'joe', 'password', 'proxy.example.com:3128')
81
82 Everything after the authority is ignored:
83
84 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
85 ('ftp', 'joe', 'password', 'proxy.example.com')
86
87 Test for no trailing '/' case:
88
89 >>> _parse_proxy('http://joe:password@proxy.example.com')
90 ('http', 'joe', 'password', 'proxy.example.com')
91
92 """
93 scheme, r_scheme = splittype(proxy)
94 if not r_scheme.startswith("/"):
95 # authority
96 scheme = None
97 authority = proxy
98 else:
99 # URL
100 if not r_scheme.startswith("//"):
101 raise ValueError("proxy URL with no authority: %r" % proxy)
102 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
103 # and 3.3.), path is empty or starts with '/'
104 end = r_scheme.find("/", 2)
105 if end == -1:
106 end = None
107 authority = r_scheme[2:end]
108 userinfo, hostport = splituser(authority)
109 if userinfo is not None:
110 user, password = splitpasswd(userinfo)
111 else:
112 user = password = None
113 return scheme, user, password, hostport
114
class ProxyHandler(BaseHandler):
    """Route requests through proxies, one `<scheme>_open` method per proxy.

    `proxies` maps a URL scheme to a proxy URL or authority; when omitted,
    the mapping returned by getproxies() is used.
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        # Only .items() is used below, so that is the attribute checked.
        assert hasattr(proxies, 'items'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Bind the per-iteration values as lambda defaults; a plain
            # closure would see only the last scheme/url of the loop.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point `req` at `proxy`, adding Basic credentials if it has any.

        Returns None when the proxy scheme equals the request's original
        scheme (other handlers then process the request); otherwise returns
        the result of re-opening the request through the parent opener.
        """
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            # b64encode never inserts line breaks.  encodestring() wraps its
            # output with a newline every 76 characters, which would put a
            # raw newline inside the header value for long credentials.
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
151
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and reduced URI."""

    def __init__(self):
        # {realm: {tuple of reduced URIs: (user, passwd)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for `realm` at `uri` (one URI or a sequence).

        Each entry is stored twice: once with the scheme's default port
        filled in and once without, so later lookups match either spelling.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for `authuri` in `realm`.

        Returns (None, None) when no stored URI is a parent of `authuri`.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the implicit port explicit for the schemes we know.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare on path-segment boundaries.  A plain character prefix test
        # would wrongly treat '/foobar' as lying below '/foo'.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
214
215
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to credentials stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up `authuri` in `realm`, then in the default (None) realm."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
224
225
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Subclasses supply `auth_header`, the name of the request header that
    carries the credentials.
    """

    # Captures the auth scheme and the quoted realm of a challenge.
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry `req` with Basic credentials if the challenge asks for them.

        `authreq` is the name of the challenge header to read from `headers`.
        Returns None when the header is absent, does not parse, or names a
        scheme other than Basic.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue `req` with the credentials stored for `realm` at `host`.

        Returns None when no password is known, or when `req` already
        carried exactly these credentials (retrying would fail again).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            # b64encode never inserts line breaks.  encodestring() wraps its
            # output with a newline every 76 characters, which would put a
            # raw newline inside the header value for long credentials.
            auth = 'Basic %s' % base64.b64encode(raw).strip()
            if req.headers.get(self.auth_header, None) == auth:
                return None
            newreq = copy.copy(req)
            newreq.add_header(self.auth_header, auth)
            newreq.visit = False
            return self.parent.open(newreq)
        else:
            return None
265
266
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses that carry a Basic challenge."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by retrying with credentials for the request URL."""
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)
275
276
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses that carry a Basic challenge from a proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by retrying with credentials for the request host."""
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)
289
290
def randombytes(n):
    """Return n random bytes."""
    # os.urandom uses the operating system's randomness source, so there is
    # no file handle to manage here.  It raises NotImplementedError when no
    # such source exists; only then fall back to the random module.
    try:
        return os.urandom(n)
    except NotImplementedError:
        return "".join([chr(random.randrange(0, 256)) for _ in range(n)])
304
class AbstractDigestAuthHandler:
    """Shared logic for answering Digest authentication challenges.

    Subclasses supply `auth_header`, the name of the request header that
    carries the credentials.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        """Forget how many retries have been attempted so far."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry `req` with Digest credentials if the challenge asks for them.

        Raises HTTPError once more than five retries have accumulated.
        Returns None when the challenge header is missing, blank, or names a
        scheme other than Digest.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            # A whitespace-only header yields no words; treat it as absent
            # instead of failing on words[0].
            words = authreq.split()
            if words and words[0].lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue `req` with an Authorization value answering `auth`.

        Returns None when the challenge cannot be answered, or when `req`
        already carried exactly the header that would be sent.
        """
        pieces = auth.split(' ', 1)
        if len(pieces) != 2:
            # Scheme token with no challenge parameters: nothing to answer.
            return None
        token, challenge = pieces
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            newreq = copy.copy(req)
            newreq.add_unredirected_header(self.auth_header, auth_val)
            newreq.visit = False
            return self.parent.open(newreq)

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce derived from `nonce`."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = sha1_digest("%s:%s:%s:%s" % (self.nonce_count, nonce,
                                           time.ctime(), randombytes(8)))
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string answering challenge `chal`.

        `chal` is the parsed challenge as a dict.  Returns None when the
        challenge lacks a realm or nonce, names an unsupported algorithm or
        qop, or no user is known for the realm and request URL.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        qop = chal.get('qop')
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        # The server may offer several comma-separated qop values; only
        # 'auth' is implemented.  Decline anything else up front instead of
        # running into unbound locals further down.
        if qop is None:
            use_qop_auth = False
        else:
            use_qop_auth = 'auth' in [q.strip() for q in qop.split(',')]
            if not use_qop_auth:
                # XXX handle auth-int.
                return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if use_qop_auth:
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            # Hash the qop actually used ('auth'), which is also what the
            # header below advertises, not the server's raw option list.
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if use_qop_auth:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) hash helpers for `algorithm`.

        Returns (None, None) for an algorithm other than 'MD5' or 'SHA'.
        """
        # relies on md5_digest / sha1_digest being defined at module level
        if algorithm == 'MD5':
            H = md5_digest
        elif algorithm == 'SHA':
            H = sha1_digest
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
436
437
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by retrying the request with Digest credentials."""
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            return self.http_error_auth_reqed('www-authenticate',
                                              host, req, headers)
        finally:
            # Reset even when the retry raises; otherwise the counter leaks
            # into later requests handled by this same instance.
            self.reset_retry_count()
454
455
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses that carry a Digest challenge from a proxy."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by retrying the request with Digest credentials."""
        host = req.get_host()
        try:
            return self.http_error_auth_reqed('proxy-authenticate',
                                              host, req, headers)
        finally:
            # Reset even when the retry raises; otherwise the counter leaks
            # into later requests handled by this same instance.
            self.reset_retry_count()
467
468
# XXX ugly implementation, should probably not bother deriving
class HTTPProxyPasswordMgr(HTTPPasswordMgr):
    """Password manager that also accepts None as a default realm or URI."""
    # has default realm and host/port

    def add_password(self, realm, uri, user, passwd):
        """Register credentials; `uri` may be None, one URI, or a sequence.

        Each URI is stored twice: once with the scheme's default port
        filled in and once without.
        """
        # uri could be a single URI or a sequence
        if uri is None or isinstance(uri, basestring):
            uris = [uri]
        else:
            uris = uri
        passwd_by_domain = self.passwd.setdefault(realm, {})
        for uri in uris:
            for default_port in True, False:
                reduced_uri = self.reduce_uri(uri, default_port)
                passwd_by_domain[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for `authuri`, else (None, None).

        Entries stored under a real URI are tried before entries stored
        under the default (None) URI; within each pass `realm` is tried
        before the default (None) realm.
        """
        attempts = [(realm, authuri), (None, authuri)]
        # bleh, want default realm to take precedence over default
        # URI/authority, hence this outer loop
        for default_uri in False, True:
            for realm, authuri in attempts:
                authinfo_by_domain = self.passwd.get(realm, {})
                for default_port in True, False:
                    reduced_authuri = self.reduce_uri(authuri, default_port)
                    for uri, authinfo in authinfo_by_domain.items():
                        if uri is None and not default_uri:
                            # Default-URI entries only count in the second
                            # pass.
                            continue
                        if self.is_suburi(uri, reduced_authuri):
                            return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Reduce `uri` as the base class does, passing None through."""
        if uri is None:
            return None
        return HTTPPasswordMgr.reduce_uri(self, uri, default_port)

    def is_suburi(self, base, test):
        """Check if test is below base; a None base matches test's host/port."""
        if base is None:
            # default to the proxy's host/port
            hostport, path = test
            base = (hostport, "/")
        return HTTPPasswordMgr.is_suburi(self, base, test)
515
516
class HTTPSClientCertMgr(HTTPPasswordMgr):
    """Store (key_file, cert_file) pairs per URI, reusing the password store."""
    # implementation inheritance: this is not a proper subclass

    def add_key_cert(self, uri, key_file, cert_file):
        """Register `key_file` and `cert_file` for `uri` under realm None."""
        self.add_password(None, uri, key_file, cert_file)

    def find_key_cert(self, authuri):
        """Return the (key_file, cert_file) stored for `authuri`."""
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
Note: See TracBrowser for help on using the repository browser.