source: py-scraping/mechanize/_clientcookie.py@ 173

Last change on this file since 173 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 63.8 KB
Line 
"""HTTP cookie handling for web clients.

This module originally developed from my port of Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

Comments to John J Lee <jjl@pobox.com>.


Copyright 2002-2006 John J Lee <jjl@pobox.com>
Copyright 1997-1999 Gisle Aas (original libwww-perl code)
Copyright 2002-2003 Johnny Lee (original MSIE Perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""
34
35import sys, re, copy, time, urllib, types, logging
36try:
37 import threading
38 _threading = threading; del threading
39except ImportError:
40 import dummy_threading
41 _threading = dummy_threading; del dummy_threading
42
43MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
44 "instance initialised with one)")
45DEFAULT_HTTP_PORT = "80"
46
47from _headersutil import split_header_words, parse_ns_headers
48from _util import isstringlike
49import _rfc3986
50
51debug = logging.getLogger("mechanize.cookies").debug
52
53
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be masked.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways.  Call this function from
    inside such an except: block to let through the exceptions we never
    want to trap.

    unmasked: tuple of exception classes to re-raise, in addition to
     KeyboardInterrupt, SystemExit and MemoryError

    If mechanize.USE_BARE_EXCEPT is false, the exception is always
    re-raised.  An exception that does get swallowed is reported, with its
    traceback, through warnings.warn.

    """
    import mechanize, warnings
    if not mechanize.USE_BARE_EXCEPT:
        raise
    always_raised = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    current_type = sys.exc_info()[0]
    if issubclass(current_type, always_raised):
        raise
    # the exception is being swallowed: report it as a probable bug
    import traceback, StringIO
    buf = StringIO.StringIO()
    traceback.print_exc(None, buf)
    warnings.warn("mechanize bug!\n%s" % buf.getvalue(), stacklevel=2)
71
72
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name.

    Text is rejected if it ends in a dot followed by digits (taken to be an
    IPv4 address), is empty, or starts or ends with a dot.

    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    return not (text.startswith(".") or text.endswith("."))
84
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B is a proper suffix of A.  Merely finding B
    # somewhere inside A is not enough: www.acme.com.evil.org contains
    # .acme.com but must not domain-match it.
    has_form_nb = len(A) > len(B) and A.endswith(B)
    return (
        has_form_nb and
        B.startswith(".") and
        is_HDN(B[1:])
        )
121
def liberal_is_HDN(text):
    """Return True if text is sort-of-like a host domain name.

    Anything that does not end in a dot followed by digits (see IPV4_RE)
    qualifies.  For accepting/blocking domains.

    """
    return IPV4_RE.search(text) is None
129
def user_domain_match(A, B):
    """Return True if domain A matches block/allow-list entry B.

    A and B may be host domain names or IP addresses; the comparison is
    case-insensitive.  If either looks like an IP address, or B has no
    initial dot, the two must be equal.  If B starts with a dot, A only has
    to end with B.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            return A.endswith(B)
        return A == B
    # at least one IP address: only an exact match will do
    return A == B
149
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its Host
    header if the URL has none; a trailing ":port" is removed.  Case is
    left as found -- see request_host_lc for the lowercased form.

    """
    authority = _rfc3986.urlsplit(request.get_full_url())[1]
    if authority is None:
        authority = request.get_header("Host", "")
    # remove port, if present
    return cut_port_re.sub("", authority, 1)
164
def request_host_lc(request):
    """Return the request-host (see request_host), lowercased."""
    host = request_host(request)
    return host.lower()
167
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    The effective name is the request-host with ".local" appended when the
    request-host contains no dot; otherwise the two are the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
174
def eff_request_host_lc(request):
    """Return eff_request_host(request) with both names lowercased."""
    return tuple([name.lower() for name in eff_request_host(request)])
178
def effective_request_host(request):
    """Return the effective request-host, as defined by RFC 2965."""
    _req_host, erhn = eff_request_host(request)
    return erhn
182
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path (passed through escape_path), query and fragment
    of the request's full URL; always starts with "/".

    """
    parts = _rfc3986.urlsplit(request.get_full_url())
    path, query, frag = parts[2:]
    escaped = escape_path(path)
    result = _rfc3986.urlunsplit((None, None, escaped, query, frag))
    if result.startswith("/"):
        return result
    return "/" + result
192
def request_port(request):
    """Return the port of the request's host, as a string.

    Returns DEFAULT_HTTP_PORT if request.get_host() contains no colon, and
    None (after logging a debug message) if the text after the first colon
    is not an integer.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon + 1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
206
def request_is_unverifiable(request):
    """Return whether request is unverifiable.

    Uses request.is_unverifiable() when calling it does not raise
    AttributeError, and otherwise the request's unverifiable attribute.  If
    that attribute is absent too, the AttributeError propagates.

    """
    try:
        return request.is_unverifiable()
    except AttributeError:
        if not hasattr(request, "unverifiable"):
            raise
        return request.unverifiable
215
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE, hex digits uppercased."""
    hex_digits = match.group(1)
    return "%" + hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, types.UnicodeType):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
237
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        return h
    # a is h[:first_dot]; b is everything after the first dot
    b = h[first_dot + 1:]
    if is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
272
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    host = request_host_lc(request)
    # the origin request's request-host was stuffed into request by
    # _urllib2_support.AbstractHTTPHandler
    origin_reach = reach(request.origin_req_host)
    return not domain_match(host, origin_reach)
287
288
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    version: integer;
    name: string;
    value: string (may be None);
    port: string; None indicates no attribute was supplied (eg. "Port", rather
     than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list
     string (eg. "80,8080")
    port_specified: boolean; true if a value was supplied with the Port
     cookie-attribute
    domain: string;
    domain_specified: boolean; true if Domain was explicitly set
    domain_initial_dot: boolean; true if Domain as set in HTTP header by server
     started with a dot (yes, this really is necessary!)
    path: string;
    path_specified: boolean; true if Path was explicitly set
    secure:  boolean; true if should only be returned over secure connection
    expires: integer; seconds since epoch (RFC 2965 cookies should calculate
     this value from the Max-Age attribute)
    discard: boolean, true if this is a session cookie; (if no expires value,
     this should be true)
    comment: string;
    comment_url: string;
    rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
     Set-Cookie2:) header, but had a version cookie-attribute of 1
    rest: mapping of other cookie-attributes

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes; see the class docstring for each one.

        version and expires are converted with int() unless None, domain is
        lowercased, and a shallow copy of rest is kept.

        Raises ValueError if port is None but port_specified is True.

        """
        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # copied so that later changes to the caller's mapping don't leak in
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard cookie-attribute name is present."""
        # "in" rather than the deprecated dict.has_key()
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value
    def nonstandard_attr_keys(self):
        """Return the names of the non-standard cookie-attributes."""
        return self._rest.keys()

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now: seconds since epoch; defaults to the current time

        """
        if now is None: now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        if self.port is None: p = ""
        else: p = ":" + self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        names = ["version", "name", "value",
                 "port", "port_specified",
                 "domain", "domain_specified", "domain_initial_dot",
                 "path", "path_specified",
                 "secure", "expires", "discard", "comment", "comment_url",
                 ]
        args = ["%s=%s" % (name, repr(getattr(self, name)))
                for name in names]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
410
411
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    As well as implementing set_ok and return_ok, implementations of this
    interface must also supply the following attributes, indicating which
    protocols should be used, and how.  These can be read and set at any time,
    though whether that makes complete sense from the protocol point of view is
    doubtful.

    Public attributes:

    netscape: implement netscape protocol
    rfc2965: implement RFC 2965 protocol
    rfc2109_as_netscape:
       WARNING: This argument will change or go away if is not accepted into
                the Python standard library in this form!
     If true, treat RFC 2109 cookies as though they were Netscape cookies.  The
     default is for this attribute to be None, which means treat 2109 cookies
     as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is,
     by default), and as Netscape cookies otherwise.
    hide_cookie2: don't add Cookie2 header to requests (the presence of
     this header indicates to the server that we understand RFC 2965
     cookies)

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        cookie: mechanize.Cookie object
        request: object implementing the interface defined by
         CookieJar.extract_cookies.__doc__

        Not implemented here: always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        cookie: mechanize.Cookie object
        request: object implementing the interface defined by
         CookieJar.add_cookie_header.__doc__

        Not implemented here: always raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This is here as an optimization, to remove the need for checking every
        cookie with a particular domain (which may involve reading many files).
        The default implementations of domain_return_ok and path_return_ok
        (return True) leave all the work to return_ok.

        If domain_return_ok returns true for the cookie domain, path_return_ok
        is called for the cookie path.  Otherwise, path_return_ok and return_ok
        are never called for that cookie domain.  If path_return_ok returns
        true, return_ok is called with the Cookie object itself for a full
        check.  Otherwise, return_ok is never called for that cookie path.

        Note that domain_return_ok is called for every *cookie* domain, not
        just for the *request* domain.  For example, the function might be
        called with both ".acme.com" and "www.acme.com" if the request domain
        is "www.acme.com".  The same goes for path_return_ok.

        For argument documentation, see the docstring for return_ok.

        """
        # no domain-level filtering by default
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        See the docstring for domain_return_ok.

        """
        # no path-level filtering by default
        return True
496
497
498class DefaultCookiePolicy(CookiePolicy):
499 """Implements the standard rules for accepting and returning cookies.
500
501 Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is
502 switched off by default.
503
504 The easiest way to provide your own policy is to override this class and
505 call its methods in your overriden implementations before adding your own
506 additional checks.
507
508 import mechanize
509 class MyCookiePolicy(mechanize.DefaultCookiePolicy):
510 def set_ok(self, cookie, request):
511 if not mechanize.DefaultCookiePolicy.set_ok(
512 self, cookie, request):
513 return False
514 if i_dont_want_to_store_this_cookie():
515 return False
516 return True
517
518 In addition to the features required to implement the CookiePolicy
519 interface, this class allows you to block and allow domains from setting
520 and receiving cookies. There are also some strictness switches that allow
521 you to tighten up the rather loose Netscape protocol rules a little bit (at
522 the cost of blocking some benign cookies).
523
524 A domain blacklist and whitelist is provided (both off by default). Only
525 domains not in the blacklist and present in the whitelist (if the whitelist
526 is active) participate in cookie setting and returning. Use the
527 blocked_domains constructor argument, and blocked_domains and
528 set_blocked_domains methods (and the corresponding argument and methods for
529 allowed_domains). If you set a whitelist, you can turn it off again by
530 setting it to None.
531
532 Domains in block or allow lists that do not start with a dot must
533 string-compare equal. For example, "acme.com" matches a blacklist entry of
534 "acme.com", but "www.acme.com" does not. Domains that do start with a dot
535 are matched by more specific domains too. For example, both "www.acme.com"
536 and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does
537 not). IP addresses are an exception, and must match exactly. For example,
538 if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is
539 blocked, but 193.168.1.2 is not.
540
541 Additional Public Attributes:
542
543 General strictness switches
544
545 strict_domain: don't allow sites to set two-component domains with
546 country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc.
547 This is far from perfect and isn't guaranteed to work!
548
549 RFC 2965 protocol strictness switches
550
551 strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable
552 transactions (usually, an unverifiable transaction is one resulting from
553 a redirect or an image hosted on another site); if this is false, cookies
554 are NEVER blocked on the basis of verifiability
555
556 Netscape protocol strictness switches
557
558 strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions
559 even to Netscape cookies
560 strict_ns_domain: flags indicating how strict to be with domain-matching
561 rules for Netscape cookies:
562 DomainStrictNoDots: when setting cookies, host prefix must not contain a
563 dot (eg. www.foo.bar.com can't set a cookie for .bar.com, because
564 www.foo contains a dot)
565 DomainStrictNonDomain: cookies that did not explicitly specify a Domain
566 cookie-attribute can only be returned to a domain that string-compares
567 equal to the domain that set the cookie (eg. rockets.acme.com won't
568 be returned cookies from acme.com that had no Domain cookie-attribute)
569 DomainRFC2965Match: when setting cookies, require a full RFC 2965
570 domain-match
571 DomainLiberal and DomainStrict are the most useful combinations of the
572 above flags, for convenience
573 strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that
574 have names starting with '$'
575 strict_ns_set_path: don't allow setting cookies whose path doesn't
576 path-match request URI
577
578 """
579
580 DomainStrictNoDots = 1
581 DomainStrictNonDomain = 2
582 DomainRFC2965Match = 4
583
584 DomainLiberal = 0
585 DomainStrict = DomainStrictNoDots | DomainStrictNonDomain
586
587 def __init__(self,
588 blocked_domains=None, allowed_domains=None,
589 netscape=True, rfc2965=False,
590 # WARNING: this argument will change or go away if is not
591 # accepted into the Python standard library in this form!
592 # default, ie. treat 2109 as netscape iff not rfc2965
593 rfc2109_as_netscape=None,
594 hide_cookie2=False,
595 strict_domain=False,
596 strict_rfc2965_unverifiable=True,
597 strict_ns_unverifiable=False,
598 strict_ns_domain=DomainLiberal,
599 strict_ns_set_initial_dollar=False,
600 strict_ns_set_path=False,
601 ):
602 """
603 Constructor arguments should be used as keyword arguments only.
604
605 blocked_domains: sequence of domain names that we never accept cookies
606 from, nor return cookies to
607 allowed_domains: if not None, this is a sequence of the only domains
608 for which we accept and return cookies
609
610 For other arguments, see CookiePolicy.__doc__ and
611 DefaultCookiePolicy.__doc__..
612
613 """
614 self.netscape = netscape
615 self.rfc2965 = rfc2965
616 self.rfc2109_as_netscape = rfc2109_as_netscape
617 self.hide_cookie2 = hide_cookie2
618 self.strict_domain = strict_domain
619 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
620 self.strict_ns_unverifiable = strict_ns_unverifiable
621 self.strict_ns_domain = strict_ns_domain
622 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
623 self.strict_ns_set_path = strict_ns_set_path
624
625 if blocked_domains is not None:
626 self._blocked_domains = tuple(blocked_domains)
627 else:
628 self._blocked_domains = ()
629
630 if allowed_domains is not None:
631 allowed_domains = tuple(allowed_domains)
632 self._allowed_domains = allowed_domains
633
634 def blocked_domains(self):
635 """Return the sequence of blocked domains (as a tuple)."""
636 return self._blocked_domains
637 def set_blocked_domains(self, blocked_domains):
638 """Set the sequence of blocked domains."""
639 self._blocked_domains = tuple(blocked_domains)
640
641 def is_blocked(self, domain):
642 for blocked_domain in self._blocked_domains:
643 if user_domain_match(domain, blocked_domain):
644 return True
645 return False
646
647 def allowed_domains(self):
648 """Return None, or the sequence of allowed domains (as a tuple)."""
649 return self._allowed_domains
650 def set_allowed_domains(self, allowed_domains):
651 """Set the sequence of allowed domains, or None."""
652 if allowed_domains is not None:
653 allowed_domains = tuple(allowed_domains)
654 self._allowed_domains = allowed_domains
655
656 def is_not_allowed(self, domain):
657 if self._allowed_domains is None:
658 return False
659 for allowed_domain in self._allowed_domains:
660 if user_domain_match(domain, allowed_domain):
661 return False
662 return True
663
664 def set_ok(self, cookie, request):
665 """
666 If you override set_ok, be sure to call this method. If it returns
667 false, so should your subclass (assuming your subclass wants to be more
668 strict about which cookies to accept).
669
670 """
671 debug(" - checking cookie %s", cookie)
672
673 assert cookie.name is not None
674
675 for n in "version", "verifiability", "name", "path", "domain", "port":
676 fn_name = "set_ok_" + n
677 fn = getattr(self, fn_name)
678 if not fn(cookie, request):
679 return False
680
681 return True
682
683 def set_ok_version(self, cookie, request):
684 if cookie.version is None:
685 # Version is always set to 0 by parse_ns_headers if it's a Netscape
686 # cookie, so this must be an invalid RFC 2965 cookie.
687 debug(" Set-Cookie2 without version attribute (%s)", cookie)
688 return False
689 if cookie.version > 0 and not self.rfc2965:
690 debug(" RFC 2965 cookies are switched off")
691 return False
692 elif cookie.version == 0 and not self.netscape:
693 debug(" Netscape cookies are switched off")
694 return False
695 return True
696
697 def set_ok_verifiability(self, cookie, request):
698 if request_is_unverifiable(request) and is_third_party(request):
699 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
700 debug(" third-party RFC 2965 cookie during "
701 "unverifiable transaction")
702 return False
703 elif cookie.version == 0 and self.strict_ns_unverifiable:
704 debug(" third-party Netscape cookie during "
705 "unverifiable transaction")
706 return False
707 return True
708
709 def set_ok_name(self, cookie, request):
710 # Try and stop servers setting V0 cookies designed to hack other
711 # servers that know both V0 and V1 protocols.
712 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
713 cookie.name.startswith("$")):
714 debug(" illegal name (starts with '$'): '%s'", cookie.name)
715 return False
716 return True
717
718 def set_ok_path(self, cookie, request):
719 if cookie.path_specified:
720 req_path = request_path(request)
721 if ((cookie.version > 0 or
722 (cookie.version == 0 and self.strict_ns_set_path)) and
723 not req_path.startswith(cookie.path)):
724 debug(" path attribute %s is not a prefix of request "
725 "path %s", cookie.path, req_path)
726 return False
727 return True
728
729 def set_ok_countrycode_domain(self, cookie, request):
730 """Return False if explicit cookie domain is not acceptable.
731
732 Called by set_ok_domain, for convenience of overriding by
733 subclasses.
734
735 """
736 if cookie.domain_specified and self.strict_domain:
737 domain = cookie.domain
738 # since domain was specified, we know that:
739 assert domain.startswith(".")
740 if domain.count(".") == 2:
741 # domain like .foo.bar
742 i = domain.rfind(".")
743 tld = domain[i + 1:]
744 sld = domain[1:i]
745 if (sld.lower() in [
746 "co", "ac",
747 "com", "edu", "org", "net", "gov", "mil", "int",
748 "aero", "biz", "cat", "coop", "info", "jobs", "mobi",
749 "museum", "name", "pro", "travel",
750 ] and
751 len(tld) == 2):
752 # domain like .co.uk
753 return False
754 return True
755
756 def set_ok_domain(self, cookie, request):
757 if self.is_blocked(cookie.domain):
758 debug(" domain %s is in user block-list", cookie.domain)
759 return False
760 if self.is_not_allowed(cookie.domain):
761 debug(" domain %s is not in user allow-list", cookie.domain)
762 return False
763 if not self.set_ok_countrycode_domain(cookie, request):
764 debug(" country-code second level domain %s", cookie.domain)
765 return False
766 if cookie.domain_specified:
767 req_host, erhn = eff_request_host_lc(request)
768 domain = cookie.domain
769 if domain.startswith("."):
770 undotted_domain = domain[1:]
771 else:
772 undotted_domain = domain
773 embedded_dots = (undotted_domain.find(".") >= 0)
774 if not embedded_dots and domain != ".local":
775 debug(" non-local domain %s contains no embedded dot",
776 domain)
777 return False
778 if cookie.version == 0:
779 if (not erhn.endswith(domain) and
780 (not erhn.startswith(".") and
781 not ("." + erhn).endswith(domain))):
782 debug(" effective request-host %s (even with added "
783 "initial dot) does not end end with %s",
784 erhn, domain)
785 return False
786 if (cookie.version > 0 or
787 (self.strict_ns_domain & self.DomainRFC2965Match)):
788 if not domain_match(erhn, domain):
789 debug(" effective request-host %s does not domain-match "
790 "%s", erhn, domain)
791 return False
792 if (cookie.version > 0 or
793 (self.strict_ns_domain & self.DomainStrictNoDots)):
794 host_prefix = req_host[:-len(domain)]
795 if (host_prefix.find(".") >= 0 and
796 not IPV4_RE.search(req_host)):
797 debug(" host prefix %s for domain %s contains a dot",
798 host_prefix, domain)
799 return False
800 return True
801
802 def set_ok_port(self, cookie, request):
803 if cookie.port_specified:
804 req_port = request_port(request)
805 if req_port is None:
806 req_port = "80"
807 else:
808 req_port = str(req_port)
809 for p in cookie.port.split(","):
810 try:
811 int(p)
812 except ValueError:
813 debug(" bad port %s (not numeric)", p)
814 return False
815 if p == req_port:
816 break
817 else:
818 debug(" request port (%s) not found in %s",
819 req_port, cookie.port)
820 return False
821 return True
822
823 def return_ok(self, cookie, request):
824 """
825 If you override return_ok, be sure to call this method. If it returns
826 false, so should your subclass (assuming your subclass wants to be more
827 strict about which cookies to return).
828
829 """
830 # Path has already been checked by path_return_ok, and domain blocking
831 # done by domain_return_ok.
832 debug(" - checking cookie %s", cookie)
833
834 for n in ("version", "verifiability", "secure", "expires", "port",
835 "domain"):
836 fn_name = "return_ok_" + n
837 fn = getattr(self, fn_name)
838 if not fn(cookie, request):
839 return False
840 return True
841
842 def return_ok_version(self, cookie, request):
843 if cookie.version > 0 and not self.rfc2965:
844 debug(" RFC 2965 cookies are switched off")
845 return False
846 elif cookie.version == 0 and not self.netscape:
847 debug(" Netscape cookies are switched off")
848 return False
849 return True
850
851 def return_ok_verifiability(self, cookie, request):
852 if request_is_unverifiable(request) and is_third_party(request):
853 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
854 debug(" third-party RFC 2965 cookie during unverifiable "
855 "transaction")
856 return False
857 elif cookie.version == 0 and self.strict_ns_unverifiable:
858 debug(" third-party Netscape cookie during unverifiable "
859 "transaction")
860 return False
861 return True
862
863 def return_ok_secure(self, cookie, request):
864 if cookie.secure and request.get_type() != "https":
865 debug(" secure cookie with non-secure request")
866 return False
867 return True
868
869 def return_ok_expires(self, cookie, request):
870 if cookie.is_expired(self._now):
871 debug(" cookie expired")
872 return False
873 return True
874
875 def return_ok_port(self, cookie, request):
876 if cookie.port:
877 req_port = request_port(request)
878 if req_port is None:
879 req_port = "80"
880 for p in cookie.port.split(","):
881 if p == req_port:
882 break
883 else:
884 debug(" request port %s does not match cookie port %s",
885 req_port, cookie.port)
886 return False
887 return True
888
889 def return_ok_domain(self, cookie, request):
890 req_host, erhn = eff_request_host_lc(request)
891 domain = cookie.domain
892
893 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
894 if (cookie.version == 0 and
895 (self.strict_ns_domain & self.DomainStrictNonDomain) and
896 not cookie.domain_specified and domain != erhn):
897 debug(" cookie with unspecified domain does not string-compare "
898 "equal to request domain")
899 return False
900
901 if cookie.version > 0 and not domain_match(erhn, domain):
902 debug(" effective request-host name %s does not domain-match "
903 "RFC 2965 cookie domain %s", erhn, domain)
904 return False
905 if cookie.version == 0 and not ("." + erhn).endswith(domain):
906 debug(" request-host %s does not match Netscape cookie domain "
907 "%s", req_host, domain)
908 return False
909 return True
910
def domain_return_ok(self, domain, request):
    """Cheap pre-filter: could any cookie stored under domain be returned?

    This is a liberal check, here as an optimization to avoid having to
    load lots of MSIE cookie files unless necessary.  It also applies the
    user's block-list and allow-list.

    """
    req_host, erhn = eff_request_host_lc(request)

    # Force a leading dot on both host names, so as to err on the side of
    # letting cookies through.
    dotted = []
    for host in (req_host, erhn):
        if not host.startswith("."):
            host = "." + host
        dotted.append(host)
    dotted_req_host, dotted_erhn = dotted

    if not dotted_req_host.endswith(domain):
        if not dotted_erhn.endswith(domain):
            # neither form of the request host falls under this domain
            return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
936
def path_return_ok(self, path, request):
    """Return True if the request path starts with the cookie path."""
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path.startswith(path):
        return True
    debug(" %s does not path-match %s", req_path, path)
    return False
944
945
def vals_sorted_by_key(adict):
    """Return the values of adict as a list, ordered by ascending key.

    adict: mapping providing keys() and get()

    The keys are copied into a fresh list before sorting, so this works
    with any mapping whose keys() is merely iterable (not necessarily a
    mutable list), and the result is always a real list.

    """
    keys = list(adict.keys())
    keys.sort()
    return [adict.get(key) for key in keys]
950
class MappingIterator:
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Only non-mapping leaves are yielded; anything with an `items`
    attribute is treated as a mapping and descended into.

    """

    def __init__(self, mapping):
        # LIFO stack of frames: (sorted values, next index, owning mapping)
        self._s = [(vals_sorted_by_key(mapping), 0, None)]

    def __iter__(self):
        return self

    def next(self):
        """Return the next leaf value, or raise StopIteration when done."""
        stack = self._s
        while stack:
            values, index, owner = stack.pop()
            if index >= len(values):
                # this level is exhausted; resume the parent frame
                continue
            current = values[index]
            # remember where to pick up at this level next time
            stack.append((values, index + 1, owner))
            try:
                current.items
            except AttributeError:
                # non-mapping: this is a leaf
                return current
            # mapping: descend into it
            stack.append((vals_sorted_by_key(current), 0, current))
        raise StopIteration()
979
980
# Sentinel passed as the default (second) argument to dict.get, so that a
# key that is missing can be told apart from a key stored with a None value.
class Absent:
    pass
984
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try mechanize.urlopen().

    The major methods are extract_cookies and add_cookie_header; these are all
    you are likely to need.

    CookieJar supports the iterator protocol:

    for cookie in cookiejar:
        # do something with cookie

    Methods:

    add_cookie_header(request)
    extract_cookies(response, request)
    get_policy()
    set_policy(policy)
    cookies_for_request(request)
    make_cookies(response, request)
    set_cookie_if_ok(cookie, request)
    set_cookie(cookie)
    clear_session_cookies()
    clear_expired_cookies()
    clear(domain=None, path=None, name=None)

    The cookie policy object is read and replaced through get_policy() and
    set_policy().

    Cookies are stored in a nested dictionary keyed by domain, then path,
    then name.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    def __init__(self, policy=None):
        """
        policy: cookie policy object; a DefaultCookiePolicy is created when
         None is given

        """
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        # {domain: {path: {name: cookie}}}
        self._cookies = {}

        # for __getitem__ iteration in pre-2.2 Pythons
        self._prev_getitem_index = 0

    def get_policy(self):
        """Return the cookie policy object in use."""
        return self._policy

    def set_policy(self, policy):
        """Replace the cookie policy object."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows
        to be returned for request."""
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies = []
        for path, cookies_by_name in self._cookies[domain].items():
            if not self._policy.path_return_ok(path, request):
                continue
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def cookies_for_request(self, request):
        """Return a list of cookies to be returned to server.

        The returned list of cookie instances is sorted in the order they
        should appear in the Cookie: header for return to the server.

        See add_cookie_header.__doc__ for the interface required of the
        request argument.

        New in version 0.1.10

        """
        # the policy and the jar share one notion of "now" for this call
        self._policy._now = self._now = int(time.time())
        cookies = self._cookies_for_request(request)
        # add cookies in order of most specific (i.e. longest) path first
        def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
        cookies.sort(decreasing_size)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        # this method still exists (alongside cookies_for_request) because it
        # is part of an implied protected interface for subclasses of cookiejar
        # XXX document that implied interface, or provide another way of
        # implementing cookiejars than subclassing
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        The $Version attribute is also added when appropriate (currently only
        once per request).

        >>> jar = CookieJar()
        >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False,
        ...                    "example.com", False, False,
        ...                    "/", False, False, None, True,
        ...                    None, None, {})
        >>> jar._cookie_attrs([ns_cookie])
        ['foo="bar"']
        >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False,
        ...                         ".example.com", True, False,
        ...                         "/", False, False, None, True,
        ...                         None, None, {})
        >>> jar._cookie_attrs([rfc2965_cookie])
        ['$Version=1', 'foo=bar', '$Domain="example.com"']

        """
        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if not cookie.domain_initial_dot:
                        # the dot was added by us, not sent by the server
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        The request object (usually a urllib2.Request instance) must support
        the methods get_full_url, get_host, is_unverifiable, get_type,
        has_header, get_header, header_items and add_unredirected_header, as
        documented by urllib2, and the port attribute (the port number).
        Actually, RequestUpgradeProcessor will automatically upgrade your
        Request object to one with has_header, get_header, header_items and
        add_unredirected_header, if it lacks those methods, for compatibility
        with pre-2.4 versions of urllib2.

        """
        debug("add_cookie_header")
        cookies = self.cookies_for_request(request)

        attrs = self._cookie_attrs(cookies)
        if attrs:
            # never overwrite a Cookie header that is already present
            if not request.has_header("Cookie"):
                request.add_unredirected_header("Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        if self._policy.rfc2965 and not self._policy.hide_cookie2:
            for cookie in cookies:
                if cookie.version != 1 and not request.has_header("Cookie2"):
                    request.add_unredirected_header("Cookie2", '$Version="1"')
                    break

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with an unusable standard attribute are dropped from the
        result.  self._now must already be set, since Max-Age is converted
        relative to it.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    if v is None:
                        debug(" missing value for max-age attribute")
                        bad_cookie = True
                        break
                    try:
                        v = int(v)
                    except ValueError:
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ["port", "comment", "commenturl"]):
                        debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, filling in defaults
        from request.

        Returns None if the version cookie-attribute is not an integer.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i + 1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host_lc(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "." + domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects that can be built from attrs_set,
        silently dropping any that could not be built."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie: cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Mark version-1 cookies as RFC 2109 and, if the policy asks for
        it, downgrade them to Netscape (version 0) cookies.

        When policy.rfc2109_as_netscape is None the downgrade happens
        exactly when RFC 2965 handling is switched off.

        """
        if self._policy.rfc2109_as_netscape is None:
            rfc2109_as_netscape = not self._policy.rfc2965
        else:
            rfc2109_as_netscape = self._policy.rfc2109_as_netscape
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_netscape:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def _make_cookies(self, response, request):
        """Return the list of Cookie objects parsed from the Set-Cookie2
        and Set-Cookie headers of response (no expiry filtering)."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # The catch-all handlers below are deliberate: header input can be
        # bad in unexpected ways.  reraise_unmasked_exceptions() decides
        # which exceptions must still propagate.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path,
                        ns_cookie.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object.

        Cookies whose expiry time is not after the current time are left
        out.

        See extract_cookies.__doc__ for the interface required of the
        response and request arguments.

        """
        self._policy._now = self._now = int(time.time())
        return [cookie for cookie in self._make_cookies(response, request)
                if cookie.expires is None or not cookie.expires <= self._now]

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so.

        cookie: mechanize.Cookie instance
        request: see extract_cookies.__doc__ for the required interface

        """
        self._policy._now = self._now = int(time.time())

        if self._policy.set_ok(cookie, request):
            self.set_cookie(cookie)

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set.

        Any cookie already stored under the same domain, path and name is
        replaced.

        cookie: mechanize.Cookie instance
        """
        by_path = self._cookies.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request.

        Look for allowable Set-Cookie: and Set-Cookie2: headers in the response
        object passed as argument.  Any of these headers that are found are
        used to update the state of the object (subject to the policy.set_ok
        method's approval).

        The response object (usually be the result of a call to
        mechanize.urlopen, or similar) should support an info method, which
        returns a mimetools.Message object (in fact, the 'mimetools.Message
        object' may be any object that provides a getheaders method).

        The request object (usually a urllib2.Request instance) must support
        the methods get_full_url, get_type, get_host, and is_unverifiable, as
        documented by urllib2, and the port attribute (the port number).  The
        request is used to set default values for cookie-attributes as well as
        for checking that the cookie is OK to be set.

        """
        debug("extract_cookies: %s", response.info())
        self._policy._now = self._now = int(time.time())

        for cookie in self._make_cookies(response, request):
            if cookie.expires is not None and cookie.expires <= self._now:
                # Expiry date in past is request to delete cookie.  This can't
                # be in DefaultCookiePolicy, because can't delete cookies
                # there.
                try:
                    self.clear(cookie.domain, cookie.path, cookie.name)
                except KeyError:
                    # nothing stored under that key: nothing to delete
                    pass
                debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                      cookie.domain, cookie.path, cookie.name)
            elif self._policy.set_ok(cookie, request):
                debug(" setting cookie: %s", cookie)
                self.set_cookie(cookie)

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.
        Raises ValueError if name is given without domain and path, or path
        is given without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Discards all cookies held by object which had either no Max-Age or
        Expires cookie-attribute or an explicit Discard cookie-attribute, or
        which otherwise have ended up with a true discard attribute.  For
        interactive browsers, the end of a session usually corresponds to
        closing the browser window.

        Note that the save method won't save session cookies anyway, unless you
        ask otherwise by passing a true ignore_discard argument.

        """
        for cookie in self:
            if cookie.discard:
                self.clear(cookie.domain, cookie.path, cookie.name)

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the save
        method won't save expired cookies anyway (unless you ask otherwise by
        passing a true ignore_expires argument).

        """
        now = time.time()
        for cookie in self:
            if cookie.is_expired(now):
                self.clear(cookie.domain, cookie.path, cookie.name)

    def __getitem__(self, i):
        """Support old-style sequential iteration (jar[0], jar[1], ...).

        Index 0 (re)starts the iteration; every other index must follow
        directly on the previous one.  Raises IndexError for out-of-sequence
        access and at the end of the cookies.

        """
        if i == 0:
            self._getitem_iterator = self.__iter__()
        elif (getattr(self, "_getitem_iterator", None) is None or
              self._prev_getitem_index != i - 1):
            # Either the sequence was broken, or iteration was never started
            # with index 0 (so there is no iterator to continue from).
            raise IndexError(
                "CookieJar.__getitem__ only supports sequential iteration")
        self._prev_getitem_index = i
        try:
            return self._getitem_iterator.next()
        except StopIteration:
            raise IndexError()

    def __iter__(self):
        return MappingIterator(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        count = 0
        for cookie in self:
            count = count + 1
        return count

    def __repr__(self):
        r = [repr(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = [str(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1610
1611
class LoadError(Exception):
    # Raised when a cookie file is not in the format the jar understands
    # (see FileCookieJar.load).
    pass
1613
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    Additional methods

    save(filename=None, ignore_discard=False, ignore_expires=False)
    load(filename=None, ignore_discard=False, ignore_expires=False)
    revert(filename=None, ignore_discard=False, ignore_expires=False)

    Additional public attributes

    filename: filename for loading and saving cookies

    Additional public readable attributes

    delayload: request that cookies are lazily loaded from disk; this is only
     a hint since this only affects performance, not behaviour (unless the
     cookies on disk are changing); a CookieJar object may ignore it (in fact,
     only MSIECookieJar lazily loads cookies at the moment)

    Subclasses supply the file format: save() is not implemented here, and
    load() delegates the parsing to self._really_load().

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        See FileCookieJar.__doc__ for argument documentation.

        Cookies are NOT loaded from the named file until either the load or
        revert method is called.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None and not isstringlike(filename):
            raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        filename: name of file in which to save cookies
        ignore_discard: save even cookies set to be discarded
        ignore_expires: save even cookies that have expired

        The file is overwritten if it already exists, thus wiping all its
        cookies.  Saved cookies can be restored later using the load or revert
        methods.  If filename is not specified, self.filename is used; if
        self.filename is None, ValueError is raised.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Old cookies are kept unless overwritten by newly loaded ones.

        Arguments are as for .save().

        If filename is not specified, self.filename is used; if self.filename
        is None, ValueError is raised.  The named file must be in the format
        understood by the class, or LoadError will be raised.  This format will
        be identical to that written by the save method, unless the load format
        is not sufficiently well understood (as is the case for MSIECookieJar).

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            # close the file whether or not parsing succeeded
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.  The previous
        cookies are likewise put back if loading fails with any other
        exception, which is then re-raised unchanged.

        If filename is not specified, self.filename is used; if self.filename
        is None, ValueError is raised.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        old_state = copy.deepcopy(self._cookies)
        self._cookies = {}
        try:
            self.load(filename, ignore_discard, ignore_expires)
        except:
            # Deliberately catch everything: whatever went wrong, put the
            # previous cookies back before letting the exception propagate,
            # so a failed revert never leaves the jar empty.
            self._cookies = old_state
            raise
Note: See TracBrowser for help on using the repository browser.