[106] | 1 | """HTTP cookie handling for web clients.
|
---|
| 2 |
|
---|
| 3 | This module originally developed from my port of Gisle Aas' Perl module
|
---|
| 4 | HTTP::Cookies, from the libwww-perl library.
|
---|
| 5 |
|
---|
| 6 | Docstrings, comments and debug strings in this code refer to the
|
---|
| 7 | attributes of the HTTP cookie system as cookie-attributes, to distinguish
|
---|
| 8 | them clearly from Python attributes.
|
---|
| 9 |
|
---|
| 10 | CookieJar____
|
---|
| 11 | / \ \
|
---|
| 12 | FileCookieJar \ \
|
---|
| 13 | / | \ \ \
|
---|
| 14 | MozillaCookieJar | LWPCookieJar \ \
|
---|
| 15 | | | \
|
---|
| 16 | | ---MSIEBase | \
|
---|
| 17 | | / | | \
|
---|
| 18 | | / MSIEDBCookieJar BSDDBCookieJar
|
---|
| 19 | |/
|
---|
| 20 | MSIECookieJar
|
---|
| 21 |
|
---|
| 22 | Comments to John J Lee <jjl@pobox.com>.
|
---|
| 23 |
|
---|
| 24 |
|
---|
| 25 | Copyright 2002-2006 John J Lee <jjl@pobox.com>
|
---|
| 26 | Copyright 1997-1999 Gisle Aas (original libwww-perl code)
|
---|
| 27 | Copyright 2002-2003 Johnny Lee (original MSIE Perl code)
|
---|
| 28 |
|
---|
| 29 | This code is free software; you can redistribute it and/or modify it
|
---|
| 30 | under the terms of the BSD or ZPL 2.1 licenses (see the file
|
---|
| 31 | COPYING.txt included with the distribution).
|
---|
| 32 |
|
---|
| 33 | """
|
---|
| 34 |
|
---|
| 35 | import sys, re, copy, time, urllib, types, logging
|
---|
| 36 | try:
|
---|
| 37 | import threading
|
---|
| 38 | _threading = threading; del threading
|
---|
| 39 | except ImportError:
|
---|
| 40 | import dummy_threading
|
---|
| 41 | _threading = dummy_threading; del dummy_threading
|
---|
| 42 |
|
---|
| 43 | MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
|
---|
| 44 | "instance initialised with one)")
|
---|
| 45 | DEFAULT_HTTP_PORT = "80"
|
---|
| 46 |
|
---|
| 47 | from _headersutil import split_header_words, parse_ns_headers
|
---|
| 48 | from _util import isstringlike
|
---|
| 49 | import _rfc3986
|
---|
| 50 |
|
---|
| 51 | debug = logging.getLogger("mechanize.cookies").debug
|
---|
| 52 |
|
---|
| 53 |
|
---|
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be swallowed.

    This module contains a few catch-all except: statements for input
    that is bad in unexpected ways; this helper is for use inside them.

    The active exception is re-raised when mechanize.USE_BARE_EXCEPT is
    false, or when it is an instance of one of the classes in unmasked
    or of KeyboardInterrupt, SystemExit or MemoryError.  Otherwise the
    exception is swallowed and its traceback is reported as a warning.

    unmasked: tuple of additional exception classes that must propagate

    """
    import mechanize, warnings
    if not mechanize.USE_BARE_EXCEPT:
        raise
    always_propagate = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    active_type = sys.exc_info()[0]
    if issubclass(active_type, always_propagate):
        raise
    # The exception is being swallowed: capture its traceback so that it
    # can still be seen in the warning text.
    import traceback, StringIO
    buf = StringIO.StringIO()
    traceback.print_exc(None, buf)
    warnings.warn("mechanize bug!\n%s" % buf.getvalue(), stacklevel=2)
|
---|
| 71 |
|
---|
| 72 |
|
---|
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name.

    A string is rejected when it is empty, when it ends in a dot followed
    only by digits (the IPV4_RE heuristic for IP addresses), or when it
    starts or ends with a dot.

    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    # the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    # at other uses of IPV4_RE also, if change this.
    if text == "":
        return False
    if IPV4_RE.search(text):
        return False
    return not (text.startswith(".") or text.endswith("."))
|
---|
| 84 |
|
---|
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means that B is a proper suffix of A (N is
    # non-empty).  Merely finding B somewhere inside A is not enough:
    # "x.y.com.evil.org" must not domain-match ".y.com".
    has_form_nb = len(A) > len(B) and A.endswith(B)
    return (
        has_form_nb and
        B.startswith(".") and
        is_HDN(B[1:])
        )
|
---|
| 121 |
|
---|
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    Only the IPV4_RE check is applied, so anything that does not look
    like an IP address is accepted.  For accepting/blocking domains.

    """
    looks_like_ip = IPV4_RE.search(text) is not None
    return not looks_like_ip
|
---|
| 129 |
|
---|
def user_domain_match(A, B):
    """Return True if host A matches the block/allow list entry B.

    A and B may be host domain names or IP addresses.  Comparison is
    case-insensitive.  If either looks like an IP address, the two must
    be equal.  Otherwise an entry starting with a dot matches any A that
    ends with it, and an entry without a leading dot must equal A.

    """
    host = A.lower()
    entry = B.lower()
    if liberal_is_HDN(host) and liberal_is_HDN(entry):
        if entry.startswith("."):
            # dotted entries match every host that ends with them
            return host.endswith(entry)
        return host == entry
    # at least one side looks like an IP address: exact match only
    return host == entry
|
---|
| 149 |
|
---|
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is the second element of the split request URL; if that is
    None, the request's Host header (default "") is used.  A trailing
    ":<digits>" port is removed.  The case of the host is left untouched
    here -- request_host_lc returns the lowercased form.

    """
    host = _rfc3986.urlsplit(request.get_full_url())[1]
    if host is None:
        host = request.get_header("Host", "")
    # remove port, if present
    without_port = cut_port_re.sub("", host, 1)
    return without_port
|
---|
| 164 |
|
---|
def request_host_lc(request):
    """Return the request-host of request, lowercased."""
    host = request_host(request)
    return host.lower()
|
---|
| 167 |
|
---|
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    The effective name is the request-host with ".local" appended when
    the request-host contains no dot and does not match IPV4_RE;
    otherwise it is the request-host itself.

    """
    req_host = request_host(request)
    if "." in req_host or IPV4_RE.search(req_host):
        return req_host, req_host
    return req_host, req_host + ".local"
|
---|
| 174 |
|
---|
def eff_request_host_lc(request):
    """Return eff_request_host(request) with both names lowercased."""
    req_host, erhn = eff_request_host(request)
    req_host = req_host.lower()
    erhn = erhn.lower()
    return req_host, erhn
|
---|
| 178 |
|
---|
def effective_request_host(request):
    """Return the effective request-host, as defined by RFC 2965."""
    host_pair = eff_request_host(request)
    # second item of the pair is the effective request-host name
    return host_pair[1]
|
---|
| 182 |
|
---|
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path, query and fragment of the request URL, with the
    path passed through escape_path.  The result always starts with "/".

    """
    path, query, frag = _rfc3986.urlsplit(request.get_full_url())[2:]
    escaped = escape_path(path)
    req_path = _rfc3986.urlunsplit((None, None, escaped, query, frag))
    if req_path.startswith("/"):
        return req_path
    return "/" + req_path
|
---|
| 192 |
|
---|
def request_port(request):
    """Return the port of the request's host, as a string.

    Everything after the first ':' in request.get_host() is taken as the
    port.  Returns DEFAULT_HTTP_PORT when there is no ':', and None (after
    logging a debug message) when the port is not numeric.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon + 1:]
    try:
        # only validating: the port is returned as a string
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
|
---|
| 206 |
|
---|
def request_is_unverifiable(request):
    """Return whether request is unverifiable.

    Calls request.is_unverifiable(); if that raises AttributeError, falls
    back on the request.unverifiable attribute.  If that attribute is
    missing too, the AttributeError is re-raised.

    """
    try:
        return request.is_unverifiable()
    except AttributeError:
        if not hasattr(request, "unverifiable"):
            raise
        return request.unverifiable
|
---|
| 215 |
|
---|
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with uppercase hex."""
    hex_digits = match.group(1)
    return "%" + hex_digits.upper()
|
---|
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes.

    Unicode paths are encoded as UTF-8 before quoting; characters listed
    in HTTP_PATH_SAFE are left unquoted.

    """
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, types.UnicodeType):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
|
---|
| 237 |
|
---|
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    if "." not in h:
        return h
    # h has the form A.B, where A is everything before the first dot
    b = h.split(".", 1)[1]
    if is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
|
---|
| 272 |
|
---|
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    # the origin request's request-host was stuffed into request by
    # _urllib2_support.AbstractHTTPHandler
    origin_reach = reach(request.origin_req_host)
    if domain_match(request_host_lc(request), origin_reach):
        return False
    return True
|
---|
| 287 |
|
---|
| 288 |
|
---|
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    version: integer;
    name: string;
    value: string (may be None);
    port: string; None indicates no attribute was supplied (eg. "Port", rather
     than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list
     string (eg. "80,8080")
    port_specified: boolean; true if a value was supplied with the Port
     cookie-attribute
    domain: string;
    domain_specified: boolean; true if Domain was explicitly set
    domain_initial_dot: boolean; true if Domain as set in HTTP header by server
     started with a dot (yes, this really is necessary!)
    path: string;
    path_specified: boolean; true if Path was explicitly set
    secure:  boolean; true if should only be returned over secure connection
    expires: integer; seconds since epoch (RFC 2965 cookies should calculate
     this value from the Max-Age attribute)
    discard: boolean, true if this is a session cookie; (if no expires value,
     this should be true)
    comment: string;
    comment_url: string;
    rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
     Set-Cookie2:) header, but had a version cookie-attribute of 1
    rest: mapping of other cookie-attributes

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted with int() unless they are None,
        domain is lowercased, and rest is shallow-copied.

        Raises ValueError if port is None while port_specified is True.

        """
        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # private copy, so later changes to the caller's mapping don't leak in
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard cookie-attribute name is present."""
        # "in" rather than the deprecated dict.has_key()
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value
    def nonstandard_attr_keys(self):
        """Return the names of the non-standard cookie-attributes."""
        return self._rest.keys()

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time at or before now.

        now: seconds since epoch; defaults to the current time.  Cookies
        whose expires is None never expire.

        """
        if now is None: now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        if self.port is None: p = ""
        else: p = ":" + self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = []
        for name in ["version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ]:
            attr = getattr(self, name)
            args.append("%s=%s" % (name, repr(attr)))
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
|
---|
| 410 |
|
---|
| 411 |
|
---|
class CookiePolicy:
    """Interface deciding which cookies are accepted from and sent to servers.

    A policy may also modify cookies.

    DefaultCookiePolicy is the subclass implementing the standard rules for
    Netscape and RFC 2965 cookies -- override that if you want a customised
    policy.

    Besides set_ok and return_ok, implementations of this interface must
    supply the attributes listed below, which say which protocols are used,
    and how.  They may be read and set at any time, though whether that makes
    complete sense from the protocol point of view is doubtful.

    Public attributes:

    netscape: implement netscape protocol
    rfc2965: implement RFC 2965 protocol
    rfc2109_as_netscape:
       WARNING: This argument will change or go away if it is not accepted
                into the Python standard library in this form!
     If true, treat RFC 2109 cookies as though they were Netscape cookies.  The
     default is for this attribute to be None, which means treat 2109 cookies
     as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is,
     by default), and as Netscape cookies otherwise.
    hide_cookie2: don't add Cookie2 header to requests (the presence of
     this header indicates to the server that we understand RFC 2965
     cookies)

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Pre-expired cookies currently never reach this method -- the
        CookieJar class deletes such cookies itself.

        cookie: mechanize.Cookie object
        request: object implementing the interface defined by
         CookieJar.extract_cookies.__doc__

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        cookie: mechanize.Cookie object
        request: object implementing the interface defined by
         CookieJar.add_cookie_header.__doc__

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This hook is an optimization: it removes the need to check every
        cookie with a particular domain (which may involve reading many
        files).  The default implementations of domain_return_ok and
        path_return_ok (return True) leave all the work to return_ok.

        When domain_return_ok returns true for the cookie domain,
        path_return_ok is called for the cookie path; otherwise neither
        path_return_ok nor return_ok is ever called for that cookie domain.
        When path_return_ok returns true, return_ok is called with the Cookie
        object itself for a full check; otherwise return_ok is never called
        for that cookie path.

        Note that domain_return_ok is called for every *cookie* domain, not
        just for the *request* domain.  For example, the function might be
        called with both ".acme.com" and "www.acme.com" if the request domain
        is "www.acme.com".  The same goes for path_return_ok.

        For argument documentation, see the docstring for return_ok.

        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        See the docstring for domain_return_ok.

        """
        return True
|
---|
| 496 |
|
---|
| 497 |
|
---|
| 498 | class DefaultCookiePolicy(CookiePolicy):
|
---|
| 499 | """Implements the standard rules for accepting and returning cookies.
|
---|
| 500 |
|
---|
| 501 | Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is
|
---|
| 502 | switched off by default.
|
---|
| 503 |
|
---|
| 504 | The easiest way to provide your own policy is to override this class and
|
---|
| 505 | call its methods in your overridden implementations before adding your own
|
---|
| 506 | additional checks.
|
---|
| 507 |
|
---|
| 508 | import mechanize
|
---|
| 509 | class MyCookiePolicy(mechanize.DefaultCookiePolicy):
|
---|
| 510 | def set_ok(self, cookie, request):
|
---|
| 511 | if not mechanize.DefaultCookiePolicy.set_ok(
|
---|
| 512 | self, cookie, request):
|
---|
| 513 | return False
|
---|
| 514 | if i_dont_want_to_store_this_cookie():
|
---|
| 515 | return False
|
---|
| 516 | return True
|
---|
| 517 |
|
---|
| 518 | In addition to the features required to implement the CookiePolicy
|
---|
| 519 | interface, this class allows you to block and allow domains from setting
|
---|
| 520 | and receiving cookies. There are also some strictness switches that allow
|
---|
| 521 | you to tighten up the rather loose Netscape protocol rules a little bit (at
|
---|
| 522 | the cost of blocking some benign cookies).
|
---|
| 523 |
|
---|
| 524 | A domain blacklist and whitelist is provided (both off by default). Only
|
---|
| 525 | domains not in the blacklist and present in the whitelist (if the whitelist
|
---|
| 526 | is active) participate in cookie setting and returning. Use the
|
---|
| 527 | blocked_domains constructor argument, and blocked_domains and
|
---|
| 528 | set_blocked_domains methods (and the corresponding argument and methods for
|
---|
| 529 | allowed_domains). If you set a whitelist, you can turn it off again by
|
---|
| 530 | setting it to None.
|
---|
| 531 |
|
---|
| 532 | Domains in block or allow lists that do not start with a dot must
|
---|
| 533 | string-compare equal. For example, "acme.com" matches a blacklist entry of
|
---|
| 534 | "acme.com", but "www.acme.com" does not. Domains that do start with a dot
|
---|
| 535 | are matched by more specific domains too. For example, both "www.acme.com"
|
---|
| 536 | and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does
|
---|
| 537 | not). IP addresses are an exception, and must match exactly. For example,
|
---|
| 538 | if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is
|
---|
| 539 | blocked, but 193.168.1.2 is not.
|
---|
| 540 |
|
---|
| 541 | Additional Public Attributes:
|
---|
| 542 |
|
---|
| 543 | General strictness switches
|
---|
| 544 |
|
---|
| 545 | strict_domain: don't allow sites to set two-component domains with
|
---|
| 546 | country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc.
|
---|
| 547 | This is far from perfect and isn't guaranteed to work!
|
---|
| 548 |
|
---|
| 549 | RFC 2965 protocol strictness switches
|
---|
| 550 |
|
---|
| 551 | strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable
|
---|
| 552 | transactions (usually, an unverifiable transaction is one resulting from
|
---|
| 553 | a redirect or an image hosted on another site); if this is false, cookies
|
---|
| 554 | are NEVER blocked on the basis of verifiability
|
---|
| 555 |
|
---|
| 556 | Netscape protocol strictness switches
|
---|
| 557 |
|
---|
| 558 | strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions
|
---|
| 559 | even to Netscape cookies
|
---|
| 560 | strict_ns_domain: flags indicating how strict to be with domain-matching
|
---|
| 561 | rules for Netscape cookies:
|
---|
| 562 | DomainStrictNoDots: when setting cookies, host prefix must not contain a
|
---|
| 563 | dot (eg. www.foo.bar.com can't set a cookie for .bar.com, because
|
---|
| 564 | www.foo contains a dot)
|
---|
| 565 | DomainStrictNonDomain: cookies that did not explicitly specify a Domain
|
---|
| 566 | cookie-attribute can only be returned to a domain that string-compares
|
---|
| 567 | equal to the domain that set the cookie (eg. rockets.acme.com won't
|
---|
| 568 | be returned cookies from acme.com that had no Domain cookie-attribute)
|
---|
| 569 | DomainRFC2965Match: when setting cookies, require a full RFC 2965
|
---|
| 570 | domain-match
|
---|
| 571 | DomainLiberal and DomainStrict are the most useful combinations of the
|
---|
| 572 | above flags, for convenience
|
---|
| 573 | strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that
|
---|
| 574 | have names starting with '$'
|
---|
| 575 | strict_ns_set_path: don't allow setting cookies whose path doesn't
|
---|
| 576 | path-match request URI
|
---|
| 577 |
|
---|
| 578 | """
|
---|
| 579 |
|
---|
| 580 | DomainStrictNoDots = 1
|
---|
| 581 | DomainStrictNonDomain = 2
|
---|
| 582 | DomainRFC2965Match = 4
|
---|
| 583 |
|
---|
| 584 | DomainLiberal = 0
|
---|
| 585 | DomainStrict = DomainStrictNoDots | DomainStrictNonDomain
|
---|
| 586 |
|
---|
def __init__(self,
             blocked_domains=None, allowed_domains=None,
             netscape=True, rfc2965=False,
             # WARNING: this argument will change or go away if is not
             # accepted into the Python standard library in this form!
             # default, ie. treat 2109 as netscape iff not rfc2965
             rfc2109_as_netscape=None,
             hide_cookie2=False,
             strict_domain=False,
             strict_rfc2965_unverifiable=True,
             strict_ns_unverifiable=False,
             strict_ns_domain=DomainLiberal,
             strict_ns_set_initial_dollar=False,
             strict_ns_set_path=False,
             ):
    """
    Constructor arguments should be used as keyword arguments only.

    blocked_domains: sequence of domain names that we never accept cookies
     from, nor return cookies to
    allowed_domains: if not None, this is a sequence of the only domains
     for which we accept and return cookies

    Both sequences are stored as tuples.  For other arguments, see
    CookiePolicy.__doc__ and DefaultCookiePolicy.__doc__.

    """
    # protocol selection
    self.netscape = netscape
    self.rfc2965 = rfc2965
    self.rfc2109_as_netscape = rfc2109_as_netscape
    self.hide_cookie2 = hide_cookie2
    # strictness switches
    self.strict_domain = strict_domain
    self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
    self.strict_ns_unverifiable = strict_ns_unverifiable
    self.strict_ns_domain = strict_ns_domain
    self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
    self.strict_ns_set_path = strict_ns_set_path

    # blacklist: no list given means nothing is blocked
    if blocked_domains is None:
        self._blocked_domains = ()
    else:
        self._blocked_domains = tuple(blocked_domains)

    # whitelist: None means the whitelist is switched off
    if allowed_domains is None:
        self._allowed_domains = None
    else:
        self._allowed_domains = tuple(allowed_domains)
|
---|
| 633 |
|
---|
def blocked_domains(self):
    """Return the sequence of blocked domains (as a tuple)."""
    blocked = self._blocked_domains
    return blocked
|
---|
def set_blocked_domains(self, blocked_domains):
    """Set the sequence of blocked domains (stored as a tuple)."""
    as_tuple = tuple(blocked_domains)
    self._blocked_domains = as_tuple
|
---|
| 640 |
|
---|
def is_blocked(self, domain):
    """Return True if domain matches any entry of the domain blacklist."""
    blocked = False
    for entry in self._blocked_domains:
        if user_domain_match(domain, entry):
            blocked = True
            break
    return blocked
|
---|
| 646 |
|
---|
def allowed_domains(self):
    """Return None, or the sequence of allowed domains (as a tuple)."""
    allowed = self._allowed_domains
    return allowed
|
---|
def set_allowed_domains(self, allowed_domains):
    """Set the sequence of allowed domains, or None.

    None is stored as-is; any other value is stored as a tuple.

    """
    if allowed_domains is None:
        self._allowed_domains = None
    else:
        self._allowed_domains = tuple(allowed_domains)
|
---|
| 655 |
|
---|
def is_not_allowed(self, domain):
    """Return True if a whitelist is set and domain matches none of it.

    With no whitelist (None), every domain is allowed and the result is
    False.

    """
    whitelist = self._allowed_domains
    if whitelist is None:
        return False
    rejected = True
    for entry in whitelist:
        if user_domain_match(domain, entry):
            rejected = False
            break
    return rejected
|
---|
| 663 |
|
---|
def set_ok(self, cookie, request):
    """
    If you override set_ok, be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    Runs the set_ok_version, set_ok_verifiability, set_ok_name,
    set_ok_path, set_ok_domain and set_ok_port checks in that order and
    stops at the first one that fails.

    """
    debug(" - checking cookie %s", cookie)

    assert cookie.name is not None

    for aspect in ("version", "verifiability", "name", "path", "domain",
                   "port"):
        check = getattr(self, "set_ok_" + aspect)
        if not check(cookie, request):
            return False

    return True
|
---|
| 682 |
|
---|
def set_ok_version(self, cookie, request):
    """Return False if the cookie's version is missing or switched off."""
    if cookie.version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        debug(" Set-Cookie2 without version attribute (%s)", cookie)
        return False
    if cookie.version > 0 and not self.rfc2965:
        debug(" RFC 2965 cookies are switched off")
        return False
    if cookie.version == 0 and not self.netscape:
        debug(" Netscape cookies are switched off")
        return False
    return True
|
---|
| 696 |
|
---|
def set_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies the strict settings forbid.

    Only applies when the request is both unverifiable and third-party;
    the relevant strictness switch depends on the cookie's version.

    """
    if not (request_is_unverifiable(request) and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        debug(" third-party RFC 2965 cookie during unverifiable transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during unverifiable transaction")
        return False
    return True
|
---|
| 708 |
|
---|
def set_ok_name(self, cookie, request):
    """Return False for a version-0 cookie whose name starts with '$'.

    The check is active only when strict_ns_set_initial_dollar is set.

    """
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    if cookie.version != 0 or not self.strict_ns_set_initial_dollar:
        return True
    if not cookie.name.startswith("$"):
        return True
    debug(" illegal name (starts with '$'): '%s'", cookie.name)
    return False
|
---|
| 717 |
|
---|
def set_ok_path(self, cookie, request):
    """Return False if an explicit cookie path does not prefix the request path.

    Cookies without an explicit path always pass.  The prefix rule applies
    to cookies with version > 0, and to version-0 cookies only when
    strict_ns_set_path is set.

    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    prefix_required = (cookie.version > 0 or
                       (cookie.version == 0 and self.strict_ns_set_path))
    if prefix_required and not req_path.startswith(cookie.path):
        debug(" path attribute %s is not a prefix of request "
              "path %s", cookie.path, req_path)
        return False
    return True
|
---|
| 728 |
|
---|
def set_ok_countrycode_domain(self, cookie, request):
    """Return False if explicit cookie domain is not acceptable.

    Called by set_ok_domain, for convenience of overriding by
    subclasses.

    Only active when the cookie names its domain and strict_domain is set.
    A domain of exactly two labels (".foo.bar") is rejected when the first
    label is one of a fixed list of generic names and the last label is
    two characters long (for example ".co.uk").

    """
    if not (cookie.domain_specified and self.strict_domain):
        return True
    domain = cookie.domain
    # since domain was specified, we know that:
    assert domain.startswith(".")
    if domain.count(".") != 2:
        return True
    # domain like .foo.bar
    last_dot = domain.rfind(".")
    sld = domain[1:last_dot]
    tld = domain[last_dot + 1:]
    generic_labels = [
        "co", "ac",
        "com", "edu", "org", "net", "gov", "mil", "int",
        "aero", "biz", "cat", "coop", "info", "jobs", "mobi",
        "museum", "name", "pro", "travel",
        ]
    if sld.lower() in generic_labels and len(tld) == 2:
        # domain like .co.uk
        return False
    return True
|
---|
| 755 |
|
---|
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for this request.

    The user block-list, the user allow-list and
    set_ok_countrycode_domain are consulted first.  The remaining checks
    apply only to cookies that specified a domain explicitly:

    - the domain must contain an embedded dot, unless it is ".local";
    - for version-0 cookies, the effective request-host (with or without
      an added leading dot) must end with the domain;
    - for version > 0, or when strict_ns_domain has DomainRFC2965Match
      set, the effective request-host must domain-match the domain;
    - for version > 0, or when strict_ns_domain has DomainStrictNoDots
      set, the part of the request host in front of the domain must not
      contain a dot (skipped when the host matches IPV4_RE).

    """
    if self.is_blocked(cookie.domain):
        debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not self.set_ok_countrycode_domain(cookie, request):
        debug(" country-code second level domain %s", cookie.domain)
        return False
    if cookie.domain_specified:
        req_host, erhn = eff_request_host_lc(request)
        domain = cookie.domain
        if domain.startswith("."):
            undotted_domain = domain[1:]
        else:
            undotted_domain = domain
        embedded_dots = "." in undotted_domain
        if not embedded_dots and domain != ".local":
            debug(" non-local domain %s contains no embedded dot",
                  domain)
            return False
        if cookie.version == 0:
            if (not erhn.endswith(domain) and
                (not erhn.startswith(".") and
                 not ("." + erhn).endswith(domain))):
                debug(" effective request-host %s (even with added "
                      "initial dot) does not end with %s",
                      erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                debug(" effective request-host %s does not domain-match "
                      "%s", erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
            # whatever precedes the cookie domain in the request host
            host_prefix = req_host[:-len(domain)]
            if ("." in host_prefix and
                not IPV4_RE.search(req_host)):
                debug(" host prefix %s for domain %s contains a dot",
                      host_prefix, domain)
                return False
    return True
|
---|
| 801 |
|
---|
def set_ok_port(self, cookie, request):
    """Return False if an explicit port list does not admit the request port.

    Cookies without an explicit port list always pass.  Otherwise every
    entry of the comma-separated list examined must be numeric, and one of
    them must equal the request port ("80" when the request port is None).

    """
    if not cookie.port_specified:
        return True
    wanted = request_port(request)
    if wanted is not None:
        wanted = str(wanted)
    else:
        wanted = "80"
    matched = False
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == wanted:
            matched = True
            break
    if not matched:
        debug(" request port (%s) not found in %s",
              wanted, cookie.port)
        return False
    return True
|
---|
| 822 |
|
---|
def return_ok(self, cookie, request):
    """Return True if every return_ok_* check allows returning the cookie.

    If you override return_ok, be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to return).

    The checks run in a fixed order (version, verifiability, secure,
    expires, port, domain) and stop at the first one that fails.

    """
    # Path has already been checked by path_return_ok, and domain blocking
    # done by domain_return_ok.
    debug(" - checking cookie %s", cookie)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    for suffix in checks:
        # looked up on self so that subclasses can override single checks
        check = getattr(self, "return_ok_" + suffix)
        if not check(cookie, request):
            return False
    return True
|
---|
| 841 |
|
---|
def return_ok_version(self, cookie, request):
    """Return False if the cookie's protocol is switched off on this policy.

    Version > 0 cookies need rfc2965 enabled; version 0 cookies need
    netscape enabled.

    """
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0:
        if not self.netscape:
            debug(" Netscape cookies are switched off")
            return False
    return True
|
---|
| 850 |
|
---|
def return_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies the strict settings forbid.

    Only applies when the request is both unverifiable and third-party;
    the relevant strictness switch depends on the cookie's version.

    """
    if not (request_is_unverifiable(request) and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        debug(" third-party RFC 2965 cookie during unverifiable "
              "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during unverifiable "
              "transaction")
        return False
    return True
|
---|
| 862 |
|
---|
def return_ok_secure(self, cookie, request):
    """Return False if a secure cookie would go out on a non-https request."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
|
---|
| 868 |
|
---|
def return_ok_expires(self, cookie, request):
    """Return False if the cookie reports itself expired at self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        debug(" cookie expired")
        return False
    return True
|
---|
| 874 |
|
---|
def return_ok_port(self, cookie, request):
    """Return False if the cookie's port list does not admit the request port.

    Cookies with no port restriction (a false cookie.port) always pass.
    Otherwise the request port, "80" when request_port() yields None, must
    equal one entry of the comma-separated cookie.port list.

    The request port is normalised with str() before the comparison, as
    set_ok_port does, so that a non-string port value can still match the
    string entries of the list.

    """
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    else:
        req_port = str(req_port)
    if req_port not in cookie.port.split(","):
        debug(" request port %s does not match cookie port %s",
              req_port, cookie.port)
        return False
    return True
|
---|
| 888 |
|
---|
def return_ok_domain(self, cookie, request):
    """Return False if the cookie's domain does not fit the request host.

    Version-0 cookies must have a domain that the dotted effective
    request-host ends with; when strict_ns_domain has
    DomainStrictNonDomain set, a version-0 cookie without an explicit
    domain must additionally equal the effective request-host exactly.
    Cookies with version > 0 must domain-match the effective request-host.

    """
    req_host, erhn = eff_request_host_lc(request)
    domain = cookie.domain
    version = cookie.version

    if version == 0:
        # strict check of non-domain cookies: Mozilla does this, MSIE5
        # doesn't
        if ((self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            debug(" cookie with unspecified domain does not string-compare "
                  "equal to request domain")
            return False
        if not ("." + erhn).endswith(domain):
            debug(" request-host %s does not match Netscape cookie domain "
                  "%s", req_host, domain)
            return False
    elif version > 0:
        if not domain_match(erhn, domain):
            debug(" effective request-host name %s does not domain-match "
                  "RFC 2965 cookie domain %s", erhn, domain)
            return False
    return True
|
---|
| 910 |
|
---|
def domain_return_ok(self, domain, request):
    """Cheap pre-check: could cookies stored under domain go to request?

    Returns False when neither the dotted request host nor the dotted
    effective request-host ends with domain, or when the user block-list
    or allow-list excludes domain.

    """
    # Liberal check of domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host_lc(request)

    # Munge req_host and erhn to always start with a dot, so as to err on
    # the side of letting cookies through.
    dotted = []
    for host in (req_host, erhn):
        if not host.startswith("."):
            host = "." + host
        dotted.append(host)
    dotted_req_host, dotted_erhn = dotted

    if (not dotted_req_host.endswith(domain) and
        not dotted_erhn.endswith(domain)):
        return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
|
---|
| 936 |
|
---|
def path_return_ok(self, path, request):
    """Return True if the request path starts with the cookie path."""
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path.startswith(path):
        return True
    debug(" %s does not path-match %s", req_path, path)
    return False
|
---|
| 944 |
|
---|
| 945 |
|
---|
def vals_sorted_by_key(adict):
    """Return the values of adict as a list, ordered by ascending key.

    The keys are copied into a list before sorting and the result is built
    as a real list, so this works whether keys() returns a list or a view.

    """
    keys = list(adict.keys())
    keys.sort()
    return [adict.get(key) for key in keys]


class MappingIterator:
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an "items" attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.

    Both the next() and __next__() spellings of the iterator protocol are
    provided.

    """
    def __init__(self, mapping):
        # Each stack entry is (sorted values, index of next value to visit,
        # the mapping those values came from, or None for the root).
        self._s = [(vals_sorted_by_key(mapping), 0, None)]  # LIFO stack

    def __iter__(self):
        return self

    def next(self):
        """Return the next leaf value, raising StopIteration at the end."""
        # this is hairy because of lack of generators
        while 1:
            try:
                vals, i, prev_item = self._s.pop()
            except IndexError:
                raise StopIteration()
            if i < len(vals):
                item = vals[i]
                i = i + 1
                # remember where to resume in this level
                self._s.append((vals, i, prev_item))
                try:
                    item.items
                except AttributeError:
                    # non-mapping
                    break
                else:
                    # mapping
                    self._s.append((vals_sorted_by_key(item), 0, item))
                    continue
            # an exhausted level is simply not pushed back
        return item

    __next__ = next
|
---|
| 979 |
|
---|
| 980 |
|
---|
| 981 | # Used as second parameter to dict.get method, to distinguish absent
|
---|
| 982 | # dict key from one with a None value.
|
---|
class Absent:
    """Sentinel default for dict.get: marks a key that is not present at all."""
|
---|
| 984 |
|
---|
| 985 | class CookieJar:
|
---|
| 986 | """Collection of HTTP cookies.
|
---|
| 987 |
|
---|
| 988 | You may not need to know about this class: try mechanize.urlopen().
|
---|
| 989 |
|
---|
| 990 | The major methods are extract_cookies and add_cookie_header; these are all
|
---|
| 991 | you are likely to need.
|
---|
| 992 |
|
---|
| 993 | CookieJar supports the iterator protocol:
|
---|
| 994 |
|
---|
| 995 | for cookie in cookiejar:
|
---|
| 996 | # do something with cookie
|
---|
| 997 |
|
---|
| 998 | Methods:
|
---|
| 999 |
|
---|
| 1000 | add_cookie_header(request)
|
---|
| 1001 | extract_cookies(response, request)
|
---|
| 1002 | get_policy()
|
---|
| 1003 | set_policy(policy)
|
---|
| 1004 | cookies_for_request(request)
|
---|
| 1005 | make_cookies(response, request)
|
---|
| 1006 | set_cookie_if_ok(cookie, request)
|
---|
| 1007 | set_cookie(cookie)
|
---|
| 1008 | clear_session_cookies()
|
---|
| 1009 | clear_expired_cookies()
|
---|
| 1010 | clear(domain=None, path=None, name=None)
|
---|
| 1011 |
|
---|
| 1012 | Public attributes
|
---|
| 1013 |
|
---|
| 1014 | policy: CookiePolicy object
|
---|
| 1015 |
|
---|
| 1016 | """
|
---|
| 1017 |
|
---|
| 1018 | non_word_re = re.compile(r"\W")
|
---|
| 1019 | quote_re = re.compile(r"([\"\\])")
|
---|
| 1020 | strict_domain_re = re.compile(r"\.?[^.]*")
|
---|
| 1021 | domain_re = re.compile(r"[^.]*")
|
---|
| 1022 | dots_re = re.compile(r"^\.+")
|
---|
| 1023 |
|
---|
| 1024 | def __init__(self, policy=None):
|
---|
| 1025 | """
|
---|
| 1026 | See CookieJar.__doc__ for argument documentation.
|
---|
| 1027 |
|
---|
| 1028 | """
|
---|
| 1029 | if policy is None:
|
---|
| 1030 | policy = DefaultCookiePolicy()
|
---|
| 1031 | self._policy = policy
|
---|
| 1032 |
|
---|
| 1033 | self._cookies = {}
|
---|
| 1034 |
|
---|
| 1035 | # for __getitem__ iteration in pre-2.2 Pythons
|
---|
| 1036 | self._prev_getitem_index = 0
|
---|
| 1037 |
|
---|
def get_policy(self):
    """Return the policy object currently stored on this jar."""
    return self._policy
|
---|
| 1040 |
|
---|
def set_policy(self, policy):
    """Replace the policy object stored on this jar with policy."""
    self._policy = policy
|
---|
| 1043 |
|
---|
| 1044 | def _cookies_for_domain(self, domain, request):
|
---|
| 1045 | cookies = []
|
---|
| 1046 | if not self._policy.domain_return_ok(domain, request):
|
---|
| 1047 | return []
|
---|
| 1048 | debug("Checking %s for cookies to return", domain)
|
---|
| 1049 | cookies_by_path = self._cookies[domain]
|
---|
| 1050 | for path in cookies_by_path.keys():
|
---|
| 1051 | if not self._policy.path_return_ok(path, request):
|
---|
| 1052 | continue
|
---|
| 1053 | cookies_by_name = cookies_by_path[path]
|
---|
| 1054 | for cookie in cookies_by_name.values():
|
---|
| 1055 | if not self._policy.return_ok(cookie, request):
|
---|
| 1056 | debug(" not returning cookie")
|
---|
| 1057 | continue
|
---|
| 1058 | debug(" it's a match")
|
---|
| 1059 | cookies.append(cookie)
|
---|
| 1060 | return cookies
|
---|
| 1061 |
|
---|
def cookies_for_request(self, request):
    """Return a list of cookies to be returned to server.

    The returned list of cookie instances is sorted in the order they
    should appear in the Cookie: header for return to the server: longest
    path first, with cookies of equal path length kept in the order
    _cookies_for_request produced them.

    As a side effect the current time (whole seconds) is recorded on both
    the jar and its policy as _now.

    See add_cookie_header.__doc__ for the interface required of the
    request argument.

    New in version 0.1.10

    """
    self._policy._now = self._now = int(time.time())
    cookies = self._cookies_for_request(request)
    # add cookies in order of most specific (i.e. longest) path first;
    # the sort is stable, so a key function gives the same order as the
    # old comparison function did
    cookies.sort(key=lambda cookie: -len(cookie.path))
    return cookies
|
---|
| 1080 |
|
---|
| 1081 | def _cookies_for_request(self, request):
|
---|
| 1082 | """Return a list of cookies to be returned to server."""
|
---|
| 1083 | # this method still exists (alongside cookies_for_request) because it
|
---|
| 1084 | # is part of an implied protected interface for subclasses of cookiejar
|
---|
| 1085 | # XXX document that implied interface, or provide another way of
|
---|
| 1086 | # implementing cookiejars than subclassing
|
---|
| 1087 | cookies = []
|
---|
| 1088 | for domain in self._cookies.keys():
|
---|
| 1089 | cookies.extend(self._cookies_for_domain(domain, request))
|
---|
| 1090 | return cookies
|
---|
| 1091 |
|
---|
| 1092 | def _cookie_attrs(self, cookies):
|
---|
| 1093 | """Return a list of cookie-attributes to be returned to server.
|
---|
| 1094 |
|
---|
| 1095 | The $Version attribute is also added when appropriate (currently only
|
---|
| 1096 | once per request).
|
---|
| 1097 |
|
---|
| 1098 | >>> jar = CookieJar()
|
---|
| 1099 | >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False,
|
---|
| 1100 | ... "example.com", False, False,
|
---|
| 1101 | ... "/", False, False, None, True,
|
---|
| 1102 | ... None, None, {})
|
---|
| 1103 | >>> jar._cookie_attrs([ns_cookie])
|
---|
| 1104 | ['foo="bar"']
|
---|
| 1105 | >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False,
|
---|
| 1106 | ... ".example.com", True, False,
|
---|
| 1107 | ... "/", False, False, None, True,
|
---|
| 1108 | ... None, None, {})
|
---|
| 1109 | >>> jar._cookie_attrs([rfc2965_cookie])
|
---|
| 1110 | ['$Version=1', 'foo=bar', '$Domain="example.com"']
|
---|
| 1111 |
|
---|
| 1112 | """
|
---|
| 1113 | version_set = False
|
---|
| 1114 |
|
---|
| 1115 | attrs = []
|
---|
| 1116 | for cookie in cookies:
|
---|
| 1117 | # set version of Cookie header
|
---|
| 1118 | # XXX
|
---|
| 1119 | # What should it be if multiple matching Set-Cookie headers have
|
---|
| 1120 | # different versions themselves?
|
---|
| 1121 | # Answer: there is no answer; was supposed to be settled by
|
---|
| 1122 | # RFC 2965 errata, but that may never appear...
|
---|
| 1123 | version = cookie.version
|
---|
| 1124 | if not version_set:
|
---|
| 1125 | version_set = True
|
---|
| 1126 | if version > 0:
|
---|
| 1127 | attrs.append("$Version=%s" % version)
|
---|
| 1128 |
|
---|
| 1129 | # quote cookie value if necessary
|
---|
| 1130 | # (not for Netscape protocol, which already has any quotes
|
---|
| 1131 | # intact, due to the poorly-specified Netscape Cookie: syntax)
|
---|
| 1132 | if ((cookie.value is not None) and
|
---|
| 1133 | self.non_word_re.search(cookie.value) and version > 0):
|
---|
| 1134 | value = self.quote_re.sub(r"\\\1", cookie.value)
|
---|
| 1135 | else:
|
---|
| 1136 | value = cookie.value
|
---|
| 1137 |
|
---|
| 1138 | # add cookie-attributes to be returned in Cookie header
|
---|
| 1139 | if cookie.value is None:
|
---|
| 1140 | attrs.append(cookie.name)
|
---|
| 1141 | else:
|
---|
| 1142 | attrs.append("%s=%s" % (cookie.name, value))
|
---|
| 1143 | if version > 0:
|
---|
| 1144 | if cookie.path_specified:
|
---|
| 1145 | attrs.append('$Path="%s"' % cookie.path)
|
---|
| 1146 | if cookie.domain.startswith("."):
|
---|
| 1147 | domain = cookie.domain
|
---|
| 1148 | if (not cookie.domain_initial_dot and
|
---|
| 1149 | domain.startswith(".")):
|
---|
| 1150 | domain = domain[1:]
|
---|
| 1151 | attrs.append('$Domain="%s"' % domain)
|
---|
| 1152 | if cookie.port is not None:
|
---|
| 1153 | p = "$Port"
|
---|
| 1154 | if cookie.port_specified:
|
---|
| 1155 | p = p + ('="%s"' % cookie.port)
|
---|
| 1156 | attrs.append(p)
|
---|
| 1157 |
|
---|
| 1158 | return attrs
|
---|
| 1159 |
|
---|
def add_cookie_header(self, request):
    """Add correct Cookie: header to request (urllib2.Request object).

    The Cookie2 header is also added unless policy.hide_cookie2 is true.
    Neither header is added if the request already has one of that name.
    Expired cookies are cleared from the jar afterwards.

    The request object (usually a urllib2.Request instance) must support
    the methods get_full_url, get_host, is_unverifiable, get_type,
    has_header, get_header, header_items and add_unredirected_header, as
    documented by urllib2, and the port attribute (the port number).
    Actually, RequestUpgradeProcessor will automatically upgrade your
    Request object to one with has_header, get_header, header_items and
    add_unredirected_header, if it lacks those methods, for compatibility
    with pre-2.4 versions of urllib2.

    """
    debug("add_cookie_header")
    cookies = self.cookies_for_request(request)

    attrs = self._cookie_attrs(cookies)
    if attrs and not request.has_header("Cookie"):
        request.add_unredirected_header("Cookie", "; ".join(attrs))

    # if necessary, advertise that we know RFC 2965
    advertise = self._policy.rfc2965 and not self._policy.hide_cookie2
    if advertise:
        for cookie in cookies:
            if cookie.version == 1:
                continue
            if request.has_header("Cookie2"):
                continue
            request.add_unredirected_header("Cookie2", '$Version="1"')
            break

    self.clear_expired_cookies()
|
---|
| 1191 |
|
---|
| 1192 | def _normalized_cookie_tuples(self, attrs_set):
|
---|
| 1193 | """Return list of tuples containing normalised cookie information.
|
---|
| 1194 |
|
---|
| 1195 | attrs_set is the list of lists of key,value pairs extracted from
|
---|
| 1196 | the Set-Cookie or Set-Cookie2 headers.
|
---|
| 1197 |
|
---|
| 1198 | Tuples are name, value, standard, rest, where name and value are the
|
---|
| 1199 | cookie name and value, standard is a dictionary containing the standard
|
---|
| 1200 | cookie-attributes (discard, secure, version, expires or max-age,
|
---|
| 1201 | domain, path and port) and rest is a dictionary containing the rest of
|
---|
| 1202 | the cookie-attributes.
|
---|
| 1203 |
|
---|
| 1204 | """
|
---|
| 1205 | cookie_tuples = []
|
---|
| 1206 |
|
---|
| 1207 | boolean_attrs = "discard", "secure"
|
---|
| 1208 | value_attrs = ("version",
|
---|
| 1209 | "expires", "max-age",
|
---|
| 1210 | "domain", "path", "port",
|
---|
| 1211 | "comment", "commenturl")
|
---|
| 1212 |
|
---|
| 1213 | for cookie_attrs in attrs_set:
|
---|
| 1214 | name, value = cookie_attrs[0]
|
---|
| 1215 |
|
---|
| 1216 | # Build dictionary of standard cookie-attributes (standard) and
|
---|
| 1217 | # dictionary of other cookie-attributes (rest).
|
---|
| 1218 |
|
---|
| 1219 | # Note: expiry time is normalised to seconds since epoch. V0
|
---|
| 1220 | # cookies should have the Expires cookie-attribute, and V1 cookies
|
---|
| 1221 | # should have Max-Age, but since V1 includes RFC 2109 cookies (and
|
---|
| 1222 | # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
|
---|
| 1223 | # accept either (but prefer Max-Age).
|
---|
| 1224 | max_age_set = False
|
---|
| 1225 |
|
---|
| 1226 | bad_cookie = False
|
---|
| 1227 |
|
---|
| 1228 | standard = {}
|
---|
| 1229 | rest = {}
|
---|
| 1230 | for k, v in cookie_attrs[1:]:
|
---|
| 1231 | lc = k.lower()
|
---|
| 1232 | # don't lose case distinction for unknown fields
|
---|
| 1233 | if lc in value_attrs or lc in boolean_attrs:
|
---|
| 1234 | k = lc
|
---|
| 1235 | if k in boolean_attrs and v is None:
|
---|
| 1236 | # boolean cookie-attribute is present, but has no value
|
---|
| 1237 | # (like "discard", rather than "port=80")
|
---|
| 1238 | v = True
|
---|
| 1239 | if standard.has_key(k):
|
---|
| 1240 | # only first value is significant
|
---|
| 1241 | continue
|
---|
| 1242 | if k == "domain":
|
---|
| 1243 | if v is None:
|
---|
| 1244 | debug(" missing value for domain attribute")
|
---|
| 1245 | bad_cookie = True
|
---|
| 1246 | break
|
---|
| 1247 | # RFC 2965 section 3.3.3
|
---|
| 1248 | v = v.lower()
|
---|
| 1249 | if k == "expires":
|
---|
| 1250 | if max_age_set:
|
---|
| 1251 | # Prefer max-age to expires (like Mozilla)
|
---|
| 1252 | continue
|
---|
| 1253 | if v is None:
|
---|
| 1254 | debug(" missing or invalid value for expires "
|
---|
| 1255 | "attribute: treating as session cookie")
|
---|
| 1256 | continue
|
---|
| 1257 | if k == "max-age":
|
---|
| 1258 | max_age_set = True
|
---|
| 1259 | if v is None:
|
---|
| 1260 | debug(" missing value for max-age attribute")
|
---|
| 1261 | bad_cookie = True
|
---|
| 1262 | break
|
---|
| 1263 | try:
|
---|
| 1264 | v = int(v)
|
---|
| 1265 | except ValueError:
|
---|
| 1266 | debug(" missing or invalid (non-numeric) value for "
|
---|
| 1267 | "max-age attribute")
|
---|
| 1268 | bad_cookie = True
|
---|
| 1269 | break
|
---|
| 1270 | # convert RFC 2965 Max-Age to seconds since epoch
|
---|
| 1271 | # XXX Strictly you're supposed to follow RFC 2616
|
---|
| 1272 | # age-calculation rules. Remember that zero Max-Age is a
|
---|
| 1273 | # is a request to discard (old and new) cookie, though.
|
---|
| 1274 | k = "expires"
|
---|
| 1275 | v = self._now + v
|
---|
| 1276 | if (k in value_attrs) or (k in boolean_attrs):
|
---|
| 1277 | if (v is None and
|
---|
| 1278 | k not in ["port", "comment", "commenturl"]):
|
---|
| 1279 | debug(" missing value for %s attribute" % k)
|
---|
| 1280 | bad_cookie = True
|
---|
| 1281 | break
|
---|
| 1282 | standard[k] = v
|
---|
| 1283 | else:
|
---|
| 1284 | rest[k] = v
|
---|
| 1285 |
|
---|
| 1286 | if bad_cookie:
|
---|
| 1287 | continue
|
---|
| 1288 |
|
---|
| 1289 | cookie_tuples.append((name, value, standard, rest))
|
---|
| 1290 |
|
---|
| 1291 | return cookie_tuples
|
---|
| 1292 |
|
---|
def _cookie_from_cookie_tuple(self, tup, request):
    """Build a Cookie from one (name, value, standard, rest) tuple.

    standard is dict of standard cookie-attributes, rest is dict of the
    rest of them.  Attributes missing from standard are defaulted from the
    request (path, domain, port) or to fixed values.

    Returns None if the version cookie-attribute is present but cannot be
    converted to an integer.

    """
    name, value, standard, rest = tup
    lookup = standard.get

    domain = lookup("domain", Absent)
    path = lookup("path", Absent)
    port = lookup("port", Absent)
    expires = lookup("expires", Absent)

    # set the easy defaults
    version = lookup("version", None)
    if version is not None:
        try:
            version = int(version)
        except ValueError:
            return None  # invalid version, ignore cookie
    secure = lookup("secure", False)
    # (discard is also set if expires is Absent)
    discard = lookup("discard", False)
    comment = lookup("comment", None)
    comment_url = lookup("commenturl", None)

    # set default path
    if path is not Absent and path != "":
        path_specified = True
        path = escape_path(path)
    else:
        path_specified = False
        path = request_path(request)
        last_slash = path.rfind("/")
        if last_slash != -1:
            if version == 0:
                # Netscape spec parts company from reality here
                path = path[:last_slash]
            else:
                path = path[:last_slash + 1]
        if len(path) == 0:
            path = "/"

    # set default domain, remembering whether an explicit one started
    # with a dot before the dot is forced on
    if domain is Absent:
        domain_specified = False
        domain_initial_dot = False
        req_host, erhn = eff_request_host_lc(request)
        domain = erhn
    else:
        domain_specified = True
        domain_initial_dot = bool(domain.startswith("."))
        if not domain.startswith("."):
            domain = "." + domain

    # set default port
    if port is Absent:
        # No port attr present.  Cookie can be sent back on any port.
        port_specified = False
        port = None
    elif port is None:
        # Port attr present, but has no value: default to request port.
        # Cookie should then only be sent back on that port.
        port_specified = False
        port = request_port(request)
    else:
        port_specified = True
        port = re.sub(r"\s+", "", port)

    # set default expires and discard
    if expires is Absent:
        expires = None
        discard = True

    return Cookie(version,
                  name, value,
                  port, port_specified,
                  domain, domain_specified, domain_initial_dot,
                  path, path_specified,
                  secure,
                  expires,
                  discard,
                  comment,
                  comment_url,
                  rest)
|
---|
| 1374 |
|
---|
| 1375 | def _cookies_from_attrs_set(self, attrs_set, request):
|
---|
| 1376 | cookie_tuples = self._normalized_cookie_tuples(attrs_set)
|
---|
| 1377 |
|
---|
| 1378 | cookies = []
|
---|
| 1379 | for tup in cookie_tuples:
|
---|
| 1380 | cookie = self._cookie_from_cookie_tuple(tup, request)
|
---|
| 1381 | if cookie: cookies.append(cookie)
|
---|
| 1382 | return cookies
|
---|
| 1383 |
|
---|
| 1384 | def _process_rfc2109_cookies(self, cookies):
|
---|
| 1385 | if self._policy.rfc2109_as_netscape is None:
|
---|
| 1386 | rfc2109_as_netscape = not self._policy.rfc2965
|
---|
| 1387 | else:
|
---|
| 1388 | rfc2109_as_netscape = self._policy.rfc2109_as_netscape
|
---|
| 1389 | for cookie in cookies:
|
---|
| 1390 | if cookie.version == 1:
|
---|
| 1391 | cookie.rfc2109 = True
|
---|
| 1392 | if rfc2109_as_netscape:
|
---|
| 1393 | # treat 2109 cookies as Netscape cookies rather than
|
---|
| 1394 | # as RFC2965 cookies
|
---|
| 1395 | cookie.version = 0
|
---|
| 1396 |
|
---|
def _make_cookies(self, response, request):
    """Return a list of cookie objects built from the response headers.

    response: object whose info() method returns an object providing
        getheaders(name)
    request: passed through unchanged to self._cookies_from_attrs_set

    The Set-Cookie2 and Set-Cookie headers are read from the response.
    An empty list is returned straight away when, given the policy's
    rfc2965 and netscape settings, there is no usable header.  Cookies
    parsed from Set-Cookie headers are passed through
    self._process_rfc2109_cookies and, if the policy's rfc2965 setting is
    true, those that share (domain, path, name) with a cookie parsed from
    a Set-Cookie2 header are dropped.

    """
    # get cookie-attributes for RFC 2965 and Netscape protocols
    headers = response.info()
    rfc2965_hdrs = headers.getheaders("Set-Cookie2")
    ns_hdrs = headers.getheaders("Set-Cookie")

    rfc2965 = self._policy.rfc2965
    netscape = self._policy.netscape

    if ((not rfc2965_hdrs and not ns_hdrs) or
        (not ns_hdrs and not rfc2965) or
        (not rfc2965_hdrs and not netscape) or
        (not netscape and not rfc2965)):
        return []  # no relevant cookie headers: quick exit

    # The bare excepts below are deliberate: reraise_unmasked_exceptions()
    # (defined elsewhere in this module) gets first look at the exception;
    # if it returns, the headers are treated as having yielded no cookies.
    try:
        cookies = self._cookies_from_attrs_set(
            split_header_words(rfc2965_hdrs), request)
    except:
        reraise_unmasked_exceptions()
        cookies = []

    if ns_hdrs and netscape:
        try:
            # RFC 2109 and Netscape cookies
            ns_cookies = self._cookies_from_attrs_set(
                parse_ns_headers(ns_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            ns_cookies = []
        self._process_rfc2109_cookies(ns_cookies)

        # Look for Netscape cookies (from Set-Cookie headers) that match
        # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
        # For each match, keep the RFC 2965 cookie and ignore the Netscape
        # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
        # bundled in with the Netscape cookies for this purpose, which is
        # reasonable behaviour.
        if rfc2965:
            # Dict used purely as a membership table of RFC 2965 keys.
            lookup = {}
            for cookie in cookies:
                lookup[(cookie.domain, cookie.path, cookie.name)] = None

            # A real list (not a lazy filter object) so that the
            # emptiness test below stays meaningful.
            ns_cookies = [
                ns_cookie for ns_cookie in ns_cookies
                if (ns_cookie.domain, ns_cookie.path, ns_cookie.name)
                not in lookup]

        if ns_cookies:
            cookies.extend(ns_cookies)

    return cookies
|
---|
| 1449 |
|
---|
def make_cookies(self, response, request):
    """Return list of Cookie objects extracted from response object.

    The current time (whole seconds) is recorded on both the jar and its
    policy first.  Cookies whose expires value is set and is not later
    than that time are left out of the result.

    See extract_cookies.__doc__ for the interface required of the
    response and request arguments.

    """
    now = int(time.time())
    self._policy._now = now
    self._now = now

    unexpired = []
    for candidate in self._make_cookies(response, request):
        if candidate.expires is not None and candidate.expires <= self._now:
            continue
        unexpired.append(candidate)
    return unexpired
|
---|
| 1460 |
|
---|
def set_cookie_if_ok(self, cookie, request):
    """Store a cookie, provided the policy's set_ok method approves it.

    cookie: mechanize.Cookie instance
    request: see extract_cookies.__doc__ for the required interface

    The current time (whole seconds) is recorded on both the jar and its
    policy before the policy is consulted.

    """
    now = int(time.time())
    self._policy._now = now
    self._now = now

    if not self._policy.set_ok(cookie, request):
        return
    self.set_cookie(cookie)
|
---|
| 1472 |
|
---|
def set_cookie(self, cookie):
    """Set a cookie, without checking whether or not it should be set.

    cookie: mechanize.Cookie instance

    The cookie is stored in the nested mapping self._cookies under
    [cookie.domain][cookie.path][cookie.name]; missing intermediate
    levels are created, and an existing cookie under the same three keys
    is replaced.

    """
    # setdefault creates each missing level with a single lookup, in
    # place of the deprecated has_key test followed by an assignment.
    by_path = self._cookies.setdefault(cookie.domain, {})
    by_name = by_path.setdefault(cookie.path, {})
    by_name[cookie.name] = cookie
|
---|
| 1484 |
|
---|
def extract_cookies(self, response, request):
    """Extract cookies from response, where allowable given the request.

    Allowable Set-Cookie: and Set-Cookie2: headers found in the response
    are used to update the state of this object, subject to the approval
    of the policy.set_ok method.  A cookie whose expiry time is not in the
    future is treated as a request to delete the stored cookie with the
    same domain, path and name.

    The response object (usually the result of a call to
    mechanize.urlopen, or similar) should support an info method, which
    returns a mimetools.Message object (in fact, the 'mimetools.Message
    object' may be any object that provides a getheaders method).

    The request object (usually a urllib2.Request instance) must support
    the methods get_full_url, get_type, get_host, and is_unverifiable, as
    documented by urllib2, and the port attribute (the port number).  The
    request is used to set default values for cookie-attributes as well as
    for checking that the cookie is OK to be set.

    """
    debug("extract_cookies: %s", response.info())
    now = int(time.time())
    self._policy._now = now
    self._now = now

    for cookie in self._make_cookies(response, request):
        already_expired = (cookie.expires is not None and
                           cookie.expires <= self._now)
        if already_expired:
            # Expiry date in past is request to delete cookie.  This can't
            # be in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(cookie.domain, cookie.path, cookie.name)
            except KeyError:
                # Nothing stored under that key: nothing to delete.
                pass
            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                  cookie.domain, cookie.path, cookie.name)
            continue

        if self._policy.set_ok(cookie, request):
            debug(" setting cookie: %s", cookie)
            self.set_cookie(cookie)
|
---|
| 1521 |
|
---|
def clear(self, domain=None, path=None, name=None):
    """Clear some cookies.

    With no arguments, every cookie is removed.  With only domain, all
    cookies belonging to that domain are removed.  With domain and path,
    the cookies belonging to that path within the domain are removed.
    With all three, the single cookie with that name, path and domain is
    removed.

    Raises ValueError if name is given without both domain and path, or
    path is given without domain.

    Raises KeyError if no matching cookie exists.

    """
    store = self._cookies

    if name is not None:
        if domain is None or path is None:
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del store[domain][path][name]
        return

    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del store[domain][path]
        return

    if domain is not None:
        del store[domain]
        return

    # No selector at all: start again with an empty store.
    self._cookies = {}
|
---|
| 1548 |
|
---|
def clear_session_cookies(self):
    """Discard all session cookies.

    Discards all cookies held by object which had either no Max-Age or
    Expires cookie-attribute or an explicit Discard cookie-attribute, or
    which otherwise have ended up with a true discard attribute.  For
    interactive browsers, the end of a session usually corresponds to
    closing the browser window.

    Note that the save method won't save session cookies anyway, unless you
    ask otherwise by passing a true ignore_discard argument.

    """
    # Collect the cookies to drop before removing any of them, so that the
    # jar is never modified while it is being iterated over.
    session_cookies = [cookie for cookie in self if cookie.discard]
    for cookie in session_cookies:
        self.clear(cookie.domain, cookie.path, cookie.name)
|
---|
| 1565 |
|
---|
def clear_expired_cookies(self):
    """Discard all expired cookies.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the save
    method won't save expired cookies anyway (unless you ask otherwise by
    passing a true ignore_expires argument).

    """
    now = time.time()
    # Collect the cookies to drop before removing any of them, so that the
    # jar is never modified while it is being iterated over.
    expired = [cookie for cookie in self if cookie.is_expired(now)]
    for cookie in expired:
        self.clear(cookie.domain, cookie.path, cookie.name)
|
---|
| 1580 |
|
---|
def __getitem__(self, i):
    """Return the next cookie, allowing only sequential index access.

    Index 0 starts a fresh iteration over the jar; any other index must
    be exactly one more than the index of the previous call.

    Raises IndexError if the index is out of sequence, or if the
    underlying iteration is exhausted.

    """
    if i == 0:
        # (Re)start from the beginning of the jar.
        self._getitem_iterator = self.__iter__()
    else:
        if self._prev_getitem_index != i - 1:
            raise IndexError(
                "CookieJar.__getitem__ only supports sequential iteration")

    self._prev_getitem_index = i
    try:
        item = self._getitem_iterator.next()
    except StopIteration:
        raise IndexError()
    return item
|
---|
| 1591 |
|
---|
def __iter__(self):
    """Return a MappingIterator built from the jar's cookie mapping."""
    cookie_store = self._cookies
    return MappingIterator(cookie_store)
|
---|
| 1594 |
|
---|
def __len__(self):
    """Return number of contained cookies."""
    # Counted by walking the jar once.
    return sum([1 for _cookie in self])
|
---|
| 1600 |
|
---|
def __repr__(self):
    """Return '<CLASS[...]>' listing the repr() of every contained cookie."""
    parts = [repr(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__, ", ".join(parts))
|
---|
| 1605 |
|
---|
def __str__(self):
    """Return '<CLASS[...]>' listing the str() of every contained cookie."""
    parts = [str(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__, ", ".join(parts))
|
---|
| 1610 |
|
---|
| 1611 |
|
---|
class LoadError(Exception):
    """Raised when a cookies file is not in the format the loading class
    understands."""
|
---|
| 1613 |
|
---|
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    Additional methods

    save(filename=None, ignore_discard=False, ignore_expires=False)
    load(filename=None, ignore_discard=False, ignore_expires=False)
    revert(filename=None, ignore_discard=False, ignore_expires=False)

    Additional public attributes

    filename: filename for loading and saving cookies

    Additional public readable attributes

    delayload: request that cookies are lazily loaded from disk; this is only
     a hint since this only affects performance, not behaviour (unless the
     cookies on disk are changing); a CookieJar object may ignore it (in fact,
     only MSIECookieJar lazily loads cookies at the moment)

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        See FileCookieJar.__doc__ for argument documentation.

        Raises ValueError if filename is neither None nor string-like.

        Cookies are NOT loaded from the named file until either the load or
        revert method is called.

        """
        CookieJar.__init__(self, policy)
        if not (filename is None or isstringlike(filename)):
            raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        filename: name of file in which to save cookies
        ignore_discard: save even cookies set to be discarded
        ignore_expires: save even cookies that have expired

        The file is overwritten if it already exists, thus wiping all its
        cookies.  Saved cookies can be restored later using the load or revert
        methods.  If filename is not specified, self.filename is used; if
        self.filename is None, ValueError is raised.

        This implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Old cookies are kept unless overwritten by newly loaded ones.

        Arguments are as for .save().

        If filename is not specified, self.filename is used; if self.filename
        is None, ValueError is raised.  The named file must be in the format
        understood by the class, or LoadError will be raised.  This format will
        be identical to that written by the save method, unless the load format
        is not sufficiently well understood (as is the case for MSIECookieJar).

        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            # The actual parsing is delegated to self._really_load.
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        If filename is not specified, self.filename is used; if self.filename
        is None, ValueError is raised.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)

        # Keep a copy of the current cookies so they can be put back if
        # loading fails.
        saved_cookies = copy.deepcopy(self._cookies)
        self._cookies = {}
        try:
            self.load(filename, ignore_discard, ignore_expires)
        except (LoadError, IOError):
            self._cookies = saved_cookies
            raise
|
---|