source: py-scraping/mechanize/_msiecookiejar.py@ 151

Last change on this file since 151 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 14.4 KB
RevLine 
[106]1"""Microsoft Internet Explorer cookie loading on Windows.
2
3Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code)
4Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port)
5
6This code is free software; you can redistribute it and/or modify it
7under the terms of the BSD or ZPL 2.1 licenses (see the file
8COPYING.txt included with the distribution).
9
10"""
11
12# XXX names and comments are not great here
13
14import os, re, time, struct, logging
15if os.name == "nt":
16 import _winreg
17
18from _clientcookie import FileCookieJar, CookieJar, Cookie, \
19 MISSING_FILENAME_TEXT, LoadError
20
# Module-wide shortcut: the bound debug() method of the "mechanize" logger,
# called below as debug(format, *args).
debug = logging.getLogger("mechanize").debug
22
23
def regload(path, leaf):
    """Read one value from the current user's registry hive.

    path: registry key path, relative to HKEY_CURRENT_USER
    leaf: name of the value to read from that key

    Returns the value's data, or None if the value cannot be queried
    (QueryValueEx raised WindowsError).  Failure to open the key itself is
    not caught here and propagates to the caller.

    """
    # Only read access is needed; asking for KEY_ALL_ACCESS makes the open
    # fail for accounts that may read but not modify the key.
    key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0,
                          _winreg.KEY_READ)
    try:
        try:
            return _winreg.QueryValueEx(key, leaf)[0]
        except WindowsError:
            return None
    finally:
        # always release the handle, whether or not the query succeeded
        _winreg.CloseKey(key)
32
# 1970 Jan 01 00:00:00 expressed as a Win32 FILETIME
WIN32_EPOCH = 0x019db1ded53e8000


def epoch_time_offset_from_win32_filetime(filetime):
    """Convert from win32 filetime to seconds-since-epoch value.

    MSIE stores create and expire times as Win32 FILETIME, which is 64
    bits of 100 nanosecond intervals since Jan 01 1601.

    mechanize expects time in 32-bit value expressed in seconds since the
    epoch (Jan 01 1970).

    filetime: integer FILETIME value

    Returns the whole number of seconds between the epoch and filetime
    (any fractional second is discarded).

    Raises ValueError if filetime lies before the epoch.

    """
    if filetime < WIN32_EPOCH:
        raise ValueError("filetime (%d) is before epoch (%d)" %
                         (filetime, WIN32_EPOCH))

    # 10,000,000 intervals of 100 ns make up one second
    return (filetime - WIN32_EPOCH) // 10000000
50
def binary_to_char(c):
    """Return the two-digit, upper-case hex code of the single character c."""
    code = ord(c)
    return "%02X" % code
def binary_to_str(d):
    """Return the characters of d as one string of two-digit hex codes."""
    hex_codes = ["%02X" % ord(ch) for ch in d]
    return "".join(hex_codes)
53
class MSIEBase:
    """Reader for MSIE's index.dat cookie index and per-site cookie files.

    This class only implements loading.  It reads self.delayload and calls
    self.load() and CookieJar.set_cookie(self, ...), so it is meant to be
    combined with a cookie jar class that provides those (see
    MSIECookieJar).

    """
    # signature expected in the first 32 bytes of index.dat
    magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
    # 4-byte record signature that is skipped while scanning index.dat
    padding = "\x0d\xf0\xad\x0b"

    # splits the "domain/path" line of a cookie file into domain and path
    msie_domain_re = re.compile(r"^([^/]+)(/.*)$")
    cookie_re = re.compile("Cookie\\:.+\\@([\x21-\xFF]+).*?"
                           "(.+\\@[\x21-\xFF]+\\.txt)")

    # path under HKEY_CURRENT_USER from which to get location of index.dat
    reg_path = r"software\microsoft\windows" \
               r"\currentversion\explorer\shell folders"
    reg_key = "Cookies"

    def __init__(self):
        # domain -> (cookie_file, ignore_discard, ignore_expires) for cookie
        # files that have been found in the index but not read yet
        self._delayload_domains = {}

    def _delayload_domain(self, domain):
        """Load the pending cookie file for domain, if there is one.

        The domain stays registered as pending when reading fails with
        LoadError or IOError, and is removed once the file has been read.

        """
        # if necessary, lazily load cookies for this domain
        delayload_info = self._delayload_domains.get(domain)
        if delayload_info is not None:
            cookie_file, ignore_discard, ignore_expires = delayload_info
            try:
                self.load_cookie_data(cookie_file,
                                      ignore_discard, ignore_expires)
            except (LoadError, IOError):
                debug("error reading cookie file, skipping: %s", cookie_file)
            else:
                del self._delayload_domains[domain]

    def _parse_cookie_records(self, cookies_fh):
        """Parse cookie records from an open cookie file.

        Each record is nine lines: key, value, domain/path, flags, low and
        high halves of the expiry time, low and high halves of the creation
        time, and a "*" separator line.

        Parsing stops at end of file or at the first record that has an
        empty field, a non-numeric number field or a separator other than
        "*"; records read before that point are still returned.  Records
        whose domain/path line does not match msie_domain_re are skipped.

        Returns a list of dicts with keys KEY, VALUE, DOMAIN, PATH, FLAGS,
        HIXP, LOXP, HICREATE and LOCREATE.

        """
        def next_field():
            return cookies_fh.readline().rstrip()

        cookies = []
        while 1:
            key = cookies_fh.readline()
            if key == "":
                break  # end of file

            key = key.rstrip()
            value = next_field()
            domain_path = next_field()
            # flags, lo_expire, hi_expire, lo_create, hi_create -- kept as
            # text until the record is known to be complete
            raw_numbers = [next_field() for _unused in range(5)]
            sep = next_field()

            fields = [key, value, domain_path, sep] + raw_numbers
            if "" in fields or sep != "*":
                break

            try:
                numbers = [int(text) for text in raw_numbers]
            except ValueError:
                # garbled record: treat it like a truncated one
                break
            # 0x2000 bit of flags is for secure I think
            flags, lo_expire, hi_expire, lo_create, hi_create = numbers

            m = self.msie_domain_re.search(domain_path)
            if m:
                cookies.append({"KEY": key, "VALUE": value,
                                "DOMAIN": m.group(1), "PATH": m.group(2),
                                "FLAGS": flags, "HIXP": hi_expire,
                                "LOXP": lo_expire, "HICREATE": hi_create,
                                "LOCREATE": lo_create})
        return cookies

    def _load_cookies_from_file(self, filename):
        """Return the cookie records stored in the cookie file filename.

        See _parse_cookie_records for the shape of the result.

        """
        debug("Loading MSIE cookies file: %s", filename)

        cookies_fh = open(filename)
        try:
            return self._parse_cookie_records(cookies_fh)
        finally:
            cookies_fh.close()

    def load_cookie_data(self, filename,
                         ignore_discard=False, ignore_expires=False):
        r"""Load cookies from file containing actual cookie data.

        Old cookies are kept unless overwritten by newly loaded ones.

        You should not call this method if the delayload attribute is set.

        I think each of these files contain all cookies for one user, domain,
        and path.

        filename: file containing cookies -- usually found in a file like
         C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt
        ignore_discard: if false, cookies marked discard are not loaded
        ignore_expires: if false, expired cookies are not loaded

        """
        now = int(time.time())

        cookie_data = self._load_cookies_from_file(filename)

        for cookie in cookie_data:
            flags = cookie["FLAGS"]
            secure = ((flags & 0x2000) != 0)
            filetime = (cookie["HIXP"] << 32) + cookie["LOXP"]
            # NOTE(review): raises ValueError for an expiry time before
            # 1970, which no caller in this class catches -- confirm whether
            # such cookies should be skipped instead.
            expires = epoch_time_offset_from_win32_filetime(filetime)
            discard = expires < now
            domain = cookie["DOMAIN"]
            initial_dot = domain.startswith(".")
            # MSIE 5 does not record whether the domain cookie-attribute
            # was specified.
            # Assuming it wasn't is conservative, because with strict
            # domain matching this will match less frequently; with regular
            # Netscape tail-matching, this will match at exactly the same
            # times that domain_specified = True would.  It also means we
            # don't have to prepend a dot to achieve consistency with our
            # own & Mozilla's domain-munging scheme.
            domain_specified = initial_dot

            # assume path_specified is false
            # XXX is there other stuff in here? -- eg. comment, commentURL?
            c = Cookie(0,
                       cookie["KEY"], cookie["VALUE"],
                       None, False,
                       domain, domain_specified, initial_dot,
                       cookie["PATH"], False,
                       secure,
                       expires,
                       discard,
                       None,
                       None,
                       {"flags": flags})
            if not ignore_discard and c.discard:
                continue
            if not ignore_expires and c.is_expired(now):
                continue
            CookieJar.set_cookie(self, c)

    def load_from_registry(self, ignore_discard=False, ignore_expires=False,
                           username=None):
        """Load cookies from the INDEX.DAT found via the registry.

        The cookies directory is looked up under reg_path / reg_key in
        HKEY_CURRENT_USER.

        username: only required on win9x

        Raises LoadError if the registry value cannot be read.

        """
        cookies_dir = regload(self.reg_path, self.reg_key)
        if cookies_dir is None:
            raise LoadError("could not find MSIE cookies directory in "
                            "registry")
        filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT"))
        self.load(filename, ignore_discard, ignore_expires, username)

    def _really_load(self, index, filename, ignore_discard, ignore_expires,
                     username):
        """Scan an open index.dat and load (or register) its cookie files.

        index: index.dat file object, opened for binary reading
        filename: path of index.dat; cookie files are looked up in the
         same directory
        username: user whose cookies are wanted; defaults to the lower-cased
         USERNAME environment variable

        If self.delayload is false every cookie file found is read at once;
        otherwise it is recorded in self._delayload_domains for later.

        Raises LoadError if the file is too short, has an unexpected
        header, or contains a record this code does not recognise.

        """
        if username is None:
            username = os.environ['USERNAME'].lower()

        cookie_dir = os.path.dirname(filename)

        data = index.read(256)
        if len(data) != 256:
            raise LoadError("%s file is too short" % filename)

        # Cookies' index.dat file starts with 32 bytes of signature
        # followed by an offset to the first record, stored as a little-
        # endian DWORD.
        sig, size = data[:32], data[32:36]
        size = struct.unpack("<L", size)[0]

        # check that sig is valid
        if not self.magic_re.match(sig) or size != 0x4000:
            raise LoadError("%s ['%s' %s] does not seem to contain cookies" %
                            (str(filename), sig, size))

        # skip to start of first record
        index.seek(size, 0)

        sector = 128  # size of sector in bytes

        # The user name is matched literally, so escape any characters that
        # are special in a regular expression; build the pattern only once.
        user = re.escape(username)
        cookie_re = re.compile(
            "Cookie\\:%s\\@([\x21-\xFF]+).*?" % user +
            "(%s\\@[\x21-\xFF]+\\.txt)" % user, re.I)

        while 1:
            # Cookies are usually in two contiguous sectors, so read in two
            # sectors and adjust if not a Cookie.
            to_read = 2 * sector
            data = index.read(to_read)
            if len(data) != to_read:
                break

            # Each record starts with a 4-byte signature and a count
            # (little-endian DWORD) of sectors for the record.
            sig, size, data = data[:4], data[4:8], data[8:]
            size = struct.unpack("<L", size)[0]

            # bytes of this record beyond the two sectors already read
            to_read = (size - 2) * sector

            if sig != "URL ":
                if sig not in ("HASH", "LEAK",
                               self.padding, "\x00\x00\x00\x00"):
                    raise LoadError(
                        "unrecognized MSIE index.dat record: %s" %
                        binary_to_str(sig))
                if sig == "\x00\x00\x00\x00":
                    # assume we've got all the cookies, and stop
                    break
                if sig == self.padding:
                    continue
                # skip the rest of this record; a record shorter than the
                # two sectors already consumed means the index is corrupt
                if to_read < 0:
                    raise LoadError(
                        "%s contains a record with invalid size %s" %
                        (str(filename), size))
                if to_read:
                    index.seek(to_read, 1)
                continue

            # read in rest of record if necessary
            if size > 2:
                more_data = index.read(to_read)
                if len(more_data) != to_read:
                    break
                data = data + more_data

            m = cookie_re.search(data)
            if m:
                cookie_file = os.path.join(cookie_dir, m.group(2))
                if not self.delayload:
                    try:
                        self.load_cookie_data(cookie_file,
                                              ignore_discard, ignore_expires)
                    except (LoadError, IOError):
                        debug("error reading cookie file, skipping: %s",
                              cookie_file)
                else:
                    # keep only the host part as the delayload key
                    domain = m.group(1)
                    i = domain.find("/")
                    if i != -1:
                        domain = domain[:i]

                    self._delayload_domains[domain] = (
                        cookie_file, ignore_discard, ignore_expires)
298
299
class MSIECookieJar(MSIEBase, FileCookieJar):
    """Cookie jar backed by the Windows MSIE cookies database (read-only).

    Reads the cookie files of Microsoft Internet Explorer (MSIE) for
    Windows: version 5 on Windows NT, and version 6 on Windows XP and
    Windows 98.  Other configurations may also work, but are untested.

    Writing cookies in MSIE format is NOT supported.  Saved cookies use the
    usual Set-Cookie3 format, which a plain CookieJar can read back.  Never
    save to the filename the cookies were loaded from: doing so may clobber
    the MSIE cookies index file!

    Sharing Internet Explorer's cookies with mechanize looks like this (on
    Windows 9x or Windows ME a username must be passed to
    load_from_registry):

    cj = MSIECookieJar(delayload=1)
    # find cookies index file in registry and load cookies from it
    cj.load_from_registry()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    response = opener.open("http://example.com/")

    Iterating over a delayloaded instance does not read any cookies from
    disk; call read_all_cookies to force everything to be read.  The
    following methods iterate over self: clear_temporary_cookies,
    clear_expired_cookies, __len__, __repr__, __str__ and as_string.

    Methods added on top of FileCookieJar:

    load_from_registry(ignore_discard=False, ignore_expires=False,
                       username=None)
    load_cookie_data(filename, ignore_discard=False, ignore_expires=False)
    read_all_cookies()

    """
    def __init__(self, filename=None, delayload=False, policy=None):
        MSIEBase.__init__(self)
        FileCookieJar.__init__(self, filename, delayload, policy)

    def set_cookie(self, cookie):
        """Store cookie, first loading any pending cookies for its domain."""
        if self.delayload:
            # read the on-disk cookies first so the new cookie overrides them
            self._delayload_domain(cookie.domain)
        CookieJar.set_cookie(self, cookie)

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        # consider loaded domains as well as those still waiting to be read
        candidates = self._cookies.copy()
        candidates.update(self._delayload_domains)

        matched = []
        for domain in candidates.keys():
            matched.extend(self._cookies_for_domain(domain, request))
        return matched

    def _cookies_for_domain(self, domain, request):
        """Return the cookies of domain that may be sent with request."""
        if self._policy.domain_return_ok(domain, request):
            debug("Checking %s for cookies to return", domain)
            if self.delayload:
                self._delayload_domain(domain)
            return CookieJar._cookies_for_domain(self, domain, request)
        return []

    def read_all_cookies(self):
        """Eagerly read in all cookies."""
        if not self.delayload:
            return
        # iterate over a snapshot: loading a domain removes it from the dict
        pending = list(self._delayload_domains.keys())
        for domain in pending:
            self._delayload_domain(domain)

    def load(self, filename, ignore_discard=False, ignore_expires=False,
             username=None):
        """Load cookies from an MSIE 'index.dat' cookies index file.

        filename: full path to cookie index file; when None, self.filename
         is used, and ValueError is raised if that is None as well
        username: only required on win9x

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        index = open(filename, "rb")
        try:
            self._really_load(index, filename, ignore_discard, ignore_expires,
                              username)
        finally:
            index.close()
Note: See TracBrowser for help on using the repository browser.