1 | """Integration with Python standard library module urllib2: OpenerDirector
|
---|
2 | class.
|
---|
3 |
|
---|
4 | Copyright 2004-2006 John J Lee <jjl@pobox.com>
|
---|
5 |
|
---|
6 | This code is free software; you can redistribute it and/or modify it
|
---|
7 | under the terms of the BSD or ZPL 2.1 licenses (see the file
|
---|
8 | COPYING.txt included with the distribution).
|
---|
9 |
|
---|
10 | """
|
---|
11 |
|
---|
12 | import os, urllib2, bisect, httplib, types, tempfile
|
---|
13 | try:
|
---|
14 | import threading as _threading
|
---|
15 | except ImportError:
|
---|
16 | import dummy_threading as _threading
|
---|
17 | try:
|
---|
18 | set
|
---|
19 | except NameError:
|
---|
20 | import sets
|
---|
21 | set = sets.Set
|
---|
22 |
|
---|
23 | import _file
|
---|
24 | import _http
|
---|
25 | from _request import Request
|
---|
26 | import _response
|
---|
27 | import _rfc3986
|
---|
28 | import _sockettimeout
|
---|
29 | import _upgrade
|
---|
30 | from _util import isstringlike
|
---|
31 |
|
---|
32 |
|
---|
class ContentTooShortError(urllib2.URLError):
    """URLError carrying the partial result of an incomplete retrieval.

    reason is passed on to urllib2.URLError unchanged; result is kept on
    the instance as the .result attribute so the caller can still get at
    whatever was produced before the error was detected.

    """

    def __init__(self, reason, result):
        # Explicit base-class call (rather than super()) keeps this working
        # whether or not URLError is a new-style class.
        urllib2.URLError.__init__(self, reason)
        self.result = result
37 |
|
---|
38 |
|
---|
def set_request_attr(req, name, value, default):
    """Give req an attribute called name, without clobbering explicit values.

    If req has no such attribute yet, it is first created with the value
    default.  After that, value is stored only when it is not the very
    same object as default (identity comparison), so passing the default
    never overwrites an attribute that was already set on the request.

    """
    # A fresh sentinel distinguishes "attribute missing" from any real
    # attribute value; three-argument getattr only swallows AttributeError.
    missing = object()
    if getattr(req, name, missing) is missing:
        setattr(req, name, default)
    if value is not default:
        setattr(req, name, value)
46 |
|
---|
47 |
|
---|
class OpenerDirector(urllib2.OpenerDirector):
    """urllib2.OpenerDirector with request/response processors and retrieve().

    Besides the handle_open / handle_error tables inherited from urllib2,
    handlers are indexed by their <scheme>_request, <scheme>_response,
    any_request and any_response methods, which open() calls before and
    after the actual fetch.  retrieve() downloads a URL to a file and
    close() removes any temporary files retrieve() created.

    """

    # Number of bytes asked for per read() call in retrieve().
    BLOCK_SIZE = 1024 * 8

    def __init__(self):
        urllib2.OpenerDirector.__init__(self)
        # really none of these are (sanely) public -- the lack of initial
        # underscore on some is just due to following urllib2
        self.process_response = {}
        self.process_request = {}
        # Sets, like the values _maybe_reindex_handlers() stores here.
        self._any_request = set()
        self._any_response = set()
        self._handler_index_valid = True
        # Names of temporary files created by retrieve(); removed in close().
        self._tempfiles = []

    def add_handler(self, handler):
        """Add handler (kept in sorted order) and invalidate the indexes.

        Does nothing if an equal handler is already registered.

        """
        if handler in self.handlers:
            return
        # XXX why does self.handlers need to be sorted?
        bisect.insort(self.handlers, handler)
        handler.add_parent(self)
        self._handler_index_valid = False

    def _maybe_reindex_handlers(self):
        """Rebuild the method-name indexes unless they are marked valid.

        Every attribute name of every handler is inspected.  Names of the
        form <scheme>_open, <scheme>_request, <scheme>_response and
        <scheme>_error[_<kind>], plus the literal names any_request and
        any_response, register the handler in the matching table.
        Handlers that register nowhere are removed from self.handlers.

        NOTE(review): in the code visible here _handler_index_valid is only
        ever set to False after __init__, so once a handler has been added
        the rebuild runs on every open().  That is left untouched on
        purpose: code elsewhere may change self.handlers directly and rely
        on the rebuild -- confirm before turning this into a real cache.

        """
        if self._handler_index_valid:
            return

        handle_error = {}
        handle_open = {}
        process_request = {}
        process_response = {}
        any_request = set()
        any_response = set()
        unwanted = []

        for handler in self.handlers:
            added = False
            for meth in dir(handler):
                if meth in ["redirect_request", "do_open", "proxy_open"]:
                    # oops, coincidental match
                    continue

                if meth == "any_request":
                    any_request.add(handler)
                    added = True
                    continue
                elif meth == "any_response":
                    any_response.add(handler)
                    added = True
                    continue

                ii = meth.find("_")
                if ii == -1:
                    # No underscore means no "<scheme>_<condition>" shape.
                    # Without this guard a name such as "open" would be
                    # sliced into scheme "ope" / condition "open".
                    continue
                scheme = meth[:ii]
                condition = meth[ii + 1:]

                if condition.startswith("error"):
                    # Whatever follows the second underscore is the error
                    # kind; numeric kinds are stored as ints.
                    jj = meth[ii + 1:].find("_") + ii + 1
                    kind = meth[jj + 1:]
                    try:
                        kind = int(kind)
                    except ValueError:
                        pass
                    lookup = handle_error.setdefault(scheme, {})
                elif condition == "open":
                    kind = scheme
                    lookup = handle_open
                elif condition == "request":
                    kind = scheme
                    lookup = process_request
                elif condition == "response":
                    kind = scheme
                    lookup = process_response
                else:
                    continue

                lookup.setdefault(kind, set()).add(handler)
                added = True

            if not added:
                unwanted.append(handler)

        for handler in unwanted:
            self.handlers.remove(handler)

        # The handle_error / handle_open tables are stored as sorted lists.
        # The processor tables stay as sets: open() merges them with the
        # any_* handlers and sorts the result itself.
        for lookup in handle_error.values():
            for code, handlers in lookup.items():
                handlers = list(handlers)
                handlers.sort()
                lookup[code] = handlers
        for scheme, handlers in handle_open.items():
            handlers = list(handlers)
            handlers.sort()
            handle_open[scheme] = handlers

        # cache the indexes
        self.handle_error = handle_error
        self.handle_open = handle_open
        self.process_request = process_request
        self.process_response = process_response
        self._any_request = any_request
        self._any_response = any_response

    def _request(self, url_or_req, data, visit,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Return a request object for url_or_req.

        A string-like argument is wrapped in a new Request.  Anything else
        is assumed to be a request object already: data (if not None) is
        added to it and its visit / timeout attributes are filled in via
        set_request_attr().

        """
        if isstringlike(url_or_req):
            req = Request(url_or_req, data, visit=visit, timeout=timeout)
        else:
            # already a urllib2.Request or mechanize.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
            # XXX yuck
            set_request_attr(req, "visit", visit, None)
            set_request_attr(req, "timeout", timeout,
                             _sockettimeout._GLOBAL_DEFAULT_TIMEOUT)
        return req

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Open fullurl (a URL string or request object); return the response.

        The request is passed through every matching request processor,
        fetched, and the response passed through every matching response
        processor.  Processors are selected by the scheme the request had
        before pre-processing.

        """
        req = self._request(fullurl, data, None, timeout)
        req_scheme = req.get_type()

        self._maybe_reindex_handlers()

        # pre-process request
        # XXX should we allow a Processor to change the URL scheme
        #   of the request?
        request_processors = set(self.process_request.get(req_scheme, []))
        request_processors.update(self._any_request)
        request_processors = list(request_processors)
        request_processors.sort()
        for processor in request_processors:
            for meth_name in ["any_request", req_scheme + "_request"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    req = meth(req)

        # In Python >= 2.4, .open() supports processors already, so we must
        # call ._open() instead.
        urlopen = getattr(urllib2.OpenerDirector, "_open",
                          urllib2.OpenerDirector.open)
        response = urlopen(self, req, data)

        # post-process response
        # (looked up only now, after the fetch, exactly as before)
        response_processors = set(self.process_response.get(req_scheme, []))
        response_processors.update(self._any_response)
        response_processors = list(response_processors)
        response_processors.sort()
        for processor in response_processors:
            for meth_name in ["any_response", req_scheme + "_response"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    response = meth(req, response)

        return response

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error* handlers.

        For 'http' and 'https' the handler table for 'http' is used and the
        third positional argument selects the http_error_<kind> method; if
        no handler returns a true result, the http_error_default chain is
        tried.  For any other proto the <proto>_error chain is called.

        Returns the first true result from the chain.  For non-HTTP
        protocols None is returned when there is no such result.

        """
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            # https is not different than http
            chain = self.handle_error['http']
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = True
            orig_args = args
        else:
            chain = self.handle_error
            meth_name = proto + '_error'
            http_err = False
        result = self._call_chain(chain, proto, meth_name, *args)
        if result:
            return result

        if http_err:
            return self._call_chain(
                chain, 'default', 'http_error_default', *orig_args)

    def retrieve(self, fullurl, filename=None, reporthook=None, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Returns (filename, headers).

        For remote objects, the default filename will refer to a temporary
        file.  Temporary files are removed when the OpenerDirector.close()
        method is called.

        For file: URLs, at present the returned filename is None.  This may
        change in future.

        If reporthook is given it is called as reporthook(blocknum,
        blocksize, totalsize) once before the first read and once after
        every block written; totalsize is -1 when there is no
        Content-Length header.

        If the actual number of bytes read is less than indicated by the
        Content-Length header, raises ContentTooShortError (a URLError
        subclass).  The exception's .result attribute contains the (filename,
        headers) that would have been returned.  This check is made whether
        or not a reporthook was supplied.

        """
        req = self._request(fullurl, data, False, timeout)
        scheme = req.get_type()
        fp = self.open(req)
        # try/finally (no "with") so the response and the output file are
        # closed even if reading, writing or the reporthook raises.
        try:
            headers = fp.info()
            if filename is None and scheme == 'file':
                # XXX req.get_selector() seems broken here, return None,
                #   pending sanity :-/
                return None, headers
                #return urllib.url2pathname(req.get_selector()), headers
            if filename:
                tfp = open(filename, 'wb')
            else:
                # Keep the URL path's extension on the temporary file.
                path = _rfc3986.urlsplit(req.get_full_url())[2]
                suffix = os.path.splitext(path)[1]
                fd, filename = tempfile.mkstemp(suffix)
                self._tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')

            try:
                result = filename, headers
                bs = self.BLOCK_SIZE
                size = -1
                read = 0
                blocknum = 0
                # Parsed regardless of reporthook: the short-read check
                # below depends on it.
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: "
                "got only %i out of %i bytes" % (read, size),
                result
                )

        return result

    def close(self):
        """Disable this opener and delete retrieve()'s temporary files."""
        urllib2.OpenerDirector.close(self)

        # make it very obvious this object is no longer supposed to be used
        self.open = self.error = self.retrieve = self.add_handler = None

        for filename in self._tempfiles:
            try:
                os.unlink(filename)
            except OSError:
                # Best effort: the file may already be gone.
                pass
        del self._tempfiles[:]
309 |
|
---|
310 |
|
---|
311 | def wrapped_open(urlopen, process_response_object, fullurl, data=None,
|
---|
312 | timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
|
---|
313 | success = True
|
---|
314 | try:
|
---|
315 | response = urlopen(fullurl, data, timeout)
|
---|
316 | except urllib2.HTTPError, error:
|
---|
317 | success = False
|
---|
318 | if error.fp is None: # not a response
|
---|
319 | raise
|
---|
320 | response = error
|
---|
321 |
|
---|
322 | if response is not None:
|
---|
323 | response = process_response_object(response)
|
---|
324 |
|
---|
325 | if not success:
|
---|
326 | raise response
|
---|
327 | return response
|
---|
328 |
|
---|
class ResponseProcessingOpener(OpenerDirector):
    """OpenerDirector whose open() routes responses through a hook.

    Subclasses override process_response_object(); the version here
    returns the response unchanged.

    """

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        # wrapped_open() needs a plain callable taking (url, data, timeout),
        # so bind the base-class open() to this instance.
        def base_open(url, payload=None,
                      wait=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
            return OpenerDirector.open(self, url, payload, wait)

        hook = self.process_response_object
        return wrapped_open(base_open, hook, fullurl, data, timeout)

    def process_response_object(self, response):
        """Return the object to hand back for response (here: itself)."""
        return response
341 |
|
---|
342 |
|
---|
class SeekableResponseOpener(ResponseProcessingOpener):
    """Opener that passes responses through _response.seek_wrapped_response()."""

    def process_response_object(self, response):
        wrapped = _response.seek_wrapped_response(response)
        return wrapped
346 |
|
---|
347 |
|
---|
class OpenerFactory:
    """This class's interface is quite likely to change."""

    # Classes instantiated by build_opener() unless a supplied handler
    # overrides them.
    default_classes = [
        # handlers
        urllib2.ProxyHandler,
        urllib2.UnknownHandler,
        _http.HTTPHandler,  # derived from new AbstractHTTPHandler
        _http.HTTPDefaultErrorHandler,
        _http.HTTPRedirectHandler,  # bugfixed
        urllib2.FTPHandler,
        _file.FileHandler,
        # processors
        _upgrade.HTTPRequestUpgradeProcessor,
        _http.HTTPCookieProcessor,
        _http.HTTPErrorProcessor,
        ]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(_http.HTTPSHandler)
    handlers = []
    replacement_handlers = []

    def __init__(self, klass=OpenerDirector):
        # klass is called with no arguments to create each new opener.
        self.klass = klass

    def build_opener(self, *handlers):
        """Create an opener object from a list of handlers and processors.

        The opener will use several default handlers and processors, including
        support for HTTP and FTP.

        If any of the handlers passed as arguments are subclasses of the
        default handlers, the default handlers will not be used.

        Each argument may be a handler instance or a handler class; classes
        are instantiated with no arguments.  Both old-style and new-style
        classes and instances are recognised.

        """
        # Old-style classes are instances of types.ClassType, new-style
        # classes of type.  Comparing type(x) against ClassType /
        # InstanceType alone would silently ignore new-style handlers.
        class_types = (types.ClassType, type)

        opener = self.klass()
        default_classes = list(self.default_classes)
        skip = []
        for klass in default_classes:
            for check in handlers:
                if isinstance(check, class_types):
                    overridden = issubclass(check, klass)
                else:
                    overridden = isinstance(check, klass)
                if overridden:
                    # Record each default once only: a second entry would
                    # make the remove() below raise ValueError.
                    skip.append(klass)
                    break
        for klass in skip:
            default_classes.remove(klass)

        for klass in default_classes:
            opener.add_handler(klass())
        for h in handlers:
            if isinstance(h, class_types):
                h = h()
            opener.add_handler(h)

        return opener
405 |
|
---|
406 |
|
---|
# Module-level build_opener(): the bound method of one shared OpenerFactory.
build_opener = OpenerFactory().build_opener

# Opener shared by urlopen() and urlretrieve(); None until one is built on
# first use or supplied through install_opener().
_opener = None
# Held while the shared opener is being created.
urlopen_lock = _threading.Lock()
def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Open url with the shared module-level opener and return the response.

    The shared opener is created with build_opener() on first use unless
    one has been installed already.

    """
    global _opener
    if _opener is None:
        # Check again once the lock is held, so that only one caller
        # builds the shared opener.
        urlopen_lock.acquire()
        try:
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    shared = _opener
    return shared.open(url, data, timeout)
421 |
|
---|
def urlretrieve(url, filename=None, reporthook=None, data=None,
                timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Call retrieve() on the shared module-level opener; return its result.

    The shared opener is created with build_opener() on first use unless
    one has been installed already.

    """
    global _opener
    if _opener is None:
        # Check again once the lock is held, so that only one caller
        # builds the shared opener.
        urlopen_lock.acquire()
        try:
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    shared = _opener
    return shared.retrieve(url, filename, reporthook, data, timeout)
433 |
|
---|
def install_opener(opener):
    """Make opener the shared opener used by urlopen() and urlretrieve()."""
    # Rebinds the module-level name; no locking is done here.
    global _opener
    _opener = opener