source: py-scraping/mechanize/_opener.py@153

Last change on this file since 153 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 14.2 KB
1"""Integration with Python standard library module urllib2: OpenerDirector
2class.
3
4Copyright 2004-2006 John J Lee <jjl@pobox.com>
5
6This code is free software; you can redistribute it and/or modify it
7under the terms of the BSD or ZPL 2.1 licenses (see the file
8COPYING.txt included with the distribution).
9
10"""
11
12import os, urllib2, bisect, httplib, types, tempfile
13try:
14 import threading as _threading
15except ImportError:
16 import dummy_threading as _threading
17try:
18 set
19except NameError:
20 import sets
21 set = sets.Set
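
# Compatibility shims: dummy_threading is the stdlib stand-in for threading
# on interpreters built without thread support, and sets.Set backfills the
# set builtin, which only arrived in Python 2.4.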

import _file
import _http
from _request import Request
import _response
import _rfc3986
import _sockettimeout
import _upgrade
from _util import isstringlike


class ContentTooShortError(urllib2.URLError):
    def __init__(self, reason, result):
        urllib2.URLError.__init__(self, reason)
        self.result = result


def set_request_attr(req, name, value, default):
    try:
        getattr(req, name)
    except AttributeError:
        setattr(req, name, default)
    if value is not default:
        setattr(req, name, value)

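# A sketch of set_request_attr()'s effect (the request and attribute here
# are illustrative): the default is applied only when the attribute is
# missing, and value overrides it only when it differs from the default.
#
#     req = Request("http://example.com/")        # hypothetical request
#     set_request_attr(req, "visit", None, None)  # req.visit -> None
#     set_request_attr(req, "visit", True, None)  # req.visit -> True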

class OpenerDirector(urllib2.OpenerDirector):
    def __init__(self):
        urllib2.OpenerDirector.__init__(self)
        # really none of these are (sanely) public -- the lack of initial
        # underscore on some is just due to following urllib2
        self.process_response = {}
        self.process_request = {}
        self._any_request = {}
        self._any_response = {}
        self._handler_index_valid = True
        self._tempfiles = []

    def add_handler(self, handler):
        if handler in self.handlers:
            return
        # XXX why does self.handlers need to be sorted?
        bisect.insort(self.handlers, handler)
        handler.add_parent(self)
        self._handler_index_valid = False
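
    # A note on the XXX above: urllib2.BaseHandler orders handlers by their
    # handler_order attribute, so bisect.insort keeps self.handlers sorted
    # and gives handlers a deterministic invocation order.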

    def _maybe_reindex_handlers(self):
        if self._handler_index_valid:
            return

        handle_error = {}
        handle_open = {}
        process_request = {}
        process_response = {}
        any_request = set()
        any_response = set()
        unwanted = []

        for handler in self.handlers:
            added = False
            for meth in dir(handler):
                if meth in ["redirect_request", "do_open", "proxy_open"]:
                    # oops, coincidental match
                    continue

                if meth == "any_request":
                    any_request.add(handler)
                    added = True
                    continue
                elif meth == "any_response":
                    any_response.add(handler)
                    added = True
                    continue

                ii = meth.find("_")
                scheme = meth[:ii]
                condition = meth[ii + 1:]

                if condition.startswith("error"):
                    jj = meth[ii + 1:].find("_") + ii + 1
                    kind = meth[jj + 1:]
                    try:
                        kind = int(kind)
                    except ValueError:
                        pass
                    lookup = handle_error.setdefault(scheme, {})
                elif condition == "open":
                    kind = scheme
                    lookup = handle_open
                elif condition == "request":
                    kind = scheme
                    lookup = process_request
                elif condition == "response":
                    kind = scheme
                    lookup = process_response
                else:
                    continue

                lookup.setdefault(kind, set()).add(handler)
                added = True

            if not added:
                unwanted.append(handler)

        for handler in unwanted:
            self.handlers.remove(handler)

        # sort indexed methods
        # XXX could be cleaned up
        for lookup in [process_request, process_response]:
            for scheme, handlers in lookup.iteritems():
                lookup[scheme] = handlers
        for scheme, lookup in handle_error.iteritems():
            for code, handlers in lookup.iteritems():
                handlers = list(handlers)
                handlers.sort()
                lookup[code] = handlers
        for scheme, handlers in handle_open.iteritems():
            handlers = list(handlers)
            handlers.sort()
            handle_open[scheme] = handlers

        # cache the indexes
        self.handle_error = handle_error
        self.handle_open = handle_open
        self.process_request = process_request
        self.process_response = process_response
        self._any_request = any_request
        self._any_response = any_response

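    # Indexing sketch (the handler described is hypothetical): a handler
    # defining http_open, http_error_302, https_request and any_response
    # ends up in handle_open["http"], handle_error["http"][302],
    # process_request["https"] and _any_response respectively.  Note that
    # process_request/process_response keep sets here; open() sorts them
    # into lists at lookup time.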
    def _request(self, url_or_req, data, visit,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        if isstringlike(url_or_req):
            req = Request(url_or_req, data, visit=visit, timeout=timeout)
        else:
            # already a urllib2.Request or mechanize.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
            # XXX yuck
            set_request_attr(req, "visit", visit, None)
            set_request_attr(req, "timeout", timeout,
                             _sockettimeout._GLOBAL_DEFAULT_TIMEOUT)
        return req

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        req = self._request(fullurl, data, None, timeout)
        req_scheme = req.get_type()

        self._maybe_reindex_handlers()

        # pre-process request
        # XXX should we allow a Processor to change the URL scheme
        # of the request?
        request_processors = set(self.process_request.get(req_scheme, []))
        request_processors.update(self._any_request)
        request_processors = list(request_processors)
        request_processors.sort()
        for processor in request_processors:
            for meth_name in ["any_request", req_scheme + "_request"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    req = meth(req)

        # In Python >= 2.4, .open() supports processors already, so we must
        # call ._open() instead.
        urlopen = getattr(urllib2.OpenerDirector, "_open",
                          urllib2.OpenerDirector.open)
        response = urlopen(self, req, data)

        # post-process response
        response_processors = set(self.process_response.get(req_scheme, []))
        response_processors.update(self._any_response)
        response_processors = list(response_processors)
        response_processors.sort()
        for processor in response_processors:
            for meth_name in ["any_response", req_scheme + "_response"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    response = meth(req, response)

        return response

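    # Sketch of a processor that plugs into the loops above (the class and
    # header value are illustrative, not part of mechanize):
    #
    #     class UserAgentProcessor(urllib2.BaseHandler):
    #         def http_request(self, req):
    #             req.add_header("User-Agent", "example/1.0")
    #             return req
    #         https_request = http_request
    #
    #     opener = build_opener(UserAgentProcessor)
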
    def error(self, proto, *args):
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different than http
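            # For http[s], args is (request, response, code, msg, hdrs) as
            # passed in by the HTTP error processor, so args[2] below is the
            # numeric status code.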
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

    BLOCK_SIZE = 1024 * 8
    def retrieve(self, fullurl, filename=None, reporthook=None, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Returns (filename, headers).

        For remote objects, the default filename will refer to a temporary
        file. Temporary files are removed when the OpenerDirector.close()
        method is called.

        For file: URLs, at present the returned filename is None. This may
        change in future.

        If the actual number of bytes read is less than indicated by the
        Content-Length header, raises ContentTooShortError (a URLError
        subclass). The exception's .result attribute contains the (filename,
        headers) that would have been returned.

        """
        req = self._request(fullurl, data, False, timeout)
        scheme = req.get_type()
        fp = self.open(req)
        headers = fp.info()
        if filename is None and scheme == 'file':
            # XXX req.get_selector() seems broken here, return None,
            # pending sanity :-/
            return None, headers
            #return urllib.url2pathname(req.get_selector()), headers
        if filename:
            tfp = open(filename, 'wb')
        else:
            path = _rfc3986.urlsplit(req.get_full_url())[2]
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self._tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')

        result = filename, headers
        bs = self.BLOCK_SIZE
        size = -1
        read = 0
        blocknum = 0
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
        while 1:
            block = fp.read(bs)
            if block == "":
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: "
                "got only %i out of %i bytes" % (read, size),
                result
                )

        return result
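
    # retrieve() usage sketch (URL and hook are illustrative).  The
    # reporthook receives (block_number, block_size, total_size), with
    # total_size == -1 when no Content-Length header was seen:
    #
    #     def hook(blocknum, bs, size):
    #         print "block %d (%d bytes each, %d total)" % (blocknum, bs, size)
    #
    #     filename, headers = opener.retrieve("http://example.com/f",
    #                                         reporthook=hook)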

    def close(self):
        urllib2.OpenerDirector.close(self)

        # make it very obvious this object is no longer supposed to be used
        self.open = self.error = self.retrieve = self.add_handler = None

        if self._tempfiles:
            for filename in self._tempfiles:
                try:
                    os.unlink(filename)
                except OSError:
                    pass
            del self._tempfiles[:]


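# wrapped_open() processes and then re-raises because a urllib2.HTTPError is
# both an exception and, when error.fp is set, a valid response object: the
# response is run through process_response_object either way, and a processed
# error is re-raised for the caller to handle.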
def wrapped_open(urlopen, process_response_object, fullurl, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    success = True
    try:
        response = urlopen(fullurl, data, timeout)
    except urllib2.HTTPError, error:
        success = False
        if error.fp is None:  # not a response
            raise
        response = error

    if response is not None:
        response = process_response_object(response)

    if not success:
        raise response
    return response

class ResponseProcessingOpener(OpenerDirector):

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        def bound_open(fullurl, data=None,
                       timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
            return OpenerDirector.open(self, fullurl, data, timeout)
        return wrapped_open(
            bound_open, self.process_response_object, fullurl, data, timeout)

    def process_response_object(self, response):
        return response


class SeekableResponseOpener(ResponseProcessingOpener):
    def process_response_object(self, response):
        return _response.seek_wrapped_response(response)

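# SeekableResponseOpener wraps every response with seek_wrapped_response(),
# so callers can .seek(0) and re-read the body, which plain urllib2 response
# objects do not allow.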

class OpenerFactory:
    """This class's interface is quite likely to change."""

    default_classes = [
        # handlers
        urllib2.ProxyHandler,
        urllib2.UnknownHandler,
        _http.HTTPHandler,  # derived from new AbstractHTTPHandler
        _http.HTTPDefaultErrorHandler,
        _http.HTTPRedirectHandler,  # bugfixed
        urllib2.FTPHandler,
        _file.FileHandler,
        # processors
        _upgrade.HTTPRequestUpgradeProcessor,
        _http.HTTPCookieProcessor,
        _http.HTTPErrorProcessor,
        ]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(_http.HTTPSHandler)
    handlers = []
    replacement_handlers = []

    def __init__(self, klass=OpenerDirector):
        self.klass = klass

    def build_opener(self, *handlers):
        """Create an opener object from a list of handlers and processors.

        The opener will use several default handlers and processors, including
        support for HTTP and FTP.

        If any of the handlers passed as arguments are subclasses of the
        default handlers, the default handlers will not be used.

        """
        opener = self.klass()
        default_classes = list(self.default_classes)
        skip = []
        for klass in default_classes:
            for check in handlers:
                if type(check) == types.ClassType:
                    if issubclass(check, klass):
                        skip.append(klass)
                elif type(check) == types.InstanceType:
                    if isinstance(check, klass):
                        skip.append(klass)
        for klass in skip:
            default_classes.remove(klass)

        for klass in default_classes:
            opener.add_handler(klass())
        for h in handlers:
            if type(h) == types.ClassType:
                h = h()
            opener.add_handler(h)

        return opener


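# A sketch of build_opener()'s replacement rule (MyRedirectHandler is
# hypothetical, for illustration): passing a subclass of one of the
# default_classes suppresses that default, so the custom handler replaces
# it instead of running alongside it.
#
#     class MyRedirectHandler(_http.HTTPRedirectHandler):
#         pass  # a customised redirect policy would go here
#
#     opener = OpenerFactory().build_opener(MyRedirectHandler)
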
build_opener = OpenerFactory().build_opener

_opener = None
urlopen_lock = _threading.Lock()
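# urlopen() and urlretrieve() below share a single lazily-built global
# opener.  The double-checked locking is deliberate: the first test avoids
# taking the lock on every call, the second avoids building the opener twice.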
def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        urlopen_lock.acquire()
        try:
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    return _opener.open(url, data, timeout)

def urlretrieve(url, filename=None, reporthook=None, data=None,
                timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        urlopen_lock.acquire()
        try:
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    return _opener.retrieve(url, filename, reporthook, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener
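
# Module-level usage sketch (the URL is illustrative only):
#
#     import mechanize
#     response = mechanize.urlopen("http://example.com/")
#     print response.geturl(), response.info()
#
# install_opener() swaps in a custom opener for urlopen()/urlretrieve(),
# mirroring urllib2.install_opener.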