source: py-scraping/mechanize/_response.py@ 182

Last change on this file since 182 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 17.5 KB
Line 
1"""Response classes.
2
3The seek_wrapper code is not used if you're using UserAgent with
4.set_seekable_responses(False), or if you're using the urllib2-level interface
5without SeekableProcessor or HTTPEquivProcessor. Class closeable_response is
6instantiated by some handlers (AbstractHTTPHandler), but the closeable_response
7interface is only depended upon by Browser-level code. Function
8upgrade_response is only used if you're using Browser or
9ResponseUpgradeProcessor.
10
11
12Copyright 2006 John J. Lee <jjl@pobox.com>
13
14This code is free software; you can redistribute it and/or modify it
15under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
16included with the distribution).
17
18"""
19
20import copy, mimetools
21from cStringIO import StringIO
22import urllib2
23
24
def len_of_seekable(file_):
    """Return the total length of a seekable file object.

    The length is found by seeking to the end and asking for the position,
    rather than by len(file_.getvalue()): doing the latter on every .read()
    from seek_wrapper would be O(N**2) in the number of .read()s.

    The file position is restored to where it was before the call.
    """
    origin = file_.tell()
    file_.seek(0, 2)  # whence=2: jump to the end
    try:
        length = file_.tell()
    finally:
        # always put the caller's position back, even if tell() failed
        file_.seek(origin)
    return length
34
35
36# XXX Andrew Dalke kindly sent me a similar class in response to my request on
37# comp.lang.python, which I then proceeded to lose. I wrote this class
38# instead, but I think he's released his code publicly since, could pinch the
39# tests from it, at least...
40
41# For testing seek_wrapper invariant (note that
42# test_urllib2.HandlerTest.test_seekable is expected to fail when this
43# invariant checking is turned on). The invariant checking is done by module
# ipdbc, which is available here:
45# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
46## from ipdbc import ContractBase
47## class seek_wrapper(ContractBase):
class seek_wrapper:
    """Adds a seek method to a file object.

    This is only designed for seeking on readonly file-like objects.

    Wrapped file-like object must have a read method.  The readline method is
    only supported if that method is present on the wrapped object.  The
    readlines method is always supported.  xreadlines and iteration are
    supported only for Python 2.2 and above.

    Public attributes:

    wrapped: the wrapped file object
    is_closed: true iff .close() has been called
    read_complete: true once a read from the wrapped object has hit its end
     (no longer updated after .close())

    WARNING: All other attributes of the wrapped object (ie. those that are not
    one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
    are passed through unaltered, which may or may not make sense for your
    particular file object.

    """
    # General strategy is to check that cache is full enough, then delegate to
    # the cache (self.__cache, which is a cStringIO.StringIO instance).  A seek
    # position (self.__pos) is maintained independently of the cache, in order
    # that a single cache may be shared between multiple seek_wrapper objects.
    # Copying using module copy shares the cache in this way.

    def __init__(self, wrapped):
        self.wrapped = wrapped
        # One-element lists, so that copies made by __copy__ share the same
        # flags (the list object is shared, and mutated in place).
        self.__read_complete_state = [False]
        self.__is_closed_state = [False]
        self.__have_readline = hasattr(self.wrapped, "readline")
        self.__cache = StringIO()
        self.__pos = 0  # seek position

    def invariant(self):
        # The end of the cache is always at the same place as the end of the
        # wrapped file (though the .tell() method is not required to be present
        # on wrapped file).
        return self.wrapped.tell() == len(self.__cache.getvalue())

    def close(self):
        """Close the wrapped object and record that .close() was called."""
        self.wrapped.close()
        self.is_closed = True

    def __getattr__(self, name):
        # Only called when normal lookup fails: serve the two shared flags,
        # then fall back to delegating to the wrapped object.
        if name == "is_closed":
            return self.__is_closed_state[0]
        elif name == "read_complete":
            return self.__read_complete_state[0]

        wrapped = self.__dict__.get("wrapped")
        # Compare against None rather than testing truthiness: a wrapped
        # object that happens to evaluate false must still be delegated to.
        if wrapped is not None:
            return getattr(wrapped, name)

        return getattr(self.__class__, name)

    def __setattr__(self, name, value):
        if name == "is_closed":
            self.__is_closed_state[0] = bool(value)
        elif name == "read_complete":
            # the flag is frozen once the wrapper has been closed
            if not self.is_closed:
                self.__read_complete_state[0] = bool(value)
        else:
            self.__dict__[name] = value

    def seek(self, offset, whence=0):
        """Move the seek position, reading from the wrapped file as needed.

        whence: 0 -- offset is an absolute position (must be >= 0)
                1 -- offset is relative to the current position
                2 -- offset is counted *backwards* from the end of the wrapped
                     file (must be >= 0); this forces a read of everything
                     that remains in the wrapped file

        Raises ValueError if offset is negative for whence 0 or 2, or if the
        destination would lie before the start of the file.  Seeking past the
        end is allowed, as with fseek().
        """
        assert whence in [0, 1, 2]

        # how much data, if any, do we need to read?
        if whence == 2:  # 2: relative to end of *wrapped* file
            if offset < 0:
                raise ValueError("negative seek offset")
            # since we don't know yet where the end of that file is, we must
            # read everything
            to_read = None
        else:
            if whence == 0:  # 0: absolute
                if offset < 0:
                    raise ValueError("negative seek offset")
                dest = offset
            else:  # 1: relative to current position
                dest = self.__pos + offset
                # It is the *destination* that must not be negative; comparing
                # the current position with the offset would reject valid
                # forward seeks and accept seeks to before the start.
                if dest < 0:
                    raise ValueError("seek to before start of file")
            end = len_of_seekable(self.__cache)
            to_read = max(dest - end, 0)

        if to_read != 0:
            self.__cache.seek(0, 2)
            if to_read is None:
                assert whence == 2
                self.__cache.write(self.wrapped.read())
                self.read_complete = True
                dest = self.__cache.tell() - offset
                if dest < 0:
                    # offset is larger than the whole file
                    raise ValueError("seek to before start of file")
                self.__pos = dest
            else:
                data = self.wrapped.read(to_read)
                if not data:
                    self.read_complete = True
                else:
                    self.__cache.write(data)
                # Don't raise an exception even if we've seek()ed past the end
                # of .wrapped, since fseek() doesn't complain in that case.
                # Also like fseek(), pretend we have seek()ed past the end,
                # i.e. not:
                #self.__pos = self.__cache.tell()
                # but rather:
                self.__pos = dest
        else:
            self.__pos = dest

    def tell(self):
        """Return the current seek position of this wrapper."""
        return self.__pos

    def __copy__(self):
        # The copy shares the cache and the state flags with the original, but
        # gets its own seek position (starting at 0, as set by __init__).
        cpy = self.__class__(self.wrapped)
        cpy.__cache = self.__cache
        cpy.__read_complete_state = self.__read_complete_state
        cpy.__is_closed_state = self.__is_closed_state
        return cpy

    def get_data(self):
        """Return the whole body, leaving the seek position unchanged."""
        pos = self.__pos
        try:
            self.seek(0)
            return self.read(-1)
        finally:
            self.__pos = pos

    def read(self, size=-1):
        """Read up to size bytes from the current position (all if size < 0).

        None and any negative size mean "read to the end", as for ordinary
        file objects.
        """
        if size is None or size < 0:
            size = -1
        pos = self.__pos
        end = len_of_seekable(self.__cache)
        available = end - pos

        # enough data already cached?
        if size != -1 and size <= available:
            self.__cache.seek(pos)
            self.__pos = pos + size
            return self.__cache.read(size)

        # no, so read sufficient data from wrapped file and cache it
        self.__cache.seek(0, 2)
        if size == -1:
            self.__cache.write(self.wrapped.read())
            self.read_complete = True
        else:
            to_read = size - available
            assert to_read > 0
            data = self.wrapped.read(to_read)
            if not data:
                self.read_complete = True
            else:
                self.__cache.write(data)
        self.__cache.seek(pos)

        data = self.__cache.read(size)
        self.__pos = self.__cache.tell()
        assert self.__pos == pos + len(data)
        return data

    def readline(self, size=-1):
        """Read one line (at most size bytes of it, if size >= 0).

        Raises NotImplementedError if the wrapped object has no readline.
        """
        if not self.__have_readline:
            raise NotImplementedError("no readline method on wrapped object")

        # line we're about to read might not be complete in the cache, so
        # read another line first
        pos = self.__pos
        self.__cache.seek(0, 2)
        data = self.wrapped.readline()
        if not data:
            self.read_complete = True
        else:
            self.__cache.write(data)
        self.__cache.seek(pos)

        data = self.__cache.readline()
        if size is not None and size >= 0:
            r = data[:size]
        else:
            r = data
        # Advance by what was actually returned: the line may be shorter than
        # size, in which case pos + size would overshoot.
        self.__pos = pos + len(r)
        return r

    def readlines(self, sizehint=-1):
        """Read the rest of the wrapped file and return lines from the cache."""
        pos = self.__pos
        self.__cache.seek(0, 2)
        self.__cache.write(self.wrapped.read())
        self.read_complete = True
        self.__cache.seek(pos)
        data = self.__cache.readlines(sizehint)
        self.__pos = self.__cache.tell()
        return data

    def __iter__(self):
        return self

    def next(self):
        line = self.readline()
        if line == "":
            raise StopIteration
        return line

    xreadlines = __iter__

    def __repr__(self):
        return ("<%s at %s whose wrapped object = %r>" %
                (self.__class__.__name__, hex(abs(id(self))), self.wrapped))
254
255
class response_seek_wrapper(seek_wrapper):
    """Seekable wrapper for response objects.

    On top of seek_wrapper, this supports copying response objects and
    replacing the response body data.

    """

    def __init__(self, wrapped):
        seek_wrapper.__init__(self, wrapped)
        # keep our own reference to the headers of the wrapped response
        self._headers = self.wrapped.info()

    def __copy__(self):
        duplicate = seek_wrapper.__copy__(self)
        # the copy gets its own (shallow) copy of the headers
        duplicate._headers = copy.copy(self.info())
        return duplicate

    # .info() and .geturl() are the only two urllib2 response methods that
    # seek_wrapper does not implement itself.  They are defined here
    # explicitly, instead of being left to seek_wrapper's __getattr__
    # delegation, so that the nasty dynamically-created HTTPError classes in
    # get_seek_wrapper_class() get the wrapped object's implementation, and
    # not HTTPError's.

    def info(self):
        """Return the headers object held by this wrapper."""
        return self._headers

    def geturl(self):
        """Return the URL reported by the wrapped response."""
        return self.wrapped.geturl()

    def set_data(self, data):
        """Replace the response body with data and rewind to the start."""
        # Read everything from the wrapped response and close it first.
        self.seek(0)
        self.read()
        self.close()
        # Swap in a fresh cache holding only the new data (the attribute is
        # seek_wrapper's name-mangled private cache).
        replacement = StringIO()
        self._seek_wrapper__cache = replacement
        replacement.write(data)
        self.seek(0)
292
293
class eoffile:
    """File-like object that always claims to be at end-of-file."""

    def read(self, size=-1):
        """Return the empty string, whatever size is requested."""
        return ""

    def readline(self, size=-1):
        """Return the empty string, whatever size is requested."""
        return ""

    def __iter__(self):
        return self

    def next(self):
        # An exhausted iterator must raise StopIteration.  Returning "" here
        # would make ``for line in obj`` yield empty strings forever.
        raise StopIteration

    def close(self):
        """Do nothing: there is no underlying resource to release."""
        pass
301
class eofresponse(eoffile):
    """An always-at-EOF file object that also carries response metadata.

    Stores the URL, headers, status code and status message it is given and
    exposes them through geturl(), info(), .code and .msg.
    """

    def __init__(self, url, headers, code, msg):
        self.code, self.msg = code, msg
        self._url, self._headers = url, headers

    def geturl(self):
        """Return the URL given at construction."""
        return self._url

    def info(self):
        """Return the headers object given at construction."""
        return self._headers
310
311
class closeable_response:
    """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().

    Only supports responses returned by mechanize.HTTPHandler.

    After .close(), the following methods are supported:

    .read()
    .readline()
    .info()
    .geturl()
    .__iter__()
    .next()
    .close()

    and the following attributes are supported:

    .code
    .msg

    Also supports pickling (but the stdlib currently does something to prevent
    it: http://python.org/sf/1144636).

    """
    # presence of this attr indicates is useable after .close()
    closeable_response = None

    def __init__(self, fp, headers, url, code, msg):
        self._set_fp(fp)
        self._headers = headers
        self._url = url
        self.code = code
        self.msg = msg

    def _set_fp(self, fp):
        # Cache fp's methods as instance attributes so that calls go straight
        # to the underlying file object.
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"):
            self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            self.fileno = lambda: None
        self.__iter__ = self.fp.__iter__
        self.next = self.fp.next

    def __repr__(self):
        return '<%s at %s whose fp = %r>' % (
            self.__class__.__name__, hex(abs(id(self))), self.fp)

    def info(self):
        """Return the headers object given at construction."""
        return self._headers

    def geturl(self):
        """Return the URL given at construction."""
        return self._url

    def close(self):
        """Close the underlying file and switch to an always-EOF stand-in."""
        wrapped = self.fp
        wrapped.close()
        new_wrapped = eofresponse(
            self._url, self._headers, self.code, self.msg)
        self._set_fp(new_wrapped)

    def __getstate__(self):
        # There are three obvious options here:
        # 1. truncate
        # 2. read to end
        # 3. close socket, pickle state including read position, then open
        #    again on unpickle and use Range header
        # XXXX um, 4. refuse to pickle unless .close()d.  This is better,
        # actually ("errors should never pass silently").  Pickling doesn't
        # work anyway ATM, because of http://python.org/sf/1144636 so fix
        # this later

        # 2 breaks pickle protocol, because one expects the original object
        # to be left unscathed by pickling.  3 is too complicated and
        # surprising (and too much work ;-) to happen in a sane __getstate__.
        # So we do 1.

        state = self.__dict__.copy()
        # The methods cached by _set_fp() are bound to the live fp; leave them
        # out of the state and let __setstate__ rebind them.
        for name in ("read", "readline", "readlines", "fileno",
                     "__iter__", "next"):
            state.pop(name, None)
        # Truncate: the state must carry the EOF stand-in under "fp", the
        # attribute the rest of the class actually uses.
        state["fp"] = eofresponse(
            self._url, self._headers, self.code, self.msg)
        return state

    def __setstate__(self, state):
        # Restore the plain attributes, then rebind the cached file methods to
        # the restored fp.
        state = dict(state)
        fp = state.pop("fp")
        self.__dict__.update(state)
        self._set_fp(fp)
396
def test_response(data='test data', headers=[],
                  url="http://example.com/", code=200, msg="OK"):
    """Build a response object for use in tests.

    Takes the same arguments as make_response(), with defaults filled in for
    all of them, and returns whatever make_response() returns.
    """
    return make_response(
        data=data, headers=headers, url=url, code=code, msg=msg)
400
def test_html_response(data='test data', headers=[],
                       url="http://example.com/", code=200, msg="OK"):
    """Build a test response that carries a text/html Content-type header.

    Same as test_response(), except that ("Content-type", "text/html") is
    appended after the given headers.
    """
    # Build a new list instead of using +=, which would modify the caller's
    # list in place -- and the shared default list, so that every call made
    # without headers would pile up one more Content-type header.
    headers = list(headers) + [("Content-type", "text/html")]
    return make_response(data, headers, url, code, msg)
405
def make_response(data, headers, url, code, msg):
    """Convenient factory for objects implementing response interface.

    data: string containing response body data
    headers: sequence of (name, value) pairs
    url: URL of response
    code: integer response code (e.g. 200)
    msg: string response code message (e.g. "OK")

    The body is served from an in-memory file wrapped in a closeable_response,
    which is in turn wrapped in a response_seek_wrapper.

    """
    mime_headers = make_headers(headers)
    body = StringIO(data)
    closeable = closeable_response(body, mime_headers, url, code, msg)
    return response_seek_wrapper(closeable)
419
420
421def make_headers(headers):
422 """
423 headers: sequence of (name, value) pairs
424 """
425 hdr_text = []
426 for name_value in headers:
427 hdr_text.append("%s: %s" % name_value)
428 return mimetools.Message(StringIO("\n".join(hdr_text)))
429
430
# Rest of this module is especially horrible, but needed, at least until we
# fork urllib2.  Even then, may want to preserve urllib2 compatibility.
433
def get_seek_wrapper_class(response):
    """Return the wrapper class to use for making response seekable.

    This is response_seek_wrapper, except for urllib2.HTTPError instances that
    have no seek attribute: in order to wrap response objects that are also
    exceptions, we must dynamically subclass the exception :-(((
    """
    is_unwrapped_error = (isinstance(response, urllib2.HTTPError) and
                          not hasattr(response, "seek"))
    if not is_unwrapped_error:
        return response_seek_wrapper

    exc_class = response.__class__
    if exc_class.__module__ == "__builtin__":
        exc_class_name = exc_class.__name__
    else:
        exc_class_name = "%s.%s" % (exc_class.__module__, exc_class.__name__)

    class httperror_seek_wrapper(response_seek_wrapper, exc_class):
        # this only derives from HTTPError in order to be a subclass --
        # the HTTPError behaviour comes from delegation

        _exc_class_name = exc_class_name

        def __init__(self, wrapped):
            response_seek_wrapper.__init__(self, wrapped)
            # be compatible with undocumented HTTPError attributes :-(
            self.hdrs = wrapped.info()
            self.filename = wrapped.geturl()

        def __repr__(self):
            template = ("<%s (%s instance) at %s "
                        "whose wrapped object = %r>")
            return template % (
                self.__class__.__name__, self._exc_class_name,
                hex(abs(id(self))), self.wrapped)

    return httperror_seek_wrapper
468
def seek_wrapped_response(response):
    """Return a copy of response that supports seekable response interface.

    Accepts responses from both mechanize and urllib2 handlers.

    Copes with both ordinary response instances and HTTPError instances (which
    can't be simply wrapped due to the requirement of preserving the exception
    base class).

    A response that already has a seek attribute is returned as it is.
    """
    if hasattr(response, "seek"):
        seekable = response
    else:
        seekable = get_seek_wrapper_class(response)(response)
    assert hasattr(seekable, "get_data")
    return seekable
483
def upgrade_response(response):
    """Return a copy of response that supports Browser response interface.

    Browser response interface is that of "seekable responses"
    (response_seek_wrapper), plus the requirement that responses must be
    useable after .close() (closeable_response).

    Accepts responses from both mechanize and urllib2 handlers.

    Copes with both ordinary response instances and HTTPError instances (which
    can't be simply wrapped due to the requirement of preserving the exception
    base class).
    """
    wrapper_class = get_seek_wrapper_class(response)

    if hasattr(response, "closeable_response"):
        # already useable after .close(): make it seekable if it isn't yet,
        # then hand back a copy
        if not hasattr(response, "seek"):
            response = wrapper_class(response)
        assert hasattr(response, "get_data")
        return copy.copy(response)

    # a urllib2 handler constructed the response, i.e. the response is an
    # urllib.addinfourl or a urllib2.HTTPError, instead of a
    # _Util.closeable_response as returned by e.g. mechanize.HTTPHandler
    code = getattr(response, "code", None)
    msg = getattr(response, "msg", None)

    # may have already-.read() data from .seek() cache
    get_data = getattr(response, "get_data", None)
    data = get_data() if get_data else None

    closeable = closeable_response(
        response.fp, response.info(), response.geturl(), code, msg)
    upgraded = wrapper_class(closeable)
    if data:
        upgraded.set_data(data)
    return upgraded
Note: See TracBrowser for help on using the repository browser.