Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: py-scraping/mechanize/_mechanize.py@ 200

Last change on this file since 200 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 24.6 KB

Line
1	"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.
2
3	Copyright 2003-2006 John J. Lee <jjl@pobox.com>
4	Copyright 2003 Andy Lester (original Perl code)
5
6	This code is free software; you can redistribute it and/or modify it
7	under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
8	included with the distribution).
9
10	"""
11
12	import urllib2, copy, re, os, urllib
13
14
15	from _html import DefaultFactory
16	import _response
17	import _request
18	import _rfc3986
19	import _sockettimeout
20	from _useragent import UserAgentBase
21
22	__version__ = (0, 1, 11, None, None) # 0.1.11
23
class BrowserStateError(Exception):
    """Raised when the browser is in the wrong state for an operation."""


class LinkNotFoundError(Exception):
    """Raised when no link matches the requested criteria."""


class FormNotFoundError(Exception):
    """Raised when no form matches the requested criteria."""
27
28
29	def sanepathname2url(path):
30	urlpath = urllib.pathname2url(path)
31	if os.name == "nt" and urlpath.startswith("///"):
32	urlpath = urlpath[2:]
33	# XXX don't ask me about the mac...
34	return urlpath
35
36
class History:
    """Stack of (request, response) pairs previously visited by a browser.

    Though this will become public, the implied interface is not yet stable.

    """

    def __init__(self):
        # Used as a stack: entries are pushed and popped at the end (LIFO).
        self._history = []

    def add(self, request, response):
        """Push a (request, response) pair onto the history."""
        entry = (request, response)
        self._history.append(entry)

    def back(self, n, _response):
        """Pop entries off the history and return the last pair popped.

        n: number of steps to go back
        _response: the caller's current response; if it is None, one extra
          entry is popped for it, and popping continues past any entries whose
          stored response is None

        Returns the (request, response) pair that was popped last.

        Raises BrowserStateError if the history runs out first.

        """
        # XXX move Browser._response into this class?
        response = _response
        while True:
            if n <= 0 and response is not None:
                break
            try:
                request, response = self._history.pop()
            except IndexError:
                raise BrowserStateError("already at start of history")
            n -= 1
        return request, response

    def clear(self):
        """Forget all entries, without closing the stored responses."""
        self._history[:] = []

    def close(self):
        """Close every stored response, then forget all entries."""
        for _unused_request, stored_response in self._history:
            if stored_response is None:
                continue
            stored_response.close()
        self._history[:] = []
63
64
65	class HTTPRefererProcessor(urllib2.BaseHandler):
66	def http_request(self, request):
67	# See RFC 2616 14.36. The only times we know the source of the
68	# request URI has a URI associated with it are redirect, and
69	# Browser.click() / Browser.submit() / Browser.follow_link().
70	# Otherwise, it's the user's job to add any Referer header before
71	# .open()ing.
72	if hasattr(request, "redirect_dict"):
73	request = self.parent._add_referer_header(
74	request, origin_request=False)
75	return request
76
77	https_request = http_request
78
79
80	class Browser(UserAgentBase):
81	"""Browser-like class with support for history, forms and links.
82
83	BrowserStateError is raised whenever the browser is in the wrong state to
84	complete the requested operation - eg., when .back() is called when the
85	browser history is empty, or when .follow_link() is called when the current
86	response does not contain HTML data.
87
88	Public attributes:
89
90	request: current request (mechanize.Request or urllib2.Request)
91	form: currently selected form (see .select_form())
92
93	"""
94
95	handler_classes = copy.copy(UserAgentBase.handler_classes)
96	handler_classes["_referer"] = HTTPRefererProcessor
97	default_features = copy.copy(UserAgentBase.default_features)
98	default_features.append("_referer")
99
100	def __init__(self,
101	factory=None,
102	history=None,
103	request_class=None,
104	):
105	"""
106
107	Only named arguments should be passed to this constructor.
108
109	factory: object implementing the mechanize.Factory interface.
110	history: object implementing the mechanize.History interface. Note
111	this interface is still experimental and may change in future.
112	request_class: Request class to use. Defaults to mechanize.Request
113	by default for Pythons older than 2.4, urllib2.Request otherwise.
114
115	The Factory and History objects passed in are 'owned' by the Browser,
116	so they should not be shared across Browsers. In particular,
117	factory.set_response() should not be called except by the owning
118	Browser itself.
119
120	Note that the supplied factory's request_class is overridden by this
121	constructor, to ensure only one Request class is used.
122
123	"""
124	self._handle_referer = True
125
126	if history is None:
127	history = History()
128	self._history = history
129
130	if request_class is None:
131	if not hasattr(urllib2.Request, "add_unredirected_header"):
132	request_class = _request.Request
133	else:
134	request_class = urllib2.Request # Python >= 2.4
135
136	if factory is None:
137	factory = DefaultFactory()
138	factory.set_request_class(request_class)
139	self._factory = factory
140	self.request_class = request_class
141
142	self.request = None
143	self._set_response(None, False)
144
145	# do this last to avoid __getattr__ problems
146	UserAgentBase.__init__(self)
147
148	def close(self):
149	UserAgentBase.close(self)
150	if self._response is not None:
151	self._response.close()
152	if self._history is not None:
153	self._history.close()
154	self._history = None
155
156	# make use after .close easy to spot
157	self.form = None
158	self.request = self._response = None
159	self.request = self.response = self.set_response = None
160	self.geturl = self.reload = self.back = None
161	self.clear_history = self.set_cookie = self.links = self.forms = None
162	self.viewing_html = self.encoding = self.title = None
163	self.select_form = self.click = self.submit = self.click_link = None
164	self.follow_link = self.find_link = None
165
166	def set_handle_referer(self, handle):
167	"""Set whether to add Referer header to each request."""
168	self._set_handler("_referer", handle)
169	self._handle_referer = bool(handle)
170
171	def _add_referer_header(self, request, origin_request=True):
172	if self.request is None:
173	return request
174	scheme = request.get_type()
175	original_scheme = self.request.get_type()
176	if scheme not in ["http", "https"]:
177	return request
178	if not origin_request and not self.request.has_header("Referer"):
179	return request
180
181	if (self._handle_referer and
182	original_scheme in ["http", "https"] and
183	not (original_scheme == "https" and scheme != "https")):
184	# strip URL fragment (RFC 2616 14.36)
185	parts = _rfc3986.urlsplit(self.request.get_full_url())
186	parts = parts[:-1] + (None,)
187	referer = _rfc3986.urlunsplit(parts)
188	request.add_unredirected_header("Referer", referer)
189	return request
190
191	def open_novisit(self, url, data=None,
192	timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
193	"""Open a URL without visiting it.
194
195	Browser state (including request, response, history, forms and links)
196	is left unchanged by calling this function.
197
198	The interface is the same as for .open().
199
200	This is useful for things like fetching images.
201
202	See also .retrieve().
203
204	"""
205	return self._mech_open(url, data, visit=False, timeout=timeout)
206
207	def open(self, url, data=None,
208	timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
209	return self._mech_open(url, data, timeout=timeout)
210
211	def _mech_open(self, url, data=None, update_history=True, visit=None,
212	timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
213	try:
214	url.get_full_url
215	except AttributeError:
216	# string URL -- convert to absolute URL if required
217	scheme, authority = _rfc3986.urlsplit(url)[:2]
218	if scheme is None:
219	# relative URL
220	if self._response is None:
221	raise BrowserStateError(
222	"can't fetch relative reference: "
223	"not viewing any document")
224	url = _rfc3986.urljoin(self._response.geturl(), url)
225
226	request = self._request(url, data, visit, timeout)
227	visit = request.visit
228	if visit is None:
229	visit = True
230
231	if visit:
232	self._visit_request(request, update_history)
233
234	success = True
235	try:
236	response = UserAgentBase.open(self, request, data)
237	except urllib2.HTTPError, error:
238	success = False
239	if error.fp is None: # not a response
240	raise
241	response = error
242	## except (IOError, socket.error, OSError), error:
243	## # Yes, urllib2 really does raise all these :-((
244	## # See test_urllib2.py for examples of socket.gaierror and OSError,
245	## # plus note that FTPHandler raises IOError.
246	## # XXX I don't seem to have an example of exactly socket.error being
247	## # raised, only socket.gaierror...
248	## # I don't want to start fixing these here, though, since this is a
249	## # subclass of OpenerDirector, and it would break old code. Even in
250	## # Python core, a fix would need some backwards-compat. hack to be
251	## # acceptable.
252	## raise
253
254	if visit:
255	self._set_response(response, False)
256	response = copy.copy(self._response)
257	elif response is not None:
258	response = _response.upgrade_response(response)
259
260	if not success:
261	raise response
262	return response
263
264	def __str__(self):
265	text = []
266	text.append("<%s " % self.__class__.__name__)
267	if self._response:
268	text.append("visiting %s" % self._response.geturl())
269	else:
270	text.append("(not visiting a URL)")
271	if self.form:
272	text.append("\n selected form:\n %s\n" % str(self.form))
273	text.append(">")
274	return "".join(text)
275
276	def response(self):
277	"""Return a copy of the current response.
278
279	The returned object has the same interface as the object returned by
280	.open() (or urllib2.urlopen()).
281
282	"""
283	return copy.copy(self._response)
284
285	def open_local_file(self, filename):
286	path = sanepathname2url(os.path.abspath(filename))
287	url = 'file://' + path
288	return self.open(url)
289
290	def set_response(self, response):
291	"""Replace current response with (a copy of) response.
292
293	response may be None.
294
295	This is intended mostly for HTML-preprocessing.
296	"""
297	self._set_response(response, True)
298
299	def _set_response(self, response, close_current):
300	# sanity check, necessary but far from sufficient
301	if not (response is None or
302	(hasattr(response, "info") and hasattr(response, "geturl") and
303	hasattr(response, "read")
304	)
305	):
306	raise ValueError("not a response object")
307
308	self.form = None
309	if response is not None:
310	response = _response.upgrade_response(response)
311	if close_current and self._response is not None:
312	self._response.close()
313	self._response = response
314	self._factory.set_response(response)
315
316	def visit_response(self, response, request=None):
317	"""Visit the response, as if it had been .open()ed.
318
319	Unlike .set_response(), this updates history rather than replacing the
320	current response.
321	"""
322	if request is None:
323	request = _request.Request(response.geturl())
324	self._visit_request(request, True)
325	self._set_response(response, False)
326
327	def _visit_request(self, request, update_history):
328	if self._response is not None:
329	self._response.close()
330	if self.request is not None and update_history:
331	self._history.add(self.request, self._response)
332	self._response = None
333	# we want self.request to be assigned even if UserAgentBase.open
334	# fails
335	self.request = request
336
337	def geturl(self):
338	"""Get URL of current document."""
339	if self._response is None:
340	raise BrowserStateError("not viewing any document")
341	return self._response.geturl()
342
343	def reload(self):
344	"""Reload current document, and return response object."""
345	if self.request is None:
346	raise BrowserStateError("no URL has yet been .open()ed")
347	if self._response is not None:
348	self._response.close()
349	return self._mech_open(self.request, update_history=False)
350
351	def back(self, n=1):
352	"""Go back n steps in history, and return response object.
353
354	n: go back this number of steps (default 1 step)
355
356	"""
357	if self._response is not None:
358	self._response.close()
359	self.request, response = self._history.back(n, self._response)
360	self.set_response(response)
361	if not response.read_complete:
362	return self.reload()
363	return copy.copy(response)
364
365	def clear_history(self):
366	self._history.clear()
367
368	def set_cookie(self, cookie_string):
369	"""Request to set a cookie.
370
371	Note that it is NOT necessary to call this method under ordinary
372	circumstances: cookie handling is normally entirely automatic. The
373	intended use case is rather to simulate the setting of a cookie by
374	client script in a web page (e.g. JavaScript). In that case, use of
375	this method is necessary because mechanize currently does not support
376	JavaScript, VBScript, etc.
377
378	The cookie is added in the same way as if it had arrived with the
379	current response, as a result of the current request. This means that,
380	for example, if it is not appropriate to set the cookie based on the
381	current request, no cookie will be set.
382
383	The cookie will be returned automatically with subsequent responses
384	made by the Browser instance whenever that's appropriate.
385
386	cookie_string should be a valid value of the Set-Cookie header.
387
388	For example:
389
390	browser.set_cookie(
391	"sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
392
393	Currently, this method does not allow for adding RFC 2986 cookies.
394	This limitation will be lifted if anybody requests it.
395
396	"""
397	if self._response is None:
398	raise BrowserStateError("not viewing any document")
399	if self.request.get_type() not in ["http", "https"]:
400	raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
401	"transactions")
402	cookiejar = self._ua_handlers["_cookies"].cookiejar
403	response = self.response() # copy
404	headers = response.info()
405	headers["Set-cookie"] = cookie_string
406	cookiejar.extract_cookies(response, self.request)
407
408	def links(self, **kwds):
409	"""Return iterable over links (mechanize.Link objects)."""
410	if not self.viewing_html():
411	raise BrowserStateError("not viewing HTML")
412	links = self._factory.links()
413	if kwds:
414	return self._filter_links(links, **kwds)
415	else:
416	return links
417
418	def forms(self):
419	"""Return iterable over forms.
420
421	The returned form objects implement the ClientForm.HTMLForm interface.
422
423	"""
424	if not self.viewing_html():
425	raise BrowserStateError("not viewing HTML")
426	return self._factory.forms()
427
428	def global_form(self):
429	"""Return the global form object, or None if the factory implementation
430	did not supply one.
431
432	The "global" form object contains all controls that are not descendants
433	of any FORM element.
434
435	The returned form object implements the ClientForm.HTMLForm interface.
436
437	This is a separate method since the global form is not regarded as part
438	of the sequence of forms in the document -- mostly for
439	backwards-compatibility.
440
441	"""
442	if not self.viewing_html():
443	raise BrowserStateError("not viewing HTML")
444	return self._factory.global_form
445
446	def viewing_html(self):
447	"""Return whether the current response contains HTML data."""
448	if self._response is None:
449	raise BrowserStateError("not viewing any document")
450	return self._factory.is_html
451
452	def encoding(self):
453	if self._response is None:
454	raise BrowserStateError("not viewing any document")
455	return self._factory.encoding
456
457	def title(self):
458	r"""Return title, or None if there is no title element in the document.
459
460	Treatment of any tag children of attempts to follow Firefox and IE
461	(currently, tags are preserved).
462
463	"""
464	if not self.viewing_html():
465	raise BrowserStateError("not viewing HTML")
466	return self._factory.title
467
468	def select_form(self, name=None, predicate=None, nr=None):
469	"""Select an HTML form for input.
470
471	This is a bit like giving a form the "input focus" in a browser.
472
473	If a form is selected, the Browser object supports the HTMLForm
474	interface, so you can call methods like .set_value(), .set(), and
475	.click().
476
477	Another way to select a form is to assign to the .form attribute. The
478	form assigned should be one of the objects returned by the .forms()
479	method.
480
481	At least one of the name, predicate and nr arguments must be supplied.
482	If no matching form is found, mechanize.FormNotFoundError is raised.
483
484	If name is specified, then the form must have the indicated name.
485
486	If predicate is specified, then the form must match that function. The
487	predicate function is passed the HTMLForm as its single argument, and
488	should return a boolean value indicating whether the form matched.
489
490	nr, if supplied, is the sequence number of the form (where 0 is the
491	first). Note that control 0 is the first form matching all the other
492	arguments (if supplied); it is not necessarily the first control in the
493	form. The "global form" (consisting of all form controls not contained
494	in any FORM element) is considered not to be part of this sequence and
495	to have no name, so will not be matched unless both name and nr are
496	None.
497
498	"""
499	if not self.viewing_html():
500	raise BrowserStateError("not viewing HTML")
501	if (name is None) and (predicate is None) and (nr is None):
502	raise ValueError(
503	"at least one argument must be supplied to specify form")
504
505	global_form = self._factory.global_form
506	if nr is None and name is None and \
507	predicate is not None and predicate(global_form):
508	self.form = global_form
509	return
510
511	orig_nr = nr
512	for form in self.forms():
513	if name is not None and name != form.name:
514	continue
515	if predicate is not None and not predicate(form):
516	continue
517	if nr:
518	nr -= 1
519	continue
520	self.form = form
521	break # success
522	else:
523	# failure
524	description = []
525	if name is not None: description.append("name '%s'" % name)
526	if predicate is not None:
527	description.append("predicate %s" % predicate)
528	if orig_nr is not None: description.append("nr %d" % orig_nr)
529	description = ", ".join(description)
530	raise FormNotFoundError("no form matching " + description)
531
532	def click(self, args, *kwds):
533	"""See ClientForm.HTMLForm.click for documentation."""
534	if not self.viewing_html():
535	raise BrowserStateError("not viewing HTML")
536	request = self.form.click(args, *kwds)
537	return self._add_referer_header(request)
538
539	def submit(self, args, *kwds):
540	"""Submit current form.
541
542	Arguments are as for ClientForm.HTMLForm.click().
543
544	Return value is same as for Browser.open().
545
546	"""
547	return self.open(self.click(args, *kwds))
548
549	def click_link(self, link=None, **kwds):
550	"""Find a link and return a Request object for it.
551
552	Arguments are as for .find_link(), except that a link may be supplied
553	as the first argument.
554
555	"""
556	if not self.viewing_html():
557	raise BrowserStateError("not viewing HTML")
558	if not link:
559	link = self.find_link(**kwds)
560	else:
561	if kwds:
562	raise ValueError(
563	"either pass a Link, or keyword arguments, not both")
564	request = self.request_class(link.absolute_url)
565	return self._add_referer_header(request)
566
567	def follow_link(self, link=None, **kwds):
568	"""Find a link and .open() it.
569
570	Arguments are as for .click_link().
571
572	Return value is same as for Browser.open().
573
574	"""
575	return self.open(self.click_link(link, **kwds))
576
577	def find_link(self, **kwds):
578	"""Find a link in current page.
579
580	Links are returned as mechanize.Link objects.
581
582	# Return third link that .search()-matches the regexp "python"
583	# (by ".search()-matches", I mean that the regular expression method
584	# .search() is used, rather than .match()).
585	find_link(text_regex=re.compile("python"), nr=2)
586
587	# Return first http link in the current page that points to somewhere
588	# on python.org whose link text (after tags have been removed) is
589	# exactly "monty python".
590	find_link(text="monty python",
591	url_regex=re.compile("http.*python.org"))
592
593	# Return first link with exactly three HTML attributes.
594	find_link(predicate=lambda link: len(link.attrs) == 3)
595
596	Links include anchors (<a>), image maps (<area>), and frames (<frame>,
597	<iframe>).
598
599	All arguments must be passed by keyword, not position. Zero or more
600	arguments may be supplied. In order to find a link, all arguments
601	supplied must match.
602
603	If a matching link is not found, mechanize.LinkNotFoundError is raised.
604
605	text: link text between link tags: eg. <a href="blah">this bit</a> (as
606	returned by pullparser.get_compressed_text(), ie. without tags but
607	with opening tags "textified" as per the pullparser docs) must compare
608	equal to this argument, if supplied
609	text_regex: link text between tag (as defined above) must match the
610	regular expression object or regular expression string passed as this
611	argument, if supplied
612	name, name_regex: as for text and text_regex, but matched against the
613	name HTML attribute of the link tag
614	url, url_regex: as for text and text_regex, but matched against the
615	URL of the link tag (note this matches against Link.url, which is a
616	relative or absolute URL according to how it was written in the HTML)
617	tag: element name of opening tag, eg. "a"
618	predicate: a function taking a Link object as its single argument,
619	returning a boolean result, indicating whether the links
620	nr: matches the nth link that matches all other criteria (default 0)
621
622	"""
623	try:
624	return self._filter_links(self._factory.links(), **kwds).next()
625	except StopIteration:
626	raise LinkNotFoundError()
627
628	def __getattr__(self, name):
629	# pass through ClientForm / DOMForm methods and attributes
630	form = self.__dict__.get("form")
631	if form is None:
632	raise AttributeError(
633	"%s instance has no attribute %s (perhaps you forgot to "
634	".select_form()?)" % (self.__class__, name))
635	return getattr(form, name)
636
637	def _filter_links(self, links,
638	text=None, text_regex=None,
639	name=None, name_regex=None,
640	url=None, url_regex=None,
641	tag=None,
642	predicate=None,
643	nr=0
644	):
645	if not self.viewing_html():
646	raise BrowserStateError("not viewing HTML")
647
648	found_links = []
649	orig_nr = nr
650
651	for link in links:
652	if url is not None and url != link.url:
653	continue
654	if url_regex is not None and not re.search(url_regex, link.url):
655	continue
656	if (text is not None and
657	(link.text is None or text != link.text)):
658	continue
659	if (text_regex is not None and
660	(link.text is None or not re.search(text_regex, link.text))):
661	continue
662	if name is not None and name != dict(link.attrs).get("name"):
663	continue
664	if name_regex is not None:
665	link_name = dict(link.attrs).get("name")
666	if link_name is None or not re.search(name_regex, link_name):
667	continue
668	if tag is not None and tag != link.tag:
669	continue
670	if predicate is not None and not predicate(link):
671	continue
672	if nr:
673	nr -= 1
674	continue
675	yield link
676	nr = orig_nr

Note: See TracBrowser for help on using the repository browser.

Download in other formats: