Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: py-scraping/mechanize/_mechanize.py@ 191

Last change on this file since 191 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 24.6 KB

Rev	Line
[106]	1	"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.
	2
	3	Copyright 2003-2006 John J. Lee <jjl@pobox.com>
	4	Copyright 2003 Andy Lester (original Perl code)
	5
	6	This code is free software; you can redistribute it and/or modify it
	7	under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
	8	included with the distribution).
	9
	10	"""
	11
	12	import urllib2, copy, re, os, urllib
	13
	14
	15	from _html import DefaultFactory
	16	import _response
	17	import _request
	18	import _rfc3986
	19	import _sockettimeout
	20	from _useragent import UserAgentBase
	21
	22	__version__ = (0, 1, 11, None, None) # 0.1.11
	23
# Raised when the Browser is in the wrong state to complete the requested
# operation (e.g. going back with an empty history, or asking for links when
# no document / no HTML document is being viewed).
class BrowserStateError(Exception): pass
# Raised by Browser.find_link() when no link matches the given criteria.
class LinkNotFoundError(Exception): pass
# Raised by Browser.select_form() when no form matches the given criteria.
class FormNotFoundError(Exception): pass
	27
	28
	29	def sanepathname2url(path):
	30	urlpath = urllib.pathname2url(path)
	31	if os.name == "nt" and urlpath.startswith("///"):
	32	urlpath = urlpath[2:]
	33	# XXX don't ask me about the mac...
	34	return urlpath
	35
	36
	37	class History:
	38	"""
	39
	40	Though this will become public, the implied interface is not yet stable.
	41
	42	"""
	43	def __init__(self):
	44	self._history = [] # LIFO
	45	def add(self, request, response):
	46	self._history.append((request, response))
	47	def back(self, n, _response):
	48	response = _response # XXX move Browser._response into this class?
	49	while n > 0 or response is None:
	50	try:
	51	request, response = self._history.pop()
	52	except IndexError:
	53	raise BrowserStateError("already at start of history")
	54	n -= 1
	55	return request, response
	56	def clear(self):
	57	del self._history[:]
	58	def close(self):
	59	for request, response in self._history:
	60	if response is not None:
	61	response.close()
	62	del self._history[:]
	63
	64
	65	class HTTPRefererProcessor(urllib2.BaseHandler):
	66	def http_request(self, request):
	67	# See RFC 2616 14.36. The only times we know the source of the
	68	# request URI has a URI associated with it are redirect, and
	69	# Browser.click() / Browser.submit() / Browser.follow_link().
	70	# Otherwise, it's the user's job to add any Referer header before
	71	# .open()ing.
	72	if hasattr(request, "redirect_dict"):
	73	request = self.parent._add_referer_header(
	74	request, origin_request=False)
	75	return request
	76
	77	https_request = http_request
	78
	79
	80	class Browser(UserAgentBase):
	81	"""Browser-like class with support for history, forms and links.
	82
	83	BrowserStateError is raised whenever the browser is in the wrong state to
	84	complete the requested operation - eg., when .back() is called when the
	85	browser history is empty, or when .follow_link() is called when the current
	86	response does not contain HTML data.
	87
	88	Public attributes:
	89
	90	request: current request (mechanize.Request or urllib2.Request)
	91	form: currently selected form (see .select_form())
	92
	93	"""
	94
	95	handler_classes = copy.copy(UserAgentBase.handler_classes)
	96	handler_classes["_referer"] = HTTPRefererProcessor
	97	default_features = copy.copy(UserAgentBase.default_features)
	98	default_features.append("_referer")
	99
	100	def __init__(self,
	101	factory=None,
	102	history=None,
	103	request_class=None,
	104	):
	105	"""
	106
	107	Only named arguments should be passed to this constructor.
	108
	109	factory: object implementing the mechanize.Factory interface.
	110	history: object implementing the mechanize.History interface. Note
	111	this interface is still experimental and may change in future.
	112	request_class: Request class to use. Defaults to mechanize.Request
	113	by default for Pythons older than 2.4, urllib2.Request otherwise.
	114
	115	The Factory and History objects passed in are 'owned' by the Browser,
	116	so they should not be shared across Browsers. In particular,
	117	factory.set_response() should not be called except by the owning
	118	Browser itself.
	119
	120	Note that the supplied factory's request_class is overridden by this
	121	constructor, to ensure only one Request class is used.
	122
	123	"""
	124	self._handle_referer = True
	125
	126	if history is None:
	127	history = History()
	128	self._history = history
	129
	130	if request_class is None:
	131	if not hasattr(urllib2.Request, "add_unredirected_header"):
	132	request_class = _request.Request
	133	else:
	134	request_class = urllib2.Request # Python >= 2.4
	135
	136	if factory is None:
	137	factory = DefaultFactory()
	138	factory.set_request_class(request_class)
	139	self._factory = factory
	140	self.request_class = request_class
	141
	142	self.request = None
	143	self._set_response(None, False)
	144
	145	# do this last to avoid __getattr__ problems
	146	UserAgentBase.__init__(self)
	147
	148	def close(self):
	149	UserAgentBase.close(self)
	150	if self._response is not None:
	151	self._response.close()
	152	if self._history is not None:
	153	self._history.close()
	154	self._history = None
	155
	156	# make use after .close easy to spot
	157	self.form = None
	158	self.request = self._response = None
	159	self.request = self.response = self.set_response = None
	160	self.geturl = self.reload = self.back = None
	161	self.clear_history = self.set_cookie = self.links = self.forms = None
	162	self.viewing_html = self.encoding = self.title = None
	163	self.select_form = self.click = self.submit = self.click_link = None
	164	self.follow_link = self.find_link = None
	165
	166	def set_handle_referer(self, handle):
	167	"""Set whether to add Referer header to each request."""
	168	self._set_handler("_referer", handle)
	169	self._handle_referer = bool(handle)
	170
	171	def _add_referer_header(self, request, origin_request=True):
	172	if self.request is None:
	173	return request
	174	scheme = request.get_type()
	175	original_scheme = self.request.get_type()
	176	if scheme not in ["http", "https"]:
	177	return request
	178	if not origin_request and not self.request.has_header("Referer"):
	179	return request
	180
	181	if (self._handle_referer and
	182	original_scheme in ["http", "https"] and
	183	not (original_scheme == "https" and scheme != "https")):
	184	# strip URL fragment (RFC 2616 14.36)
	185	parts = _rfc3986.urlsplit(self.request.get_full_url())
	186	parts = parts[:-1] + (None,)
	187	referer = _rfc3986.urlunsplit(parts)
	188	request.add_unredirected_header("Referer", referer)
	189	return request
	190
	191	def open_novisit(self, url, data=None,
	192	timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
	193	"""Open a URL without visiting it.
	194
	195	Browser state (including request, response, history, forms and links)
	196	is left unchanged by calling this function.
	197
	198	The interface is the same as for .open().
	199
	200	This is useful for things like fetching images.
	201
	202	See also .retrieve().
	203
	204	"""
	205	return self._mech_open(url, data, visit=False, timeout=timeout)
	206
	207	def open(self, url, data=None,
	208	timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
	209	return self._mech_open(url, data, timeout=timeout)
	210
	211	def _mech_open(self, url, data=None, update_history=True, visit=None,
	212	timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
	213	try:
	214	url.get_full_url
	215	except AttributeError:
	216	# string URL -- convert to absolute URL if required
	217	scheme, authority = _rfc3986.urlsplit(url)[:2]
	218	if scheme is None:
	219	# relative URL
	220	if self._response is None:
	221	raise BrowserStateError(
	222	"can't fetch relative reference: "
	223	"not viewing any document")
	224	url = _rfc3986.urljoin(self._response.geturl(), url)
	225
	226	request = self._request(url, data, visit, timeout)
	227	visit = request.visit
	228	if visit is None:
	229	visit = True
	230
	231	if visit:
	232	self._visit_request(request, update_history)
	233
	234	success = True
	235	try:
	236	response = UserAgentBase.open(self, request, data)
	237	except urllib2.HTTPError, error:
	238	success = False
	239	if error.fp is None: # not a response
	240	raise
	241	response = error
	242	## except (IOError, socket.error, OSError), error:
	243	## # Yes, urllib2 really does raise all these :-((
	244	## # See test_urllib2.py for examples of socket.gaierror and OSError,
	245	## # plus note that FTPHandler raises IOError.
	246	## # XXX I don't seem to have an example of exactly socket.error being
	247	## # raised, only socket.gaierror...
	248	## # I don't want to start fixing these here, though, since this is a
	249	## # subclass of OpenerDirector, and it would break old code. Even in
	250	## # Python core, a fix would need some backwards-compat. hack to be
	251	## # acceptable.
	252	## raise
	253
	254	if visit:
	255	self._set_response(response, False)
	256	response = copy.copy(self._response)
	257	elif response is not None:
	258	response = _response.upgrade_response(response)
	259
	260	if not success:
	261	raise response
	262	return response
	263
	264	def __str__(self):
	265	text = []
	266	text.append("<%s " % self.__class__.__name__)
	267	if self._response:
	268	text.append("visiting %s" % self._response.geturl())
	269	else:
	270	text.append("(not visiting a URL)")
	271	if self.form:
	272	text.append("\n selected form:\n %s\n" % str(self.form))
	273	text.append(">")
	274	return "".join(text)
	275
	276	def response(self):
	277	"""Return a copy of the current response.
	278
	279	The returned object has the same interface as the object returned by
	280	.open() (or urllib2.urlopen()).
	281
	282	"""
	283	return copy.copy(self._response)
	284
	285	def open_local_file(self, filename):
	286	path = sanepathname2url(os.path.abspath(filename))
	287	url = 'file://' + path
	288	return self.open(url)
	289
	290	def set_response(self, response):
	291	"""Replace current response with (a copy of) response.
	292
	293	response may be None.
	294
	295	This is intended mostly for HTML-preprocessing.
	296	"""
	297	self._set_response(response, True)
	298
	299	def _set_response(self, response, close_current):
	300	# sanity check, necessary but far from sufficient
	301	if not (response is None or
	302	(hasattr(response, "info") and hasattr(response, "geturl") and
	303	hasattr(response, "read")
	304	)
	305	):
	306	raise ValueError("not a response object")
	307
	308	self.form = None
	309	if response is not None:
	310	response = _response.upgrade_response(response)
	311	if close_current and self._response is not None:
	312	self._response.close()
	313	self._response = response
	314	self._factory.set_response(response)
	315
	316	def visit_response(self, response, request=None):
	317	"""Visit the response, as if it had been .open()ed.
	318
	319	Unlike .set_response(), this updates history rather than replacing the
	320	current response.
	321	"""
	322	if request is None:
	323	request = _request.Request(response.geturl())
	324	self._visit_request(request, True)
	325	self._set_response(response, False)
	326
	327	def _visit_request(self, request, update_history):
	328	if self._response is not None:
	329	self._response.close()
	330	if self.request is not None and update_history:
	331	self._history.add(self.request, self._response)
	332	self._response = None
	333	# we want self.request to be assigned even if UserAgentBase.open
	334	# fails
	335	self.request = request
	336
	337	def geturl(self):
	338	"""Get URL of current document."""
	339	if self._response is None:
	340	raise BrowserStateError("not viewing any document")
	341	return self._response.geturl()
	342
	343	def reload(self):
	344	"""Reload current document, and return response object."""
	345	if self.request is None:
	346	raise BrowserStateError("no URL has yet been .open()ed")
	347	if self._response is not None:
	348	self._response.close()
	349	return self._mech_open(self.request, update_history=False)
	350
	351	def back(self, n=1):
	352	"""Go back n steps in history, and return response object.
	353
	354	n: go back this number of steps (default 1 step)
	355
	356	"""
	357	if self._response is not None:
	358	self._response.close()
	359	self.request, response = self._history.back(n, self._response)
	360	self.set_response(response)
	361	if not response.read_complete:
	362	return self.reload()
	363	return copy.copy(response)
	364
	365	def clear_history(self):
	366	self._history.clear()
	367
	368	def set_cookie(self, cookie_string):
	369	"""Request to set a cookie.
	370
	371	Note that it is NOT necessary to call this method under ordinary
	372	circumstances: cookie handling is normally entirely automatic. The
	373	intended use case is rather to simulate the setting of a cookie by
	374	client script in a web page (e.g. JavaScript). In that case, use of
	375	this method is necessary because mechanize currently does not support
	376	JavaScript, VBScript, etc.
	377
	378	The cookie is added in the same way as if it had arrived with the
	379	current response, as a result of the current request. This means that,
	380	for example, if it is not appropriate to set the cookie based on the
	381	current request, no cookie will be set.
	382
	383	The cookie will be returned automatically with subsequent responses
	384	made by the Browser instance whenever that's appropriate.
	385
	386	cookie_string should be a valid value of the Set-Cookie header.
	387
	388	For example:
	389
	390	browser.set_cookie(
	391	"sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
	392
	393	Currently, this method does not allow for adding RFC 2986 cookies.
	394	This limitation will be lifted if anybody requests it.
	395
	396	"""
	397	if self._response is None:
	398	raise BrowserStateError("not viewing any document")
	399	if self.request.get_type() not in ["http", "https"]:
	400	raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
	401	"transactions")
	402	cookiejar = self._ua_handlers["_cookies"].cookiejar
	403	response = self.response() # copy
	404	headers = response.info()
	405	headers["Set-cookie"] = cookie_string
	406	cookiejar.extract_cookies(response, self.request)
	407
	408	def links(self, **kwds):
	409	"""Return iterable over links (mechanize.Link objects)."""
	410	if not self.viewing_html():
	411	raise BrowserStateError("not viewing HTML")
	412	links = self._factory.links()
	413	if kwds:
	414	return self._filter_links(links, **kwds)
	415	else:
	416	return links
	417
	418	def forms(self):
	419	"""Return iterable over forms.
	420
	421	The returned form objects implement the ClientForm.HTMLForm interface.
	422
	423	"""
	424	if not self.viewing_html():
	425	raise BrowserStateError("not viewing HTML")
	426	return self._factory.forms()
	427
	428	def global_form(self):
	429	"""Return the global form object, or None if the factory implementation
	430	did not supply one.
	431
	432	The "global" form object contains all controls that are not descendants
	433	of any FORM element.
	434
	435	The returned form object implements the ClientForm.HTMLForm interface.
	436
	437	This is a separate method since the global form is not regarded as part
	438	of the sequence of forms in the document -- mostly for
	439	backwards-compatibility.
	440
	441	"""
	442	if not self.viewing_html():
	443	raise BrowserStateError("not viewing HTML")
	444	return self._factory.global_form
	445
	446	def viewing_html(self):
	447	"""Return whether the current response contains HTML data."""
	448	if self._response is None:
	449	raise BrowserStateError("not viewing any document")
	450	return self._factory.is_html
	451
	452	def encoding(self):
	453	if self._response is None:
	454	raise BrowserStateError("not viewing any document")
	455	return self._factory.encoding
	456
	457	def title(self):
	458	r"""Return title, or None if there is no title element in the document.
	459
	460	Treatment of any tag children of attempts to follow Firefox and IE
	461	(currently, tags are preserved).
	462
	463	"""
	464	if not self.viewing_html():
	465	raise BrowserStateError("not viewing HTML")
	466	return self._factory.title
	467
	468	def select_form(self, name=None, predicate=None, nr=None):
	469	"""Select an HTML form for input.
	470
	471	This is a bit like giving a form the "input focus" in a browser.
	472
	473	If a form is selected, the Browser object supports the HTMLForm
	474	interface, so you can call methods like .set_value(), .set(), and
	475	.click().
	476
	477	Another way to select a form is to assign to the .form attribute. The
	478	form assigned should be one of the objects returned by the .forms()
	479	method.
	480
	481	At least one of the name, predicate and nr arguments must be supplied.
	482	If no matching form is found, mechanize.FormNotFoundError is raised.
	483
	484	If name is specified, then the form must have the indicated name.
	485
	486	If predicate is specified, then the form must match that function. The
	487	predicate function is passed the HTMLForm as its single argument, and
	488	should return a boolean value indicating whether the form matched.
	489
	490	nr, if supplied, is the sequence number of the form (where 0 is the
	491	first). Note that control 0 is the first form matching all the other
	492	arguments (if supplied); it is not necessarily the first control in the
	493	form. The "global form" (consisting of all form controls not contained
	494	in any FORM element) is considered not to be part of this sequence and
	495	to have no name, so will not be matched unless both name and nr are
	496	None.
	497
	498	"""
	499	if not self.viewing_html():
	500	raise BrowserStateError("not viewing HTML")
	501	if (name is None) and (predicate is None) and (nr is None):
	502	raise ValueError(
	503	"at least one argument must be supplied to specify form")
	504
	505	global_form = self._factory.global_form
	506	if nr is None and name is None and \
	507	predicate is not None and predicate(global_form):
	508	self.form = global_form
	509	return
	510
	511	orig_nr = nr
	512	for form in self.forms():
	513	if name is not None and name != form.name:
	514	continue
	515	if predicate is not None and not predicate(form):
	516	continue
	517	if nr:
	518	nr -= 1
	519	continue
	520	self.form = form
	521	break # success
	522	else:
	523	# failure
	524	description = []
	525	if name is not None: description.append("name '%s'" % name)
	526	if predicate is not None:
	527	description.append("predicate %s" % predicate)
	528	if orig_nr is not None: description.append("nr %d" % orig_nr)
	529	description = ", ".join(description)
	530	raise FormNotFoundError("no form matching " + description)
	531
	532	def click(self, args, *kwds):
	533	"""See ClientForm.HTMLForm.click for documentation."""
	534	if not self.viewing_html():
	535	raise BrowserStateError("not viewing HTML")
	536	request = self.form.click(args, *kwds)
	537	return self._add_referer_header(request)
	538
	539	def submit(self, args, *kwds):
	540	"""Submit current form.
	541
	542	Arguments are as for ClientForm.HTMLForm.click().
	543
	544	Return value is same as for Browser.open().
	545
	546	"""
	547	return self.open(self.click(args, *kwds))
	548
	549	def click_link(self, link=None, **kwds):
	550	"""Find a link and return a Request object for it.
	551
	552	Arguments are as for .find_link(), except that a link may be supplied
	553	as the first argument.
	554
	555	"""
	556	if not self.viewing_html():
	557	raise BrowserStateError("not viewing HTML")
	558	if not link:
	559	link = self.find_link(**kwds)
	560	else:
	561	if kwds:
	562	raise ValueError(
	563	"either pass a Link, or keyword arguments, not both")
	564	request = self.request_class(link.absolute_url)
	565	return self._add_referer_header(request)
	566
	567	def follow_link(self, link=None, **kwds):
	568	"""Find a link and .open() it.
	569
	570	Arguments are as for .click_link().
	571
	572	Return value is same as for Browser.open().
	573
	574	"""
	575	return self.open(self.click_link(link, **kwds))
	576
	577	def find_link(self, **kwds):
	578	"""Find a link in current page.
	579
	580	Links are returned as mechanize.Link objects.
	581
	582	# Return third link that .search()-matches the regexp "python"
	583	# (by ".search()-matches", I mean that the regular expression method
	584	# .search() is used, rather than .match()).
	585	find_link(text_regex=re.compile("python"), nr=2)
	586
	587	# Return first http link in the current page that points to somewhere
	588	# on python.org whose link text (after tags have been removed) is
	589	# exactly "monty python".
	590	find_link(text="monty python",
	591	url_regex=re.compile("http.*python.org"))
	592
	593	# Return first link with exactly three HTML attributes.
	594	find_link(predicate=lambda link: len(link.attrs) == 3)
	595
	596	Links include anchors (<a>), image maps (<area>), and frames (<frame>,
	597	<iframe>).
	598
	599	All arguments must be passed by keyword, not position. Zero or more
	600	arguments may be supplied. In order to find a link, all arguments
	601	supplied must match.
	602
	603	If a matching link is not found, mechanize.LinkNotFoundError is raised.
	604
	605	text: link text between link tags: eg. <a href="blah">this bit</a> (as
	606	returned by pullparser.get_compressed_text(), ie. without tags but
	607	with opening tags "textified" as per the pullparser docs) must compare
	608	equal to this argument, if supplied
	609	text_regex: link text between tag (as defined above) must match the
	610	regular expression object or regular expression string passed as this
	611	argument, if supplied
	612	name, name_regex: as for text and text_regex, but matched against the
	613	name HTML attribute of the link tag
	614	url, url_regex: as for text and text_regex, but matched against the
	615	URL of the link tag (note this matches against Link.url, which is a
	616	relative or absolute URL according to how it was written in the HTML)
	617	tag: element name of opening tag, eg. "a"
	618	predicate: a function taking a Link object as its single argument,
	619	returning a boolean result, indicating whether the links
	620	nr: matches the nth link that matches all other criteria (default 0)
	621
	622	"""
	623	try:
	624	return self._filter_links(self._factory.links(), **kwds).next()
	625	except StopIteration:
	626	raise LinkNotFoundError()
	627
	628	def __getattr__(self, name):
	629	# pass through ClientForm / DOMForm methods and attributes
	630	form = self.__dict__.get("form")
	631	if form is None:
	632	raise AttributeError(
	633	"%s instance has no attribute %s (perhaps you forgot to "
	634	".select_form()?)" % (self.__class__, name))
	635	return getattr(form, name)
	636
	637	def _filter_links(self, links,
	638	text=None, text_regex=None,
	639	name=None, name_regex=None,
	640	url=None, url_regex=None,
	641	tag=None,
	642	predicate=None,
	643	nr=0
	644	):
	645	if not self.viewing_html():
	646	raise BrowserStateError("not viewing HTML")
	647
	648	found_links = []
	649	orig_nr = nr
	650
	651	for link in links:
	652	if url is not None and url != link.url:
	653	continue
	654	if url_regex is not None and not re.search(url_regex, link.url):
	655	continue
	656	if (text is not None and
	657	(link.text is None or text != link.text)):
	658	continue
	659	if (text_regex is not None and
	660	(link.text is None or not re.search(text_regex, link.text))):
	661	continue
	662	if name is not None and name != dict(link.attrs).get("name"):
	663	continue
	664	if name_regex is not None:
	665	link_name = dict(link.attrs).get("name")
	666	if link_name is None or not re.search(name_regex, link_name):
	667	continue
	668	if tag is not None and tag != link.tag:
	669	continue
	670	if predicate is not None and not predicate(link):
	671	continue
	672	if nr:
	673	nr -= 1
	674	continue
	675	yield link
	676	nr = orig_nr

Note: See TracBrowser for help on using the repository browser.

Download in other formats: