Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: py-scraping/mechanize/_html.py@ 201

Last change on this file since 201 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 20.8 KB

Rev	Line
[106]	1	"""HTML handling.
	2
	3	Copyright 2003-2006 John J. Lee <jjl@pobox.com>
	4
	5	This code is free software; you can redistribute it and/or modify it under
	6	the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
	7	included with the distribution).
	8
	9	"""
	10
	11	import re, copy, htmlentitydefs
	12	import sgmllib, ClientForm
	13
	14	import _request
	15	from _headersutil import split_header_words, is_html as _is_html
	16	import _rfc3986
	17
# Encoding handed to EncodingFinder by default, i.e. the charset reported
# for a response when no Content-Type header names one.
DEFAULT_ENCODING = "latin-1"

# Matches a run of whitespace; used to collapse such runs to one space when
# normalising title and link text.
COMPRESS_RE = re.compile(r"\s+")
	21
	22
	23	# the base classe is purely for backwards compatibility
	24	class ParseError(ClientForm.ParseError): pass
	25
	26
	27	class CachingGeneratorFunction(object):
	28	"""Caching wrapper around a no-arguments iterable."""
	29
	30	def __init__(self, iterable):
	31	self._cache = []
	32	# wrap iterable to make it non-restartable (otherwise, repeated
	33	# __call__ would give incorrect results)
	34	self._iterator = iter(iterable)
	35
	36	def __call__(self):
	37	cache = self._cache
	38	for item in cache:
	39	yield item
	40	for item in self._iterator:
	41	cache.append(item)
	42	yield item
	43
	44
	45	class EncodingFinder:
	46	def __init__(self, default_encoding):
	47	self._default_encoding = default_encoding
	48	def encoding(self, response):
	49	# HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
	50	# headers may be in the response. HTTP-EQUIV headers come last,
	51	# so try in order from first to last.
	52	for ct in response.info().getheaders("content-type"):
	53	for k, v in split_header_words([ct])[0]:
	54	if k == "charset":
	55	return v
	56	return self._default_encoding
	57
	58	class ResponseTypeFinder:
	59	def __init__(self, allow_xhtml):
	60	self._allow_xhtml = allow_xhtml
	61	def is_html(self, response, encoding):
	62	ct_hdrs = response.info().getheaders("content-type")
	63	url = response.geturl()
	64	# XXX encoding
	65	return _is_html(ct_hdrs, url, self._allow_xhtml)
	66
	67
	68	# idea for this argument-processing trick is from Peter Otten
	69	class Args:
	70	def __init__(self, args_map):
	71	self.dictionary = dict(args_map)
	72	def __getattr__(self, key):
	73	try:
	74	return self.dictionary[key]
	75	except KeyError:
	76	return getattr(self.__class__, key)
	77
	78	def form_parser_args(
	79	select_default=False,
	80	form_parser_class=None,
	81	request_class=None,
	82	backwards_compat=False,
	83	):
	84	return Args(locals())
	85
	86
	87	class Link:
	88	def __init__(self, base_url, url, text, tag, attrs):
	89	assert None not in [url, tag, attrs]
	90	self.base_url = base_url
	91	self.absolute_url = _rfc3986.urljoin(base_url, url)
	92	self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
	93	def __cmp__(self, other):
	94	try:
	95	for name in "url", "text", "tag", "attrs":
	96	if getattr(self, name) != getattr(other, name):
	97	return - 1
	98	except AttributeError:
	99	return - 1
	100	return 0
	101	def __repr__(self):
	102	return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
	103	self.base_url, self.url, self.text, self.tag, self.attrs)
	104
	105
	106	class LinksFactory:
	107
	108	def __init__(self,
	109	link_parser_class=None,
	110	link_class=Link,
	111	urltags=None,
	112	):
	113	import _pullparser
	114	if link_parser_class is None:
	115	link_parser_class = _pullparser.TolerantPullParser
	116	self.link_parser_class = link_parser_class
	117	self.link_class = link_class
	118	if urltags is None:
	119	urltags = {
	120	"a": "href",
	121	"area": "href",
	122	"frame": "src",
	123	"iframe": "src",
	124	}
	125	self.urltags = urltags
	126	self._response = None
	127	self._encoding = None
	128
	129	def set_response(self, response, base_url, encoding):
	130	self._response = response
	131	self._encoding = encoding
	132	self._base_url = base_url
	133
	134	def links(self):
	135	"""Return an iterator that provides links of the document."""
	136	response = self._response
	137	encoding = self._encoding
	138	base_url = self._base_url
	139	p = self.link_parser_class(response, encoding=encoding)
	140
	141	try:
	142	for token in p.tags(*(self.urltags.keys() + ["base"])):
	143	if token.type == "endtag":
	144	continue
	145	if token.data == "base":
	146	base_href = dict(token.attrs).get("href")
	147	if base_href is not None:
	148	base_url = base_href
	149	continue
	150	attrs = dict(token.attrs)
	151	tag = token.data
	152	name = attrs.get("name")
	153	text = None
	154	# XXX use attr_encoding for ref'd doc if that doc does not
	155	# provide one by other means
	156	#attr_encoding = attrs.get("charset")
	157	url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
	158	if not url:
	159	# Probably an <A NAME="blah"> link or <AREA NOHREF...>.
	160	# For our purposes a link is something with a URL, so
	161	# ignore this.
	162	continue
	163
	164	url = _rfc3986.clean_url(url, encoding)
	165	if tag == "a":
	166	if token.type != "startendtag":
	167	# hmm, this'd break if end tag is missing
	168	text = p.get_compressed_text(("endtag", tag))
	169	# but this doesn't work for eg.
	170	# <a href="blah"><b>Andy</b></a>
	171	#text = p.get_compressed_text()
	172
	173	yield Link(base_url, url, text, tag, token.attrs)
	174	except sgmllib.SGMLParseError, exc:
	175	raise ParseError(exc)
	176
	177	class FormsFactory:
	178
	179	"""Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
	180
	181	After calling .forms(), the .global_form attribute is a form object
	182	containing all controls not a descendant of any FORM element.
	183
	184	For constructor argument docs, see ClientForm.ParseResponse
	185	argument docs.
	186
	187	"""
	188
	189	def __init__(self,
	190	select_default=False,
	191	form_parser_class=None,
	192	request_class=None,
	193	backwards_compat=False,
	194	):
	195	import ClientForm
	196	self.select_default = select_default
	197	if form_parser_class is None:
	198	form_parser_class = ClientForm.FormParser
	199	self.form_parser_class = form_parser_class
	200	if request_class is None:
	201	request_class = _request.Request
	202	self.request_class = request_class
	203	self.backwards_compat = backwards_compat
	204	self._response = None
	205	self.encoding = None
	206	self.global_form = None
	207
	208	def set_response(self, response, encoding):
	209	self._response = response
	210	self.encoding = encoding
	211	self.global_form = None
	212
	213	def forms(self):
	214	import ClientForm
	215	encoding = self.encoding
	216	try:
	217	forms = ClientForm.ParseResponseEx(
	218	self._response,
	219	select_default=self.select_default,
	220	form_parser_class=self.form_parser_class,
	221	request_class=self.request_class,
	222	encoding=encoding,
	223	_urljoin=_rfc3986.urljoin,
	224	_urlparse=_rfc3986.urlsplit,
	225	_urlunparse=_rfc3986.urlunsplit,
	226	)
	227	except ClientForm.ParseError, exc:
	228	raise ParseError(exc)
	229	self.global_form = forms[0]
	230	return forms[1:]
	231
	232	class TitleFactory:
	233	def __init__(self):
	234	self._response = self._encoding = None
	235
	236	def set_response(self, response, encoding):
	237	self._response = response
	238	self._encoding = encoding
	239
	240	def _get_title_text(self, parser):
	241	import _pullparser
	242	text = []
	243	tok = None
	244	while 1:
	245	try:
	246	tok = parser.get_token()
	247	except _pullparser.NoMoreTokensError:
	248	break
	249	if tok.type == "data":
	250	text.append(str(tok))
	251	elif tok.type == "entityref":
	252	t = unescape("&%s;" % tok.data,
	253	parser._entitydefs, parser.encoding)
	254	text.append(t)
	255	elif tok.type == "charref":
	256	t = unescape_charref(tok.data, parser.encoding)
	257	text.append(t)
	258	elif tok.type in ["starttag", "endtag", "startendtag"]:
	259	tag_name = tok.data
	260	if tok.type == "endtag" and tag_name == "title":
	261	break
	262	text.append(str(tok))
	263	return COMPRESS_RE.sub(" ", "".join(text).strip())
	264
	265	def title(self):
	266	import _pullparser
	267	p = _pullparser.TolerantPullParser(
	268	self._response, encoding=self._encoding)
	269	try:
	270	try:
	271	p.get_tag("title")
	272	except _pullparser.NoMoreTokensError:
	273	return None
	274	else:
	275	return self._get_title_text(p)
	276	except sgmllib.SGMLParseError, exc:
	277	raise ParseError(exc)
	278
	279
	280	def unescape(data, entities, encoding):
	281	if data is None or "&" not in data:
	282	return data
	283
	284	def replace_entities(match):
	285	ent = match.group()
	286	if ent[1] == "#":
	287	return unescape_charref(ent[2:-1], encoding)
	288
	289	repl = entities.get(ent[1:-1])
	290	if repl is not None:
	291	repl = unichr(repl)
	292	if type(repl) != type(""):
	293	try:
	294	repl = repl.encode(encoding)
	295	except UnicodeError:
	296	repl = ent
	297	else:
	298	repl = ent
	299	return repl
	300
	301	return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
	302
	303	def unescape_charref(data, encoding):
	304	name, base = data, 10
	305	if name.startswith("x"):
	306	name, base = name[1:], 16
	307	uc = unichr(int(name, base))
	308	if encoding is None:
	309	return uc
	310	else:
	311	try:
	312	repl = uc.encode(encoding)
	313	except UnicodeError:
	314	repl = "&#%s;" % data
	315	return repl
	316
	317
# bizarre import gymnastics for bundled BeautifulSoup
# (the statements below are kept in their original order on purpose)
import _beautifulsoup
import ClientForm
# Build the two BeautifulSoup-backed form parser classes from the bundled
# soup classes.
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# The replacement pattern accepts an optional lowercase "x" followed by hex
# digits in a character reference.
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
	326
	327	class MechanizeBs(_beautifulsoup.BeautifulSoup):
	328	_entitydefs = htmlentitydefs.name2codepoint
	329	# don't want the magic Microsoft-char workaround
	330	PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
	331	lambda(x):x.group(1) + ' />'),
	332	(re.compile('<!\s+([^<>]*)>'),
	333	lambda(x):'<!' + x.group(1) + '>')
	334	]
	335
	336	def __init__(self, encoding, text=None, avoidParserProblems=True,
	337	initialTextIsEverything=True):
	338	self._encoding = encoding
	339	_beautifulsoup.BeautifulSoup.__init__(
	340	self, text, avoidParserProblems, initialTextIsEverything)
	341
	342	def handle_charref(self, ref):
	343	t = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
	344	self.handle_data(t)
	345	def handle_entityref(self, ref):
	346	t = unescape("&%s;" % ref, self._entitydefs, self._encoding)
	347	self.handle_data(t)
	348	def unescape_attrs(self, attrs):
	349	escaped_attrs = []
	350	for key, val in attrs:
	351	val = unescape(val, self._entitydefs, self._encoding)
	352	escaped_attrs.append((key, val))
	353	return escaped_attrs
	354
	355	class RobustLinksFactory:
	356
	357	compress_re = COMPRESS_RE
	358
	359	def __init__(self,
	360	link_parser_class=None,
	361	link_class=Link,
	362	urltags=None,
	363	):
	364	if link_parser_class is None:
	365	link_parser_class = MechanizeBs
	366	self.link_parser_class = link_parser_class
	367	self.link_class = link_class
	368	if urltags is None:
	369	urltags = {
	370	"a": "href",
	371	"area": "href",
	372	"frame": "src",
	373	"iframe": "src",
	374	}
	375	self.urltags = urltags
	376	self._bs = None
	377	self._encoding = None
	378	self._base_url = None
	379
	380	def set_soup(self, soup, base_url, encoding):
	381	self._bs = soup
	382	self._base_url = base_url
	383	self._encoding = encoding
	384
	385	def links(self):
	386	import _beautifulsoup
	387	bs = self._bs
	388	base_url = self._base_url
	389	encoding = self._encoding
	390	gen = bs.recursiveChildGenerator()
	391	for ch in bs.recursiveChildGenerator():
	392	if (isinstance(ch, _beautifulsoup.Tag) and
	393	ch.name in self.urltags.keys() + ["base"]):
	394	link = ch
	395	attrs = bs.unescape_attrs(link.attrs)
	396	attrs_dict = dict(attrs)
	397	if link.name == "base":
	398	base_href = attrs_dict.get("href")
	399	if base_href is not None:
	400	base_url = base_href
	401	continue
	402	url_attr = self.urltags[link.name]
	403	url = attrs_dict.get(url_attr)
	404	if not url:
	405	continue
	406	url = _rfc3986.clean_url(url, encoding)
	407	text = link.fetchText(lambda t: True)
	408	if not text:
	409	# follow _pullparser's weird behaviour rigidly
	410	if link.name == "a":
	411	text = ""
	412	else:
	413	text = None
	414	else:
	415	text = self.compress_re.sub(" ", " ".join(text).strip())
	416	yield Link(base_url, url, text, link.name, attrs)
	417
	418
	419	class RobustFormsFactory(FormsFactory):
	420	def __init__(self, args, *kwds):
	421	args = form_parser_args(args, *kwds)
	422	if args.form_parser_class is None:
	423	args.form_parser_class = RobustFormParser
	424	FormsFactory.__init__(self, **args.dictionary)
	425
	426	def set_response(self, response, encoding):
	427	self._response = response
	428	self.encoding = encoding
	429
	430
	431	class RobustTitleFactory:
	432	def __init__(self):
	433	self._bs = self._encoding = None
	434
	435	def set_soup(self, soup, encoding):
	436	self._bs = soup
	437	self._encoding = encoding
	438
	439	def title(self):
	440	import _beautifulsoup
	441	title = self._bs.first("title")
	442	if title == _beautifulsoup.Null:
	443	return None
	444	else:
	445	inner_html = "".join([str(node) for node in title.contents])
	446	return COMPRESS_RE.sub(" ", inner_html.strip())
	447
	448
	449	class Factory:
	450	"""Factory for forms, links, etc.
	451
	452	This interface may expand in future.
	453
	454	Public methods:
	455
	456	set_request_class(request_class)
	457	set_response(response)
	458	forms()
	459	links()
	460
	461	Public attributes:
	462
	463	Note that accessing these attributes may raise ParseError.
	464
	465	encoding: string specifying the encoding of response if it contains a text
	466	document (this value is left unspecified for documents that do not have
	467	an encoding, e.g. an image file)
	468	is_html: true if response contains an HTML document (XHTML may be
	469	regarded as HTML too)
	470	title: page title, or None if no title or not HTML
	471	global_form: form object containing all controls that are not descendants
	472	of any FORM element, or None if the forms_factory does not support
	473	supplying a global form
	474
	475	"""
	476
	477	LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]
	478
	479	def __init__(self, forms_factory, links_factory, title_factory,
	480	encoding_finder=EncodingFinder(DEFAULT_ENCODING),
	481	response_type_finder=ResponseTypeFinder(allow_xhtml=False),
	482	):
	483	"""
	484
	485	Pass keyword arguments only.
	486
	487	default_encoding: character encoding to use if encoding cannot be
	488	determined (or guessed) from the response. You should turn on
	489	HTTP-EQUIV handling if you want the best chance of getting this right
	490	without resorting to this default. The default value of this
	491	parameter (currently latin-1) may change in future.
	492
	493	"""
	494	self._forms_factory = forms_factory
	495	self._links_factory = links_factory
	496	self._title_factory = title_factory
	497	self._encoding_finder = encoding_finder
	498	self._response_type_finder = response_type_finder
	499
	500	self.set_response(None)
	501
	502	def set_request_class(self, request_class):
	503	"""Set urllib2.Request class.
	504
	505	ClientForm.HTMLForm instances returned by .forms() will return
	506	instances of this class when .click()ed.
	507
	508	"""
	509	self._forms_factory.request_class = request_class
	510
	511	def set_response(self, response):
	512	"""Set response.
	513
	514	The response must either be None or implement the same interface as
	515	objects returned by urllib2.urlopen().
	516
	517	"""
	518	self._response = response
	519	self._forms_genf = self._links_genf = None
	520	self._get_title = None
	521	for name in self.LAZY_ATTRS:
	522	try:
	523	delattr(self, name)
	524	except AttributeError:
	525	pass
	526
	527	def __getattr__(self, name):
	528	if name not in self.LAZY_ATTRS:
	529	return getattr(self.__class__, name)
	530
	531	if name == "encoding":
	532	self.encoding = self._encoding_finder.encoding(
	533	copy.copy(self._response))
	534	return self.encoding
	535	elif name == "is_html":
	536	self.is_html = self._response_type_finder.is_html(
	537	copy.copy(self._response), self.encoding)
	538	return self.is_html
	539	elif name == "title":
	540	if self.is_html:
	541	self.title = self._title_factory.title()
	542	else:
	543	self.title = None
	544	return self.title
	545	elif name == "global_form":
	546	self.forms()
	547	return self.global_form
	548
	549	def forms(self):
	550	"""Return iterable over ClientForm.HTMLForm-like objects.
	551
	552	Raises mechanize.ParseError on failure.
	553	"""
	554	# this implementation sets .global_form as a side-effect, for benefit
	555	# of __getattr__ impl
	556	if self._forms_genf is None:
	557	try:
	558	self._forms_genf = CachingGeneratorFunction(
	559	self._forms_factory.forms())
	560	except: # XXXX define exception!
	561	self.set_response(self._response)
	562	raise
	563	self.global_form = getattr(
	564	self._forms_factory, "global_form", None)
	565	return self._forms_genf()
	566
	567	def links(self):
	568	"""Return iterable over mechanize.Link-like objects.
	569
	570	Raises mechanize.ParseError on failure.
	571	"""
	572	if self._links_genf is None:
	573	try:
	574	self._links_genf = CachingGeneratorFunction(
	575	self._links_factory.links())
	576	except: # XXXX define exception!
	577	self.set_response(self._response)
	578	raise
	579	return self._links_genf()
	580
	581	class DefaultFactory(Factory):
	582	"""Based on sgmllib."""
	583	def __init__(self, i_want_broken_xhtml_support=False):
	584	Factory.__init__(
	585	self,
	586	forms_factory=FormsFactory(),
	587	links_factory=LinksFactory(),
	588	title_factory=TitleFactory(),
	589	response_type_finder=ResponseTypeFinder(
	590	allow_xhtml=i_want_broken_xhtml_support),
	591	)
	592
	593	def set_response(self, response):
	594	Factory.set_response(self, response)
	595	if response is not None:
	596	self._forms_factory.set_response(
	597	copy.copy(response), self.encoding)
	598	self._links_factory.set_response(
	599	copy.copy(response), response.geturl(), self.encoding)
	600	self._title_factory.set_response(
	601	copy.copy(response), self.encoding)
	602
	603	class RobustFactory(Factory):
	604	"""Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
	605	DefaultFactory.
	606
	607	"""
	608	def __init__(self, i_want_broken_xhtml_support=False,
	609	soup_class=None):
	610	Factory.__init__(
	611	self,
	612	forms_factory=RobustFormsFactory(),
	613	links_factory=RobustLinksFactory(),
	614	title_factory=RobustTitleFactory(),
	615	response_type_finder=ResponseTypeFinder(
	616	allow_xhtml=i_want_broken_xhtml_support),
	617	)
	618	if soup_class is None:
	619	soup_class = MechanizeBs
	620	self._soup_class = soup_class
	621
	622	def set_response(self, response):
	623	Factory.set_response(self, response)
	624	if response is not None:
	625	data = response.read()
	626	soup = self._soup_class(self.encoding, data)
	627	self._forms_factory.set_response(
	628	copy.copy(response), self.encoding)
	629	self._links_factory.set_soup(
	630	soup, response.geturl(), self.encoding)
	631	self._title_factory.set_soup(soup, self.encoding)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: