source: py-scraping/mechanize/_html.py@ 177

Last change on this file since 177 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 20.8 KB
Line 
1"""HTML handling.
2
3Copyright 2003-2006 John J. Lee <jjl@pobox.com>
4
5This code is free software; you can redistribute it and/or modify it under
6the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
7included with the distribution).
8
9"""
10
11import re, copy, htmlentitydefs
12import sgmllib, ClientForm
13
14import _request
15from _headersutil import split_header_words, is_html as _is_html
16import _rfc3986
17
18DEFAULT_ENCODING = "latin-1"
19
20COMPRESS_RE = re.compile(r"\s+")
21
22
# The base class is purely for backwards compatibility: code written against
# ClientForm's exception still catches mechanize parse failures.
class ParseError(ClientForm.ParseError): pass
25
26
27class CachingGeneratorFunction(object):
28 """Caching wrapper around a no-arguments iterable."""
29
30 def __init__(self, iterable):
31 self._cache = []
32 # wrap iterable to make it non-restartable (otherwise, repeated
33 # __call__ would give incorrect results)
34 self._iterator = iter(iterable)
35
36 def __call__(self):
37 cache = self._cache
38 for item in cache:
39 yield item
40 for item in self._iterator:
41 cache.append(item)
42 yield item
43
44
class EncodingFinder:
    """Determines the character encoding of a response from its headers,
    falling back to a fixed default."""

    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for content_type in response.info().getheaders("content-type"):
            for param_name, param_value in split_header_words([content_type])[0]:
                if param_name == "charset":
                    return param_value
        return self._default_encoding
57
class ResponseTypeFinder:
    """Decides whether a response should be treated as an HTML document."""

    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        # XXX encoding is accepted but currently unused
        content_types = response.info().getheaders("content-type")
        url = response.geturl()
        return _is_html(content_types, url, self._allow_xhtml)
66
67
# idea for this argument-processing trick is from Peter Otten
class Args:
    """Expose a mapping's keys as attributes.

    Missing keys fall through to class-attribute lookup, so absent names
    raise AttributeError with standard semantics.
    """
    def __init__(self, args_map):
        self.dictionary = dict(args_map)

    def __getattr__(self, key):
        # only reached when normal lookup fails; .dictionary itself is an
        # instance attribute and so never recurses into here
        dictionary = self.dictionary
        if key in dictionary:
            return dictionary[key]
        return getattr(self.__class__, key)
77
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle FormsFactory constructor arguments into an Args object."""
    bundle = {
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        }
    return Args(bundle)
85
86
class Link:
    """A link extracted from a document.

    Stores the raw URL plus an absolute URL resolved against base_url.
    Comparison considers only url/text/tag/attrs (not base_url).
    """
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        # equality-only ordering: 0 when all compared fields match, -1
        # otherwise (including comparison against foreign objects)
        try:
            mine = [self.url, self.text, self.tag, self.attrs]
            theirs = [other.url, other.text, other.tag, other.attrs]
        except AttributeError:
            return -1
        if mine == theirs:
            return 0
        return -1

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
104
105
106class LinksFactory:
107
108 def __init__(self,
109 link_parser_class=None,
110 link_class=Link,
111 urltags=None,
112 ):
113 import _pullparser
114 if link_parser_class is None:
115 link_parser_class = _pullparser.TolerantPullParser
116 self.link_parser_class = link_parser_class
117 self.link_class = link_class
118 if urltags is None:
119 urltags = {
120 "a": "href",
121 "area": "href",
122 "frame": "src",
123 "iframe": "src",
124 }
125 self.urltags = urltags
126 self._response = None
127 self._encoding = None
128
129 def set_response(self, response, base_url, encoding):
130 self._response = response
131 self._encoding = encoding
132 self._base_url = base_url
133
134 def links(self):
135 """Return an iterator that provides links of the document."""
136 response = self._response
137 encoding = self._encoding
138 base_url = self._base_url
139 p = self.link_parser_class(response, encoding=encoding)
140
141 try:
142 for token in p.tags(*(self.urltags.keys() + ["base"])):
143 if token.type == "endtag":
144 continue
145 if token.data == "base":
146 base_href = dict(token.attrs).get("href")
147 if base_href is not None:
148 base_url = base_href
149 continue
150 attrs = dict(token.attrs)
151 tag = token.data
152 name = attrs.get("name")
153 text = None
154 # XXX use attr_encoding for ref'd doc if that doc does not
155 # provide one by other means
156 #attr_encoding = attrs.get("charset")
157 url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
158 if not url:
159 # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
160 # For our purposes a link is something with a URL, so
161 # ignore this.
162 continue
163
164 url = _rfc3986.clean_url(url, encoding)
165 if tag == "a":
166 if token.type != "startendtag":
167 # hmm, this'd break if end tag is missing
168 text = p.get_compressed_text(("endtag", tag))
169 # but this doesn't work for eg.
170 # <a href="blah"><b>Andy</b></a>
171 #text = p.get_compressed_text()
172
173 yield Link(base_url, url, text, tag, token.attrs)
174 except sgmllib.SGMLParseError, exc:
175 raise ParseError(exc)
176
class FormsFactory:

    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        self.select_default = select_default
        # default to ClientForm's sgmllib-based parser
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        self.form_parser_class = form_parser_class
        # forms .click() this class to build requests
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        # forget any previously computed global form; the next .forms()
        # call recomputes it for the new response
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        """Parse and return the response's forms.

        Sets .global_form as a side effect: ParseResponseEx's first result
        is the form holding controls outside any FORM element; it is
        stripped from the returned sequence and stored on .global_form.
        Raises ParseError (wrapping ClientForm.ParseError) on failure.
        """
        import ClientForm
        encoding = self.encoding
        try:
            forms = ClientForm.ParseResponseEx(
                self._response,
                select_default=self.select_default,
                form_parser_class=self.form_parser_class,
                request_class=self.request_class,
                encoding=encoding,
                _urljoin=_rfc3986.urljoin,
                _urlparse=_rfc3986.urlsplit,
                _urlunparse=_rfc3986.urlunsplit,
                )
        except ClientForm.ParseError, exc:
            raise ParseError(exc)
        self.global_form = forms[0]
        return forms[1:]
231
class TitleFactory:
    # Extracts the document title from a response using _pullparser.

    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        """Collect the text of the title element the parser is inside.

        Reads tokens up to the closing </title> (or end of document).
        Entity and character references are unescaped with the parser's
        encoding; any nested tags are kept verbatim via str(token).
        Runs of whitespace in the result are collapsed to single spaces.
        """
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                # unclosed <title>: return whatever was gathered
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                # non-title tags inside <title> are kept as raw markup
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        """Return the page title, or None if there is no <title> element.

        Raises ParseError (wrapping sgmllib.SGMLParseError) on bad HTML.
        """
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise ParseError(exc)
278
279
def unescape(data, entities, encoding):
    """Replace HTML entity and numeric character references in data.

    entities maps entity names to integer code points (e.g.
    htmlentitydefs.name2codepoint).  References that are unknown, or whose
    replacement cannot be encoded to *encoding*, are left untouched.
    Returns data unchanged when it is None or contains no "&".
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            # numeric character reference, e.g. &#65; or &#x41;
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            # unichr gives unicode; try to encode down to a byte string in
            # the target encoding, keeping the raw reference on failure
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            # unknown entity name: leave the reference as-is
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
302
def unescape_charref(data, encoding):
    """Resolve the body of a numeric character reference, e.g. "65" or "x41".

    Returns the character encoded to *encoding*, or as unicode when
    encoding is None; if encoding fails, the original "&#...;" reference
    is returned instead.
    """
    if data.startswith("x"):
        code_point = int(data[1:], 16)
    else:
        code_point = int(data, 10)
    uc = unichr(code_point)
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
316
317
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
# Build the BeautifulSoup-backed form parser classes; RobustFormParser is
# the default form_parser_class of RobustFormsFactory below.
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# (the replacement charref regexp also accepts hexadecimal references
# such as &#x41;)
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
326
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that unescapes entity/character references
    using a caller-supplied document encoding."""

    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [
        (re.compile('(<[^<>]*)/>'),
         lambda match: match.group(1) + ' />'),
        (re.compile('<!\s+([^<>]*)>'),
         lambda match: '<!' + match.group(1) + '>'),
        ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        # numeric reference -> data, decoded per the document encoding
        self.handle_data(
            unescape("&#%s;" % ref, self._entitydefs, self._encoding))

    def handle_entityref(self, ref):
        # named reference -> data, decoded per the document encoding
        self.handle_data(
            unescape("&%s;" % ref, self._entitydefs, self._encoding))

    def unescape_attrs(self, attrs):
        # return (key, value) pairs with entity references resolved
        unescaped = []
        for key, val in attrs:
            unescaped.append(
                (key, unescape(val, self._entitydefs, self._encoding)))
        return unescaped
354
class RobustLinksFactory:
    """Extracts links from a BeautifulSoup parse tree (robust counterpart
    of LinksFactory).

    link_parser_class: soup class (default MechanizeBs)
    link_class: class instantiated for each extracted link (default Link)
    urltags: mapping of tag name -> attribute holding the URL
    """

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Yield link objects; a <base href=...> tag updates the base URL
        for links that follow it."""
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # hoisted: was rebuilt for every child node; also removed an unused
        # duplicate recursiveChildGenerator() call
        wanted_tags = self.urltags.keys() + ["base"]
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in wanted_tags):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    # a link without a URL is not a link for our purposes
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                # bug fix: honour the link_class constructor argument
                # (Link was previously hard-coded here)
                yield self.link_class(base_url, url, text, link.name, attrs)
417
418
class RobustFormsFactory(FormsFactory):
    # FormsFactory variant whose default parser is the BeautifulSoup-based
    # RobustFormParser, for tolerance of bad HTML.

    def __init__(self, *args, **kwds):
        # normalise positional/keyword arguments through the Args trick so
        # the default form_parser_class can be filled in before delegating
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        # NOTE(review): unlike FormsFactory.set_response, this does not
        # reset .global_form -- confirm whether that is intentional
        self._response = response
        self.encoding = encoding
429
430
class RobustTitleFactory:
    # Extracts the document title from a BeautifulSoup parse tree.

    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        """Return the title (inner markup of the first <title> element,
        whitespace-collapsed), or None if there is no <title>."""
        import _beautifulsoup
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            return None
        else:
            # join child nodes verbatim, so nested markup is kept as-is
            inner_html = "".join([str(node) for node in title.contents])
            return COMPRESS_RE.sub(" ", inner_html.strip())
447
448
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    # Attributes computed lazily by __getattr__ on first access and cached
    # as instance attributes until the next set_response().
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
        determined (or guessed) from the response.  You should turn on
        HTTP-EQUIV handling if you want the best chance of getting this right
        without resorting to this default.  The default value of this
        parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        # NOTE(review): _get_title appears unused elsewhere in this file
        self._get_title = None
        # invalidate cached lazy attributes so they get recomputed for the
        # new response on next access (falling back into __getattr__)
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        # Reached only when normal lookup fails, i.e. on the first access
        # of each lazy attribute after set_response().  The computed value
        # is stored on the instance, so later accesses bypass this method.
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            # copy.copy so the finder can consume the response without
            # disturbing self._response -- presumably the copies don't
            # share read position; TODO confirm
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            # .forms() sets self.global_form as a side effect
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                # reset state so a retry after failure starts clean
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                # reset state so a retry after failure starts clean
                self.set_response(self._response)
                raise
        return self._links_genf()
580
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # each sub-factory gets its own shallow copy of the response,
        # presumably so each can read the body independently -- TODO confirm
        encoding = self.encoding
        self._forms_factory.set_response(copy.copy(response), encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), encoding)
        self._title_factory.set_response(copy.copy(response), encoding)
602
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # parse the whole body once into a soup shared by the links and
        # title factories; the forms factory re-reads its own copy
        body = response.read()
        soup = self._soup_class(self.encoding, body)
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_soup(
            soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
Note: See TracBrowser for help on using the repository browser.