Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: py-scraping/mechanize/_headersutil.py@ 201

Last change on this file since 201 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 7.9 KB

Line
1	"""Utility functions for HTTP header value parsing and construction.
2
3	Copyright 1997-1998, Gisle Aas
4	Copyright 2002-2006, John J. Lee
5
6	This code is free software; you can redistribute it and/or modify it
7	under the terms of the BSD or ZPL 2.1 licenses (see the file
8	COPYING.txt included with the distribution).
9
10	"""
11
12	import os, re
13	from types import StringType
14	from types import UnicodeType
15	STRING_TYPES = StringType, UnicodeType
16
17	from _util import http2time
18	import _rfc3986
19
def is_html(ct_headers, url, allow_xhtml=False):
    """Return True if a response should be treated as (X)HTML.

    ct_headers: Sequence of Content-Type headers
    url: Response URL
    allow_xhtml: If true, XHTML/XML media types and the ".xhtml" file
        extension are accepted as well.

    The first Content-Type header decides.  When there is no header, or the
    headers contain nothing parseable (e.g. a blank header value), the
    decision is guessed from the file extension of the URL path.

    """
    def guess_from_extension():
        # Component 2 of the split URL is the path.
        ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
        html_exts = [".htm", ".html"]
        if allow_xhtml:
            html_exts += [".xhtml"]
        return ext in html_exts

    if not ct_headers:
        return guess_from_extension()
    headers = split_header_words(ct_headers)
    if not headers:
        # Headers were supplied but held no tokens at all; indexing the
        # empty result would raise IndexError, so guess instead.
        return guess_from_extension()
    # use first header; media types are case-insensitive in HTTP
    ct = headers[0][0][0].lower()
    html_types = ["text/html"]
    if allow_xhtml:
        html_types += [
            "text/xhtml", "text/xml",
            "application/xml", "application/xhtml+xml",
            ]
    return ct in html_types
42
def unmatched(match):
    """Return the searched string with the text matched by *match* removed."""
    subject = match.string
    begin = match.start(0)
    finish = match.end(0)
    # Everything before the match joined to everything after it.
    return subject[:begin] + subject[finish:]
47
# A token: leading whitespace, then anything up to "=", whitespace, ";", ",".
token_re = re.compile(r"^\s*([^=\s;,]+)")
# '=' followed by a double-quoted string that may contain backslash escapes.
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# '=' followed by an unquoted value (possibly empty).
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash-escaped character inside a quoted string.
escape_re = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    header_values must be a sequence of strings, not a single string.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character; reject it.
    assert type(header_values) not in STRING_TYPES
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = token_re.search(text)
            if m:
                # Consume the token, then look for an optional "=value".
                text = unmatched(m)
                name = m.group(1)
                m = quoted_value_re.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # drop the backslash from each quoted-pair
                    value = escape_re.sub(r"\1", value)
                else:
                    m = value_re.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                # Guards against an infinite loop: every pass must consume
                # at least one character.
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs:
            result.append(pairs)
    return result
136
join_escape_re = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build a single header value from lists of (key, value) pairs.

    This is the inverse of split_header_words: the pairs of each inner list
    are joined with "; " and the resulting headers are joined with ", ".
    A value that is not made up purely of word characters is wrapped in
    double quotes, with any embedded double quote or backslash escaped.
    Inner lists without pairs contribute nothing to the result.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # A lone token has no value and is emitted as-is.
        if value is None:
            return key
        if re.search(r"^\w+$", value) is None:
            # escape " and \ before quoting
            value = '"%s"' % join_escape_re.sub(r"\\\1", value)
        if key is None:
            # Netscape cookies may have no name
            return value
        return "%s=%s" % (key, value)

    joined = []
    for pairs in lists:
        rendered = [render(key, value) for key, value in pairs]
        if rendered:
            joined.append("; ".join(rendered))
    return ", ".join(joined)
165
def strip_quotes(text):
    """Remove one leading and one trailing double quote from text, if present.

    The trailing quote is looked for after the leading one has been removed,
    so a string consisting of a single double quote becomes empty.

    """
    without_lead = text[1:] if text.startswith('"') else text
    if without_lead.endswith('"'):
        return without_lead[:-1]
    return without_lead
172
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    ns_headers: Sequence of Set-Cookie header values.

    Returns a list with one entry per non-empty header; each entry is a list
    of (name, value) pairs.  The value is None for an attribute given without
    "=".  Names of known attributes (other than the first pair, which is the
    cookie itself) are lowercased, an "expires" value is replaced by the
    result of http2time, and a ("version", "0") pair is appended when the
    header carried no version attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                k, v = param, None
            else:
                # Split on the first "=" only, tolerating any amount of
                # whitespace (including none) on either side of it.
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            if ii != 0:
                # The first pair is the cookie name/value; only later pairs
                # are cookie-attributes.
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
225
226
def _test():
    """Run the doctests of the _headersutil module and return the result."""
    import doctest
    import _headersutil
    outcome = doctest.testmod(_headersutil)
    return outcome

if __name__ == "__main__":
    _test()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: