Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: py-scraping/mechanize/_headersutil.py@ 137

Last change on this file since 137 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 7.9 KB

Rev	Line
[106]	1	"""Utility functions for HTTP header value parsing and construction.
	2
	3	Copyright 1997-1998, Gisle Aas
	4	Copyright 2002-2006, John J. Lee
	5
	6	This code is free software; you can redistribute it and/or modify it
	7	under the terms of the BSD or ZPL 2.1 licenses (see the file
	8	COPYING.txt included with the distribution).
	9
	10	"""
	11
	12	import os, re
	13	from types import StringType
	14	from types import UnicodeType
	15	STRING_TYPES = StringType, UnicodeType
	16
	17	from _util import http2time
	18	import _rfc3986
	19
	20	def is_html(ct_headers, url, allow_xhtml=False):
	21	"""
	22	ct_headers: Sequence of Content-Type headers
	23	url: Response URL
	24
	25	"""
	26	if not ct_headers:
	27	# guess
	28	ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
	29	html_exts = [".htm", ".html"]
	30	if allow_xhtml:
	31	html_exts += [".xhtml"]
	32	return ext in html_exts
	33	# use first header
	34	ct = split_header_words(ct_headers)[0][0][0]
	35	html_types = ["text/html"]
	36	if allow_xhtml:
	37	html_types += [
	38	"text/xhtml", "text/xml",
	39	"application/xml", "application/xhtml+xml",
	40	]
	41	return ct in html_types
	42
	43	def unmatched(match):
	44	"""Return unmatched part of re.Match object."""
	45	start, end = match.span(0)
	46	return match.string[:start] + match.string[end:]
	47
	48	token_re = re.compile(r"^\s*([^=\s;,]+)")
	49	quoted_value_re = re.compile(r"^\s=\s\"([^\"\\](?:\\.[^\"\\])*)\"")
	50	value_re = re.compile(r"^\s=\s([^\s;,]*)")
	51	escape_re = re.compile(r"\\(.)")
	52	def split_header_words(header_values):
	53	r"""Parse header values into a list of lists containing key,value pairs.
	54
	55	The function knows how to deal with ",", ";" and "=" as well as quoted
	56	values after "=". A list of space separated tokens are parsed as if they
	57	were separated by ";".
	58
	59	If the header_values passed as argument contains multiple values, then they
	60	are treated as if they were a single value separated by comma ",".
	61
	62	This means that this function is useful for parsing header fields that
	63	follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
	64	the requirement for tokens).
	65
	66	headers = #header
	67	header = (token \| parameter) *( [";"] (token \| parameter))
	68
	69	token = 1*<any CHAR except CTLs or separators>
	70	separators = "(" \| ")" \| "<" \| ">" \| "@"
	71	\| "," \| ";" \| ":" \| "\" \| <">
	72	\| "/" \| "[" \| "]" \| "?" \| "="
	73	\| "{" \| "}" \| SP \| HT
	74
	75	quoted-string = ( <"> *(qdtext \| quoted-pair ) <"> )
	76	qdtext = <any TEXT except <">>
	77	quoted-pair = "\" CHAR
	78
	79	parameter = attribute "=" value
	80	attribute = token
	81	value = token \| quoted-string
	82
	83	Each header is represented by a list of key/value pairs. The value for a
	84	simple token (not part of a parameter) is None. Syntactically incorrect
	85	headers will not necessarily be parsed as you would want.
	86
	87	This is easier to describe with some examples:
	88
	89	>>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
	90	[[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
	91	>>> split_header_words(['text/html; charset="iso-8859-1"'])
	92	[[('text/html', None), ('charset', 'iso-8859-1')]]
	93	>>> split_header_words([r'Basic realm="\"foo\bar\""'])
	94	[[('Basic', None), ('realm', '"foobar"')]]
	95
	96	"""
	97	assert type(header_values) not in STRING_TYPES
	98	result = []
	99	for text in header_values:
	100	orig_text = text
	101	pairs = []
	102	while text:
	103	m = token_re.search(text)
	104	if m:
	105	text = unmatched(m)
	106	name = m.group(1)
	107	m = quoted_value_re.search(text)
	108	if m: # quoted value
	109	text = unmatched(m)
	110	value = m.group(1)
	111	value = escape_re.sub(r"\1", value)
	112	else:
	113	m = value_re.search(text)
	114	if m: # unquoted value
	115	text = unmatched(m)
	116	value = m.group(1)
	117	value = value.rstrip()
	118	else:
	119	# no value, a lone token
	120	value = None
	121	pairs.append((name, value))
	122	elif text.lstrip().startswith(","):
	123	# concatenated headers, as per RFC 2616 section 4.2
	124	text = text.lstrip()[1:]
	125	if pairs: result.append(pairs)
	126	pairs = []
	127	else:
	128	# skip junk
	129	non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
	130	assert nr_junk_chars > 0, (
	131	"split_header_words bug: '%s', '%s', %s" %
	132	(orig_text, text, pairs))
	133	text = non_junk
	134	if pairs: result.append(pairs)
	135	return result
	136
	137	join_escape_re = re.compile(r"([\"\\])")
	138	def join_header_words(lists):
	139	"""Do the inverse of the conversion done by split_header_words.
	140
	141	Takes a list of lists of (key, value) pairs and produces a single header
	142	value. Attribute values are quoted if needed.
	143
	144	>>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
	145	'text/plain; charset="iso-8859/1"'
	146	>>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
	147	'text/plain, charset="iso-8859/1"'
	148
	149	"""
	150	headers = []
	151	for pairs in lists:
	152	attr = []
	153	for k, v in pairs:
	154	if v is not None:
	155	if not re.search(r"^\w+$", v):
	156	v = join_escape_re.sub(r"\\\1", v) # escape " and \
	157	v = '"%s"' % v
	158	if k is None: # Netscape cookies may have no name
	159	k = v
	160	else:
	161	k = "%s=%s" % (k, v)
	162	attr.append(k)
	163	if attr: headers.append("; ".join(attr))
	164	return ", ".join(headers)
	165
	166	def strip_quotes(text):
	167	if text.startswith('"'):
	168	text = text[1:]
	169	if text.endswith('"'):
	170	text = text[:-1]
	171	return text
	172
	173	def parse_ns_headers(ns_headers):
	174	"""Ad-hoc parser for Netscape protocol cookie-attributes.
	175
	176	The old Netscape cookie format for Set-Cookie can for instance contain
	177	an unquoted "," in the expires field, so we have to use this ad-hoc
	178	parser instead of split_header_words.
	179
	180	XXX This may not make the best possible effort to parse all the crap
	181	that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
	182	parser is probably better, so could do worse than following that if
	183	this ever gives any trouble.
	184
	185	Currently, this is also used for parsing RFC 2109 cookies.
	186
	187	"""
	188	known_attrs = ("expires", "domain", "path", "secure",
	189	# RFC 2109 attrs (may turn up in Netscape cookies, too)
	190	"version", "port", "max-age")
	191
	192	result = []
	193	for ns_header in ns_headers:
	194	pairs = []
	195	version_set = False
	196	params = re.split(r";\s*", ns_header)
	197	for ii in range(len(params)):
	198	param = params[ii]
	199	param = param.rstrip()
	200	if param == "": continue
	201	if "=" not in param:
	202	k, v = param, None
	203	else:
	204	k, v = re.split(r"\s=\s", param, 1)
	205	k = k.lstrip()
	206	if ii != 0:
	207	lc = k.lower()
	208	if lc in known_attrs:
	209	k = lc
	210	if k == "version":
	211	# This is an RFC 2109 cookie.
	212	v = strip_quotes(v)
	213	version_set = True
	214	if k == "expires":
	215	# convert expires date to seconds since epoch
	216	v = http2time(strip_quotes(v)) # None if invalid
	217	pairs.append((k, v))
	218
	219	if pairs:
	220	if not version_set:
	221	pairs.append(("version", "0"))
	222	result.append(pairs)
	223
	224	return result
	225
	226
	227	def _test():
	228	import doctest, _headersutil
	229	return doctest.testmod(_headersutil)
	230
	231	if __name__ == "__main__":
	232	_test()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: