source: py-scraping/mechanize/_headersutil.py@ 106

"""Utility functions for HTTP header value parsing and construction.

Copyright 1997-1998, Gisle Aas
Copyright 2002-2006, John J. Lee

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import os, re
from types import StringType
from types import UnicodeType
STRING_TYPES = StringType, UnicodeType

from _util import http2time
import _rfc3986

def is_html(ct_headers, url, allow_xhtml=False):
    """
    ct_headers: Sequence of Content-Type headers
    url: Response URL

    """
    if not ct_headers:
        # guess
        ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
        html_exts = [".htm", ".html"]
        if allow_xhtml:
            html_exts += [".xhtml"]
        return ext in html_exts
    # use first header
    ct = split_header_words(ct_headers)[0][0][0]
    html_types = ["text/html"]
    if allow_xhtml:
        html_types += [
            "text/xhtml", "text/xml",
            "application/xml", "application/xhtml+xml",
            ]
    return ct in html_types

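# Illustrative usage sketch, not part of the original module: it exercises
# is_html() above.  The URLs and header values are made-up examples; the
# expected results follow from the extension and media-type checks above.
def _example_is_html():
    # No Content-Type headers: guess from the extension of the URL path.
    assert is_html([], "http://example.com/page.html")
    assert not is_html([], "http://example.com/data.csv")
    # Headers present: only the media type of the first header is consulted.
    assert is_html(["text/html; charset=utf-8"], "http://example.com/page")
    # XHTML media types count only when allow_xhtml is requested.
    assert not is_html(["application/xhtml+xml"], "http://example.com/page")
    assert is_html(["application/xhtml+xml"], "http://example.com/page",
                   allow_xhtml=True)
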
def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start] + match.string[end:]

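# Tiny illustration, not part of the original module: unmatched() returns the
# input text with the matched span cut out, which is how the parser below
# consumes header text piece by piece.
def _example_unmatched():
    m = re.search(r"=\s*", "foo= bar")
    assert unmatched(m) == "foobar"
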
token_re = re.compile(r"^\s*([^=\s;,]+)")
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
escape_re = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens is parsed as if it
    were separated by ";".

    If the header_values passed as argument contains multiple values, then
    they are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert type(header_values) not in STRING_TYPES
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = token_re.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = quoted_value_re.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    value = escape_re.sub(r"\1", value)
                else:
                    m = value_re.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result

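# Illustrative usage sketch, not part of the original module.  It demonstrates
# the multiple-values case described in the docstring (each element of the
# argument list yields at least one pair list) and the comma rule from RFC
# 2616.  The header values are made-up examples.
def _example_split_header_words():
    assert split_header_words(['text/plain', 'text/html; charset="utf-8"']) == [
        [("text/plain", None)],
        [("text/html", None), ("charset", "utf-8")],
        ]
    # An unquoted comma starts a new pair list within a single header value.
    assert split_header_words(['foo=bar; path=/, baz=qux']) == [
        [("foo", "bar"), ("path", "/")],
        [("baz", "qux")],
        ]
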
join_escape_re = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not re.search(r"^\w+$", v):
                    v = join_escape_re.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                if k is None:  # Netscape cookies may have no name
                    k = v
                else:
                    k = "%s=%s" % (k, v)
            attr.append(k)
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)

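# Illustrative round trip, not part of the original module: the pairs produced
# by split_header_words are re-serialised by join_header_words, with the
# non-token charset value quoted again.
def _example_join_header_words():
    parsed = split_header_words(['text/html; charset="iso-8859-1"'])
    assert join_header_words(parsed) == 'text/html; charset="iso-8859-1"'
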
def strip_quotes(text):
    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"'):
        text = text[:-1]
    return text

def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        params = re.split(r";\s*", ns_header)
        for ii in range(len(params)):
            param = params[ii]
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    v = strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch
                    v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result


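# Illustrative usage sketch, not part of the original module.  The Set-Cookie
# value is a made-up example; note the ("version", "0") pair appended because
# the header set no version, and that an expires attribute, if present, would
# be converted by http2time to seconds since the epoch (or None if invalid).
def _example_parse_ns_headers():
    assert parse_ns_headers(['foo=bar; path=/; domain=example.com']) == [
        [("foo", "bar"), ("path", "/"), ("domain", "example.com"),
         ("version", "0")],
        ]

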
def _test():
    import doctest, _headersutil
    return doctest.testmod(_headersutil)

if __name__ == "__main__":
    _test()