source: py-scraping/mechanize/_rfc3986.py@ 146

Last change on this file since 146 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 7.4 KB
RevLine 
"""RFC 3986 URI parsing and relative reference resolution / absolutization.

(aka splitting and joining)

Copyright 2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""
12
# XXX Wow, this is ugly.  Overly-direct translation of the RFC ATM.
14
15import re, urllib
16
17## def chr_range(a, b):
18## return "".join(map(chr, range(ord(a), ord(b)+1)))
19
20## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
21## "abcdefghijklmnopqrstuvwxyz"
22## "0123456789"
23## "-_.~")
24## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
25## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
# This re matches any single character that is not in URI_CHARS (the RFC 3986
# reserved and unreserved characters, plus '%').
# The pattern is a raw string so that the regex escapes (\-, \[ and \]) reach
# the re module untouched instead of being parsed as (invalid) string-literal
# escape sequences; '[' is escaped as well purely for readability.  The set of
# characters matched is unchanged.
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#\[\]]")
28
29
def clean_url(url, encoding):
    """Percent-encode the characters of *url* that are illegal in a URI.

    url: the URL, either as a bytestring or as a unicode (text) string.  A
        bytestring is first decoded using *encoding*, with undecodable bytes
        replaced rather than raising.
    encoding: name of the codec used to decode a bytestring *url* and to
        encode the stripped text before quoting.

    Returns the URL with surrounding whitespace stripped and every byte
    outside the URI character set percent-encoded.  Existing '%' characters
    are left alone, so already-escaped sequences are not escaped twice.

    Raises UnicodeEncodeError if the text cannot be represented in
    *encoding*.
    """
    # Trying to come up with test cases for this gave me a headache, revisit
    # when do switch to unicode.
    # Somebody else's comments (lost the attribution):
    #  - IE will return you the url in the encoding you send it
    #  - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    #    characters in your link.  It will send you utf-8 however if there
    #    are...

    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3;
    # import it locally so this function works on either.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    # Anything that is not already text is treated as a bytestring.  Using
    # isinstance() rather than an exact type comparison also covers
    # subclasses and behaves the same on Python 2 and Python 3.
    if not isinstance(url, type(u"")):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # for second param to quote(), we want URI_CHARS, minus the
    # 'always_safe' characters that quote() never percent-encodes
    return quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
44
def is_clean_uri(uri):
    """Return True if BAD_URI_CHARS_RE finds no illegal character in *uri*.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # note module re treats bytestrings as though they were decoded as
    # latin-1, so this function accepts both unicode and bytestrings
    return BAD_URI_CHARS_RE.search(uri) is None
63
64
# Bound match() of the URI-splitting regular expression.  Groups 2, 4, 5, 7
# and 9 capture scheme, authority, path, query and fragment respectively.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match


def urlsplit(absolute_uri):
    """Return scheme, authority, path, query, fragment.

    The result is a 5-tuple.  A component that does not appear in
    *absolute_uri* is None, except the path, which is always a string
    (possibly empty).  None is returned if the pattern fails to match.
    """
    found = SPLIT_MATCH(absolute_uri)
    if not found:
        return None
    # Pick out only the inner capture groups, skipping the ones that also
    # include the delimiters (":", "//", "?", "#").
    return found.group(2, 4, 5, 7, 9)
73
def urlunsplit(parts):
    """Join a (scheme, authority, path, query, fragment) tuple into a URI.

    A component that is None is omitted together with its delimiter; an
    empty-string component still contributes its delimiter.  The path is
    always included as given.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces.extend((scheme, ":"))
    if authority is not None:
        pieces.extend(("//", authority))
    pieces.append(path)
    if query is not None:
        pieces.extend(("?", query))
    if fragment is not None:
        pieces.extend(("#", fragment))
    return "".join(pieces)
92
def urljoin(base_uri, uri_reference):
    """Resolve *uri_reference* against *base_uri* and return the result.

    Both arguments are split with urlsplit(), combined with urljoin_parts()
    and the resulting parts are joined back together with urlunsplit().
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    return urlunsplit(urljoin_parts(base_parts, reference_parts))
96
# oops, this doesn't do the same thing as the literal translation
# from the RFC below
99## import posixpath
100## def urljoin_parts(base_parts, reference_parts):
101## scheme, authority, path, query, fragment = base_parts
102## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
103
104## # compute target URI path
105## if rpath == "":
106## tpath = path
107## else:
108## tpath = rpath
109## if not tpath.startswith("/"):
110## tpath = merge(authority, path, tpath)
111## tpath = posixpath.normpath(tpath)
112
113## if rscheme is not None:
114## return (rscheme, rauthority, tpath, rquery, rfragment)
115## elif rauthority is not None:
116## return (scheme, rauthority, tpath, rquery, rfragment)
117## elif rpath == "":
118## if rquery is not None:
119## tquery = rquery
120## else:
121## tquery = query
122## return (scheme, authority, tpath, tquery, rfragment)
123## else:
124## return (scheme, authority, tpath, rquery, rfragment)
125
def urljoin_parts(base_parts, reference_parts):
    """Resolve split reference parts against split base parts.

    Both arguments are (scheme, authority, path, query, fragment) tuples.
    Returns the target URI as a tuple of the same shape.  The fragment of
    the result always comes from the reference.
    """
    scheme, authority, path, query, fragment = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    # A reference scheme equal to the base scheme is treated as if it were
    # absent, so only a *different* scheme makes the reference stand alone.
    if rscheme is not None and rscheme != scheme:
        return (rscheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # The reference carries its own authority: keep only the base scheme.
    if rauthority is not None:
        return (scheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Empty reference path: reuse the base path as-is, and the base query
    # too unless the reference supplies one.
    if rpath == "":
        if rquery is None:
            rquery = query
        return (scheme, authority, path, rquery, rfragment)

    # A relative reference path is first merged with the base path.
    if not rpath.startswith("/"):
        rpath = merge(authority, path, rpath)
    return (scheme, authority, remove_dot_segments(rpath), rquery, rfragment)
158
# um, something *vaguely* like this is what I want, but I have to generate
# lots of test cases first, if only to understand what it is that
# remove_dot_segments really does...
162## def remove_dot_segments(path):
163## if path == '':
164## return ''
165## comps = path.split('/')
166## new_comps = []
167## for comp in comps:
168## if comp in ['.', '']:
169## if not new_comps or new_comps[-1]:
170## new_comps.append('')
171## continue
172## if comp != '..':
173## new_comps.append(comp)
174## elif new_comps:
175## new_comps.pop()
176## return '/'.join(new_comps)
177
178
def remove_dot_segments(path):
    """Return *path* with its "." and ".." segments resolved.

    The input is consumed from the left while completed segments are
    collected in an output list; the letters in the comments below are the
    step labels used by the original implementation.
    """
    output = []
    while path:
        if path.startswith("../"):
            # A: drop a leading "../" ...
            path = path[3:]
        elif path.startswith("./"):
            # A: ... or a leading "./"
            path = path[2:]
        elif path.startswith("/./"):
            # B: "/./" collapses to "/" (the trailing slash is kept)
            path = path[2:]
        elif path == "/.":
            # B: a final "/." collapses to "/"
            path = "/"
        elif path.startswith("/../"):
            # C: "/../" collapses to "/" and discards the previous segment
            path = path[3:]
            del output[-1:]  # no-op when nothing has been collected yet
        elif path == "/..":
            # C: a final "/.." does the same
            path = "/"
            del output[-1:]
        elif path in (".", ".."):
            # D: nothing but a dot segment is left; discard it
            path = ""
        else:
            # E: move the first segment (including its leading "/", if any)
            # to the output.  Searching from index 1 skips a leading slash
            # and changes nothing when the first character is not a slash.
            cut = path.find("/", 1)
            if cut < 0:
                output.append(path)
                break
            output.append(path[:cut])
            path = path[cut:]
    return "".join(output)
226
def merge(base_authority, base_path, ref_path):
    """Merge a relative reference path with the base path.

    An empty base path yields "/" + ref_path.  Otherwise everything after
    the last "/" of base_path is replaced by ref_path; a base path that
    contains no "/" at all contributes nothing.  base_authority is accepted
    but not used.
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
    #if base_authority is not None and base_path == "":
    if not base_path:
        return "/" + ref_path
    # rfind() gives -1 when there is no slash, which makes the slice empty
    # and leaves just ref_path.
    keep = base_path.rfind("/") + 1
    return base_path[:keep] + ref_path
238
if __name__ == "__main__":
    # Run the doctest examples embedded in this module's docstrings.
    from doctest import testmod
    testmod()
Note: See TracBrowser for help on using the repository browser.