1 | """RFC 3986 URI parsing and relative reference resolution / absolutization.
|
---|
2 |
|
---|
3 | (aka splitting and joining)
|
---|
4 |
|
---|
5 | Copyright 2006 John J. Lee <jjl@pobox.com>
|
---|
6 |
|
---|
7 | This code is free software; you can redistribute it and/or modify it under
|
---|
8 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
---|
9 | included with the distribution).
|
---|
10 |
|
---|
11 | """
|
---|
12 |
|
---|
13 | # XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
|
---|
14 |
|
---|
15 | import re, urllib
|
---|
16 |
|
---|
17 | ## def chr_range(a, b):
|
---|
18 | ## return "".join(map(chr, range(ord(a), ord(b)+1)))
|
---|
19 |
|
---|
20 | ## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
---|
21 | ## "abcdefghijklmnopqrstuvwxyz"
|
---|
22 | ## "0123456789"
|
---|
23 | ## "-_.~")
|
---|
24 | ## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
|
---|
25 | ## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
|
---|
26 | # this re matches any character that's not in URI_CHARS
|
---|
# Written as a raw string so that the regex escapes \- and \] reach the re
# module untouched instead of being parsed as (invalid) string escapes.
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
|
---|
28 |
|
---|
29 |
|
---|
def clean_url(url, encoding):
    """Return url with characters that are illegal in a URI percent-encoded.

    url may be a bytestring or a unicode string.  A bytestring is first
    decoded using encoding, with undecodable bytes replaced rather than
    raising.  Leading and trailing whitespace is then stripped, the text is
    encoded back to bytes using encoding, and every byte that is not a legal
    URI character is percent-encoded.  "%" itself is treated as legal, so
    escapes already present in url are not escaped a second time.

    Raises UnicodeEncodeError if the stripped text cannot be represented in
    encoding (the encode step uses strict error handling).
    """
    # Trying to come up with test cases for this gave me a headache, revisit
    # when we do switch to unicode.
    # Somebody else's comments (lost the attribution):
    ## - IE will return you the url in the encoding you send it
    ## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    ## characters in your link. It will send you utf-8 however if there are...

    # quote() lives in urllib.parse on Python 3 and in urllib on Python 2;
    # importing it locally keeps this function usable on both.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    # bytes is an alias of str on Python 2.6+, so this is the same check the
    # old type(url) == type("") performed there, but it also honours
    # subclasses and does the right thing for Python 3 bytes objects.
    if isinstance(url, bytes):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # for second param to quote(), we want URI_CHARS, minus the
    # 'always_safe' characters that quote() never percent-encodes
    return quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
|
---|
44 |
|
---|
def is_clean_uri(uri):
    """Return True if no character in uri is matched by BAD_URI_CHARS_RE.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # note module re treats bytestrings as though they were decoded as
    # latin-1, so this function accepts both unicode and bytestrings
    return BAD_URI_CHARS_RE.search(uri) is None
|
---|
63 |
|
---|
64 |
|
---|
# Bound .match of the URI-splitting regex.  Every component is optional, and
# the numbered groups used by urlsplit() are: 2 scheme, 4 authority, 5 path,
# 7 query, 9 fragment.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match


def urlsplit(absolute_uri):
    """Return scheme, authority, path, query, fragment.

    The result is a 5-tuple.  A component that is absent from absolute_uri
    is None, except path, which is "" when absent.  None is returned
    instead of a tuple if SPLIT_MATCH does not match.
    """
    parsed = SPLIT_MATCH(absolute_uri)
    if not parsed:
        return None
    # Pick out only the inner groups, i.e. the components without their
    # delimiters (":", "//", "?", "#").
    return parsed.group(2, 4, 5, 7, 9)
|
---|
73 |
|
---|
def urlunsplit(parts):
    """Join a (scheme, authority, path, query, fragment) 5-tuple into a URI.

    A component that is None is left out together with its delimiter; a
    component that is "" is kept, so its delimiter still appears.  path is
    always emitted and so must be a string.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces += [scheme, ":"]
    if authority is not None:
        pieces += ["//", authority]
    pieces.append(path)
    if query is not None:
        pieces += ["?", query]
    if fragment is not None:
        pieces += ["#", fragment]
    return "".join(pieces)
|
---|
92 |
|
---|
def urljoin(base_uri, uri_reference):
    """Resolve uri_reference against base_uri and return the result as a string.

    Both arguments are split with urlsplit(), combined component-wise by
    urljoin_parts(), and reassembled with urlunsplit().
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    return urlunsplit(urljoin_parts(base_parts, reference_parts))
|
---|
96 |
|
---|
97 | # oops, this doesn't do the same thing as the literal translation
|
---|
98 | # from the RFC below
|
---|
99 | ## import posixpath
|
---|
100 | ## def urljoin_parts(base_parts, reference_parts):
|
---|
101 | ## scheme, authority, path, query, fragment = base_parts
|
---|
102 | ## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
|
---|
103 |
|
---|
104 | ## # compute target URI path
|
---|
105 | ## if rpath == "":
|
---|
106 | ## tpath = path
|
---|
107 | ## else:
|
---|
108 | ## tpath = rpath
|
---|
109 | ## if not tpath.startswith("/"):
|
---|
110 | ## tpath = merge(authority, path, tpath)
|
---|
111 | ## tpath = posixpath.normpath(tpath)
|
---|
112 |
|
---|
113 | ## if rscheme is not None:
|
---|
114 | ## return (rscheme, rauthority, tpath, rquery, rfragment)
|
---|
115 | ## elif rauthority is not None:
|
---|
116 | ## return (scheme, rauthority, tpath, rquery, rfragment)
|
---|
117 | ## elif rpath == "":
|
---|
118 | ## if rquery is not None:
|
---|
119 | ## tquery = rquery
|
---|
120 | ## else:
|
---|
121 | ## tquery = query
|
---|
122 | ## return (scheme, authority, tpath, tquery, rfragment)
|
---|
123 | ## else:
|
---|
124 | ## return (scheme, authority, tpath, rquery, rfragment)
|
---|
125 |
|
---|
def urljoin_parts(base_parts, reference_parts):
    """Resolve a split URI reference against a split base URI.

    Both arguments are (scheme, authority, path, query, fragment) 5-tuples
    as returned by urlsplit(); the result is a 5-tuple of the same shape.
    The fragment of the result always comes from the reference; the base
    fragment is never used.
    """
    scheme, authority, path, query, _base_fragment = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    # A reference scheme equal to the base scheme is treated as if the
    # reference had no scheme at all, so only a *different* scheme makes
    # the reference stand on its own.
    if rscheme is not None and rscheme != scheme:
        return (rscheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Reference carries its own authority: only the scheme is inherited.
    if rauthority is not None:
        return (scheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Empty reference path: keep the base path as is, and fall back to the
    # base query only when the reference has no query of its own.
    if rpath == "":
        if rquery is not None:
            tquery = rquery
        else:
            tquery = query
        return (scheme, authority, path, tquery, rfragment)

    # Non-empty reference path: absolute paths are used directly, relative
    # ones are first merged onto the base path.
    if rpath.startswith("/"):
        tpath = rpath
    else:
        tpath = merge(authority, path, rpath)
    return (scheme, authority, remove_dot_segments(tpath), rquery, rfragment)
|
---|
158 |
|
---|
159 | # um, something *vaguely* like this is what I want, but I have to generate
|
---|
160 | # lots of test cases first, if only to understand what it is that
|
---|
161 | # remove_dot_segments really does...
|
---|
162 | ## def remove_dot_segments(path):
|
---|
163 | ## if path == '':
|
---|
164 | ## return ''
|
---|
165 | ## comps = path.split('/')
|
---|
166 | ## new_comps = []
|
---|
167 | ## for comp in comps:
|
---|
168 | ## if comp in ['.', '']:
|
---|
169 | ## if not new_comps or new_comps[-1]:
|
---|
170 | ## new_comps.append('')
|
---|
171 | ## continue
|
---|
172 | ## if comp != '..':
|
---|
173 | ## new_comps.append(comp)
|
---|
174 | ## elif new_comps:
|
---|
175 | ## new_comps.pop()
|
---|
176 | ## return '/'.join(new_comps)
|
---|
177 |
|
---|
178 |
|
---|
def remove_dot_segments(path):
    """Return path with its "." and ".." segments interpreted and removed.

    The input is consumed from the left one step at a time (steps A-E
    below) while completed segments accumulate in an output list; a ".."
    segment discards the most recently completed output segment, if any.
    A false path (such as "") yields "".
    """
    output = []
    remaining = path
    while remaining:
        if remaining.startswith("../"):
            # A: drop a leading "../"
            remaining = remaining[3:]
        elif remaining.startswith("./"):
            # A: drop a leading "./"
            remaining = remaining[2:]
        elif remaining.startswith("/./"):
            # B: "/./" collapses to "/"
            remaining = remaining[2:]
        elif remaining == "/.":
            # B: a final "/." collapses to "/"
            remaining = "/"
        elif remaining.startswith("/../"):
            # C: "/../" collapses to "/" and backs out one output segment
            remaining = remaining[3:]
            if output:
                output.pop()
        elif remaining == "/..":
            # C: a final "/.." collapses to "/" and backs out one segment
            remaining = "/"
            if output:
                output.pop()
        elif remaining == "." or remaining == "..":
            # D: nothing but a dot segment is left; discard it
            remaining = ""
        else:
            # E: move the first segment, including its leading "/" if it has
            # one, to the output.  Searching from index 1 skips a leading
            # "/"; when there is none, index 0 is not a "/" anyway, so the
            # result is the same as searching from index 0.
            cut = remaining.find("/", 1)
            if cut < 0:
                output.append(remaining)
                break
            output.append(remaining[:cut])
            remaining = remaining[cut:]
    return "".join(output)
|
---|
226 |
|
---|
def merge(base_authority, base_path, ref_path):
    """Merge a relative reference path onto a base path.

    An empty base_path gives "/" + ref_path.  Otherwise ref_path replaces
    everything after the last "/" of base_path, or the whole of base_path
    if it contains no "/".  base_authority is accepted but not used.
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
    #if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    last_slash = base_path.rfind("/")
    if last_slash < 0:
        # no directory part to keep
        return ref_path
    return base_path[:last_slash + 1] + ref_path
|
---|
238 |
|
---|
if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings.
    from doctest import testmod
    testmod()
|
---|