[106] | 1 | """RFC 3986 URI parsing and relative reference resolution / absolutization.
|
---|
| 2 |
|
---|
| 3 | (aka splitting and joining)
|
---|
| 4 |
|
---|
| 5 | Copyright 2006 John J. Lee <jjl@pobox.com>
|
---|
| 6 |
|
---|
| 7 | This code is free software; you can redistribute it and/or modify it under
|
---|
| 8 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
---|
| 9 | included with the distribution).
|
---|
| 10 |
|
---|
| 11 | """
|
---|
| 12 |
|
---|
| 13 | # XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
|
---|
| 14 |
|
---|
| 15 | import re, urllib
|
---|
| 16 |
|
---|
| 17 | ## def chr_range(a, b):
|
---|
| 18 | ## return "".join(map(chr, range(ord(a), ord(b)+1)))
|
---|
| 19 |
|
---|
| 20 | ## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
---|
| 21 | ## "abcdefghijklmnopqrstuvwxyz"
|
---|
| 22 | ## "0123456789"
|
---|
| 23 | ## "-_.~")
|
---|
| 24 | ## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
|
---|
| 25 | ## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
|
---|
# Matches any single character that is not a legal URI character, i.e. anything
# other than ASCII letters, digits, "-_.~", the reserved characters
# "!*'();:@&=+$,/?#[]" and "%".
# A raw string is used so that "\-" and "\]" reach the regex engine exactly as
# written rather than being parsed as (invalid) string-literal escapes.
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
|
---|
| 28 |
|
---|
| 29 |
|
---|
def clean_url(url, encoding):
    """Percent-encode the characters of *url* that are illegal in a URI.

    url: the URL, either a byte string or a text (unicode) string.
    encoding: codec name.  It is used to decode a byte-string *url*
        (undecodable bytes are replaced rather than raising) and to encode
        the stripped text back to bytes before quoting.

    Returns the URL with surrounding whitespace stripped and every byte
    outside the URI character set percent-encoded, as the native ``str``
    type.  Characters that are already legal, including "%", are left as
    they are.

    Raises UnicodeEncodeError if a text *url* contains characters that
    *encoding* cannot represent.
    """
    # XXX Test cases for this are hard to come up with; revisit when the
    # code switches to unicode.
    # Somebody else's comments (attribution lost):
    ##  - IE will return you the url in the encoding you send it
    ##  - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    ##    characters in your link. It will send you utf-8 however if there
    ##    are...

    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    # On Python 2 ``bytes`` is ``str``, so this is the same check the code
    # always made, minus the exact-type comparison.
    if isinstance(url, bytes):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # The second argument to quote() is the URI character set minus the
    # characters quote() never percent-encodes anyway.
    return quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
|
---|
| 44 |
|
---|
def is_clean_uri(uri):
    """Return True if *uri* contains no character matched by BAD_URI_CHARS_RE.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # Note: module re treats bytestrings as though they were decoded as
    # latin-1, so both unicode strings and bytestrings are accepted here.
    return BAD_URI_CHARS_RE.search(uri) is None
|
---|
| 63 |
|
---|
| 64 |
|
---|
# Component-splitting regular expression (the one given in RFC 3986,
# Appendix B).  Groups 2, 4, 5, 7 and 9 are the scheme, authority, path,
# query and fragment respectively.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match


def urlsplit(absolute_uri):
    """Return scheme, authority, path, query, fragment.

    The path is always a string (possibly empty).  Each of the other four
    components is None when it is absent from *absolute_uri*, and "" when
    its delimiter is present but nothing follows it.
    """
    # Every part of the pattern is optional, so the match cannot fail for
    # any string; there is no "no match" case to handle.
    return SPLIT_MATCH(absolute_uri).group(2, 4, 5, 7, 9)
|
---|
| 73 |
|
---|
def urlunsplit(parts):
    """Join a (scheme, authority, path, query, fragment) tuple into a string.

    A component that is None is omitted together with its delimiter; an
    empty-string component still emits its delimiter.  The path is always
    emitted as given.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces.extend((scheme, ":"))
    if authority is not None:
        pieces.extend(("//", authority))
    pieces.append(path)
    if query is not None:
        pieces.extend(("?", query))
    if fragment is not None:
        pieces.extend(("#", fragment))
    return "".join(pieces)
|
---|
| 92 |
|
---|
def urljoin(base_uri, uri_reference):
    """Resolve *uri_reference* against *base_uri*; return the result string.

    Both arguments are split with urlsplit(), combined by urljoin_parts(),
    and the resulting parts are reassembled with urlunsplit().
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    target_parts = urljoin_parts(base_parts, reference_parts)
    return urlunsplit(target_parts)
|
---|
| 96 |
|
---|
| 97 | # oops, this doesn't do the same thing as the literal translation
|
---|
| 98 | # from the RFC below
|
---|
| 99 | ## import posixpath
|
---|
| 100 | ## def urljoin_parts(base_parts, reference_parts):
|
---|
| 101 | ## scheme, authority, path, query, fragment = base_parts
|
---|
| 102 | ## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
|
---|
| 103 |
|
---|
| 104 | ## # compute target URI path
|
---|
| 105 | ## if rpath == "":
|
---|
| 106 | ## tpath = path
|
---|
| 107 | ## else:
|
---|
| 108 | ## tpath = rpath
|
---|
| 109 | ## if not tpath.startswith("/"):
|
---|
| 110 | ## tpath = merge(authority, path, tpath)
|
---|
| 111 | ## tpath = posixpath.normpath(tpath)
|
---|
| 112 |
|
---|
| 113 | ## if rscheme is not None:
|
---|
| 114 | ## return (rscheme, rauthority, tpath, rquery, rfragment)
|
---|
| 115 | ## elif rauthority is not None:
|
---|
| 116 | ## return (scheme, rauthority, tpath, rquery, rfragment)
|
---|
| 117 | ## elif rpath == "":
|
---|
| 118 | ## if rquery is not None:
|
---|
| 119 | ## tquery = rquery
|
---|
| 120 | ## else:
|
---|
| 121 | ## tquery = query
|
---|
| 122 | ## return (scheme, authority, tpath, tquery, rfragment)
|
---|
| 123 | ## else:
|
---|
| 124 | ## return (scheme, authority, tpath, rquery, rfragment)
|
---|
| 125 |
|
---|
def urljoin_parts(base_parts, reference_parts):
    """Resolve a split URI reference against a split base URI.

    Both arguments are (scheme, authority, path, query, fragment) tuples;
    the return value is a tuple of the same shape.  The fragment of the
    result always comes from the reference; the base fragment is ignored.
    """
    base_scheme, base_authority, base_path, base_query, _ = base_parts
    (ref_scheme, ref_authority, ref_path,
     ref_query, ref_fragment) = reference_parts

    # A reference scheme identical to the base scheme is treated as if the
    # reference had no scheme at all.
    if ref_scheme is not None and ref_scheme != base_scheme:
        return (ref_scheme, ref_authority, remove_dot_segments(ref_path),
                ref_query, ref_fragment)

    # Reference carries its own authority: only the scheme is inherited.
    if ref_authority is not None:
        return (base_scheme, ref_authority, remove_dot_segments(ref_path),
                ref_query, ref_fragment)

    # Empty reference path: keep the base path untouched, and fall back to
    # the base query only when the reference has none.
    if ref_path == "":
        if ref_query is None:
            target_query = base_query
        else:
            target_query = ref_query
        return (base_scheme, base_authority, base_path,
                target_query, ref_fragment)

    # Absolute reference paths are used directly; relative ones are merged
    # onto the base path first.  Either way dot segments are then removed.
    if ref_path.startswith("/"):
        raw_path = ref_path
    else:
        raw_path = merge(base_authority, base_path, ref_path)
    return (base_scheme, base_authority, remove_dot_segments(raw_path),
            ref_query, ref_fragment)
|
---|
| 158 |
|
---|
| 159 | # um, something *vaguely* like this is what I want, but I have to generate
|
---|
| 160 | # lots of test cases first, if only to understand what it is that
|
---|
| 161 | # remove_dot_segments really does...
|
---|
| 162 | ## def remove_dot_segments(path):
|
---|
| 163 | ## if path == '':
|
---|
| 164 | ## return ''
|
---|
| 165 | ## comps = path.split('/')
|
---|
| 166 | ## new_comps = []
|
---|
| 167 | ## for comp in comps:
|
---|
| 168 | ## if comp in ['.', '']:
|
---|
| 169 | ## if not new_comps or new_comps[-1]:
|
---|
| 170 | ## new_comps.append('')
|
---|
| 171 | ## continue
|
---|
| 172 | ## if comp != '..':
|
---|
| 173 | ## new_comps.append(comp)
|
---|
| 174 | ## elif new_comps:
|
---|
| 175 | ## new_comps.pop()
|
---|
| 176 | ## return '/'.join(new_comps)
|
---|
| 177 |
|
---|
| 178 |
|
---|
def remove_dot_segments(path):
    """Return *path* with its "." and ".." segments resolved.

    The input is consumed from the left.  Ordinary segments (each with its
    leading "/", if any) are moved to an output list; a "/.." step discards
    the most recently moved segment, if there is one.  The letters in the
    comments are the step labels used by the original code.
    """
    kept = []
    rest = path
    while rest:
        if rest.startswith("../"):
            # A: drop a leading "../"
            rest = rest[3:]
        elif rest.startswith("./"):
            # A: drop a leading "./"
            rest = rest[2:]
        elif rest.startswith("/./"):
            # B: "/./xyz" -> "/xyz"
            rest = rest[2:]
        elif rest == "/.":
            # B: trailing "/." -> "/"
            rest = "/"
        elif rest.startswith("/../") or rest == "/..":
            # C: "/../xyz" -> "/xyz" and "/.." -> "/"; either way the last
            # kept segment is thrown away.
            rest = "/" + rest[4:]
            if kept:
                kept.pop()
        elif rest in (".", ".."):
            # D: a lone "." or ".." disappears
            rest = ""
        else:
            # E: move the first segment to the output.  Searching from
            # index 1 skips a leading "/" and cannot miss anything, because
            # index 0 only holds a "/" when the path starts with one.
            cut = rest.find("/", 1)
            if cut < 0:
                kept.append(rest)
                rest = ""
            else:
                kept.append(rest[:cut])
                rest = rest[cut:]
    return "".join(kept)
|
---|
| 226 |
|
---|
def merge(base_authority, base_path, ref_path):
    """Combine a relative reference path with the base URI's path.

    An empty *base_path* yields "/" + *ref_path*.  Otherwise *ref_path*
    replaces everything after the last "/" of *base_path*, or the whole of
    *base_path* if it contains no "/".  *base_authority* is accepted but
    not consulted.
    """
    # XXXX Oddly, Roy Fielding's sample Perl implementation doesn't take the
    # base authority as a parameter either, despite the wording in the RFC
    # suggesting otherwise.  Perhaps there is some obvious identity at work.
    # The RFC-worded test would have been:
    #   if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    # rfind() gives -1 when there is no "/", which makes the slice empty and
    # leaves just ref_path.
    last_slash = base_path.rfind("/")
    return base_path[:last_slash + 1] + ref_path
|
---|
| 238 |
|
---|
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod
    testmod()
|
---|