Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: py-scraping/mechanize/_rfc3986.py@ 160

Last change on this file since 160 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 7.4 KB

Line
1	"""RFC 3986 URI parsing and relative reference resolution / absolutization.
2
3	(aka splitting and joining)
4
5	Copyright 2006 John J. Lee <jjl@pobox.com>
6
7	This code is free software; you can redistribute it and/or modify it under
8	the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
9	included with the distribution).
10
11	"""
12
13	# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
14
15	import re, urllib
16
17	## def chr_range(a, b):
18	## return "".join(map(chr, range(ord(a), ord(b)+1)))
19
20	## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
21	## "abcdefghijklmnopqrstuvwxyz"
22	## "0123456789"
23	## "-_.~")
24	## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
25	## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
# Matches any single character that is NOT allowed in a URI, i.e. anything
# outside the unreserved set (A-Z a-z 0-9 - _ . ~), the reserved set
# (! * ' ( ) ; : @ & = + $ , / ? # [ ]) and "%".
# Written as a raw string: "\-" and "\]" are regex escapes, not Python string
# escapes, and non-raw spellings of them are invalid escape sequences.
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
28
29
def clean_url(url, encoding):
    """Percent-encode the characters of *url* that are illegal in a URI.

    url: a byte string or a unicode string.  A byte string is first decoded
        with *encoding*; undecodable bytes are replaced instead of raising.
    encoding: codec used for that decode and for re-encoding the text before
        it is quoted.

    Returns the URL with leading/trailing whitespace removed and every
    character outside the URI character set percent-encoded.  "%" itself is
    treated as safe, so input that is already escaped is not escaped twice.
    """
    # Trying to come up with test cases for this gave me a headache; revisit
    # when we switch to unicode.
    # Somebody else's comments (lost the attribution):
    ##  - IE will return you the url in the encoding you send it
    ##  - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    ##    characters in your link. It will send you utf-8 however if there
    ##    are...

    # urllib.quote() lives in urllib.parse on Python 3.
    quote = getattr(urllib, "quote", None)
    if quote is None:
        from urllib.parse import quote

    # Only byte strings need decoding ("bytes" is an alias of "str" on
    # Python 2.6+); isinstance() also covers subclasses.
    if isinstance(url, bytes):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # For the second parameter of quote() we want URI_CHARS, minus the
    # 'always_safe' characters that quote() never percent-encodes.
    return quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
44
def is_clean_uri(uri):
    r"""Return True if *uri* contains only characters that are legal in a URI.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC\|")
    False
    >>> is_clean_uri(u"ABC\|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # Note: module re treats bytestrings as though they were decoded as
    # latin-1, so this function accepts both unicode and bytestrings.
    # The URI is clean exactly when no illegal character can be found.
    return BAD_URI_CHARS_RE.search(uri) is None
63
64
# Component-splitting regular expression from RFC 3986, Appendix B.
# Groups: 2 = scheme, 4 = authority, 5 = path, 7 = query, 9 = fragment.
# The "*" quantifiers are essential: without them each of authority, path,
# query and fragment would match exactly one character.  Every component is
# optional or may be empty, so the expression matches any string.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match


def urlsplit(absolute_uri):
    """Return scheme, authority, path, query, fragment.

    absolute_uri: the URI (or relative reference; urljoin() passes both) to
        split.

    Returns a 5-tuple.  The path is always a string (possibly empty); each of
    the other components is None when it is absent from the input, and may be
    the empty string when its delimiter is present with nothing after it
    (e.g. the query of "http://h?").
    """
    # SPLIT_MATCH cannot fail (see above), so no "no match" case is needed.
    return SPLIT_MATCH(absolute_uri).group(2, 4, 5, 7, 9)
73
def urlunsplit(parts):
    """Join (scheme, authority, path, query, fragment) back into a URI string.

    parts: a 5-item sequence as returned by urlsplit().  A component that is
        None is omitted together with its delimiter; an empty-string
        component still gets its delimiter.  The path is always emitted.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces += [scheme, ":"]
    if authority is not None:
        pieces += ["//", authority]
    pieces.append(path)
    if query is not None:
        pieces += ["?", query]
    if fragment is not None:
        pieces += ["#", fragment]
    return "".join(pieces)
92
def urljoin(base_uri, uri_reference):
    """Resolve *uri_reference* against *base_uri* and return the result.

    Both arguments are split with urlsplit(), combined with urljoin_parts(),
    and the resulting components are reassembled with urlunsplit().
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    target_parts = urljoin_parts(base_parts, reference_parts)
    return urlunsplit(target_parts)
96
97	# oops, this doesn't do the same thing as the literal translation
98	# from the RFC below
99	## import posixpath
100	## def urljoin_parts(base_parts, reference_parts):
101	## scheme, authority, path, query, fragment = base_parts
102	## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
103
104	## # compute target URI path
105	## if rpath == "":
106	## tpath = path
107	## else:
108	## tpath = rpath
109	## if not tpath.startswith("/"):
110	## tpath = merge(authority, path, tpath)
111	## tpath = posixpath.normpath(tpath)
112
113	## if rscheme is not None:
114	## return (rscheme, rauthority, tpath, rquery, rfragment)
115	## elif rauthority is not None:
116	## return (scheme, rauthority, tpath, rquery, rfragment)
117	## elif rpath == "":
118	## if rquery is not None:
119	## tquery = rquery
120	## else:
121	## tquery = query
122	## return (scheme, authority, tpath, tquery, rfragment)
123	## else:
124	## return (scheme, authority, tpath, rquery, rfragment)
125
def urljoin_parts(base_parts, reference_parts):
    """Resolve already-split reference components against base components.

    base_parts, reference_parts: 5-tuples of
        (scheme, authority, path, query, fragment), as returned by urlsplit().

    Returns the target 5-tuple.  The fragment always comes from the
    reference; the base fragment is never used.
    """
    base_scheme, base_authority, base_path, base_query, _ = base_parts
    (ref_scheme, ref_authority, ref_path,
     ref_query, ref_fragment) = reference_parts

    # A reference scheme equal to the base scheme is treated as if it were
    # absent, so only a *different* scheme makes the reference absolute.
    if ref_scheme is not None and not (ref_scheme == base_scheme):
        return (ref_scheme, ref_authority,
                remove_dot_segments(ref_path), ref_query, ref_fragment)

    # Reference carries its own authority: only the scheme is inherited.
    if ref_authority is not None:
        return (base_scheme, ref_authority,
                remove_dot_segments(ref_path), ref_query, ref_fragment)

    # Empty reference path: keep the base path, and the base query too unless
    # the reference supplies one.
    if ref_path == "":
        if ref_query is None:
            target_query = base_query
        else:
            target_query = ref_query
        return (base_scheme, base_authority, base_path,
                target_query, ref_fragment)

    # Non-empty path: absolute paths are used as they are, relative ones are
    # first merged with the base path; dot segments are removed either way.
    if ref_path.startswith("/"):
        unresolved_path = ref_path
    else:
        unresolved_path = merge(base_authority, base_path, ref_path)
    return (base_scheme, base_authority,
            remove_dot_segments(unresolved_path), ref_query, ref_fragment)
158
159	# um, something vaguely like this is what I want, but I have to generate
160	# lots of test cases first, if only to understand what it is that
161	# remove_dot_segments really does...
162	## def remove_dot_segments(path):
163	## if path == '':
164	## return ''
165	## comps = path.split('/')
166	## new_comps = []
167	## for comp in comps:
168	## if comp in ['.', '']:
169	## if not new_comps or new_comps[-1]:
170	## new_comps.append('')
171	## continue
172	## if comp != '..':
173	## new_comps.append(comp)
174	## elif new_comps:
175	## new_comps.pop()
176	## return '/'.join(new_comps)
177
178
def remove_dot_segments(path):
    """Return *path* with its "." and ".." segments interpreted and removed.

    The input is consumed from the left, one rule per iteration, while
    finished segments accumulate in an output list; the rule letters below
    are the ones used by the original comments (A to E).  A falsy *path*
    yields the empty string.
    """
    output = []
    while path:
        if path.startswith("../"):
            # A: drop a leading "../"
            path = path[3:]
        elif path.startswith("./"):
            # A: drop a leading "./"
            path = path[2:]
        elif path.startswith("/./"):
            # B: "/./" collapses to "/"
            path = path[2:]
        elif path == "/.":
            # B: a trailing "/." collapses to "/"
            path = "/"
        elif path.startswith("/../"):
            # C: "/../" collapses to "/" and cancels the previous segment
            path = path[3:]
            if output:
                output.pop()
        elif path == "/..":
            # C: a trailing "/.." does the same
            path = "/"
            if output:
                output.pop()
        elif path in (".", ".."):
            # D: a lone "." or ".." is discarded, leaving nothing to process
            path = ""
        else:
            # E: move the first segment (with its leading "/", if any) to
            # the output.  Searching from index 1 skips a leading slash and
            # makes no difference when there is none.
            cut = path.find("/", 1)
            if cut < 0:
                output.append(path)
                break
            output.append(path[:cut])
            path = path[cut:]
    return "".join(output)
226
def merge(base_authority, base_path, ref_path):
    """Merge a relative reference path with the base path.

    An empty base path gives "/" + ref_path.  Otherwise ref_path replaces
    everything after the last "/" of base_path, or the whole of base_path
    when it contains no "/".

    base_authority is accepted but not consulted.
    """
    # XXXX The RFC's wording suggests the first case should also require
    # the base to have an authority:
    #     if base_authority is not None and base_path == "":
    # but Roy Fielding's sample Perl implementation doesn't even take the
    # authority as a parameter, so it is ignored here as well.
    if base_path == "":
        return "/" + ref_path
    last_slash = base_path.rfind("/")
    if last_slash < 0:
        return ref_path
    return base_path[:last_slash + 1] + ref_path
238
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod
    testmod()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: