Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: py-scraping/mechanize/_rfc3986.py@ 108

Last change on this file since 108 was 106, checked in by Rick van der Zwet, 15 years ago
Initial commit...
File size: 7.4 KB

Rev	Line
[106]	1	"""RFC 3986 URI parsing and relative reference resolution / absolutization.
	2
	3	(aka splitting and joining)
	4
	5	Copyright 2006 John J. Lee <jjl@pobox.com>
	6
	7	This code is free software; you can redistribute it and/or modify it under
	8	the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
	9	included with the distribution).
	10
	11	"""
	12
	13	# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
	14
	15	import re, urllib
	16
	17	## def chr_range(a, b):
	18	## return "".join(map(chr, range(ord(a), ord(b)+1)))
	19
	20	## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	21	## "abcdefghijklmnopqrstuvwxyz"
	22	## "0123456789"
	23	## "-_.~")
	24	## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
	25	## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
	26	# this re matches any character that's not in URI_CHARS
	27	BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
	28
	29
	30	def clean_url(url, encoding):
	31	# percent-encode illegal URI characters
	32	# Trying to come up with test cases for this gave me a headache, revisit
	33	# when do switch to unicode.
	34	# Somebody else's comments (lost the attribution):
	35	## - IE will return you the url in the encoding you send it
	36	## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
	37	## characters in your link. It will send you utf-8 however if there are...
	38	if type(url) == type(""):
	39	url = url.decode(encoding, "replace")
	40	url = url.strip()
	41	# for second param to urllib.quote(), we want URI_CHARS, minus the
	42	# 'always_safe' characters that urllib.quote() never percent-encodes
	43	return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
	44
	45	def is_clean_uri(uri):
	46	"""
	47	>>> is_clean_uri("ABC!")
	48	True
	49	>>> is_clean_uri(u"ABC!")
	50	True
	51	>>> is_clean_uri("ABC\|")
	52	False
	53	>>> is_clean_uri(u"ABC\|")
	54	False
	55	>>> is_clean_uri("http://example.com/0")
	56	True
	57	>>> is_clean_uri(u"http://example.com/0")
	58	True
	59	"""
	60	# note module re treats bytestrings as through they were decoded as latin-1
	61	# so this function accepts both unicode and bytestrings
	62	return not bool(BAD_URI_CHARS_RE.search(uri))
	63
	64
	65	SPLIT_MATCH = re.compile(
	66	r"^(([^:/?#]+):)?(//([^/?#]))?([^?#])(\?([^#]))?(#(.))?").match
	67	def urlsplit(absolute_uri):
	68	"""Return scheme, authority, path, query, fragment."""
	69	match = SPLIT_MATCH(absolute_uri)
	70	if match:
	71	g = match.groups()
	72	return g[1], g[3], g[4], g[6], g[8]
	73
	74	def urlunsplit(parts):
	75	scheme, authority, path, query, fragment = parts
	76	r = []
	77	append = r.append
	78	if scheme is not None:
	79	append(scheme)
	80	append(":")
	81	if authority is not None:
	82	append("//")
	83	append(authority)
	84	append(path)
	85	if query is not None:
	86	append("?")
	87	append(query)
	88	if fragment is not None:
	89	append("#")
	90	append(fragment)
	91	return "".join(r)
	92
	93	def urljoin(base_uri, uri_reference):
	94	return urlunsplit(urljoin_parts(urlsplit(base_uri),
	95	urlsplit(uri_reference)))
	96
	97	# oops, this doesn't do the same thing as the literal translation
	98	# from the RFC below
	99	## import posixpath
	100	## def urljoin_parts(base_parts, reference_parts):
	101	## scheme, authority, path, query, fragment = base_parts
	102	## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
	103
	104	## # compute target URI path
	105	## if rpath == "":
	106	## tpath = path
	107	## else:
	108	## tpath = rpath
	109	## if not tpath.startswith("/"):
	110	## tpath = merge(authority, path, tpath)
	111	## tpath = posixpath.normpath(tpath)
	112
	113	## if rscheme is not None:
	114	## return (rscheme, rauthority, tpath, rquery, rfragment)
	115	## elif rauthority is not None:
	116	## return (scheme, rauthority, tpath, rquery, rfragment)
	117	## elif rpath == "":
	118	## if rquery is not None:
	119	## tquery = rquery
	120	## else:
	121	## tquery = query
	122	## return (scheme, authority, tpath, tquery, rfragment)
	123	## else:
	124	## return (scheme, authority, tpath, rquery, rfragment)
	125
	126	def urljoin_parts(base_parts, reference_parts):
	127	scheme, authority, path, query, fragment = base_parts
	128	rscheme, rauthority, rpath, rquery, rfragment = reference_parts
	129
	130	if rscheme == scheme:
	131	rscheme = None
	132
	133	if rscheme is not None:
	134	tscheme, tauthority, tpath, tquery = (
	135	rscheme, rauthority, remove_dot_segments(rpath), rquery)
	136	else:
	137	if rauthority is not None:
	138	tauthority, tpath, tquery = (
	139	rauthority, remove_dot_segments(rpath), rquery)
	140	else:
	141	if rpath == "":
	142	tpath = path
	143	if rquery is not None:
	144	tquery = rquery
	145	else:
	146	tquery = query
	147	else:
	148	if rpath.startswith("/"):
	149	tpath = remove_dot_segments(rpath)
	150	else:
	151	tpath = merge(authority, path, rpath)
	152	tpath = remove_dot_segments(tpath)
	153	tquery = rquery
	154	tauthority = authority
	155	tscheme = scheme
	156	tfragment = rfragment
	157	return (tscheme, tauthority, tpath, tquery, tfragment)
	158
	159	# um, something vaguely like this is what I want, but I have to generate
	160	# lots of test cases first, if only to understand what it is that
	161	# remove_dot_segments really does...
	162	## def remove_dot_segments(path):
	163	## if path == '':
	164	## return ''
	165	## comps = path.split('/')
	166	## new_comps = []
	167	## for comp in comps:
	168	## if comp in ['.', '']:
	169	## if not new_comps or new_comps[-1]:
	170	## new_comps.append('')
	171	## continue
	172	## if comp != '..':
	173	## new_comps.append(comp)
	174	## elif new_comps:
	175	## new_comps.pop()
	176	## return '/'.join(new_comps)
	177
	178
	179	def remove_dot_segments(path):
	180	r = []
	181	while path:
	182	# A
	183	if path.startswith("../"):
	184	path = path[3:]
	185	continue
	186	if path.startswith("./"):
	187	path = path[2:]
	188	continue
	189	# B
	190	if path.startswith("/./"):
	191	path = path[2:]
	192	continue
	193	if path == "/.":
	194	path = "/"
	195	continue
	196	# C
	197	if path.startswith("/../"):
	198	path = path[3:]
	199	if r:
	200	r.pop()
	201	continue
	202	if path == "/..":
	203	path = "/"
	204	if r:
	205	r.pop()
	206	continue
	207	# D
	208	if path == ".":
	209	path = path[1:]
	210	continue
	211	if path == "..":
	212	path = path[2:]
	213	continue
	214	# E
	215	start = 0
	216	if path.startswith("/"):
	217	start = 1
	218	ii = path.find("/", start)
	219	if ii < 0:
	220	ii = None
	221	r.append(path[:ii])
	222	if ii is None:
	223	break
	224	path = path[ii:]
	225	return "".join(r)
	226
	227	def merge(base_authority, base_path, ref_path):
	228	# XXXX Oddly, the sample Perl implementation of this by Roy Fielding
	229	# doesn't even take base_authority as a parameter, despite the wording in
	230	# the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
	231	#if base_authority is not None and base_path == "":
	232	if base_path == "":
	233	return "/" + ref_path
	234	ii = base_path.rfind("/")
	235	if ii >= 0:
	236	return base_path[:ii + 1] + ref_path
	237	return ref_path
	238
	239	if __name__ == "__main__":
	240	import doctest
	241	doctest.testmod()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: