# python/lib/Lib/urlparse.py - platform/tools/idea - Git at Google

 """Parse (absolute and relative) URLs.

 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
 UC Irvine, June 1995.
 """

 # Public names of this module.
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
            "urlsplit", "urlunsplit"]

 # A classification of schemes.  The empty string is the default value
 # of the ``scheme`` argument of urlsplit()/urlparse(), so listing ''
 # makes a rule apply to URLs that name no scheme.

 # Schemes for which urljoin() resolves url against base; for any other
 # scheme urljoin() returns url unchanged.
 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                  'wais', 'file', 'https', 'shttp', 'mms',
                  'prospero', 'rtsp', 'rtspu', '', 'sftp']
 # Schemes for which urlsplit() parses a leading '//netloc', urlunsplit()
 # writes '//' even for an empty netloc, and urljoin() inherits the
 # base netloc.
 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
                'imap', 'wais', 'file', 'mms', 'https', 'shttp',
                'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
                'svn', 'svn+ssh', 'sftp']
 # Not consulted by any function visible in this file.
 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                     'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
 # Schemes for which urlparse() splits ';params' off the path.
 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
                'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
                'mms', '', 'sftp']
 # Schemes for which urlsplit() splits '?query' off the path.
 uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
               'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
 # Schemes for which urlsplit() splits '#fragment' off the URL.
 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                  'nntp', 'wais', 'https', 'shttp', 'snews',
                  'file', 'prospero', '']

 # Characters valid in scheme names; urlsplit() only accepts the text
 # before the first ':' as a scheme when it consists of these.
 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 '0123456789'
                 '+-.')

 # urlsplit() empties the whole cache once it holds this many entries.
 MAX_CACHE_SIZE = 20
 # Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
 # SplitResult that urlsplit() produced for those arguments.
 _parse_cache = {}

 def clear_cache():
     """Discard every memoised urlsplit() result.

     The module-level cache name is rebound to a brand-new dictionary;
     the previous dictionary object itself is left untouched.
     """
     global _parse_cache
     _parse_cache = dict()


 class BaseResult(tuple):
     """Base class for the parsed result objects.

     This provides the attributes shared by the two derived result
     objects as read-only properties.  The derived classes are
     responsible for checking the right number of arguments were
     supplied to the constructor.

     Field positions: scheme, netloc and path are the first three
     items; query and fragment are the last two, so the same accessors
     work for both 5- and 6-item results.
     """

     __slots__ = ()

     # Attributes that access the basic components of the URL:

     @property
     def scheme(self):
         """The URL scheme (item 0)."""
         return self[0]

     @property
     def netloc(self):
         """The network location, unparsed (item 1)."""
         return self[1]

     @property
     def path(self):
         """The path component (item 2)."""
         return self[2]

     @property
     def query(self):
         """The query string (second-to-last item)."""
         return self[-2]

     @property
     def fragment(self):
         """The fragment identifier (last item)."""
         return self[-1]

     # Helpers that take the netloc apart.

     def _userinfo(self):
         """Return the text before the host's '@', or None if there is none."""
         netloc = self.netloc
         if "@" not in netloc:
             return None
         # Split on the LAST '@': a password may itself contain '@',
         # whereas a host name cannot.
         return netloc.rsplit("@", 1)[0]

     def _hostport(self):
         """Return (host, port_text) taken from the netloc.

         port_text is '' when no port is present.  A bracketed IPv6
         literal such as '[::1]:80' yields the address without brackets.
         """
         hostport = self.netloc.rsplit("@", 1)[-1]
         if hostport[:1] == "[" and "]" in hostport:
             # The colons inside the brackets belong to the address, so
             # only a ':' directly after ']' introduces the port.
             close = hostport.index("]")
             host = hostport[1:close]
             tail = hostport[close + 1:]
             if tail[:1] == ":":
                 return host, tail[1:]
             return host, ""
         if ":" in hostport:
             host, port = hostport.split(":", 1)
             return host, port
         return hostport, ""

     # Additional attributes that provide access to parsed-out portions
     # of the netloc:

     @property
     def username(self):
         """User name from the netloc, or None when there is no userinfo."""
         userinfo = self._userinfo()
         if userinfo is None:
             return None
         return userinfo.split(":", 1)[0]

     @property
     def password(self):
         """Password from the netloc, or None when none is given."""
         userinfo = self._userinfo()
         if userinfo is not None and ":" in userinfo:
             return userinfo.split(":", 1)[1]
         return None

     @property
     def hostname(self):
         """Lower-cased host name, or None when the host part is empty."""
         return self._hostport()[0].lower() or None

     @property
     def port(self):
         """Port as an int, or None when absent or empty.

         Raises ValueError when the port text is not a decimal number.
         """
         port = self._hostport()[1]
         if not port:
             # 'host:' carries no port; report that instead of failing
             # inside int('').
             return None
         return int(port, 10)


 class SplitResult(BaseResult):
     """Five-item result: (scheme, netloc, path, query, fragment)."""

     __slots__ = ()

     def __new__(cls, scheme, netloc, path, query, fragment):
         # The explicit signature is what enforces the field count.
         fields = (scheme, netloc, path, query, fragment)
         return BaseResult.__new__(cls, fields)

     def geturl(self):
         """Return the URL string that urlunsplit() builds from this result."""
         return urlunsplit(self)


 class ParseResult(BaseResult):
     """Six-item result: (scheme, netloc, path, params, query, fragment)."""

     __slots__ = ()

     def __new__(cls, scheme, netloc, path, params, query, fragment):
         # The explicit signature is what enforces the field count.
         fields = (scheme, netloc, path, params, query, fragment)
         return BaseResult.__new__(cls, fields)

     @property
     def params(self):
         """The ';'-introduced parameters of the path (item 3)."""
         return self[3]

     def geturl(self):
         """Return the URL string that urlunparse() builds from this result."""
         return urlunparse(self)


 def urlparse(url, scheme='', allow_fragments=True):
     """Split a URL into six components.

     The layout recognised is
     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
     and the result is a ParseResult, i.e. the 6-tuple
     (scheme, netloc, path, params, query, fragment).  The components
     are not broken up further (netloc stays a single string) and
     % escapes are not expanded.
     """
     scheme, netloc, path, query, fragment = \
             urlsplit(url, scheme, allow_fragments)
     params = ''
     # Parameters are only split off for schemes that use them.
     if scheme in uses_params and ';' in path:
         path, params = _splitparams(path)
     return ParseResult(scheme, netloc, path, params, query, fragment)

 def _splitparams(url):
     """Split a path into (path, params) at a ';'.

     When the path contains a '/', only a ';' in its last segment
     counts, and ('url', '') is returned if that segment has none.
     Without any '/', the first ';' is the split point; the caller is
     expected to have checked that a ';' is present.
     """
     last_slash = url.rfind('/')
     if last_slash < 0:
         cut = url.find(';')
     else:
         cut = url.find(';', last_slash)
         if cut < 0:
             return url, ''
     return url[:cut], url[cut+1:]

 def _splitnetloc(url, start=0):
     """Return (netloc, rest) for the netloc that begins at index start.

     The netloc runs up to the earliest '/', '?' or '#' found at or
     after start, or to the end of the string when none occurs.
     """
     found = [url.find(ch, start) for ch in '/?#']
     # Keep real hits only; the string length is the fallback end.
     ends = [pos for pos in found if pos >= 0]
     ends.append(len(url))
     end = min(ends)
     return url[start:end], url[end:]

 def urlsplit(url, scheme='', allow_fragments=True):
     """Split a URL into five components.

     The layout recognised is
     <scheme>://<netloc>/<path>?<query>#<fragment>
     and the result is a SplitResult, i.e. the 5-tuple
     (scheme, netloc, path, query, fragment).  The components are not
     broken up further (netloc stays a single string) and % escapes are
     not expanded.

     scheme is the value reported when the URL names no scheme of its
     own; a false allow_fragments keeps '#' from being treated as a
     delimiter.  Results are memoised in _parse_cache.
     """
     allow_fragments = bool(allow_fragments)
     cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
     hit = _parse_cache.get(cache_key)
     if hit:
         return hit
     if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
         clear_cache()
     netloc = ''
     query = ''
     fragment = ''
     colon = url.find(':')
     if colon > 0:
         prefix = url[:colon]
         rest = url[colon+1:]
         if prefix == 'http':
             # Common case: skip the scheme-table lookups entirely.
             if rest[:2] == '//':
                 netloc, rest = _splitnetloc(rest, 2)
             if allow_fragments and '#' in rest:
                 rest, fragment = rest.split('#', 1)
             if '?' in rest:
                 rest, query = rest.split('?', 1)
             result = SplitResult(prefix.lower(), netloc, rest,
                                  query, fragment)
             _parse_cache[cache_key] = result
             return result
         # The prefix is a scheme only if every character is legal.
         if not [ch for ch in prefix if ch not in scheme_chars]:
             scheme = prefix.lower()
             url = rest
     if scheme in uses_netloc and url[:2] == '//':
         netloc, url = _splitnetloc(url, 2)
     if allow_fragments and scheme in uses_fragment and '#' in url:
         url, fragment = url.split('#', 1)
     if scheme in uses_query and '?' in url:
         url, query = url.split('?', 1)
     result = SplitResult(scheme, netloc, url, query, fragment)
     _parse_cache[cache_key] = result
     return result

 def urlunparse(data):
     """Put a parsed URL back together again.  This may result in a
     slightly different, but equivalent URL, if the URL that was parsed
     originally had redundant delimiters, e.g. a ? with an empty query
     (the draft states that these are equivalent).

     data is any 6-item iterable
     (scheme, netloc, url, params, query, fragment), for instance the
     value returned by urlparse().  Returns the URL as a string.
     """
     # Unpack in the body: tuple parameters in the signature are a
     # syntax error on Python 3 (PEP 3113).
     scheme, netloc, url, params, query, fragment = data
     if params:
         url = "%s;%s" % (url, params)
     return urlunsplit((scheme, netloc, url, query, fragment))

 def urlunsplit(data):
     """Combine the five items of a split URL into a URL string.

     data is any 5-item iterable (scheme, netloc, url, query, fragment),
     for instance the value returned by urlsplit().  Empty query and
     fragment items are left out together with their '?' / '#'.
     """
     # Unpack in the body: tuple parameters in the signature are a
     # syntax error on Python 3 (PEP 3113).
     scheme, netloc, url, query, fragment = data
     # Write '//' when there is a netloc, or when the scheme is one that
     # uses a netloc and the path does not already start with '//'.
     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
         # A non-empty path after a netloc has to start with '/'.
         if url and url[:1] != '/':
             url = '/' + url
         url = '//' + (netloc or '') + url
     if scheme:
         url = scheme + ':' + url
     if query:
         url = url + '?' + query
     if fragment:
         url = url + '#' + fragment
     return url

 def urljoin(base, url, allow_fragments=True):
     """Join a base URL and a possibly relative URL to form an absolute
     interpretation of the latter.

     base -- URL that a relative url is resolved against.
     url -- the URL to resolve, relative or absolute.
     allow_fragments -- handed to urlparse() for both arguments.

     Returns a URL string.  An empty base returns url unchanged and an
     empty url returns base unchanged.
     """
     if not base:
         return url
     if not url:
         return base
     # bfragment is unpacked but never used below.
     bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
             urlparse(base, '', allow_fragments)
     # The base scheme is the default for url, so a url without a scheme
     # of its own passes the comparison below.
     scheme, netloc, path, params, query, fragment = \
             urlparse(url, bscheme, allow_fragments)
     # Different scheme, or a scheme without relative forms: nothing to
     # resolve, url is returned as given.
     if scheme != bscheme or scheme not in uses_relative:
         return url
     if scheme in uses_netloc:
         if netloc:
             # url names its own netloc; only the scheme came from base.
             return urlunparse((scheme, netloc, path,
                                params, query, fragment))
         netloc = bnetloc
     if path[:1] == '/':
         # An absolute path replaces the base path outright.
         return urlunparse((scheme, netloc, path,
                            params, query, fragment))
     if not (path or params or query):
         # At most a fragment was given: keep the base's path, params
         # and query, and use url's fragment.
         return urlunparse((scheme, netloc, bpath,
                            bparams, bquery, fragment))
     # Drop the last segment of the base path and append url's segments.
     segments = bpath.split('/')[:-1] + path.split('/')
     # XXX The stuff below is bogus in various ways...
     # A trailing '.' becomes '' so the joined path ends with '/'.
     if segments[-1] == '.':
         segments[-1] = ''
     # Every remaining '.' segment is simply dropped.
     while '.' in segments:
         segments.remove('.')
     # Collapse "<name>/.." pairs one at a time.  After each deletion the
     # scan restarts from the front; the inner loop's else clause ends
     # the outer loop once a full scan deleted nothing.  The last segment
     # is never examined here (i < n) and is handled afterwards.
     while 1:
         i = 1
         n = len(segments) - 1
         while i < n:
             if (segments[i] == '..'
                 and segments[i-1] not in ('', '..')):
                 del segments[i-1:i+1]
                 break
             i = i+1
         else:
             break
     if segments == ['', '..']:
         # '/..' is reduced to '/'.
         segments[-1] = ''
     elif len(segments) >= 2 and segments[-1] == '..':
         # A trailing '..' and the segment before it are replaced by a
         # single '' so the result ends with '/'.
         segments[-2:] = ['']
     return urlunparse((scheme, netloc, '/'.join(segments),
                        params, query, fragment))

 def urldefrag(url):
     """Strip the fragment from a URL.

     Returns the pair (url_without_fragment, fragment).  A URL that
     contains no '#' is returned untouched together with an empty
     string.
     """
     if '#' not in url:
         return url, ''
     scheme, netloc, path, params, query, fragment = urlparse(url)
     stripped = urlunparse((scheme, netloc, path, params, query, ''))
     return stripped, fragment


 # Default input for test().  The first non-blank line is the base URL;
 # each later line is "<url> = <URL:expected>", where expected is what
 # urljoin(base, url) should return.  test() splits every line on
 # whitespace, so the column alignment is cosmetic.
 test_input = """
       http://a/b/c/d

       g:h        = <URL:g:h>
       http:g     = <URL:http://a/b/c/g>
       http:      = <URL:http://a/b/c/d>
       g          = <URL:http://a/b/c/g>
       ./g        = <URL:http://a/b/c/g>
       g/         = <URL:http://a/b/c/g/>
       /g         = <URL:http://a/g>
       //g        = <URL:http://g>
       ?y         = <URL:http://a/b/c/d?y>
       g?y        = <URL:http://a/b/c/g?y>
       g?y/./x    = <URL:http://a/b/c/g?y/./x>
       .          = <URL:http://a/b/c/>
       ./         = <URL:http://a/b/c/>
       ..         = <URL:http://a/b/>
       ../        = <URL:http://a/b/>
       ../g       = <URL:http://a/b/g>
       ../..      = <URL:http://a/>
       ../../g    = <URL:http://a/g>
       ../../../g = <URL:http://a/../g>
       ./../g     = <URL:http://a/b/g>
       ./g/.      = <URL:http://a/b/c/g/>
       /./g       = <URL:http://a/./g>
       g/./h      = <URL:http://a/b/c/g/h>
       g/../h     = <URL:http://a/b/c/h>
       http:g     = <URL:http://a/b/c/g>
       http:      = <URL:http://a/b/c/d>
       http:?y         = <URL:http://a/b/c/d?y>
       http:g?y        = <URL:http://a/b/c/g?y>
       http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
 """

 def test():
     """Run the urljoin() self-test and print the results.

     Input comes from the file named by the first command-line
     argument ('-' means standard input), or from test_input when no
     argument is given.  The first URL read becomes the base; every
     URL is parsed and joined against it, and a line of the form
     "<url> = <expected>" whose expectation is not met is reported
     with an EXPECTED line.
     """
     import sys
     base = ''
     opened = None   # file opened here, which therefore must be closed here
     if sys.argv[1:]:
         fn = sys.argv[1]
         if fn == '-':
             fp = sys.stdin
         else:
             fp = opened = open(fn)
     else:
         try:
             from cStringIO import StringIO
         except ImportError:
             try:
                 from StringIO import StringIO
             except ImportError:
                 # Neither Python 2 module exists on Python 3.
                 from io import StringIO
         fp = StringIO(test_input)
     try:
         while 1:
             line = fp.readline()
             if not line:
                 break
             words = line.split()
             if not words:
                 continue
             url = words[0]
             parts = urlparse(url)
             # Single-argument print(...) behaves the same as a
             # statement on Python 2 and as a function on Python 3.
             print('%-10s : %s' % (url, parts))
             joined = urljoin(base, url)
             if not base:
                 base = joined
             wrapped = '<URL:%s>' % joined
             print('%-10s = %s' % (url, wrapped))
             if len(words) == 3 and words[1] == '=':
                 if wrapped != words[2]:
                     print('EXPECTED %s !!!!!!!!!!' % words[2])
     finally:
         if opened is not None:
             opened.close()

 # Run the self-test when the file is executed as a script.
 if __name__ == '__main__':
     test()
	"""Parse (absolute and relative) URLs.

	See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
	UC Irvine, June 1995.
	"""

	# Public names of this module.
	__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
	"urlsplit", "urlunsplit"]

	# A classification of schemes.  The empty string is the default value
	# of the ``scheme`` argument of urlsplit()/urlparse(), so listing ''
	# makes a rule apply to URLs that name no scheme.

	# Schemes for which urljoin() resolves url against base; for any other
	# scheme urljoin() returns url unchanged.
	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
	'wais', 'file', 'https', 'shttp', 'mms',
	'prospero', 'rtsp', 'rtspu', '', 'sftp']
	# Schemes for which urlsplit() parses a leading '//netloc', urlunsplit()
	# writes '//' even for an empty netloc, and urljoin() inherits the
	# base netloc.
	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
	'svn', 'svn+ssh', 'sftp']
	# Not consulted by any function visible in this file.
	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
	# Schemes for which urlparse() splits ';params' off the path.
	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
	'mms', '', 'sftp']
	# Schemes for which urlsplit() splits '?query' off the path.
	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
	# Schemes for which urlsplit() splits '#fragment' off the URL.
	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
	'nntp', 'wais', 'https', 'shttp', 'snews',
	'file', 'prospero', '']

	# Characters valid in scheme names; urlsplit() only accepts the text
	# before the first ':' as a scheme when it consists of these.
	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
	'0123456789'
	'+-.')

	# urlsplit() empties the whole cache once it holds this many entries.
	MAX_CACHE_SIZE = 20
	# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
	# SplitResult that urlsplit() produced for those arguments.
	_parse_cache = {}

	def clear_cache():
	    """Discard every memoised urlsplit() result.

	    The module-level cache name is rebound to a brand-new dictionary;
	    the previous dictionary object itself is left untouched.
	    """
	    # Body indentation restored; it had been flattened.
	    global _parse_cache
	    _parse_cache = {}


	class BaseResult(tuple):
	    """Base class for the parsed result objects.

	    This provides the attributes shared by the two derived result
	    objects as read-only properties.  The derived classes are
	    responsible for checking the right number of arguments were
	    supplied to the constructor.

	    Field positions: scheme, netloc and path are the first three
	    items; query and fragment are the last two, so the same accessors
	    work for both 5- and 6-item results.
	    """

	    __slots__ = ()

	    # Attributes that access the basic components of the URL:

	    @property
	    def scheme(self):
	        """The URL scheme (item 0)."""
	        return self[0]

	    @property
	    def netloc(self):
	        """The network location, unparsed (item 1)."""
	        return self[1]

	    @property
	    def path(self):
	        """The path component (item 2)."""
	        return self[2]

	    @property
	    def query(self):
	        """The query string (second-to-last item)."""
	        return self[-2]

	    @property
	    def fragment(self):
	        """The fragment identifier (last item)."""
	        return self[-1]

	    # Helpers that take the netloc apart.

	    def _userinfo(self):
	        """Return the text before the host's '@', or None if there is none."""
	        netloc = self.netloc
	        if "@" not in netloc:
	            return None
	        # Split on the LAST '@': a password may itself contain '@',
	        # whereas a host name cannot.
	        return netloc.rsplit("@", 1)[0]

	    def _hostport(self):
	        """Return (host, port_text) taken from the netloc.

	        port_text is '' when no port is present.  A bracketed IPv6
	        literal such as '[::1]:80' yields the address without brackets.
	        """
	        hostport = self.netloc.rsplit("@", 1)[-1]
	        if hostport[:1] == "[" and "]" in hostport:
	            # The colons inside the brackets belong to the address, so
	            # only a ':' directly after ']' introduces the port.
	            close = hostport.index("]")
	            host = hostport[1:close]
	            tail = hostport[close + 1:]
	            if tail[:1] == ":":
	                return host, tail[1:]
	            return host, ""
	        if ":" in hostport:
	            host, port = hostport.split(":", 1)
	            return host, port
	        return hostport, ""

	    # Additional attributes that provide access to parsed-out portions
	    # of the netloc:

	    @property
	    def username(self):
	        """User name from the netloc, or None when there is no userinfo."""
	        userinfo = self._userinfo()
	        if userinfo is None:
	            return None
	        return userinfo.split(":", 1)[0]

	    @property
	    def password(self):
	        """Password from the netloc, or None when none is given."""
	        userinfo = self._userinfo()
	        if userinfo is not None and ":" in userinfo:
	            return userinfo.split(":", 1)[1]
	        return None

	    @property
	    def hostname(self):
	        """Lower-cased host name, or None when the host part is empty."""
	        return self._hostport()[0].lower() or None

	    @property
	    def port(self):
	        """Port as an int, or None when absent or empty.

	        Raises ValueError when the port text is not a decimal number.
	        """
	        port = self._hostport()[1]
	        if not port:
	            # 'host:' carries no port; report that instead of failing
	            # inside int('').
	            return None
	        return int(port, 10)


	class SplitResult(BaseResult):
	    """Five-item result: (scheme, netloc, path, query, fragment)."""

	    # Class-body indentation restored; it had been flattened.
	    __slots__ = ()

	    def __new__(cls, scheme, netloc, path, query, fragment):
	        # The explicit signature is what enforces the field count.
	        return BaseResult.__new__(
	            cls, (scheme, netloc, path, query, fragment))

	    def geturl(self):
	        """Return the URL string that urlunsplit() builds from this result."""
	        return urlunsplit(self)


	class ParseResult(BaseResult):
	    """Six-item result: (scheme, netloc, path, params, query, fragment)."""

	    # Class-body indentation restored; it had been flattened.
	    __slots__ = ()

	    def __new__(cls, scheme, netloc, path, params, query, fragment):
	        # The explicit signature is what enforces the field count.
	        return BaseResult.__new__(
	            cls, (scheme, netloc, path, params, query, fragment))

	    @property
	    def params(self):
	        """The ';'-introduced parameters of the path (item 3)."""
	        return self[3]

	    def geturl(self):
	        """Return the URL string that urlunparse() builds from this result."""
	        return urlunparse(self)


	def urlparse(url, scheme='', allow_fragments=True):
	    """Parse a URL into 6 components:
	    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
	    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
	    Note that we don't break the components up in smaller bits
	    (e.g. netloc is a single string) and we don't expand % escapes."""
	    # Body indentation restored; the local formerly named ``tuple``
	    # no longer hides the builtin.
	    parts = urlsplit(url, scheme, allow_fragments)
	    scheme, netloc, url, query, fragment = parts
	    # Parameters are only split off for schemes that use them.
	    if scheme in uses_params and ';' in url:
	        url, params = _splitparams(url)
	    else:
	        params = ''
	    return ParseResult(scheme, netloc, url, params, query, fragment)

	def _splitparams(url):
	    """Split a path into (path, params) at a ';'.

	    When the path contains a '/', only a ';' in its last segment
	    counts, and (url, '') is returned if that segment has none.
	    Without any '/', the first ';' is the split point; the caller is
	    expected to have checked that a ';' is present.
	    """
	    # Body indentation restored; it had been flattened.
	    if '/' in url:
	        i = url.find(';', url.rfind('/'))
	        if i < 0:
	            return url, ''
	    else:
	        i = url.find(';')
	    return url[:i], url[i+1:]

	def _splitnetloc(url, start=0):
	    """Return (netloc, rest) for the netloc that begins at index start.

	    The netloc runs up to the earliest '/', '?' or '#' found at or
	    after start, or to the end of the string when none occurs.
	    """
	    # Body indentation restored; it had been flattened.
	    delim = len(url)   # end of the netloc; defaults to end of string
	    for c in '/?#':    # the order of the delimiters is NOT important
	        wdelim = url.find(c, start)
	        if wdelim >= 0:
	            delim = min(delim, wdelim)     # keep the earliest hit
	    return url[start:delim], url[delim:]

	def urlsplit(url, scheme='', allow_fragments=True):
	    """Parse a URL into 5 components:
	    <scheme>://<netloc>/<path>?<query>#<fragment>
	    Return a 5-tuple: (scheme, netloc, path, query, fragment).
	    Note that we don't break the components up in smaller bits
	    (e.g. netloc is a single string) and we don't expand % escapes.

	    scheme is the value reported when the URL names no scheme of its
	    own; a false allow_fragments keeps '#' from being treated as a
	    delimiter.  Results are memoised in _parse_cache."""
	    # Body indentation restored; it had been flattened.
	    allow_fragments = bool(allow_fragments)
	    key = url, scheme, allow_fragments, type(url), type(scheme)
	    cached = _parse_cache.get(key, None)
	    if cached:
	        return cached
	    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
	        clear_cache()
	    netloc = query = fragment = ''
	    i = url.find(':')
	    if i > 0:
	        if url[:i] == 'http': # optimize the common case
	            scheme = url[:i].lower()
	            url = url[i+1:]
	            if url[:2] == '//':
	                netloc, url = _splitnetloc(url, 2)
	            if allow_fragments and '#' in url:
	                url, fragment = url.split('#', 1)
	            if '?' in url:
	                url, query = url.split('?', 1)
	            v = SplitResult(scheme, netloc, url, query, fragment)
	            _parse_cache[key] = v
	            return v
	        # The prefix is a scheme only if every character is legal;
	        # the else clause runs when the loop finished without a break.
	        for c in url[:i]:
	            if c not in scheme_chars:
	                break
	        else:
	            scheme, url = url[:i].lower(), url[i+1:]
	    if scheme in uses_netloc and url[:2] == '//':
	        netloc, url = _splitnetloc(url, 2)
	    if allow_fragments and scheme in uses_fragment and '#' in url:
	        url, fragment = url.split('#', 1)
	    if scheme in uses_query and '?' in url:
	        url, query = url.split('?', 1)
	    v = SplitResult(scheme, netloc, url, query, fragment)
	    _parse_cache[key] = v
	    return v

	def urlunparse(data):
	    """Put a parsed URL back together again.  This may result in a
	    slightly different, but equivalent URL, if the URL that was parsed
	    originally had redundant delimiters, e.g. a ? with an empty query
	    (the draft states that these are equivalent).

	    data is any 6-item iterable
	    (scheme, netloc, url, params, query, fragment), for instance the
	    value returned by urlparse().  Returns the URL as a string.
	    """
	    # Body indentation restored.  Unpack in the body: tuple parameters
	    # in the signature are a syntax error on Python 3 (PEP 3113).
	    scheme, netloc, url, params, query, fragment = data
	    if params:
	        url = "%s;%s" % (url, params)
	    return urlunsplit((scheme, netloc, url, query, fragment))

	def urlunsplit(data):
	    """Combine the five items of a split URL into a URL string.

	    data is any 5-item iterable (scheme, netloc, url, query, fragment),
	    for instance the value returned by urlsplit().  Empty query and
	    fragment items are left out together with their '?' / '#'.
	    """
	    # Body indentation restored.  Unpack in the body: tuple parameters
	    # in the signature are a syntax error on Python 3 (PEP 3113).
	    scheme, netloc, url, query, fragment = data
	    # Write '//' when there is a netloc, or when the scheme is one that
	    # uses a netloc and the path does not already start with '//'.
	    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
	        # A non-empty path after a netloc has to start with '/'.
	        if url and url[:1] != '/':
	            url = '/' + url
	        url = '//' + (netloc or '') + url
	    if scheme:
	        url = scheme + ':' + url
	    if query:
	        url = url + '?' + query
	    if fragment:
	        url = url + '#' + fragment
	    return url

	def urljoin(base, url, allow_fragments=True):
	    """Join a base URL and a possibly relative URL to form an absolute
	    interpretation of the latter.

	    base -- URL that a relative url is resolved against.
	    url -- the URL to resolve, relative or absolute.
	    allow_fragments -- handed to urlparse() for both arguments.

	    Returns a URL string.  An empty base returns url unchanged and an
	    empty url returns base unchanged.
	    """
	    # Body indentation restored; the statements are otherwise unchanged.
	    if not base:
	        return url
	    if not url:
	        return base
	    # bfragment is unpacked but never used below.
	    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
	            urlparse(base, '', allow_fragments)
	    # The base scheme is the default for url, so a url without a scheme
	    # of its own passes the comparison below.
	    scheme, netloc, path, params, query, fragment = \
	            urlparse(url, bscheme, allow_fragments)
	    if scheme != bscheme or scheme not in uses_relative:
	        return url
	    if scheme in uses_netloc:
	        if netloc:
	            # url names its own netloc; only the scheme came from base.
	            return urlunparse((scheme, netloc, path,
	                               params, query, fragment))
	        netloc = bnetloc
	    if path[:1] == '/':
	        # An absolute path replaces the base path outright.
	        return urlunparse((scheme, netloc, path,
	                           params, query, fragment))
	    if not (path or params or query):
	        # At most a fragment was given: keep the base's path, params
	        # and query, and use url's fragment.
	        return urlunparse((scheme, netloc, bpath,
	                           bparams, bquery, fragment))
	    # Drop the last segment of the base path and append url's segments.
	    segments = bpath.split('/')[:-1] + path.split('/')
	    # XXX The stuff below is bogus in various ways...
	    if segments[-1] == '.':
	        segments[-1] = ''
	    while '.' in segments:
	        segments.remove('.')
	    # Collapse "<name>/.." pairs one at a time.  After each deletion the
	    # scan restarts; the inner loop's else clause ends the outer loop
	    # once a full scan deleted nothing.  The last segment is handled
	    # after the loop.
	    while 1:
	        i = 1
	        n = len(segments) - 1
	        while i < n:
	            if (segments[i] == '..'
	                and segments[i-1] not in ('', '..')):
	                del segments[i-1:i+1]
	                break
	            i = i+1
	        else:
	            break
	    if segments == ['', '..']:
	        segments[-1] = ''
	    elif len(segments) >= 2 and segments[-1] == '..':
	        segments[-2:] = ['']
	    return urlunparse((scheme, netloc, '/'.join(segments),
	                       params, query, fragment))

	def urldefrag(url):
	    """Removes any existing fragment from URL.

	    Returns a tuple of the defragmented URL and the fragment.  If
	    the URL contained no fragments, the second element is the
	    empty string.
	    """
	    # Body indentation restored; it had been flattened.
	    if '#' in url:
	        s, n, p, a, q, frag = urlparse(url)
	        defrag = urlunparse((s, n, p, a, q, ''))
	        return defrag, frag
	    else:
	        return url, ''


	# Default input for test().  The first non-blank line is the base URL;
	# each later line is "<url> = <URL:expected>", where expected is what
	# urljoin(base, url) should return.  test() splits every line on
	# whitespace.
	test_input = """
	http://a/b/c/d

	g:h = <URL:g:h>
	http:g = <URL:http://a/b/c/g>
	http: = <URL:http://a/b/c/d>
	g = <URL:http://a/b/c/g>
	./g = <URL:http://a/b/c/g>
	g/ = <URL:http://a/b/c/g/>
	/g = <URL:http://a/g>
	//g = <URL:http://g>
	?y = <URL:http://a/b/c/d?y>
	g?y = <URL:http://a/b/c/g?y>
	g?y/./x = <URL:http://a/b/c/g?y/./x>
	. = <URL:http://a/b/c/>
	./ = <URL:http://a/b/c/>
	.. = <URL:http://a/b/>
	../ = <URL:http://a/b/>
	../g = <URL:http://a/b/g>
	../.. = <URL:http://a/>
	../../g = <URL:http://a/g>
	../../../g = <URL:http://a/../g>
	./../g = <URL:http://a/b/g>
	./g/. = <URL:http://a/b/c/g/>
	/./g = <URL:http://a/./g>
	g/./h = <URL:http://a/b/c/g/h>
	g/../h = <URL:http://a/b/c/h>
	http:g = <URL:http://a/b/c/g>
	http: = <URL:http://a/b/c/d>
	http:?y = <URL:http://a/b/c/d?y>
	http:g?y = <URL:http://a/b/c/g?y>
	http:g?y/./x = <URL:http://a/b/c/g?y/./x>
	"""

	def test():
	    """Run the urljoin() self-test and print the results.

	    Input comes from the file named by the first command-line
	    argument ('-' means standard input), or from test_input when no
	    argument is given.  The first URL read becomes the base; every
	    URL is parsed and joined against it, and a line of the form
	    "<url> = <expected>" whose expectation is not met is reported
	    with an EXPECTED line.
	    """
	    # Body indentation restored; it had been flattened.
	    import sys
	    base = ''
	    opened = None   # file opened here, which therefore must be closed here
	    if sys.argv[1:]:
	        fn = sys.argv[1]
	        if fn == '-':
	            fp = sys.stdin
	        else:
	            fp = opened = open(fn)
	    else:
	        try:
	            from cStringIO import StringIO
	        except ImportError:
	            try:
	                from StringIO import StringIO
	            except ImportError:
	                # Neither Python 2 module exists on Python 3.
	                from io import StringIO
	        fp = StringIO(test_input)
	    try:
	        while 1:
	            line = fp.readline()
	            if not line:
	                break
	            words = line.split()
	            if not words:
	                continue
	            url = words[0]
	            parts = urlparse(url)
	            # Single-argument print(...) behaves the same as a
	            # statement on Python 2 and as a function on Python 3.
	            print('%-10s : %s' % (url, parts))
	            joined = urljoin(base, url)
	            if not base:
	                base = joined
	            wrapped = '<URL:%s>' % joined
	            print('%-10s = %s' % (url, wrapped))
	            if len(words) == 3 and words[1] == '=':
	                if wrapped != words[2]:
	                    print('EXPECTED %s !!!!!!!!!!' % words[2])
	    finally:
	        if opened is not None:
	            opened.close()

	# Run the self-test when the file is executed as a script.
	# (Indentation of the guarded call restored.)
	if __name__ == '__main__':
	    test()