asn1crypto/_iri.py - platform/external/python/asn1crypto - Git at Google

 # coding: utf-8

 """
 Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
 the following items:

  - iri_to_uri()
  - uri_to_iri()
 """

 from __future__ import unicode_literals, division, absolute_import, print_function

 from encodings import idna  # noqa
 import codecs
 import re
 import sys

 from ._errors import unwrap
 from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types

 if sys.version_info < (3,):
     from urlparse import urlsplit, urlunsplit
     from urllib import (
         quote as urlquote,
         unquote as unquote_to_bytes,
     )

 else:
     from urllib.parse import (
         quote as urlquote,
         unquote_to_bytes,
         urlsplit,
         urlunsplit,
     )


 def iri_to_uri(value, normalize=False):
     """
     Encodes a unicode IRI into an ASCII byte string URI

     :param value:
         A unicode string of an IRI

     :param normalize:
         A bool that controls URI normalization - when True, the default
         port for http (80) and https (443) is omitted, and a path of "/" is
         dropped when there is no query or fragment

     :raises:
         TypeError - when value is not a unicode string

     :return:
         A byte string of the ASCII-encoded URI
     """

     if not isinstance(value, str_cls):
         raise TypeError(unwrap(
             '''
             value must be a unicode string, not %s
             ''',
             type_name(value)
         ))

     scheme = None
     # Python 2.6 doesn't split properly if the URL doesn't start with http:// or https://
     if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
         real_prefix = None
         prefix_match = re.match('^[^:]*://', value)
         if prefix_match:
             # Temporarily swap in an http:// prefix so urlsplit() parses the
             # rest of the value, then put the real prefix back afterwards
             real_prefix = prefix_match.group(0)
             value = 'http://' + value[len(real_prefix):]
         parsed = urlsplit(value)
         if real_prefix:
             value = real_prefix + value[7:]
             scheme = _urlquote(real_prefix[:-3])
     else:
         parsed = urlsplit(value)

     if scheme is None:
         scheme = _urlquote(parsed.scheme)
     hostname = parsed.hostname
     if hostname is not None:
         # urlsplit() strips the square brackets from an IPv6 literal. They
         # have to be restored, otherwise the ":" chars in the address can
         # not be told apart from the port separator in the output.
         is_ip_literal = ':' in hostname
         hostname = hostname.encode('idna')
         if is_ip_literal:
             hostname = b'[' + hostname + b']'
     # RFC 3986 allows userinfo to contain sub-delims
     username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
     password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
     port = parsed.port
     if port is not None:
         port = str_cls(port).encode('ascii')

     netloc = b''
     if username is not None:
         netloc += username
         if password:
             netloc += b':' + password
         netloc += b'@'
     if hostname is not None:
         netloc += hostname
     if port is not None:
         default_http = scheme == b'http' and port == b'80'
         default_https = scheme == b'https' and port == b'443'
         if not normalize or (not default_http and not default_https):
             netloc += b':' + port

     # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
     path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
     # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
     query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
     # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
     fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

     if normalize and query is None and fragment is None and path == b'/':
         path = None

     # Python 2.7 compat
     if path is None:
         path = ''

     output = urlunsplit((scheme, netloc, path, query, fragment))
     if isinstance(output, str_cls):
         output = output.encode('latin1')
     return output


 def uri_to_iri(value):
     """
     Converts an ASCII URI byte string into a unicode IRI

     :param value:
         An ASCII-encoded byte string of the URI

     :raises:
         TypeError - when value is not a byte string

     :return:
         A unicode string of the IRI
     """

     if not isinstance(value, byte_cls):
         raise TypeError(unwrap(
             '''
             value must be a byte string, not %s
             ''',
             type_name(value)
         ))

     parsed = urlsplit(value)

     scheme = parsed.scheme
     if scheme is not None:
         scheme = scheme.decode('ascii')

     # A ":" or "@" that was percent-encoded in the userinfo must stay encoded
     # since the literal chars are the userinfo delimiters
     username = _urlunquote(parsed.username, remap=[':', '@'])
     password = _urlunquote(parsed.password, remap=[':', '@'])
     hostname = parsed.hostname
     if hostname:
         hostname = hostname.decode('idna')
         # urlsplit() strips the square brackets from an IPv6 literal. They
         # have to be restored, otherwise the ":" chars in the address can
         # not be told apart from the port separator in the output.
         if ':' in hostname:
             hostname = '[' + hostname + ']'
     port = parsed.port
     if port and not isinstance(port, int_types):
         port = port.decode('ascii')

     netloc = ''
     if username is not None:
         netloc += username
         if password:
             netloc += ':' + password
         netloc += '@'
     if hostname is not None:
         netloc += hostname
     if port is not None:
         netloc += ':' + str_cls(port)

     # Literal delimiters are preserved, percent-encoded ones stay encoded
     path = _urlunquote(parsed.path, remap=['/'], preserve=True)
     query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
     fragment = _urlunquote(parsed.fragment)

     return urlunsplit((scheme, netloc, path, query, fragment))


 def _iri_utf8_errors_handler(exc):
     """
     Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte
     sequences encoded in %XX format, but as part of a unicode string.

     :param exc:
         The UnicodeDecodeError exception

     :return:
         A 2-element tuple of (replacement unicode string, integer index to
         resume at)
     """

     bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end])
     replacements = ['%%%02x' % num for num in bytes_as_ints]
     return (''.join(replacements), exc.end)


 # Register the handler under the name 'iriutf8' so it can be passed as the
 # errors argument of the .decode('utf-8', 'iriutf8') calls in this module
 codecs.register_error('iriutf8', _iri_utf8_errors_handler)


 def _urlquote(string, safe=''):
     """
     Quotes a unicode string for use in a URL

     :param string:
         A unicode string

     :param safe:
         A unicode string of character to not encode

     :return:
         None (if string is None) or an ASCII byte string of the quoted string
     """

     if string is None or string == '':
         return None

     # Anything already hex quoted is pulled out of the URL and unquoted if
     # possible
     escapes = []
     if re.search('%[0-9a-fA-F]{2}', string):
         # Try to unquote any percent values, restoring them if they are not
         # valid UTF-8. Also, requote any safe chars since encoded versions of
         # those are functionally different than the unquoted ones.
         def _try_unescape(match):
             byte_string = unquote_to_bytes(match.group(0))
             unicode_string = byte_string.decode('utf-8', 'iriutf8')
             for safe_char in list(safe):
                 unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
             return unicode_string
         string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)

         # Once we have the minimal set of hex quoted values, removed them from
         # the string so that they are not double quoted
         def _extract_escape(match):
             escapes.append(match.group(0).encode('ascii'))
             return '\x00'
         string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string)

     output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8'))
     if not isinstance(output, byte_cls):
         output = output.encode('ascii')

     # Restore the existing quoted values that we extracted
     if len(escapes) > 0:
         def _return_escape(_):
             return escapes.pop(0)
         output = re.sub(b'%00', _return_escape, output)

     return output


 def _urlunquote(byte_string, remap=None, preserve=None):
     """
     Unquotes a URI portion from a byte string into unicode using UTF-8

     :param byte_string:
         A byte string of the data to unquote

     :param remap:
         A list of characters (as unicode) that should be re-mapped to a
         %XX encoding. This is used when characters are not valid in part of a
         URL.

     :param preserve:
         A bool - indicates that the chars to be remapped if they occur in
         non-hex form, should be preserved. E.g. / for URL path.

     :return:
         A unicode string
     """

     if byte_string is None:
         return byte_string

     if byte_string == b'':
         return ''

     if preserve:
         replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F']
         preserve_unmap = {}
         for char in remap:
             replacement = replacements.pop(0)
             preserve_unmap[replacement] = char
             byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))

     byte_string = unquote_to_bytes(byte_string)

     if remap:
         for char in remap:
             byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))

     output = byte_string.decode('utf-8', 'iriutf8')

     if preserve:
         for replacement, original in preserve_unmap.items():
             output = output.replace(replacement, original)

     return output
	# coding: utf-8

	"""
	Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
	the following items:

	- iri_to_uri()
	- uri_to_iri()
	"""

	from __future__ import unicode_literals, division, absolute_import, print_function

	from encodings import idna # noqa
	import codecs
	import re
	import sys

	from ._errors import unwrap
	from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types

	if sys.version_info < (3,):
	from urlparse import urlsplit, urlunsplit
	from urllib import (
	quote as urlquote,
	unquote as unquote_to_bytes,
	)

	else:
	from urllib.parse import (
	quote as urlquote,
	unquote_to_bytes,
	urlsplit,
	urlunsplit,
	)


	def iri_to_uri(value, normalize=False):
	    """
	    Encodes a unicode IRI into an ASCII byte string URI

	    :param value:
	        A unicode string of an IRI

	    :param normalize:
	        A bool that controls URI normalization - when True, the default
	        port for http (80) and https (443) is omitted, and a path of "/" is
	        dropped when there is no query or fragment

	    :raises:
	        TypeError - when value is not a unicode string

	    :return:
	        A byte string of the ASCII-encoded URI
	    """

	    if not isinstance(value, str_cls):
	        raise TypeError(unwrap(
	            '''
	            value must be a unicode string, not %s
	            ''',
	            type_name(value)
	        ))

	    scheme = None
	    # Python 2.6 doesn't split properly if the URL doesn't start with http:// or https://
	    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
	        real_prefix = None
	        prefix_match = re.match('^[^:]*://', value)
	        if prefix_match:
	            # Temporarily swap in an http:// prefix so urlsplit() parses the
	            # rest of the value, then put the real prefix back afterwards
	            real_prefix = prefix_match.group(0)
	            value = 'http://' + value[len(real_prefix):]
	        parsed = urlsplit(value)
	        if real_prefix:
	            value = real_prefix + value[7:]
	            scheme = _urlquote(real_prefix[:-3])
	    else:
	        parsed = urlsplit(value)

	    if scheme is None:
	        scheme = _urlquote(parsed.scheme)
	    hostname = parsed.hostname
	    if hostname is not None:
	        # urlsplit() strips the square brackets from an IPv6 literal. They
	        # have to be restored, otherwise the ":" chars in the address can
	        # not be told apart from the port separator in the output.
	        is_ip_literal = ':' in hostname
	        hostname = hostname.encode('idna')
	        if is_ip_literal:
	            hostname = b'[' + hostname + b']'
	    # RFC 3986 allows userinfo to contain sub-delims
	    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
	    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
	    port = parsed.port
	    if port is not None:
	        port = str_cls(port).encode('ascii')

	    netloc = b''
	    if username is not None:
	        netloc += username
	        if password:
	            netloc += b':' + password
	        netloc += b'@'
	    if hostname is not None:
	        netloc += hostname
	    if port is not None:
	        default_http = scheme == b'http' and port == b'80'
	        default_https = scheme == b'https' and port == b'443'
	        if not normalize or (not default_http and not default_https):
	            netloc += b':' + port

	    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
	    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
	    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
	    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
	    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
	    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

	    if normalize and query is None and fragment is None and path == b'/':
	        path = None

	    # Python 2.7 compat
	    if path is None:
	        path = ''

	    output = urlunsplit((scheme, netloc, path, query, fragment))
	    if isinstance(output, str_cls):
	        output = output.encode('latin1')
	    return output


	def uri_to_iri(value):
	    """
	    Converts an ASCII URI byte string into a unicode IRI

	    :param value:
	        An ASCII-encoded byte string of the URI

	    :raises:
	        TypeError - when value is not a byte string

	    :return:
	        A unicode string of the IRI
	    """

	    if not isinstance(value, byte_cls):
	        raise TypeError(unwrap(
	            '''
	            value must be a byte string, not %s
	            ''',
	            type_name(value)
	        ))

	    parsed = urlsplit(value)

	    scheme = parsed.scheme
	    if scheme is not None:
	        scheme = scheme.decode('ascii')

	    # A ":" or "@" that was percent-encoded in the userinfo must stay encoded
	    # since the literal chars are the userinfo delimiters
	    username = _urlunquote(parsed.username, remap=[':', '@'])
	    password = _urlunquote(parsed.password, remap=[':', '@'])
	    hostname = parsed.hostname
	    if hostname:
	        hostname = hostname.decode('idna')
	        # urlsplit() strips the square brackets from an IPv6 literal. They
	        # have to be restored, otherwise the ":" chars in the address can
	        # not be told apart from the port separator in the output.
	        if ':' in hostname:
	            hostname = '[' + hostname + ']'
	    port = parsed.port
	    if port and not isinstance(port, int_types):
	        port = port.decode('ascii')

	    netloc = ''
	    if username is not None:
	        netloc += username
	        if password:
	            netloc += ':' + password
	        netloc += '@'
	    if hostname is not None:
	        netloc += hostname
	    if port is not None:
	        netloc += ':' + str_cls(port)

	    # Literal delimiters are preserved, percent-encoded ones stay encoded
	    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
	    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
	    fragment = _urlunquote(parsed.fragment)

	    return urlunsplit((scheme, netloc, path, query, fragment))


	def _iri_utf8_errors_handler(exc):
	"""
	Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte
	sequences encoded in %XX format, but as part of a unicode string.

	:param exc:
	The UnicodeDecodeError exception

	:return:
	A 2-element tuple of (replacement unicode string, integer index to
	resume at)
	"""

	bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end])
	replacements = ['%%%02x' % num for num in bytes_as_ints]
	return (''.join(replacements), exc.end)


	# Register the handler under the name 'iriutf8' so it can be passed as the
	# errors argument of the .decode('utf-8', 'iriutf8') calls in this module
	codecs.register_error('iriutf8', _iri_utf8_errors_handler)


	def _urlquote(string, safe=''):
	"""
	Quotes a unicode string for use in a URL

	:param string:
	A unicode string

	:param safe:
	A unicode string of character to not encode

	:return:
	None (if string is None) or an ASCII byte string of the quoted string
	"""

	if string is None or string == '':
	return None

	# Anything already hex quoted is pulled out of the URL and unquoted if
	# possible
	escapes = []
	if re.search('%[0-9a-fA-F]{2}', string):
	# Try to unquote any percent values, restoring them if they are not
	# valid UTF-8. Also, requote any safe chars since encoded versions of
	# those are functionally different than the unquoted ones.
	def _try_unescape(match):
	byte_string = unquote_to_bytes(match.group(0))
	unicode_string = byte_string.decode('utf-8', 'iriutf8')
	for safe_char in list(safe):
	unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
	return unicode_string
	string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)

	# Once we have the minimal set of hex quoted values, removed them from
	# the string so that they are not double quoted
	def _extract_escape(match):
	escapes.append(match.group(0).encode('ascii'))
	return '\x00'
	string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string)

	output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8'))
	if not isinstance(output, byte_cls):
	output = output.encode('ascii')

	# Restore the existing quoted values that we extracted
	if len(escapes) > 0:
	def _return_escape(_):
	return escapes.pop(0)
	output = re.sub(b'%00', _return_escape, output)

	return output


	def _urlunquote(byte_string, remap=None, preserve=None):
	"""
	Unquotes a URI portion from a byte string into unicode using UTF-8

	:param byte_string:
	A byte string of the data to unquote

	:param remap:
	A list of characters (as unicode) that should be re-mapped to a
	%XX encoding. This is used when characters are not valid in part of a
	URL.

	:param preserve:
	A bool - indicates that the chars to be remapped if they occur in
	non-hex form, should be preserved. E.g. / for URL path.

	:return:
	A unicode string
	"""

	if byte_string is None:
	return byte_string

	if byte_string == b'':
	return ''

	if preserve:
	replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F']
	preserve_unmap = {}
	for char in remap:
	replacement = replacements.pop(0)
	preserve_unmap[replacement] = char
	byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))

	byte_string = unquote_to_bytes(byte_string)

	if remap:
	for char in remap:
	byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))

	output = byte_string.decode('utf-8', 'iriutf8')

	if preserve:
	for replacement, original in preserve_unmap.items():
	output = output.replace(replacement, original)

	return output