plugins/hg4idea/testData/bin/mercurial/encoding.py - platform/tools/idea - Git at Google

 # encoding.py - character transcoding support for Mercurial
 #
 #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.

 import error
 import unicodedata, locale, os

 def _getpreferredencoding():
     '''
     Return the codeset of the user's locale as reported by nl_langinfo.

     On darwin, locale.getpreferredencoding ignores the locale environment
     and always returns mac-roman. http://bugs.python.org/issue6202 fixes
     this for Python 2.7 and up; this is the same corrected code for
     earlier Python versions.

     A version check cannot be used to decide when to call this, as some
     distributions patch Python to fix the problem. Instead, it is used as
     a 'fixer' for the mac-roman encoding, since that encoding is unlikely
     to be the one actually expected.
     '''
     if not hasattr(locale, 'CODESET'):
         # no CODESET support: fall back to parsing environment
         # variables :-(
         return locale.getdefaultlocale()[1]

     # temporarily adopt the user's LC_CTYPE so nl_langinfo reports it,
     # then put the previous setting back
     saved = locale.setlocale(locale.LC_CTYPE)
     locale.setlocale(locale.LC_CTYPE, "")
     codeset = locale.nl_langinfo(locale.CODESET)
     locale.setlocale(locale.LC_CTYPE, saved)
     return codeset

 # Replacements for encoding names reported by the locale module. Each
 # value is a callable returning the encoding name to use instead.
 _encodingfixers = {
     '646': lambda: 'ascii',
     'ANSI_X3.4-1968': lambda: 'ascii',
     'mac-roman': _getpreferredencoding
 }

 # The local encoding: HGENCODING wins when set and non-empty; otherwise
 # ask the locale module (defaulting to ascii) and apply any fixer.
 encoding = os.environ.get("HGENCODING")
 try:
     if not encoding:
         encoding = locale.getpreferredencoding() or 'ascii'
         if encoding in _encodingfixers:
             encoding = _encodingfixers[encoding]()
 except locale.Error:
     encoding = 'ascii'
 # error handling mode used when decoding local strings
 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
 # encoding tried by tolocal for stored strings that are not valid UTF-8
 fallbackencoding = 'ISO-8859-1'

 class localstr(str):
     '''A local-encoding string that remembers its UTF-8 original.

     The string value is the local representation l; the UTF-8 form u is
     kept in the _utf8 attribute. This allows strings that are unmodified
     to be round-tripped to the local encoding and back.
     '''
     def __new__(cls, u, l):
         obj = str.__new__(cls, l)
         obj._utf8 = u
         return obj

     def __hash__(self):
         # hash the UTF-8 form to avoid collisions in local string space
         return hash(self._utf8)

 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding

     All internal strings should be UTF-8 but some repos before the
     implementation of locale support may contain latin1 or possibly
     other character sets. We attempt to decode everything strictly
     using UTF-8, then Latin-1, and failing that, we use UTF-8 and
     replace unknown characters.

     The localstr class is used to cache the known UTF-8 encoding of
     strings next to their local representation to allow lossless
     round-trip conversion back to UTF-8.

     >>> u = 'foo: \\xc3\\xa4' # utf-8
     >>> l = tolocal(u)
     >>> l
     'foo: ?'
     >>> fromlocal(l)
     'foo: \\xc3\\xa4'
     >>> u2 = 'foo: \\xc3\\xa1'
     >>> d = { l: 1, tolocal(u2): 2 }
     >>> len(d) # no collision
     2
     >>> 'foo: ?' in d
     False
     >>> l1 = 'foo: \\xe4' # historical latin1 fallback
     >>> l = tolocal(l1)
     >>> l
     'foo: ?'
     >>> fromlocal(l) # magically in utf-8
     'foo: \\xc3\\xa4'
     """

     try:
         try:
             # make sure string is actually stored in UTF-8
             u = s.decode('UTF-8')
             if encoding == 'UTF-8':
                 # fast path
                 return s
             r = u.encode(encoding, "replace")
             if u == r.decode(encoding):
                 # r is a safe, non-lossy encoding of s
                 return r
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
             try:
                 u = s.decode(fallbackencoding)
                 r = u.encode(encoding, "replace")
                 if u == r.decode(encoding):
                     # r is a safe, non-lossy encoding of s
                     return r
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
                 return u.encode(encoding, "replace") # can't round-trip
     except LookupError, k:
         raise error.Abort(k, hint="please check your locale settings")

 def fromlocal(s):
     """
     Convert a string from the local character encoding to UTF-8

     We attempt to decode strings using the encoding mode set by
     HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
     characters will cause an error message. Other modes include
     'replace', which replaces unknown characters with a special
     Unicode character, and 'ignore', which drops the character.
     """

     # can we do a lossless round-trip?
     if isinstance(s, localstr):
         return s._utf8

     try:
         return s.decode(encoding, encodingmode).encode("utf-8")
     except UnicodeDecodeError, inst:
         sub = s[max(0, inst.start - 10):inst.start + 10]
         raise error.Abort("decoding near '%s': %s!" % (sub, inst))
     except LookupError, k:
         raise error.Abort(k, hint="please check your locale settings")

 # How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
 # 'wide' to treat them as wide. The value lists the East Asian Width
 # classes that ucolwidth counts as two columns.
 if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
     wide = "WFA"
 else:
     wide = "WF"

 def colwidth(s):
     "Find the column width of a string for display in the local encoding"
     # undecodable bytes become replacement characters instead of errors
     decoded = s.decode(encoding, 'replace')
     return ucolwidth(decoded)

 def ucolwidth(d):
     "Find the column width of a Unicode string for display"
     eaw = getattr(unicodedata, 'east_asian_width', None)
     if eaw is None:
         # no width data available: count one column per character
         return len(d)
     columns = 0
     for c in d:
         if eaw(c) in wide:
             columns += 2
         else:
             columns += 1
     return columns

 def getcols(s, start, c):
     '''Use colwidth to find a c-column substring of s starting at byte
     index start

     Candidate substrings s[start:x] are tried with x increasing from
     start + c, and the first one whose display width is exactly c is
     returned. Returns None if no candidate has width c.
     '''
     # the upper bound is len(s) + 1 so that the candidate reaching the
     # very end of s (x == len(s)) is tried as well
     for x in xrange(start + c, len(s) + 1):
         t = s[start:x]
         if colwidth(t) == c:
             return t
     return None

 def lower(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
         s.decode('ascii') # throw exception for non-ASCII character
         return s.lower()
     except UnicodeDecodeError:
         pass
     try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
             u = s.decode(encoding, encodingmode)

         lu = u.lower()
         if u == lu:
             return s # preserve localstring
         return lu.encode(encoding)
     except UnicodeError:
         return s.lower() # we don't know how to fold this except in ASCII
     except LookupError, k:
         raise error.Abort(k, hint="please check your locale settings")

 def upper(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
         s.decode('ascii') # throw exception for non-ASCII character
         return s.upper()
     except UnicodeDecodeError:
         pass
     try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
             u = s.decode(encoding, encodingmode)

         uu = u.upper()
         if u == uu:
             return s # preserve localstring
         return uu.encode(encoding)
     except UnicodeError:
         return s.upper() # we don't know how to fold this except in ASCII
     except LookupError, k:
         raise error.Abort(k, hint="please check your locale settings")

 def toutf8b(s):
     '''convert a local, possibly-binary string into UTF-8b

     This is intended as a generic method to preserve data when working
     with schemes like JSON and XML that have no provision for
     arbitrary byte strings. As Mercurial often doesn't know
     what encoding data is in, we use so-called UTF-8b.

     If a string is already valid UTF-8 (or ASCII, or empty), it passes
     unmodified. Otherwise, unsupported bytes are mapped to UTF-16
     surrogate range, uDC00-uDCFF.

     Principles of operation:

     - ASCII and UTF-8 data successfully round-trips and is understood
       by Unicode-oriented clients
     - filenames and file contents in arbitrary other encodings can
       be round-tripped or recovered by clueful clients
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help

     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
     arbitrary bytes into an internal Unicode format that can be
     re-encoded back into the original. Here we are exposing the
     internal surrogate encoding as a UTF-8 string.)
     '''

     if isinstance(s, localstr):
         return s._utf8

     try:
         # only validity matters here, so the decoded value is discarded;
         # testing its truthiness would make the empty string fall
         # through and return None
         s.decode('utf-8')
         return s
     except UnicodeDecodeError:
         pass

     # surrogate-encode any characters that don't round-trip: s2 is s
     # with the undecodable bytes dropped, and s is walked against it
     # NOTE(review): matching is byte by byte, so an invalid byte equal
     # to the next byte of s2 (e.g. '\xc3\xc3\xa9') is taken as valid -
     # confirm whether callers can hit this
     s2 = s.decode('utf-8', 'ignore').encode('utf-8')
     parts = []
     pos = 0
     for c in s:
         if s2[pos:pos + 1] == c:
             parts.append(c)
             pos += 1
         else:
             parts.append(unichr(0xdc00 + ord(c)).encode('utf-8'))
     return "".join(parts)

 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.

     Returns the original binary string. This is a round-trip process
     for strings like filenames, but metadata that was passed through
     tolocal will remain in UTF-8.

     >>> m = "\\xc3\\xa9\\x99abcd"
     >>> n = toutf8b(m)
     >>> n
     '\\xc3\\xa9\\xed\\xb2\\x99abcd'
     >>> fromutf8b(n) == m
     True
     '''

     # fast path - without the uDxxx prefix byte there is nothing to undo
     if "\xed" not in s:
         return s

     pieces = []
     for ch in s.decode("utf-8"):
         code = ord(ch)
         if code & 0xff00 == 0xdc00:
             # escaped byte: the low 8 bits hold the original value
             pieces.append(chr(code & 0xff))
         else:
             pieces.append(ch.encode("utf-8"))
     return "".join(pieces)
	# encoding.py - character transcoding support for Mercurial
	#
	# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
	#
	# This software may be used and distributed according to the terms of the
	# GNU General Public License version 2 or any later version.

	import error
	import unicodedata, locale, os

	def _getpreferredencoding():
	    '''
	    Return the codeset of the user's locale as reported by nl_langinfo.

	    On darwin, locale.getpreferredencoding ignores the locale environment
	    and always returns mac-roman. http://bugs.python.org/issue6202 fixes
	    this for Python 2.7 and up; this is the same corrected code for
	    earlier Python versions.

	    A version check cannot be used to decide when to call this, as some
	    distributions patch Python to fix the problem. Instead, it is used as
	    a 'fixer' for the mac-roman encoding, since that encoding is unlikely
	    to be the one actually expected.
	    '''
	    if not hasattr(locale, 'CODESET'):
	        # no CODESET support: fall back to parsing environment
	        # variables :-(
	        return locale.getdefaultlocale()[1]

	    # temporarily adopt the user's LC_CTYPE so nl_langinfo reports it,
	    # then put the previous setting back
	    saved = locale.setlocale(locale.LC_CTYPE)
	    locale.setlocale(locale.LC_CTYPE, "")
	    codeset = locale.nl_langinfo(locale.CODESET)
	    locale.setlocale(locale.LC_CTYPE, saved)
	    return codeset

	# Replacements for encoding names reported by the locale module. Each
	# value is a callable returning the encoding name to use instead.
	_encodingfixers = {
	    '646': lambda: 'ascii',
	    'ANSI_X3.4-1968': lambda: 'ascii',
	    'mac-roman': _getpreferredencoding
	}

	# The local encoding: HGENCODING wins when set and non-empty; otherwise
	# ask the locale module (defaulting to ascii) and apply any fixer.
	encoding = os.environ.get("HGENCODING")
	try:
	    if not encoding:
	        encoding = locale.getpreferredencoding() or 'ascii'
	        if encoding in _encodingfixers:
	            encoding = _encodingfixers[encoding]()
	except locale.Error:
	    encoding = 'ascii'
	# error handling mode used when decoding local strings
	encodingmode = os.environ.get("HGENCODINGMODE", "strict")
	# encoding tried by tolocal for stored strings that are not valid UTF-8
	fallbackencoding = 'ISO-8859-1'

	class localstr(str):
	    '''A local-encoding string that remembers its UTF-8 original.

	    The string value is the local representation l; the UTF-8 form u is
	    kept in the _utf8 attribute. This allows strings that are unmodified
	    to be round-tripped to the local encoding and back.
	    '''
	    def __new__(cls, u, l):
	        obj = str.__new__(cls, l)
	        obj._utf8 = u
	        return obj

	    def __hash__(self):
	        # hash the UTF-8 form to avoid collisions in local string space
	        return hash(self._utf8)

	def tolocal(s):
	"""
	Convert a string from internal UTF-8 to local encoding

	All internal strings should be UTF-8 but some repos before the
	implementation of locale support may contain latin1 or possibly
	other character sets. We attempt to decode everything strictly
	using UTF-8, then Latin-1, and failing that, we use UTF-8 and
	replace unknown characters.

	The localstr class is used to cache the known UTF-8 encoding of
	strings next to their local representation to allow lossless
	round-trip conversion back to UTF-8.

	>>> u = 'foo: \\xc3\\xa4' # utf-8
	>>> l = tolocal(u)
	>>> l
	'foo: ?'
	>>> fromlocal(l)
	'foo: \\xc3\\xa4'
	>>> u2 = 'foo: \\xc3\\xa1'
	>>> d = { l: 1, tolocal(u2): 2 }
	>>> len(d) # no collision
	2
	>>> 'foo: ?' in d
	False
	>>> l1 = 'foo: \\xe4' # historical latin1 fallback
	>>> l = tolocal(l1)
	>>> l
	'foo: ?'
	>>> fromlocal(l) # magically in utf-8
	'foo: \\xc3\\xa4'
	"""

	try:
	try:
	# make sure string is actually stored in UTF-8
	u = s.decode('UTF-8')
	if encoding == 'UTF-8':
	# fast path
	return s
	r = u.encode(encoding, "replace")
	if u == r.decode(encoding):
	# r is a safe, non-lossy encoding of s
	return r
	return localstr(s, r)
	except UnicodeDecodeError:
	# we should only get here if we're looking at an ancient changeset
	try:
	u = s.decode(fallbackencoding)
	r = u.encode(encoding, "replace")
	if u == r.decode(encoding):
	# r is a safe, non-lossy encoding of s
	return r
	return localstr(u.encode('UTF-8'), r)
	except UnicodeDecodeError:
	u = s.decode("utf-8", "replace") # last ditch
	return u.encode(encoding, "replace") # can't round-trip
	except LookupError, k:
	raise error.Abort(k, hint="please check your locale settings")

	def fromlocal(s):
	"""
	Convert a string from the local character encoding to UTF-8

	We attempt to decode strings using the encoding mode set by
	HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
	characters will cause an error message. Other modes include
	'replace', which replaces unknown characters with a special
	Unicode character, and 'ignore', which drops the character.
	"""

	# can we do a lossless round-trip?
	if isinstance(s, localstr):
	return s._utf8

	try:
	return s.decode(encoding, encodingmode).encode("utf-8")
	except UnicodeDecodeError, inst:
	sub = s[max(0, inst.start - 10):inst.start + 10]
	raise error.Abort("decoding near '%s': %s!" % (sub, inst))
	except LookupError, k:
	raise error.Abort(k, hint="please check your locale settings")

	# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
	# 'wide' to treat them as wide. The value lists the East Asian Width
	# classes that ucolwidth counts as two columns.
	if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
	    wide = "WFA"
	else:
	    wide = "WF"

	def colwidth(s):
	    "Find the column width of a string for display in the local encoding"
	    # undecodable bytes become replacement characters instead of errors
	    decoded = s.decode(encoding, 'replace')
	    return ucolwidth(decoded)

	def ucolwidth(d):
	    "Find the column width of a Unicode string for display"
	    eaw = getattr(unicodedata, 'east_asian_width', None)
	    if eaw is None:
	        # no width data available: count one column per character
	        return len(d)
	    columns = 0
	    for c in d:
	        if eaw(c) in wide:
	            columns += 2
	        else:
	            columns += 1
	    return columns

	def getcols(s, start, c):
	    '''Use colwidth to find a c-column substring of s starting at byte
	    index start

	    Candidate substrings s[start:x] are tried with x increasing from
	    start + c, and the first one whose display width is exactly c is
	    returned. Returns None if no candidate has width c.
	    '''
	    # the upper bound is len(s) + 1 so that the candidate reaching the
	    # very end of s (x == len(s)) is tried as well
	    for x in xrange(start + c, len(s) + 1):
	        t = s[start:x]
	        if colwidth(t) == c:
	            return t
	    return None

	def lower(s):
	"best-effort encoding-aware case-folding of local string s"
	try:
	s.decode('ascii') # throw exception for non-ASCII character
	return s.lower()
	except UnicodeDecodeError:
	pass
	try:
	if isinstance(s, localstr):
	u = s._utf8.decode("utf-8")
	else:
	u = s.decode(encoding, encodingmode)

	lu = u.lower()
	if u == lu:
	return s # preserve localstring
	return lu.encode(encoding)
	except UnicodeError:
	return s.lower() # we don't know how to fold this except in ASCII
	except LookupError, k:
	raise error.Abort(k, hint="please check your locale settings")

	def upper(s):
	"best-effort encoding-aware case-folding of local string s"
	try:
	s.decode('ascii') # throw exception for non-ASCII character
	return s.upper()
	except UnicodeDecodeError:
	pass
	try:
	if isinstance(s, localstr):
	u = s._utf8.decode("utf-8")
	else:
	u = s.decode(encoding, encodingmode)

	uu = u.upper()
	if u == uu:
	return s # preserve localstring
	return uu.encode(encoding)
	except UnicodeError:
	return s.upper() # we don't know how to fold this except in ASCII
	except LookupError, k:
	raise error.Abort(k, hint="please check your locale settings")

	def toutf8b(s):
	    '''convert a local, possibly-binary string into UTF-8b

	    This is intended as a generic method to preserve data when working
	    with schemes like JSON and XML that have no provision for
	    arbitrary byte strings. As Mercurial often doesn't know
	    what encoding data is in, we use so-called UTF-8b.

	    If a string is already valid UTF-8 (or ASCII, or empty), it passes
	    unmodified. Otherwise, unsupported bytes are mapped to UTF-16
	    surrogate range, uDC00-uDCFF.

	    Principles of operation:

	    - ASCII and UTF-8 data successfully round-trips and is understood
	      by Unicode-oriented clients
	    - filenames and file contents in arbitrary other encodings can
	      be round-tripped or recovered by clueful clients
	    - local strings that have a cached known UTF-8 encoding (aka
	      localstr) get sent as UTF-8 so Unicode-oriented clients get the
	      Unicode data they want
	    - because we must preserve UTF-8 bytestring in places such as
	      filenames, metadata can't be roundtripped without help

	    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
	    arbitrary bytes into an internal Unicode format that can be
	    re-encoded back into the original. Here we are exposing the
	    internal surrogate encoding as a UTF-8 string.)
	    '''

	    if isinstance(s, localstr):
	        return s._utf8

	    try:
	        # only validity matters here, so the decoded value is discarded;
	        # testing its truthiness would make the empty string fall
	        # through and return None
	        s.decode('utf-8')
	        return s
	    except UnicodeDecodeError:
	        pass

	    # surrogate-encode any characters that don't round-trip: s2 is s
	    # with the undecodable bytes dropped, and s is walked against it
	    # NOTE(review): matching is byte by byte, so an invalid byte equal
	    # to the next byte of s2 (e.g. '\xc3\xc3\xa9') is taken as valid -
	    # confirm whether callers can hit this
	    s2 = s.decode('utf-8', 'ignore').encode('utf-8')
	    parts = []
	    pos = 0
	    for c in s:
	        if s2[pos:pos + 1] == c:
	            parts.append(c)
	            pos += 1
	        else:
	            parts.append(unichr(0xdc00 + ord(c)).encode('utf-8'))
	    return "".join(parts)

	def fromutf8b(s):
	    '''Given a UTF-8b string, return a local, possibly-binary string.

	    Returns the original binary string. This is a round-trip process
	    for strings like filenames, but metadata that was passed through
	    tolocal will remain in UTF-8.

	    >>> m = "\\xc3\\xa9\\x99abcd"
	    >>> n = toutf8b(m)
	    >>> n
	    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
	    >>> fromutf8b(n) == m
	    True
	    '''

	    # fast path - without the uDxxx prefix byte there is nothing to undo
	    if "\xed" not in s:
	        return s

	    pieces = []
	    for ch in s.decode("utf-8"):
	        code = ord(ch)
	        if code & 0xff00 == 0xdc00:
	            # escaped byte: the low 8 bits hold the original value
	            pieces.append(chr(code & 0xff))
	        else:
	            pieces.append(ch.encode("utf-8"))
	    return "".join(pieces)