# Copyright (c) 2012 Amazon.com, Inc. or its affiliates. All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, dis-
# tribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the fol-
# lowing conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
import hashlib
import math
import binascii

from boto.compat import six


_MEGABYTE = 1024 * 1024
DEFAULT_PART_SIZE = 4 * _MEGABYTE
MAXIMUM_NUMBER_OF_PARTS = 10000


def minimum_part_size(size_in_bytes, default_part_size=DEFAULT_PART_SIZE):
    """Calculate the minimum part size needed for a multipart upload.

    Glacier allows a maximum of 10,000 parts per upload. It also
    states that the maximum archive size is 10,000 * 4 GB, which means
    the part size can range from 1MB to 4GB (provided it is 1MB
    multiplied by a power of 2).

    This function will compute what the minimum part size must be in
    order to upload a file of size ``size_in_bytes``.

    It will first check if ``default_part_size`` is sufficient for
    a part size given the ``size_in_bytes``. If this is not the case,
    then the smallest part size that can accommodate a file of size
    ``size_in_bytes`` will be returned.

    If the file size is greater than the maximum allowed archive
    size of 10,000 * 4GB, a ``ValueError`` will be raised.

    """
    # The default part size (4 MB) will be too small for a very large
    # archive, as there is a limit of 10,000 parts in a multipart upload.
    # This puts the maximum allowed archive size with the default part size
    # at 40,000 MB. We need to do a sanity check on the part size, and find
    # one that works if the default is too small.
    part_size = _MEGABYTE
    if (default_part_size * MAXIMUM_NUMBER_OF_PARTS) < size_in_bytes:
        if size_in_bytes > (4096 * _MEGABYTE * 10000):
            raise ValueError("File size too large: %s" % size_in_bytes)
        # Use float division so that Python 2's integer division cannot
        # round the required minimum down and understate the part size.
        min_part_size = size_in_bytes / float(10000)
        # Valid part sizes are 1MB * 2**N. We only reach this branch when
        # the 4MB default is too small, so the search starts at 8MB
        # (power 3).
        power = 3
        while part_size < min_part_size:
            part_size = math.ldexp(_MEGABYTE, power)
            power += 1
        part_size = int(part_size)
    else:
        part_size = default_part_size
    return part_size


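# A minimal usage sketch for minimum_part_size (illustrative only; the
# archive sizes below are hypothetical). Small archives keep the default
# 4 MB part size, while an archive larger than 40,000 MB forces the next
# power-of-two size that fits within 10,000 parts:
#
#     >>> minimum_part_size(100 * 1024 * 1024)        # 100 MB archive
#     4194304
#     >>> minimum_part_size(50000 * 1024 * 1024)      # ~48.8 GB archive
#     8388608

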
def chunk_hashes(bytestring, chunk_size=_MEGABYTE):
    """SHA-256 hash ``bytestring`` in blocks of ``chunk_size`` bytes.

    Returns a list with one binary digest per chunk; an empty
    ``bytestring`` yields a single hash of the empty string.
    """
    chunk_count = int(math.ceil(len(bytestring) / float(chunk_size)))
    hashes = []
    for i in range(chunk_count):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        hashes.append(hashlib.sha256(bytestring[start:end]).digest())
    if not hashes:
        return [hashlib.sha256(b'').digest()]
    return hashes


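# Illustrative sketch of chunk_hashes (not executed at import time): a
# payload just over two megabytes is split into three chunks, and empty
# input still produces one digest:
#
#     >>> len(chunk_hashes(b'\x00' * (2 * 1024 * 1024 + 1)))
#     3
#     >>> len(chunk_hashes(b''))
#     1

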
def tree_hash(fo):
    """
    Given the hash of each 1MB chunk (from ``chunk_hashes``), this will
    hash together adjacent hashes until it ends up with a single root
    hash: the tree hash. An odd hash left over at the end of a level is
    carried up to the next level unchanged.

    :param fo: A list of binary chunk hashes.
    :return: The root tree hash as a binary digest.
    """
    hashes = []
    hashes.extend(fo)
    while len(hashes) > 1:
        new_hashes = []
        while True:
            if len(hashes) > 1:
                first = hashes.pop(0)
                second = hashes.pop(0)
                new_hashes.append(hashlib.sha256(first + second).digest())
            elif len(hashes) == 1:
                only = hashes.pop(0)
                new_hashes.append(only)
            else:
                break
        hashes.extend(new_hashes)
    return hashes[0]


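# Illustrative sketch of how tree_hash combines chunk digests (not
# executed at import time). A single chunk is returned unchanged, and two
# chunks are hashed together pairwise:
#
#     >>> single = chunk_hashes(b'hello')
#     >>> tree_hash(single) == hashlib.sha256(b'hello').digest()
#     True
#     >>> pair = chunk_hashes(b'a' * (2 * 1024 * 1024))
#     >>> tree_hash(pair) == hashlib.sha256(pair[0] + pair[1]).digest()
#     True

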
def compute_hashes_from_fileobj(fileobj, chunk_size=1024 * 1024):
    """Compute the linear and tree hash from a fileobj.

    This function will compute the linear/tree hash of a fileobj
    in a single pass through the fileobj.

    :param fileobj: A file-like object.

    :param chunk_size: The size of the chunks to use for the tree
        hash. This is also the buffer size used to read from
        `fileobj`.

    :rtype: tuple
    :return: A tuple of (linear_hash, tree_hash). Both hashes
        are returned in hex.

    """
    # On Python 3, a file object opened in text mode returns str from
    # read(), but hashing requires bytes, so insist on binary mode.
    if six.PY3 and hasattr(fileobj, 'mode') and 'b' not in fileobj.mode:
        raise ValueError('File-like object must be opened in binary mode!')

    linear_hash = hashlib.sha256()
    chunks = []
    chunk = fileobj.read(chunk_size)
    while chunk:
        # It's possible to get a file-like object that has no mode (checked
        # above) and returns something other than bytes (e.g. str). So here
        # we try to catch that and encode to bytes.
        if not isinstance(chunk, bytes):
            chunk = chunk.encode(getattr(fileobj, 'encoding', '') or 'utf-8')
        linear_hash.update(chunk)
        chunks.append(hashlib.sha256(chunk).digest())
        chunk = fileobj.read(chunk_size)
    if not chunks:
        chunks = [hashlib.sha256(b'').digest()]
    return linear_hash.hexdigest(), bytes_to_hex(tree_hash(chunks))


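# A minimal usage sketch (illustrative only): hashing an in-memory file
# object gives the same linear hash as hashing the whole payload at once,
# and the same tree hash as tree_hash_from_str:
#
#     >>> import io
#     >>> data = b'x' * (3 * 1024 * 1024)
#     >>> linear, tree = compute_hashes_from_fileobj(io.BytesIO(data))
#     >>> linear == hashlib.sha256(data).hexdigest()
#     True
#     >>> tree == tree_hash_from_str(data)
#     True

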
def bytes_to_hex(str_as_bytes):
    return binascii.hexlify(str_as_bytes)


def tree_hash_from_str(str_as_bytes):
    """Compute the tree hash of a byte string.

    :type str_as_bytes: bytes
    :param str_as_bytes: The bytes for which to compute the tree hash.

    :rtype: bytes
    :return: The computed tree hash, returned as hex.

    """
    return bytes_to_hex(tree_hash(chunk_hashes(str_as_bytes)))


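# For data of one chunk or less, the tree hash reduces to the plain
# SHA-256 of the data (illustrative only):
#
#     >>> expected = bytes_to_hex(hashlib.sha256(b'hello').digest())
#     >>> tree_hash_from_str(b'hello') == expected
#     True

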
class ResettingFileSender(object):
    """Send an archive file object as a request body, then rewind it.

    The archive's position is recorded at construction time and restored
    after every request, so the same file object can be re-sent (for
    example, when a request is retried).
    """
    def __init__(self, archive):
        self._archive = archive
        self._starting_offset = archive.tell()

    def __call__(self, connection, method, path, body, headers):
        try:
            connection.request(method, path, self._archive, headers)
            return connection.getresponse()
        finally:
            self._archive.seek(self._starting_offset)
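

# Illustrative sketch of ResettingFileSender with a hypothetical stub
# connection (real callers pass an HTTP connection object exposing
# ``request``/``getresponse``). The archive's read position is restored
# after the send, so a retry re-reads the same bytes:
#
#     >>> import io
#     >>> class _StubConnection(object):
#     ...     def request(self, method, path, body, headers):
#     ...         body.read()          # sending consumes the file object
#     ...     def getresponse(self):
#     ...         return 'ok'
#     >>> archive = io.BytesIO(b'archive bytes')
#     >>> sender = ResettingFileSender(archive)
#     >>> sender(_StubConnection(), 'POST', '/vault/archives', None, {})
#     'ok'
#     >>> archive.tell()
#     0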