build/third_party/gsutil/gslib/util.py - platform/external/adt-infra - Git at Google

 # -*- coding: utf-8 -*-
 # Copyright 2010 Google Inc. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Static data and helper functions."""

 from __future__ import absolute_import

 import collections
 import errno
 import locale
 import logging
 import math
 import multiprocessing
 import os
 import pkgutil
 import re
 import struct
 import sys
 import tempfile
 import textwrap
 import threading
 import traceback
 import xml.etree.ElementTree as ElementTree

 import boto
 from boto import config
 import boto.auth
 from boto.exception import NoAuthHandlerFound
 from boto.gs.connection import GSConnection
 from boto.provider import Provider
 from boto.pyami.config import BotoConfigLocations

 import gslib
 from gslib.exception import CommandException
 from gslib.storage_url import StorageUrlFromString
 from gslib.translation_helper import AclTranslation
 from gslib.translation_helper import GenerationFromUrlAndString
 from gslib.translation_helper import S3_ACL_MARKER_GUID
 from gslib.translation_helper import S3_DELETE_MARKER_GUID
 from gslib.translation_helper import S3_MARKER_GUIDS

 import httplib2
 from oauth2client.client import HAS_CRYPTO
 from retry_decorator import retry_decorator

 # Detect platform types.
 PLATFORM = str(sys.platform).lower()
 IS_WINDOWS = 'win32' in PLATFORM
 IS_CYGWIN = 'cygwin' in PLATFORM
 IS_LINUX = 'linux' in PLATFORM
 IS_OSX = 'darwin' in PLATFORM

 UTF8 = 'utf-8'
 WINDOWS_1252 = 'cp1252'

 # pylint: disable=g-import-not-at-top
 if IS_WINDOWS:
   from ctypes import c_int
   from ctypes import c_uint64
   from ctypes import c_char_p
   from ctypes import c_wchar_p
   from ctypes import windll
   from ctypes import POINTER
   from ctypes import WINFUNCTYPE
   from ctypes import WinError
   IS_CP1252 = locale.getdefaultlocale()[1] == WINDOWS_1252
 else:
   IS_CP1252 = False

 # pylint: disable=g-import-not-at-top
 try:
   # This module doesn't necessarily exist on Windows.
   import resource
   HAS_RESOURCE_MODULE = True
 except ImportError, e:
   HAS_RESOURCE_MODULE = False

 DEBUGLEVEL_DUMP_REQUESTS = 3
 DEBUGLEVEL_DUMP_REQUESTS_AND_PAYLOADS = 4

 ONE_KIB = 1024
 ONE_MIB = 1024 * 1024
 TWO_MIB = 2 * ONE_MIB
 EIGHT_MIB = 8 * ONE_MIB
 TEN_MIB = 10 * ONE_MIB
 DEFAULT_FILE_BUFFER_SIZE = 8 * ONE_KIB
 _DEFAULT_LINES = 25

 # By default, the timeout for SSL read errors is infinite. This could
 # cause gsutil to hang on network disconnect, so pick a more reasonable
 # timeout.
 SSL_TIMEOUT = 60

 # Start with a progress callback every 64 KiB during uploads/downloads (JSON
 # API). Callback implementation should back off until it hits the maximum size
 # so that callbacks do not create huge amounts of log output.
 START_CALLBACK_PER_BYTES = 1024*64
 MAX_CALLBACK_PER_BYTES = 1024*1024*100

 # Upload/download files in 8 KiB chunks over the HTTP connection.
 TRANSFER_BUFFER_SIZE = 1024*8

 # Default number of progress callbacks during transfer (XML API).
 XML_PROGRESS_CALLBACKS = 10

 # For files >= this size, output a message indicating that we're running an
 # operation on the file (like hashing or gzipping) so it does not appear to the
 # user that the command is hanging.
 MIN_SIZE_COMPUTE_LOGGING = 100*1024*1024  # 100 MiB

 NO_MAX = sys.maxint

 VERSION_MATCHER = re.compile(r'^(?P<maj>\d+)(\.(?P<min>\d+)(?P<suffix>.*))?')

 RELEASE_NOTES_URL = 'https://pub.storage.googleapis.com/gsutil_ReleaseNotes.txt'

 # Binary exponentiation strings.
 _EXP_STRINGS = [
     (0, 'B', 'bit'),
     (10, 'KiB', 'Kibit', 'K'),
     (20, 'MiB', 'Mibit', 'M'),
     (30, 'GiB', 'Gibit', 'G'),
     (40, 'TiB', 'Tibit', 'T'),
     (50, 'PiB', 'Pibit', 'P'),
     (60, 'EiB', 'Eibit', 'E'),
 ]


 global manager  # pylint: disable=global-at-module-level
 certs_file_lock = threading.Lock()
 configured_certs_files = []


 def _GenerateSuffixRegex():
   """Creates a suffix regex for human-readable byte counts."""
   human_bytes_re = r'(?P<num>\d*\.\d+|\d+)\s*(?P<suffix>%s)?'
   suffixes = []
   suffix_to_si = {}
   for i, si in enumerate(_EXP_STRINGS):
     si_suffixes = [s.lower() for s in list(si)[1:]]
     for suffix in si_suffixes:
       suffix_to_si[suffix] = i
     suffixes.extend(si_suffixes)
   human_bytes_re %= '|'.join(suffixes)
   matcher = re.compile(human_bytes_re)
   return suffix_to_si, matcher

 SUFFIX_TO_SI, MATCH_HUMAN_BYTES = _GenerateSuffixRegex()

 SECONDS_PER_DAY = 3600 * 24

 # On Unix-like systems, we will set the maximum number of open files to avoid
 # hitting the limit imposed by the OS. This number was obtained experimentally.
 MIN_ACCEPTABLE_OPEN_FILES_LIMIT = 1000

 GSUTIL_PUB_TARBALL = 'gs://pub/gsutil.tar.gz'

 Retry = retry_decorator.retry  # pylint: disable=invalid-name

 # Cache the values from this check such that they're available to all callers
 # without needing to run all the checks again (some of these, such as calling
 # multiprocessing.Manager(), are expensive operations).
 cached_multiprocessing_is_available = None
 cached_multiprocessing_is_available_stack_trace = None
 cached_multiprocessing_is_available_message = None


 # Enum class for specifying listing style.
 class ListingStyle(object):
   SHORT = 'SHORT'
   LONG = 'LONG'
   LONG_LONG = 'LONG_LONG'


 def UsingCrcmodExtension(crcmod):
   return (boto.config.get('GSUtil', 'test_assume_fast_crcmod', None) or
           (getattr(crcmod, 'crcmod', None) and
            getattr(crcmod.crcmod, '_usingExtension', None)))


 def ObjectIsGzipEncoded(obj_metadata):
   """Returns true if source apitools Object has gzip content-encoding."""
   return (obj_metadata.contentEncoding and
           obj_metadata.contentEncoding.lower().endswith('gzip'))


 def AddAcceptEncodingGzipIfNeeded(headers_dict, compressed_encoding=False):
   if compressed_encoding:
     # If we send accept-encoding: gzip with a range request, the service
     # may respond with the whole object, which would be bad for resuming.
     # So only accept gzip encoding if the object we are downloading has
     # a gzip content encoding.
     # TODO: If we want to support compressive transcoding fully in the client,
     # condition on whether we are requesting the entire range of the object.
     # In this case, we can accept the first bytes of the object compressively
     # transcoded, but we must perform data integrity checking on bytes after
     # they are decompressed on-the-fly, and any connection break must be
     # resumed without compressive transcoding since we cannot specify an
     # offset. We would also need to ensure that hashes for downloaded data
     # from objects stored with content-encoding:gzip continue to be calculated
     # prior to our own on-the-fly decompression so they match the stored hashes.
     headers_dict['accept-encoding'] = 'gzip'


 def CheckFreeSpace(path):
   """Return path/drive free space (in bytes)."""
   if IS_WINDOWS:
     try:
       # pylint: disable=invalid-name
       get_disk_free_space_ex = WINFUNCTYPE(c_int, c_wchar_p,
                                            POINTER(c_uint64),
                                            POINTER(c_uint64),
                                            POINTER(c_uint64))
       get_disk_free_space_ex = get_disk_free_space_ex(
           ('GetDiskFreeSpaceExW', windll.kernel32), (
               (1, 'lpszPathName'),
               (2, 'lpFreeUserSpace'),
               (2, 'lpTotalSpace'),
               (2, 'lpFreeSpace'),))
     except AttributeError:
       get_disk_free_space_ex = WINFUNCTYPE(c_int, c_char_p,
                                            POINTER(c_uint64),
                                            POINTER(c_uint64),
                                            POINTER(c_uint64))
       get_disk_free_space_ex = get_disk_free_space_ex(
           ('GetDiskFreeSpaceExA', windll.kernel32), (
               (1, 'lpszPathName'),
               (2, 'lpFreeUserSpace'),
               (2, 'lpTotalSpace'),
               (2, 'lpFreeSpace'),))

     def GetDiskFreeSpaceExErrCheck(result, unused_func, args):
       if not result:
         raise WinError()
       return args[1].value
     get_disk_free_space_ex.errcheck = GetDiskFreeSpaceExErrCheck

     return get_disk_free_space_ex(os.getenv('SystemDrive'))
   else:
     (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path)
     return f_frsize * f_bavail


 def CreateDirIfNeeded(dir_path, mode=0777):
   """Creates a directory, suppressing already-exists errors."""
   if not os.path.exists(dir_path):
     try:
       # Unfortunately, even though we catch and ignore EEXIST, this call will
       # output a (needless) error message (no way to avoid that in Python).
       os.makedirs(dir_path, mode)
     # Ignore 'already exists' in case user tried to start up several
     # resumable uploads concurrently from a machine where no tracker dir had
     # yet been created.
     except OSError as e:
       if e.errno != errno.EEXIST:
         raise


 def DivideAndCeil(dividend, divisor):
   """Returns ceil(dividend / divisor).

   Takes care to avoid the pitfalls of floating point arithmetic that could
   otherwise yield the wrong result for large numbers.

   Args:
     dividend: Dividend for the operation.
     divisor: Divisor for the operation.

   Returns:
     Quotient.
   """
   quotient = dividend // divisor
   if (dividend % divisor) != 0:
     quotient += 1
   return quotient


 def GetGsutilStateDir():
   """Returns the location of the directory for gsutil state files.

   Certain operations, such as cross-process credential sharing and
   resumable transfer tracking, need a known location for state files which
   are created by gsutil as-needed.

   This location should only be used for storing data that is required to be in
   a static location.

   Returns:
     Path to directory for gsutil static state files.
   """
   config_file_dir = config.get(
       'GSUtil', 'state_dir',
       os.path.expanduser(os.path.join('~', '.gsutil')))
   CreateDirIfNeeded(config_file_dir)
   return config_file_dir


 def GetCredentialStoreFilename():
   return os.path.join(GetGsutilStateDir(), 'credstore')


 def GetGceCredentialCacheFilename():
   return os.path.join(GetGsutilStateDir(), 'gcecredcache')


 def GetTabCompletionLogFilename():
   return os.path.join(GetGsutilStateDir(), 'tab-completion-logs')


 def GetTabCompletionCacheFilename():
   tab_completion_dir = os.path.join(GetGsutilStateDir(), 'tab-completion')
   # Limit read permissions on the directory to owner for privacy.
   CreateDirIfNeeded(tab_completion_dir, mode=0700)
   return os.path.join(tab_completion_dir, 'cache')


 def GetPrintableExceptionString(exc):
   """Returns a short Unicode string describing the exception."""
   return unicode(exc).encode(UTF8) or str(exc.__class__)


 def PrintableStr(input_str):
   return input_str.encode(UTF8) if input_str is not None else None


 def PrintTrackerDirDeprecationWarningIfNeeded():
   # TODO: Remove this along with the tracker_dir config value 1 year after
   # 4.6 release date. Use state_dir instead.
   if config.has_option('GSUtil', 'resumable_tracker_dir'):
     sys.stderr.write('Warning: you have set resumable_tracker_dir in your '
                      '.boto configuration file. This configuration option is '
                      'deprecated; please use the state_dir configuration '
                      'option instead.\n')


 # Name of file where we keep the timestamp for the last time we checked whether
 # a new version of gsutil is available.
 PrintTrackerDirDeprecationWarningIfNeeded()
 CreateDirIfNeeded(GetGsutilStateDir())
 LAST_CHECKED_FOR_GSUTIL_UPDATE_TIMESTAMP_FILE = (
     os.path.join(GetGsutilStateDir(), '.last_software_update_check'))


 def HasConfiguredCredentials():
   """Determines if boto credential/config file exists."""
   has_goog_creds = (config.has_option('Credentials', 'gs_access_key_id') and
                     config.has_option('Credentials', 'gs_secret_access_key'))
   has_amzn_creds = (config.has_option('Credentials', 'aws_access_key_id') and
                     config.has_option('Credentials', 'aws_secret_access_key'))
   has_oauth_creds = (
       config.has_option('Credentials', 'gs_oauth2_refresh_token'))
   has_service_account_creds = (
       HAS_CRYPTO and
       config.has_option('Credentials', 'gs_service_client_id') and
       config.has_option('Credentials', 'gs_service_key_file'))

   if (has_goog_creds or has_amzn_creds or has_oauth_creds or
       has_service_account_creds):
     return True

   valid_auth_handler = None
   try:
     valid_auth_handler = boto.auth.get_auth_handler(
         GSConnection.DefaultHost, config, Provider('google'),
         requested_capability=['s3'])
     # Exclude the no-op auth handler as indicating credentials are configured.
     # Note we can't use isinstance() here because the no-op module may not be
     # imported so we can't get a reference to the class type.
     if getattr(getattr(valid_auth_handler, '__class__', None),
                '__name__', None) == 'NoOpAuth':
       valid_auth_handler = None
   except NoAuthHandlerFound:
     pass

   return valid_auth_handler


 def ConfigureNoOpAuthIfNeeded():
   """Sets up no-op auth handler if no boto credentials are configured."""
   if not HasConfiguredCredentials():
     if (config.has_option('Credentials', 'gs_service_client_id')
         and not HAS_CRYPTO):
       if os.environ.get('CLOUDSDK_WRAPPER') == '1':
         raise CommandException('\n'.join(textwrap.wrap(
             'Your gsutil is configured with an OAuth2 service account, but '
             'you do not have PyOpenSSL or PyCrypto 2.6 or later installed. '
             'Service account authentication requires one of these libraries; '
             'please reactivate your service account via the gcloud auth '
             'command and ensure any gcloud packages necessary for '
             'service accounts are present.')))
       else:
         raise CommandException('\n'.join(textwrap.wrap(
             'Your gsutil is configured with an OAuth2 service account, but '
             'you do not have PyOpenSSL or PyCrypto 2.6 or later installed. '
             'Service account authentication requires one of these libraries; '
             'please install either of them to proceed, or configure a '
             'different type of credentials with "gsutil config".')))
     else:
       # With no boto config file the user can still access publicly readable
       # buckets and objects.
       from gslib import no_op_auth_plugin  # pylint: disable=unused-variable


 def GetConfigFilePath():
   config_path = 'no config found'
   for path in BotoConfigLocations:
     try:
       with open(path, 'r'):
         config_path = path
       break
     except IOError:
       pass
   return config_path


 def GetBotoConfigFileList():
   """Returns list of boto config files that exist."""
   config_paths = boto.pyami.config.BotoConfigLocations
   if 'AWS_CREDENTIAL_FILE' in os.environ:
     config_paths.append(os.environ['AWS_CREDENTIAL_FILE'])
   config_files = {}
   for config_path in config_paths:
     if os.path.exists(config_path):
       config_files[config_path] = 1
   cf_list = []
   for config_file in config_files:
     cf_list.append(config_file)
   return cf_list


 def GetCertsFile():
   """Configures and returns the CA Certificates file.

   If one is already configured, use it. Otherwise, amend the configuration
   (in boto.config) to use the cert roots distributed with gsutil.

   Returns:
     string filename of the certs file to use.
   """
   certs_file = boto.config.get('Boto', 'ca_certificates_file', None)
   if not certs_file:
     with certs_file_lock:
       if configured_certs_files:
         disk_certs_file = configured_certs_files[0]
       else:
         disk_certs_file = os.path.abspath(
             os.path.join(gslib.GSLIB_DIR, 'data', 'cacerts.txt'))
         if not os.path.exists(disk_certs_file):
           # If the file is not present on disk, this means the gslib module
           # doesn't actually exist on disk anywhere. This can happen if it's
           # being imported from a zip file. Unfortunately, we have to copy the
           # certs file to a local temp file on disk because the underlying SSL
           # socket requires it to be a filesystem path.
           certs_data = pkgutil.get_data('gslib', 'data/cacerts.txt')
           if not certs_data:
             raise CommandException('Certificates file not found. Please '
                                    'reinstall gsutil from scratch')
           fd, fname = tempfile.mkstemp(suffix='.txt', prefix='gsutil-cacerts')
           f = os.fdopen(fd, 'w')
           f.write(certs_data)
           f.close()
           configured_certs_files.append(fname)
           disk_certs_file = fname
       certs_file = disk_certs_file
   return certs_file


 def GetCleanupFiles():
   """Returns a list of temp files to delete (if possible) when program exits."""
   cleanup_files = []
   if configured_certs_files:
     cleanup_files += configured_certs_files
   return cleanup_files


 def ProxyInfoFromEnvironmentVar(proxy_env_var):
   """Reads proxy info from the environment and converts to httplib2.ProxyInfo.

   Args:
     proxy_env_var: Environment variable string to read, such as http_proxy or
        https_proxy.

   Returns:
     httplib2.ProxyInfo constructed from the environment string.
   """
   proxy_url = os.environ.get(proxy_env_var)
   if not proxy_url or not proxy_env_var.lower().startswith('http'):
     return httplib2.ProxyInfo(httplib2.socks.PROXY_TYPE_HTTP, None, 0)
   proxy_protocol = proxy_env_var.lower().split('_')[0]
   if not proxy_url.lower().startswith('http'):
     # proxy_info_from_url requires a protocol, which is always http or https.
     proxy_url = proxy_protocol + '://' + proxy_url
   return httplib2.proxy_info_from_url(proxy_url, method=proxy_protocol)


 def GetNewHttp(http_class=httplib2.Http, **kwargs):
   """Creates and returns a new httplib2.Http instance.

   Args:
     http_class: Optional custom Http class to use.
     **kwargs: Arguments to pass to http_class constructor.

   Returns:
     An initialized httplib2.Http instance.
   """
   proxy_info = httplib2.ProxyInfo(
       proxy_type=3,
       proxy_host=boto.config.get('Boto', 'proxy', None),
       proxy_port=boto.config.getint('Boto', 'proxy_port', 0),
       proxy_user=boto.config.get('Boto', 'proxy_user', None),
       proxy_pass=boto.config.get('Boto', 'proxy_pass', None),
       proxy_rdns=boto.config.get('Boto', 'proxy_rdns', False))

   if not (proxy_info.proxy_host and proxy_info.proxy_port):
     # Fall back to using the environment variable.
     for proxy_env_var in ['http_proxy', 'https_proxy', 'HTTPS_PROXY']:
       if proxy_env_var in os.environ and os.environ[proxy_env_var]:
         proxy_info = ProxyInfoFromEnvironmentVar(proxy_env_var)
         # Assume proxy_rnds is True if a proxy environment variable exists.
         proxy_info.proxy_rdns = boto.config.get('Boto', 'proxy_rdns', True)
         break

   # Some installers don't package a certs file with httplib2, so use the
   # one included with gsutil.
   kwargs['ca_certs'] = GetCertsFile()
   # Use a non-infinite SSL timeout to avoid hangs during network flakiness.
   kwargs['timeout'] = SSL_TIMEOUT
   http = http_class(proxy_info=proxy_info, **kwargs)
   http.disable_ssl_certificate_validation = (not config.getbool(
       'Boto', 'https_validate_certificates'))
   return http


 # Retry for 10 minutes with exponential backoff, which corresponds to
 # the maximum Downtime Period specified in the GCS SLA
 # (https://cloud.google.com/storage/sla)
 def GetNumRetries():
   return config.getint('Boto', 'num_retries', 23)


 def GetMaxRetryDelay():
   return config.getint('Boto', 'max_retry_delay', 32)


 # Resumable downloads and uploads make one HTTP call per chunk (and must be
 # in multiples of 256KiB). Overridable for testing.
 def GetJsonResumableChunkSize():
   chunk_size = config.getint('GSUtil', 'json_resumable_chunk_size',
                              1024*1024*100L)
   if chunk_size == 0:
     chunk_size = 1024*256L
   elif chunk_size % 1024*256L != 0:
     chunk_size += (1024*256L - (chunk_size % (1024*256L)))
   return chunk_size


 def JsonResumableChunkSizeDefined():
   chunk_size_defined = config.get('GSUtil', 'json_resumable_chunk_size',
                                   None)
   return chunk_size_defined is not None


 def _RoundToNearestExponent(num):
   i = 0
   while i+1 < len(_EXP_STRINGS) and num >= (2 ** _EXP_STRINGS[i+1][0]):
     i += 1
   return i, round(float(num) / 2 ** _EXP_STRINGS[i][0], 2)


 def MakeHumanReadable(num):
   """Generates human readable string for a number of bytes.

   Args:
     num: The number, in bytes.

   Returns:
     A string form of the number using size abbreviations (KiB, MiB, etc.).
   """
   i, rounded_val = _RoundToNearestExponent(num)
   return '%g %s' % (rounded_val, _EXP_STRINGS[i][1])


 def MakeBitsHumanReadable(num):
   """Generates human readable string for a number of bits.

   Args:
     num: The number, in bits.

   Returns:
     A string form of the number using bit size abbreviations (kbit, Mbit, etc.)
   """
   i, rounded_val = _RoundToNearestExponent(num)
   return '%g %s' % (rounded_val, _EXP_STRINGS[i][2])


 def HumanReadableToBytes(human_string):
   """Tries to convert a human-readable string to a number of bytes.

   Args:
     human_string: A string supplied by user, e.g. '1M', '3 GiB'.
   Returns:
     An integer containing the number of bytes.
   Raises:
     ValueError: on an invalid string.
   """
   human_string = human_string.lower()
   m = MATCH_HUMAN_BYTES.match(human_string)
   if m:
     num = float(m.group('num'))
     if m.group('suffix'):
       power = _EXP_STRINGS[SUFFIX_TO_SI[m.group('suffix')]][0]
       num *= (2.0 ** power)
     num = int(round(num))
     return num
   raise ValueError('Invalid byte string specified: %s' % human_string)


 def Percentile(values, percent, key=lambda x: x):
   """Find the percentile of a list of values.

   Taken from: http://code.activestate.com/recipes/511478/

   Args:
     values: a list of numeric values. Note that the values MUST BE already
             sorted.
     percent: a float value from 0.0 to 1.0.
     key: optional key function to compute value from each element of the list
          of values.

   Returns:
     The percentile of the values.
   """
   if not values:
     return None
   k = (len(values) - 1) * percent
   f = math.floor(k)
   c = math.ceil(k)
   if f == c:
     return key(values[int(k)])
   d0 = key(values[int(f)]) * (c-k)
   d1 = key(values[int(c)]) * (k-f)
   return d0 + d1


 def RemoveCRLFFromString(input_str):
   """Returns the input string with all \\n and \\r removed."""
   return re.sub(r'[\r\n]', '', input_str)


 def UnaryDictToXml(message):
   """Generates XML representation of a nested dict.

   This dict contains exactly one top-level entry and an arbitrary number of
   2nd-level entries, e.g. capturing a WebsiteConfiguration message.

   Args:
     message: The dict encoding the message.

   Returns:
     XML string representation of the input dict.

   Raises:
     Exception: if dict contains more than one top-level entry.
   """
   if len(message) != 1:
     raise Exception('Expected dict of size 1, got size %d' % len(message))

   name, content = message.items()[0]
   element_type = ElementTree.Element(name)
   for element_property, value in sorted(content.items()):
     node = ElementTree.SubElement(element_type, element_property)
     node.text = value
   return ElementTree.tostring(element_type)


 def LookUpGsutilVersion(gsutil_api, url_str):
   """Looks up the gsutil version of the specified gsutil tarball URL.

   Version is specified in the metadata field set on that object.

   Args:
     gsutil_api: gsutil Cloud API to use when retrieving gsutil tarball.
     url_str: tarball URL to retrieve (such as 'gs://pub/gsutil.tar.gz').

   Returns:
     Version string if URL is a cloud URL containing x-goog-meta-gsutil-version
     metadata, else None.
   """
   url = StorageUrlFromString(url_str)
   if url.IsCloudUrl():
     obj = gsutil_api.GetObjectMetadata(url.bucket_name, url.object_name,
                                        provider=url.scheme,
                                        fields=['metadata'])
     if obj.metadata and obj.metadata.additionalProperties:
       for prop in obj.metadata.additionalProperties:
         if prop.key == 'gsutil_version':
           return prop.value


 def GetGsutilVersionModifiedTime():
   """Returns unix timestamp of when the VERSION file was last modified."""
   if not gslib.VERSION_FILE:
     return 0
   return int(os.path.getmtime(gslib.VERSION_FILE))


 def IsRunningInteractively():
   """Returns True if currently running interactively on a TTY."""
   return sys.stdout.isatty() and sys.stderr.isatty() and sys.stdin.isatty()


 def _HttpsValidateCertifcatesEnabled():
   return config.get('Boto', 'https_validate_certificates', True)

 CERTIFICATE_VALIDATION_ENABLED = _HttpsValidateCertifcatesEnabled()


 def _BotoIsSecure():
   return config.get('Boto', 'is_secure', True)

 BOTO_IS_SECURE = _BotoIsSecure()


 def ResumableThreshold():
   return config.getint('GSUtil', 'resumable_threshold', EIGHT_MIB)


 # pylint: disable=too-many-statements
 def PrintFullInfoAboutObject(bucket_listing_ref, incl_acl=True):
   """Print full info for given object (like what displays for gsutil ls -L).

   Args:
     bucket_listing_ref: BucketListingRef being listed.
                         Must have ref_type OBJECT and a populated root_object
                         with the desired fields.
     incl_acl: True if ACL info should be output.

   Returns:
     Tuple (number of objects, object_length)

   Raises:
     Exception: if calling bug encountered.
   """
   url_str = bucket_listing_ref.url_string
   storage_url = StorageUrlFromString(url_str)
   obj = bucket_listing_ref.root_object

   if (obj.metadata and S3_DELETE_MARKER_GUID in
       obj.metadata.additionalProperties):
     num_bytes = 0
     num_objs = 0
     url_str += '<DeleteMarker>'
   else:
     num_bytes = obj.size
     num_objs = 1

   print '%s:' % url_str.encode(UTF8)
   if obj.updated:
     print '\tCreation time:\t\t%s' % obj.updated.strftime(
         '%a, %d %b %Y %H:%M:%S GMT')
   if obj.cacheControl:
     print '\tCache-Control:\t\t%s' % obj.cacheControl
   if obj.contentDisposition:
     print '\tContent-Disposition:\t\t%s' % obj.contentDisposition
   if obj.contentEncoding:
     print '\tContent-Encoding:\t\t%s' % obj.contentEncoding
   if obj.contentLanguage:
     print '\tContent-Language:\t%s' % obj.contentLanguage
   print '\tContent-Length:\t\t%s' % obj.size
   print '\tContent-Type:\t\t%s' % obj.contentType
   if obj.componentCount:
     print '\tComponent-Count:\t%d' % obj.componentCount
   marker_props = {}
   if obj.metadata and obj.metadata.additionalProperties:
     non_marker_props = []
     for add_prop in obj.metadata.additionalProperties:
       if add_prop.key not in S3_MARKER_GUIDS:
         non_marker_props.append(add_prop)
       else:
         marker_props[add_prop.key] = add_prop.value
     if non_marker_props:
       print '\tMetadata:'
       for ap in non_marker_props:
         meta_string = '\t\t%s:\t\t%s' % (ap.key, ap.value)
         print meta_string.encode(UTF8)
   if obj.customerEncryption:
     if not obj.crc32c: print '\tHash (crc32c):\t\tencrypted'
     if not obj.md5Hash: print '\tHash (md5):\t\tencrypted'
     print ('\tEncryption algorithm:\t%s' %
            obj.customerEncryption.encryptionAlgorithm)
     print '\tEncryption key SHA256:\t%s' % obj.customerEncryption.keySha256
   if obj.crc32c: print '\tHash (crc32c):\t\t%s' % obj.crc32c
   if obj.md5Hash: print '\tHash (md5):\t\t%s' % obj.md5Hash
   print '\tETag:\t\t\t%s' % obj.etag.strip('"\'')
   if obj.generation:
     generation_str = GenerationFromUrlAndString(storage_url, obj.generation)
     print '\tGeneration:\t\t%s' % generation_str
   if obj.metageneration:
     print '\tMetageneration:\t\t%s' % obj.metageneration
   if incl_acl:
     # JSON API won't return acls as part of the response unless we have
     # full control scope
     if obj.acl:
       print '\tACL:\t\t%s' % AclTranslation.JsonFromMessage(obj.acl)
     elif S3_ACL_MARKER_GUID in marker_props:
       print '\tACL:\t\t%s' % marker_props[S3_ACL_MARKER_GUID]
     else:
       print ('\tACL:\t\t\tACCESS DENIED. Note: you need OWNER '
              'permission\n\t\t\t\ton the object to read its ACL.')

   return (num_objs, num_bytes)


 def CompareVersions(first, second):
   """Compares the first and second gsutil version strings.

   For example, 3.33 > 3.7, and 4.1 is a greater major version than 3.33.
   Does not handle multiple periods (e.g. 3.3.4) or complicated suffixes
   (e.g., 3.3RC4 vs. 3.3RC5). A version string with a suffix is treated as
   less than its non-suffix counterpart (e.g. 3.32 > 3.32pre).

   Args:
     first: First gsutil version string.
     second: Second gsutil version string.

   Returns:
     (g, m):
        g is True if first known to be greater than second, else False.
        m is True if first known to be greater by at least 1 major version,
          else False.
   """
   m1 = VERSION_MATCHER.match(str(first))
   m2 = VERSION_MATCHER.match(str(second))

   # If passed strings we don't know how to handle, be conservative.
   if not m1 or not m2:
     return (False, False)

   major_ver1 = int(m1.group('maj'))
   minor_ver1 = int(m1.group('min')) if m1.group('min') else 0
   suffix_ver1 = m1.group('suffix')
   major_ver2 = int(m2.group('maj'))
   minor_ver2 = int(m2.group('min')) if m2.group('min') else 0
   suffix_ver2 = m2.group('suffix')

   if major_ver1 > major_ver2:
     return (True, True)
   elif major_ver1 == major_ver2:
     if minor_ver1 > minor_ver2:
       return (True, False)
     elif minor_ver1 == minor_ver2:
       return (bool(suffix_ver2) and not suffix_ver1, False)
   return (False, False)


 def _IncreaseSoftLimitForResource(resource_name, fallback_value):
   """Sets a new soft limit for the maximum number of open files.

   The soft limit is used for this process (and its children), but the
   hard limit is set by the system and cannot be exceeded.

   We will first try to set the soft limit to the hard limit's value; if that
   fails, we will try to set the soft limit to the fallback_value iff this would
   increase the soft limit.

   Args:
     resource_name: Name of the resource to increase the soft limit for.
     fallback_value: Fallback value to be used if we couldn't set the
                     soft value to the hard value (e.g., if the hard value
                     is "unlimited").

   Returns:
     Current soft limit for the resource (after any changes we were able to
     make), or -1 if the resource doesn't exist.
   """

   # Get the value of the resource.
   try:
     (soft_limit, hard_limit) = resource.getrlimit(resource_name)
   except (resource.error, ValueError):
     # The resource wasn't present, so we can't do anything here.
     return -1

   # Try to set the value of the soft limit to the value of the hard limit.
   if hard_limit > soft_limit:  # Some OS's report 0 for "unlimited".
     try:
       resource.setrlimit(resource_name, (hard_limit, hard_limit))
       return hard_limit
     except (resource.error, ValueError):
       # We'll ignore this and try the fallback value.
       pass

   # Try to set the value of the soft limit to the fallback value.
   if soft_limit < fallback_value:
     try:
       resource.setrlimit(resource_name, (fallback_value, hard_limit))
       return fallback_value
     except (resource.error, ValueError):
       # We couldn't change the soft limit, so just report the current
       # value of the soft limit.
       return soft_limit
   else:
     return soft_limit


 def GetCloudApiInstance(cls, thread_state=None):
   """Gets a gsutil Cloud API instance.

   Since Cloud API implementations are not guaranteed to be thread-safe, each
   thread needs its own instance. These instances are passed to each thread
   via the thread pool logic in command.

   Args:
     cls: Command class to be used for single-threaded case.
     thread_state: Per thread state from this thread containing a gsutil
                   Cloud API instance.

   Returns:
     gsutil Cloud API instance.
   """
   return thread_state or cls.gsutil_api


 def GetFileSize(fp, position_to_eof=False):
   """Returns size of file, optionally leaving fp positioned at EOF."""
   if not position_to_eof:
     cur_pos = fp.tell()
   fp.seek(0, os.SEEK_END)
   cur_file_size = fp.tell()
   if not position_to_eof:
     fp.seek(cur_pos)
   return cur_file_size


 def GetStreamFromFileUrl(storage_url):
   if storage_url.IsStream():
     return sys.stdin
   else:
     return open(storage_url.object_name, 'rb')


 def UrlsAreForSingleProvider(url_args):
   """Tests whether the URLs are all for a single provider.

   Args:
     url_args: Strings to check.

   Returns:
     True if URLs are for single provider, False otherwise.
   """
   provider = None
   url = None
   for url_str in url_args:
     url = StorageUrlFromString(url_str)
     if not provider:
       provider = url.scheme
     elif url.scheme != provider:
       return False
   return provider is not None


 def HaveFileUrls(args_to_check):
   """Checks whether args_to_check contain any file URLs.

   Args:
     args_to_check: Command-line argument subset to check.

   Returns:
     True if args_to_check contains any file URLs.
   """
   for url_str in args_to_check:
     storage_url = StorageUrlFromString(url_str)
     if storage_url.IsFileUrl():
       return True
   return False


 def HaveProviderUrls(args_to_check):
   """Checks whether args_to_check contains any provider URLs (like 'gs://').

   Args:
     args_to_check: Command-line argument subset to check.

   Returns:
     True if args_to_check contains any provider URLs.
   """
   for url_str in args_to_check:
     storage_url = StorageUrlFromString(url_str)
     if storage_url.IsCloudUrl() and storage_url.IsProvider():
       return True
   return False

 # This must be defined at the module level for pickling across processes.
 MultiprocessingIsAvailableResult = collections.namedtuple(
     'MultiprocessingIsAvailableResult', ['is_available', 'stack_trace'])


 def CheckMultiprocessingAvailableAndInit(logger=None):
   """Checks if multiprocessing is available.

   There are some environments in which there is no way to use multiprocessing
   logic that's built into Python (e.g., if /dev/shm is not available, then
   we can't create semaphores). This simply tries out a few things that will be
   needed to make sure the environment can support the pieces of the
   multiprocessing module that we need.

   If multiprocessing is available, this performs necessary initialization for
   multiprocessing.  See gslib.command.InitializeMultiprocessingVariables for
   an explanation of why this is necessary.

   Args:
     logger: logging.logger to use for debug output.

   Returns:
     (multiprocessing_is_available, stack_trace):
       multiprocessing_is_available: True iff the multiprocessing module is
                                     available for use.
       stack_trace: The stack trace generated by the call we tried that failed.
   """
   # pylint: disable=global-variable-undefined
   global cached_multiprocessing_is_available
   global cached_multiprocessing_check_stack_trace
   global cached_multiprocessing_is_available_message
   if cached_multiprocessing_is_available is not None:
     if logger:
       logger.debug(cached_multiprocessing_check_stack_trace)
       logger.warn(cached_multiprocessing_is_available_message)
     return MultiprocessingIsAvailableResult(
         is_available=cached_multiprocessing_is_available,
         stack_trace=cached_multiprocessing_check_stack_trace)

   if IS_WINDOWS:
     message = """
 Multiple processes are not supported on Windows. Operations requesting
 parallelism will be executed with multiple threads in a single process only.
 """
     if logger:
       logger.warn(message)
     return MultiprocessingIsAvailableResult(is_available=False,
                                             stack_trace=None)

   stack_trace = None
   multiprocessing_is_available = True
   message = """
 You have requested multiple processes for an operation, but the
 required functionality of Python\'s multiprocessing module is not available.
 Operations requesting parallelism will be executed with multiple threads in a
 single process only.
 """
   try:
     # Fails if /dev/shm (or some equivalent thereof) is not available for use
     # (e.g., there's no implementation, or we can't write to it, etc.).
     try:
       multiprocessing.Value('i', 0)
     except:
       message += """
 Please ensure that you have write access to both /dev/shm and /run/shm.
 """
       raise  # We'll handle this in one place below.

     global manager  # pylint: disable=global-variable-undefined
     manager = multiprocessing.Manager()

     # Check that the max number of open files is reasonable. Always check this
     # after we're sure that the basic multiprocessing functionality is
     # available, since this won't matter unless that's true.
     limit = -1
     if HAS_RESOURCE_MODULE:
       # Try to set this with both resource names - RLIMIT_NOFILE for most Unix
       # platforms, and RLIMIT_OFILE for BSD. Ignore AttributeError because the
       # "resource" module is not guaranteed to know about these names.
       try:
         limit = max(limit,
                     _IncreaseSoftLimitForResource(
                         resource.RLIMIT_NOFILE,
                         MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
       except AttributeError:
         pass
       try:
         limit = max(limit,
                     _IncreaseSoftLimitForResource(
                         resource.RLIMIT_OFILE, MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
       except AttributeError:
         pass

     if limit < MIN_ACCEPTABLE_OPEN_FILES_LIMIT:
       message += ("""
 Your max number of open files, %s, is too low to allow safe multiprocessing.
 On Linux you can fix this by adding something like "ulimit -n 10000" to your
 ~/.bashrc or equivalent file and opening a new terminal.

 On MacOS, you may also need to run a command like this once (in addition to the
 above instructions), which might require a restart of your system to take
 effect:
   launchctl limit maxfiles 10000

 Alternatively, edit /etc/launchd.conf with something like:
   limit maxfiles 10000 10000

 """ % limit)
       raise Exception('Max number of open files, %s, is too low.' % limit)
   except:  # pylint: disable=bare-except
     stack_trace = traceback.format_exc()
     multiprocessing_is_available = False
     if logger is not None:
       logger.debug(stack_trace)
       logger.warn(message)

   # Set the cached values so that we never need to do this check again.
   cached_multiprocessing_is_available = multiprocessing_is_available
   cached_multiprocessing_check_stack_trace = stack_trace
   cached_multiprocessing_is_available_message = message
   return MultiprocessingIsAvailableResult(
       is_available=cached_multiprocessing_is_available,
       stack_trace=cached_multiprocessing_check_stack_trace)


 def CreateLock():
   """Returns either a multiprocessing lock or a threading lock.

   Use Multiprocessing lock iff we have access to the parts of the
   multiprocessing module that are necessary to enable parallelism in operations.

   Returns:
     Multiprocessing or threading lock.
   """
   if CheckMultiprocessingAvailableAndInit().is_available:
     return manager.Lock()
   else:
     return threading.Lock()


 def IsCloudSubdirPlaceholder(url, blr=None):
   """Determines if URL is a cloud subdir placeholder.

   This function is needed because GUI tools (like the GCS cloud console) allow
   users to create empty "folders" by creating a placeholder object; and parts
   of gsutil need to treat those placeholder objects specially. For example,
   gsutil rsync needs to avoid downloading those objects because they can cause
   conflicts (see comments in rsync command for details).

   We currently detect two cases:
     - Cloud objects whose name ends with '_$folder$'
     - Cloud objects whose name ends with '/'

   Args:
     url: The URL to be checked.
     blr: BucketListingRef to check, or None if not available.
          If None, size won't be checked.

   Returns:
     True/False.
   """
   if not url.IsCloudUrl():
     return False
   url_str = url.url_string
   if url_str.endswith('_$folder$'):
     return True
   if blr and blr.IsObject():
     size = blr.root_object.size
   else:
     size = 0
   return size == 0 and url_str.endswith('/')


 def GetTermLines():
   """Returns number of terminal lines."""
   # fcntl isn't supported in Windows.
   try:
     import fcntl    # pylint: disable=g-import-not-at-top
     import termios  # pylint: disable=g-import-not-at-top
   except ImportError:
     return _DEFAULT_LINES
   def ioctl_GWINSZ(fd):  # pylint: disable=invalid-name
     try:
       return struct.unpack(
           'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))[0]
     except:  # pylint: disable=bare-except
       return 0  # Failure (so will retry on different file descriptor below).
   # Try to find a valid number of lines from termio for stdin, stdout,
   # or stderr, in that order.
   ioc = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
   if not ioc:
     try:
       fd = os.open(os.ctermid(), os.O_RDONLY)
       ioc = ioctl_GWINSZ(fd)
       os.close(fd)
     except:  # pylint: disable=bare-except
       pass
   if not ioc:
     ioc = os.environ.get('LINES', _DEFAULT_LINES)
   return int(ioc)


 def FixWindowsEncodingIfNeeded(input_str):
   """Attempts to detect Windows CP1252 encoding and convert to UTF8.

   Windows doesn't provide a way to set UTF-8 for string encodings; you can set
   the system locale (see
   http://windows.microsoft.com/en-us/windows/change-system-locale#1TC=windows-7)
   but that takes you to a "Change system locale" dropdown that just lists
   languages (e.g., "English (United States)". Instead, we're forced to check if
   a encoding as UTF8 raises an exception and if so, try converting from CP1252
   to Unicode.

   Args:
     input_str: The input string.
   Returns:
     The converted string (or the original, if conversion wasn't needed).
   """
   if IS_CP1252:
     return input_str.decode(WINDOWS_1252).encode(UTF8)
   else:
     return input_str


 class GsutilStreamHandler(logging.StreamHandler):
   """A subclass of StreamHandler for use in gsutil."""

   def flush(self):
     # Note: we override the flush method here due to a python 2.6 bug. The
     # python logging module will try to flush all stream handlers at exit.
     # If the StreamHandler is pointing to a file that is already closed, the
     # method throws an exception. Our unit tests temporarily redirect stderr,
     # which causes the default StreamHandler to open its stream against a
     # temporary file. By the time the process shuts down, the underlying file
     # is closed, causing an exception. This was fixed in Python 2.7, but to
     # remove the flake from Python 2.6, we maintain this here.
     try:
       logging.StreamHandler.flush(self)
     except ValueError:
       pass


 def StdinIterator():
   """A generator function that returns lines from stdin."""
   for line in sys.stdin:
     # Strip CRLF.
     yield line.rstrip()


 def ConvertRecursiveToFlatWildcard(url_strs):
   """A generator that adds '**' to each url string in url_strs."""
   for url_str in url_strs:
     yield '%s**' % url_str