| # -*- coding: utf-8 -*- |
| # Copyright 2010 Google Inc. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """Static data and helper functions.""" |
| |
| from __future__ import absolute_import |
| |
| import collections |
| import errno |
| import locale |
| import logging |
| import math |
| import multiprocessing |
| import os |
| import pkgutil |
| import re |
| import struct |
| import sys |
| import tempfile |
| import textwrap |
| import threading |
| import traceback |
| import xml.etree.ElementTree as ElementTree |
| |
| import boto |
| from boto import config |
| import boto.auth |
| from boto.exception import NoAuthHandlerFound |
| from boto.gs.connection import GSConnection |
| from boto.provider import Provider |
| from boto.pyami.config import BotoConfigLocations |
| |
| import gslib |
| from gslib.exception import CommandException |
| from gslib.storage_url import StorageUrlFromString |
| from gslib.translation_helper import AclTranslation |
| from gslib.translation_helper import GenerationFromUrlAndString |
| from gslib.translation_helper import S3_ACL_MARKER_GUID |
| from gslib.translation_helper import S3_DELETE_MARKER_GUID |
| from gslib.translation_helper import S3_MARKER_GUIDS |
| |
| import httplib2 |
| from oauth2client.client import HAS_CRYPTO |
| from retry_decorator import retry_decorator |
| |
# Detect platform types.
PLATFORM = str(sys.platform).lower()
IS_WINDOWS, IS_CYGWIN, IS_LINUX, IS_OSX = [
    marker in PLATFORM for marker in ('win32', 'cygwin', 'linux', 'darwin')]

# Codec names referenced by the encoding helpers in this module.
UTF8 = 'utf-8'
WINDOWS_1252 = 'cp1252'

# The ctypes names below are only imported when running on Windows; on every
# other platform IS_CP1252 is simply False.
# pylint: disable=g-import-not-at-top
if not IS_WINDOWS:
  IS_CP1252 = False
else:
  from ctypes import c_int
  from ctypes import c_uint64
  from ctypes import c_char_p
  from ctypes import c_wchar_p
  from ctypes import windll
  from ctypes import POINTER
  from ctypes import WINFUNCTYPE
  from ctypes import WinError
  IS_CP1252 = locale.getdefaultlocale()[1] == WINDOWS_1252
| |
# Probe for the optional `resource` module and record the outcome in
# HAS_RESOURCE_MODULE rather than failing at import time.
# pylint: disable=g-import-not-at-top
try:
  # This module doesn't necessarily exist on Windows.
  import resource
  HAS_RESOURCE_MODULE = True
except ImportError:
  # The exception object itself is not needed, so it is not bound to a name.
  HAS_RESOURCE_MODULE = False
| |
# Debug level thresholds (named for what is dumped at each level).
DEBUGLEVEL_DUMP_REQUESTS = 3
DEBUGLEVEL_DUMP_REQUESTS_AND_PAYLOADS = 4

# Binary size units, in bytes.
ONE_KIB = 1024
ONE_MIB = ONE_KIB * ONE_KIB
TWO_MIB = 2 * ONE_MIB
EIGHT_MIB = 8 * ONE_MIB
TEN_MIB = 10 * ONE_MIB
DEFAULT_FILE_BUFFER_SIZE = 8 * ONE_KIB
_DEFAULT_LINES = 25

# By default, the timeout for SSL read errors is infinite. This could
# cause gsutil to hang on network disconnect, so pick a more reasonable
# timeout.
SSL_TIMEOUT = 60

# Start with a progress callback every 64 KiB during uploads/downloads (JSON
# API). Callback implementation should back off until it hits the maximum size
# so that callbacks do not create huge amounts of log output.
START_CALLBACK_PER_BYTES = 64 * ONE_KIB
MAX_CALLBACK_PER_BYTES = 100 * ONE_MIB

# Upload/download files in 8 KiB chunks over the HTTP connection.
TRANSFER_BUFFER_SIZE = 8 * ONE_KIB

# Default number of progress callbacks during transfer (XML API).
XML_PROGRESS_CALLBACKS = 10

# For files >= this size, output a message indicating that we're running an
# operation on the file (like hashing or gzipping) so it does not appear to the
# user that the command is hanging.
MIN_SIZE_COMPUTE_LOGGING = 100 * ONE_MIB  # 100 MiB
| |
# Sentinel meaning "no upper bound". sys.maxint only exists on Python 2, so
# fall back to sys.maxsize where it is missing; on Python 2 the value is
# unchanged.
NO_MAX = getattr(sys, 'maxint', sys.maxsize)

# Parses "<major>[.<minor>[<suffix>]]" version strings into the named groups
# maj, min and suffix.
VERSION_MATCHER = re.compile(r'^(?P<maj>\d+)(\.(?P<min>\d+)(?P<suffix>.*))?')

# Location of the published gsutil release notes.
RELEASE_NOTES_URL = 'https://pub.storage.googleapis.com/gsutil_ReleaseNotes.txt'
| |
# Binary exponentiation strings.
# Each entry is (power of two, byte-unit name, bit-unit name[, short suffix]);
# the 2**0 entry has no short suffix. Every string after the exponent is
# accepted (lower-cased) as a suffix by _GenerateSuffixRegex.
_EXP_STRINGS = [
    (0, 'B', 'bit'),
    (10, 'KiB', 'Kibit', 'K'),
    (20, 'MiB', 'Mibit', 'M'),
    (30, 'GiB', 'Gibit', 'G'),
    (40, 'TiB', 'Tibit', 'T'),
    (50, 'PiB', 'Pibit', 'P'),
    (60, 'EiB', 'Eibit', 'E'),
]


# NOTE(review): `manager` is not assigned anywhere in the visible part of this
# file; presumably it is set elsewhere -- confirm.
global manager  # pylint: disable=global-at-module-level
# Held by GetCertsFile while it inspects or extends configured_certs_files.
certs_file_lock = threading.Lock()
# Temp cert files created by GetCertsFile; GetCleanupFiles reports them.
configured_certs_files = []
| |
def _GenerateSuffixRegex():
  """Builds the matcher used to parse human-readable byte counts.

  Returns:
    (suffix_to_si, matcher): suffix_to_si maps each lower-cased suffix from
    _EXP_STRINGS to the index of its entry; matcher is a compiled regex with
    the named groups 'num' and (optionally) 'suffix'.
  """
  suffix_to_si = {}
  all_suffixes = []
  for index, entry in enumerate(_EXP_STRINGS):
    # entry[0] is the exponent; everything after it is a suffix spelling.
    for spelling in entry[1:]:
      lowered = spelling.lower()
      suffix_to_si[lowered] = index
      all_suffixes.append(lowered)
  pattern = (r'(?P<num>\d*\.\d+|\d+)\s*(?P<suffix>%s)?'
             % '|'.join(all_suffixes))
  return suffix_to_si, re.compile(pattern)

SUFFIX_TO_SI, MATCH_HUMAN_BYTES = _GenerateSuffixRegex()
| |
# Number of seconds in one day (3600 seconds per hour * 24 hours).
SECONDS_PER_DAY = 3600 * 24

# On Unix-like systems, we will set the maximum number of open files to avoid
# hitting the limit imposed by the OS. This number was obtained experimentally.
MIN_ACCEPTABLE_OPEN_FILES_LIMIT = 1000

# Cloud URL of the published gsutil tarball.
GSUTIL_PUB_TARBALL = 'gs://pub/gsutil.tar.gz'

# Module-level alias for the retry decorator from the retry_decorator package.
Retry = retry_decorator.retry  # pylint: disable=invalid-name

# Cache the values from this check such that they're available to all callers
# without needing to run all the checks again (some of these, such as calling
# multiprocessing.Manager(), are expensive operations).
# All three start as None, i.e. "not computed yet".
cached_multiprocessing_is_available = None
cached_multiprocessing_is_available_stack_trace = None
cached_multiprocessing_is_available_message = None
| |
| |
# Enum class for specifying listing style.
class ListingStyle(object):
  """String constants naming the available listing styles."""
  # Each constant's value is its own name.
  SHORT = 'SHORT'
  LONG = 'LONG'
  LONG_LONG = 'LONG_LONG'
| |
| |
def UsingCrcmodExtension(crcmod):
  """Reports whether crcmod appears to be using its compiled extension.

  Args:
    crcmod: The imported crcmod module (or package) to inspect.

  Returns:
    The truthy 'test_assume_fast_crcmod' config value when one is set;
    otherwise the module's crcmod._usingExtension attribute, or a falsy value
    when the expected attributes are missing.
  """
  assumed = boto.config.get('GSUtil', 'test_assume_fast_crcmod', None)
  if assumed:
    return assumed
  inner_module = getattr(crcmod, 'crcmod', None)
  if not inner_module:
    return inner_module
  return getattr(crcmod.crcmod, '_usingExtension', None)
| |
| |
def ObjectIsGzipEncoded(obj_metadata):
  """Tells whether an object's content-encoding ends with 'gzip'.

  Args:
    obj_metadata: Object exposing a contentEncoding attribute.

  Returns:
    The falsy contentEncoding itself when it is unset/empty; otherwise True if
    the lower-cased encoding ends with 'gzip', else False.
  """
  encoding = obj_metadata.contentEncoding
  if not encoding:
    return encoding
  return encoding.lower().endswith('gzip')
| |
| |
def AddAcceptEncodingGzipIfNeeded(headers_dict, compressed_encoding=False):
  """Sets 'accept-encoding: gzip' in headers_dict for compressed objects.

  Args:
    headers_dict: Dict of request headers; modified in place.
    compressed_encoding: True if the object being downloaded has a gzip
        content encoding. When False, headers_dict is left untouched.
  """
  if not compressed_encoding:
    return
  # If we send accept-encoding: gzip with a range request, the service
  # may respond with the whole object, which would be bad for resuming.
  # So only accept gzip encoding if the object we are downloading has
  # a gzip content encoding.
  # TODO: If we want to support compressive transcoding fully in the client,
  # condition on whether we are requesting the entire range of the object.
  # In this case, we can accept the first bytes of the object compressively
  # transcoded, but we must perform data integrity checking on bytes after
  # they are decompressed on-the-fly, and any connection break must be
  # resumed without compressive transcoding since we cannot specify an
  # offset. We would also need to ensure that hashes for downloaded data
  # from objects stored with content-encoding:gzip continue to be calculated
  # prior to our own on-the-fly decompression so they match the stored hashes.
  headers_dict.update({'accept-encoding': 'gzip'})
| |
| |
def CheckFreeSpace(path):
  """Return path/drive free space (in bytes).

  On Windows the kernel32 GetDiskFreeSpaceEx function is called for the drive
  named by the SystemDrive environment variable; the path argument is not
  consulted in that branch. Elsewhere the result is computed from
  os.statvfs(path) as f_frsize * f_bavail.

  Args:
    path: Filesystem path to inspect (used on non-Windows platforms only).

  Returns:
    Number of free bytes.

  Raises:
    WindowsError: (via WinError) on Windows if the API call reports failure.
  """
  if IS_WINDOWS:
    try:
      # Prefer the wide-character variant of the API.
      # pylint: disable=invalid-name
      get_disk_free_space_ex = WINFUNCTYPE(c_int, c_wchar_p,
                                           POINTER(c_uint64),
                                           POINTER(c_uint64),
                                           POINTER(c_uint64))
      # Bind the prototype to the kernel32 export. In ctypes paramflags,
      # 1 marks an input parameter and 2 an output parameter.
      get_disk_free_space_ex = get_disk_free_space_ex(
          ('GetDiskFreeSpaceExW', windll.kernel32), (
              (1, 'lpszPathName'),
              (2, 'lpFreeUserSpace'),
              (2, 'lpTotalSpace'),
              (2, 'lpFreeSpace'),))
    except AttributeError:
      # Fall back to the narrow-character variant of the same API.
      get_disk_free_space_ex = WINFUNCTYPE(c_int, c_char_p,
                                           POINTER(c_uint64),
                                           POINTER(c_uint64),
                                           POINTER(c_uint64))
      get_disk_free_space_ex = get_disk_free_space_ex(
          ('GetDiskFreeSpaceExA', windll.kernel32), (
              (1, 'lpszPathName'),
              (2, 'lpFreeUserSpace'),
              (2, 'lpTotalSpace'),
              (2, 'lpFreeSpace'),))

    def GetDiskFreeSpaceExErrCheck(result, unused_func, args):
      # A zero result means the call failed; otherwise return the second
      # declared parameter (lpFreeUserSpace).
      if not result:
        raise WinError()
      return args[1].value
    get_disk_free_space_ex.errcheck = GetDiskFreeSpaceExErrCheck

    return get_disk_free_space_ex(os.getenv('SystemDrive'))
  else:
    # Only the fragment size and the available-block count are needed.
    (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path)
    return f_frsize * f_bavail
| |
| |
def CreateDirIfNeeded(dir_path, mode=0o777):
  """Creates a directory, suppressing already-exists errors.

  Args:
    dir_path: Path of the directory to create; missing parent directories are
        created too (os.makedirs).
    mode: Permission bits passed to os.makedirs. Written as 0o777 so the
        literal parses on Python 2.6+ as well as Python 3.

  Raises:
    OSError: if the directory cannot be created for any reason other than it
        already existing.
  """
  if os.path.exists(dir_path):
    return
  try:
    # Unfortunately, even though we catch and ignore EEXIST, this call will
    # output a (needless) error message (no way to avoid that in Python).
    os.makedirs(dir_path, mode)
  except OSError as e:
    # Ignore 'already exists' in case user tried to start up several
    # resumable uploads concurrently from a machine where no tracker dir had
    # yet been created.
    if e.errno != errno.EEXIST:
      raise
| |
| |
def DivideAndCeil(dividend, divisor):
  """Computes ceil(dividend / divisor) without floating point division.

  Floor division and the remainder are used so that large integers do not
  lose precision.

  Args:
    dividend: Number being divided.
    divisor: Number to divide by.

  Returns:
    The quotient, rounded up.
  """
  whole, remainder = divmod(dividend, divisor)
  if remainder != 0:
    return whole + 1
  return whole
| |
| |
def GetGsutilStateDir():
  """Returns the directory used for gsutil state files, creating it if needed.

  Certain operations, such as cross-process credential sharing and
  resumable transfer tracking, need a known location for state files which
  are created by gsutil as-needed.

  This location should only be used for storing data that is required to be in
  a static location. It comes from the GSUtil 'state_dir' config option and
  defaults to '.gsutil' under the user's home directory.

  Returns:
    Path to directory for gsutil static state files.
  """
  default_dir = os.path.expanduser(os.path.join('~', '.gsutil'))
  state_dir = config.get('GSUtil', 'state_dir', default_dir)
  CreateDirIfNeeded(state_dir)
  return state_dir
| |
| |
def GetCredentialStoreFilename():
  """Returns the path of the 'credstore' file inside the gsutil state dir."""
  state_dir = GetGsutilStateDir()
  return os.path.join(state_dir, 'credstore')
| |
| |
def GetGceCredentialCacheFilename():
  """Returns the path of the 'gcecredcache' file inside the gsutil state dir."""
  state_dir = GetGsutilStateDir()
  return os.path.join(state_dir, 'gcecredcache')
| |
| |
def GetTabCompletionLogFilename():
  """Returns the path of 'tab-completion-logs' inside the gsutil state dir."""
  state_dir = GetGsutilStateDir()
  return os.path.join(state_dir, 'tab-completion-logs')
| |
| |
def GetTabCompletionCacheFilename():
  """Returns the tab completion cache path, creating its directory if needed.

  Returns:
    Path of the 'cache' file inside the 'tab-completion' subdirectory of the
    gsutil state dir.
  """
  tab_completion_dir = os.path.join(GetGsutilStateDir(), 'tab-completion')
  # Limit read permissions on the directory to owner for privacy. The mode is
  # written as 0o700 so the literal parses on Python 2.6+ and Python 3.
  CreateDirIfNeeded(tab_completion_dir, mode=0o700)
  return os.path.join(tab_completion_dir, 'cache')
| |
| |
def GetPrintableExceptionString(exc):
  """Returns a short UTF-8 encoded description of the exception.

  Args:
    exc: The exception to describe.

  Returns:
    unicode(exc) encoded as UTF-8, or str() of the exception's class when
    that text is empty.
  """
  encoded_text = unicode(exc).encode(UTF8)
  if encoded_text:
    return encoded_text
  return str(exc.__class__)
| |
| |
def PrintableStr(input_str):
  """Returns input_str encoded as UTF-8, passing None through unchanged."""
  if input_str is None:
    return None
  return input_str.encode(UTF8)
| |
| |
def PrintTrackerDirDeprecationWarningIfNeeded():
  """Warns on stderr if the deprecated resumable_tracker_dir option is set."""
  # TODO: Remove this along with the tracker_dir config value 1 year after
  # 4.6 release date. Use state_dir instead.
  if not config.has_option('GSUtil', 'resumable_tracker_dir'):
    return
  warning = ('Warning: you have set resumable_tracker_dir in your '
             '.boto configuration file. This configuration option is '
             'deprecated; please use the state_dir configuration '
             'option instead.\n')
  sys.stderr.write(warning)
| |
| |
# At import time: emit the tracker-dir deprecation warning (if applicable) and
# make sure the gsutil state directory exists.
PrintTrackerDirDeprecationWarningIfNeeded()
CreateDirIfNeeded(GetGsutilStateDir())
# Name of file where we keep the timestamp for the last time we checked whether
# a new version of gsutil is available.
LAST_CHECKED_FOR_GSUTIL_UPDATE_TIMESTAMP_FILE = (
    os.path.join(GetGsutilStateDir(), '.last_software_update_check'))
| |
| |
def HasConfiguredCredentials():
  """Determines whether any boto credentials are configured.

  Returns:
    True if the boto config contains a complete set of Google, Amazon, OAuth2
    or (when crypto support is available) service account credentials.
    Otherwise the auth handler boto finds for the Google provider, or None if
    there is none or it is only the no-op handler.
  """
  def _HasAllOptions(*option_names):
    return all(config.has_option('Credentials', name)
               for name in option_names)

  has_goog_creds = _HasAllOptions('gs_access_key_id', 'gs_secret_access_key')
  has_amzn_creds = _HasAllOptions('aws_access_key_id', 'aws_secret_access_key')
  has_oauth_creds = _HasAllOptions('gs_oauth2_refresh_token')
  has_service_account_creds = (
      HAS_CRYPTO and
      _HasAllOptions('gs_service_client_id', 'gs_service_key_file'))

  if any((has_goog_creds, has_amzn_creds, has_oauth_creds,
          has_service_account_creds)):
    return True

  try:
    auth_handler = boto.auth.get_auth_handler(
        GSConnection.DefaultHost, config, Provider('google'),
        requested_capability=['s3'])
  except NoAuthHandlerFound:
    return None

  # Exclude the no-op auth handler as indicating credentials are configured.
  # Note we can't use isinstance() here because the no-op module may not be
  # imported so we can't get a reference to the class type.
  handler_class = getattr(auth_handler, '__class__', None)
  if getattr(handler_class, '__name__', None) == 'NoOpAuth':
    return None
  return auth_handler
| |
| |
def ConfigureNoOpAuthIfNeeded():
  """Sets up no-op auth handler if no boto credentials are configured.

  Raises:
    CommandException: if a service account is configured but the crypto
        library it needs is not available.
  """
  if HasConfiguredCredentials():
    return

  service_account_without_crypto = (
      config.has_option('Credentials', 'gs_service_client_id')
      and not HAS_CRYPTO)
  if not service_account_without_crypto:
    # With no boto config file the user can still access publicly readable
    # buckets and objects.
    from gslib import no_op_auth_plugin  # pylint: disable=unused-variable
    return

  # Both error messages share the same explanation and differ only in the
  # suggested remedy.
  explanation = (
      'Your gsutil is configured with an OAuth2 service account, but '
      'you do not have PyOpenSSL or PyCrypto 2.6 or later installed. '
      'Service account authentication requires one of these libraries; ')
  if os.environ.get('CLOUDSDK_WRAPPER') == '1':
    remedy = (
        'please reactivate your service account via the gcloud auth '
        'command and ensure any gcloud packages necessary for '
        'service accounts are present.')
  else:
    remedy = (
        'please install either of them to proceed, or configure a '
        'different type of credentials with "gsutil config".')
  raise CommandException('\n'.join(textwrap.wrap(explanation + remedy)))
| |
| |
def GetConfigFilePath():
  """Returns the first boto config location that can be opened for reading.

  Returns:
    The path of the first readable entry in BotoConfigLocations, or the
    string 'no config found' if none can be opened.
  """
  for candidate in BotoConfigLocations:
    try:
      with open(candidate, 'r'):
        return candidate
    except IOError:
      continue
  return 'no config found'
| |
| |
def GetBotoConfigFileList():
  """Returns list of boto config files that exist.

  The candidates are boto's BotoConfigLocations plus, when set, the file
  named by the AWS_CREDENTIAL_FILE environment variable.

  Returns:
    List of existing config file paths, without duplicates, in candidate
    order.
  """
  # Work on a copy: appending to boto's module-level list would add the
  # AWS_CREDENTIAL_FILE entry again on every call and leak it to every other
  # user of BotoConfigLocations.
  config_paths = list(boto.pyami.config.BotoConfigLocations)
  if 'AWS_CREDENTIAL_FILE' in os.environ:
    config_paths.append(os.environ['AWS_CREDENTIAL_FILE'])

  existing_files = []
  seen = set()
  for config_path in config_paths:
    if config_path in seen:
      continue
    seen.add(config_path)
    if os.path.exists(config_path):
      existing_files.append(config_path)
  return existing_files
| |
| |
def GetCertsFile():
  """Configures and returns the CA Certificates file.

  If one is already configured (Boto 'ca_certificates_file'), use it.
  Otherwise use the cert roots distributed with gsutil, copying them to a temp
  file when they are not available as a regular file on disk.

  Returns:
    string filename of the certs file to use.

  Raises:
    CommandException: if the bundled certificates cannot be found.
  """
  certs_file = boto.config.get('Boto', 'ca_certificates_file', None)
  if certs_file:
    return certs_file

  with certs_file_lock:
    # Reuse a temp copy made by an earlier call, if any.
    if configured_certs_files:
      return configured_certs_files[0]

    disk_certs_file = os.path.abspath(
        os.path.join(gslib.GSLIB_DIR, 'data', 'cacerts.txt'))
    if not os.path.exists(disk_certs_file):
      # If the file is not present on disk, this means the gslib module
      # doesn't actually exist on disk anywhere. This can happen if it's
      # being imported from a zip file. Unfortunately, we have to copy the
      # certs file to a local temp file on disk because the underlying SSL
      # socket requires it to be a filesystem path.
      certs_data = pkgutil.get_data('gslib', 'data/cacerts.txt')
      if not certs_data:
        raise CommandException('Certificates file not found. Please '
                               'reinstall gsutil from scratch')
      fd, fname = tempfile.mkstemp(suffix='.txt', prefix='gsutil-cacerts')
      # The context manager closes the descriptor even if the write fails.
      with os.fdopen(fd, 'w') as f:
        f.write(certs_data)
      configured_certs_files.append(fname)
      disk_certs_file = fname
    return disk_certs_file
| |
| |
def GetCleanupFiles():
  """Returns a list of temp files to delete (if possible) when program exits.

  Returns:
    A new list holding the entries of configured_certs_files.
  """
  return list(configured_certs_files)
| |
| |
def ProxyInfoFromEnvironmentVar(proxy_env_var):
  """Builds an httplib2.ProxyInfo from a proxy environment variable.

  Args:
    proxy_env_var: Environment variable string to read, such as http_proxy or
      https_proxy.

  Returns:
    httplib2.ProxyInfo constructed from the environment string. When the
    variable is unset/empty or its name does not start with 'http', a
    ProxyInfo with no host and port 0 is returned.
  """
  proxy_url = os.environ.get(proxy_env_var)
  env_var_name = proxy_env_var.lower()
  if not (proxy_url and env_var_name.startswith('http')):
    return httplib2.ProxyInfo(httplib2.socks.PROXY_TYPE_HTTP, None, 0)

  # The text before the first underscore of the variable name is the protocol.
  proxy_protocol = env_var_name.split('_')[0]
  has_protocol = proxy_url.lower().startswith('http')
  if not has_protocol:
    # proxy_info_from_url requires a protocol, which is always http or https.
    proxy_url = proxy_protocol + '://' + proxy_url
  return httplib2.proxy_info_from_url(proxy_url, method=proxy_protocol)
| |
| |
def GetNewHttp(http_class=httplib2.Http, **kwargs):
  """Creates and returns a new httplib2.Http instance.

  Proxy settings are read from the Boto config section; if no proxy host and
  port are configured there, the first non-empty proxy environment variable
  is used instead.

  Args:
    http_class: Optional custom Http class to use.
    **kwargs: Arguments to pass to http_class constructor. 'ca_certs' and
        'timeout' are always overridden.

  Returns:
    An initialized httplib2.Http instance.
  """
  boto_config = boto.config
  # NOTE(review): proxy_type 3 presumably corresponds to
  # httplib2.socks.PROXY_TYPE_HTTP -- confirm.
  proxy_info = httplib2.ProxyInfo(
      proxy_type=3,
      proxy_host=boto_config.get('Boto', 'proxy', None),
      proxy_port=boto_config.getint('Boto', 'proxy_port', 0),
      proxy_user=boto_config.get('Boto', 'proxy_user', None),
      proxy_pass=boto_config.get('Boto', 'proxy_pass', None),
      proxy_rdns=boto_config.get('Boto', 'proxy_rdns', False))

  if not (proxy_info.proxy_host and proxy_info.proxy_port):
    # Fall back to using the environment variable.
    for proxy_env_var in ('http_proxy', 'https_proxy', 'HTTPS_PROXY'):
      if not os.environ.get(proxy_env_var):
        continue
      proxy_info = ProxyInfoFromEnvironmentVar(proxy_env_var)
      # Assume proxy_rdns is True if a proxy environment variable exists.
      proxy_info.proxy_rdns = boto_config.get('Boto', 'proxy_rdns', True)
      break

  kwargs.update(
      # Some installers don't package a certs file with httplib2, so use the
      # one included with gsutil.
      ca_certs=GetCertsFile(),
      # Use a non-infinite SSL timeout to avoid hangs during network flakiness.
      timeout=SSL_TIMEOUT)
  http = http_class(proxy_info=proxy_info, **kwargs)
  validate_certificates = config.getbool('Boto', 'https_validate_certificates')
  http.disable_ssl_certificate_validation = not validate_certificates
  return http
| |
| |
# Retry for 10 minutes with exponential backoff, which corresponds to
# the maximum Downtime Period specified in the GCS SLA
# (https://cloud.google.com/storage/sla)
def GetNumRetries():
  """Returns the Boto 'num_retries' config value (default 23)."""
  num_retries = config.getint('Boto', 'num_retries', 23)
  return num_retries
| |
| |
def GetMaxRetryDelay():
  """Returns the Boto 'max_retry_delay' config value (default 32)."""
  max_retry_delay = config.getint('Boto', 'max_retry_delay', 32)
  return max_retry_delay
| |
| |
# Resumable downloads and uploads make one HTTP call per chunk (and must be
# in multiples of 256KiB). Overridable for testing.
def GetJsonResumableChunkSize():
  """Returns the JSON resumable chunk size, as a multiple of 256 KiB.

  The value comes from the GSUtil 'json_resumable_chunk_size' config option
  (default 100 MiB). A configured value of 0 yields 256 KiB; any other value
  that is not a multiple of 256 KiB is rounded up to the next multiple.

  Returns:
    Chunk size in bytes.
  """
  chunk_multiple = 256 * 1024
  chunk_size = config.getint('GSUtil', 'json_resumable_chunk_size',
                             100 * 1024 * 1024)
  if chunk_size == 0:
    return chunk_multiple
  # The modulus must be taken against the whole 256 KiB multiple. Without the
  # parentheses, `chunk_size % 1024 * 256` parses as
  # `(chunk_size % 1024) * 256`, which skips rounding for sizes that are
  # multiples of 1 KiB but not of 256 KiB.
  remainder = chunk_size % chunk_multiple
  if remainder != 0:
    chunk_size += chunk_multiple - remainder
  return chunk_size
| |
| |
def JsonResumableChunkSizeDefined():
  """Returns True if the json_resumable_chunk_size config option has a value."""
  configured_value = config.get('GSUtil', 'json_resumable_chunk_size', None)
  if configured_value is None:
    return False
  return True
| |
| |
def _RoundToNearestExponent(num):
  """Picks the largest binary unit not exceeding num and scales num to it.

  Args:
    num: The number to scale.

  Returns:
    (i, value): i is the index into _EXP_STRINGS of the chosen unit; value is
    num divided by that unit's power of two, rounded to 2 decimal places.
  """
  index = 0
  last_index = len(_EXP_STRINGS) - 1
  # Advance while there is a larger unit and num reaches it.
  while index < last_index and num >= (2 ** _EXP_STRINGS[index + 1][0]):
    index += 1
  unit_exponent = _EXP_STRINGS[index][0]
  scaled = round(float(num) / 2 ** unit_exponent, 2)
  return index, scaled
| |
| |
def MakeHumanReadable(num):
  """Formats a byte count using binary size abbreviations.

  Args:
    num: The number, in bytes.

  Returns:
    A string form of the number using size abbreviations (KiB, MiB, etc.).
  """
  unit_index, scaled_value = _RoundToNearestExponent(num)
  byte_unit = _EXP_STRINGS[unit_index][1]
  return '%g %s' % (scaled_value, byte_unit)
| |
| |
def MakeBitsHumanReadable(num):
  """Formats a bit count using binary size abbreviations.

  Args:
    num: The number, in bits.

  Returns:
    A string form of the number using bit size abbreviations (Kibit, Mibit,
    etc.)
  """
  unit_index, scaled_value = _RoundToNearestExponent(num)
  bit_unit = _EXP_STRINGS[unit_index][2]
  return '%g %s' % (scaled_value, bit_unit)
| |
| |
def HumanReadableToBytes(human_string):
  """Converts a human-readable size string to a number of bytes.

  Args:
    human_string: A string supplied by user, e.g. '1M', '3 GiB'.
  Returns:
    An integer containing the number of bytes.
  Raises:
    ValueError: if the string does not start with a number.
  """
  human_string = human_string.lower()
  parsed = MATCH_HUMAN_BYTES.match(human_string)
  if not parsed:
    raise ValueError('Invalid byte string specified: %s' % human_string)

  value = float(parsed.group('num'))
  suffix = parsed.group('suffix')
  if suffix:
    # Scale by the power of two associated with the suffix.
    exponent = _EXP_STRINGS[SUFFIX_TO_SI[suffix]][0]
    value *= (2.0 ** exponent)
  return int(round(value))
| |
| |
def Percentile(values, percent, key=lambda x: x):
  """Computes a percentile of a sorted list by linear interpolation.

  Taken from: http://code.activestate.com/recipes/511478/

  Args:
    values: a list of numeric values. Note that the values MUST BE already
            sorted.
    percent: a float value from 0.0 to 1.0.
    key: optional key function to compute value from each element of the list
         of values.

  Returns:
    The percentile of the values, or None if values is empty.
  """
  if not values:
    return None
  rank = (len(values) - 1) * percent
  below = math.floor(rank)
  above = math.ceil(rank)
  if below == above:
    # The rank falls exactly on an element; no interpolation needed.
    return key(values[int(rank)])
  # Weight each neighbour by its closeness to the fractional rank.
  weighted_below = key(values[int(below)]) * (above - rank)
  weighted_above = key(values[int(above)]) * (rank - below)
  return weighted_below + weighted_above
| |
| |
def RemoveCRLFFromString(input_str):
  """Returns a copy of the input string without any \\r or \\n characters."""
  without_cr = input_str.replace('\r', '')
  return without_cr.replace('\n', '')
| |
| |
def UnaryDictToXml(message):
  """Generates XML representation of a nested dict.

  This dict contains exactly one top-level entry and an arbitrary number of
  2nd-level entries, e.g. capturing a WebsiteConfiguration message.

  Args:
    message: The dict encoding the message.

  Returns:
    XML string representation of the input dict, with the 2nd-level entries
    emitted in sorted key order.

  Raises:
    Exception: if dict does not contain exactly one top-level entry.
  """
  if len(message) != 1:
    raise Exception('Expected dict of size 1, got size %d' % len(message))

  # next(iter(...)) fetches the single entry without indexing, which works
  # whether items() returns a list (Python 2) or a view (Python 3).
  name, content = next(iter(message.items()))
  element_type = ElementTree.Element(name)
  for element_property, value in sorted(content.items()):
    node = ElementTree.SubElement(element_type, element_property)
    node.text = value
  return ElementTree.tostring(element_type)
| |
| |
def LookUpGsutilVersion(gsutil_api, url_str):
  """Looks up the gsutil version recorded on a gsutil tarball object.

  The version is read from the 'gsutil_version' entry of the object's
  metadata.

  Args:
    gsutil_api: gsutil Cloud API to use when retrieving gsutil tarball.
    url_str: tarball URL to retrieve (such as 'gs://pub/gsutil.tar.gz').

  Returns:
    Version string if URL is a cloud URL whose object metadata contains a
    'gsutil_version' entry, else None.
  """
  url = StorageUrlFromString(url_str)
  if not url.IsCloudUrl():
    return None
  obj = gsutil_api.GetObjectMetadata(url.bucket_name, url.object_name,
                                     provider=url.scheme,
                                     fields=['metadata'])
  properties = obj.metadata and obj.metadata.additionalProperties
  for prop in properties or ():
    if prop.key == 'gsutil_version':
      return prop.value
  return None
| |
| |
def GetGsutilVersionModifiedTime():
  """Returns unix timestamp of when the VERSION file was last modified.

  Returns:
    The modification time truncated to an int, or 0 if gslib.VERSION_FILE is
    not set.
  """
  version_file = gslib.VERSION_FILE
  if version_file:
    return int(os.path.getmtime(gslib.VERSION_FILE))
  return 0
| |
| |
def IsRunningInteractively():
  """Returns True if stdout, stderr and stdin are all attached to a TTY."""
  standard_streams = (sys.stdout, sys.stderr, sys.stdin)
  return all(stream.isatty() for stream in standard_streams)
| |
| |
def _HttpsValidateCertifcatesEnabled():
  """Returns whether Boto 'https_validate_certificates' is enabled.

  getbool is used (as GetNewHttp does for the same option) so that a
  configured value of 'False' is not treated as truthy, which is what the raw
  string returned by config.get would be.

  Returns:
    True if the option is unset or set to 'true' (case-insensitive), else
    False.
  """
  return config.getbool('Boto', 'https_validate_certificates', True)

# Evaluated once, when this module is imported.
CERTIFICATE_VALIDATION_ENABLED = _HttpsValidateCertifcatesEnabled()
| |
| |
def _BotoIsSecure():
  """Returns whether the Boto 'is_secure' option is enabled.

  getbool is used so that a configured value of 'False' is not treated as
  truthy, which is what the raw string returned by config.get would be.

  Returns:
    True if the option is unset or set to 'true' (case-insensitive), else
    False.
  """
  return config.getbool('Boto', 'is_secure', True)

# Evaluated once, when this module is imported.
BOTO_IS_SECURE = _BotoIsSecure()
| |
| |
def ResumableThreshold():
  """Returns the GSUtil 'resumable_threshold' config value (default 8 MiB)."""
  threshold = config.getint('GSUtil', 'resumable_threshold', EIGHT_MIB)
  return threshold
| |
| |
# pylint: disable=too-many-statements
def PrintFullInfoAboutObject(bucket_listing_ref, incl_acl=True):
  """Print full info for given object (like what displays for gsutil ls -L).

  Each populated field of the object is printed to stdout on its own
  tab-indented line, following a header line containing the object's URL.

  Args:
    bucket_listing_ref: BucketListingRef being listed.
                        Must have ref_type OBJECT and a populated root_object
                        with the desired fields.
    incl_acl: True if ACL info should be output.

  Returns:
    Tuple (number of objects, object_length). Both are 0 when the object is
    identified as a delete marker.

  Raises:
    Exception: if calling bug encountered.
  """
  url_str = bucket_listing_ref.url_string
  storage_url = StorageUrlFromString(url_str)
  obj = bucket_listing_ref.root_object

  # NOTE(review): the loop further down reads .key/.value from the entries of
  # additionalProperties, so this membership test compares the GUID against
  # whole entries rather than their keys -- confirm this matches as intended.
  if (obj.metadata and S3_DELETE_MARKER_GUID in
      obj.metadata.additionalProperties):
    # Delete markers count as neither an object nor any bytes.
    num_bytes = 0
    num_objs = 0
    url_str += '<DeleteMarker>'
  else:
    num_bytes = obj.size
    num_objs = 1

  print '%s:' % url_str.encode(UTF8)
  if obj.updated:
    print '\tCreation time:\t\t%s' % obj.updated.strftime(
        '%a, %d %b %Y %H:%M:%S GMT')
  if obj.cacheControl:
    print '\tCache-Control:\t\t%s' % obj.cacheControl
  if obj.contentDisposition:
    print '\tContent-Disposition:\t\t%s' % obj.contentDisposition
  if obj.contentEncoding:
    print '\tContent-Encoding:\t\t%s' % obj.contentEncoding
  if obj.contentLanguage:
    print '\tContent-Language:\t%s' % obj.contentLanguage
  print '\tContent-Length:\t\t%s' % obj.size
  print '\tContent-Type:\t\t%s' % obj.contentType
  if obj.componentCount:
    print '\tComponent-Count:\t%d' % obj.componentCount
  # Metadata entries whose key is one of S3_MARKER_GUIDS are collected here
  # (key -> value) instead of being printed as user metadata.
  marker_props = {}
  if obj.metadata and obj.metadata.additionalProperties:
    non_marker_props = []
    for add_prop in obj.metadata.additionalProperties:
      if add_prop.key not in S3_MARKER_GUIDS:
        non_marker_props.append(add_prop)
      else:
        marker_props[add_prop.key] = add_prop.value
    if non_marker_props:
      print '\tMetadata:'
      for ap in non_marker_props:
        meta_string = '\t\t%s:\t\t%s' % (ap.key, ap.value)
        print meta_string.encode(UTF8)
  if obj.customerEncryption:
    # With customer encryption, a missing hash is reported as 'encrypted'.
    if not obj.crc32c: print '\tHash (crc32c):\t\tencrypted'
    if not obj.md5Hash: print '\tHash (md5):\t\tencrypted'
    print ('\tEncryption algorithm:\t%s' %
           obj.customerEncryption.encryptionAlgorithm)
    print '\tEncryption key SHA256:\t%s' % obj.customerEncryption.keySha256
  if obj.crc32c: print '\tHash (crc32c):\t\t%s' % obj.crc32c
  if obj.md5Hash: print '\tHash (md5):\t\t%s' % obj.md5Hash
  # Strip any surrounding quote characters from the ETag before printing.
  print '\tETag:\t\t\t%s' % obj.etag.strip('"\'')
  if obj.generation:
    generation_str = GenerationFromUrlAndString(storage_url, obj.generation)
    print '\tGeneration:\t\t%s' % generation_str
  if obj.metageneration:
    print '\tMetageneration:\t\t%s' % obj.metageneration
  if incl_acl:
    # JSON API won't return acls as part of the response unless we have
    # full control scope
    if obj.acl:
      print '\tACL:\t\t%s' % AclTranslation.JsonFromMessage(obj.acl)
    elif S3_ACL_MARKER_GUID in marker_props:
      # Fall back to an ACL carried in the marker metadata, if present.
      print '\tACL:\t\t%s' % marker_props[S3_ACL_MARKER_GUID]
    else:
      print ('\tACL:\t\t\tACCESS DENIED. Note: you need OWNER '
             'permission\n\t\t\t\ton the object to read its ACL.')

  return (num_objs, num_bytes)
| |
| |
def CompareVersions(first, second):
  """Compares the first and second gsutil version strings.

  For example, 3.33 > 3.7, and 4.1 is a greater major version than 3.33.
  Does not handle multiple periods (e.g. 3.3.4) or complicated suffixes
  (e.g., 3.3RC4 vs. 3.3RC5). A version string with a suffix is treated as
  less than its non-suffix counterpart (e.g. 3.32 > 3.32pre).

  Args:
    first: First gsutil version string.
    second: Second gsutil version string.

  Returns:
    (g, m):
       g is True if first known to be greater than second, else False.
       m is True if first known to be greater by at least 1 major version,
         else False.
  """
  first_match = VERSION_MATCHER.match(str(first))
  second_match = VERSION_MATCHER.match(str(second))

  # If passed strings we don't know how to handle, be conservative.
  if not (first_match and second_match):
    return (False, False)

  def _SplitVersion(match):
    """Returns (major, minor, suffix); a missing minor counts as 0."""
    minor_text = match.group('min')
    return (int(match.group('maj')),
            (int(minor_text) if minor_text else 0),
            match.group('suffix'))

  major1, minor1, suffix1 = _SplitVersion(first_match)
  major2, minor2, suffix2 = _SplitVersion(second_match)

  if major1 != major2:
    is_greater_major = major1 > major2
    return (is_greater_major, is_greater_major)
  if minor1 != minor2:
    return (minor1 > minor2, False)
  # Same numbers: first is only greater if second alone carries a suffix.
  return (bool(suffix2) and not suffix1, False)
| |
| |
def _IncreaseSoftLimitForResource(resource_name, fallback_value):
  """Raises the soft limit of a resource as far as possible.

  The soft limit is used for this process (and its children), but the
  hard limit is set by the system and cannot be exceeded.

  We will first try to set the soft limit to the hard limit's value; if that
  fails, we will try to set the soft limit to the fallback_value iff this would
  increase the soft limit.

  Args:
    resource_name: Name of the resource to increase the soft limit for.
    fallback_value: Fallback value to be used if we couldn't set the
                    soft value to the hard value (e.g., if the hard value
                    is "unlimited").

  Returns:
    Current soft limit for the resource (after any changes we were able to
    make), or -1 if the resource doesn't exist.
  """
  limit_errors = (resource.error, ValueError)

  # Get the value of the resource.
  try:
    soft_limit, hard_limit = resource.getrlimit(resource_name)
  except limit_errors:
    # The resource wasn't present, so we can't do anything here.
    return -1

  # Try to set the value of the soft limit to the value of the hard limit.
  if hard_limit > soft_limit:  # Some OS's report 0 for "unlimited".
    try:
      resource.setrlimit(resource_name, (hard_limit, hard_limit))
    except limit_errors:
      # We'll ignore this and try the fallback value.
      pass
    else:
      return hard_limit

  # The fallback is only worth trying if it would raise the soft limit.
  if soft_limit >= fallback_value:
    return soft_limit

  # Try to set the value of the soft limit to the fallback value.
  try:
    resource.setrlimit(resource_name, (fallback_value, hard_limit))
  except limit_errors:
    # We couldn't change the soft limit, so just report the current
    # value of the soft limit.
    return soft_limit
  return fallback_value
| |
| |
def GetCloudApiInstance(cls, thread_state=None):
  """Returns the gsutil Cloud API instance the caller should use.

  Cloud API implementations are not guaranteed to be thread-safe, so each
  thread needs an instance of its own. Those instances are handed to each
  thread by the thread pool logic in command.

  Args:
    cls: Command class whose gsutil_api attribute is used in the
         single-threaded case.
    thread_state: Per thread state from the calling thread containing a
                  gsutil Cloud API instance, or None.

  Returns:
    thread_state if it is truthy, otherwise cls.gsutil_api.
  """
  if thread_state:
    return thread_state
  return cls.gsutil_api
| |
| |
def GetFileSize(fp, position_to_eof=False):
  """Returns the size of a seekable file object.

  Args:
    fp: File object supporting tell() and seek().
    position_to_eof: If True, fp is left positioned at end-of-file; otherwise
                     fp is returned to the position it had on entry.

  Returns:
    Offset of the end of the file, as reported by fp.tell().
  """
  if position_to_eof:
    fp.seek(0, os.SEEK_END)
    return fp.tell()

  # Remember where the caller was so the position can be put back afterwards.
  saved_position = fp.tell()
  fp.seek(0, os.SEEK_END)
  file_size = fp.tell()
  fp.seek(saved_position)
  return file_size
| |
| |
def GetStreamFromFileUrl(storage_url):
  """Returns a readable stream for the given file URL.

  Args:
    storage_url: URL object providing IsStream() and object_name.

  Returns:
    sys.stdin if storage_url.IsStream() is true; otherwise a file object
    newly opened on storage_url.object_name in binary read mode. This
    function does not close the returned file.
  """
  if not storage_url.IsStream():
    return open(storage_url.object_name, 'rb')
  return sys.stdin
| |
| |
def UrlsAreForSingleProvider(url_args):
  """Tests whether the URLs are all for a single provider.

  The scheme of the first URL is taken as the provider, and every later URL
  is compared against it.

  Args:
    url_args: Strings to check.

  Returns:
    True if URLs are for single provider, False otherwise (including when
    url_args is empty).
  """
  first_scheme = None
  for url_str in url_args:
    scheme = StorageUrlFromString(url_str).scheme
    if first_scheme and scheme != first_scheme:
      return False
    if not first_scheme:
      first_scheme = scheme
  return first_scheme is not None
| |
| |
def HaveFileUrls(args_to_check):
  """Checks whether args_to_check contain any file URLs.

  Arguments are parsed one at a time and checking stops at the first file
  URL found.

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any file URLs, False otherwise.
  """
  return any(StorageUrlFromString(arg).IsFileUrl() for arg in args_to_check)
| |
| |
def HaveProviderUrls(args_to_check):
  """Checks whether args_to_check contains any provider URLs (like 'gs://').

  Arguments are parsed one at a time and checking stops at the first
  provider URL found.

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any provider URLs, False otherwise.
  """
  parsed_urls = (StorageUrlFromString(arg) for arg in args_to_check)
  return any(parsed.IsCloudUrl() and parsed.IsProvider()
             for parsed in parsed_urls)
| |
# Result type of CheckMultiprocessingAvailableAndInit. It has to live at the
# module level so that instances can be pickled across processes.
MultiprocessingIsAvailableResult = collections.namedtuple(
    typename='MultiprocessingIsAvailableResult',
    field_names=('is_available', 'stack_trace'))
| |
| |
def CheckMultiprocessingAvailableAndInit(logger=None):
  """Checks if multiprocessing is available.

  There are some environments in which there is no way to use multiprocessing
  logic that's built into Python (e.g., if /dev/shm is not available, then
  we can't create semaphores). This simply tries out a few things that will be
  needed to make sure the environment can support the pieces of the
  multiprocessing module that we need.

  If multiprocessing is available, this performs necessary initialization for
  multiprocessing. See gslib.command.InitializeMultiprocessingVariables for
  an explanation of why this is necessary.

  The outcome of the check is cached in module-level globals, so the check
  itself runs at most once per process (except on Windows, where the function
  returns before anything is cached). Diagnostics are only logged when
  multiprocessing is not available.

  Args:
    logger: logging.logger to use for debug output.

  Returns:
    MultiprocessingIsAvailableResult namedtuple
    (is_available, stack_trace):
      is_available: True iff the multiprocessing module is available for use.
      stack_trace: The stack trace generated by the call we tried that failed,
                   or None if nothing failed.
  """
  # pylint: disable=global-variable-undefined
  global cached_multiprocessing_is_available
  global cached_multiprocessing_check_stack_trace
  global cached_multiprocessing_is_available_message
  global manager
  if cached_multiprocessing_is_available is not None:
    # Repeat the diagnostics only if the earlier check failed. The cached
    # message says that multiprocessing is unavailable, so logging it after a
    # successful check would be a false warning.
    if logger and not cached_multiprocessing_is_available:
      logger.debug(cached_multiprocessing_check_stack_trace)
      logger.warn(cached_multiprocessing_is_available_message)
    return MultiprocessingIsAvailableResult(
        is_available=cached_multiprocessing_is_available,
        stack_trace=cached_multiprocessing_check_stack_trace)

  if IS_WINDOWS:
    # Note: this result is not cached, so the warning below is logged on every
    # call that supplies a logger.
    message = """
Multiple processes are not supported on Windows. Operations requesting
parallelism will be executed with multiple threads in a single process only.
"""
    if logger:
      logger.warn(message)
    return MultiprocessingIsAvailableResult(is_available=False,
                                            stack_trace=None)

  stack_trace = None
  multiprocessing_is_available = True
  # Base warning; more specific advice is appended below before the failure
  # is reported.
  message = """
You have requested multiple processes for an operation, but the
required functionality of Python\'s multiprocessing module is not available.
Operations requesting parallelism will be executed with multiple threads in a
single process only.
"""
  try:
    # Fails if /dev/shm (or some equivalent thereof) is not available for use
    # (e.g., there's no implementation, or we can't write to it, etc.).
    try:
      multiprocessing.Value('i', 0)
    except:  # pylint: disable=bare-except
      message += """
Please ensure that you have write access to both /dev/shm and /run/shm.
"""
      raise  # We'll handle this in one place below.

    manager = multiprocessing.Manager()

    # Check that the max number of open files is reasonable. Always check this
    # after we're sure that the basic multiprocessing functionality is
    # available, since this won't matter unless that's true.
    limit = -1
    if HAS_RESOURCE_MODULE:
      # Try to set this with both resource names - RLIMIT_NOFILE for most Unix
      # platforms, and RLIMIT_OFILE for BSD. Ignore AttributeError because the
      # "resource" module is not guaranteed to know about these names.
      try:
        limit = max(limit,
                    _IncreaseSoftLimitForResource(
                        resource.RLIMIT_NOFILE,
                        MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
      except AttributeError:
        pass
      try:
        limit = max(limit,
                    _IncreaseSoftLimitForResource(
                        resource.RLIMIT_OFILE, MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
      except AttributeError:
        pass

    if limit < MIN_ACCEPTABLE_OPEN_FILES_LIMIT:
      message += ("""
Your max number of open files, %s, is too low to allow safe multiprocessing.
On Linux you can fix this by adding something like "ulimit -n 10000" to your
~/.bashrc or equivalent file and opening a new terminal.

On MacOS, you may also need to run a command like this once (in addition to the
above instructions), which might require a restart of your system to take
effect:
launchctl limit maxfiles 10000

Alternatively, edit /etc/launchd.conf with something like:
limit maxfiles 10000 10000

""" % limit)
      raise Exception('Max number of open files, %s, is too low.' % limit)
  except:  # pylint: disable=bare-except
    # Any failure above means multiprocessing can't be used; record why.
    stack_trace = traceback.format_exc()
    multiprocessing_is_available = False
    if logger is not None:
      logger.debug(stack_trace)
      logger.warn(message)

  # Set the cached values so that we never need to do this check again.
  cached_multiprocessing_is_available = multiprocessing_is_available
  cached_multiprocessing_check_stack_trace = stack_trace
  cached_multiprocessing_is_available_message = message
  return MultiprocessingIsAvailableResult(
      is_available=cached_multiprocessing_is_available,
      stack_trace=cached_multiprocessing_check_stack_trace)
| |
| |
def CreateLock():
  """Returns either a multiprocessing lock or a threading lock.

  A lock from the multiprocessing manager is used iff
  CheckMultiprocessingAvailableAndInit reports that the parts of the
  multiprocessing module needed for parallel operations are available;
  otherwise a plain threading lock is used.

  Returns:
    Multiprocessing or threading lock.
  """
  multiprocessing_check = CheckMultiprocessingAvailableAndInit()
  if not multiprocessing_check.is_available:
    return threading.Lock()
  return manager.Lock()
| |
| |
def IsCloudSubdirPlaceholder(url, blr=None):
  """Determines if URL is a cloud subdir placeholder.

  GUI tools (like the GCS cloud console) let users create empty "folders" by
  creating a placeholder object, and parts of gsutil need to treat those
  placeholder objects specially. For example, gsutil rsync needs to avoid
  downloading those objects because they can cause conflicts (see comments in
  rsync command for details).

  Two cases are currently detected:
    - Cloud objects whose name ends with '_$folder$'
    - Cloud objects whose name ends with '/' (and whose size is 0, when the
      size is known)

  Args:
    url: The URL to be checked.
    blr: BucketListingRef to check, or None if not available.
         If None, size won't be checked.

  Returns:
    True/False.
  """
  if not url.IsCloudUrl():
    return False

  name = url.url_string
  if name.endswith('_$folder$'):
    return True

  # Without an object listing the size is unknown and is treated as 0.
  object_size = blr.root_object.size if (blr and blr.IsObject()) else 0
  return object_size == 0 and name.endswith('/')
| |
| |
def GetTermLines():
  """Returns number of terminal lines.

  The window size is queried via the TIOCGWINSZ ioctl on stdin, stdout and
  stderr (in that order), then on the controlling terminal. If none of those
  yields a value, the LINES environment variable is used, and finally
  _DEFAULT_LINES.

  Returns:
    Number of terminal lines, or _DEFAULT_LINES if it can't be determined
    (including when LINES is set to something that is not an integer).
  """
  # fcntl isn't supported in Windows.
  try:
    import fcntl  # pylint: disable=g-import-not-at-top
    import termios  # pylint: disable=g-import-not-at-top
  except ImportError:
    return _DEFAULT_LINES

  def ioctl_GWINSZ(fd):  # pylint: disable=invalid-name
    """Returns the row count reported for fd, or 0 on any failure."""
    try:
      # The ioctl fills in the window size; the first short is the row count.
      return struct.unpack(
          'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))[0]
    except:  # pylint: disable=bare-except
      return 0  # Failure (so will retry on different file descriptor below).

  # Try to find a valid number of lines from termio for stdin, stdout,
  # or stderr, in that order.
  ioc = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
  if not ioc:
    # Fall back to the controlling terminal. This is best effort only.
    try:
      fd = os.open(os.ctermid(), os.O_RDONLY)
      try:
        ioc = ioctl_GWINSZ(fd)
      finally:
        # Always release the descriptor once it has been opened.
        os.close(fd)
    except:  # pylint: disable=bare-except
      pass
  if not ioc:
    ioc = os.environ.get('LINES', _DEFAULT_LINES)
  try:
    return int(ioc)
  except (TypeError, ValueError):
    # LINES can hold arbitrary text (or be empty); don't crash on it.
    return _DEFAULT_LINES
| |
| |
def FixWindowsEncodingIfNeeded(input_str):
  """Converts a Windows CP1252 encoded string to UTF8 when needed.

  Windows doesn't provide a way to set UTF-8 for string encodings; you can set
  the system locale (see
  http://windows.microsoft.com/en-us/windows/change-system-locale#1TC=windows-7)
  but that takes you to a "Change system locale" dropdown that just lists
  languages (e.g., "English (United States)"). Instead, this function relies
  on the module-level IS_CP1252 flag: when it is set, the input is decoded
  from CP1252 and re-encoded as UTF8.

  Args:
    input_str: The input string.

  Returns:
    The converted string (or the original, if conversion wasn't needed).
  """
  if not IS_CP1252:
    return input_str
  return input_str.decode(WINDOWS_1252).encode(UTF8)
| |
| |
class GsutilStreamHandler(logging.StreamHandler):
  """StreamHandler for gsutil that tolerates flushing a closed stream."""

  def flush(self):
    """Flushes the stream, ignoring the ValueError raised if it is closed.

    This override works around a python 2.6 bug. The python logging module
    tries to flush all stream handlers at exit, and flushing a StreamHandler
    whose file is already closed throws an exception. Our unit tests
    temporarily redirect stderr, which causes the default StreamHandler to
    open its stream against a temporary file; by the time the process shuts
    down that file is closed. This was fixed in Python 2.7, but the override
    is kept to remove the flake from Python 2.6.
    """
    base_flush = logging.StreamHandler.flush
    try:
      base_flush(self)
    except ValueError:
      pass
| |
| |
def StdinIterator():
  """A generator function that yields the lines read from stdin.

  Yields:
    Each line of sys.stdin with its trailing whitespace removed.
  """
  for raw_line in sys.stdin:
    # rstrip() with no argument drops the line terminator (CR/LF) together
    # with any other trailing whitespace.
    stripped_line = raw_line.rstrip()
    yield stripped_line
| |
| |
def ConvertRecursiveToFlatWildcard(url_strs):
  """A generator that appends '**' to each url string in url_strs.

  Args:
    url_strs: Iterable of URL strings.

  Yields:
    Each URL string, in order, with '**' added to the end.
  """
  for recursive_url_str in url_strs:
    flat_wildcard = '%s**' % recursive_url_str
    yield flat_wildcard