# -*- coding: utf-8 -*-
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wildcard iterator class and supporting functions."""
from __future__ import absolute_import
import fnmatch
import glob
import os
import re
import sys
import textwrap
from gslib.bucket_listing_ref import BucketListingBucket
from gslib.bucket_listing_ref import BucketListingObject
from gslib.bucket_listing_ref import BucketListingPrefix
from gslib.cloud_api import AccessDeniedException
from gslib.cloud_api import CloudApi
from gslib.cloud_api import NotFoundException
from gslib.exception import CommandException
from gslib.storage_url import ContainsWildcard
from gslib.storage_url import StorageUrlFromString
from gslib.storage_url import StripOneSlash
from gslib.storage_url import WILDCARD_REGEX
from gslib.translation_helper import GenerationFromUrlAndString
from gslib.util import UTF8
FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
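# Illustrative sketch (comments only, not executed): FLAT_LIST_REGEX splits a
# recursive wildcard at its first '**'. For example:
#   m = FLAT_LIST_REGEX.match('/tmp/dir/**/*.txt')
#   m.group('before')  # -> '/tmp/dir/'
#   m.group('after')   # -> '/*.txt'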
class WildcardIterator(object):
"""Class for iterating over Google Cloud Storage strings containing wildcards.
The base class is abstract; you should instantiate using the
wildcard_iterator() static factory method, which chooses the right
implementation depending on the base string.
"""
# TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both
# and make one return the other.
def __repr__(self):
"""Returns string representation of WildcardIterator."""
return 'WildcardIterator(%s)' % self.wildcard_url.url_string
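# A minimal usage sketch (comments only, not executed). Assumes a configured
# CloudApi implementation named gsutil_api; 'gs://my-bucket' is a placeholder:
#   it = CreateWildcardIterator('gs://my-bucket/abc/*.txt', gsutil_api)
#   for blr in it.IterObjects(bucket_listing_fields=['name', 'size']):
#     print blr.url_string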
class CloudWildcardIterator(WildcardIterator):
"""WildcardIterator subclass for buckets, bucket subdirs and objects.
  Iterates over BucketListingRefs matching the URL string wildcard. It's
  much more efficient to request metadata that's already available in the
  bucket listing results (for example, each object's name and size) than to
  issue a separate metadata request per object.
"""
def __init__(self, wildcard_url, gsutil_api, all_versions=False,
debug=0, project_id=None):
"""Instantiates an iterator that matches the wildcard URL.
Args:
wildcard_url: CloudUrl that contains the wildcard to iterate.
gsutil_api: Cloud storage interface. Passed in for thread safety, also
settable for testing/mocking.
all_versions: If true, the iterator yields all versions of objects
matching the wildcard. If false, yields just the live
object version.
debug: Debug level to control debug output for iterator.
project_id: Project ID to use for bucket listings.
"""
self.wildcard_url = wildcard_url
self.all_versions = all_versions
self.debug = debug
self.gsutil_api = gsutil_api
self.project_id = project_id
def __iter__(self, bucket_listing_fields=None,
expand_top_level_buckets=False):
"""Iterator that gets called when iterating over the cloud wildcard.
In the case where no wildcard is present, returns a single matching object,
single matching prefix, or one of each if both exist.
Args:
bucket_listing_fields: Iterable fields to include in bucket listings.
Ex. ['name', 'acl']. Iterator is
responsible for converting these to list-style
format ['items/name', 'items/acl'] as well as
adding any fields necessary for listing such as
                             prefixes. API implementation is responsible for
adding pagination fields. If this is None,
all fields are returned.
expand_top_level_buckets: If true, yield no BUCKET references. Instead,
expand buckets into top-level objects and
prefixes.
Yields:
BucketListingRef of type BUCKET, OBJECT or PREFIX.
"""
single_version_request = self.wildcard_url.HasGeneration()
# For wildcard expansion purposes, we need at a minimum the name of
# each object and prefix. If we're not using the default of requesting
# all fields, make sure at least these are requested. The Cloud API
# tolerates specifying the same field twice.
get_fields = None
if bucket_listing_fields:
get_fields = set()
for field in bucket_listing_fields:
get_fields.add(field)
bucket_listing_fields = self._GetToListFields(
get_fields=bucket_listing_fields)
bucket_listing_fields.update(['items/name', 'prefixes'])
get_fields.update(['name'])
# If we're making versioned requests, ensure generation and
# metageneration are also included.
if single_version_request or self.all_versions:
bucket_listing_fields.update(['items/generation',
'items/metageneration'])
get_fields.update(['generation', 'metageneration'])
# Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
# iterate over the expanded bucket strings and handle any object
# wildcarding.
for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
bucket_url_string = bucket_listing_ref.url_string
if self.wildcard_url.IsBucket():
# IsBucket() guarantees there are no prefix or object wildcards, and
# thus this is a top-level listing of buckets.
if expand_top_level_buckets:
url = StorageUrlFromString(bucket_url_string)
for obj_or_prefix in self.gsutil_api.ListObjects(
url.bucket_name, delimiter='/', all_versions=self.all_versions,
provider=self.wildcard_url.scheme,
fields=bucket_listing_fields):
if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data,
with_version=self.all_versions)
else: # CloudApi.CsObjectOrPrefixType.PREFIX:
yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
else:
yield bucket_listing_ref
else:
# By default, assume a non-wildcarded URL is an object, not a prefix.
# This prevents unnecessary listings (which are slower, more expensive,
# and also subject to eventual consistency).
if (not ContainsWildcard(self.wildcard_url.url_string) and
self.wildcard_url.IsObject() and not self.all_versions):
try:
get_object = self.gsutil_api.GetObjectMetadata(
self.wildcard_url.bucket_name,
self.wildcard_url.object_name,
generation=self.wildcard_url.generation,
provider=self.wildcard_url.scheme,
fields=get_fields)
yield self._GetObjectRef(
self.wildcard_url.bucket_url_string, get_object,
with_version=(self.all_versions or single_version_request))
return
except (NotFoundException, AccessDeniedException):
# It's possible this is a prefix - try to list instead.
pass
# Expand iteratively by building prefix/delimiter bucket listing
# request, filtering the results per the current level's wildcard
# (if present), and continuing with the next component of the
# wildcard. See _BuildBucketFilterStrings() documentation for details.
if single_version_request:
url_string = '%s%s#%s' % (bucket_url_string,
self.wildcard_url.object_name,
self.wildcard_url.generation)
else:
# Rstrip any prefixes to correspond with rstripped prefix wildcard
# from _BuildBucketFilterStrings().
url_string = '%s%s' % (bucket_url_string,
StripOneSlash(self.wildcard_url.object_name)
or '/') # Cover root object named '/' case.
urls_needing_expansion = [url_string]
while urls_needing_expansion:
url = StorageUrlFromString(urls_needing_expansion.pop(0))
(prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
self._BuildBucketFilterStrings(url.object_name))
prog = re.compile(fnmatch.translate(prefix_wildcard))
# List bucket for objects matching prefix up to delimiter.
for obj_or_prefix in self.gsutil_api.ListObjects(
url.bucket_name, prefix=prefix, delimiter=delimiter,
all_versions=self.all_versions or single_version_request,
provider=self.wildcard_url.scheme,
fields=bucket_listing_fields):
if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
gcs_object = obj_or_prefix.data
if prog.match(gcs_object.name):
if not suffix_wildcard or (
StripOneSlash(gcs_object.name) == suffix_wildcard):
if not single_version_request or (
self._SingleVersionMatches(gcs_object.generation)):
yield self._GetObjectRef(
bucket_url_string, gcs_object, with_version=(
self.all_versions or single_version_request))
else: # CloudApi.CsObjectOrPrefixType.PREFIX
prefix = obj_or_prefix.data
# If the prefix ends with a slash, remove it. Note that we only
# remove one slash so that we can successfully enumerate dirs
# containing multiple slashes.
rstripped_prefix = StripOneSlash(prefix)
if prog.match(rstripped_prefix):
if suffix_wildcard and rstripped_prefix != suffix_wildcard:
# There's more wildcard left to expand.
url_append_string = '%s%s' % (
bucket_url_string, rstripped_prefix + '/' +
suffix_wildcard)
urls_needing_expansion.append(url_append_string)
else:
# No wildcard to expand, just yield the prefix
yield self._GetPrefixRef(bucket_url_string, prefix)
def _BuildBucketFilterStrings(self, wildcard):
"""Builds strings needed for querying a bucket and filtering results.
This implements wildcard object name matching.
Args:
wildcard: The wildcard string to match to objects.
Returns:
(prefix, delimiter, prefix_wildcard, suffix_wildcard)
where:
prefix is the prefix to be sent in bucket GET request.
delimiter is the delimiter to be sent in bucket GET request.
prefix_wildcard is the wildcard to be used to filter bucket GET results.
        suffix_wildcard is the wildcard to be appended to filtered bucket GET
results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix=abc/d, delimiter=/, prefix_wildcard=abc/d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
listing request will then produce a listing result set that can be
filtered using this prefix_wildcard; and we'd use this suffix_wildcard
to feed into the next call(s) to _BuildBucketFilterStrings(), for the
next iteration of listing/filtering.
Raises:
AssertionError if wildcard doesn't contain any wildcard chars.
"""
# Generate a request prefix if the object name part of the wildcard starts
# with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
match = WILDCARD_REGEX.search(wildcard)
if not match:
# Input "wildcard" has no wildcard chars, so just return tuple that will
# cause a bucket listing to match the given input wildcard. Example: if
# previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
# the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix='dir/abc',
# delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
prefix = wildcard
delimiter = '/'
prefix_wildcard = wildcard
suffix_wildcard = ''
else:
if match.start() > 0:
# Wildcard does not occur at beginning of object name, so construct a
# prefix string to send to server.
prefix = wildcard[:match.start()]
wildcard_part = wildcard[match.start():]
else:
prefix = None
wildcard_part = wildcard
end = wildcard_part.find('/')
if end != -1:
wildcard_part = wildcard_part[:end+1]
# Remove trailing '/' so we will match gs://bucket/abc* as well as
# gs://bucket/abc*/ with the same wildcard regex.
prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
suffix_wildcard = wildcard[match.end():]
end = suffix_wildcard.find('/')
if end == -1:
suffix_wildcard = ''
else:
suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard
      # contains '**', don't send a delimiter and append suffix_wildcard
      # to the end of prefix_wildcard.
if prefix_wildcard.find('**') != -1:
delimiter = None
prefix_wildcard += suffix_wildcard
suffix_wildcard = ''
else:
delimiter = '/'
# The following debug output is useful for tracing how the algorithm
# walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
if self.debug > 1:
sys.stderr.write(
'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
'prefix_wildcard=%s, suffix_wildcard=%s\n' %
(wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
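  # Worked example (comments only), following the docstring's
  # gs://bucket/abc/d*e/f*.txt case through successive iterations:
  #   _BuildBucketFilterStrings('abc/d*e/f*.txt')
  #     -> prefix='abc/d', delimiter='/', prefix_wildcard='abc/d*e',
  #        suffix_wildcard='f*.txt'
  # For each listed prefix matching abc/d*e (say 'abc/dXe/'), __iter__
  # re-enqueues 'abc/dXe/f*.txt', and the next iteration computes:
  #   _BuildBucketFilterStrings('abc/dXe/f*.txt')
  #     -> prefix='abc/dXe/f', delimiter='/',
  #        prefix_wildcard='abc/dXe/f*.txt', suffix_wildcard=''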
  def _SingleVersionMatches(self, listed_generation):
    """Returns True if listed_generation matches the URL's generation."""
    decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
                                                    listed_generation)
    return str(self.wildcard_url.generation) == str(decoded_generation)
def _ExpandBucketWildcards(self, bucket_fields=None):
"""Expands bucket and provider wildcards.
Builds a list of bucket url strings that can be iterated on.
Args:
bucket_fields: If present, populate only these metadata fields for
buckets. Example value: ['acl', 'defaultObjectAcl']
Yields:
      BucketListingRefs of type BUCKET.
"""
bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
if (bucket_fields and set(bucket_fields) == set(['id']) and
not ContainsWildcard(self.wildcard_url.bucket_name)):
# If we just want the name of a non-wildcarded bucket URL,
# don't make an RPC.
yield BucketListingBucket(bucket_url)
    elif (self.wildcard_url.IsBucket() and
not ContainsWildcard(self.wildcard_url.bucket_name)):
# If we have a non-wildcarded bucket URL, get just that bucket.
yield BucketListingBucket(
bucket_url, root_object=self.gsutil_api.GetBucket(
self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme,
fields=bucket_fields))
else:
regex = fnmatch.translate(self.wildcard_url.bucket_name)
prog = re.compile(regex)
fields = self._GetToListFields(bucket_fields)
if fields:
fields.add('items/id')
for bucket in self.gsutil_api.ListBuckets(
fields=fields, project_id=self.project_id,
provider=self.wildcard_url.scheme):
if prog.match(bucket.id):
url = StorageUrlFromString(
'%s://%s/' % (self.wildcard_url.scheme, bucket.id))
yield BucketListingBucket(url, root_object=bucket)
def _GetToListFields(self, get_fields=None):
"""Prepends 'items/' to the input fields and converts it to a set.
This way field sets requested for GetBucket can be used in ListBucket calls.
Note that the input set must contain only bucket or object fields; listing
fields such as prefixes or nextPageToken should be added after calling
this function.
Args:
get_fields: Iterable fields usable in GetBucket/GetObject calls.
Returns:
Set of fields usable in ListBuckets/ListObjects calls.
"""
if get_fields:
list_fields = set()
for field in get_fields:
list_fields.add('items/' + field)
return list_fields
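  # For example (comments only): _GetToListFields(['name', 'acl']) returns
  # set(['items/name', 'items/acl']); _GetToListFields(None) returns None,
  # which callers pass through to mean "all fields".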
def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
"""Creates a BucketListingRef of type OBJECT from the arguments.
Args:
bucket_url_string: Wildcardless string describing the containing bucket.
gcs_object: gsutil_api root Object for populating the BucketListingRef.
with_version: If true, return a reference with a versioned string.
Returns:
BucketListingRef of type OBJECT.
"""
# Generation can be None in test mocks, so just return the
# live object for simplicity.
if with_version and gcs_object.generation is not None:
generation_str = GenerationFromUrlAndString(self.wildcard_url,
gcs_object.generation)
object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
generation_str)
else:
object_string = '%s%s' % (bucket_url_string, gcs_object.name)
object_url = StorageUrlFromString(object_string)
return BucketListingObject(object_url, root_object=gcs_object)
def _GetPrefixRef(self, bucket_url_string, prefix):
"""Creates a BucketListingRef of type PREFIX from the arguments.
Args:
bucket_url_string: Wildcardless string describing the containing bucket.
      prefix: gsutil_api Prefix for populating the BucketListingRef.
Returns:
BucketListingRef of type PREFIX.
"""
prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
return BucketListingPrefix(prefix_url, root_object=prefix)
def IterBuckets(self, bucket_fields=None):
"""Iterates over the wildcard, returning refs for each expanded bucket.
    This ignores the object part of the URL entirely and expands only the
    bucket portion. It will yield BucketListingRefs of type BUCKET only.
Args:
bucket_fields: Iterable fields to include in bucket listings.
Ex. ['defaultObjectAcl', 'logging']. This function is
responsible for converting these to listing-style
format ['items/defaultObjectAcl', 'items/logging'], as
well as adding any fields necessary for listing such as
                     'items/id'. API implementation is responsible for
adding pagination fields. If this is None, all fields are
returned.
Yields:
BucketListingRef of type BUCKET, or empty iterator if no matches.
"""
for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
yield blr
def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
"""Iterates over the wildcard, yielding bucket, prefix or object refs.
Args:
bucket_listing_fields: If present, populate only these metadata
fields for listed objects.
expand_top_level_buckets: If true and the wildcard expands only to
Bucket(s), yields the expansion of each bucket
into a top-level listing of prefixes and objects
in that bucket instead of a BucketListingRef
to that bucket.
Yields:
BucketListingRef, or empty iterator if no matches.
"""
for blr in self.__iter__(
bucket_listing_fields=bucket_listing_fields,
expand_top_level_buckets=expand_top_level_buckets):
yield blr
def IterObjects(self, bucket_listing_fields=None):
"""Iterates over the wildcard, yielding only object BucketListingRefs.
Args:
bucket_listing_fields: If present, populate only these metadata
fields for listed objects.
Yields:
BucketListingRefs of type OBJECT or empty iterator if no matches.
"""
for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
expand_top_level_buckets=True):
if blr.IsObject():
yield blr
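# Illustrative versioned-listing sketch (comments only, not executed). Assumes
# a configured gsutil_api; 'my-bucket' is a placeholder. With all_versions set,
# yielded URL strings carry generations, e.g. gs://my-bucket/file.txt#123456:
#   it = CloudWildcardIterator(
#       StorageUrlFromString('gs://my-bucket/**'), gsutil_api,
#       all_versions=True)
#   for blr in it.IterObjects(bucket_listing_fields=['name', 'generation']):
#     print blr.url_string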
class FileWildcardIterator(WildcardIterator):
"""WildcardIterator subclass for files and directories.
  If you use recursive wildcards ('**'), only a single such wildcard is
  supported. For example, you could use the wildcard '**/*.txt' to list all
  .txt files in any subdirectory of the current directory, but you couldn't
  use a wildcard like '**/abc/**/*.txt' (which would, if supported, let you
  find .txt files in any subdirectory named 'abc').
"""
def __init__(self, wildcard_url, debug=0):
"""Instantiates an iterator over BucketListingRefs matching wildcard URL.
Args:
wildcard_url: FileUrl that contains the wildcard to iterate.
debug: Debug level (range 0..3).
"""
self.wildcard_url = wildcard_url
self.debug = debug
def __iter__(self):
"""Iterator that gets called when iterating over the file wildcard.
In the case where no wildcard is present, returns a single matching file
or directory.
Raises:
WildcardException: if invalid wildcard found.
Yields:
BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
"""
wildcard = self.wildcard_url.object_name
match = FLAT_LIST_REGEX.match(wildcard)
if match:
# Recursive wildcarding request ('.../**/...').
# Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
base_dir = match.group('before')[:-1]
remaining_wildcard = match.group('after')
# At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
# remaining_wildcard = '/*'
if remaining_wildcard.startswith('*'):
raise WildcardException('Invalid wildcard with more than 2 consecutive '
'*s (%s)' % wildcard)
# If there was no remaining wildcard past the recursive wildcard,
# treat it as if it were a '*'. For example, file://tmp/** is equivalent
# to file://tmp/**/*
if not remaining_wildcard:
remaining_wildcard = '*'
# Skip slash(es).
remaining_wildcard = remaining_wildcard.lstrip(os.sep)
filepaths = self._IterDir(base_dir, remaining_wildcard)
else:
# Not a recursive wildcarding request.
filepaths = glob.iglob(wildcard)
for filepath in filepaths:
expanded_url = StorageUrlFromString(filepath)
if os.path.isdir(filepath):
yield BucketListingPrefix(expanded_url)
else:
yield BucketListingObject(expanded_url)
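  # Illustrative trace (comments only): for wildcard '/tmp/tmp2pQJAX/**/*' as
  # in the example above, FLAT_LIST_REGEX yields before='/tmp/tmp2pQJAX/' and
  # after='/*', so base_dir='/tmp/tmp2pQJAX'; after lstripping the separator,
  # remaining_wildcard='*', and _IterDir() walks base_dir, fnmatch-filtering
  # each directory's filenames against '*'.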
def _IterDir(self, directory, wildcard):
"""An iterator over the specified dir and wildcard."""
    # UTF8-encode the directory before passing it to os.walk() so that if a
    # file name contains invalid UTF8 chars (which can happen, e.g., if the
    # file originated on Windows), os.walk() will not attempt to decode it and
    # then die with a "codec can't decode byte" error; instead we can catch
    # the error at yield time and print a more informative error message.
for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)):
for f in fnmatch.filter(filenames, wildcard):
try:
yield os.path.join(dirpath, f).decode(UTF8)
except UnicodeDecodeError:
# Note: We considered several ways to deal with this, but each had
# problems:
# 1. Raise an exception and try to catch in a higher layer (the
# gsutil cp command), so we can properly support the gsutil cp -c
# option. That doesn't work because raising an exception during
# iteration terminates the generator.
# 2. Accumulate a list of bad filenames and skip processing each
# during iteration, then raise at the end, with exception text
# printing the bad paths. That doesn't work because iteration is
# wrapped in PluralityCheckableIterator, so it's possible there
# are not-yet-performed copy operations at the time we reach the
# end of the iteration and raise the exception - which would cause
# us to skip copying validly named files. Moreover, the gsutil
# cp command loops over argv, so if you run the command gsutil cp
# -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
# would cause dir2 never to be visited.
# 3. Print the invalid pathname and skip it during iteration. That
# would work but would mean gsutil cp could exit with status 0
# even though some files weren't copied.
# 4. Change the WildcardIterator to include an error status along with
# the result. That would solve the problem but would be a
# substantial change (WildcardIterator is used in many parts of
# gsutil), and we didn't feel that magnitude of change was
# warranted by this relatively uncommon corner case.
# Instead we chose to abort when one such file is encountered, and
# require the user to remove or rename the files and try again.
raise CommandException('\n'.join(textwrap.wrap(
'Invalid Unicode path encountered (%s). gsutil cannot proceed '
'with such files present. Please remove or rename this file and '
'try again. NOTE: the path printed above replaces the '
'problematic characters with a hex-encoded printable '
'representation. For more details (including how to convert to a '
'gsutil-compatible encoding) see `gsutil help encoding`.' %
repr(os.path.join(dirpath, f)))))
# pylint: disable=unused-argument
def IterObjects(self, bucket_listing_fields=None):
"""Iterates over the wildcard, yielding only object (file) refs.
Args:
bucket_listing_fields: Ignored as filesystems don't have buckets.
Yields:
BucketListingRefs of type OBJECT or empty iterator if no matches.
"""
for bucket_listing_ref in self.IterAll():
if bucket_listing_ref.IsObject():
yield bucket_listing_ref
# pylint: disable=unused-argument
def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
"""Iterates over the wildcard, yielding BucketListingRefs.
Args:
bucket_listing_fields: Ignored; filesystems don't have buckets.
expand_top_level_buckets: Ignored; filesystems don't have buckets.
Yields:
BucketListingRefs of type OBJECT (file) or PREFIX (directory),
or empty iterator if no matches.
"""
for bucket_listing_ref in self.__iter__():
yield bucket_listing_ref
def IterBuckets(self, unused_bucket_fields=None):
"""Placeholder to allow polymorphic use of WildcardIterator.
Args:
unused_bucket_fields: Ignored; filesystems don't have buckets.
Raises:
WildcardException: in all cases.
"""
raise WildcardException(
'Iterating over Buckets not possible for file wildcards')
class WildcardException(StandardError):
"""Exception raised for invalid wildcard URLs."""
def __init__(self, reason):
StandardError.__init__(self)
self.reason = reason
def __repr__(self):
return 'WildcardException: %s' % self.reason
def __str__(self):
return 'WildcardException: %s' % self.reason
def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0,
project_id=None):
"""Instantiate a WildcardIterator for the given URL string.
Args:
url_str: URL string naming wildcard object(s) to iterate.
gsutil_api: Cloud storage interface. Passed in for thread safety, also
settable for testing/mocking.
all_versions: If true, the iterator yields all versions of objects
matching the wildcard. If false, yields just the live
object version.
debug: Debug level to control debug output for iterator.
    project_id: Project ID to use for bucket listings.
Returns:
A WildcardIterator that handles the requested iteration.
"""
url = StorageUrlFromString(url_str)
if url.IsFileUrl():
return FileWildcardIterator(url, debug=debug)
else: # Cloud URL
return CloudWildcardIterator(
url, gsutil_api, all_versions=all_versions, debug=debug,
project_id=project_id)
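# Example sketch (comments only, not executed): the same factory handles local
# paths. A file URL or bare path returns a FileWildcardIterator, which never
# calls gsutil_api, so None may be passed for it here:
#   it = CreateWildcardIterator('/tmp/**/*.txt', None)
#   for blr in it.IterObjects():
#     print blr.url_string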