catapult/third_party/gsutil/gslib/commands/cp.py - platform/external/chromium-trace - Git at Google

 # -*- coding: utf-8 -*-
 # Copyright 2011 Google Inc. All Rights Reserved.
 # Copyright 2011, Nexenta Systems Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Implementation of Unix-like cp command for cloud storage providers."""

 from __future__ import absolute_import

 import os
 import time
 import traceback

 from gslib import copy_helper
 from gslib.cat_helper import CatHelper
 from gslib.command import Command
 from gslib.command_argument import CommandArgument
 from gslib.commands.compose import MAX_COMPONENT_COUNT
 from gslib.copy_helper import CreateCopyHelperOpts
 from gslib.copy_helper import ItemExistsError
 from gslib.copy_helper import Manifest
 from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE
 from gslib.copy_helper import SkipUnsupportedObjectError
 from gslib.cs_api_map import ApiSelector
 from gslib.exception import CommandException
 from gslib.name_expansion import NameExpansionIterator
 from gslib.storage_url import ContainsWildcard
 from gslib.util import CreateLock
 from gslib.util import GetCloudApiInstance
 from gslib.util import IsCloudSubdirPlaceholder
 from gslib.util import MakeHumanReadable
 from gslib.util import NO_MAX
 from gslib.util import RemoveCRLFFromString
 from gslib.util import StdinIterator

 _SYNOPSIS = """
   gsutil cp [OPTION]... src_url dst_url
   gsutil cp [OPTION]... src_url... dst_url
   gsutil cp [OPTION]... -I dst_url
 """

 _SYNOPSIS_TEXT = """
 <B>SYNOPSIS</B>
 """ + _SYNOPSIS

 _DESCRIPTION_TEXT = """
 <B>DESCRIPTION</B>
   The gsutil cp command allows you to copy data between your local file
   system and the cloud, copy data within the cloud, and copy data between
   cloud storage providers. For example, to copy all text files from the
   local directory to a bucket you could do:

     gsutil cp *.txt gs://my_bucket

   Similarly, you can download text files from a bucket by doing:

     gsutil cp gs://my_bucket/*.txt .

   If you want to copy an entire directory tree you need to use the -r option:

     gsutil cp -r dir gs://my_bucket

   If you have a large number of files to upload you might want to use the
   gsutil -m option, to perform a parallel (multi-threaded/multi-processing)
   copy:

     gsutil -m cp -r dir gs://my_bucket

   You can pass a list of URLs (one per line) to copy on stdin instead of as
   command line arguments by using the -I option. This allows you to use gsutil
   in a pipeline to upload or download files / objects as generated by a program,
   such as:

     some_program | gsutil -m cp -I gs://my_bucket

   or:

     some_program | gsutil -m cp -I ./download_dir

   The contents of stdin can name files, cloud URLs, and wildcards of files
   and cloud URLs.
 """

 _NAME_CONSTRUCTION_TEXT = """
 <B>HOW NAMES ARE CONSTRUCTED</B>
   The gsutil cp command strives to name objects in a way consistent with how
   Linux cp works, which causes names to be constructed in varying ways depending
   on whether you're performing a recursive directory copy or copying
   individually named objects; and whether you're copying to an existing or
   non-existent directory.

   When performing recursive directory copies, object names are constructed
   that mirror the source directory structure starting at the point of
   recursive processing. For example, the command:

     gsutil cp -r dir1/dir2 gs://my_bucket

   will create objects named like gs://my_bucket/dir2/a/b/c, assuming
   dir1/dir2 contains the file a/b/c.

   In contrast, copying individually named files will result in objects named
   by the final path component of the source files. For example, the command:

     gsutil cp dir1/dir2/** gs://my_bucket

   will create objects named like gs://my_bucket/c.

   The same rules apply for downloads: recursive copies of buckets and
   bucket subdirectories produce a mirrored filename structure, while copying
   individually (or wildcard) named objects produce flatly named files.

   Note that in the above example the '**' wildcard matches all names
   anywhere under dir. The wildcard '*' will match names just one level deep. For
   more details see 'gsutil help wildcards'.

   There's an additional wrinkle when working with subdirectories: the resulting
   names depend on whether the destination subdirectory exists. For example,
   if gs://my_bucket/subdir exists as a subdirectory, the command:

     gsutil cp -r dir1/dir2 gs://my_bucket/subdir

   will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast,
   if gs://my_bucket/subdir does not exist, this same gsutil cp command will
   create objects named like gs://my_bucket/subdir/a/b/c.

   Note: If you use the
   `Google Developers Console <https://console.developers.google.com>`_
   to create folders, it does so by creating a "placeholder" object that ends
   with a "/" character. gsutil skips these objects when downloading from the
   cloud to the local file system, because attempting to create a file that
   ends with a "/" is not allowed on Linux and MacOS. Because of this, it is
   recommended that you not create objects that end with "/" (unless you don't
   need to be able to download such objects using gsutil).
 """

 _SUBDIRECTORIES_TEXT = """
 <B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B>
   You can use gsutil to copy to and from subdirectories by using a command
   like:

     gsutil cp -r dir gs://my_bucket/data

   This will cause dir and all of its files and nested subdirectories to be
   copied under the specified destination, resulting in objects with names like
   gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket
   subdirectories by using a command like:

     gsutil cp -r gs://my_bucket/data dir

   This will cause everything nested under gs://my_bucket/data to be downloaded
   into dir, resulting in files with names like dir/data/a/b/c.

   Copying subdirectories is useful if you want to add data to an existing
   bucket directory structure over time. It's also useful if you want
   to parallelize uploads and downloads across multiple machines (often
   reducing overall transfer time compared with simply running gsutil -m
   cp on one machine). For example, if your bucket contains this structure:

     gs://my_bucket/data/result_set_01/
     gs://my_bucket/data/result_set_02/
     ...
     gs://my_bucket/data/result_set_99/

   you could perform concurrent downloads across 3 machines by running these
   commands on each machine, respectively:

     gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir
     gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir
     gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir

   Note that dir could be a local directory on each machine, or it could
   be a directory mounted off of a shared file server; whether the latter
   performs acceptably may depend on a number of things, so we recommend
   you experiment and find out what works best for you.
 """

 _COPY_IN_CLOUD_TEXT = """
 <B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
   If both the source and destination URL are cloud URLs from the same
   provider, gsutil copies data "in the cloud" (i.e., without downloading
   to and uploading from the machine where you run gsutil). In addition to
   the performance and cost advantages of doing this, copying in the cloud
   preserves metadata (like Content-Type and Cache-Control). In contrast,
   when you download data from the cloud it ends up in a file, which has
   no associated metadata. Thus, unless you have some way to hold on to
   or re-create that metadata, downloading to a file will not retain the
   metadata.

   Copies spanning locations and/or storage classes cause data to be rewritten
   in the cloud, which may take some time. Such operations can be resumed with
   the same command if they are interrupted, so long as the command parameters
   are identical.

   Note that by default, the gsutil cp command does not copy the object
   ACL to the new object, and instead will use the default bucket ACL (see
   "gsutil help defacl").  You can override this behavior with the -p
   option (see OPTIONS below).

   One additional note about copying in the cloud: If the destination bucket has
   versioning enabled, gsutil cp will by default copy only live versions of the
   source object(s). For example:

     gsutil cp gs://bucket1/obj gs://bucket2

   will cause only the single live version of of gs://bucket1/obj to be copied
   to gs://bucket2, even if there are archived versions of gs://bucket1/obj. To
   also copy archived versions, use the -A flag:

     gsutil cp -A gs://bucket1/obj gs://bucket2

   The gsutil -m flag is disallowed when using the cp -A flag, to ensure that
   version ordering is preserved.
 """

 _CHECKSUM_VALIDATION_TEXT = """
 <B>CHECKSUM VALIDATION</B>
   At the end of every upload or download the gsutil cp command validates that
   the checksum it computes for the source file/object matches the checksum
   the service computes. If the checksums do not match, gsutil will delete the
   corrupted object and print a warning message. This very rarely happens, but
   if it does, please contact gs-team@google.com.

   If you know the MD5 of a file before uploading you can specify it in the
   Content-MD5 header, which will cause the cloud storage service to reject the
   upload if the MD5 doesn't match the value computed by the service. For
   example:

     % gsutil hash obj
     Hashing     obj:
     Hashes [base64] for obj:
             Hash (crc32c):          lIMoIw==
             Hash (md5):             VgyllJgiiaRAbyUUIqDMmw==

     % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj
     Copying file://obj [Content-Type=text/plain]...
     Uploading   gs://your-bucket/obj:                                182 b/182 B

     If the checksum didn't match the service would instead reject the upload and
     gsutil would print a message like:

     BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw=="
     doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==".

   Even if you don't do this gsutil will delete the object if the computed
   checksum mismatches, but specifying the Content-MD5 header has three
   advantages:

       1. It prevents the corrupted object from becoming visible at all, whereas
       otherwise it would be visible for 1-3 seconds before gsutil deletes it.

       2. It will definitively prevent the corrupted object from being left in
       the cloud, whereas the gsutil approach of deleting after the upload
       completes could fail if (for example) the gsutil process gets ^C'd
       between upload and deletion request.

       3. It supports a customer-to-service integrity check handoff. For example,
       if you have a content production pipeline that generates data to be
       uploaded to the cloud along with checksums of that data, specifying the
       MD5 computed by your content pipeline when you run gsutil cp will ensure
       that the checksums match all the way through the process (e.g., detecting
       if data gets corrupted on your local disk between the time it was written
       by your content pipeline and the time it was uploaded to GCS).

   Note: The Content-MD5 header is ignored for composite objects, because such
   objects only have a CRC32C checksum.
 """

 _RETRY_HANDLING_TEXT = """
 <B>RETRY HANDLING</B>
   The cp command will retry when failures occur, but if enough failures happen
   during a particular copy or delete operation the command will skip that object
   and move on. At the end of the copy run if any failures were not successfully
   retried, the cp command will report the count of failures, and exit with
   non-zero status.

   Note that there are cases where retrying will never succeed, such as if you
   don't have write permission to the destination bucket or if the destination
   path for some objects is longer than the maximum allowed length.

   For more details about gsutil's retry handling, please see
   "gsutil help retries".
 """

 _RESUMABLE_TRANSFERS_TEXT = """
 <B>RESUMABLE TRANSFERS</B>
   gsutil automatically uses the Google Cloud Storage resumable upload feature
   whenever you use the cp command to upload an object that is larger than 2
   MiB. You do not need to specify any special command line options to make this
   happen. If your upload is interrupted you can restart the upload by running
   the same cp command that you ran to start the upload. Until the upload
   has completed successfully, it will not be visible at the destination object
   and will not replace any existing object the upload is intended to overwrite.
   (However, see the section on PARALLEL COMPOSITE UPLOADS, which may leave
   temporary component objects in place during the upload process.)

   Similarly, gsutil automatically performs resumable downloads (using HTTP
   standard Range GET operations) whenever you use the cp command, unless the
   destination is a stream or null. In this case, a partially downloaded
   temporary file will be visible in the destination directory. Upon completion,
   the original file is deleted and overwritten with the downloaded contents.

   Resumable uploads and downloads store some state information in a files
   in ~/.gsutil named by the destination object or file. If you attempt to
   resume a transfer from a machine with a different directory, the transfer
   will start over from scratch.

   See also "gsutil help prod" for details on using resumable transfers
   in production.
 """

 _STREAMING_TRANSFERS_TEXT = """
 <B>STREAMING TRANSFERS</B>
   Use '-' in place of src_url or dst_url to perform a streaming
   transfer. For example:

     long_running_computation | gsutil cp - gs://my_bucket/obj

   Streaming uploads using the JSON API (see "gsutil help apis") are buffered in
   memory and can retry in the event of network flakiness or service errors.

   Streaming transfers (other than uploads using the JSON API) do not support
   resumable uploads/downloads. If you have a large amount of data to upload
   (say, more than 100 MiB) it is recommended to write the data to a local file
   and then copy that file to the cloud rather than streaming it (and similarly
   for large downloads).

   WARNING: When performing streaming transfers gsutil does not compute a
   checksum of the uploaded or downloaded data.  Therefore, we recommend that
   users either perform their own validation of the data or use non-streaming
   transfers (which perform integrity checking automatically).
 """

 _SLICED_OBJECT_DOWNLOADS_TEXT = """
 <B>SLICED OBJECT DOWNLOADS</B>
   gsutil automatically uses HTTP Range GET requests to perform "sliced"
   downloads in parallel for downloads of large objects. This means that, if
   enabled, disk space for the temporary download destination file will be
   pre-allocated and byte ranges (slices) within the file will be downloaded in
   parallel. Once all slices have completed downloading, the temporary file will
   be renamed to the destination file. No additional local disk space is
   required for this operation.

   This feature is only available for Google Cloud Storage objects because it
   requires a fast composable checksum that can be used to verify the data
   integrity of the slices. Thus, using sliced object downloads also requires a
   compiled crcmod (see "gsutil help crcmod") on the machine performing the
   download. If compiled crcmod is not available, normal download will instead
   be used.

   Note: since sliced object downloads cause multiple writes to occur at various
   locations on disk, this can degrade performance for disks with slow seek
   times, especially for large numbers of slices. While the default number of
   slices is small to avoid this, sliced object download can be completely
   disabled by setting the "sliced_object_download_threshold" variable in the
   .boto config file to 0.
 """

 _PARALLEL_COMPOSITE_UPLOADS_TEXT = """
 <B>PARALLEL COMPOSITE UPLOADS</B>
   gsutil can automatically use
   `object composition <https://developers.google.com/storage/docs/composite-objects>`_
   to perform uploads in parallel for large, local files being uploaded to Google
   Cloud Storage. This means that, if enabled (see next paragraph), a large file
   will be split into component pieces that will be uploaded in parallel. Those
   components will then be composed in the cloud, and the temporary components in
   the cloud will be deleted after successful composition. No additional local
   disk space is required for this operation.

   Using parallel composite uploads presents a tradeoff between upload
   performance and download configuration: If you enable parallel composite
   uploads your uploads will run faster, but someone will need to install a
   compiled crcmod (see "gsutil help crcmod") on every machine where objects are
   downloaded by gsutil or other Python applications. For some distributions this
   is easy (e.g., it comes pre-installed on MacOS), but in some cases users have
   found it difficult. Because of this at present parallel composite uploads are
   disabled by default. Google is actively working with a number of the Linux
   distributions to get crcmod included with the stock distribution. Once that is
   done we will re-enable parallel composite uploads by default in gsutil.

   Parallel composite uploads should not be used with NEARLINE storage
   class buckets, as doing this would incur an early deletion charge for each
   component object.

   To try parallel composite uploads you can run the command:

     gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket

   where bigfile is larger than 150 MiB. When you do this notice that the upload
   progress indicator continuously updates for several different uploads at once
   (corresponding to each of the sections of the file being uploaded in
   parallel), until the parallel upload completes. If you then want to enable
   parallel composite uploads for all of your future uploads (notwithstanding the
   caveats mentioned earlier), you can uncomment and set the
   "parallel_composite_upload_threshold" config value in your .boto configuration
   file to this value.

   Note that the crcmod problem only impacts downloads via Python applications
   (such as gsutil). If any users who need to download the data using gsutil or
   other Python applications can install crcmod, it makes sense to enable
   parallel composite uploads (see above). For example, if you use gsutil to
   upload video assets and those assets will only ever be served via a Java
   application (there are efficient crc32c implementations available in Java), it
   would make sense to enable parallel composite uploads on your machine.

   If a parallel composite upload fails prior to composition, re-running the
   gsutil command will take advantage of resumable uploads for those components
   that failed, and the component objects will be deleted after the first
   successful attempt. Any temporary objects that were uploaded successfully
   before gsutil failed will still exist until the upload is completed
   successfully. The temporary objects will be named in the following fashion:

     <random ID>%s<hash>

   where <random ID> is some numerical value, and <hash> is an MD5 hash (not
   related to the hash of the contents of the file or object).

   To avoid leaving temporary objects around, you should make sure to check the
   exit status from the gsutil command.  This can be done in a bash script, for
   example, by doing:

      gsutil cp ./local-file gs://your-bucket/your-object
      if [ "$status" -ne "0" ] ; then
        << Code that handles failures >>
      fi

   Or, for copying a directory, use this instead:

      gsutil cp -c -L cp.log -r ./dir gs://bucket
      if [ "$status" -ne "0" ] ; then
        << Code that handles failures >>
      fi

   One important caveat is that files uploaded in this fashion are still subject
   to the maximum number of components limit. For example, if you upload a large
   file that gets split into %d components, and try to compose it with another
   object with %d components, the operation will fail because it exceeds the %d
   component limit. If you wish to compose an object later and the component
   limit is a concern, it is recommended that you disable parallel composite
   uploads for that transfer.

   Also note that an object uploaded using this feature will have a CRC32C hash,
   but it will not have an MD5 hash (and because of that, requires users who
   download the object to have crcmod installed, as noted earlier). For details
   see 'gsutil help crc32c'.

   Note that this feature can be completely disabled by setting the
   "parallel_composite_upload_threshold" variable in the .boto config file to 0.
 """ % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9,
        MAX_COMPONENT_COUNT)


 _CHANGING_TEMP_DIRECTORIES_TEXT = """
 <B>CHANGING TEMP DIRECTORIES</B>
   gsutil writes data to a temporary directory in several cases:

   - when compressing data to be uploaded (see the -z option)
   - when decompressing data being downloaded (when the data has
     Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z)
   - when running integration tests (using the gsutil test command)

   In these cases it's possible the temp file location on your system that
   gsutil selects by default may not have enough space. If you find that
   gsutil runs out of space during one of these operations (e.g., raising
   "CommandException: Inadequate temp space available to compress <your file>"
   during a gsutil cp -z operation), you can change where it writes these
   temp files by setting the TMPDIR environment variable. On Linux and MacOS
   you can do this either by running gsutil this way:

     TMPDIR=/some/directory gsutil cp ...

   or by adding this line to your ~/.bashrc file and then restarting the shell
   before running gsutil:

     export TMPDIR=/some/directory

   On Windows 7 you can change the TMPDIR environment variable from Start ->
   Computer -> System -> Advanced System Settings -> Environment Variables.
   You need to reboot after making this change for it to take effect. (Rebooting
   is not necessary after running the export command on Linux and MacOS.)
 """

 _OPTIONS_TEXT = """
 <B>OPTIONS</B>
   -a canned_acl  Sets named canned_acl when uploaded objects created. See
                  'gsutil help acls' for further details.

   -A             Copy all source versions from a source buckets/folders.
                  If not set, only the live version of each source object is
                  copied. Note: this option is only useful when the destination
                  bucket has versioning enabled.

   -c             If an error occurs, continue to attempt to copy the remaining
                  files. If any copies were unsuccessful, gsutil's exit status
                  will be non-zero even if this flag is set. This option is
                  implicitly set when running "gsutil -m cp...". Note: -c only
                  applies to the actual copying operation. If an error occurs
                  while iterating over the files in the local directory (e.g.,
                  invalid Unicode file name) gsutil will print an error message
                  and abort.

   -D             Copy in "daisy chain" mode, i.e., copying between two buckets
                  by hooking a download to an upload, via the machine where
                  gsutil is run. By default, data are copied between two buckets
                  "in the cloud", i.e., without needing to copy via the machine
                  where gsutil runs.

                  By default, a "copy in the cloud" when the source is a
                  composite object will retain the composite nature of the
                  object. However, Daisy chain mode can be used to change a
                  composite object into a non-composite object. For example:

                      gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp
                      gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj

                  Note: Daisy chain mode is automatically used when copying
                  between providers (e.g., to copy data from Google Cloud Storage
                  to another provider).

   -e             Exclude symlinks. When specified, symbolic links will not be
                  copied.

   -I             Causes gsutil to read the list of files or objects to copy from
                  stdin. This allows you to run a program that generates the list
                  of files to upload/download.

   -L <file>      Outputs a manifest log file with detailed information about
                  each item that was copied. This manifest contains the following
                  information for each item:

                  - Source path.
                  - Destination path.
                  - Source size.
                  - Bytes transferred.
                  - MD5 hash.
                  - UTC date and time transfer was started in ISO 8601 format.
                  - UTC date and time transfer was completed in ISO 8601 format.
                  - Upload id, if a resumable upload was performed.
                  - Final result of the attempted transfer, success or failure.
                  - Failure details, if any.

                  If the log file already exists, gsutil will use the file as an
                  input to the copy process, and will also append log items to
                  the existing file. Files/objects that are marked in the
                  existing log file as having been successfully copied (or
                  skipped) will be ignored. Files/objects without entries will be
                  copied and ones previously marked as unsuccessful will be
                  retried. This can be used in conjunction with the -c option to
                  build a script that copies a large number of objects reliably,
                  using a bash script like the following:

                    until gsutil cp -c -L cp.log -r ./dir gs://bucket; do
                      sleep 1
                    done

                  The -c option will cause copying to continue after failures
                  occur, and the -L option will allow gsutil to pick up where it
                  left off without duplicating work. The loop will continue
                  running as long as gsutil exits with a non-zero status (such a
                  status indicates there was at least one failure during the
                  gsutil run).

                  Note: If you're trying to synchronize the contents of a
                  directory and a bucket (or two buckets), see
                  'gsutil help rsync'.

   -n             No-clobber. When specified, existing files or objects at the
                  destination will not be overwritten. Any items that are skipped
                  by this option will be reported as being skipped. This option
                  will perform an additional GET request to check if an item
                  exists before attempting to upload the data. This will save
                  retransmitting data, but the additional HTTP requests may make
                  small object transfers slower and more expensive.

   -p             Causes ACLs to be preserved when copying in the cloud. Note
                  that this option has performance and cost implications when
                  using  the XML API, as it requires separate HTTP calls for
                  interacting with ACLs. The performance issue can be mitigated
                  to some degree by using gsutil -m cp to cause parallel copying.
                  Also, this option only works if you have OWNER access to all of
                  the objects that are copied.

                  You can avoid the additional performance and cost of using
                  cp -p if you want all objects in the destination bucket to end
                  up with the same ACL by setting a default object ACL on that
                  bucket instead of using cp -p. See "help gsutil defacl".

                  Note that it's not valid to specify both the -a and -p options
                  together.

   -R, -r         Causes directories, buckets, and bucket subdirectories to be
                  copied recursively. If you neglect to use this option for
                  an upload, gsutil will copy any files it finds and skip any
                  directories. Similarly, neglecting to specify -r for a download
                  will cause gsutil to copy any objects at the current bucket
                  directory level, and skip any subdirectories.

   -U             Skip objects with unsupported object types instead of failing.
                  Unsupported object types are Amazon S3 Objects in the GLACIER
                  storage class.

   -v             Requests that the version-specific URL for each uploaded object
                  be printed. Given this URL you can make future upload requests
                  that are safe in the face of concurrent updates, because Google
                  Cloud Storage will refuse to perform the update if the current
                  object version doesn't match the version-specific URL. See
                  'gsutil help versions' for more details.

   -z <ext,...>   Applies gzip content-encoding to file uploads with the given
                  extensions. This is useful when uploading files with
                  compressible content (such as .js, .css, or .html files)
                  because it saves network bandwidth and space in Google Cloud
                  Storage, which in turn reduces storage costs.

                  When you specify the -z option, the data from your files is
                  compressed before it is uploaded, but your actual files are
                  left uncompressed on the local disk. The uploaded objects
                  retain the Content-Type and name of the original files but are
                  given a Content-Encoding header with the value "gzip" to
                  indicate that the object data stored are compressed on the
                  Google Cloud Storage servers.

                  For example, the following command:

                    gsutil cp -z html -a public-read cattypes.html gs://mycats

                  will do all of the following:

                  - Upload as the object gs://mycats/cattypes.html (cp command)
                  - Set the Content-Type to text/html (based on file extension)
                  - Compress the data in the file cattypes.html (-z option)
                  - Set the Content-Encoding to gzip (-z option)
                  - Set the ACL to public-read (-a option)
                  - If a user tries to view cattypes.html in a browser, the
                    browser will know to uncompress the data based on the
                    Content-Encoding header, and to render it as HTML based on
                    the Content-Type header.

                  Note that if you download an object with Content-Encoding:gzip
                  gsutil will decompress the content before writing the local
                  file.
 """

 _DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT,
                                    _DESCRIPTION_TEXT,
                                    _NAME_CONSTRUCTION_TEXT,
                                    _SUBDIRECTORIES_TEXT,
                                    _COPY_IN_CLOUD_TEXT,
                                    _CHECKSUM_VALIDATION_TEXT,
                                    _RETRY_HANDLING_TEXT,
                                    _RESUMABLE_TRANSFERS_TEXT,
                                    _STREAMING_TRANSFERS_TEXT,
                                    _SLICED_OBJECT_DOWNLOADS_TEXT,
                                    _PARALLEL_COMPOSITE_UPLOADS_TEXT,
                                    _CHANGING_TEMP_DIRECTORIES_TEXT,
                                    _OPTIONS_TEXT])


 CP_SUB_ARGS = 'a:AcDeIL:MNnprRtUvz:'


 def _CopyFuncWrapper(cls, args, thread_state=None):
   cls.CopyFunc(args, thread_state=thread_state)


 def _CopyExceptionHandler(cls, e):
   """Simple exception handler to allow post-completion status."""
   cls.logger.error(str(e))
   cls.op_failure_count += 1
   cls.logger.debug('\n\nEncountered exception while copying:\n%s\n',
                    traceback.format_exc())


 def _RmExceptionHandler(cls, e):
   """Simple exception handler to allow post-completion status."""
   cls.logger.error(str(e))


 class CpCommand(Command):
   """Implementation of gsutil cp command.

   Note that CpCommand is run for both gsutil cp and gsutil mv. The latter
   happens by MvCommand calling CpCommand and passing the hidden (undocumented)
   -M option. This allows the copy and remove needed for each mv to run
   together (rather than first running all the cp's and then all the rm's, as
   we originally had implemented), which in turn avoids the following problem
   with removing the wrong objects: starting with a bucket containing only
   the object gs://bucket/obj, say the user does:
     gsutil mv gs://bucket/* gs://bucket/d.txt
   If we ran all the cp's and then all the rm's and we didn't expand the wildcard
   first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt,
   and the rm command would then remove that object. In the implementation
   prior to gsutil release 3.12 we avoided this by building a list of objects
   to process and then running the copies and then the removes; but building
   the list up front limits scalability (compared with the current approach
   of processing the bucket listing iterator on the fly).
   """

   # Command specification. See base class for documentation.
   command_spec = Command.CreateCommandSpec(
       'cp',
       command_name_aliases=['copy'],
       usage_synopsis=_SYNOPSIS,
       min_args=1,
       max_args=NO_MAX,
       # -t is deprecated but leave intact for now to avoid breakage.
       supported_sub_args=CP_SUB_ARGS,
       file_url_ok=True,
       provider_url_ok=False,
       urls_start_arg=0,
       gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
       gs_default_api=ApiSelector.JSON,
       supported_private_args=['testcallbackfile='],
       argparse_arguments=[
           CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument()
       ]
   )
   # Help specification. See help_provider.py for documentation.
   help_spec = Command.HelpSpec(
       help_name='cp',
       help_name_aliases=['copy'],
       help_type='command_help',
       help_one_line_summary='Copy files and objects',
       help_text=_DETAILED_HELP_TEXT,
       subcommand_help_text={},
   )

   # pylint: disable=too-many-statements
   def CopyFunc(self, name_expansion_result, thread_state=None):
     """Worker function for performing the actual copy (and rm, for mv)."""
     gsutil_api = GetCloudApiInstance(self, thread_state=thread_state)

     copy_helper_opts = copy_helper.GetCopyHelperOpts()
     if copy_helper_opts.perform_mv:
       cmd_name = 'mv'
     else:
       cmd_name = self.command_name
     src_url = name_expansion_result.source_storage_url
     exp_src_url = name_expansion_result.expanded_storage_url
     src_url_names_container = name_expansion_result.names_container
     have_multiple_srcs = name_expansion_result.is_multi_source_request

     if src_url.IsCloudUrl() and src_url.IsProvider():
       raise CommandException(
           'The %s command does not allow provider-only source URLs (%s)' %
           (cmd_name, src_url))
     if have_multiple_srcs:
       copy_helper.InsistDstUrlNamesContainer(
           self.exp_dst_url, self.have_existing_dst_container, cmd_name)

     # Various GUI tools (like the GCS web console) create placeholder objects
     # ending with '/' when the user creates an empty directory. Normally these
     # tools should delete those placeholders once objects have been written
     # "under" the directory, but sometimes the placeholders are left around. We
     # need to filter them out here, otherwise if the user tries to rsync from
     # GCS to a local directory it will result in a directory/file conflict
     # (e.g., trying to download an object called "mydata/" where the local
     # directory "mydata" exists).
     if IsCloudSubdirPlaceholder(exp_src_url):
       # We used to output the message 'Skipping cloud sub-directory placeholder
       # object...' but we no longer do so because it caused customer confusion.
       return

     if copy_helper_opts.use_manifest and self.manifest.WasSuccessful(
         exp_src_url.url_string):
       return

     if copy_helper_opts.perform_mv:
       if name_expansion_result.names_container:
         # Use recursion_requested when performing name expansion for the
         # directory mv case so we can determine if any of the source URLs are
         # directories (and then use cp -r and rm -r to perform the move, to
         # match the behavior of Linux mv (which when moving a directory moves
         # all the contained files).
         self.recursion_requested = True
         # Disallow wildcard src URLs when moving directories, as supporting it
         # would make the name transformation too complex and would also be
         # dangerous (e.g., someone could accidentally move many objects to the
         # wrong name, or accidentally overwrite many objects).
         if ContainsWildcard(src_url.url_string):
           raise CommandException('The mv command disallows naming source '
                                  'directories using wildcards')

     if (self.exp_dst_url.IsFileUrl()
         and not os.path.exists(self.exp_dst_url.object_name)
         and have_multiple_srcs):
       os.makedirs(self.exp_dst_url.object_name)

     dst_url = copy_helper.ConstructDstUrl(
         src_url, exp_src_url, src_url_names_container, have_multiple_srcs,
         self.exp_dst_url, self.have_existing_dst_container,
         self.recursion_requested)
     dst_url = copy_helper.FixWindowsNaming(src_url, dst_url)

     copy_helper.CheckForDirFileConflict(exp_src_url, dst_url)
     if copy_helper.SrcDstSame(exp_src_url, dst_url):
       raise CommandException('%s: "%s" and "%s" are the same file - '
                              'abort.' % (cmd_name, exp_src_url, dst_url))

     if dst_url.IsCloudUrl() and dst_url.HasGeneration():
       raise CommandException('%s: a version-specific URL\n(%s)\ncannot be '
                              'the destination for gsutil cp - abort.'
                              % (cmd_name, dst_url))

     elapsed_time = bytes_transferred = 0
     try:
       if copy_helper_opts.use_manifest:
         self.manifest.Initialize(
             exp_src_url.url_string, dst_url.url_string)
       (elapsed_time, bytes_transferred, result_url, md5) = (
           copy_helper.PerformCopy(
               self.logger, exp_src_url, dst_url, gsutil_api,
               self, _CopyExceptionHandler, allow_splitting=True,
               headers=self.headers, manifest=self.manifest,
               gzip_exts=self.gzip_exts))
       if copy_helper_opts.use_manifest:
         if md5:
           self.manifest.Set(exp_src_url.url_string, 'md5', md5)
         self.manifest.SetResult(
             exp_src_url.url_string, bytes_transferred, 'OK')
       if copy_helper_opts.print_ver:
         # Some cases don't return a version-specific URL (e.g., if destination
         # is a file).
         self.logger.info('Created: %s', result_url)
     except ItemExistsError:
       message = 'Skipping existing item: %s' % dst_url
       self.logger.info(message)
       if copy_helper_opts.use_manifest:
         self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
     except SkipUnsupportedObjectError, e:
       message = ('Skipping item %s with unsupported object type %s' %
                  (exp_src_url.url_string, e.unsupported_type))
       self.logger.info(message)
       if copy_helper_opts.use_manifest:
         self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
     except copy_helper.FileConcurrencySkipError, e:
       self.logger.warn('Skipping copy of source URL %s because destination URL '
                        '%s is already being copied by another gsutil process '
                        'or thread (did you specify the same source URL twice?) '
                        % (src_url, dst_url))
     except Exception, e:
       if (copy_helper_opts.no_clobber and
           copy_helper.IsNoClobberServerException(e)):
         message = 'Rejected (noclobber): %s' % dst_url
         self.logger.info(message)
         if copy_helper_opts.use_manifest:
           self.manifest.SetResult(
               exp_src_url.url_string, 0, 'skip', message)
       elif self.continue_on_error:
         message = 'Error copying %s: %s' % (src_url, str(e))
         self.op_failure_count += 1
         self.logger.error(message)
         if copy_helper_opts.use_manifest:
           self.manifest.SetResult(
               exp_src_url.url_string, 0, 'error',
               RemoveCRLFFromString(message))
       else:
         if copy_helper_opts.use_manifest:
           self.manifest.SetResult(
               exp_src_url.url_string, 0, 'error', str(e))
         raise
     else:
       if copy_helper_opts.perform_mv:
         self.logger.info('Removing %s...', exp_src_url)
         if exp_src_url.IsCloudUrl():
           gsutil_api.DeleteObject(exp_src_url.bucket_name,
                                   exp_src_url.object_name,
                                   generation=exp_src_url.generation,
                                   provider=exp_src_url.scheme)
         else:
           os.unlink(exp_src_url.object_name)

     with self.stats_lock:
       self.total_elapsed_time += elapsed_time
       self.total_bytes_transferred += bytes_transferred

   # Command entry point.
   def RunCommand(self):
     copy_helper_opts = self._ParseOpts()

     self.total_elapsed_time = self.total_bytes_transferred = 0
     if self.args[-1] == '-' or self.args[-1] == 'file://-':
       return CatHelper(self).CatUrlStrings(self.args[:-1])

     if copy_helper_opts.read_args_from_stdin:
       if len(self.args) != 1:
         raise CommandException('Source URLs cannot be specified with -I option')
       url_strs = StdinIterator()
     else:
       if len(self.args) < 2:
         raise CommandException('Wrong number of arguments for "cp" command.')
       url_strs = self.args[:-1]

     (self.exp_dst_url, self.have_existing_dst_container) = (
         copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api,
                                          self.debug, self.project_id))

     name_expansion_iterator = NameExpansionIterator(
         self.command_name, self.debug,
         self.logger, self.gsutil_api, url_strs,
         self.recursion_requested or copy_helper_opts.perform_mv,
         project_id=self.project_id, all_versions=self.all_versions,
         continue_on_error=self.continue_on_error or self.parallel_operations)

     # Use a lock to ensure accurate statistics in the face of
     # multi-threading/multi-processing.
     self.stats_lock = CreateLock()

     # Tracks if any copies failed.
     self.op_failure_count = 0

     # Start the clock.
     start_time = time.time()

     # Tuple of attributes to share/manage across multiple processes in
     # parallel (-m) mode.
     shared_attrs = ('op_failure_count', 'total_bytes_transferred')

     # Perform copy requests in parallel (-m) mode, if requested, using
     # configured number of parallel processes and threads. Otherwise,
     # perform requests with sequential function calls in current process.
     self.Apply(_CopyFuncWrapper, name_expansion_iterator,
                _CopyExceptionHandler, shared_attrs,
                fail_on_error=(not self.continue_on_error))
     self.logger.debug(
         'total_bytes_transferred: %d', self.total_bytes_transferred)

     end_time = time.time()
     self.total_elapsed_time = end_time - start_time

     # Sometimes, particularly when running unit tests, the total elapsed time
     # is really small. On Windows, the timer resolution is too small and
     # causes total_elapsed_time to be zero.
     try:
       float(self.total_bytes_transferred) / float(self.total_elapsed_time)
     except ZeroDivisionError:
       self.total_elapsed_time = 0.01

     self.total_bytes_per_second = (float(self.total_bytes_transferred) /
                                    float(self.total_elapsed_time))

     if self.debug == 3:
       # Note that this only counts the actual GET and PUT bytes for the copy
       # - not any transfers for doing wildcard expansion, the initial
       # HEAD/GET request performed to get the object metadata, etc.
       if self.total_bytes_transferred != 0:
         self.logger.info(
             'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
             self.total_bytes_transferred, self.total_elapsed_time,
             MakeHumanReadable(self.total_bytes_per_second))
     if self.op_failure_count:
       plural_str = 's' if self.op_failure_count > 1 else ''
       raise CommandException('%d file%s/object%s could not be transferred.' % (
           self.op_failure_count, plural_str, plural_str))

     return 0

   def _ParseOpts(self):
     perform_mv = False
     # exclude_symlinks is handled by Command parent class, so save in Command
     # state rather than CopyHelperOpts.
     self.exclude_symlinks = False
     no_clobber = False
     # continue_on_error is handled by Command parent class, so save in Command
     # state rather than CopyHelperOpts.
     self.continue_on_error = False
     daisy_chain = False
     read_args_from_stdin = False
     print_ver = False
     use_manifest = False
     preserve_acl = False
     canned_acl = None
     # canned_acl is handled by a helper function in parent
     # Command class, so save in Command state rather than CopyHelperOpts.
     self.canned = None

     self.all_versions = False

     self.skip_unsupported_objects = False

     # Files matching these extensions should be gzipped before uploading.
     self.gzip_exts = []

     test_callback_file = None

     # self.recursion_requested initialized in command.py (so can be checked
     # in parent class for all commands).
     self.manifest = None
     if self.sub_opts:
       for o, a in self.sub_opts:
         if o == '-a':
           canned_acl = a
           self.canned = True
         if o == '-A':
           self.all_versions = True
         if o == '-c':
           self.continue_on_error = True
         elif o == '-D':
           daisy_chain = True
         elif o == '-e':
           self.exclude_symlinks = True
         elif o == '--testcallbackfile':
           # File path of a pickled class that implements ProgressCallback.call.
           # Used for testing transfer interruptions and resumes.
           test_callback_file = a
         elif o == '-I':
           read_args_from_stdin = True
         elif o == '-L':
           use_manifest = True
           self.manifest = Manifest(a)
         elif o == '-M':
           # Note that we signal to the cp command to perform a move (copy
           # followed by remove) and use directory-move naming rules by passing
           # the undocumented (for internal use) -M option when running the cp
           # command from mv.py.
           perform_mv = True
         elif o == '-n':
           no_clobber = True
         elif o == '-p':
           preserve_acl = True
         elif o == '-r' or o == '-R':
           self.recursion_requested = True
         elif o == '-U':
           self.skip_unsupported_objects = True
         elif o == '-v':
           print_ver = True
         elif o == '-z':
           self.gzip_exts = [x.strip() for x in a.split(',')]
     if preserve_acl and canned_acl:
       raise CommandException(
           'Specifying both the -p and -a options together is invalid.')
     if self.all_versions and self.parallel_operations:
       raise CommandException(
           'The gsutil -m option is not supported with the cp -A flag, to '
           'ensure that object version ordering is preserved. Please re-run '
           'the command without the -m option.')
     return CreateCopyHelperOpts(
         perform_mv=perform_mv,
         no_clobber=no_clobber,
         daisy_chain=daisy_chain,
         read_args_from_stdin=read_args_from_stdin,
         print_ver=print_ver,
         use_manifest=use_manifest,
         preserve_acl=preserve_acl,
         canned_acl=canned_acl,
         skip_unsupported_objects=self.skip_unsupported_objects,
         test_callback_file=test_callback_file)