| /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| // See docs in ../ops/nn_ops.cc. |
| |
| #define USE_EIGEN_TENSOR |
| #define EIGEN_USE_THREADS |
| |
| #include <algorithm> |
| #include <vector> |
| |
| #include "absl/base/dynamic_annotations.h" |
| #include "tensorflow/core/framework/numeric_op.h" |
| #include "tensorflow/core/framework/op_kernel.h" |
| #include "tensorflow/core/framework/register_types.h" |
| #include "tensorflow/core/framework/tensor.h" |
| #include "tensorflow/core/framework/tensor_shape.h" |
| #include "tensorflow/core/framework/tensor_slice.h" |
| #include "tensorflow/core/kernels/conv_2d.h" |
| #include "tensorflow/core/kernels/conv_grad_ops.h" |
| #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS |
| #include "tensorflow/core/kernels/xsmm_conv2d.h" |
| #endif |
| #include "tensorflow/core/kernels/ops_util.h" |
| #include "tensorflow/core/lib/core/errors.h" |
| #include "tensorflow/core/lib/gtl/array_slice.h" |
| #include "tensorflow/core/platform/logging.h" |
| #include "tensorflow/core/platform/macros.h" |
| #include "tensorflow/core/util/padding.h" |
| #include "tensorflow/core/util/tensor_format.h" |
| #include "tensorflow/core/util/use_cudnn.h" |
| #include "tensorflow/core/util/work_sharder.h" |
| |
| #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) |
| #include "tensorflow/core/kernels/eigen_contraction_kernel.h" |
| #endif |
| |
| #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| #include "tensorflow/core/kernels/conv_ops_gpu.h" |
| #include "tensorflow/core/platform/stream_executor.h" |
| #include "tensorflow/core/protobuf/autotuning.pb.h" |
| #include "tensorflow/core/util/proto/proto_utils.h" |
| #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| #if GOOGLE_CUDA |
| #include "tensorflow/stream_executor/cuda/ptxas_utils.h" |
| #include "tensorflow/stream_executor/cuda/redzone_allocator.h" |
| #include "tensorflow/stream_executor/tf_allocator_adapter.h" |
| #endif // GOOGLE_CUDA |
| |
| namespace { |
| |
| // Returns in 'im_data' (assumed to be zero-initialized) the image patch in storage |
| // order (height, width, depth), constructed from patches in 'col_data', which |
| // is required to be in storage order (out_height * out_width, filter_height, |
| // filter_width, in_depth). Implementation by Yangqing Jia (jiayq). |
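| // |
| // Illustrative example (not part of the original docs): with height = width = 3, |
| // depth = 1, a 2x2 filter, stride 1 and no padding, height_col = width_col = 2, |
| // so 'col_data' holds 2 * 2 = 4 patches of 2 * 2 * 1 values each, and overlapping |
| // patch entries are accumulated (+=) into the 3x3 image in 'im_data'. |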
| template <typename T> |
| void Col2im(const T* col_data, const int depth, const int height, |
| const int width, const int filter_h, const int filter_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, const int stride_w, T* im_data) { |
| int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; |
| int h_pad = -pad_t; |
| for (int h = 0; h < height_col; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < width_col; ++w) { |
| T* im_patch_data = im_data + (h_pad * width + w_pad) * depth; |
| for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { |
| for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { |
| if (ih >= 0 && ih < height && iw >= 0 && iw < width) { |
| // TODO(andydavis) Vectorize this loop (if compiler does not). |
| for (int i = 0; i < depth; ++i) { |
| im_patch_data[i] += col_data[i]; |
| } |
| } |
| im_patch_data += depth; |
| col_data += depth; |
| } |
| // Advance to the start of the next image row (skip width - filter_w columns). |
| im_patch_data += depth * (width - filter_w); |
| } |
| w_pad += stride_w; |
| } |
| h_pad += stride_h; |
| } |
| } |
| |
| } // namespace |
| |
| namespace tensorflow { |
| |
| typedef Eigen::ThreadPoolDevice CPUDevice; |
| typedef Eigen::GpuDevice GPUDevice; |
| |
| // Computes backprop input using Eigen::SpatialConvolutionBackwardInput on CPU. |
| template <typename T> |
| struct LaunchConv2DBackpropInputOp<CPUDevice, T> { |
| void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, |
| const Tensor& out_backprop, const Tensor& filter, |
| int row_dilation, int col_dilation, int row_stride, |
| int col_stride, const Padding& padding, |
| const std::vector<int64>& explicit_paddings, |
| Tensor* in_backprop, TensorFormat data_format) { |
| std::vector<int32> strides(4, 1); |
| std::vector<int32> dilations(4, 1); |
| |
| auto input_h = GetTensorDimIndex(data_format, 'H'); |
| auto input_w = GetTensorDimIndex(data_format, 'W'); |
| strides[input_h] = row_stride; |
| strides[input_w] = col_stride; |
| dilations[input_h] = row_dilation; |
| dilations[input_w] = col_dilation; |
| |
| const TensorShape& input_shape = in_backprop->shape(); |
| const TensorShape& filter_shape = filter.shape(); |
| |
| ConvBackpropDimensions dims; |
| OP_REQUIRES_OK( |
| ctx, ConvBackpropComputeDimensionsV2( |
| "Conv2DBackpropInput", /*num_spatial_dims=*/2, input_shape, |
| filter_shape, out_backprop.shape(), dilations, strides, |
| padding, explicit_paddings, data_format, &dims)); |
| |
| int64 padding_top = -1, padding_bottom = -1; |
| int64 padding_left = -1, padding_right = -1; |
| if (padding == EXPLICIT) { |
| GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', |
| &padding_top, &padding_bottom); |
| GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', |
| &padding_left, &padding_right); |
| } |
| |
| int64 expected_out_rows, expected_out_cols; |
| // The function is guaranteed to succeed because we checked earlier that the |
| // output and padding were valid. |
| TF_CHECK_OK(GetWindowedOutputSizeVerboseV2( |
| dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size, |
| row_dilation, row_stride, padding, &expected_out_rows, &padding_top, |
| &padding_bottom)); |
| DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows); |
| |
| TF_CHECK_OK(GetWindowedOutputSizeVerboseV2( |
| dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size, |
| col_dilation, col_stride, padding, &expected_out_cols, &padding_left, |
| &padding_right)); |
| DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols); |
| |
| const CPUDevice& d = ctx->eigen_device<CPUDevice>(); |
| |
| auto in_backprop_t = in_backprop->tensor<T, 4>(); |
| auto out_backprop_t = out_backprop.tensor<T, 4>(); |
| auto filter_t = filter.tensor<T, 4>(); |
| |
| // WARNING: Need to swap row/col, padding_top/padding_left, and |
| // padding_bottom/padding_right when calling Eigen. Eigen expects tensors |
| // in NWHC format, but Tensorflow uses NHWC. |
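| // Concretely, for the NHWC tensors used below, dimension(1) is height and |
| // dimension(2) is width, which is why width is passed before height and |
| // col_stride before row_stride. |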
| |
| if (padding != EXPLICIT) { |
| // If padding was not explicitly defined, Eigen spatial convolution |
| // backward input will infer correct forward paddings from input tensors. |
| in_backprop_t.device(d) = Eigen::SpatialConvolutionBackwardInput( |
| filter_t, out_backprop_t, in_backprop_t.dimension(2), |
| in_backprop_t.dimension(1), col_stride, row_stride, col_dilation, |
| row_dilation); |
| |
| } else { |
| // Otherwise we have to slice the result of the spatial convolution backward |
| // input to remove the padding before assigning it to `in_backprop`. |
| using Offsets = Eigen::DSizes<Eigen::Index, 4>; |
| |
| // TODO(ezhulenev): Pass explicit paddings to Eigen and do not materialize |
| // intermediate result in memory before slicing. |
| |
| in_backprop_t.device(d) = |
| Eigen::SpatialConvolutionBackwardInput( |
| filter_t, out_backprop_t, |
| in_backprop_t.dimension(2) + (padding_left + padding_right), |
| in_backprop_t.dimension(1) + (padding_top + padding_bottom), |
| col_stride, row_stride, col_dilation, row_dilation) |
| .eval() |
| .slice(Offsets(0, padding_top, padding_left, 0), |
| /*sizes=*/in_backprop_t.dimensions()); |
| } |
| } |
| }; |
| |
| #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS |
| template <typename Device, class T> |
| struct LaunchXsmmBackwardInputConvolution { |
| bool operator()(OpKernelContext* context, const Device& d, |
| typename TTypes<T, 4>::Tensor input_backward, |
| typename TTypes<T, 4>::ConstTensor kernel, |
| typename TTypes<T, 4>::ConstTensor output_backward, |
| int input_rows, int input_cols, int row_stride, |
| int col_stride, int pad_h, int pad_w, |
| TensorFormat data_format) const { |
| return false; |
| } |
| }; |
| |
| template <> |
| struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> { |
| bool operator()(OpKernelContext* context, const CPUDevice& d, |
| typename TTypes<float, 4>::Tensor input_backward, |
| typename TTypes<float, 4>::ConstTensor kernel, |
| typename TTypes<float, 4>::ConstTensor output_backward, |
| int input_rows, int input_cols, int row_stride, |
| int col_stride, int pad_h, int pad_w, |
| TensorFormat data_format) const { |
| auto batch = input_backward.dimension(0); |
| auto in_depth = input_backward.dimension(3); |
| auto out_depth = output_backward.dimension(3); |
| auto filter_rows = kernel.dimension(0); |
| auto filter_cols = kernel.dimension(1); |
| auto num_threads = |
| context->device()->tensorflow_cpu_worker_threads()->num_threads; |
| // See libxsmm_dnn.h for this struct definition. |
| libxsmm_dnn_conv_desc desc; |
| desc.N = batch; |
| desc.C = in_depth; |
| desc.H = input_rows; |
| desc.W = input_cols; |
| desc.K = out_depth; |
| desc.R = filter_rows; |
| desc.S = filter_cols; |
| desc.u = row_stride; |
| desc.v = col_stride; |
| desc.pad_h = pad_h; |
| desc.pad_w = pad_w; |
| desc.pad_h_in = 0; |
| desc.pad_w_in = 0; |
| desc.pad_h_out = 0; |
| desc.pad_w_out = 0; |
| desc.threads = num_threads; |
| desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; |
| desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; |
| desc.filter_format = |
| LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; // LIBXSMM_DNN_TENSOR_FORMAT_RSCK; |
| desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; |
| desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE_OVERWRITE; |
| desc.datatype = LIBXSMM_DNN_DATATYPE_F32; |
| |
| auto input_ptr = input_backward.data(); |
| auto filter_ptr = kernel.data(); |
| auto output_ptr = output_backward.data(); |
| |
| bool success = functor::XsmmBkwInputConv2D<CPUDevice, float>()( |
| context, desc, input_ptr, filter_ptr, output_ptr); |
| return success; |
| } |
| }; |
| #endif |
| |
| template <typename T> |
| struct Conv2DCustomBackpropInputMatMulFunctor { |
| using MatrixMap = Eigen::Map< |
| Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; |
| using ConstMatrixMap = Eigen::Map< |
| const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; |
| |
| void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data, |
| const int filter_total_size, const int output_image_size, |
| const int dims_out_depth, T* im2col_buf) { |
| // Compute gradient into 'im2col_buf'. |
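| // Shapes: A (out_backprop patch) is [output_image_size x dims_out_depth], |
| // B (filter) is [filter_total_size x dims_out_depth], and the result C |
| // (im2col buffer) is [output_image_size x filter_total_size] = A * B^T. |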
| MatrixMap C(im2col_buf, output_image_size, filter_total_size); |
| |
| ConstMatrixMap A(out_data, output_image_size, dims_out_depth); |
| ConstMatrixMap B(filter_data, filter_total_size, dims_out_depth); |
| |
| C.noalias() = A * B.transpose(); |
| } |
| }; |
| |
| #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL) |
| template <> |
| struct Conv2DCustomBackpropInputMatMulFunctor<float> { |
| using T = float; |
| |
| void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data, |
| const int filter_total_size, const int output_image_size, |
| const int dims_out_depth, T* im2col_buf) { |
| // Inputs are in RowMajor order, so we "cheat" by swapping the LHS and RHS: |
| // RowMajor: C = A * B |
| // ColMajor: C^T = B^T * A^T |
| // |
| // Dimension names: |
| // out_image_size -> ois |
| // filter_total_size -> fts |
| // dims_out_depth -> dod |
| // |
| // RowMajor: |
| // im2col = out_data * filter_data^T |
| // [ois x fts] = [ois x dod] * [fts x dod]^T |
| // |
| // ColMajor: |
| // im2col^T = filter_data * out_data^T |
| // [fts x ois] = [fts x dod] * [dod x ois] |
| |
| const int m = filter_total_size; |
| const int n = output_image_size; |
| const int k = dims_out_depth; // contraction dim |
| |
| const char transposeA = 'T'; // sgemm(A) == filter_data |
| const char transposeB = 'N'; // sgemm(B) == out_data |
| |
| const int ldA = dims_out_depth; |
| const int ldB = dims_out_depth; |
| const int ldC = filter_total_size; |
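| // The leading dimensions follow the row-major strides of the buffers: |
| // filter_data and out_data both store dims_out_depth contiguous values per |
| // row, and each column of the column-major result im2col^T holds |
| // filter_total_size values. |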
| |
| const float alpha = 1.0; |
| const float beta = 0.0; |
| |
| // mkldnn_sgemm code can't be instrumented with msan. |
| ANNOTATE_MEMORY_IS_INITIALIZED( |
| im2col_buf, filter_total_size * output_image_size * sizeof(T)); |
| |
| mkldnn_status_t st = |
| mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k, &alpha, filter_data, |
| &ldA, out_data, &ldB, &beta, im2col_buf, &ldC); |
| |
| OP_REQUIRES( |
| ctx, st == 0, |
| errors::Internal("Failed to call mkldnn_sgemm. Error code: ", st)); |
| } |
| }; |
| #endif |
| |
| template <typename Device, class T> |
| class Conv2DBackpropInputOp : public OpKernel { |
| public: |
| explicit Conv2DBackpropInputOp(OpKernelConstruction* context) |
| : OpKernel(context) { |
| string data_format; |
| OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); |
| OP_REQUIRES(context, FormatFromString(data_format, &data_format_), |
| errors::InvalidArgument("Invalid data format")); |
| |
| OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); |
| OP_REQUIRES(context, strides_.size() == 4, |
| errors::InvalidArgument("Sliding window strides field must " |
| "specify 4 dimensions")); |
| int stride_n = GetTensorDim(strides_, data_format_, 'N'); |
| int stride_c = GetTensorDim(strides_, data_format_, 'C'); |
| int stride_h = GetTensorDim(strides_, data_format_, 'H'); |
| int stride_w = GetTensorDim(strides_, data_format_, 'W'); |
| OP_REQUIRES( |
| context, (stride_n == 1 && stride_c == 1), |
| errors::InvalidArgument("Current implementation does not yet support " |
| "strides in the batch and depth dimensions.")); |
| OP_REQUIRES(context, stride_h > 0 && stride_w > 0, |
| errors::InvalidArgument( |
| "Row and column strides should be larger than 0.")); |
| |
| OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_)); |
| OP_REQUIRES(context, dilations_.size() == 4, |
| errors::InvalidArgument("Sliding window dilations field must " |
| "specify 4 dimensions")); |
| int dilation_n = GetTensorDim(dilations_, data_format_, 'N'); |
| int dilation_c = GetTensorDim(dilations_, data_format_, 'C'); |
| int dilation_h = GetTensorDim(dilations_, data_format_, 'H'); |
| int dilation_w = GetTensorDim(dilations_, data_format_, 'W'); |
| OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1), |
| errors::InvalidArgument( |
| "Current implementation does not yet support " |
| "dilations in the batch and depth dimensions.")); |
| OP_REQUIRES( |
| context, dilation_h > 0 && dilation_w > 0, |
| errors::InvalidArgument("Dilated rates should be larger than 0.")); |
| |
| OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); |
| OP_REQUIRES_OK(context, |
| context->GetAttr("explicit_paddings", &explicit_paddings_)); |
| OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_, |
| /*num_dims=*/4, data_format_)); |
| |
| OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); |
| use_cudnn_ &= CanUseCudnn(); |
| cudnn_use_autotune_ = CudnnUseAutotune(); |
| |
| if (std::is_same<Device, CPUDevice>::value) { |
| OP_REQUIRES(context, data_format_ == FORMAT_NHWC, |
| errors::InvalidArgument("Conv2DBackpropInputOp [CPU] " |
| "only supports NHWC data format.")); |
| |
| // TODO(yangzihao): Add a CPU implementation for dilated convolution. |
| OP_REQUIRES( |
| context, (dilation_h == 1 && dilation_w == 1), |
| errors::InvalidArgument("Conv2DBackpropInputOp [CPU] not yet support " |
| "dilation rates larger than 1.")); |
| } |
| } |
| |
| void Compute(OpKernelContext* context) override { |
| const Tensor& input_sizes = context->input(0); |
| const Tensor& filter = context->input(1); |
| const Tensor& out_backprop = context->input(2); |
| OP_REQUIRES( |
| context, TensorShapeUtils::IsVector(input_sizes.shape()), |
| errors::InvalidArgument( |
| "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", |
| input_sizes.dims())); |
| TensorShape input_shape; |
| OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( |
| input_sizes.vec<int32>(), &input_shape)); |
| |
| Tensor* in_backprop = nullptr; |
| OP_REQUIRES_OK(context, |
| context->allocate_output(0, input_shape, &in_backprop)); |
| |
| // If there is nothing to compute, return. |
| if (input_shape.num_elements() == 0) { |
| return; |
| } |
| |
| // For now we take the stride from the second and third dimensions only (we |
| // do not support striding on the batch or depth dimension). |
| const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); |
| const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); |
| const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H'); |
| const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W'); |
| |
| VLOG(2) << "Conv2DBackpropInput:" |
| << " input: " << input_shape.DebugString() |
| << " filter:" << filter.shape().DebugString() |
| << " out_backprop: " << out_backprop.shape().DebugString() |
| << " strides: [" << stride_rows << ", " << stride_cols << "]" |
| << " dilations: [" << dilation_rows << ", " << dilation_cols << "]"; |
| |
| LaunchConv2DBackpropInputOp<Device, T> launch; |
| launch(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter, |
| dilation_rows, dilation_cols, stride_rows, stride_cols, padding_, |
| explicit_paddings_, in_backprop, data_format_); |
| } |
| |
| private: |
| std::vector<int32> dilations_; |
| std::vector<int32> strides_; |
| TensorFormat data_format_; |
| Padding padding_; |
| std::vector<int64> explicit_paddings_; |
| |
| bool use_cudnn_ = false; |
| bool cudnn_use_autotune_ = false; |
| |
| TF_DISALLOW_COPY_AND_ASSIGN(Conv2DBackpropInputOp); |
| }; |
| |
| // Based on an implementation written by Yangqing Jia (jiayq). |
| template <typename Device, class T> |
| class Conv2DCustomBackpropInputOp : public OpKernel { |
| public: |
| explicit Conv2DCustomBackpropInputOp(OpKernelConstruction* context) |
| : OpKernel(context) { |
| string data_format; |
| OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); |
| OP_REQUIRES(context, FormatFromString(data_format, &data_format_), |
| errors::InvalidArgument("Invalid data format")); |
| OP_REQUIRES(context, data_format_ == FORMAT_NHWC, |
| errors::InvalidArgument( |
| "Conv2DCustomBackpropInputOp only supports NHWC.")); |
| OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); |
| OP_REQUIRES(context, strides_.size() == 4, |
| errors::InvalidArgument("Sliding window strides field must " |
| "specify 4 dimensions")); |
| OP_REQUIRES( |
| context, (strides_[0] == 1 && strides_[3] == 1), |
| errors::InvalidArgument("Current implementation does not yet support " |
| "strides in the batch and depth dimensions.")); |
| OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0, |
| errors::InvalidArgument( |
| "Row and column strides should be larger than 0.")); |
| OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); |
| OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_)); |
| OP_REQUIRES(context, dilations_.size() == 4, |
| errors::InvalidArgument("Sliding window dilations field must " |
| "specify 4 dimensions")); |
| OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1), |
| errors::InvalidArgument( |
| "Current implementation does not yet support " |
| "dilations in the batch and depth dimensions.")); |
| // TODO(yangzihao): Add a CPU implementation for dilated convolution. |
| OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1), |
| errors::InvalidArgument( |
| "Current libxsmm and customized CPU implementations do " |
| "not yet support dilation rates larger than 1.")); |
| OP_REQUIRES_OK(context, |
| context->GetAttr("explicit_paddings", &explicit_paddings_)); |
| OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_, |
| /*num_dims=*/4, data_format_)); |
| } |
| |
| void Compute(OpKernelContext* context) override { |
| const Tensor& input_sizes = context->input(0); |
| const Tensor& filter = context->input(1); |
| const Tensor& out_backprop = context->input(2); |
| OP_REQUIRES( |
| context, TensorShapeUtils::IsVector(input_sizes.shape()), |
| errors::InvalidArgument( |
| "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", |
| input_sizes.dims())); |
| TensorShape input_shape; |
| OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( |
| input_sizes.vec<int32>(), &input_shape)); |
| |
| ConvBackpropDimensions dims; |
| OP_REQUIRES_OK(context, |
| ConvBackpropComputeDimensionsV2( |
| "Conv2DCustomBackpropInput", /*num_spatial_dims=*/2, |
| input_shape, filter.shape(), out_backprop.shape(), |
| /*dilations=*/{1, 1, 1, 1}, strides_, padding_, |
| explicit_paddings_, data_format_, &dims)); |
| |
| Tensor* in_backprop = nullptr; |
| OP_REQUIRES_OK(context, |
| context->allocate_output(0, input_shape, &in_backprop)); |
| |
| // If there is nothing to compute, return. |
| if (input_shape.num_elements() == 0) { |
| return; |
| } |
| |
| // TODO(ezhulenev): Remove custom kernel and move XSMM support to |
| // LaunchConv2DBackpropInputOp functor. |
| #if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \ |
| defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS |
| int64 pad_top, pad_bottom; |
| int64 pad_left, pad_right; |
| OP_REQUIRES_OK( |
| context, |
| GetWindowedOutputSizeVerbose( |
| dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size, |
| dims.spatial_dims[0].stride, padding_, |
| &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom)); |
| OP_REQUIRES_OK( |
| context, |
| GetWindowedOutputSizeVerbose( |
| dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size, |
| dims.spatial_dims[1].stride, padding_, |
| &dims.spatial_dims[1].output_size, &pad_left, &pad_right)); |
| |
| if (pad_left == pad_right && pad_top == pad_bottom) { |
| if (LaunchXsmmBackwardInputConvolution<Device, T>()( |
| context, context->eigen_device<Device>(), |
| in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(), |
| out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size, |
| dims.spatial_dims[1].input_size, |
| static_cast<int>(dims.spatial_dims[0].stride), |
| static_cast<int>(dims.spatial_dims[1].stride), |
| static_cast<int>(pad_top), static_cast<int>(pad_left), |
| data_format_)) { |
| return; |
| } |
| } |
| #else |
| int64 pad_top, pad_bottom; |
| int64 pad_left, pad_right; |
| #endif |
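| // With the NHWC format required by this kernel, explicit_paddings_ is laid |
| // out as {N_before, N_after, H_before, H_after, W_before, W_after, C_before, |
| // C_after}, hence the indices 2..5 below. |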
| if (padding_ == Padding::EXPLICIT) { |
| pad_top = explicit_paddings_[2]; |
| pad_bottom = explicit_paddings_[3]; |
| pad_left = explicit_paddings_[4]; |
| pad_right = explicit_paddings_[5]; |
| } |
| OP_REQUIRES_OK( |
| context, |
| GetWindowedOutputSizeVerbose( |
| dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size, |
| dims.spatial_dims[0].stride, padding_, |
| &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom)); |
| OP_REQUIRES_OK( |
| context, |
| GetWindowedOutputSizeVerbose( |
| dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size, |
| dims.spatial_dims[1].stride, padding_, |
| &dims.spatial_dims[1].output_size, &pad_left, &pad_right)); |
| |
| // The total dimension size of each kernel. |
| const int filter_total_size = dims.spatial_dims[0].filter_size * |
| dims.spatial_dims[1].filter_size * |
| dims.in_depth; |
| // The output image size is the spatial size of the output. |
| const int output_image_size = |
| dims.spatial_dims[0].output_size * dims.spatial_dims[1].output_size; |
| |
| // TODO(andydavis) Get L2/L3 cache sizes from device. |
| const size_t l2_cache_size = 256LL << 10; |
| const size_t l3_cache_size = 30LL << 20; |
| |
| // Use L3 cache size as target working set size. |
| const size_t target_working_set_size = l3_cache_size / sizeof(T); |
| |
| // Calculate size of matrices involved in MatMul: C = A x B. |
| const size_t size_A = output_image_size * dims.out_depth; |
| |
| const size_t size_B = filter_total_size * dims.out_depth; |
| |
| const size_t size_C = output_image_size * filter_total_size; |
| |
| const size_t work_unit_size = size_A + size_B + size_C; |
| |
| auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); |
| |
| // Calculate per-thread work unit size. |
| const size_t thread_work_unit_size = |
| work_unit_size / worker_threads.num_threads; |
| |
| // Set minimum per-thread work unit size to size of L2 cache. |
| const size_t min_thread_work_unit_size = l2_cache_size / sizeof(T); |
| |
| // Use parallel tensor contractions if there is no batching, or if the |
| // minimum per-thread work unit size threshold has been exceeded. |
| // Otherwise, revert to multiple single-threaded matmul ops running in |
| // parallel to keep all threads busy. |
| // TODO(andydavis) Explore alternatives to branching the code in this way |
| // (i.e. run multiple, parallel tensor contractions in another thread pool). |
| const bool use_parallel_contraction = |
| dims.batch_size == 1 || |
| thread_work_unit_size >= min_thread_work_unit_size; |
| |
| const size_t shard_size = |
| use_parallel_contraction |
| ? 1 |
| : (target_working_set_size + work_unit_size - 1) / work_unit_size; |
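| // Rough example (illustrative numbers only): for float data the target |
| // working set is (30 << 20) / 4, roughly 7.9M elements, so a work unit of |
| // about 1M elements yields a shard_size of roughly 8 images per shard. |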
| |
| Tensor col_buffer; |
| OP_REQUIRES_OK(context, |
| context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| TensorShape({static_cast<int64>(shard_size), |
| static_cast<int64>(output_image_size), |
| static_cast<int64>(filter_total_size)}), |
| &col_buffer)); |
| |
| // The input offset corresponding to a single input image. |
| const int input_offset = dims.spatial_dims[0].input_size * |
| dims.spatial_dims[1].input_size * dims.in_depth; |
| // The output offset corresponding to a single output image. |
| const int output_offset = dims.spatial_dims[0].output_size * |
| dims.spatial_dims[1].output_size * dims.out_depth; |
| |
| const T* filter_data = filter.template flat<T>().data(); |
| T* col_buffer_data = col_buffer.template flat<T>().data(); |
| const T* out_backprop_data = out_backprop.template flat<T>().data(); |
| |
| auto in_backprop_flat = in_backprop->template flat<T>(); |
| T* input_backprop_data = in_backprop_flat.data(); |
| in_backprop_flat.device(context->eigen_device<Device>()) = |
| in_backprop_flat.constant(T(0)); |
| |
| if (use_parallel_contraction) { |
| typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, |
| Eigen::Unaligned> |
| TensorMap; |
| typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>, |
| Eigen::Unaligned> |
| ConstTensorMap; |
| |
| // Initialize contraction dims (we need to transpose 'B' below). |
| Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_dims; |
| contract_dims[0].first = 1; |
| contract_dims[0].second = 1; |
| |
| for (int image_id = 0; image_id < dims.batch_size; ++image_id) { |
| // Compute gradient into col_buffer. |
| TensorMap C(col_buffer_data, output_image_size, filter_total_size); |
| |
| ConstTensorMap A(out_backprop_data + output_offset * image_id, |
| output_image_size, dims.out_depth); |
| ConstTensorMap B(filter_data, filter_total_size, dims.out_depth); |
| |
| C.device(context->eigen_cpu_device()) = A.contract(B, contract_dims); |
| |
| Col2im<T>( |
| col_buffer_data, dims.in_depth, dims.spatial_dims[0].input_size, |
| dims.spatial_dims[1].input_size, dims.spatial_dims[0].filter_size, |
| dims.spatial_dims[1].filter_size, pad_top, pad_left, pad_bottom, |
| pad_right, dims.spatial_dims[0].stride, dims.spatial_dims[1].stride, |
| input_backprop_data); |
| |
| input_backprop_data += input_offset; |
| } |
| } else { |
| for (int image_id = 0; image_id < dims.batch_size; |
| image_id += shard_size) { |
| const int shard_limit = |
| std::min(static_cast<int>(shard_size), |
| static_cast<int>(dims.batch_size) - image_id); |
| |
| auto shard = [&context, &dims, &pad_top, &pad_left, &pad_bottom, |
| &pad_right, &output_image_size, &filter_total_size, |
| &input_backprop_data, &col_buffer_data, |
| &out_backprop_data, &filter_data, &input_offset, |
| &output_offset, &size_C](int64 start, int64 limit) { |
| for (int shard_id = start; shard_id < limit; ++shard_id) { |
| T* im2col_buf = col_buffer_data + shard_id * size_C; |
| T* input_data = input_backprop_data + shard_id * input_offset; |
| const T* out_data = out_backprop_data + shard_id * output_offset; |
| |
| Conv2DCustomBackpropInputMatMulFunctor<T>()( |
| context, out_data, filter_data, filter_total_size, |
| output_image_size, dims.out_depth, im2col_buf); |
| |
| Col2im<T>(im2col_buf, dims.in_depth, |
| dims.spatial_dims[0].input_size, |
| dims.spatial_dims[1].input_size, |
| dims.spatial_dims[0].filter_size, |
| dims.spatial_dims[1].filter_size, pad_top, pad_left, |
| pad_bottom, pad_right, dims.spatial_dims[0].stride, |
| dims.spatial_dims[1].stride, input_data); |
| } |
| }; |
| Shard(worker_threads.num_threads, worker_threads.workers, shard_limit, |
| work_unit_size, shard); |
| |
| input_backprop_data += input_offset * shard_limit; |
| out_backprop_data += output_offset * shard_limit; |
| } |
| } |
| } |
| |
| private: |
| std::vector<int32> dilations_; |
| std::vector<int32> strides_; |
| Padding padding_; |
| std::vector<int64> explicit_paddings_; |
| TensorFormat data_format_; |
| |
| TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp); |
| }; |
| |
| // TODO(ezhulenev): Add a cost model to switch between custom/Eigen ops. |
| #define DEFAULT_CPU_OP Conv2DCustomBackpropInputOp |
| |
| #define REGISTER_CPU_KERNELS(T) \ |
| REGISTER_KERNEL_BUILDER( \ |
| Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ |
| DEFAULT_CPU_OP<CPUDevice, T>); \ |
| REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \ |
| .Device(DEVICE_CPU) \ |
| .Label("custom") \ |
| .TypeConstraint<T>("T"), \ |
| Conv2DCustomBackpropInputOp<CPUDevice, T>); \ |
| REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \ |
| .Device(DEVICE_CPU) \ |
| .Label("eigen_tensor") \ |
| .TypeConstraint<T>("T"), \ |
| Conv2DBackpropInputOp<CPUDevice, T>); |
| |
| TF_CALL_half(REGISTER_CPU_KERNELS); |
| TF_CALL_float(REGISTER_CPU_KERNELS); |
| TF_CALL_double(REGISTER_CPU_KERNELS); |
| #undef REGISTER_CPU_KERNELS |
| #undef DEFAULT_CPU_OP |
| |
| // To be used inside depthwise_conv_grad_op.cc. |
| template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>; |
| template struct LaunchConv2DBackpropInputOp<CPUDevice, float>; |
| template struct LaunchConv2DBackpropInputOp<CPUDevice, double>; |
| |
| // GPU definitions. |
| #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| // The slow version (but compiles for GPU) |
| |
| // A dummy type to group backward data autotune results together. |
| struct ConvBackwardDataAutoTuneGroup { |
| static string name() { return "ConvBwdData"; } |
| }; |
| typedef AutoTuneSingleton<ConvBackwardDataAutoTuneGroup, ConvParameters, |
| se::dnn::AlgorithmConfig> |
| AutoTuneConvBwdData; |
| |
| template <typename T> |
| void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()( |
| OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, |
| const Tensor& out_backprop, const Tensor& filter, int row_dilation, |
| int col_dilation, int row_stride, int col_stride, const Padding& padding, |
| const std::vector<int64>& explicit_paddings, Tensor* in_backprop, |
| TensorFormat data_format) { |
| using se::dnn::AlgorithmConfig; |
| using se::dnn::AlgorithmDesc; |
| using se::dnn::ProfileResult; |
| |
| std::vector<int32> strides(4, 1); |
| std::vector<int32> dilations(4, 1); |
| auto input_h = GetTensorDimIndex(data_format, 'H'); |
| auto input_w = GetTensorDimIndex(data_format, 'W'); |
| strides[input_h] = row_stride; |
| strides[input_w] = col_stride; |
| dilations[input_h] = row_dilation; |
| dilations[input_w] = col_dilation; |
| TensorShape input_shape = in_backprop->shape(); |
| |
| const TensorShape& filter_shape = filter.shape(); |
| ConvBackpropDimensions dims; |
| OP_REQUIRES_OK( |
| ctx, ConvBackpropComputeDimensionsV2( |
| "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, input_shape, |
| filter_shape, out_backprop.shape(), dilations, strides, padding, |
| explicit_paddings, data_format, &dims)); |
| |
| int64 padding_top = -1, padding_bottom = -1; |
| int64 padding_left = -1, padding_right = -1; |
| if (padding == EXPLICIT) { |
| GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top, |
| &padding_bottom); |
| GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left, |
| &padding_right); |
| } |
| int64 expected_out_rows, expected_out_cols; |
| // The function is guaranteed to succeed because we checked earlier that the |
| // output and padding were valid. |
| TF_CHECK_OK(GetWindowedOutputSizeVerboseV2( |
| dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size, |
| row_dilation, row_stride, padding, &expected_out_rows, &padding_top, |
| &padding_bottom)); |
| DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows); |
| TF_CHECK_OK(GetWindowedOutputSizeVerboseV2( |
| dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size, |
| col_dilation, col_stride, padding, &expected_out_cols, &padding_left, |
| &padding_right)); |
| DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols); |
| |
| auto* stream = ctx->op_device_context()->stream(); |
| OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); |
| |
| if (!use_cudnn) { |
| ctx->SetStatus(errors::Unimplemented( |
| "Conv2DBackpropInput for GPU is not currently supported " |
| "without cudnn")); |
| return; |
| } |
| |
| // If the filter in-depth (filter_shape.dim_size(2)) is 1 and smaller than the |
| // input depth, it's a depthwise convolution. More generally, if the filter |
| // in-depth divides but is smaller than the input depth, it is a grouped |
| // convolution. |
| bool is_grouped_convolution = filter_shape.dim_size(2) != dims.in_depth; |
| if (dims.spatial_dims[0].filter_size == 1 && |
| dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution && |
| dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && |
| data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) { |
| // 1x1 filter, so call cublas directly. |
| const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size * |
| dims.spatial_dims[1].input_size; |
| const uint64 k = dims.out_depth; |
| const uint64 n = dims.in_depth; |
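| // In row-major terms this computes in_backprop[m x n] = |
| // out_backprop[m x k] * filter[n x k]^T; the cuBLAS call below expresses |
| // the same product in column-major form, which is why the filter operand |
| // is the one marked as transposed. |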
| |
| auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), |
| out_backprop.template flat<T>().size()); |
| auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(), |
| filter.template flat<T>().size()); |
| auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(), |
| in_backprop->template flat<T>().size()); |
| |
| auto transpose = se::blas::Transpose::kTranspose; |
| auto no_transpose = se::blas::Transpose::kNoTranspose; |
| |
| bool blas_launch_status = |
| stream |
| ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, |
| a_ptr, k, 0.0f, &c_ptr, n) |
| .ok(); |
| if (!blas_launch_status) { |
| ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, |
| ", n=", n, ", k=", k)); |
| } |
| return; |
| } else if (dims.spatial_dims[0].filter_size == |
| dims.spatial_dims[0].input_size && |
| dims.spatial_dims[1].filter_size == |
| dims.spatial_dims[1].input_size && |
| !is_grouped_convolution && padding == VALID && |
| data_format == FORMAT_NHWC) { |
| // The input data and filter have the same height/width, and we are not |
| // using grouped convolution, so call cublas directly. |
| const uint64 m = dims.batch_size; |
| const uint64 k = dims.out_depth; |
| const uint64 n = dims.spatial_dims[0].input_size * |
| dims.spatial_dims[1].input_size * dims.in_depth; |
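| // Here the whole input collapses into a single output pixel, so the product |
| // is again in_backprop[m x n] = out_backprop[m x k] * filter[n x k]^T, with |
| // m = batch, k = out_depth and n = input_height * input_width * in_depth. |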
| |
| auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), |
| out_backprop.template flat<T>().size()); |
| auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(), |
| filter.template flat<T>().size()); |
| auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(), |
| in_backprop->template flat<T>().size()); |
| |
| auto transpose = se::blas::Transpose::kTranspose; |
| auto no_transpose = se::blas::Transpose::kNoTranspose; |
| |
| bool blas_launch_status = |
| stream |
| ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, |
| a_ptr, k, 0.0f, &c_ptr, n) |
| .ok(); |
| if (!blas_launch_status) { |
| ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, |
| ", n=", n, ", k=", k)); |
| } |
| return; |
| } |
| |
| const int64 common_padding_rows = std::min(padding_top, padding_bottom); |
| const int64 common_padding_cols = std::min(padding_left, padding_right); |
| TensorShape compatible_input_shape; |
| if (padding_top != padding_bottom || padding_left != padding_right) { |
| // Pad the input in the same way we did during the forward pass, so that |
| // cuDNN or MIOpen receives the same input during the backward pass as it |
| // did during the forward pass. |
| const int64 padding_rows_diff = std::abs(padding_bottom - padding_top); |
| const int64 padding_cols_diff = std::abs(padding_right - padding_left); |
| const int64 new_in_rows = |
| dims.spatial_dims[0].input_size + padding_rows_diff; |
| const int64 new_in_cols = |
| dims.spatial_dims[1].input_size + padding_cols_diff; |
| compatible_input_shape = ShapeFromFormat( |
| data_format, dims.batch_size, new_in_rows, new_in_cols, dims.in_depth); |
| } else { |
| compatible_input_shape = input_shape; |
| } |
| |
| CHECK(common_padding_rows >= 0 && common_padding_cols >= 0) // Crash OK |
| << "Negative row or col paddings: (" << common_padding_rows << ", " |
| << common_padding_cols << ")"; |
| |
| // The Tensor Core in NVIDIA Volta+ GPUs supports efficient convolution with |
| // fp16 in NHWC data layout. In all other configurations it's more efficient |
| // to run computation in NCHW data format. |
| const bool compute_in_nhwc = |
| DataTypeToEnum<T>::value == DT_HALF && IsVoltaOrLater(*stream->parent()); |
| |
| // We only do one directional conversion: NHWC->NCHW. We never convert in the |
| // other direction. Grappler layout optimizer selects the preferred layout and |
| // adds necessary annotations to the graph. |
| const TensorFormat compute_data_format = |
| (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC |
| : FORMAT_NCHW; |
| |
| VLOG(3) << "Compute Conv2DBackpropInput with cuDNN:" |
| << " data_format=" << ToString(data_format) |
| << " compute_data_format=" << ToString(compute_data_format); |
| |
| constexpr auto kComputeInNHWC = |
| std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, |
| se::dnn::FilterLayout::kOutputYXInput); |
| constexpr auto kComputeInNCHW = |
| std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, |
| se::dnn::FilterLayout::kOutputInputYX); |
| |
| se::dnn::DataLayout compute_data_layout; |
| se::dnn::FilterLayout filter_layout; |
| |
| std::tie(compute_data_layout, filter_layout) = |
| compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW; |
| |
| se::dnn::BatchDescriptor input_desc; |
| input_desc.set_count(dims.batch_size) |
| .set_height(GetTensorDim(compatible_input_shape, data_format, 'H')) |
| .set_width(GetTensorDim(compatible_input_shape, data_format, 'W')) |
| .set_feature_map_count(dims.in_depth) |
| .set_layout(compute_data_layout); |
| se::dnn::BatchDescriptor output_desc; |
| output_desc.set_count(dims.batch_size) |
| .set_height(dims.spatial_dims[0].output_size) |
| .set_width(dims.spatial_dims[1].output_size) |
| .set_feature_map_count(dims.out_depth) |
| .set_layout(compute_data_layout); |
| se::dnn::FilterDescriptor filter_desc; |
| filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) |
| .set_input_filter_width(dims.spatial_dims[1].filter_size) |
| .set_input_feature_map_count(filter_shape.dim_size(2)) |
| .set_output_feature_map_count(filter_shape.dim_size(3)) |
| .set_layout(filter_layout); |
| se::dnn::ConvolutionDescriptor conv_desc; |
| conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation) |
| .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation) |
| .set_vertical_filter_stride(dims.spatial_dims[0].stride) |
| .set_horizontal_filter_stride(dims.spatial_dims[1].stride) |
| .set_zero_padding_height(common_padding_rows) |
| .set_zero_padding_width(common_padding_cols) |
| .set_group_count(dims.in_depth / filter_shape.dim_size(2)); |
| |
| // Tensorflow filter format: HWIO |
| // cuDNN filter formats: (data format) -> (filter format) |
| // (1) NCHW -> OIHW |
| // (2) NHWC -> OHWI |
| |
| Tensor transformed_filter; |
| const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status { |
| VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO) |
| << " to " << ToString(dst_format); |
| |
| TensorShape dst_shape = |
| dst_format == FORMAT_OIHW |
| ? TensorShape({filter.dim_size(3), filter.dim_size(2), |
| filter.dim_size(0), filter.dim_size(1)}) |
| : TensorShape({filter.dim_size(3), filter.dim_size(0), |
| filter.dim_size(1), filter.dim_size(2)}); |
| |
| TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape, |
| &transformed_filter)); |
| functor::TransformFilter<GPUDevice, T, int, 4>()( |
| ctx->eigen_device<GPUDevice>(), dst_format, |
| To32Bit(filter.tensor<T, 4>()), |
| To32Bit(transformed_filter.tensor<T, 4>())); |
| |
| return Status::OK(); |
| }; |
| |
| if (compute_data_format == FORMAT_NCHW) { |
| OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW)); |
| } else if (compute_data_format == FORMAT_NHWC) { |
| OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI)); |
| } else { |
| ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ", |
| ToString(compute_data_format))); |
| return; |
| } |
| |
| Tensor transformed_out_backprop; |
| if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { |
| VLOG(4) << "Convert the `out_backprop` tensor from NHWC to NCHW."; |
| TensorShape compute_shape = ShapeFromFormat( |
| compute_data_format, dims.batch_size, dims.spatial_dims[0].output_size, |
| dims.spatial_dims[1].output_size, dims.out_depth); |
| if (dims.out_depth > 1) { |
| OP_REQUIRES_OK(ctx, |
| ctx->allocate_temp(DataTypeToEnum<T>::value, compute_shape, |
| &transformed_out_backprop)); |
| functor::NHWCToNCHW<GPUDevice, T, 4>()( |
| ctx->eigen_device<GPUDevice>(), out_backprop.tensor<T, 4>(), |
| transformed_out_backprop.tensor<T, 4>()); |
| } else { |
| // If depth <= 1, then just reshape. |
| CHECK(transformed_out_backprop.CopyFrom(out_backprop, compute_shape)); |
| } |
| } else { |
| transformed_out_backprop = out_backprop; |
| } |
| |
| Tensor pre_transformed_in_backprop; |
| OP_REQUIRES_OK( |
| ctx, ctx->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat( |
| compute_data_format, |
| GetTensorDim(compatible_input_shape, data_format, 'N'), |
| GetTensorDim(compatible_input_shape, data_format, 'H'), |
| GetTensorDim(compatible_input_shape, data_format, 'W'), |
| GetTensorDim(compatible_input_shape, data_format, 'C')), |
| &pre_transformed_in_backprop)); |
| |
| auto out_backprop_ptr = |
| AsDeviceMemory(transformed_out_backprop.template flat<T>().data(), |
| transformed_out_backprop.template flat<T>().size()); |
| auto filter_ptr = |
| AsDeviceMemory(transformed_filter.template flat<T>().data(), |
| transformed_filter.template flat<T>().size()); |
| auto in_backprop_ptr = |
| AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(), |
| pre_transformed_in_backprop.template flat<T>().size()); |
| |
| static int64 ConvolveBackwardDataScratchSize = GetDnnWorkspaceLimit( |
| "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default |
| ); |
| DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx); |
| int device_id = stream->parent()->device_ordinal(); |
| DataType dtype = out_backprop.dtype(); |
| ConvParameters conv_parameters = { |
| dims.batch_size, // batch |
| dims.in_depth, // in_depths |
| {{input_desc.height(), // in_rows |
| input_desc.width()}}, // in_cols |
| compute_data_format, // compute_data_format |
| dims.out_depth, // out_depths |
| {{dims.spatial_dims[0].filter_size, // filter_rows |
| dims.spatial_dims[1].filter_size, // filter_cols |
| filter_shape.dim_size(2)}}, // filter_depths |
| {{dims.spatial_dims[0].dilation, // dilation_rows |
| dims.spatial_dims[1].dilation}}, // dilation_cols |
| {{dims.spatial_dims[0].stride, // stride_rows |
| dims.spatial_dims[1].stride}}, // stride_cols |
| {{common_padding_rows, // padding_rows |
| common_padding_cols}}, // padding_cols |
| dtype, // tensor data type |
| device_id, // device_id |
| }; |
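| // conv_parameters is the cache key for AutoTuneConvBwdData: the first call |
| // with a given shape/stride/padding/dtype/device combination profiles the |
| // available algorithms below, and later calls reuse the stored result. |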
| AlgorithmConfig algorithm_config; |
| if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find( |
| conv_parameters, &algorithm_config)) { |
| #if GOOGLE_CUDA |
| |
| se::TfAllocatorAdapter tf_allocator_adapter( |
| stream->parent()->platform(), ctx->device()->GetAllocator({})); |
| |
| se::cuda::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter, |
| se::cuda::PtxCompilationOptions()); |
| |
| se::DeviceMemory<T> in_backprop_ptr_rz( |
| WrapRedzoneBestEffort(&rz_allocator, in_backprop_ptr)); |
| |
| std::vector<AlgorithmDesc> algorithms; |
| CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( |
| conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()), |
| &algorithms)); |
| std::vector<tensorflow::AutotuneResult> results; |
| for (auto profile_algorithm : algorithms) { |
| // TODO(zhengxq): profile each algorithm multiple times for better |
| // accuracy. |
| DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, |
| ctx); |
| se::cuda::RedzoneAllocator rz_scratch_allocator( |
| stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(), |
| /*memory_limit=*/ConvolveBackwardDataScratchSize); |
| se::ScratchAllocator* allocator_used = |
| !RedzoneCheckDisabled() |
| ? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator) |
| : static_cast<se::ScratchAllocator*>(&scratch_allocator); |
| ProfileResult profile_result; |
| bool cudnn_launch_status = |
| stream |
| ->ThenConvolveBackwardDataWithAlgorithm( |
| filter_desc, filter_ptr, output_desc, out_backprop_ptr, |
| conv_desc, input_desc, &in_backprop_ptr_rz, allocator_used, |
| AlgorithmConfig(profile_algorithm), &profile_result) |
| .ok(); |
| if (cudnn_launch_status && profile_result.is_valid()) { |
| results.emplace_back(); |
| auto& result = results.back(); |
| result.mutable_conv()->set_algorithm(profile_algorithm.algo_id()); |
| result.mutable_conv()->set_tensor_ops_enabled( |
| profile_algorithm.tensor_ops_enabled()); |
| result.set_scratch_bytes( |
| !RedzoneCheckDisabled() |
| ? rz_scratch_allocator.TotalAllocatedBytesExcludingRedzones() |
| : scratch_allocator.TotalByteSize()); |
| *result.mutable_run_time() = proto_utils::ToDurationProto( |
| absl::Milliseconds(profile_result.elapsed_time_in_ms())); |
| |
| CheckRedzones(rz_scratch_allocator, &result); |
| CheckRedzones(rz_allocator, &result); |
| } |
| } |
| LogConvAutotuneResults( |
| se::dnn::ConvolutionKind::BACKWARD_DATA, se::dnn::ToDataType<T>::value, |
| in_backprop_ptr, filter_ptr, out_backprop_ptr, input_desc, filter_desc, |
| output_desc, conv_desc, stream->parent(), results); |
| OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config)); |
| #elif TENSORFLOW_USE_ROCM |
| // MIOpen has its own Find and autotuner so use it here, passing |
| // default AlgorithmConfig to force a search |
| DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx); |
| ProfileResult best_result; |
| bool miopen_find_status = |
| stream |
| ->ThenConvolveBackwardDataWithAlgorithm( |
| filter_desc, filter_ptr, output_desc, out_backprop_ptr, |
| conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, |
| AlgorithmConfig(), &best_result) |
| .ok(); |
| OP_REQUIRES(ctx, miopen_find_status && best_result.is_valid(), |
| errors::NotFound("Failed to find backwards-data algorithm!")); |
| |
| algorithm_config.set_algorithm(best_result.algorithm()); |
| algorithm_config.set_scratch_size(best_result.scratch_size()); |
| #endif |
| AutoTuneConvBwdData::GetInstance()->Insert(conv_parameters, |
| algorithm_config); |
| } |
| bool cudnn_launch_status = |
| stream |
| ->ThenConvolveBackwardDataWithAlgorithm( |
| filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc, |
| input_desc, &in_backprop_ptr, &scratch_allocator, |
| algorithm_config, nullptr) |
| .ok(); |
| |
| if (!cudnn_launch_status) { |
| ctx->SetStatus(errors::Internal( |
| "DNN Backward Data function launch failure : input shape(", |
| input_shape.DebugString(), ") filter shape(", |
| filter_shape.DebugString(), ")")); |
| return; |
| } |
| |
| if (padding_top != padding_bottom || padding_left != padding_right) { |
| Tensor in_backprop_remove_padding; |
| OP_REQUIRES_OK( |
| ctx, ctx->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(compute_data_format, |
| GetTensorDim(input_shape, data_format, 'N'), |
| GetTensorDim(input_shape, data_format, 'H'), |
| GetTensorDim(input_shape, data_format, 'W'), |
| GetTensorDim(input_shape, data_format, 'C')), |
| &in_backprop_remove_padding)); |
| |
| // Remove the padding that was added to the input shape above. |
| const int64 input_pad_top = padding_top - common_padding_rows; |
| const int64 input_pad_bottom = padding_bottom - common_padding_rows; |
| const int64 input_pad_left = padding_left - common_padding_cols; |
| const int64 input_pad_right = padding_right - common_padding_cols; |
| functor::PadInput<GPUDevice, T, int, 4>()( |
| ctx->template eigen_device<GPUDevice>(), |
| To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop) |
| .tensor<T, 4>()), |
| {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}}, |
| {{static_cast<int>(-input_pad_bottom), |
| static_cast<int>(-input_pad_right)}}, |
| To32Bit(in_backprop_remove_padding.tensor<T, 4>()), |
| compute_data_format); |
| |
| pre_transformed_in_backprop = in_backprop_remove_padding; |
| } |
| |
| if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { |
| VLOG(4) << "Convert the output tensor back from NCHW to NHWC."; |
| auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; |
| functor::NCHWToNHWC<GPUDevice, T, 4>()( |
| ctx->eigen_device<GPUDevice>(), |
| toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(), |
| in_backprop->tensor<T, 4>()); |
| } else { |
| *in_backprop = pre_transformed_in_backprop; |
| } |
| } |
| |
| // Forward declarations of the functor specializations for GPU. |
| namespace functor { |
| #define DECLARE_GPU_SPEC(T) \ |
| template <> \ |
| void ShuffleAndReverse<GPUDevice, T, 4, int>::operator()( \ |
| const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor input, \ |
| const Eigen::DSizes<int, 4>& order, \ |
| const Eigen::array<bool, 4>& reverse_dims, \ |
| typename TTypes<T, 4, int>::Tensor output); \ |
| extern template struct ShuffleAndReverse<GPUDevice, T, 4, int>; \ |
| template <> \ |
| void InflatePadAndShuffle<GPUDevice, T, 4, int>::operator()( \ |
| const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor input, \ |
| const Eigen::DSizes<int, 4>& strides, \ |
| const Eigen::array<Eigen::IndexPair<int>, 4>& pad_dims, \ |
| const Eigen::DSizes<int, 4>& order, \ |
| typename TTypes<T, 4, int>::Tensor output); \ |
| extern template struct InflatePadAndShuffle<GPUDevice, T, 4, int>; \ |
| template <> \ |
| void TransformFilter<GPUDevice, T, int, 4>::operator()( \ |
| const GPUDevice& d, FilterTensorFormat dst_filter_format, \ |
| typename TTypes<T, 4, int>::ConstTensor in, \ |
| typename TTypes<T, 4, int>::Tensor out); \ |
| extern template struct TransformFilter<GPUDevice, T, int, 4>; \ |
| template <> \ |
| void TransformDepth<GPUDevice, T, int>::operator()( \ |
| const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \ |
| const Eigen::DSizes<int, 4>& shuffle, \ |
| typename TTypes<T, 4, int>::Tensor out); \ |
| extern template struct TransformDepth<GPUDevice, T, int>; \ |
| template <> \ |
| void PadInput<GPUDevice, T, int, 4>::operator()( \ |
| const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \ |
| const std::array<int, 2>& padding_left, \ |
| const std::array<int, 2>& padding_right, \ |
| typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \ |
| extern template struct PadInput<GPUDevice, T, int, 4>; |
| |
| DECLARE_GPU_SPEC(float); |
| DECLARE_GPU_SPEC(Eigen::half); |
| DECLARE_GPU_SPEC(double); |
| #undef DECLARE_GPU_SPEC |
| } // namespace functor |
| |
| REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<double>("T") |
| .HostMemory("input_sizes"), |
| Conv2DBackpropInputOp<GPUDevice, double>); |
| REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<float>("T") |
| .HostMemory("input_sizes"), |
| Conv2DBackpropInputOp<GPUDevice, float>); |
| REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<Eigen::half>("T") |
| .HostMemory("input_sizes"), |
| Conv2DBackpropInputOp<GPUDevice, Eigen::half>); |
| |
| // To be used inside depthwise_conv_grad_op.cc. |
| // TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc. |
| template struct LaunchConv2DBackpropInputOp<GPUDevice, float>; |
| template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>; |
| template struct LaunchConv2DBackpropInputOp<GPUDevice, double>; |
| |
| #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| |
| } // namespace tensorflow |