| /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #define EIGEN_USE_THREADS |
| |
| #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| #define EIGEN_USE_GPU |
| #if GOOGLE_CUDA |
| #include "third_party/gpus/cudnn/cudnn.h" |
| #endif // GOOGLE_CUDA |
| |
| #include "tensorflow/core/kernels/conv_2d.h" |
| #include "tensorflow/core/platform/stream_executor.h" |
| #include "tensorflow/core/util/stream_executor_util.h" |
| #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| |
| #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
| #include "tensorflow/core/framework/op_kernel.h" |
| #include "tensorflow/core/framework/register_types.h" |
| #include "tensorflow/core/framework/tensor.h" |
| #include "tensorflow/core/framework/tensor_types.h" |
| #include "tensorflow/core/kernels/fill_functor.h" |
| #include "tensorflow/core/kernels/fused_batch_norm_op.h" |
| #include "tensorflow/core/kernels/redux_functor.h" |
| #include "tensorflow/core/kernels/transpose_functor.h" |
| #include "tensorflow/core/util/env_var.h" |
| #include "tensorflow/core/util/tensor_format.h" |
| |
| namespace tensorflow { |
| using CPUDevice = Eigen::ThreadPoolDevice; |
| using GPUDevice = Eigen::GpuDevice; |
| |
| namespace functor { |
| |
| string ToString(FusedBatchNormActivationMode activation_mode) { |
| switch (activation_mode) { |
| case FusedBatchNormActivationMode::kIdentity: |
| return "Identity"; |
| case FusedBatchNormActivationMode::kRelu: |
| return "Relu"; |
| } |
| } |
| |
| Status ParseActivationMode(OpKernelConstruction* context, |
| FusedBatchNormActivationMode* activation_mode) { |
| string activation_mode_str; |
| TF_RETURN_IF_ERROR(context->GetAttr("activation_mode", &activation_mode_str)); |
| |
| if (activation_mode_str == "Identity") { |
| *activation_mode = FusedBatchNormActivationMode::kIdentity; |
| return Status::OK(); |
| } |
| if (activation_mode_str == "Relu") { |
| *activation_mode = FusedBatchNormActivationMode::kRelu; |
| return Status::OK(); |
| } |
| return errors::InvalidArgument("Unsupported activation mode: ", |
| activation_mode_str); |
| } |
| |
| // Functor used by FusedBatchNormOp to do the computations. |
| template <typename Device, typename T, typename U> |
| struct FusedBatchNorm; |
| // Functor used by FusedBatchNormGradOp to do the computations when |
| // is_training=True. |
| template <typename Device, typename T, typename U> |
| struct FusedBatchNormGrad; |
| |
| #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| using se::DeviceMemory; |
| using se::ScratchAllocator; |
| using se::Stream; |
| using se::port::StatusOr; |
| |
| template <typename U, typename T> |
| DeviceMemory<U> CastDeviceMemory(Tensor* tensor) { |
| return DeviceMemory<U>::MakeFromByteSize( |
| tensor->template flat<T>().data(), |
| tensor->template flat<T>().size() * sizeof(T)); |
| } |
| |
| // A helper to allocate temporary scratch memory for cuDNN BatchNormEx ops. It |
| // takes ownership of the underlying memory. The expectation is that the |
| // memory should stay alive for the duration of the cuDNN BatchNormEx call |
| // itself. |
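| // `AllocateBytes` may be called multiple times; every allocation is retained |
| // in `allocated_tensors_` and released when the allocator is destroyed. |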
| template <typename T> |
| class CudnnBatchNormAllocatorInTemp : public ScratchAllocator { |
| public: |
| ~CudnnBatchNormAllocatorInTemp() override = default; |
| |
| explicit CudnnBatchNormAllocatorInTemp(OpKernelContext* context) |
| : context_(context) {} |
| |
| int64 GetMemoryLimitInBytes() override { |
| return std::numeric_limits<int64>::max(); |
| } |
| |
| StatusOr<DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override { |
| Tensor temporary_memory; |
| const DataType tf_data_type = DataTypeToEnum<T>::v(); |
| int64 allocate_count = |
| Eigen::divup(byte_size, static_cast<int64>(sizeof(T))); |
| Status allocation_status(context_->allocate_temp( |
| tf_data_type, TensorShape({allocate_count}), &temporary_memory)); |
| if (!allocation_status.ok()) { |
| return allocation_status; |
| } |
| // Hold a reference to the allocated tensor until the allocator is |
| // destroyed. |
| allocated_tensors_.push_back(temporary_memory); |
| total_byte_size_ += byte_size; |
| return DeviceMemory<uint8>::MakeFromByteSize( |
| temporary_memory.template flat<T>().data(), |
| temporary_memory.template flat<T>().size() * sizeof(T)); |
| } |
| |
| int64 TotalByteSize() const { return total_byte_size_; } |
| |
| Tensor get_allocated_tensor(int index) const { |
| return allocated_tensors_[index]; |
| } |
| |
| private: |
| int64 total_byte_size_ = 0; |
| OpKernelContext* context_; // not owned |
| std::vector<Tensor> allocated_tensors_; |
| }; |
| |
| // A helper to allocate memory for cuDNN BatchNormEx as a kernel output. It is |
| // used by the forward pass kernel to feed its reserve space output to the |
| // backward pass. The memory is expected to stay alive until the backward pass |
| // has finished. |
| template <typename T> |
| class CudnnBatchNormAllocatorInOutput : public ScratchAllocator { |
| public: |
| ~CudnnBatchNormAllocatorInOutput() override { |
| if (!output_allocated_) { |
| Tensor* dummy_reserve_space = nullptr; |
| OP_REQUIRES_OK(context_, context_->allocate_output(output_index_, {}, |
| &dummy_reserve_space)); |
| } |
| } |
| |
| CudnnBatchNormAllocatorInOutput(OpKernelContext* context, int output_index) |
| : context_(context), output_index_(output_index) {} |
| |
| int64 GetMemoryLimitInBytes() override { |
| return std::numeric_limits<int64>::max(); |
| } |
| |
| StatusOr<DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override { |
| output_allocated_ = true; |
| DCHECK(total_byte_size_ == 0) |
| << "Reserve space allocator can only be called once"; |
| int64 allocate_count = |
| Eigen::divup(byte_size, static_cast<int64>(sizeof(T))); |
| |
| Tensor* temporary_memory = nullptr; |
| Status allocation_status(context_->allocate_output( |
| output_index_, TensorShape({allocate_count}), &temporary_memory)); |
| if (!allocation_status.ok()) { |
| return allocation_status; |
| } |
| total_byte_size_ += byte_size; |
| auto memory_uint8 = DeviceMemory<uint8>::MakeFromByteSize( |
| temporary_memory->template flat<T>().data(), |
| temporary_memory->template flat<T>().size() * sizeof(T)); |
| return StatusOr<DeviceMemory<uint8>>(memory_uint8); |
| } |
| |
| int64 TotalByteSize() const { return total_byte_size_; } |
| |
| private: |
| int64 total_byte_size_ = 0; |
| OpKernelContext* context_; // not owned |
| int output_index_; |
| bool output_allocated_ = false; |
| }; |
| #else |
| // A dummy base class for the non-GPU environment. Its subclasses |
| // CudnnBatchNormAllocatorInTemp and CudnnBatchNormAllocatorInOutput are used |
| // to keep the non-GPU code paths compatible with the GPU ones. |
| class ScratchAllocator { |
| public: |
| virtual ~ScratchAllocator() {} |
| }; |
| |
| template <typename T> |
| class CudnnBatchNormAllocatorInTemp : public ScratchAllocator { |
| public: |
| explicit CudnnBatchNormAllocatorInTemp(OpKernelContext* context) {} |
| }; |
| |
| template <typename T> |
| class CudnnBatchNormAllocatorInOutput : public ScratchAllocator { |
| public: |
| ~CudnnBatchNormAllocatorInOutput() override { |
| Tensor* dummy_reserve_space = nullptr; |
| OP_REQUIRES_OK(context_, context_->allocate_output(output_index_, {}, |
| &dummy_reserve_space)); |
| // Initialize the memory, to avoid sanitizer alerts. |
| dummy_reserve_space->flat<T>()(0) = T(); |
| } |
| CudnnBatchNormAllocatorInOutput(OpKernelContext* context, int output_index) |
| : context_(context), output_index_(output_index) {} |
| |
| private: |
| OpKernelContext* context_; // not owned |
| int output_index_; |
| }; |
| #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| |
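| // Dispatches the output assignment on whether T and U are the same type, so |
| // that the cast below is emitted only when T != U. See the note at the call |
| // site about the performance cost of an unconditional cast. |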
| template <bool IsSame, typename Y, typename X, typename T> |
| struct CastIfNecessary { |
| static inline void process( |
| Y& y, X& x_shifted, const Eigen::DSizes<Eigen::Index, 2>& rest_by_depth, |
| const CPUDevice& d) { |
| y.reshape(rest_by_depth).device(d) = x_shifted.template cast<T>(); |
| } |
| }; |
| |
| template <typename Y, typename X, typename T> |
| struct CastIfNecessary<true, Y, X, T> { |
| static inline void process( |
| Y& y, X& x_shifted, const Eigen::DSizes<Eigen::Index, 2>& rest_by_depth, |
| const CPUDevice& d) { |
| y.reshape(rest_by_depth).device(d) = x_shifted; |
| } |
| }; |
| |
| template <typename T, typename U> |
| struct FusedBatchNorm<CPUDevice, T, U> { |
| void operator()(OpKernelContext* context, const Tensor& x_input, |
| const Tensor& scale_input, const Tensor& offset_input, |
| const Tensor& estimated_mean_input, |
| const Tensor& estimated_variance_input, |
| const Tensor& side_input, U epsilon, |
| FusedBatchNormActivationMode activation_mode, |
| Tensor* y_output, Tensor* batch_mean_output, |
| Tensor* batch_var_output, Tensor* saved_mean_output, |
| Tensor* saved_var_output, TensorFormat tensor_format, |
| ScratchAllocator* reserve_space_allocator, |
| ScratchAllocator* workspace_allocator, bool is_training) { |
| OP_REQUIRES(context, side_input.dim_size(0) == 0, |
| errors::Internal( |
| "The CPU implementation of FusedBatchNorm does not support " |
| "side input.")); |
| OP_REQUIRES(context, |
| activation_mode == FusedBatchNormActivationMode::kIdentity, |
| errors::Internal("The CPU implementation of FusedBatchNorm " |
| "does not support activations.")); |
| Tensor transformed_x; |
| Tensor transformed_y; |
| if (tensor_format == FORMAT_NCHW) { |
| const int64 in_batch = GetTensorDim(x_input, tensor_format, 'N'); |
| const int64 in_rows = GetTensorDim(x_input, tensor_format, 'H'); |
| const int64 in_cols = GetTensorDim(x_input, tensor_format, 'W'); |
| const int64 in_depths = GetTensorDim(x_input, tensor_format, 'C'); |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NHWC, in_batch, |
| in_rows, in_cols, in_depths), |
| &transformed_x)); |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NHWC, in_batch, |
| in_rows, in_cols, in_depths), |
| &transformed_y)); |
| // Perform NCHW to NHWC |
| std::vector<int32> perm = {0, 2, 3, 1}; |
| OP_REQUIRES_OK( |
| context, ::tensorflow::DoTranspose(context->eigen_device<CPUDevice>(), |
| x_input, perm, &transformed_x)); |
| } else { |
| transformed_x = x_input; |
| transformed_y = *y_output; |
| } |
| typename TTypes<T, 4>::Tensor x(transformed_x.tensor<T, 4>()); |
| typename TTypes<U>::ConstVec scale(scale_input.vec<U>()); |
| typename TTypes<U>::ConstVec offset(offset_input.vec<U>()); |
| typename TTypes<U>::ConstVec estimated_mean(estimated_mean_input.vec<U>()); |
| typename TTypes<U>::ConstVec estimated_variance( |
| estimated_variance_input.vec<U>()); |
| typename TTypes<T, 4>::Tensor y(transformed_y.tensor<T, 4>()); |
| typename TTypes<U>::Vec batch_mean(batch_mean_output->vec<U>()); |
| typename TTypes<U>::Vec batch_var(batch_var_output->vec<U>()); |
| typename TTypes<U>::Vec saved_mean(saved_mean_output->vec<U>()); |
| typename TTypes<U>::Vec saved_var(saved_var_output->vec<U>()); |
| |
| const CPUDevice& d = context->eigen_device<CPUDevice>(); |
| |
| const int depth = x.dimension(3); |
| const int size = x.size(); |
| const int rest_size = size / depth; |
| Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth); |
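| // For example, an NHWC input of shape [2, 3, 3, 8] is viewed below as a |
| // [18, 8] matrix: rest_size = 2 * 3 * 3 = 18 rows and depth = 8 channels. |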
| |
| #if !defined(EIGEN_HAS_INDEX_LIST) |
| Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth); |
| Eigen::array<int, 1> reduce_dims({0}); |
| Eigen::array<int, 2> bcast_spec({rest_size, 1}); |
| #else |
| Eigen::IndexList<Eigen::type2index<1>, Eigen::Index> one_by_depth; |
| one_by_depth.set(1, depth); |
| Eigen::IndexList<Eigen::type2index<0>> reduce_dims; |
| Eigen::IndexList<Eigen::Index, Eigen::type2index<1>> bcast_spec; |
| bcast_spec.set(0, rest_size); |
| #endif |
| |
| auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<U>(); |
| const int rest_size_minus_one = (rest_size > 1) ? (rest_size - 1) : 1; |
| U rest_size_inv = static_cast<U>(1.0f / static_cast<U>(rest_size)); |
| // This adjustment is for Bessel's correction |
| U rest_size_adjust = |
| static_cast<U>(rest_size) / static_cast<U>(rest_size_minus_one); |
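| // For example, with rest_size = 18 the population variance computed below is |
| // scaled by 18 / 17 to produce the unbiased estimate stored in batch_var. |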
| |
| Eigen::Tensor<U, 1, Eigen::RowMajor> mean(depth); |
| Eigen::Tensor<U, 1, Eigen::RowMajor> variance(depth); |
| if (is_training) { |
| mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv); |
| batch_mean.device(d) = mean; |
| saved_mean.device(d) = mean; |
| } else { |
| mean.device(d) = estimated_mean; |
| } |
| |
| auto x_centered = |
| x_rest_by_depth - mean.reshape(one_by_depth).broadcast(bcast_spec); |
| |
| if (is_training) { |
| variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv; |
| batch_var.device(d) = variance * rest_size_adjust; |
| saved_var.device(d) = variance; |
| } else { |
| variance.device(d) = estimated_variance; |
| } |
| |
| auto scaling_factor = ((variance + epsilon).rsqrt() * scale) |
| .eval() |
| .reshape(one_by_depth) |
| .broadcast(bcast_spec); |
| auto x_scaled = x_centered * scaling_factor; |
| auto x_shifted = |
| x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec); |
| |
| // Explicitly checks the types of T and U and only casts x_shifted when |
| // T != U. (Not doing so caused a 35-50% performance slowdown for |
| // some compiler flags.) |
| CastIfNecessary<std::is_same<T, U>::value, decltype(y), decltype(x_shifted), |
| T>::process(y, x_shifted, rest_by_depth, d); |
| |
| if (tensor_format == FORMAT_NCHW) { |
| // Perform NHWC to NCHW |
| std::vector<int32> perm = {0, 3, 1, 2}; |
| OP_REQUIRES_OK( |
| context, ::tensorflow::DoTranspose(context->eigen_device<CPUDevice>(), |
| transformed_y, perm, y_output)); |
| } |
| } |
| }; |
| |
| template <typename T, typename U> |
| struct FusedBatchNormGrad<CPUDevice, T, U> { |
| void operator()(OpKernelContext* context, const Tensor& y_backprop_input, |
| const Tensor& x_input, const Tensor& scale_input, |
| const Tensor& mean_input, const Tensor& variance_input, |
| U epsilon, Tensor* x_backprop_output, |
| Tensor* scale_backprop_output, Tensor* offset_backprop_output, |
| const Tensor* reserve_space, |
| ScratchAllocator* workspace_allocator, |
| TensorFormat tensor_format) { |
| Tensor transformed_y_backprop_input; |
| Tensor transformed_x_input; |
| Tensor transformed_x_backprop_output; |
| if (tensor_format == FORMAT_NCHW) { |
| const int64 in_batch = GetTensorDim(x_input, tensor_format, 'N'); |
| const int64 in_rows = GetTensorDim(x_input, tensor_format, 'H'); |
| const int64 in_cols = GetTensorDim(x_input, tensor_format, 'W'); |
| const int64 in_depths = GetTensorDim(x_input, tensor_format, 'C'); |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NHWC, in_batch, |
| in_rows, in_cols, in_depths), |
| &transformed_y_backprop_input)); |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NHWC, in_batch, |
| in_rows, in_cols, in_depths), |
| &transformed_x_input)); |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NHWC, in_batch, |
| in_rows, in_cols, in_depths), |
| &transformed_x_backprop_output)); |
| // Perform NCHW to NHWC |
| std::vector<int32> perm = {0, 2, 3, 1}; |
| OP_REQUIRES_OK( |
| context, ::tensorflow::DoTranspose(context->eigen_device<CPUDevice>(), |
| y_backprop_input, perm, |
| &transformed_y_backprop_input)); |
| OP_REQUIRES_OK(context, ::tensorflow::DoTranspose( |
| context->eigen_device<CPUDevice>(), x_input, |
| perm, &transformed_x_input)); |
| } else { |
| transformed_y_backprop_input = y_backprop_input; |
| transformed_x_input = x_input; |
| transformed_x_backprop_output = *x_backprop_output; |
| } |
| typename TTypes<T, 4>::Tensor y_backprop( |
| transformed_y_backprop_input.tensor<T, 4>()); |
| typename TTypes<T, 4>::Tensor x(transformed_x_input.tensor<T, 4>()); |
| typename TTypes<U>::ConstVec scale(scale_input.vec<U>()); |
| typename TTypes<U>::ConstVec mean(mean_input.vec<U>()); |
| typename TTypes<U>::ConstVec variance(variance_input.vec<U>()); |
| typename TTypes<T, 4>::Tensor x_backprop( |
| transformed_x_backprop_output.tensor<T, 4>()); |
| typename TTypes<U>::Vec offset_backprop(offset_backprop_output->vec<U>()); |
| |
| // Note: the following formulas are used to compute the gradients for |
| // back propagation. |
| // x_backprop = scale * rsqrt(variance + epsilon) * |
| // [y_backprop - mean(y_backprop) - (x - mean(x)) * |
| // mean(y_backprop * (x - mean(x))) / (variance + epsilon)] |
| // scale_backprop = sum(y_backprop * |
| // (x - mean(x)) * rsqrt(variance + epsilon)) |
| // offset_backprop = sum(y_backprop) |
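| // In the code below these expressions map onto: |
| // coef0 = rsqrt(variance + epsilon) |
| // coef1 = scale * coef0 |
| // coef2 = coef0^2 * mean(y_backprop * (x - mean(x))) |
| // so that x_backprop = coef1 * (y_backprop_centered - x_centered * coef2). |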
| |
| const CPUDevice& d = context->eigen_device<CPUDevice>(); |
| const int depth = x.dimension(3); |
| const int size = x.size(); |
| const int rest_size = size / depth; |
| Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth); |
| |
| #if !defined(EIGEN_HAS_INDEX_LIST) |
| Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth); |
| Eigen::array<int, 2> bcast_spec({rest_size, 1}); |
| #else |
| Eigen::IndexList<Eigen::type2index<1>, Eigen::Index> one_by_depth; |
| one_by_depth.set(1, depth); |
| Eigen::IndexList<Eigen::Index, Eigen::type2index<1>> bcast_spec; |
| bcast_spec.set(0, rest_size); |
| #endif |
| |
| auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<U>(); |
| U rest_size_inv = static_cast<U>(1.0f / static_cast<U>(rest_size)); |
| |
| // Eigen is notoriously bad at reducing the outer dimension, so we |
| // materialize all temporary tensors that require reduction, and then use |
| // the Eigen redux functor, which is optimized for this particular task. |
| // |
| // All reductions are of this type: [rest_size, depth] -> [depth]. |
| using ScalarSum = Eigen::internal::scalar_sum_op<U>; |
| const functor::ReduceOuterDimensions<T, U, U, ScalarSum> redux_sum_t; |
| const functor::ReduceOuterDimensions<U, U, U, ScalarSum> redux_sum_u; |
| |
| auto scratch_dtype = DataTypeToEnum<U>::value; |
| |
| // Allocate a temporary workspace of [depth] shape. |
| Tensor scratch_one_by_depth; |
| OP_REQUIRES_OK(context, context->allocate_temp(scratch_dtype, {depth}, |
| &scratch_one_by_depth)); |
| |
| // Maybe allocate a temporary workspace of [rest_size, depth] shape. |
| Tensor scratch_rest_by_depth; |
| if (std::is_same<T, U>::value) { |
| OP_REQUIRES(context, |
| scratch_rest_by_depth.CopyFrom(transformed_x_backprop_output, |
| {rest_size, depth}), |
| errors::Internal("Failed to copy a tensor")); |
| } else { |
| OP_REQUIRES_OK(context, |
| context->allocate_temp(scratch_dtype, {rest_size, depth}, |
| &scratch_rest_by_depth)); |
| } |
| |
| typename TTypes<U, 2>::Tensor scratch_tensor( |
| scratch_rest_by_depth.tensor<U, 2>()); |
| typename TTypes<U>::Vec scratch_vector(scratch_one_by_depth.vec<U>()); |
| |
| auto x_mean_rest_by_depth = |
| mean.reshape(one_by_depth).broadcast(bcast_spec); |
| auto x_centered = (x_rest_by_depth - x_mean_rest_by_depth); |
| auto coef0_one_by_depth = |
| (variance.reshape(one_by_depth) + epsilon).rsqrt(); |
| auto coef0_rest_by_depth = coef0_one_by_depth.broadcast(bcast_spec); |
| auto x_scaled = x_centered * coef0_rest_by_depth; |
| |
| auto y_backprop_rest_by_depth = |
| y_backprop.reshape(rest_by_depth).template cast<U>(); |
| |
| // Compute `scale_backprop_output`: |
| // scale_backprop = |
| // (y_backprop_rest_by_depth * x_scaled).sum(reduce_dims) |
| scratch_tensor.device(d) = y_backprop_rest_by_depth * x_scaled; |
| redux_sum_u(d, rest_by_depth, scratch_rest_by_depth, scale_backprop_output); |
| |
| // Compute `offset_backprop_output`: |
| // offset_backprop = |
| // y_backprop_rest_by_depth.sum(reduce_dims) |
| redux_sum_t(d, rest_by_depth, transformed_y_backprop_input, |
| offset_backprop_output); |
| auto y_backprop_sum = offset_backprop; |
| |
| auto y_backprop_sum_one_by_depth = y_backprop_sum.reshape(one_by_depth); |
| auto y_backprop_mean_one_by_depth = |
| y_backprop_sum_one_by_depth * rest_size_inv; |
| auto y_backprop_mean_rest_by_depth = |
| y_backprop_mean_one_by_depth.broadcast(bcast_spec); |
| auto y_backprop_centered = |
| y_backprop_rest_by_depth - y_backprop_mean_rest_by_depth; |
| |
| // Compute expression: |
| // y_backprop_centered_mean = |
| // (y_backprop_rest_by_depth * x_centered).mean(reduce_dims) |
| scratch_tensor.device(d) = y_backprop_rest_by_depth * x_centered; |
| redux_sum_u(d, rest_by_depth, scratch_rest_by_depth, &scratch_one_by_depth); |
| auto y_backprop_centered_mean = |
| scratch_vector.reshape(one_by_depth) / static_cast<U>(rest_size); |
| |
| auto coef1 = (scale.reshape(one_by_depth) * coef0_one_by_depth) |
| .broadcast(bcast_spec); |
| auto coef2 = (coef0_one_by_depth.square() * y_backprop_centered_mean) |
| .broadcast(bcast_spec); |
| |
| x_backprop.reshape(rest_by_depth).device(d) = |
| (coef1 * (y_backprop_centered - x_centered * coef2)).template cast<T>(); |
| |
| if (tensor_format == FORMAT_NCHW) { |
| // Perform NHWC to NCHW |
| std::vector<int32> perm = {0, 3, 1, 2}; |
| OP_REQUIRES_OK( |
| context, ::tensorflow::DoTranspose(context->eigen_device<CPUDevice>(), |
| transformed_x_backprop_output, |
| perm, x_backprop_output)); |
| } |
| } |
| }; |
| |
| template <typename T, typename U> |
| struct FusedBatchNormFreezeGrad<CPUDevice, T, U> { |
| void operator()(OpKernelContext* context, const Tensor& y_backprop_input, |
| const Tensor& x_input, const Tensor& scale_input, |
| const Tensor& pop_mean_input, |
| const Tensor& pop_variance_input, U epsilon, |
| Tensor* x_backprop_output, Tensor* scale_backprop_output, |
| Tensor* offset_backprop_output) { |
| typename TTypes<T, 4>::ConstTensor y_backprop( |
| y_backprop_input.tensor<T, 4>()); |
| typename TTypes<T, 4>::ConstTensor input(x_input.tensor<T, 4>()); |
| typename TTypes<U>::ConstVec scale(scale_input.vec<U>()); |
| typename TTypes<U>::ConstVec pop_mean(pop_mean_input.vec<U>()); |
| typename TTypes<U>::ConstVec pop_var(pop_variance_input.vec<U>()); |
| typename TTypes<T, 4>::Tensor x_backprop(x_backprop_output->tensor<T, 4>()); |
| typename TTypes<U>::Vec scale_backprop(scale_backprop_output->vec<U>()); |
| |
| const int depth = pop_mean.dimension(0); |
| const int rest_size = input.size() / depth; |
| |
| const CPUDevice& d = context->eigen_device<CPUDevice>(); |
| |
| // Allocate two temporary workspaces of [depth] shape. |
| Tensor scratch1_vec, scratch2_vec; |
| OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<U>::value, |
| {depth}, &scratch1_vec)); |
| OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<U>::value, |
| {depth}, &scratch2_vec)); |
| |
| // Maybe allocate a temporary workspace of [rest_size, depth] shape. |
| Tensor scratch3_tensor; |
| if (std::is_same<T, U>::value) { |
| OP_REQUIRES( |
| context, |
| scratch3_tensor.CopyFrom(*x_backprop_output, {rest_size, depth}), |
| errors::Internal("Failed to copy a tensor")); |
| } else { |
| OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<U>::value, |
| {rest_size, depth}, |
| &scratch3_tensor)); |
| } |
| |
| typename TTypes<U>::Vec scratch1(scratch1_vec.vec<U>()); |
| typename TTypes<U>::Vec scratch2(scratch2_vec.vec<U>()); |
| typename TTypes<U, 2>::Tensor scratch3(scratch3_tensor.tensor<U, 2>()); |
| |
| Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth); |
| #if !defined(EIGEN_HAS_INDEX_LIST) |
| Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth); |
| Eigen::array<int, 2> rest_by_one({rest_size, 1}); |
| #else |
| Eigen::IndexList<Eigen::type2index<1>, Eigen::Index> one_by_depth; |
| one_by_depth.set(1, depth); |
| Eigen::IndexList<Eigen::Index, Eigen::type2index<1>> rest_by_one; |
| rest_by_one.set(0, rest_size); |
| #endif |
| |
| // Sum reduction along the 0th dimension using custom CPU functor. |
| using ScalarSum = Eigen::internal::scalar_sum_op<U>; |
| const functor::ReduceOuterDimensions<T, U, U, ScalarSum> redux_sum_t; |
| const functor::ReduceOuterDimensions<U, U, U, ScalarSum> redux_sum_u; |
| |
| // offset_backprop = sum(y_backprop) |
| // scale_backprop = sum(y_backprop * |
| // (x - pop_mean) * rsqrt(pop_var + epsilon)) |
| // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon)) |
| |
| // NOTE: The DEFAULT DEVICE comment marks expression assignments that we |
| // don't want to be executed in a thread pool. |
| |
| auto y_backprop_rest_by_depth = |
| y_backprop.reshape(rest_by_depth).template cast<U>(); |
| auto input_rest_by_depth = input.reshape(rest_by_depth).template cast<U>(); |
| |
| // offset_backprop = sum(y_backprop) |
| redux_sum_t(d, rest_by_depth, y_backprop_input, offset_backprop_output); |
| |
| // scratch1 = rsqrt(pop_var + epsilon) |
| scratch1 = (pop_var + pop_var.constant(epsilon)).rsqrt(); // DEFAULT DEVICE |
| |
| // scratch2 = sum(y_backprop * (x - pop_mean)) |
| scratch3.device(d) = |
| y_backprop_rest_by_depth * |
| (input_rest_by_depth - |
| pop_mean.reshape(one_by_depth).broadcast(rest_by_one)); |
| redux_sum_u(d, rest_by_depth, scratch3_tensor, &scratch2_vec); |
| |
| x_backprop.reshape(rest_by_depth).device(d) = |
| (y_backprop_rest_by_depth * |
| ((scratch1.reshape(one_by_depth) * scale.reshape(one_by_depth)) |
| .broadcast(rest_by_one))) |
| .template cast<T>(); |
| scale_backprop = scratch2 * scratch1; // DEFAULT DEVICE |
| } |
| }; |
| |
| #if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM |
| namespace { |
| // See implementation under GOOGLE_CUDA #ifdef below. |
| bool BatchnormSpatialPersistentEnabled() { return false; } |
| } // namespace |
| #endif |
| |
| #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| |
| namespace { |
| |
| se::dnn::ActivationMode AsDnnActivationMode( |
| const FusedBatchNormActivationMode activation_mode) { |
| switch (activation_mode) { |
| case FusedBatchNormActivationMode::kIdentity: |
| return se::dnn::ActivationMode::kNone; |
| case FusedBatchNormActivationMode::kRelu: |
| return se::dnn::ActivationMode::kRelu; |
| } |
| } |
| |
| // NOTE(ezhulenev): See the `BatchnormSpatialPersistentEnabled` documentation |
| // in `cuda_dnn.cc` for details. |
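| // The mode is opt-in: set the TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT |
| // environment variable to true to enable it. |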
| bool BatchnormSpatialPersistentEnabled() { |
| #if CUDNN_VERSION >= 7402 |
| static bool is_enabled = [] { |
| bool is_enabled = false; |
| TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar( |
| "TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT", |
| /*default_val=*/false, &is_enabled)); |
| return is_enabled; |
| }(); |
| return is_enabled; |
| #else |
| return false; |
| #endif |
| } |
| } // namespace |
| |
| template <typename T, typename U> |
| struct FusedBatchNorm<GPUDevice, T, U> { |
| void operator()(OpKernelContext* context, const Tensor& x, |
| const Tensor& scale, const Tensor& offset, |
| const Tensor& estimated_mean, |
| const Tensor& estimated_variance, const Tensor& side_input, |
| U epsilon, FusedBatchNormActivationMode activation_mode, |
| Tensor* y, Tensor* batch_mean, Tensor* batch_var, |
| Tensor* saved_mean, Tensor* saved_inv_var, |
| TensorFormat tensor_format, |
| ScratchAllocator* reserve_space_allocator, |
| ScratchAllocator* workspace_allocator, bool is_training) { |
| auto* stream = context->op_device_context()->stream(); |
| OP_REQUIRES(context, stream, errors::Internal("No GPU stream available")); |
| |
| const int64 batch_size = GetTensorDim(x, tensor_format, 'N'); |
| const int64 channels = GetTensorDim(x, tensor_format, 'C'); |
| const int64 height = GetTensorDim(x, tensor_format, 'H'); |
| const int64 width = GetTensorDim(x, tensor_format, 'W'); |
| |
| // The reserve_space_3 output exists only in the FusedBatchNormV3 op, and in |
| // that case we pass non-nullptr allocators. |
| const bool has_reserve_space_3 = |
| reserve_space_allocator != nullptr && workspace_allocator != nullptr; |
| |
| // Check if cuDNN batch normalization has a fast NHWC implementation: |
| // (1) In inference mode it's always fast. |
| // (2) In training mode it's fast if TensorFlow enabled batchnorm spatial |
| // persistence, the data type is DT_HALF, and FusedBatchNormV3 passed |
| // non-null allocators. |
| const bool fast_nhwc_batch_norm = |
| !is_training || |
| (BatchnormSpatialPersistentEnabled() && |
| DataTypeToEnum<T>::value == DT_HALF && has_reserve_space_3); |
| |
| // If the input tensor is in NHWC format and we have a fast cuDNN |
| // implementation, there is no need to convert the data format. |
| TensorFormat compute_format = |
| fast_nhwc_batch_norm && tensor_format == FORMAT_NHWC ? FORMAT_NHWC |
| : FORMAT_NCHW; |
| |
| VLOG(2) << "FusedBatchNorm:" |
| << " batch_size: " << batch_size << " channels: " << channels |
| << " height: " << height << " width: " << width |
| << " x shape: " << x.shape().DebugString() |
| << " scale shape: " << scale.shape().DebugString() |
| << " offset shape: " << offset.shape().DebugString() |
| << " side input shape: " << side_input.shape().DebugString() |
| << " activation mode: " << ToString(activation_mode) |
| << " tensor format: " << ToString(tensor_format) |
| << " compute format: " << ToString(compute_format); |
| |
| // If the input is empty, fill the output mean and variance with NaNs. |
| if (x.shape().num_elements() == 0) { |
| functor::SetNanFunctor<U> f; |
| f(context->eigen_device<GPUDevice>(), batch_mean->flat<U>()); |
| f(context->eigen_device<GPUDevice>(), batch_var->flat<U>()); |
| return; |
| } |
| |
| // In inference mode we use a custom CUDA kernel, because cuDNN does not |
| // support side inputs and activations for inference. |
| const bool has_side_input = side_input.dim_size(0) != 0; |
| const bool has_activation = |
| activation_mode != FusedBatchNormActivationMode::kIdentity; |
| |
| if (!is_training && (has_side_input || has_activation)) { |
| FusedBatchNormInferenceFunctor<GPUDevice, T, U> inference_functor; |
| |
| if (has_side_input) { |
| inference_functor(context, tensor_format, x.tensor<T, 4>(), |
| scale.vec<U>(), offset.vec<U>(), |
| estimated_mean.vec<U>(), estimated_variance.vec<U>(), |
| side_input.tensor<T, 4>(), epsilon, activation_mode, |
| y->tensor<T, 4>()); |
| } else { |
| typename TTypes<T, 4>::ConstTensor empty_tensor(nullptr, 0, 0, 0, 0); |
| inference_functor(context, tensor_format, x.tensor<T, 4>(), |
| scale.vec<U>(), offset.vec<U>(), |
| estimated_mean.vec<U>(), estimated_variance.vec<U>(), |
| empty_tensor, epsilon, activation_mode, |
| y->tensor<T, 4>()); |
| } |
| |
| return; |
| } |
| |
| Tensor x_maybe_transformed = x; |
| Tensor x_transformed; |
| Tensor y_transformed; |
| se::DeviceMemory<T> y_ptr; |
| |
| if (tensor_format == compute_format) { |
| y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*y); |
| } else if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) { |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(compute_format, batch_size, |
| height, width, channels), |
| &x_transformed)); |
| functor::NHWCToNCHW<GPUDevice, T, 4>()( |
| context->eigen_device<GPUDevice>(), |
| const_cast<const Tensor&>(x_maybe_transformed).tensor<T, 4>(), |
| x_transformed.tensor<T, 4>()); |
| x_maybe_transformed = x_transformed; |
| |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(compute_format, batch_size, |
| height, width, channels), |
| &y_transformed)); |
| y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(y_transformed); |
| } else { |
| context->SetStatus(errors::Internal( |
| "Unsupported tensor format: ", ToString(tensor_format), |
| " and compute format: ", ToString(compute_format))); |
| return; |
| } |
| |
| const se::dnn::DataLayout data_layout = |
| compute_format == FORMAT_NHWC ? se::dnn::DataLayout::kBatchYXDepth |
| : se::dnn::DataLayout::kBatchDepthYX; |
| |
| se::dnn::BatchDescriptor x_desc; |
| x_desc.set_count(batch_size) |
| .set_feature_map_count(channels) |
| .set_height(height) |
| .set_width(width) |
| .set_layout(data_layout); |
| |
| se::dnn::BatchDescriptor scale_offset_desc; |
| scale_offset_desc.set_count(1) |
| .set_feature_map_count(channels) |
| .set_height(1) |
| .set_width(1) |
| .set_layout(se::dnn::DataLayout::kBatchDepthYX); |
| |
| auto x_ptr = StreamExecutorUtil::AsDeviceMemory<T>(x_maybe_transformed); |
| auto scale_ptr = StreamExecutorUtil::AsDeviceMemory<U>(scale); |
| auto offset_ptr = StreamExecutorUtil::AsDeviceMemory<U>(offset); |
| auto estimated_mean_ptr = |
| StreamExecutorUtil::AsDeviceMemory<U>(estimated_mean); |
| auto estimated_variance_ptr = |
| StreamExecutorUtil::AsDeviceMemory<U>(estimated_variance); |
| auto side_input_ptr = StreamExecutorUtil::AsDeviceMemory<T>(side_input); |
| auto batch_mean_ptr = StreamExecutorUtil::AsDeviceMemory<U>(*batch_mean); |
| |
| auto batch_var_ptr = StreamExecutorUtil::AsDeviceMemory<U>(*batch_var); |
| auto saved_mean_ptr = StreamExecutorUtil::AsDeviceMemory<U>(*saved_mean); |
| auto saved_inv_var_ptr = |
| StreamExecutorUtil::AsDeviceMemory<U>(*saved_inv_var); |
| |
| GPUDevice d = context->eigen_device<GPUDevice>(); |
| using se::DeviceMemory; |
| Tensor inv_var; |
| OP_REQUIRES_OK( |
| context, context->allocate_temp(DataTypeToEnum<U>::value, |
| estimated_variance.shape(), &inv_var)); |
| auto inv_var_ptr = StreamExecutorUtil::AsDeviceMemory<U>(inv_var); |
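| // cuDNN works with the inverse variance internally: `var_to_inv_var` |
| // converts the estimated variance input into an inverse variance on demand, |
| // and `inv_var_to_var` converts the inverse variance produced by cuDNN back |
| // into the variance reported in `batch_var`. |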
| std::function<const DeviceMemory<U>&()> var_to_inv_var = |
| [d, epsilon, estimated_variance, |
| &inv_var_ptr]() -> const DeviceMemory<U>& { |
| auto estimated_variance_ptr = |
| StreamExecutorUtil::AsDeviceMemory<U>(estimated_variance); |
| const U* variance = |
| static_cast<const U*>(estimated_variance_ptr.opaque()); |
| U* inv_variance = static_cast<U*>(inv_var_ptr.opaque()); |
| int channels = inv_var_ptr.ElementCount(); |
| VarianceToInvVariance<U>()(d, variance, epsilon, channels, inv_variance); |
| return inv_var_ptr; |
| }; |
| const int64 sample_size = batch_size * height * width; |
| std::function<void()> inv_var_to_var = [d, &batch_var_ptr, epsilon, |
| sample_size]() { |
| U* variance = static_cast<U*>(batch_var_ptr.opaque()); |
| int channels = batch_var_ptr.ElementCount(); |
| InvVarianceToVariance<U>()(d, epsilon, sample_size, channels, variance); |
| }; |
| |
| bool cudnn_launch_status = |
| stream |
| ->ThenBatchNormalizationForward( |
| x_ptr, scale_ptr, offset_ptr, estimated_mean_ptr, |
| estimated_variance_ptr, side_input_ptr, x_desc, |
| scale_offset_desc, static_cast<double>(epsilon), |
| AsDnnActivationMode(activation_mode), &y_ptr, &batch_mean_ptr, |
| &batch_var_ptr, &saved_mean_ptr, &saved_inv_var_ptr, |
| is_training, std::move(var_to_inv_var), |
| std::move(inv_var_to_var), reserve_space_allocator, |
| workspace_allocator) |
| .ok(); |
| |
| if (!cudnn_launch_status) { |
| context->SetStatus( |
| errors::Internal("cuDNN launch failure : input shape (", |
| x.shape().DebugString(), ")")); |
| } |
| |
| if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) { |
| functor::NCHWToNHWC<GPUDevice, T, 4>()( |
| context->eigen_device<GPUDevice>(), |
| const_cast<const Tensor&>(y_transformed).tensor<T, 4>(), |
| y->tensor<T, 4>()); |
| } |
| } |
| }; |
| |
| template <typename T, typename U> |
| struct FusedBatchNormGrad<GPUDevice, T, U> { |
| void operator()(OpKernelContext* context, const Tensor& y_backprop, |
| const Tensor& x, const Tensor& scale, const Tensor& mean, |
| const Tensor& inv_variance, U epsilon, Tensor* x_backprop, |
| Tensor* scale_backprop, Tensor* offset_backprop, |
| const Tensor* reserve_space, |
| ScratchAllocator* workspace_allocator, |
| TensorFormat tensor_format) { |
| auto* stream = context->op_device_context()->stream(); |
| OP_REQUIRES(context, stream, errors::Internal("No GPU stream available")); |
| |
| const int64 batch_size = GetTensorDim(x, tensor_format, 'N'); |
| const int64 channels = GetTensorDim(x, tensor_format, 'C'); |
| const int64 height = GetTensorDim(x, tensor_format, 'H'); |
| const int64 width = GetTensorDim(x, tensor_format, 'W'); |
| |
| // Check if cuDNN batch normalization has a fast NHWC implementation: |
| // (1) TensorFlow enabled batchnorm spatial persistence, the data type is |
| // DT_HALF, and FusedBatchNormGradV3 passed a non-null reserve space and |
| // workspace allocator. |
| const bool fast_nhwc_batch_norm = BatchnormSpatialPersistentEnabled() && |
| DataTypeToEnum<T>::value == DT_HALF && |
| reserve_space != nullptr && |
| workspace_allocator != nullptr; |
| |
| // If the input tensor is in NHWC format and we have a fast cuDNN |
| // implementation, there is no need to convert the data format. |
| TensorFormat compute_format = |
| fast_nhwc_batch_norm && tensor_format == FORMAT_NHWC ? FORMAT_NHWC |
| : FORMAT_NCHW; |
| |
| VLOG(2) << "FusedBatchNormGrad:" |
| << " batch_size: " << batch_size << " channels: " << channels |
| << " height: " << height << " width: " << width |
| << " y_backprop shape: " << y_backprop.shape().DebugString() |
| << " x shape: " << x.shape().DebugString() |
| << " scale shape: " << scale.shape().DebugString() |
| << " tensor format: " << ToString(tensor_format) |
| << " compute format: " << ToString(compute_format); |
| |
| // Inputs |
| Tensor y_backprop_maybe_transformed = y_backprop; |
| Tensor x_maybe_transformed = x; |
| Tensor y_backprop_transformed; |
| Tensor x_transformed; |
| |
| // Outputs |
| Tensor x_backprop_transformed; |
| se::DeviceMemory<T> x_backprop_ptr; |
| |
| if (tensor_format == compute_format) { |
| x_backprop_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*x_backprop); |
| } else if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) { |
| // Transform inputs from 'NHWC' to 'NCHW' |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NCHW, batch_size, |
| height, width, channels), |
| &y_backprop_transformed)); |
| functor::NHWCToNCHW<GPUDevice, T, 4>()( |
| context->eigen_device<GPUDevice>(), |
| const_cast<const Tensor&>(y_backprop_maybe_transformed) |
| .tensor<T, 4>(), |
| y_backprop_transformed.tensor<T, 4>()); |
| y_backprop_maybe_transformed = y_backprop_transformed; |
| |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NCHW, batch_size, |
| height, width, channels), |
| &x_transformed)); |
| functor::NHWCToNCHW<GPUDevice, T, 4>()( |
| context->eigen_device<GPUDevice>(), |
| const_cast<const Tensor&>(x_maybe_transformed).tensor<T, 4>(), |
| x_transformed.tensor<T, 4>()); |
| x_maybe_transformed = x_transformed; |
| |
| // Allocate memory for transformed outputs in 'NCHW' |
| OP_REQUIRES_OK(context, context->allocate_temp( |
| DataTypeToEnum<T>::value, |
| ShapeFromFormat(FORMAT_NCHW, batch_size, |
| height, width, channels), |
| &x_backprop_transformed)); |
| x_backprop_ptr = |
| StreamExecutorUtil::AsDeviceMemory<T>(x_backprop_transformed); |
| } else { |
| context->SetStatus(errors::Internal( |
| "Unsupported tensor format: ", ToString(tensor_format), |
| " and compute format: ", ToString(compute_format))); |
| return; |
| } |
| |
| const se::dnn::DataLayout data_layout = |
| compute_format == FORMAT_NHWC ? se::dnn::DataLayout::kBatchYXDepth |
| : se::dnn::DataLayout::kBatchDepthYX; |
| |
| se::dnn::BatchDescriptor x_desc; |
| x_desc.set_count(batch_size) |
| .set_feature_map_count(channels) |
| .set_height(height) |
| .set_width(width) |
| .set_layout(data_layout); |
| |
| se::dnn::BatchDescriptor scale_offset_desc; |
| scale_offset_desc.set_count(1) |
| .set_feature_map_count(channels) |
| .set_height(1) |
| .set_width(1) |
| .set_layout(se::dnn::DataLayout::kBatchDepthYX); |
| |
| auto y_backprop_ptr = |
| StreamExecutorUtil::AsDeviceMemory<T>(y_backprop_maybe_transformed); |
| auto x_ptr = StreamExecutorUtil::AsDeviceMemory<T>(x_maybe_transformed); |
| auto scale_ptr = StreamExecutorUtil::AsDeviceMemory<U>(scale); |
| auto mean_ptr = StreamExecutorUtil::AsDeviceMemory<U>(mean); |
| auto inv_variance_ptr = StreamExecutorUtil::AsDeviceMemory<U>(inv_variance); |
| auto scale_backprop_ptr = |
| StreamExecutorUtil::AsDeviceMemory<U>(*scale_backprop); |
| auto offset_backprop_ptr = |
| StreamExecutorUtil::AsDeviceMemory<U>(*offset_backprop); |
| |
| // The cuDNN kernel outputs the inverse variance in the forward pass and |
| // reuses it in the backward pass. |
| DeviceMemory<uint8>* reserve_space_data = nullptr; |
| if (reserve_space != nullptr && reserve_space->dims() != 0) { |
| auto reserve_space_uint8 = functor::CastDeviceMemory<uint8, U>( |
| const_cast<Tensor*>(reserve_space)); |
| reserve_space_data = &reserve_space_uint8; |
| } |
| bool cudnn_launch_status = |
| stream |
| ->ThenBatchNormalizationBackward( |
| y_backprop_ptr, x_ptr, scale_ptr, mean_ptr, inv_variance_ptr, |
| x_desc, scale_offset_desc, static_cast<double>(epsilon), |
| &x_backprop_ptr, &scale_backprop_ptr, &offset_backprop_ptr, |
| reserve_space_data, workspace_allocator) |
| .ok(); |
| |
| if (!cudnn_launch_status) { |
| context->SetStatus( |
| errors::Internal("cuDNN launch failure : input shape (", |
| x.shape().DebugString(), ")")); |
| } |
| if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) { |
| functor::NCHWToNHWC<GPUDevice, T, 4>()( |
| context->eigen_device<GPUDevice>(), |
| const_cast<const Tensor&>(x_backprop_transformed).tensor<T, 4>(), |
| x_backprop->tensor<T, 4>()); |
| } |
| } |
| }; |
| |
| // Forward declarations of the functor specializations for GPU. |
| #define DECLARE_GPU_SPEC(T, U) \ |
| template <> \ |
| void FusedBatchNormFreezeGrad<GPUDevice, T, U>::operator()( \ |
| OpKernelContext* context, const Tensor& y_backprop_input, \ |
| const Tensor& x_input, const Tensor& scale_input, \ |
| const Tensor& mean_input, const Tensor& variance_input, U epsilon, \ |
| Tensor* x_backprop_output, Tensor* scale_backprop_output, \ |
| Tensor* offset_backprop_output); \ |
| extern template struct FusedBatchNormFreezeGrad<GPUDevice, T, U>; \ |
| template <> \ |
| void FusedBatchNormInferenceFunctor<GPUDevice, T, U>::operator()( \ |
| OpKernelContext* context, TensorFormat tensor_format, \ |
| typename TTypes<T, 4>::ConstTensor in, \ |
| typename TTypes<U>::ConstVec scale, typename TTypes<U>::ConstVec offset, \ |
| typename TTypes<U>::ConstVec estimated_mean, \ |
| typename TTypes<U>::ConstVec estimated_variance, \ |
| typename TTypes<T, 4>::ConstTensor side_input, U epsilon, \ |
| FusedBatchNormActivationMode activation_mode, \ |
| typename TTypes<T, 4>::Tensor out); \ |
| extern template struct FusedBatchNormInferenceFunctor<GPUDevice, T, U>; |
| |
| DECLARE_GPU_SPEC(float, float); |
| DECLARE_GPU_SPEC(Eigen::half, float); |
| |
| #undef DECLARE_GPU_SPEC |
| |
| #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| } // namespace functor |
| |
| template <typename Device, typename T, typename U> |
| class FusedBatchNormOpBase : public OpKernel { |
| using FbnActivationMode = functor::FusedBatchNormActivationMode; |
| |
| protected: |
| explicit FusedBatchNormOpBase(OpKernelConstruction* context, |
| bool is_batch_norm_ex = false) |
| : OpKernel(context), empty_side_input_(DataTypeToEnum<T>::value, {0}) { |
| float epsilon; |
| OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon)); |
| epsilon_ = U(epsilon); |
| string tensor_format; |
| OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format)); |
| OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_), |
| errors::InvalidArgument("Invalid data format")); |
| OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_)); |
| |
| if (!is_batch_norm_ex) { |
| has_side_input_ = false; |
| activation_mode_ = FbnActivationMode::kIdentity; |
| } else { |
| OP_REQUIRES_OK(context, ParseActivationMode(context, &activation_mode_)); |
| |
| int num_side_inputs; |
| OP_REQUIRES_OK(context, |
| context->GetAttr("num_side_inputs", &num_side_inputs)); |
| OP_REQUIRES(context, num_side_inputs >= 0 && num_side_inputs <= 1, |
| errors::InvalidArgument( |
| "FusedBatchNorm accepts at most one side input.")); |
| has_side_input_ = (num_side_inputs == 1); |
| if (has_side_input_ && is_training_) { |
| OP_REQUIRES( |
| context, activation_mode_ != FbnActivationMode::kIdentity, |
| errors::InvalidArgument("Identity activation is not supported with " |
| "non-empty side input")); |
| } |
| } |
| |
| if (activation_mode_ != FbnActivationMode::kIdentity && is_training_) { |
| // NOTE(ezhulenev): Following requirements are coming from implementation |
| // details of cudnnBatchNormalizationForwardTrainingEx used in training |
| // mode. In inference mode we call custom CUDA kernel that supports all |
| // data formats and data types. |
| OP_REQUIRES(context, DataTypeToEnum<T>::value == DT_HALF, |
| errors::InvalidArgument("FusedBatchNorm with activation " |
| "supports only DT_HALF data type.")); |
| OP_REQUIRES(context, tensor_format_ == FORMAT_NHWC, |
| errors::InvalidArgument("FusedBatchNorm with activation " |
| "supports only NHWC tensor format.")); |
| OP_REQUIRES(context, functor::BatchnormSpatialPersistentEnabled(), |
| errors::InvalidArgument( |
| "FusedBatchNorm with activation must run with cuDNN " |
| "spatial persistence mode enabled.")); |
| } |
| } |
| |
| // If use_reserved_space is true, we need to handle the 5th output (a reserve |
| // space), and the new cuDNN batch norm API is called if the cuDNN version is |
| // at least 7.4.2. If use_reserved_space is false, there is no 5th output. |
| virtual void ComputeWithReservedSpace(OpKernelContext* context, |
| bool use_reserved_space) { |
| const Tensor& x = context->input(0); |
| const Tensor& scale = context->input(1); |
| const Tensor& offset = context->input(2); |
| const Tensor& estimated_mean = context->input(3); |
| const Tensor& estimated_variance = context->input(4); |
| const Tensor& side_input = |
| has_side_input_ ? context->input(5) : empty_side_input_; |
| |
| OP_REQUIRES(context, x.dims() == 4, |
| errors::InvalidArgument("input must be 4-dimensional", |
| x.shape().DebugString())); |
| OP_REQUIRES(context, scale.dims() == 1, |
| errors::InvalidArgument("scale must be 1-dimensional", |
| scale.shape().DebugString())); |
| OP_REQUIRES(context, offset.dims() == 1, |
| errors::InvalidArgument("offset must be 1-dimensional", |
| offset.shape().DebugString())); |
| OP_REQUIRES(context, estimated_mean.dims() == 1, |
| errors::InvalidArgument("estimated_mean must be 1-dimensional", |
| estimated_mean.shape().DebugString())); |
| OP_REQUIRES( |
| context, estimated_variance.dims() == 1, |
| errors::InvalidArgument("estimated_variance must be 1-dimensional", |
| estimated_variance.shape().DebugString())); |
| if (has_side_input_) { |
| OP_REQUIRES(context, side_input.shape() == x.shape(), |
| errors::InvalidArgument( |
| "side_input shape must be equal to input shape: ", |
| side_input.shape().DebugString(), |
| " != ", x.shape().DebugString())); |
| } |
| |
| if (activation_mode_ != FbnActivationMode::kIdentity) { |
| // NOTE(ezhulenev): This requirement is coming from implementation |
| // details of cudnnBatchNormalizationForwardTrainingEx. |
| OP_REQUIRES( |
| context, !is_training_ || x.dim_size(3) % 4 == 0, |
| errors::InvalidArgument("FusedBatchNorm with activation requires " |
| "channel dimension to be a multiple of 4.")); |
| } |
| |
| if (is_training_) { |
| OP_REQUIRES( |
| context, estimated_mean.dim_size(0) == 0, |
| errors::InvalidArgument("estimated_mean must be empty for training", |
| estimated_mean.shape().DebugString())); |
| OP_REQUIRES(context, estimated_variance.dim_size(0) == 0, |
| errors::InvalidArgument( |
| "estimated_variance must be empty for training", |
| estimated_variance.shape().DebugString())); |
| } |
| |
| Tensor* y = nullptr; |
| OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( |
| {0}, 0, x.shape(), &y)); |
| Tensor* batch_mean = nullptr; |
| OP_REQUIRES_OK(context, |
| context->allocate_output(1, scale.shape(), &batch_mean)); |
| Tensor* batch_var = nullptr; |
| OP_REQUIRES_OK(context, |
| context->allocate_output(2, scale.shape(), &batch_var)); |
| Tensor* saved_mean = nullptr; |
| OP_REQUIRES_OK(context, |
| context->allocate_output(3, scale.shape(), &saved_mean)); |
| Tensor* saved_maybe_inv_var = nullptr; |
| OP_REQUIRES_OK(context, context->allocate_output(4, scale.shape(), |
| &saved_maybe_inv_var)); |
| |
| if (!use_reserved_space) { |
| functor::FusedBatchNorm<Device, T, U>()( |
| context, x, scale, offset, estimated_mean, estimated_variance, |
| side_input, epsilon_, activation_mode_, y, batch_mean, batch_var, |
| saved_mean, saved_maybe_inv_var, tensor_format_, nullptr, nullptr, |
| is_training_); |
| } else { |
| functor::CudnnBatchNormAllocatorInOutput<U> reserve_space_allocator( |
| context, 5); |
| functor::CudnnBatchNormAllocatorInTemp<uint8> workspace_allocator( |
| context); |
| functor::FusedBatchNorm<Device, T, U>()( |
| context, x, scale, offset, estimated_mean, estimated_variance, |
| side_input, epsilon_, activation_mode_, y, batch_mean, batch_var, |
| saved_mean, saved_maybe_inv_var, tensor_format_, |
| &reserve_space_allocator, &workspace_allocator, is_training_); |
| } |
| } |
| |
| private: |
| U epsilon_; |
| TensorFormat tensor_format_; |
| bool is_training_; |
| bool has_side_input_; |
| FbnActivationMode activation_mode_; |
| Tensor empty_side_input_; |
| }; |
| |
| template <typename Device, typename T, typename U> |
| class FusedBatchNormOp : public FusedBatchNormOpBase<Device, T, U> { |
| public: |
| explicit FusedBatchNormOp(OpKernelConstruction* context) |
| : FusedBatchNormOpBase<Device, T, U>(context) {} |
| |
| void Compute(OpKernelContext* context) override { |
| FusedBatchNormOpBase<Device, T, U>::ComputeWithReservedSpace(context, |
| false); |
| } |
| }; |
| |
| template <typename Device, typename T, typename U> |
| class FusedBatchNormOpV3 : public FusedBatchNormOpBase<Device, T, U> { |
| public: |
| explicit FusedBatchNormOpV3(OpKernelConstruction* context) |
| : FusedBatchNormOpBase<Device, T, U>(context) {} |
| |
| void Compute(OpKernelContext* context) override { |
| FusedBatchNormOpBase<Device, T, U>::ComputeWithReservedSpace(context, true); |
| } |
| }; |
| |
| template <typename Device, typename T, typename U> |
| class FusedBatchNormOpEx : public FusedBatchNormOpBase<Device, T, U> { |
| static constexpr bool kWithSideInputAndActivation = true; |
| |
| public: |
| explicit FusedBatchNormOpEx(OpKernelConstruction* context) |
| : FusedBatchNormOpBase<Device, T, U>(context, |
| kWithSideInputAndActivation) {} |
| |
| void Compute(OpKernelContext* context) override { |
| FusedBatchNormOpBase<Device, T, U>::ComputeWithReservedSpace(context, true); |
| } |
| }; |
| |
| template <typename Device, typename T, typename U> |
| class FusedBatchNormGradOpBase : public OpKernel { |
| protected: |
| explicit FusedBatchNormGradOpBase(OpKernelConstruction* context) |
| : OpKernel(context) { |
| float epsilon; |
| OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon)); |
| epsilon_ = U(epsilon); |
| string tensor_format; |
| OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format)); |
| OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_), |
| errors::InvalidArgument("Invalid data format")); |
| OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_)); |
| } |
| |
| virtual void ComputeWithReservedSpace(OpKernelContext* context, |
| bool use_reserved_space) { |
| const Tensor& y_backprop = context->input(0); |
| const Tensor& x = context->input(1); |
| const Tensor& scale = context->input(2); |
| // When is_training=True, batch mean and variance/inverted variance are |
| // saved in the forward pass to be reused here. When is_training=False, |
| // population mean and variance need to be forwarded here to compute the |
| // gradients. |
| const Tensor& saved_mean_or_pop_mean = context->input(3); |
| // The Eigen implementation saves variance in the forward pass, while cuDNN |
| // saves inverted variance. |
| const Tensor& saved_maybe_inv_var_or_pop_var = context->input(4); |
| |
| OP_REQUIRES(context, y_backprop.dims() == 4, |
| errors::InvalidArgument("input must be 4-dimensional", |
| y_backprop.shape().DebugString())); |
| OP_REQUIRES(context, x.dims() == 4, |
| errors::InvalidArgument("input must be 4-dimensional", |
| x.shape().DebugString())); |
| OP_REQUIRES(context, scale.dims() == 1, |
| errors::InvalidArgument("scale must be 1-dimensional", |
| scale.shape().DebugString())); |
| OP_REQUIRES( |
| context, saved_mean_or_pop_mean.dims() == 1, |
| errors::InvalidArgument("saved mean must be 1-dimensional", |
| saved_mean_or_pop_mean.shape().DebugString())); |
| OP_REQUIRES(context, saved_maybe_inv_var_or_pop_var.dims() == 1, |
| errors::InvalidArgument( |
| "saved variance must be 1-dimensional", |
| saved_maybe_inv_var_or_pop_var.shape().DebugString())); |
| |
| Tensor* x_backprop = nullptr; |
| OP_REQUIRES_OK(context, |
| context->allocate_output(0, x.shape(), &x_backprop)); |
| |
| const TensorShape& scale_offset_shape = scale.shape(); |
| Tensor* scale_backprop = nullptr; |
| OP_REQUIRES_OK(context, context->allocate_output(1, scale_offset_shape, |
| &scale_backprop)); |
| Tensor* offset_backprop = nullptr; |
| OP_REQUIRES_OK(context, context->allocate_output(2, scale_offset_shape, |
| &offset_backprop)); |
| // Two placeholders for estimated_mean and estimated_variance, which are |
| // used for inference and thus not needed here for gradient computation. |
| // They are filled with zeros so as to avoid NaN outputs. |
| Tensor* placeholder_1 = nullptr; |
| OP_REQUIRES_OK( |
| context, context->allocate_output(3, TensorShape({}), &placeholder_1)); |
| functor::SetZeroFunctor<Device, U> f; |
| f(context->eigen_device<Device>(), placeholder_1->flat<U>()); |
| Tensor* placeholder_2 = nullptr; |
| OP_REQUIRES_OK( |
| context, context->allocate_output(4, TensorShape({}), &placeholder_2)); |
| f(context->eigen_device<Device>(), placeholder_2->flat<U>()); |
| |
| // If the input is empty, set the gradients w.r.t. scale/offset to zero. |
| if (x.shape().num_elements() == 0) { |
| functor::SetZeroFunctor<Device, U> f; |
| f(context->eigen_device<Device>(), scale_backprop->flat<U>()); |
| f(context->eigen_device<Device>(), offset_backprop->flat<U>()); |
| return; |
| } |
| |
| const Tensor* reserve_space_data = nullptr; |
| functor::CudnnBatchNormAllocatorInTemp<uint8>* workspace_allocator_ptr = |
| nullptr; |
| |
| #if CUDNN_VERSION >= 7402 |
| functor::CudnnBatchNormAllocatorInTemp<uint8> workspace_allocator(context); |
| if (use_reserved_space) { |
| const Tensor& reserve_space = context->input(5); |
| reserve_space_data = &reserve_space; |
| workspace_allocator_ptr = &workspace_allocator; |
| } |
| #endif // CUDNN_VERSION >= 7402 |
| |
| if (is_training_) { |
| functor::FusedBatchNormGrad<Device, T, U>()( |
| context, y_backprop, x, scale, saved_mean_or_pop_mean, |
| saved_maybe_inv_var_or_pop_var, epsilon_, x_backprop, scale_backprop, |
| offset_backprop, reserve_space_data, workspace_allocator_ptr, |
| tensor_format_); |
| } else { |
| // The necessary layout conversion is currently done in Python. |
| CHECK(tensor_format_ == FORMAT_NHWC) |
| << "The implementation of FusedBatchNormGrad with is_training=False " |
| "only supports the NHWC tensor format for now."; |
| functor::FusedBatchNormFreezeGrad<Device, T, U>()( |
| context, y_backprop, x, scale, saved_mean_or_pop_mean, |
| saved_maybe_inv_var_or_pop_var, epsilon_, x_backprop, scale_backprop, |
| offset_backprop); |
| } |
| } |
| |
| private: |
| U epsilon_; |
| TensorFormat tensor_format_; |
| bool is_training_; |
| }; |
| |
| template <typename Device, typename T, typename U> |
| class FusedBatchNormGradOp : public FusedBatchNormGradOpBase<Device, T, U> { |
| public: |
| explicit FusedBatchNormGradOp(OpKernelConstruction* context) |
| : FusedBatchNormGradOpBase<Device, T, U>(context) {} |
| |
| void Compute(OpKernelContext* context) override { |
| FusedBatchNormGradOpBase<Device, T, U>::ComputeWithReservedSpace(context, |
| false); |
| } |
| }; |
| |
| template <typename Device, typename T, typename U> |
| class FusedBatchNormGradOpV3 : public FusedBatchNormGradOpBase<Device, T, U> { |
| public: |
| explicit FusedBatchNormGradOpV3(OpKernelConstruction* context) |
| : FusedBatchNormGradOpBase<Device, T, U>(context) {} |
| |
| void Compute(OpKernelContext* context) override { |
| FusedBatchNormGradOpBase<Device, T, U>::ComputeWithReservedSpace(context, |
| true); |
| } |
| }; |
| |
| REGISTER_KERNEL_BUILDER( |
| Name("FusedBatchNorm").Device(DEVICE_CPU).TypeConstraint<float>("T"), |
| FusedBatchNormOp<CPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER( |
| Name("FusedBatchNormGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"), |
| FusedBatchNormGradOp<CPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV2") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOp<CPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOp<CPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV2") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOp<CPUDevice, Eigen::half, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOp<CPUDevice, Eigen::half, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV3") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOpV3<CPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV3") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOpV3<CPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV3") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOpV3<CPUDevice, Eigen::half, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV3") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOpV3<CPUDevice, Eigen::half, float>); |
| |
| #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| |
| REGISTER_KERNEL_BUILDER( |
| Name("FusedBatchNorm").Device(DEVICE_GPU).TypeConstraint<float>("T"), |
| FusedBatchNormOp<GPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER( |
| Name("FusedBatchNormGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"), |
| FusedBatchNormGradOp<GPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV2") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOp<GPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOp<GPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV2") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOp<GPUDevice, Eigen::half, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOp<GPUDevice, Eigen::half, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV3") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOpV3<GPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("_FusedBatchNormEx") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOpEx<GPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV3") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<float>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOpV3<GPUDevice, float, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV3") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOpV3<GPUDevice, Eigen::half, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("_FusedBatchNormEx") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormOpEx<GPUDevice, Eigen::half, float>); |
| |
| REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV3") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<Eigen::half>("T") |
| .TypeConstraint<float>("U"), |
| FusedBatchNormGradOpV3<GPUDevice, Eigen::half, float>); |
| |
| #endif |
| |
| } // namespace tensorflow |