|  | #include "caffe2/core/context_gpu.h" | 
|  |  | 
|  | #include "caffe2/core/common_gpu.h" | 
|  | #include "caffe2/core/cudnn_wrappers.h" | 
|  | #include "caffe2/operators/conv_op.h" | 
|  | #include "caffe2/operators/conv_op_cache_cudnn.h" | 
|  | #include "caffe2/operators/conv_pool_op_base.h" | 
|  | #include "caffe2/operators/op_utils_cudnn.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  |  | 
|  | class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> { | 
|  | public: | 
|  | CudnnConvOpBase(const OperatorDef& operator_def, Workspace* ws) | 
|  | : ConvPoolOpBase<CUDAContext>(operator_def, ws), | 
|  | cudnn_wrapper_(&context_), | 
|  | cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>( | 
|  | "ws_nbytes_limit", | 
|  | kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)), | 
|  | exhaustive_search_( | 
|  | OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)), | 
|  | deterministic_( | 
|  | OperatorBase::GetSingleArgument<int>("deterministic", 0)), | 
|  | cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)), | 
|  | force_algo_(OperatorBase::GetRepeatedArgument<int>("force_algo", vector<int>{-1,-1,-1})), | 
|  | enable_tensor_core_(OperatorBase::GetSingleArgument<bool>("enable_tensor_core", 1)) { | 
|  | CHECK(!deterministic_ || !exhaustive_search_); | 
|  | CAFFE_ENFORCE(group_ > 0); | 
|  | CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_); | 
|  | for (int i = 0; i < kernel_.size(); ++i) { | 
|  | OPERATOR_NEEDS_FEATURE( | 
|  | pads_[i] == pads_[kernel_.size() + i], | 
|  | "The current padding scheme leads to unequal padding on the left " | 
|  | "and right, which is not supported by cudnn."); | 
|  | } | 
|  | // dilated convolution supported by some algorithms in cuDNN v6 | 
|  | #if !(CUDNN_VERSION_MIN(6,0,0)) | 
|  | OPERATOR_NEEDS_FEATURE( | 
|  | dilation_h() == 1 && dilation_w() == 1, | 
|  | "The cudnn convolution does not support dilation yet."); | 
|  | #endif | 
|  | // dilated grouped convolution supported in cuDNN v7.1 | 
|  | #if !(CUDNN_VERSION_MIN(7,1,0)) | 
|  | if (group_ != 1) { | 
|  | for (int dim = 0; dim < kernel_.size(); ++dim) { | 
|  | OPERATOR_NEEDS_FEATURE(dilation_[dim] == 1, | 
|  | "When group is used, dilation should not be set at the same time."); | 
|  | } | 
|  | } | 
|  | #endif | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(7, 0, 0) | 
|  | // verify TensorCore math is supported | 
|  | enable_tensor_core_ &= TensorCoreAvailable(); | 
|  | #else | 
|  | enable_tensor_core_ = false; | 
|  | #endif | 
|  |  | 
|  | bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") || | 
|  | OperatorBase::HasArgument("force_algo_dgrad") || | 
|  | OperatorBase::HasArgument("force_algo_wgrad"); | 
|  | if (OperatorBase::HasArgument("force_algo")) { | 
|  | CAFFE_ENFORCE(!individual_force_algo, | 
|  | "Cannot specify both force_algo and any of", | 
|  | "force_algo_fwd, force_algo_dgrad, force_algo_wgrad"); | 
|  | } else { | 
|  | force_algo_ = std::vector<int>{-1,-1,-1}; | 
|  | force_algo_[ALGO_FWD] = | 
|  | OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1); | 
|  | force_algo_[ALGO_DGRAD] = | 
|  | OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1); | 
|  | force_algo_[ALGO_WGRAD] = | 
|  | OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1); | 
|  | } | 
|  |  | 
|  | CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_)); | 
|  | CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_)); | 
|  | CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_)); | 
|  | CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_)); | 
|  | CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_)); | 
|  | CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_)); | 
|  | } | 
|  |  | 
|  | ~CudnnConvOpBase() { | 
|  | CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_)); | 
|  | CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_)); | 
|  | CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_)); | 
|  | CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_)); | 
|  | CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_)); | 
|  | CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_)); | 
|  | } | 
|  |  | 
|  | protected: | 
|  | // A helper function to set up the tensor Nd desriptor, depending on the order | 
|  | // the group and the type given. | 
|  | template <typename T> | 
|  | void SetTensorNdDescriptorWithGroup( | 
|  | int size, | 
|  | cudnnTensorDescriptor_t tensorDesc, | 
|  | int N, | 
|  | int C, | 
|  | int H, | 
|  | int W, | 
|  | int D) { | 
|  | #if CUDNN_VERSION_MIN(7, 0, 0) | 
|  | const int CC = C; | 
|  | #else | 
|  | const int CC = C / group_; | 
|  | #endif | 
|  | switch (order_) { | 
|  | case StorageOrder::NHWC: | 
|  | if (size == 4) { | 
|  | CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx( | 
|  | tensorDesc, | 
|  | cudnnTypeWrapper<T>::type, | 
|  | N, | 
|  | CC, | 
|  | H, | 
|  | W, | 
|  | H * W * C, | 
|  | 1, | 
|  | W * C, | 
|  | C)); | 
|  | } else { | 
|  | vector<int> dims = {N, H, W, D, CC}; | 
|  | vector<int> strides = {H * W * D * CC, W * D * CC, D * CC, CC, 1}; | 
|  | CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( | 
|  | tensorDesc, | 
|  | cudnnTypeWrapper<T>::type, | 
|  | size > 3 ? size : 4, | 
|  | dims.data(), | 
|  | strides.data())); | 
|  | } | 
|  | break; | 
|  | case StorageOrder::NCHW: | 
|  | if (size == 4) { | 
|  | CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx( | 
|  | tensorDesc, | 
|  | cudnnTypeWrapper<T>::type, | 
|  | N, | 
|  | CC, | 
|  | H, | 
|  | W, | 
|  | C * H * W, | 
|  | H * W, | 
|  | W, | 
|  | 1)); | 
|  | } else { | 
|  | vector<int> dims = {N, CC, H, W, D}; | 
|  | vector<int> strides = {CC * H * W * D, H * W * D, W * D, D, 1}; | 
|  | CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( | 
|  | tensorDesc, | 
|  | cudnnTypeWrapper<T>::type, | 
|  | size > 3 ? size : 4, | 
|  | dims.data(), | 
|  | strides.data())); | 
|  | } | 
|  | break; | 
|  | default: | 
|  | LOG(FATAL) << "Unknown storage order: " << order_; | 
|  | } | 
|  | } | 
|  |  | 
|  | void DuplicateConvDesc( | 
|  | cudnnConvolutionDescriptor_t input, | 
|  | size_t kernelDims, | 
|  | size_t dilationDims, | 
|  | cudnnConvolutionDescriptor_t copy) { | 
|  | if (kernelDims == 2) { | 
|  | cudnnConvolutionMode_t mode; | 
|  | cudnnDataType_t dataType; | 
|  | int pad_height = 0; | 
|  | int pad_width = 0; | 
|  | int stride_height = 0; | 
|  | int stride_width = 0; | 
|  | int dilation_height = 0; | 
|  | int dilation_width = 0; | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(6, 0, 0) | 
|  | CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor( | 
|  | input, | 
|  | &pad_height, | 
|  | &pad_width, | 
|  | &stride_height, | 
|  | &stride_width, | 
|  | &dilation_height, | 
|  | &dilation_width, | 
|  | &mode, | 
|  | &dataType | 
|  | )); | 
|  | #else | 
|  | CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor( | 
|  | input, | 
|  | &pad_height, | 
|  | &pad_width, | 
|  | &stride_height, | 
|  | &stride_width, | 
|  | &dilation_height, | 
|  | &dilation_width, | 
|  | &mode | 
|  | )); | 
|  | #endif | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(6, 0, 0) | 
|  | CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( | 
|  | copy, | 
|  | pad_height, | 
|  | pad_width, | 
|  | stride_height, | 
|  | stride_width, | 
|  | dilation_height, | 
|  | dilation_width, | 
|  | mode, | 
|  | dataType | 
|  | )); | 
|  | #else | 
|  | CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( | 
|  | copy, | 
|  | pad_height, | 
|  | pad_width, | 
|  | stride_height, | 
|  | stride_width, | 
|  | dilation_height, | 
|  | dilation_width, | 
|  | mode | 
|  | )); | 
|  | #endif | 
|  | } else { | 
|  | cudnnConvolutionMode_t mode; | 
|  | cudnnDataType_t dataType; | 
|  | int arrayLength = 0; | 
|  | vector<int> ones(dilationDims, 1); | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor( | 
|  | input, | 
|  | kernel_.size(), | 
|  | &arrayLength, | 
|  | pads_.data(), | 
|  | stride_.data(), | 
|  | ones.data(), | 
|  | &mode, | 
|  | &dataType)); | 
|  |  | 
|  | CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor( | 
|  | copy, | 
|  | kernel_.size(), | 
|  | pads_.data(), | 
|  | stride_.data(), | 
|  | ones.data(), | 
|  | mode, | 
|  | dataType)); | 
|  | } | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | cudnnDataType_t DetermineComputeTypeFromInput(const T& X) { | 
|  | const cudaDeviceProp& prop = GetDeviceProperty(0); | 
|  | cudnnDataType_t computeType = CUDNN_DATA_FLOAT; | 
|  | if (X.template IsType<at::Half>()) { | 
|  | if (float16_compute_ && prop.major >= 6) { | 
|  | VLOG(1) << "CUDNN Convolution: float16_compute specified and " | 
|  | << "supported, input data is Half - using Half " | 
|  | << "compute."; | 
|  | computeType = CUDNN_DATA_HALF; | 
|  | } else if (float16_compute_) { | 
|  | VLOG(1) << "CUDNN Convolution: float16_compute specified but" | 
|  | << "not supported, input data is Half - using float32 " | 
|  | << "compute."; | 
|  | } else { | 
|  | VLOG(1) << "CUDNN Convolution: float16_compute not specified but " | 
|  | << "input data is Half - using float32 compute."; | 
|  | } | 
|  | } else { | 
|  | VLOG(1) << "CUDNN Convolution: using float32 compute."; | 
|  | } | 
|  | return computeType; | 
|  | } | 
|  |  | 
|  | void SetConvDescFromArguments() { | 
|  | #if CUDNN_VERSION_MIN(6, 0, 0) | 
|  | if (kernel_.size() == 2) { | 
|  | CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( | 
|  | conv_desc_, | 
|  | pad_t(), | 
|  | pad_l(), | 
|  | stride_h(), | 
|  | stride_w(), | 
|  | dilation_h(), | 
|  | dilation_w(), | 
|  | CUDNN_CROSS_CORRELATION, | 
|  | compute_type_)); | 
|  | } else { | 
|  | CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor( | 
|  | conv_desc_, | 
|  | kernel_.size(), | 
|  | pads_.data(), | 
|  | stride_.data(), | 
|  | dilation_.data(), | 
|  | CUDNN_CROSS_CORRELATION, | 
|  | compute_type_)); | 
|  | } | 
|  | #else | 
|  | if (kernel_.size() == 2) { | 
|  | CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( | 
|  | conv_desc_, | 
|  | pad_t(), | 
|  | pad_l(), | 
|  | stride_h(), | 
|  | stride_w(), | 
|  | 1, | 
|  | 1, | 
|  | CUDNN_CROSS_CORRELATION)); | 
|  | } else { | 
|  | vector<int> ones(dilation_.size(), 1); | 
|  | CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor( | 
|  | conv_desc_, | 
|  | kernel_.size(), | 
|  | pads_.data(), | 
|  | stride_.data(), | 
|  | ones.data(), | 
|  | CUDNN_CROSS_CORRELATION, | 
|  | compute_type_)); | 
|  | } | 
|  | #endif | 
|  | } | 
|  |  | 
|  | void SetConvDescComputeType( | 
|  | cudnnConvolutionDescriptor_t conv_desc, | 
|  | cudnnDataType_t math) { | 
|  | if (kernel_.size() == 2) { | 
|  | cudnnConvolutionMode_t mode; | 
|  | cudnnDataType_t dataType; | 
|  | int pad_height = 0; | 
|  | int pad_width = 0; | 
|  | int stride_height = 0; | 
|  | int stride_width = 0; | 
|  | int dilation_height = 0; | 
|  | int dilation_width = 0; | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(6, 0, 0) | 
|  | CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor( | 
|  | conv_desc, | 
|  | &pad_height, | 
|  | &pad_width, | 
|  | &stride_height, | 
|  | &stride_width, | 
|  | &dilation_height, | 
|  | &dilation_width, | 
|  | &mode, | 
|  | &dataType | 
|  | )); | 
|  | #else | 
|  | CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor( | 
|  | conv_desc, | 
|  | &pad_height, | 
|  | &pad_width, | 
|  | &stride_height, | 
|  | &stride_width, | 
|  | &dilation_height, | 
|  | &dilation_width, | 
|  | &mode | 
|  | )); | 
|  | #endif | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(6, 0, 0) | 
|  | CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( | 
|  | conv_desc, | 
|  | pad_height, | 
|  | pad_width, | 
|  | stride_height, | 
|  | stride_width, | 
|  | dilation_height, | 
|  | dilation_width, | 
|  | mode, | 
|  | math | 
|  | )); | 
|  | #else | 
|  | CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( | 
|  | conv_desc, | 
|  | pad_height, | 
|  | pad_width, | 
|  | stride_height, | 
|  | stride_width, | 
|  | dilation_height, | 
|  | dilation_width, | 
|  | mode | 
|  | )); | 
|  | #endif | 
|  | } else { | 
|  | cudnnConvolutionMode_t mode; | 
|  | cudnnDataType_t dataType; | 
|  | int arrayLength = 0; | 
|  | vector<int> ones(dilation_.size(), 1); | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor( | 
|  | conv_desc, | 
|  | kernel_.size(), | 
|  | &arrayLength, | 
|  | pads_.data(), | 
|  | stride_.data(), | 
|  | ones.data(), | 
|  | &mode, | 
|  | &dataType)); | 
|  |  | 
|  | CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor( | 
|  | conv_desc, | 
|  | kernel_.size(), | 
|  | pads_.data(), | 
|  | stride_.data(), | 
|  | ones.data(), | 
|  | mode, | 
|  | math)); | 
|  | } | 
|  | } | 
|  |  | 
|  | vector<int64_t> cudnn_input_dims_; | 
|  | vector<int64_t> cudnn_filter_dims_; | 
|  |  | 
|  | CuDNNWrapper cudnn_wrapper_; | 
|  | cudnnTensorDescriptor_t bottom_desc_; | 
|  | cudnnFilterDescriptor_t filter_desc_; | 
|  | cudnnTensorDescriptor_t bias_desc_; | 
|  | cudnnTensorDescriptor_t top_desc_; | 
|  | // top desc for bias add in case we do group convolution | 
|  | cudnnTensorDescriptor_t top_desc_for_bias_; | 
|  | cudnnConvolutionDescriptor_t conv_desc_; | 
|  | const size_t cudnn_ws_nbytes_limit_; | 
|  | size_t cudnn_ws_nbytes_; | 
|  | bool exhaustive_search_; | 
|  | bool deterministic_; | 
|  | size_t cudnn_state_; | 
|  | vector<int> force_algo_; // stored as FWD, dFILTER, dDATA | 
|  | bool enable_tensor_core_; | 
|  | cudnnDataType_t compute_type_; | 
|  | }; | 
|  |  | 
|  | class CudnnConvOp final : public CudnnConvOpBase { | 
|  | public: | 
|  | CudnnConvOp(const OperatorDef& operator_def, Workspace* ws) | 
|  | : CudnnConvOpBase(operator_def, ws) {} | 
|  |  | 
|  | ~CudnnConvOp() {} | 
|  |  | 
|  | template <typename T_X, typename T_W, typename T_B, typename T_Y> | 
|  | bool DoRunWithType(); | 
|  |  | 
|  | bool RunOnDevice() override; | 
|  |  | 
|  | private: | 
|  | cudnnConvolutionFwdAlgo_t algo_; | 
|  | using ConvFwdAlgorithmWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>; | 
|  | AlgorithmsCache<ConvFwdAlgorithmWithCost> algo_cache_; | 
|  | // Input: X, W, b | 
|  | // Output: Y | 
|  | INPUT_TAGS(INPUT, FILTER, BIAS); | 
|  | }; | 
|  |  | 
|  | class CudnnConvGradientOp final : public CudnnConvOpBase { | 
|  | public: | 
|  | CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws) | 
|  | : CudnnConvOpBase(operator_def, ws), | 
|  | no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) { | 
|  | CAFFE_ENFORCE( | 
|  | !(no_bias_ && OutputSize() == 3), | 
|  | "If bias is not present, you should not have 3 grad output."); | 
|  |  | 
|  | CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_data_conv_desc_)); | 
|  | CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_filter_conv_desc_)); | 
|  | } | 
|  |  | 
|  | ~CudnnConvGradientOp() { | 
|  | CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_data_conv_desc_)); | 
|  | CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_filter_conv_desc_)); | 
|  | } | 
|  |  | 
|  | template < | 
|  | typename T_X, | 
|  | typename T_DY, | 
|  | typename T_W, | 
|  | typename T_B, | 
|  | typename T_DX, | 
|  | typename T_DW, | 
|  | typename T_DB> | 
|  | bool DoRunWithType(); | 
|  |  | 
|  | bool RunOnDevice() override; | 
|  |  | 
|  | private: | 
|  | cudnnConvolutionDescriptor_t bwd_filter_conv_desc_; | 
|  | cudnnConvolutionDescriptor_t bwd_data_conv_desc_; | 
|  | cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_; | 
|  | cudnnConvolutionBwdDataAlgo_t bwd_data_algo_; | 
|  | using ConvBwdFilterAlgorithmWithCost = | 
|  | std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>; | 
|  | using ConvBwdDataAlgorithmWithCost = | 
|  | std::tuple<cudnnConvolutionBwdDataAlgo_t, float>; | 
|  | AlgorithmsCache<ConvBwdFilterAlgorithmWithCost> filter_algo_cache_; | 
|  | AlgorithmsCache<ConvBwdDataAlgorithmWithCost> data_algo_cache_; | 
|  | bool no_bias_; | 
|  | // input: X, W, dY | 
|  | // output: dW, db, and optionally dX | 
|  | INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD); | 
|  | OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD); | 
|  | }; | 
|  |  | 
|  | //////////////////////////////////////////////////////////////////////////////// | 
|  | // Implementations | 
|  | //////////////////////////////////////////////////////////////////////////////// | 
|  |  | 
|  | static constexpr std::array<cudnnDataType_t, 2> kComputeTypesToTry = { | 
|  | CUDNN_DATA_FLOAT, | 
|  | CUDNN_DATA_HALF}; | 
|  | static constexpr std::array<const char*, 2> kComputePassNames = { | 
|  | "fp32 compute", | 
|  | "fp16 compute"}; | 
|  |  | 
|  | template <typename T_X, typename T_W, typename T_B, typename T_Y> | 
|  | bool CudnnConvOp::DoRunWithType() { | 
|  | auto& X = Input(INPUT); | 
|  | auto& filter = Input(FILTER); | 
|  | auto* Y = Output(0); | 
|  |  | 
|  | // Figure out the output shape | 
|  | CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5); | 
|  | CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5); | 
|  | const int M = filter.dim32(0); | 
|  | ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, M); | 
|  | int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0; | 
|  | int group_offset_X = 0, group_offset_Y = 0; | 
|  |  | 
|  | switch (order_) { | 
|  | case StorageOrder::NHWC: | 
|  | N = X.dim32(0); | 
|  | H = X.dim32(1); | 
|  | W = X.ndim() > 3 ? X.dim32(2) : 1; | 
|  | D = X.ndim() > 4 ? X.dim32(3) : 1; | 
|  | C = X.dim32(X.ndim() - 1); | 
|  | H_out = Y->dim32(1); | 
|  | W_out = Y->ndim() > 3 ? Y->dim32(2) : 1; | 
|  | D_out = Y->ndim() > 4 ? Y->dim32(3) : 1; | 
|  | for (int i = 0; i < kernel_.size(); ++i) { | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); | 
|  | } | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_); | 
|  | group_offset_X = C / group_; | 
|  | group_offset_Y = M / group_; | 
|  | break; | 
|  | case StorageOrder::NCHW: | 
|  | N = X.dim32(0); | 
|  | C = X.dim32(1); | 
|  | H = X.dim32(2); | 
|  | W = X.ndim() > 3 ? X.dim32(3) : 1; | 
|  | D = X.ndim() > 4 ? X.dim32(4) : 1; | 
|  | H_out = Y->dim32(2); | 
|  | W_out = Y->ndim() > 3 ? Y->dim32(3) : 1; | 
|  | D_out = Y->ndim() > 4 ? Y->dim32(4) : 1; | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_); | 
|  | for (int i = 0; i < kernel_.size(); ++i) { | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); | 
|  | } | 
|  | group_offset_X = C / group_ * H * W * D; | 
|  | group_offset_Y = M / group_ * H_out * W_out * D_out; | 
|  | break; | 
|  | default: | 
|  | LOG(FATAL) << "Unknown storage order: " << order_; | 
|  | } | 
|  |  | 
|  | CAFFE_ENFORCE( | 
|  | C % group_ == 0, | 
|  | "If you set group, the number of input channels should be divisible " | 
|  | "by group."); | 
|  | CAFFE_ENFORCE( | 
|  | M % group_ == 0, | 
|  | "If you set group, the number of output channels should be divisible " | 
|  | "by group."); | 
|  |  | 
|  | int group_offset_filter = filter.size() / group_; | 
|  |  | 
|  | // Set up the cudnn algorithms & workspace if necessary | 
|  | bool input_changed = (X.sizes() != cudnn_input_dims_); | 
|  | bool filter_changed = (filter.sizes() != cudnn_filter_dims_); | 
|  | if (input_changed || filter_changed) { | 
|  | VLOG(1) << "Changing the cudnn descriptor configurations."; | 
|  | if (input_changed) { | 
|  | cudnn_input_dims_ = X.sizes().vec(); | 
|  | SetTensorNdDescriptorWithGroup<T_X>( | 
|  | X.ndim(), bottom_desc_, N, C, H, W, D); | 
|  | } | 
|  | if (filter_changed) { | 
|  | cudnn_filter_dims_ = filter.sizes().vec(); | 
|  | if (kernel_.size() == 2) { | 
|  | #if CUDNN_VERSION_MIN(7, 0, 0) | 
|  | const int MM = M; | 
|  | #else | 
|  | const int MM = M / group_; | 
|  | #endif | 
|  | CUDNN_ENFORCE(cudnnSetFilter4dDescriptor( | 
|  | filter_desc_, | 
|  | cudnnTypeWrapper<T_W>::type, | 
|  | GetCudnnTensorFormat(order_), | 
|  | MM, | 
|  | C / group_, | 
|  | kernel_h(), | 
|  | kernel_w())); | 
|  | } else { | 
|  | vector<int> dims(filter.sizes().begin(), filter.sizes().end()); | 
|  | #if !CUDNN_VERSION_MIN(7, 0, 0) | 
|  | // We only need to divide dims by group_ when CUDNN version < 7.0 | 
|  | // see CUDA group convolution doc: https://fburl.com/dgj6dvpd | 
|  | order_ == StorageOrder::NCHW ? dims[1] /= group_ | 
|  | : dims[filter.ndim() - 1] /= group_; | 
|  | #endif | 
|  | CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( | 
|  | filter_desc_, | 
|  | cudnnTypeWrapper<T_W>::type, | 
|  | GetCudnnTensorFormat(order_), | 
|  | dims.size(), | 
|  | dims.data())); | 
|  | } | 
|  | if (InputSize() == 3) { | 
|  | if (kernel_.size() == 2) { | 
|  | CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( | 
|  | bias_desc_, | 
|  | GetCudnnTensorFormat(order_), | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | 1, | 
|  | M, | 
|  | 1, | 
|  | 1)); | 
|  | } else { | 
|  | std::vector<int> bias_dims(X.ndim(), 1); | 
|  | bias_dims[1] = M; | 
|  | std::vector<int> strides = {M, 1, 1, 1, 1, 1}; | 
|  | CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( | 
|  | bias_desc_, | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | X.ndim() > 3 ? X.ndim() : 4, | 
|  | bias_dims.data(), | 
|  | strides.data())); | 
|  | } | 
|  | } | 
|  | } | 
|  | // Set the output | 
|  | SetTensorNdDescriptorWithGroup<T_Y>( | 
|  | X.ndim(), top_desc_, N, M, H_out, W_out, D_out); | 
|  | // Set the output with descriptor useful for bias addition in one run. | 
|  | if (kernel_.size() == 2) { | 
|  | CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( | 
|  | top_desc_for_bias_, | 
|  | GetCudnnTensorFormat(order_), | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | N, | 
|  | M, | 
|  | H_out, | 
|  | W_out)); | 
|  | } else { | 
|  | vector<int> dims = {N, M, H_out, W_out, D_out}; | 
|  | vector<int> strides = {M * H_out * W_out * D_out, | 
|  | H_out * W_out * D_out, | 
|  | W_out * D_out, | 
|  | D_out, | 
|  | 1}; | 
|  | CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( | 
|  | top_desc_for_bias_, | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | X.ndim() > 3 ? X.ndim() : 4, | 
|  | dims.data(), | 
|  | strides.data())); | 
|  | } | 
|  |  | 
|  | compute_type_ = DetermineComputeTypeFromInput(X); | 
|  | SetConvDescFromArguments(); | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(7, 0, 0) | 
|  | if (enable_tensor_core_) { | 
|  | CUDNN_ENFORCE( | 
|  | cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH)); | 
|  | } | 
|  |  | 
|  | // enable cuDNN conv groups | 
|  | CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_)); | 
|  | #endif | 
|  |  | 
|  | if (force_algo_[ALGO_FWD] >= 0) { | 
|  | algo_ = (cudnnConvolutionFwdAlgo_t)force_algo_[ALGO_FWD]; | 
|  | } else if (deterministic_) { | 
|  | algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; | 
|  | } else if (exhaustive_search_) { | 
|  | // Even when FP16 compute is supported and requested, try FP32 | 
|  | // because it may be faster. However, if FP32 compute is specified, | 
|  | // FP16 is not a suitable alternative - early out from the loop. | 
|  | std::array<ConvFwdAlgorithmWithCost, 2> algosToCompare; | 
|  | for (int i = 0; i < 2; i++) { | 
|  | SetConvDescComputeType(conv_desc_, kComputeTypesToTry[i]); | 
|  |  | 
|  | algosToCompare[i] = algo_cache_.getAlgorithm( | 
|  | X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() { | 
|  | VLOG(1) << "CUDNN Convolution fwd: doing exhaustive " | 
|  | << "search for " << kComputePassNames[i]; | 
|  | // When we do an exhaustive search, we will ignore the workspace | 
|  | // size limit and simply go for the fastest algorithm. If you | 
|  | // happen to run out of memory later, you will be on your own... | 
|  | int returned_algo_count; | 
|  | std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> | 
|  | fwd_perf_stat; | 
|  |  | 
|  | // no need to clean up workspace, | 
|  | cudnn_wrapper_.with_cudnn_state( | 
|  | cudnn_state_, [&](CuDNNState* state) { | 
|  | // Actually run the search. | 
|  | CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx( | 
|  | state->cudnn_handle(), | 
|  | bottom_desc_, | 
|  | X.template data<T_X>(), | 
|  | filter_desc_, | 
|  | filter.template data<T_W>(), | 
|  | conv_desc_, | 
|  | top_desc_, | 
|  | Y->template mutable_data<T_Y>(), | 
|  | kNUM_CUDNN_FWD_ALGS, | 
|  | &returned_algo_count, | 
|  | fwd_perf_stat.data(), | 
|  | state->workspace().get(cudnn_ws_nbytes_limit_), | 
|  | cudnn_ws_nbytes_limit_)); | 
|  | }); | 
|  | LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count); | 
|  | float algo_time = fwd_perf_stat[0].status == CUDNN_STATUS_SUCCESS | 
|  | ? fwd_perf_stat[0].time | 
|  | : 1e10; | 
|  | return ConvFwdAlgorithmWithCost(fwd_perf_stat[0].algo, algo_time); | 
|  | }); | 
|  |  | 
|  | // When set to fp32 compute, don't try fp16 | 
|  | if (compute_type_ == CUDNN_DATA_FLOAT) { | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (compute_type_ == CUDNN_DATA_FLOAT) { | 
|  | // For FP32 compute, just use the best FP32 algorithm | 
|  | algo_ = std::get<0>(algosToCompare[0]); | 
|  | } else { | 
|  | // For FP16 compute, choose algo with fastest execution | 
|  | int bestAlgoIndex = | 
|  | (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1])) | 
|  | ? 0 | 
|  | : 1; | 
|  | algo_ = std::get<0>(algosToCompare[bestAlgoIndex]); | 
|  | SetConvDescComputeType(conv_desc_, kComputeTypesToTry[bestAlgoIndex]); | 
|  | } | 
|  | } else { | 
|  | // Get the convolution algorithm based on the workspace limit. | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | bottom_desc_, | 
|  | filter_desc_, | 
|  | conv_desc_, | 
|  | top_desc_, | 
|  | CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, | 
|  | cudnn_ws_nbytes_limit_, | 
|  | &algo_)); | 
|  | } | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | bottom_desc_, | 
|  | filter_desc_, | 
|  | conv_desc_, | 
|  | top_desc_, | 
|  | algo_, | 
|  | &cudnn_ws_nbytes_)); | 
|  | VLOG(1) << "CuDNN algorithm: " << algo_; | 
|  | VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_; | 
|  | } | 
|  |  | 
|  | // Now, actually run the computation. | 
|  | // Run directly through cuDNN if possible | 
|  | #if CUDNN_VERSION_MIN(7,0,0) | 
|  | cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { | 
|  | CUDNN_ENFORCE(cudnnConvolutionForward( | 
|  | state->cudnn_handle(), | 
|  | cudnnTypeWrapper<T_X>::kOne(), | 
|  | bottom_desc_, | 
|  | X.template data<T_X>(), | 
|  | filter_desc_, | 
|  | filter.template data<T_W>(), | 
|  | conv_desc_, | 
|  | algo_, | 
|  | state->workspace().get(cudnn_ws_nbytes_), | 
|  | cudnn_ws_nbytes_, | 
|  | cudnnTypeWrapper<T_Y>::kZero(), | 
|  | top_desc_, | 
|  | Y->template mutable_data<T_Y>())); | 
|  | }); | 
|  | #else | 
|  | // otherwise manually run through groups | 
|  | for (int i = 0; i < group_; ++i) { | 
|  | cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { | 
|  | CUDNN_ENFORCE(cudnnConvolutionForward( | 
|  | state->cudnn_handle(), | 
|  | cudnnTypeWrapper<T_X>::kOne(), | 
|  | bottom_desc_, | 
|  | X.template data<T_X>() + i * group_offset_X, | 
|  | filter_desc_, | 
|  | filter.template data<T_W>() + i * group_offset_filter, | 
|  | conv_desc_, | 
|  | algo_, | 
|  | state->workspace().get(cudnn_ws_nbytes_), | 
|  | cudnn_ws_nbytes_, | 
|  | cudnnTypeWrapper<T_Y>::kZero(), | 
|  | top_desc_, | 
|  | Y->template mutable_data<T_Y>() + i * group_offset_Y)); | 
|  | }); | 
|  | } | 
|  | #endif | 
|  | // Bias | 
|  | if (InputSize() == 3) { | 
|  | auto& bias = Input(BIAS); | 
|  |  | 
|  | CAFFE_ENFORCE_EQ(bias.ndim(), 1); | 
|  | CAFFE_ENFORCE_EQ(bias.dim32(0), M); | 
|  |  | 
|  | CUDNN_ENFORCE(cudnnAddTensor( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | cudnnTypeWrapper<T_B>::kOne(), | 
|  | bias_desc_, | 
|  | bias.template data<T_B>(), | 
|  | cudnnTypeWrapper<T_Y>::kOne(), | 
|  | top_desc_for_bias_, | 
|  | Y->template mutable_data<T_Y>())); | 
|  | } | 
|  | // Done. | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool CudnnConvOp::RunOnDevice() { | 
|  | if (Input(0).IsType<float>()) { | 
|  | return DoRunWithType< | 
|  | float, // X | 
|  | float, // W | 
|  | float, // B | 
|  | float>(); // Y | 
|  | } else if (Input(0).IsType<at::Half>()) { | 
|  | return DoRunWithType< | 
|  | at::Half, // X | 
|  | at::Half, // W | 
|  | at::Half, // B | 
|  | at::Half>(); // Y | 
|  | } else { | 
|  | LOG(FATAL) << "Only float (32bit) and Half are supported by " | 
|  | << "cudnn convolution, but input " << debug_def().input(0) | 
|  | << " has [" << Input(0).meta().name() << "]"; | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | template < | 
|  | typename T_X, | 
|  | typename T_DY, | 
|  | typename T_W, | 
|  | typename T_B, | 
|  | typename T_DX, | 
|  | typename T_DW, | 
|  | typename T_DB> | 
|  | bool CudnnConvGradientOp::DoRunWithType() { | 
|  | auto& X = Input(INPUT); | 
|  | auto& filter = Input(FILTER); | 
|  | auto& dY = Input(OUTPUT_GRAD); | 
|  | auto* dfilter = Output(FILTER_GRAD); | 
|  |  | 
|  | CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5); | 
|  | CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5); | 
|  |  | 
|  | const int M = filter.dim32(0); | 
|  | int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0; | 
|  | int group_offset_X = 0, group_offset_Y = 0; | 
|  |  | 
|  | switch (order_) { | 
|  | case StorageOrder::NHWC: | 
|  | N = X.dim32(0); | 
|  | H = X.dim32(1); | 
|  | W = X.ndim() > 3 ? X.dim32(2) : 1; | 
|  | D = X.ndim() > 4 ? X.dim32(3) : 1; | 
|  | C = X.dim32(X.ndim() - 1); | 
|  | H_out = dY.dim32(1); | 
|  | W_out = dY.ndim() > 3 ? dY.dim32(2) : 1; | 
|  | D_out = dY.ndim() > 4 ? dY.dim32(3) : 1; | 
|  | for (int i = 0; i < kernel_.size(); ++i) { | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); | 
|  | } | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_); | 
|  | group_offset_X = C / group_; | 
|  | group_offset_Y = M / group_; | 
|  | break; | 
|  | case StorageOrder::NCHW: | 
|  | N = X.dim32(0); | 
|  | C = X.dim32(1); | 
|  | H = X.dim32(2); | 
|  | W = X.ndim() > 3 ? X.dim32(3) : 1; | 
|  | D = X.ndim() > 4 ? X.dim32(4) : 1; | 
|  | H_out = dY.dim32(2); | 
|  | W_out = dY.ndim() > 3 ? dY.dim32(3) : 1; | 
|  | D_out = dY.ndim() > 4 ? dY.dim32(4) : 1; | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_); | 
|  | for (int i = 0; i < kernel_.size(); ++i) { | 
|  | CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); | 
|  | } | 
|  | group_offset_X = C / group_ * H * W * D; | 
|  | group_offset_Y = M / group_ * H_out * W_out * D_out; | 
|  | break; | 
|  | default: | 
|  | LOG(FATAL) << "Unknown storage order: " << order_; | 
|  | } | 
|  |  | 
|  | CAFFE_ENFORCE( | 
|  | C % group_ == 0, | 
|  | "If you set group, the number of input channels should be divisible " | 
|  | "by group."); | 
|  | CAFFE_ENFORCE( | 
|  | M % group_ == 0, | 
|  | "If you set group, the number of output channels should be divisible " | 
|  | "by group."); | 
|  |  | 
|  | int group_offset_filter = filter.size() / group_; | 
|  | if (kernel_.size() == 1) { | 
|  | ConvPoolOpBase<CUDAContext>::ComputePads({H}); | 
|  | } else if (kernel_.size() == 2) { | 
|  | ConvPoolOpBase<CUDAContext>::ComputePads({H, W}); | 
|  | } else if (kernel_.size() == 3) { | 
|  | ConvPoolOpBase<CUDAContext>::ComputePads({H, W, D}); | 
|  | } else { | 
|  | CAFFE_THROW("Unsupported kernel size:", kernel_.size()); | 
|  | } | 
|  | dfilter->ResizeLike(filter); | 
|  |  | 
|  | // Set up the cudnn algorithms & workspace if necessary | 
|  | bool input_changed = (X.sizes() != cudnn_input_dims_); | 
|  | bool filter_changed = (filter.sizes() != cudnn_filter_dims_); | 
|  | if (input_changed || filter_changed) { | 
|  | VLOG(1) << "Changing the cudnn descriptor configurations."; | 
|  | if (input_changed) { | 
|  | cudnn_input_dims_ = X.sizes().vec(); | 
|  | SetTensorNdDescriptorWithGroup<T_X>( | 
|  | X.ndim(), bottom_desc_, N, C, H, W, D); | 
|  | } | 
|  | if (filter_changed) { | 
|  | cudnn_filter_dims_ = filter.sizes().vec(); | 
|  | if (kernel_.size() == 2) { | 
|  | #if CUDNN_VERSION_MIN(7, 0, 0) | 
|  | const int MM = M; | 
|  | #else | 
|  | const int MM = M / group_; | 
|  | #endif | 
|  | CUDNN_ENFORCE(cudnnSetFilter4dDescriptor( | 
|  | filter_desc_, | 
|  | cudnnTypeWrapper<T_W>::type, | 
|  | GetCudnnTensorFormat(order_), | 
|  | MM, | 
|  | C / group_, | 
|  | kernel_h(), | 
|  | kernel_w())); | 
|  | } else { | 
|  | vector<int> dims(filter.sizes().begin(), filter.sizes().end()); | 
|  | #if !CUDNN_VERSION_MIN(7, 0, 0) | 
|  | // We only need to divide dims by group_ when CUDNN version < 7.0 | 
|  | // see CUDA group convolution doc: https://fburl.com/dgj6dvpd | 
|  | order_ == StorageOrder::NCHW ? dims[1] /= group_ | 
|  | : dims[filter.ndim() - 1] /= group_; | 
|  | #endif | 
|  |  | 
|  | CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( | 
|  | filter_desc_, | 
|  | cudnnTypeWrapper<T_W>::type, | 
|  | GetCudnnTensorFormat(order_), | 
|  | dims.size(), | 
|  | dims.data())); | 
|  | } | 
|  | if (!no_bias_) { | 
|  | if (kernel_.size() == 2) { | 
|  | CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( | 
|  | bias_desc_, | 
|  | GetCudnnTensorFormat(order_), | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | 1, | 
|  | M, | 
|  | 1, | 
|  | 1)); | 
|  | } else { | 
|  | std::vector<int> bias_dims(X.ndim(), 1); | 
|  | bias_dims[1] = M; | 
|  | std::vector<int> strides = {M, 1, 1, 1, 1, 1}; | 
|  | CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( | 
|  | bias_desc_, | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | X.ndim() > 3 ? X.ndim() : 4, | 
|  | bias_dims.data(), | 
|  | strides.data())); | 
|  | } | 
|  | } | 
|  | } | 
|  | // Set the output | 
|  | SetTensorNdDescriptorWithGroup<T_DX>( | 
|  | X.ndim(), top_desc_, N, M, H_out, W_out, D_out); | 
|  | // Set the output with descriptor useful for bias addition in one run. | 
|  | if (kernel_.size() == 2) { | 
|  | CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( | 
|  | top_desc_for_bias_, | 
|  | GetCudnnTensorFormat(order_), | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | N, | 
|  | M, | 
|  | H_out, | 
|  | W_out)); | 
|  | } else { | 
|  | vector<int> dims = {N, M, H_out, W_out, D_out}; | 
|  | vector<int> strides = {M * H_out * W_out * D_out, | 
|  | H_out * W_out * D_out, | 
|  | W_out * D_out, | 
|  | D_out, | 
|  | 1}; | 
|  | CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( | 
|  | top_desc_for_bias_, | 
|  | cudnnTypeWrapper<T_B>::type, | 
|  | X.ndim() > 3 ? X.ndim() : 4, | 
|  | dims.data(), | 
|  | strides.data())); | 
|  | } | 
|  |  | 
|  | compute_type_ = DetermineComputeTypeFromInput(X); | 
|  | SetConvDescFromArguments(); | 
|  |  | 
|  | DuplicateConvDesc( | 
|  | conv_desc_, kernel_.size(), dilation_.size(), bwd_filter_conv_desc_); | 
|  | DuplicateConvDesc( | 
|  | conv_desc_, kernel_.size(), dilation_.size(), bwd_data_conv_desc_); | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(7, 0, 0) | 
|  | if (enable_tensor_core_) { | 
|  | CUDNN_ENFORCE(cudnnSetConvolutionMathType( | 
|  | bwd_filter_conv_desc_, CUDNN_TENSOR_OP_MATH)); | 
|  | CUDNN_ENFORCE(cudnnSetConvolutionMathType( | 
|  | bwd_data_conv_desc_, CUDNN_TENSOR_OP_MATH)); | 
|  | } | 
|  |  | 
|  | // set cuDNN groups if appropriate | 
|  | CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_filter_conv_desc_, group_)); | 
|  | CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_data_conv_desc_, group_)); | 
|  | #endif | 
|  |  | 
|  | // Choose dW algorithm | 
|  | if (force_algo_[ALGO_WGRAD] >= 0) { | 
|  | bwd_filter_algo_ = | 
|  | (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD]; | 
|  | } else if (deterministic_) { | 
|  | bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; | 
|  | } else if (exhaustive_search_) { | 
|  | // Even when FP16 compute is supported and requested, try FP32 | 
|  | // because it may be faster. However, if FP32 compute is specified, | 
|  | // FP16 is not a suitable alternative - early out from the loop. | 
|  | std::array<ConvBwdFilterAlgorithmWithCost, 2> algosToCompare; | 
|  | for (int i = 0; i < 2; i++) { | 
|  | SetConvDescComputeType(bwd_filter_conv_desc_, kComputeTypesToTry[i]); | 
|  |  | 
|  | algosToCompare[i] = filter_algo_cache_.getAlgorithm( | 
|  | X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() { | 
|  | VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive" | 
|  | << "search for " << kComputePassNames[i]; | 
|  | // When we do an exhaustive search, we will ignore the workspace | 
|  | // size limit and simply go for the fastest algorithm. If you | 
|  | // happen to run out of memory later, you will be on your own... | 
|  | int returned_algo_count; | 
|  | // We clean up the current workspace memory so that the forward | 
|  | // algorithm is free to allocate memory. | 
|  | // Actually run the search. | 
|  | std::array< | 
|  | cudnnConvolutionBwdFilterAlgoPerf_t, | 
|  | kNUM_CUDNN_BWD_FILTER_ALGS> | 
|  | filter_perf_stat; | 
|  |  | 
|  | cudnn_wrapper_.with_cudnn_state( | 
|  | cudnn_state_, [&](CuDNNState* state) { | 
|  | CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx( | 
|  | state->cudnn_handle(), | 
|  | bottom_desc_, | 
|  | X.template data<T_X>(), | 
|  | top_desc_, | 
|  | dY.template data<T_DY>(), | 
|  | bwd_filter_conv_desc_, | 
|  | filter_desc_, | 
|  | dfilter->template mutable_data<T_DW>(), | 
|  | kNUM_CUDNN_BWD_FILTER_ALGS, | 
|  | &returned_algo_count, | 
|  | filter_perf_stat.data(), | 
|  | state->workspace().get(cudnn_ws_nbytes_limit_), | 
|  | cudnn_ws_nbytes_limit_)); | 
|  | }); | 
|  | LogCuDNNPerfStats(filter_perf_stat, returned_algo_count); | 
|  | float algo_time = | 
|  | filter_perf_stat[0].status == CUDNN_STATUS_SUCCESS | 
|  | ? filter_perf_stat[0].time | 
|  | : 1e10; | 
|  | return ConvBwdFilterAlgorithmWithCost( | 
|  | filter_perf_stat[0].algo, algo_time); | 
|  | }); | 
|  |  | 
|  | // When set to fp32 compute, don't try fp16 | 
|  | if (compute_type_ == CUDNN_DATA_FLOAT) { | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (compute_type_ == CUDNN_DATA_FLOAT) { | 
|  | // For FP32 compute, just use the best FP32 algorithm | 
|  | bwd_filter_algo_ = std::get<0>(algosToCompare[0]); | 
|  | } else { | 
|  | // For FP16 compute, choose algo with fastest execution | 
|  | int bestAlgoIndex = | 
|  | (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1])) | 
|  | ? 0 | 
|  | : 1; | 
|  | bwd_filter_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]); | 
|  | SetConvDescComputeType( | 
|  | bwd_filter_conv_desc_, kComputeTypesToTry[bestAlgoIndex]); | 
|  | } | 
|  | } else { | 
|  | // choose backward algorithm for filter | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | bottom_desc_, | 
|  | top_desc_, | 
|  | bwd_filter_conv_desc_, | 
|  | filter_desc_, | 
|  | CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, | 
|  | cudnn_ws_nbytes_limit_, | 
|  | &bwd_filter_algo_)); | 
|  | } | 
|  | // Pick dX algo if needed | 
|  | if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { | 
|  | if (force_algo_[ALGO_DGRAD] >= 0) { | 
|  | bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD]; | 
|  | } else if (deterministic_) { | 
|  | bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; | 
|  | } else if (exhaustive_search_) { | 
|  | // Even when FP16 compute is supported and requested, try FP32 | 
|  | // because it may be faster. However, if FP32 compute is specified, | 
|  | // FP16 is not a suitable alternative - early out from the loop. | 
|  | std::array<ConvBwdDataAlgorithmWithCost, 2> algosToCompare; | 
|  | for (int i = 0; i < 2; i++) { | 
|  | SetConvDescComputeType(bwd_data_conv_desc_, kComputeTypesToTry[i]); | 
|  |  | 
|  | algosToCompare[i] = data_algo_cache_.getAlgorithm( | 
|  | X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() { | 
|  | VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive" | 
|  | << "search for " << kComputePassNames[i]; | 
|  | int returned_algo_count; | 
|  |  | 
|  | std::array< | 
|  | cudnnConvolutionBwdDataAlgoPerf_t, | 
|  | kNUM_CUDNN_BWD_DATA_ALGS> | 
|  | data_perf_stat; | 
|  | cudnn_wrapper_.with_cudnn_state( | 
|  | cudnn_state_, [&](CuDNNState* state) { | 
|  | auto* dX = | 
|  | Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD); | 
|  | dX->ResizeLike(X); | 
|  | const T_W* filter_data = filter.template data<T_W>(); | 
|  | const T_DY* dYdata = dY.template data<T_DY>(); | 
|  | T_DX* dXdata = dX->template mutable_data<T_DX>(); | 
|  | CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx( | 
|  | state->cudnn_handle(), | 
|  | filter_desc_, | 
|  | filter_data, | 
|  | top_desc_, | 
|  | dYdata, | 
|  | bwd_data_conv_desc_, | 
|  | bottom_desc_, | 
|  | dXdata, | 
|  | kNUM_CUDNN_BWD_DATA_ALGS, | 
|  | &returned_algo_count, | 
|  | data_perf_stat.data(), | 
|  | state->workspace().get(cudnn_ws_nbytes_limit_), | 
|  | cudnn_ws_nbytes_limit_)); | 
|  | }); | 
|  |  | 
|  | LogCuDNNPerfStats(data_perf_stat, returned_algo_count); | 
|  | float algo_time = | 
|  | data_perf_stat[0].status == CUDNN_STATUS_SUCCESS | 
|  | ? data_perf_stat[0].time | 
|  | : 1e10; | 
|  | return ConvBwdDataAlgorithmWithCost( | 
|  | data_perf_stat[0].algo, algo_time); | 
|  | }); | 
|  |  | 
|  | // When set to fp32 compute, don't try fp16 | 
|  | if (compute_type_ == CUDNN_DATA_FLOAT) { | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (compute_type_ == CUDNN_DATA_FLOAT) { | 
|  | // For FP32 compute, just use the best FP32 algorithm | 
|  | bwd_data_algo_ = std::get<0>(algosToCompare[0]); | 
|  | } else { | 
|  | // For FP16 compute, choose algo with fastest execution | 
|  | int bestAlgoIndex = | 
|  | (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1])) | 
|  | ? 0 | 
|  | : 1; | 
|  | bwd_data_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]); | 
|  | SetConvDescComputeType( | 
|  | bwd_data_conv_desc_, kComputeTypesToTry[bestAlgoIndex]); | 
|  | } | 
|  | } else { | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | filter_desc_, | 
|  | top_desc_, | 
|  | bwd_data_conv_desc_, | 
|  | bottom_desc_, | 
|  | CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, | 
|  | cudnn_ws_nbytes_limit_, | 
|  | &bwd_data_algo_)); | 
|  | } | 
|  | } | 
|  |  | 
|  | // get workspace size for backwards filter algorithm | 
|  | size_t bwd_filter_ws_size, bwd_data_ws_size; | 
|  |  | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterWorkspaceSize( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | bottom_desc_, | 
|  | top_desc_, | 
|  | bwd_filter_conv_desc_, | 
|  | filter_desc_, | 
|  | bwd_filter_algo_, | 
|  | &bwd_filter_ws_size)); | 
|  | if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { | 
|  | // get workspace size for backwards data algorithm | 
|  | CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | filter_desc_, | 
|  | top_desc_, | 
|  | bwd_data_conv_desc_, | 
|  | bottom_desc_, | 
|  | bwd_data_algo_, | 
|  | &bwd_data_ws_size)); | 
|  | } else { | 
|  | bwd_data_ws_size = 0; | 
|  | } | 
|  | cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size); | 
|  |  | 
|  | VLOG(1) << "CuDNN bwd data & filter algorithm: " << bwd_data_algo_ << ", " | 
|  | << bwd_filter_algo_; | 
|  | VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_; | 
|  | } | 
|  |  | 
|  | // Now, actually run the computation. | 
|  | if (!no_bias_) { | 
|  | auto* dbias = Output(BIAS_OR_INPUT_GRAD); | 
|  | dbias->Resize(M); | 
|  | CUDNN_ENFORCE(cudnnConvolutionBackwardBias( | 
|  | cudnn_wrapper_.inline_cudnn_handle(), | 
|  | cudnnTypeWrapper<T_DY>::kOne(), | 
|  | top_desc_for_bias_, | 
|  | dY.template data<T_DY>(), | 
|  | cudnnTypeWrapper<T_DB>::kZero(), | 
|  | bias_desc_, | 
|  | dbias->template mutable_data<T_DB>())); | 
|  | } | 
|  |  | 
|  | #if CUDNN_VERSION_MIN(7, 0, 0) | 
|  | cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { | 
|  | CUDNN_ENFORCE(cudnnConvolutionBackwardFilter( | 
|  | state->cudnn_handle(), | 
|  | cudnnTypeWrapper<T_X>::kOne(), | 
|  | bottom_desc_, | 
|  | X.template data<T_X>(), | 
|  | top_desc_, | 
|  | dY.template data<T_DY>(), | 
|  | bwd_filter_conv_desc_, | 
|  | bwd_filter_algo_, | 
|  | state->workspace().get(cudnn_ws_nbytes_), | 
|  | cudnn_ws_nbytes_, | 
|  | cudnnTypeWrapper<T_DW>::kZero(), | 
|  | filter_desc_, | 
|  | dfilter->template mutable_data<T_DW>())); | 
|  | if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { | 
|  | // Compute the gradient w.r.t. the input. | 
|  | auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD); | 
|  | dX->ResizeLike(X); | 
|  | CUDNN_ENFORCE(cudnnConvolutionBackwardData( | 
|  | state->cudnn_handle(), | 
|  | cudnnTypeWrapper<T_W>::kOne(), | 
|  | filter_desc_, | 
|  | filter.template data<T_W>(), | 
|  | top_desc_, | 
|  | dY.template data<T_DY>(), | 
|  | bwd_data_conv_desc_, | 
|  | bwd_data_algo_, | 
|  | state->workspace().get(cudnn_ws_nbytes_), | 
|  | cudnn_ws_nbytes_, | 
|  | cudnnTypeWrapper<T_DX>::kZero(), | 
|  | bottom_desc_, | 
|  | dX->template mutable_data<T_DX>())); | 
|  | } | 
|  | }); | 
|  | #else | 
|  | for (int i = 0; i < group_; ++i) { | 
|  | cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { | 
|  | CUDNN_ENFORCE(cudnnConvolutionBackwardFilter( | 
|  | state->cudnn_handle(), | 
|  | cudnnTypeWrapper<T_X>::kOne(), | 
|  | bottom_desc_, | 
|  | X.template data<T_X>() + i * group_offset_X, | 
|  | top_desc_, | 
|  | dY.template data<T_DY>() + i * group_offset_Y, | 
|  | bwd_filter_conv_desc_, | 
|  | bwd_filter_algo_, | 
|  | state->workspace().get(cudnn_ws_nbytes_), | 
|  | cudnn_ws_nbytes_, | 
|  | cudnnTypeWrapper<T_DW>::kZero(), | 
|  | filter_desc_, | 
|  | dfilter->template mutable_data<T_DW>() + i * group_offset_filter)); | 
|  | if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { | 
|  | // Compute the gradient w.r.t. the input. | 
|  | auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD); | 
|  | dX->ResizeLike(X); | 
|  | CUDNN_ENFORCE(cudnnConvolutionBackwardData( | 
|  | state->cudnn_handle(), | 
|  | cudnnTypeWrapper<T_W>::kOne(), | 
|  | filter_desc_, | 
|  | filter.template data<T_W>() + i * group_offset_filter, | 
|  | top_desc_, | 
|  | dY.template data<T_DY>() + i * group_offset_Y, | 
|  | bwd_data_conv_desc_, | 
|  | bwd_data_algo_, | 
|  | state->workspace().get(cudnn_ws_nbytes_), | 
|  | cudnn_ws_nbytes_, | 
|  | cudnnTypeWrapper<T_DX>::kZero(), | 
|  | bottom_desc_, | 
|  | dX->template mutable_data<T_DX>() + i * group_offset_X)); | 
|  | } | 
|  | }); | 
|  | } | 
|  | #endif | 
|  | return true; | 
|  | } | 
|  |  | 
|  | // TODO(Yangqing): a lot of the function contents are very similar. Consider | 
|  | // consolidating them. | 
|  | bool CudnnConvGradientOp::RunOnDevice() { | 
|  | if (Input(0).IsType<float>()) { | 
|  | return DoRunWithType< | 
|  | float, //  X | 
|  | float, // dY | 
|  | float, //  W | 
|  | float, //  b | 
|  | float, // dX | 
|  | float, // dW | 
|  | float>(); // db | 
|  | } else if (Input(0).IsType<at::Half>()) { | 
|  | return DoRunWithType< | 
|  | at::Half, //  X | 
|  | at::Half, // dY | 
|  | at::Half, //  W | 
|  | at::Half, //  b | 
|  | at::Half, // dX | 
|  | at::Half, // dW | 
|  | at::Half>(); // db | 
|  | } else { | 
|  | LOG(FATAL) << "Unsupported input types"; | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp); | 
|  | REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp); | 
|  |  | 
|  | REGISTER_CUDNN_OPERATOR(Conv1D, CudnnConvOp); | 
|  | REGISTER_CUDNN_OPERATOR(Conv1DGradient, CudnnConvGradientOp); | 
|  |  | 
|  | REGISTER_CUDNN_OPERATOR(Conv2D, CudnnConvOp); | 
|  | REGISTER_CUDNN_OPERATOR(Conv2DGradient, CudnnConvGradientOp); | 
|  |  | 
|  | REGISTER_CUDNN_OPERATOR(Conv3D, CudnnConvOp); | 
|  | REGISTER_CUDNN_OPERATOR(Conv3DGradient, CudnnConvGradientOp); | 
|  |  | 
|  | } // namespace caffe2 |