| #include "caffe2/core/common_cudnn.h" |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/core/types.h" |
| |
| namespace caffe2 { |
| |
| class CuDNNReluOp final : public Operator<CUDAContext> { |
| public: |
| CuDNNReluOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<CUDAContext>(operator_def, ws), |
| cudnn_wrapper_(&context_), |
| order_(StringToStorageOrder( |
| OperatorBase::GetSingleArgument<string>("order", "NCHW"))) { |
| CUDNN_CHECK(cudnnCreateTensorDescriptor(&data_desc_)); |
| CUDNN_CHECK(cudnnCreateActivationDescriptor(&activ_desc_)); |
| CUDNN_CHECK(cudnnSetActivationDescriptor( |
| activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); |
| |
| // Choose function body for given math type |
| TensorProto_DataType math = TensorProto_DataType_FLOAT; // hardcode for now |
| |
| switch (math) { |
| case TensorProto_DataType_FLOAT: |
| body_ = &CuDNNReluOp::DoRunWithMathType<float>; |
| break; |
| case TensorProto_DataType_FLOAT16: |
| body_ = &CuDNNReluOp::DoRunWithMathType<float16>; |
| break; |
| default: |
| CAFFE_THROW("Invalid math type specified"); |
| } |
| } |
| |
| ~CuDNNReluOp() { |
| CUDNN_CHECK(cudnnDestroyTensorDescriptor(data_desc_)); |
| CUDNN_CHECK(cudnnDestroyActivationDescriptor(activ_desc_)); |
| } |
| |
| template <typename M> |
| bool DoRunWithMathType() { |
| return DispatchHelper< |
| TensorTypes< |
| float, |
| float16>, |
| M>::call(this, Input(0)); |
| } |
| |
| template <typename T, typename M> |
| bool DoRunWithType() { |
| const auto& X = Input(0); |
| auto* Y = Output(0); |
| // See if we need to reshape. |
| if (X.dims() != cudnn_input_dims_) { |
| VLOG(1) << "Setting descriptors."; |
| cudnn_input_dims_ = X.dims(); |
| int C = 1, H = 1, W = 1; |
| if (X.ndim() == 4) { |
| // Normal 4-dimensional tensors for images. |
| C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3)); |
| H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1)); |
| W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2)); |
| } else { |
| // If X is not 4-dimensional, we will simply use H = 1 and W = 1 |
| // and wrap everything into C. |
| C = X.size() / X.dim32(0); |
| } |
| CUDNN_CHECK(cudnnSetTensor4dDescriptor( |
| data_desc_, GetCudnnTensorFormat(order_), |
| cudnnTypeWrapper<T>::type, X.dim32(0), C, H, W)); |
| } |
| CUDNN_CHECK(cudnnActivationForward(cudnn_wrapper_.inline_cudnn_handle(), |
| activ_desc_, cudnnTypeWrapper<T>::kOne(), data_desc_, |
| X.template data<T>(), cudnnTypeWrapper<T>::kZero(), |
| data_desc_, Y->template mutable_data<T>())); |
| return true; |
| } |
| |
| bool RunOnDevice() override { |
| // dispatch based on contents of tensor(s) |
| const auto& X = Input(0); |
| auto* Y = Output(0); |
| Y->ResizeLike(X); |
| |
| return (this->*body_)(); |
| } |
| |
| protected: |
| CuDNNWrapper cudnn_wrapper_; |
| cudnnTensorDescriptor_t data_desc_; |
| cudnnActivationDescriptor_t activ_desc_; |
| vector<TIndex> cudnn_input_dims_; |
| StorageOrder order_; |
| bool (CuDNNReluOp::*body_)(); |
| }; |
| |
| |
| // Note: You can see that in CuDNNReluGradientOp, we abused the cudnn interface |
| // by passing in the output tensor for both bottom and top. This is dependent on |
| // the assumption that the Relu gradient actually does not rely on the bottom |
| // data, or it treats input=0 the same way as input<0. This is of course not |
| // very safe, but we have been running in this way in Caffe for a while so it |
| // *might* be safe to assume so. |
| class CuDNNReluGradientOp final : public Operator<CUDAContext> { |
| public: |
| CuDNNReluGradientOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<CUDAContext>(operator_def, ws), |
| cudnn_wrapper_(&context_), |
| order_(StringToStorageOrder( |
| OperatorBase::GetSingleArgument<string>("order", "NCHW"))) { |
| CUDNN_CHECK(cudnnCreateTensorDescriptor(&data_desc_)); |
| CUDNN_CHECK(cudnnCreateActivationDescriptor(&activ_desc_)); |
| CUDNN_CHECK(cudnnSetActivationDescriptor( |
| activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); |
| |
| // Choose function body for given math type |
| TensorProto_DataType math = TensorProto_DataType_FLOAT; // hardcode for now |
| |
| switch (math) { |
| case TensorProto_DataType_FLOAT: |
| body_ = &CuDNNReluGradientOp::DoRunWithMathType<float>; |
| break; |
| case TensorProto_DataType_FLOAT16: |
| body_ = &CuDNNReluGradientOp::DoRunWithMathType<float16>; |
| break; |
| default: |
| CAFFE_THROW("Invalid math type specified"); |
| } |
| } |
| |
| ~CuDNNReluGradientOp() { |
| CUDNN_CHECK(cudnnDestroyTensorDescriptor(data_desc_)); |
| CUDNN_CHECK(cudnnDestroyActivationDescriptor(activ_desc_)); |
| } |
| |
| template <typename M> |
| bool DoRunWithMathType() { |
| return DispatchHelper< |
| TensorTypes< |
| float, |
| float16>, |
| M>::call(this, Input(0)); |
| } |
| |
| template <typename T, typename M> |
| bool DoRunWithType() { |
| const auto& Y = Input(0); |
| const auto& dY = Input(1); |
| auto* dX = Output(0); |
| // See if we need to reshape. |
| if (Y.dims() != cudnn_input_dims_) { |
| VLOG(1) << "Setting descriptors."; |
| cudnn_input_dims_ = Y.dims(); |
| int C = 1, H = 1, W = 1; |
| if (Y.ndim() == 4) { |
| // Normal 4-dimensional tensors for images. |
| C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3)); |
| H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1)); |
| W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2)); |
| } else { |
| // If Y is not 4-dimensional, we will simply use H = 1 and W = 1 |
| // and wrap everything into C. |
| C = Y.size() / Y.dim32(0); |
| } |
| CUDNN_CHECK(cudnnSetTensor4dDescriptor( |
| data_desc_, GetCudnnTensorFormat(order_), |
| cudnnTypeWrapper<T>::type, Y.dim32(0), C, H, W)); |
| } |
| const typename cudnnTypeWrapper<T>::ScalingParamType kOne = 1; |
| const typename cudnnTypeWrapper<T>::ScalingParamType kZero = 0; |
| CUDNN_CHECK(cudnnActivationBackward(cudnn_wrapper_.inline_cudnn_handle(), |
| activ_desc_, &kOne, data_desc_, Y.template data<T>(), |
| data_desc_, dY.template data<T>(), data_desc_, Y.template data<T>(), |
| &kZero, data_desc_, dX->template mutable_data<T>())); |
| return true; |
| } |
| |
| bool RunOnDevice() override { |
| const auto& Y = Input(0); |
| const auto& dY = Input(1); |
| auto* dX = Output(0); |
| dX->ResizeLike(Y); |
| |
| return (this->*body_)(); |
| } |
| |
| protected: |
| CuDNNWrapper cudnn_wrapper_; |
| cudnnTensorDescriptor_t data_desc_; |
| cudnnActivationDescriptor_t activ_desc_; |
| vector<TIndex> cudnn_input_dims_; |
| StorageOrder order_; |
| // Input: Y, dY; Output: dX |
| private: |
| bool (CuDNNReluGradientOp::*body_)(); |
| }; |
| |
| namespace { |
| REGISTER_CUDNN_OPERATOR(Relu, CuDNNReluOp); |
| REGISTER_CUDNN_OPERATOR(ReluGradient, CuDNNReluGradientOp); |
| } // namespace |
| } // namespace caffe2 |