#include <limits>

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
| |
| namespace caffe2 { |
| |
| // cudnnRestoreDropoutDescriptor is needed for correctness and |
| // doesn't exist prior to cuDNN v7 |
| #if CUDNN_VERSION_MIN(7,0,0) |
| |
| class CuDNNDropoutOp final : public Operator<CUDAContext> { |
| public: |
| USE_OPERATOR_FUNCTIONS(CUDAContext); |
| |
| explicit CuDNNDropoutOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<CUDAContext>(operator_def, ws), |
| cudnn_wrapper_(&context_), |
| ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)), |
| is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)), |
| states_initialized_(false), |
| random_seed_(operator_def.device_option().random_seed()) { |
| CAFFE_ENFORCE_GE(ratio_, 0); |
| CAFFE_ENFORCE_LT(ratio_, 1); |
| CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_)); |
| |
| CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_)); |
| CUDNN_ENFORCE(cudnnDropoutGetStatesSize( |
| cudnn_wrapper_.inline_cudnn_handle(), |
| reinterpret_cast<size_t*>(&states_size_in_bytes_))); |
| |
| if (!is_test_) { |
| scratch_blob_ = ws->CreateBlob(scratch_blob_name(operator_def.output(1))); |
| CAFFE_ENFORCE(scratch_blob_); |
| } |
| } |
| |
| ~CuDNNDropoutOp() noexcept override { |
| CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_)); |
| CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_)); |
| } |
| |
| template <typename T, typename M> |
| bool DoRunWithType(); |
| |
| bool RunOnDevice() override; |
| |
| static string scratch_blob_name(string mask_blob_name) { |
| return "cudnn_dropout_scratch_" + mask_blob_name; |
| } |
| |
| protected: |
| CuDNNWrapper cudnn_wrapper_; |
| cudnnTensorDescriptor_t data_desc_; |
| cudnnDropoutDescriptor_t dropout_desc_; |
| |
| vector<int64_t> cudnn_input_dims_; |
| |
| float ratio_; |
| bool is_test_; |
| |
| Blob* scratch_blob_ = nullptr; |
| |
| size_t states_size_in_bytes_, reserve_space_size_in_bytes_; |
| // Input: X, Output: Y, mask_and_states |
| |
| // track whether states have been initialized - only needs to happen once |
| bool states_initialized_; |
| |
| // random seed |
| unsigned long long random_seed_; |
| }; |
| |
| class CuDNNDropoutGradientOp final : public Operator<CUDAContext> { |
| public: |
| USE_OPERATOR_FUNCTIONS(CUDAContext); |
| explicit CuDNNDropoutGradientOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<CUDAContext>(operator_def, ws), |
| cudnn_wrapper_(&context_), |
| ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)), |
| is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)), |
| states_initialized_(false), |
| random_seed_(operator_def.device_option().random_seed()) { |
| CAFFE_ENFORCE_GE(ratio_, 0); |
| CAFFE_ENFORCE_LT(ratio_, 1); |
| CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_)); |
| |
| CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_)); |
| CUDNN_ENFORCE(cudnnDropoutGetStatesSize( |
| cudnn_wrapper_.inline_cudnn_handle(), |
| reinterpret_cast<size_t*>(&states_size_in_bytes_))); |
| |
| // Share scratch with the forward op |
| scratch_blob_ = |
| ws->GetBlob(CuDNNDropoutOp::scratch_blob_name(operator_def.input(1))); |
| CAFFE_ENFORCE(scratch_blob_); |
| } |
| |
| ~CuDNNDropoutGradientOp() noexcept override { |
| CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_)); |
| CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_)); |
| } |
| |
| template <typename T, typename M> |
| bool DoRunWithType(); |
| |
| bool RunOnDevice() override; |
| |
| protected: |
| CuDNNWrapper cudnn_wrapper_; |
| cudnnTensorDescriptor_t data_desc_; |
| cudnnDropoutDescriptor_t dropout_desc_; |
| |
| vector<int64_t> cudnn_input_dims_; |
| |
| Blob* scratch_blob_; |
| |
| float ratio_; |
| bool is_test_; |
| |
| size_t states_size_in_bytes_, reserve_space_size_in_bytes_; |
| // Input: dY, mask_and_states, Output: dX |
| |
| // only need to initialize states once (size is static) |
| bool states_initialized_; |
| |
| unsigned long long random_seed_; |
| }; |
| |
| template <typename T, typename M> |
| bool CuDNNDropoutOp::DoRunWithType() { |
| const auto& X = Input(0); |
| auto* Y = Output(0); |
| |
| auto size_prod = 1; |
| for (auto dim : X.sizes()) { |
| size_prod *= dim; |
| } |
| // now actually run the computation |
| if (is_test_) { |
| if (Y != &X) { |
| context_.CopySameDevice<T>( |
| X.numel(), X.template data<T>(), Y->template mutable_data<T>()); |
| } |
| return true; |
| } else { |
| // Reshape tensor descriptors if necessary |
| if (X.sizes() != cudnn_input_dims_) { |
| CAFFE_ENFORCE(scratch_blob_); |
| Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA); |
| cudnn_input_dims_ = X.sizes().vec(); |
| CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( |
| data_desc_, |
| GetCudnnTensorFormat(StorageOrder::NCHW), |
| cudnnTypeWrapper<T>::type, |
| size_prod, |
| 1, |
| 1, |
| 1)); |
| |
| // get the reserve space we need |
| CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize( |
| data_desc_, &reserve_space_size_in_bytes_)); |
| |
| states->Resize(states_size_in_bytes_); |
| |
| if (!states_initialized_) { |
| // set the dropout descriptor (note: need to allocate the states data |
| // before acquiring the mutex) |
| uint8_t* states_data = states->template mutable_data<uint8_t>(); |
| { |
| // Need to protect as clashes with NCCL |
| std::lock_guard<std::mutex> lk(CUDAContext::mutex()); |
| CUDNN_ENFORCE(cudnnSetDropoutDescriptor( |
| dropout_desc_, |
| cudnn_wrapper_.inline_cudnn_handle(), |
| ratio_, |
| states_data, |
| states_size_in_bytes_, |
| random_seed_ |
| )); |
| } |
| states_initialized_ = true; |
| } |
| } |
| auto* mask = Output( |
| 1, |
| {static_cast<int64_t>(reserve_space_size_in_bytes_)}, |
| at::dtype<uint8_t>()); |
| CUDNN_ENFORCE(cudnnDropoutForward( |
| cudnn_wrapper_.inline_cudnn_handle(), |
| dropout_desc_, |
| data_desc_, |
| X.template data<T>(), |
| data_desc_, |
| Y->template mutable_data<T>(), |
| mask->template mutable_data<uint8_t>(), |
| reserve_space_size_in_bytes_)); |
| } |
| return true; |
| } |
| |
| bool CuDNNDropoutOp::RunOnDevice() { |
| // dispatch based on contents of tensor(s) |
| const auto& X = Input(0); |
| auto* Y = Output(0); |
| Y->ResizeLike(X); |
| |
| if (X.IsType<float>()) { |
| return DoRunWithType<float, float>(); |
| } else if (X.IsType<at::Half>()) { |
| return DoRunWithType<at::Half, float>(); |
| } |
| return false; |
| } |
| |
| template <typename T, typename M> |
| bool CuDNNDropoutGradientOp::DoRunWithType() { |
| const auto& dY = Input(0); |
| const auto& mask = Input(1); |
| const Tensor& states = scratch_blob_->Get<Tensor>(); |
| auto* dX = Output(0); |
| |
| auto size_prod = 1; |
| for (auto dim : dY.sizes()) { |
| size_prod *= dim; |
| } |
| |
| if (!states_initialized_) { |
| // set the dropout descriptor |
| { |
| // Need to protect as clashes with NCCL |
| std::lock_guard<std::mutex> lk(CUDAContext::mutex()); |
| CUDNN_ENFORCE(cudnnRestoreDropoutDescriptor( |
| dropout_desc_, |
| cudnn_wrapper_.inline_cudnn_handle(), |
| ratio_, |
| const_cast<uint8_t*>(states.data<uint8_t>()), |
| states_size_in_bytes_, |
| random_seed_ |
| )); |
| } |
| states_initialized_ = true; |
| } |
| |
| if (dY.sizes() != cudnn_input_dims_) { |
| cudnn_input_dims_ = dY.sizes().vec(); |
| CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( |
| data_desc_, |
| GetCudnnTensorFormat(StorageOrder::NCHW), |
| cudnnTypeWrapper<T>::type, |
| size_prod, |
| 1, |
| 1, |
| 1)); |
| |
| // get the reserve space we need |
| CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize( |
| data_desc_, &reserve_space_size_in_bytes_)); |
| } |
| |
| // run the computation |
| void* mask_data = const_cast<void*>(mask.raw_data()); |
| CUDNN_ENFORCE(cudnnDropoutBackward( |
| cudnn_wrapper_.inline_cudnn_handle(), |
| dropout_desc_, |
| data_desc_, |
| dY.data<T>(), |
| data_desc_, |
| dX->template mutable_data<T>(), |
| mask_data, |
| reserve_space_size_in_bytes_)); |
| return true; |
| } |
| |
| bool CuDNNDropoutGradientOp::RunOnDevice() { |
| // dispatch based on contents of tensor(s) |
| const auto& dY = Input(0); |
| auto* dX = Output(0); |
| |
| dX->ResizeLike(dY); |
| |
| if (dY.IsType<float>()) { |
| return DoRunWithType<float, float>(); |
| } else if (dY.IsType<at::Half>()) { |
| return DoRunWithType<at::Half, float>(); |
| } |
| return false; |
| } |
| |
namespace {
// Register these implementations under the CUDNN engine for the existing
// Dropout / DropoutGrad operator schemas.
REGISTER_CUDNN_OPERATOR(Dropout, CuDNNDropoutOp);
REGISTER_CUDNN_OPERATOR(DropoutGrad, CuDNNDropoutGradientOp);
} // namespace
| |
| #endif |
| |
| }; // namespace caffe2 |