#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_cache_cudnn.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
// In its early days, Caffe set the default cudnn workspace limit to 8MB. We
// bump it up to 64MB in Caffe2, as this enables the use of Winograd in many
// cases, something very beneficial to more recent CNN models.
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 64*1024*1024;
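// The limit can be overridden per operator via the "ws_nbytes_limit"
// argument read in CudnnConvOpBase below; an illustrative (hypothetical)
// OperatorDef in prototxt form:
//
//   op {
//     type: "Conv"
//     engine: "CUDNN"
//     arg { name: "ws_nbytes_limit" i: 134217728 }  # 128MB
//   }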
// Manually specified number of algorithms implemented in CuDNN.
// This has no performance implications, as we will always find the fastest
// algorithm; setting these to the right counts simply lets us report complete
// statistics when doing an exhaustive search.
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
namespace {
template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
inline void LogCuDNNPerfStats(
const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
int returned_algo_count) {
VLOG(1) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " "
<< stat.memory;
}
}
} // namespace
class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
public:
CudnnConvOpBase(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CUDAContext>(operator_def, ws),
cudnn_wrapper_(&context_),
cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
"ws_nbytes_limit",
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
exhaustive_search_(
OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
deterministic_(
OperatorBase::GetSingleArgument<int>("deterministic", 0)),
cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
CAFFE_ENFORCE(group_ > 0);
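    // deterministic_ and exhaustive_search_ are mutually exclusive: an
    // exhaustive search may pick a non-deterministic algorithm.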
CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
OPERATOR_NEEDS_FEATURE(
pad_t_ == pad_b_,
"The current padding scheme leads to unequal padding on the top and "
"bottom, which is not supported by cudnn.");
OPERATOR_NEEDS_FEATURE(
pad_l_ == pad_r_,
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
OPERATOR_NEEDS_FEATURE(
dilation_h_ == 1 && dilation_w_ == 1,
"The cudnn convolution does not support dilation yet.");
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&top_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&top_desc_for_bias_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
}
~CudnnConvOpBase() {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bottom_desc_));
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_for_bias_));
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
}
protected:
  // A helper function to set up the tensor 4d descriptor, depending on the
  // storage order, the group count, and the data type given.
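  // For example (a sketch of the arithmetic below): with NCHW, N = 1, C = 6,
  // H = W = 4 and group_ = 3, the descriptor advertises C / group_ = 2
  // channels but keeps the strides of the full 6-channel tensor
  // (C*H*W = 96, H*W = 16, W = 4, 1), so group i's slice is addressed by
  // offsetting the data pointer by i * (C / group_) * H * W = i * 32.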
template <typename T>
void SetTensor4dDescriptorWithGroup(
cudnnTensorDescriptor_t desc_,
int N,
int C,
int H,
int W) {
switch (order_) {
case StorageOrder::NHWC:
CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(
desc_,
cudnnTypeWrapper<T>::type,
N,
C / group_,
H,
W,
H * W * C,
1,
W * C,
C));
break;
case StorageOrder::NCHW:
CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(
desc_,
cudnnTypeWrapper<T>::type,
N,
C / group_,
H,
W,
C * H * W,
H * W,
W,
1));
break;
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
}
vector<TIndex> cudnn_input_dims_;
vector<TIndex> cudnn_filter_dims_;
CuDNNWrapper cudnn_wrapper_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnTensorDescriptor_t bias_desc_;
cudnnTensorDescriptor_t top_desc_;
  // Top descriptor covering the full (ungrouped) output, used for the bias
  // addition when we do group convolution.
cudnnTensorDescriptor_t top_desc_for_bias_;
cudnnConvolutionDescriptor_t conv_desc_;
const size_t cudnn_ws_nbytes_limit_;
size_t cudnn_ws_nbytes_;
bool exhaustive_search_;
bool deterministic_;
size_t cudnn_state_;
};
template <typename T>
class CudnnConvOp final : public CudnnConvOpBase {
public:
CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
: CudnnConvOpBase(operator_def, ws) {}
~CudnnConvOp() {}
bool RunOnDevice() override;
private:
cudnnConvolutionFwdAlgo_t algo_;
AlgorithmsCache<cudnnConvolutionFwdAlgo_t> algo_cache_;
// Input: X, W, b
// Output: Y
INPUT_TAGS(INPUT, FILTER, BIAS);
};
template <typename T>
class CudnnConvGradientOp final : public CudnnConvOpBase {
public:
CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: CudnnConvOpBase(operator_def, ws),
no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
    CAFFE_ENFORCE(
        !(no_bias_ && OutputSize() == 3),
        "If bias is not present, you should not have three gradient outputs.");
}
~CudnnConvGradientOp() {}
bool RunOnDevice() override;
private:
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> filter_algo_cache_;
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> data_algo_cache_;
bool no_bias_;
// input: X, W, dY
// output: dW, db, and optionally dX
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
};
////////////////////////////////////////////////////////////////////////////////
// Implementations
////////////////////////////////////////////////////////////////////////////////
template <typename T>
bool CudnnConvOp<T>::RunOnDevice() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto* Y = Output(0);
// Figure out the output shape
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim32(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, M);
int N = 0, C = 0, H = 0, W = 0, H_out = 0, W_out = 0;
int group_offset_X = 0, group_offset_Y = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0); H = X.dim32(1); W = X.dim32(2); C = X.dim32(3);
      H_out = Y->dim32(1); W_out = Y->dim32(2);
      CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h_);
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w_);
      CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0); C = X.dim32(1); H = X.dim32(2); W = X.dim32(3);
      H_out = Y->dim32(2); W_out = Y->dim32(3);
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h_);
      CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w_);
      group_offset_X = C / group_ * H * W;
      group_offset_Y = M / group_ * H_out * W_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  // Check group divisibility now that C has been read from the input;
  // checking earlier would test the zero-initialized value.
  CAFFE_ENFORCE(
      C % group_ == 0,
      "If you set group, the number of input channels should be divisible "
      "by group.");
  CAFFE_ENFORCE(
      M % group_ == 0,
      "If you set group, the number of output channels should be divisible "
      "by group.");
int group_offset_filter = filter.size() / group_;
// Set up the cudnn algorithms & workspace if necessary
bool input_changed = (X.dims() != cudnn_input_dims_);
bool filter_changed = (filter.dims() != cudnn_filter_dims_);
if (input_changed || filter_changed) {
VLOG(1) << "Changing the cudnn descriptor configurations.";
if (input_changed) {
cudnn_input_dims_ = X.dims();
SetTensor4dDescriptorWithGroup<T>(bottom_desc_, N, C, H, W);
}
if (filter_changed) {
cudnn_filter_dims_ = filter.dims();
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
filter_desc_,
cudnnTypeWrapper<T>::type,
GetCudnnTensorFormat(order_),
M / group_,
C / group_,
kernel_h_,
kernel_w_));
if (InputSize() == 3) {
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
bias_desc_, GetCudnnTensorFormat(order_), cudnnTypeWrapper<T>::type,
1, M, 1, 1));
}
}
    // Set the per-group output descriptor.
    SetTensor4dDescriptorWithGroup<T>(top_desc_, N, M, H_out, W_out);
    // Also set a descriptor over the full output, used to add the bias in
    // one run.
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
top_desc_for_bias_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
N,
M,
H_out,
W_out));
// Set the convolution descriptor
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
CUDNN_CROSS_CORRELATION));
if (deterministic_) {
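      // Skip any search so the algorithm choice is reproducible;
      // IMPLICIT_PRECOMP_GEMM is a conservative pick with a modest workspace
      // requirement.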
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (exhaustive_search_) {
algo_ = algo_cache_.getAlgorithm(X.dims(), filter.dims(), [&]() {
VLOG(1) << "CUDNN Convolution: doing exhaustive search.";
// When we do an exhaustive search, we will ignore the workspace size
// limit and simply go for the fastest algorithm. If you happen to run
// out of memory later, you will be on your own...
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
perf_stat;
      // No need to clean up the workspace afterwards; the CuDNNState owns
      // and reuses it.
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
// Actually run the search.
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx(
state->cudnn_handle(),
bottom_desc_,
X.template data<T>(),
filter_desc_,
filter.template data<T>(),
conv_desc_,
top_desc_,
Y->template mutable_data<T>(),
kNUM_CUDNN_FWD_ALGS,
&returned_algo_count,
perf_stat.data(),
state->workspace().get(cudnn_ws_nbytes_limit_),
cudnn_ws_nbytes_limit_));
});
LogCuDNNPerfStats(perf_stat, returned_algo_count);
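      // cudnnFind* returns its results sorted by execution time, so the
      // first entry is the fastest algorithm.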
return perf_stat[0].algo;
});
} else {
// Get the convolution algorithm based on the workspace limit.
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
cudnn_wrapper_.inline_cudnn_handle(),
bottom_desc_, filter_desc_, conv_desc_, top_desc_,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
cudnn_ws_nbytes_limit_,
&algo_));
}
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
cudnn_wrapper_.inline_cudnn_handle(),
bottom_desc_, filter_desc_, conv_desc_, top_desc_,
algo_, &cudnn_ws_nbytes_));
VLOG(1) << "CuDNN algorithm: " << algo_;
VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
}
  // Now, actually run the computation, one convolution per group; each group
  // addresses its channel slice via the group_offset_* pointer offsets.
  // Filter
for (int i = 0; i < group_; ++i) {
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
CUDNN_CHECK(cudnnConvolutionForward(
state->cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
bottom_desc_,
X.template data<T>() + i * group_offset_X,
filter_desc_,
filter.template data<T>() + i * group_offset_filter,
conv_desc_,
algo_,
state->workspace().get(cudnn_ws_nbytes_),
cudnn_ws_nbytes_,
cudnnTypeWrapper<T>::kZero(),
top_desc_,
Y->template mutable_data<T>() + i * group_offset_Y));
});
}
  // Bias: added once over the full output tensor via top_desc_for_bias_,
  // rather than per group.
if (InputSize() == 3) {
auto& bias = Input(BIAS);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim32(0), M);
CUDNN_CHECK(cudnnAddTensor(
cudnn_wrapper_.inline_cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
bias_desc_,
bias.template data<T>(),
cudnnTypeWrapper<T>::kOne(),
top_desc_for_bias_,
Y->template mutable_data<T>()));
}
// Done.
return true;
}
// TODO(Yangqing): a lot of the function contents are very similar. Consider
// consolidating them.
template <typename T>
bool CudnnConvGradientOp<T>::RunOnDevice() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim32(0);
int N = 0, C = 0, H = 0, W = 0, H_out = 0, W_out = 0;
int group_offset_X = 0, group_offset_Y = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0); H = X.dim32(1); W = X.dim32(2); C = X.dim32(3);
      H_out = dY.dim32(1); W_out = dY.dim32(2);
      CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h_);
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w_);
      CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0); C = X.dim32(1); H = X.dim32(2); W = X.dim32(3);
      H_out = dY.dim32(2); W_out = dY.dim32(3);
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h_);
      CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w_);
      group_offset_X = C / group_ * H * W;
      group_offset_Y = M / group_ * H_out * W_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  // Check group divisibility now that C has been read from the input;
  // checking earlier would test the zero-initialized value.
  CAFFE_ENFORCE(
      C % group_ == 0,
      "If you set group, the number of input channels should be divisible "
      "by group.");
  CAFFE_ENFORCE(
      M % group_ == 0,
      "If you set group, the number of output channels should be divisible "
      "by group.");
int group_offset_filter = filter.size() / group_;
ConvPoolOpBase<CUDAContext>::ComputePads(H, W);
dfilter->ResizeLike(filter);
// Set up the cudnn algorithms & workspace if necessary
bool input_changed = (X.dims() != cudnn_input_dims_);
bool filter_changed = (filter.dims() != cudnn_filter_dims_);
if (input_changed || filter_changed) {
VLOG(1) << "Changing the cudnn descriptor configurations.";
if (input_changed) {
cudnn_input_dims_ = X.dims();
SetTensor4dDescriptorWithGroup<T>(bottom_desc_, N, C, H, W);
}
if (filter_changed) {
cudnn_filter_dims_ = filter.dims();
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
filter_desc_,
cudnnTypeWrapper<T>::type,
GetCudnnTensorFormat(order_),
M / group_,
C / group_,
kernel_h_,
kernel_w_));
if (!no_bias_) {
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
bias_desc_, GetCudnnTensorFormat(order_), cudnnTypeWrapper<T>::type,
1, M, 1, 1));
}
}
    // Set the per-group output descriptor.
    SetTensor4dDescriptorWithGroup<T>(top_desc_, N, M, H_out, W_out);
    // Also set a descriptor over the full output, used to add the bias in
    // one run.
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
top_desc_for_bias_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
N,
M,
H_out,
W_out));
// Set the convolution descriptor
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
CUDNN_CROSS_CORRELATION));
    // Choose the algorithms and compute the required workspace sizes.
size_t bwd_filter_ws_size, bwd_data_ws_size;
if (deterministic_) {
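      // The ALGO_1 variants are deterministic; the ALGO_0 defaults use
      // atomic adds and may differ from run to run.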
bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else if (exhaustive_search_) {
bwd_filter_algo_ =
filter_algo_cache_.getAlgorithm(X.dims(), filter.dims(), [&]() {
VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive search.";
            // When we do an exhaustive search, we will ignore the workspace
            // size limit and simply go for the fastest algorithm. If you
            // happen to run out of memory later, you will be on your own...
            int returned_algo_count;
            // Actually run the search.
std::array<
cudnnConvolutionBwdFilterAlgoPerf_t,
kNUM_CUDNN_BWD_FILTER_ALGS>
filter_perf_stat;
cudnn_wrapper_.with_cudnn_state(
cudnn_state_, [&](CuDNNState* state) {
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx(
state->cudnn_handle(),
bottom_desc_,
X.template data<T>(),
top_desc_,
dY.template data<T>(),
conv_desc_,
filter_desc_,
dfilter->template mutable_data<T>(),
kNUM_CUDNN_BWD_FILTER_ALGS,
&returned_algo_count,
filter_perf_stat.data(),
state->workspace().get(cudnn_ws_nbytes_limit_),
cudnn_ws_nbytes_limit_));
});
LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
return filter_perf_stat[0].algo;
});
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
bwd_data_algo_ =
data_algo_cache_.getAlgorithm(X.dims(), filter.dims(), [&]() {
VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive search.";
int returned_algo_count;
std::array<
cudnnConvolutionBwdDataAlgoPerf_t,
kNUM_CUDNN_BWD_DATA_ALGS>
data_perf_stat;
cudnn_wrapper_.with_cudnn_state(
cudnn_state_, [&](CuDNNState* state) {
auto* dX =
Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
dX->ResizeLike(X);
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
state->cudnn_handle(),
filter_desc_,
filter.template data<T>(),
top_desc_,
dY.template data<T>(),
conv_desc_,
bottom_desc_,
dX->template mutable_data<T>(),
kNUM_CUDNN_BWD_DATA_ALGS,
&returned_algo_count,
data_perf_stat.data(),
state->workspace().get(cudnn_ws_nbytes_limit_),
cudnn_ws_nbytes_limit_));
});
LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
return data_perf_stat[0].algo;
});
}
} else {
      // Choose the backward filter algorithm based on the workspace limit.
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
cudnn_wrapper_.inline_cudnn_handle(),
bottom_desc_, top_desc_, conv_desc_, filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
cudnn_ws_nbytes_limit_, &bwd_filter_algo_));
      // Choose the backward data algorithm based on the workspace limit.
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
cudnn_wrapper_.inline_cudnn_handle(),
filter_desc_, top_desc_, conv_desc_, bottom_desc_,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
cudnn_ws_nbytes_limit_, &bwd_data_algo_));
}
    // Get the workspace size for the backward filter algorithm.
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
cudnn_wrapper_.inline_cudnn_handle(),
bottom_desc_, top_desc_, conv_desc_, filter_desc_,
bwd_filter_algo_, &bwd_filter_ws_size));
    // Get the workspace size for the backward data algorithm.
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnn_wrapper_.inline_cudnn_handle(),
filter_desc_, top_desc_, conv_desc_, bottom_desc_,
bwd_data_algo_, &bwd_data_ws_size));
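    // One buffer is shared by both backward passes, so size it for whichever
    // needs more.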
cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);
VLOG(1) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
<< bwd_data_algo_;
VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
}
// Now, actually run the computation.
if (!no_bias_) {
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
dbias->Resize(M);
CUDNN_CHECK(cudnnConvolutionBackwardBias(
cudnn_wrapper_.inline_cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
top_desc_for_bias_,
dY.template data<T>(),
cudnnTypeWrapper<T>::kZero(),
bias_desc_,
dbias->template mutable_data<T>()));
}
for (int i = 0; i < group_; ++i) {
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
CUDNN_CHECK(cudnnConvolutionBackwardFilter(
state->cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
bottom_desc_,
X.template data<T>() + i * group_offset_X,
top_desc_,
dY.template data<T>() + i * group_offset_Y,
conv_desc_,
bwd_filter_algo_,
state->workspace().get(cudnn_ws_nbytes_),
cudnn_ws_nbytes_,
cudnnTypeWrapper<T>::kZero(),
filter_desc_,
dfilter->template mutable_data<T>() + i * group_offset_filter));
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
// Compute the gradient w.r.t. the input.
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
dX->ResizeLike(X);
CUDNN_CHECK(cudnnConvolutionBackwardData(
state->cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
filter_desc_,
filter.template data<T>() + i * group_offset_filter,
top_desc_,
dY.template data<T>() + i * group_offset_Y,
conv_desc_,
bwd_data_algo_,
state->workspace().get(cudnn_ws_nbytes_),
cudnn_ws_nbytes_,
cudnnTypeWrapper<T>::kZero(),
bottom_desc_,
dX->template mutable_data<T>() + i * group_offset_X));
}
});
}
return true;
}
REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp<float>);
REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp<float>);
REGISTER_CUDNN_OPERATOR(ConvFp16, CudnnConvOp<float16>);
REGISTER_CUDNN_OPERATOR(ConvFp16Gradient, CudnnConvGradientOp<float16>);
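// Maps the fp16 forward op to its gradient. Inputs are I(0) = X, I(1) = filter
// and GO(0) = dY; outputs follow CudnnConvGradientOp's OUTPUT_TAGS, so
// GI(1) = dfilter, GI(2) = dbias (when bias is present) and GI(0) = dX.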
class GetConvFp16Gradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
CAFFE_ENFORCE(def_.input_size() == 3 || def_.input_size() == 2);
if (def_.input_size() == 3) {
return SingleGradientDef(
"ConvFp16Gradient",
"",
vector<string>{I(0), I(1), GO(0)},
vector<string>{GI(1), GI(2), GI(0)});
} else {
return SingleGradientDef(
"ConvFp16Gradient",
"",
vector<string>{I(0), I(1), GO(0)},
vector<string>{GI(1), GI(0)},
vector<Argument>{MakeArgument<int>("no_bias", 1)});
}
}
};
REGISTER_GRADIENT(ConvFp16, GetConvFp16Gradient);
} // namespace caffe2