blob: 7fe4503b2bfbe29dccc4816366d10e95e1a27c30 [file] [log] [blame]
#include "caffe2/operators/utility_ops.h"
#include <type_traits>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/utils/conversions.h"
namespace caffe2 {
// cuDNN-backed WeightedSum: computes Output(0) = sum_i w_i * X_i, where the
// operator inputs alternate (tensor, scalar-weight) pairs:
//   Input(0)=X0, Input(1)=w0, Input(2)=X1, Input(3)=w1, ...
// Data tensors may be float or at::Half; weight inputs are always read as
// float from device memory (see the data<float>() calls below).
class CuDNNWeightedSumOp : public Operator<CUDAContext> {
public:
USE_OPERATOR_FUNCTIONS(CUDAContext);
// Creates the cuDNN descriptors once at construction; they are reused
// across runs (the tensor descriptor is re-set only when the input size
// changes, see SetTensorDescriptor).
CuDNNWeightedSumOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CUDAContext>(operator_def, ws), cudnn_wrapper_(&context_) {
CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&add_desc_));
// Both float and at::Half require opTensorCompType to be CUDNN_DATA_FLOAT.
CUDNN_ENFORCE(cudnnSetOpTensorDescriptor(
add_desc_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN));
}
~CuDNNWeightedSumOp() override {
CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(add_desc_));
}
// Dispatches to DoRunWithType based on the dtype of the first data input.
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float, at::Half>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
if (std::is_same<T, at::Half>::value) {
LOG(WARNING)
<< "CuDNN only support same type for data and weight, "
"so the weight will be cast to at::Half when data type is Half.";
}
// Inputs must come in (tensor, weight) pairs.
const int num_inputs = InputSize();
CAFFE_ENFORCE_EQ(num_inputs % 2, 0);
const auto& X0 = Input(0);
const auto& weight0 = Input(1);
CAFFE_ENFORCE_GT(X0.numel(), 0);
CAFFE_ENFORCE_EQ(weight0.numel(), 1);
const int input_size = X0.numel();
SetTensorDescriptor(cudnnTypeWrapper<T>::type, input_size);
// Note: removed Aliasing check, since Output already has
// caching capability
auto* Y = Output(0, X0.sizes(), at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
// NOTE(review): alpha/beta are host scaling factors typed as T. For
// T = at::Half this passes an at::Half* where cuDNN documents a float*
// (HALF data with FLOAT compute uses float scaling parameters) — confirm
// against cudnnTypeWrapper<T>::ScalingParamType and the cuDNN docs.
T alpha = convert::To<float, T>(0.0f);
T beta = convert::To<float, T>(0.0f);
if (num_inputs == 2) {
// Single pair: Y = w0 * X0. beta = kZero so Y's prior contents are
// ignored.
CopyWeightToHost<T>(weight0.template data<float>(), &alpha);
CUDNN_ENFORCE(cudnnAddTensor(
cudnn_wrapper_.inline_cudnn_handle(),
&alpha,
data_desc_,
X0.template data<T>(),
cudnnTypeWrapper<T>::kZero(),
data_desc_,
Y_data));
return true;
}
const auto& X1 = Input(2);
CAFFE_ENFORCE(
!IsInputOutputAlias(2, 0),
"Input #2 is the same as output. If you want to do in-place updates, "
"put the output as input #0.");
const auto& weight1 = Input(3);
CAFFE_ENFORCE_EQ(X1.numel(), input_size);
CAFFE_ENFORCE_EQ(weight1.numel(), 1);
// alpha = w1 (scales X1), beta = w0 (scales X0 / the aliased output).
CopyWeightToHost<T>(weight1.template data<float>(), &alpha);
CopyWeightToHost<T>(weight0.template data<float>(), &beta);
if (IsInputOutputAlias(0, 0)) {
// In-place: Y already holds X0, so Y = w1 * X1 + w0 * Y.
CUDNN_ENFORCE(cudnnAddTensor(
cudnn_wrapper_.inline_cudnn_handle(),
&alpha,
data_desc_,
X1.template data<T>(),
&beta,
data_desc_,
Y_data));
} else {
// Out-of-place: Y = w1 * X1 + w0 * X0 + 0 * Y (trailing kZero discards
// Y's prior contents).
CUDNN_ENFORCE(cudnnOpTensor(
cudnn_wrapper_.inline_cudnn_handle(),
add_desc_,
&alpha,
data_desc_,
X1.template data<T>(),
&beta,
data_desc_,
X0.template data<T>(),
cudnnTypeWrapper<T>::kZero(),
data_desc_,
Y_data));
}
// Accumulate the remaining pairs: Y += w_i * X_i (beta = kOne keeps the
// partial sum already in Y).
for (int i = 4; i < num_inputs; i += 2) {
const auto& Xi = Input(i);
// Do a check: if the input is the same as output, we have a problem -
// in-place update should always only happen with the zeroth input.
const std::string err_msg = "Input #" + to_string(i) +
" is the same as output. If you want to do in-place updates, "
"put the output as input #0.";
CAFFE_ENFORCE(!IsInputOutputAlias(i, 0), err_msg);
const auto& weighti = Input(i + 1);
CAFFE_ENFORCE_EQ(Xi.numel(), input_size);
CAFFE_ENFORCE_EQ(weighti.numel(), 1);
CopyWeightToHost<T>(weighti.template data<float>(), &alpha);
CUDNN_ENFORCE(cudnnAddTensor(
cudnn_wrapper_.inline_cudnn_handle(),
&alpha,
data_desc_,
Xi.template data<T>(),
cudnnTypeWrapper<T>::kOne(),
data_desc_,
Y_data));
}
return true;
}
private:
// (Re)configures data_desc_ as a 1x1x1xN NCHW tensor of the given type.
// Cached on input_size so repeated runs with the same size skip the call;
// cached_input_size_ starts at 0 and numel is enforced > 0, so the first
// run always sets the descriptor.
void SetTensorDescriptor(
const cudnnDataType_t data_type,
const int input_size) {
if (cached_input_size_ != input_size) {
cached_input_size_ = input_size;
// Since the best performance is obtained when the tensor is HW-packed, we
// put X.size() to W.
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
data_desc_,
GetCudnnTensorFormat(StorageOrder::NCHW),
data_type,
1,
1,
1,
input_size));
}
}
// Copies one float weight from device to host, converting to T; defined
// below (specialized for float to skip the conversion).
template <typename T>
void CopyWeightToHost(const float* src, T* dst);
CuDNNWrapper cudnn_wrapper_;
cudnnTensorDescriptor_t data_desc_;
cudnnOpTensorDescriptor_t add_desc_;
int cached_input_size_ = 0;
};
// Fetches a single scalar weight (stored as float on the device) into host
// memory and writes it to *dst converted to T (e.g. at::Half when the
// operator runs on half-precision data).
template <typename T>
void CuDNNWeightedSumOp::CopyWeightToHost(const float* src, T* dst) {
  float host_weight = 0.0f;
  context_.template CopyToCPU<float>(1, src, &host_weight);
  const T converted = convert::To<float, T>(host_weight);
  *dst = converted;
}
// float specialization: the weight is already a float, so no conversion is
// needed — copy the single value device-to-host and store it.
template <>
void CuDNNWeightedSumOp::CopyWeightToHost<float>(const float* src, float* dst) {
  float host_weight;
  context_.CopyToCPU<float>(1, src, &host_weight);
  *dst = host_weight;
}
// Register this cuDNN-backed implementation as the "WeightedSum" operator
// for the cuDNN engine.
REGISTER_CUDNN_OPERATOR(WeightedSum, CuDNNWeightedSumOp);
} // namespace caffe2