| #include "caffe2/operators/utility_ops.h" |
| |
| #include <type_traits> |
| |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/core/cudnn_wrappers.h" |
| #include "caffe2/utils/conversions.h" |
| |
| namespace caffe2 { |
| |
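// WeightedSum on the GPU via cuDNN: given pairs (X_0, w_0), (X_1, w_1), ...,
// computes Y = sum_i w_i * X_i. Supports float and at::Half data; each weight
// is a single float scalar living on the device.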
class CuDNNWeightedSumOp : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  CuDNNWeightedSumOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws), cudnn_wrapper_(&context_) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&add_desc_));
    // Both float and at::Half require opTensorCompType to be CUDNN_DATA_FLOAT.
    CUDNN_ENFORCE(cudnnSetOpTensorDescriptor(
        add_desc_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN));
  }

  ~CuDNNWeightedSumOp() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(add_desc_));
  }

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float, at::Half>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    if (std::is_same<T, at::Half>::value) {
      LOG(WARNING)
          << "cuDNN only supports the same data type for data and weight, "
             "so the weight will be cast to at::Half when the data type is Half.";
    }
    const int num_inputs = InputSize();
    CAFFE_ENFORCE_EQ(num_inputs % 2, 0);
    const auto& X0 = Input(0);
    const auto& weight0 = Input(1);
    CAFFE_ENFORCE_GT(X0.numel(), 0);
    CAFFE_ENFORCE_EQ(weight0.numel(), 1);
    const int input_size = X0.numel();
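    // All weighted inputs are treated as flat vectors of input_size elements;
    // the descriptor is shaped 1 x 1 x 1 x input_size (see SetTensorDescriptor).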
    SetTensorDescriptor(cudnnTypeWrapper<T>::type, input_size);

    // Note: removed Aliasing check, since Output already has
    // caching capability
    auto* Y = Output(0, X0.sizes(), at::dtype<T>());
    T* Y_data = Y->template mutable_data<T>();
    T alpha = convert::To<float, T>(0.0f);
    T beta = convert::To<float, T>(0.0f);
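    // Fast path for a single (tensor, weight) pair: cudnnAddTensor computes
    // Y = alpha * X0 + 0 * Y, i.e. Y = weight0 * X0.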
    if (num_inputs == 2) {
      CopyWeightToHost<T>(weight0.template data<float>(), &alpha);
      CUDNN_ENFORCE(cudnnAddTensor(
          cudnn_wrapper_.inline_cudnn_handle(),
          &alpha,
          data_desc_,
          X0.template data<T>(),
          cudnnTypeWrapper<T>::kZero(),
          data_desc_,
          Y_data));
      return true;
    }
    const auto& X1 = Input(2);
    CAFFE_ENFORCE(
        !IsInputOutputAlias(2, 0),
        "Input #2 is the same as output. If you want to do in-place updates, "
        "put the output as input #0.");
    const auto& weight1 = Input(3);
    CAFFE_ENFORCE_EQ(X1.numel(), input_size);
    CAFFE_ENFORCE_EQ(weight1.numel(), 1);
    CopyWeightToHost<T>(weight1.template data<float>(), &alpha);
    CopyWeightToHost<T>(weight0.template data<float>(), &beta);
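    // First pair of the general case: alpha = weight1, beta = weight0.
    // If X0 also serves as the output, accumulate in place with cudnnAddTensor
    // (Y = alpha * X1 + beta * Y); otherwise combine the two inputs into the
    // separate output buffer with cudnnOpTensor (Y = alpha * X1 + beta * X0).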
    if (IsInputOutputAlias(0, 0)) {
      CUDNN_ENFORCE(cudnnAddTensor(
          cudnn_wrapper_.inline_cudnn_handle(),
          &alpha,
          data_desc_,
          X1.template data<T>(),
          &beta,
          data_desc_,
          Y_data));
    } else {
      CUDNN_ENFORCE(cudnnOpTensor(
          cudnn_wrapper_.inline_cudnn_handle(),
          add_desc_,
          &alpha,
          data_desc_,
          X1.template data<T>(),
          &beta,
          data_desc_,
          X0.template data<T>(),
          cudnnTypeWrapper<T>::kZero(),
          data_desc_,
          Y_data));
    }
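    // Fold in any remaining (tensor, weight) pairs: each call accumulates
    // Y = weighti * Xi + 1 * Y.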
    for (int i = 4; i < num_inputs; i += 2) {
      const auto& Xi = Input(i);
      // Do a check: if the input is the same as output, we have a problem -
      // in-place update should always only happen with the zeroth input.
      const std::string err_msg = "Input #" + to_string(i) +
          " is the same as output. If you want to do in-place updates, "
          "put the output as input #0.";
      CAFFE_ENFORCE(!IsInputOutputAlias(i, 0), err_msg);
      const auto& weighti = Input(i + 1);
      CAFFE_ENFORCE_EQ(Xi.numel(), input_size);
      CAFFE_ENFORCE_EQ(weighti.numel(), 1);
      CopyWeightToHost<T>(weighti.template data<float>(), &alpha);
      CUDNN_ENFORCE(cudnnAddTensor(
          cudnn_wrapper_.inline_cudnn_handle(),
          &alpha,
          data_desc_,
          Xi.template data<T>(),
          cudnnTypeWrapper<T>::kOne(),
          data_desc_,
          Y_data));
    }
    return true;
  }

 private:
  void SetTensorDescriptor(
      const cudnnDataType_t data_type,
      const int input_size) {
    if (cached_input_size_ != input_size) {
      cached_input_size_ = input_size;
      // Since the best performance is obtained when the tensor is HW-packed,
      // we put the flattened element count (input_size) into W.
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(StorageOrder::NCHW),
          data_type,
          1,
          1,
          1,
          input_size));
    }
  }

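  // Copies a single scalar weight (stored on the device as float) to the host
  // and converts it to T.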
  template <typename T>
  void CopyWeightToHost(const float* src, T* dst);

  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnOpTensorDescriptor_t add_desc_;

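  // Flattened element count for which data_desc_ is currently configured;
  // 0 means the descriptor has not been set yet.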
  int cached_input_size_ = 0;
};

template <typename T>
void CuDNNWeightedSumOp::CopyWeightToHost(const float* src, T* dst) {
  float val;
  context_.template CopyToCPU<float>(1, src, &val);
  *dst = convert::To<float, T>(val);
}

template <>
void CuDNNWeightedSumOp::CopyWeightToHost<float>(const float* src, float* dst) {
  context_.CopyToCPU<float>(1, src, dst);
}

REGISTER_CUDNN_OPERATOR(WeightedSum, CuDNNWeightedSumOp);

} // namespace caffe2