/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/types.h"
#include "caffe2/operators/transpose_op.h"
#include "caffe2/operators/transpose_op_gpu.h"
namespace caffe2 {
#define MAX_DIMS 8
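// Transpose operator backed by cuDNN, for inputs of up to MAX_DIMS dimensions.
// The permutation is expressed entirely through tensor descriptors: both
// descriptors use the output shape, but the input descriptor is paired with
// the permuted input strides, so a single cudnnTransformTensor call performs
// the strided copy. Integer inputs fall back to the generic CUDA kernel.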
class CuDNNTransposeOp final : public Operator<CUDAContext> {
public:
USE_OPERATOR_FUNCTIONS(CUDAContext);
USE_DISPATCH_HELPER;
CuDNNTransposeOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CUDAContext>(operator_def, ws),
cudnn_wrapper_(&context_),
axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
// Check that axes_ is legal: it must be a permutation of 0, 1, ..., axes_.size() - 1.
std::vector<int> axes_sorted(axes_);
std::sort(axes_sorted.begin(), axes_sorted.end());
for (int i = 0; i < axes_sorted.size(); ++i) {
if (axes_sorted[i] != i) {
CAFFE_THROW("Axes should be a permutation of 0 to ndim.");
}
}
CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&xDesc_));
CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&yDesc_));
}
~CuDNNTransposeOp() {
CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(xDesc_));
CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(yDesc_));
}
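// Computes the output shape from axes_ (defaulting to a full reversal of the
// input dimensions when no axes are given) and dispatches on the input type.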
bool RunOnDevice() override {
const auto& X = Input(0);
auto* Y = Output(0);
new_dims_.resize(X.ndim());
if (axes_.size() == 0) {
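// No axes given: default to reversing the dimension order.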
axes_.resize(X.ndim());
for (int i = 0; i < axes_.size(); ++i) {
axes_[i] = axes_.size() - 1 - i;
}
new_dims_.assign(X.dims().rbegin(), X.dims().rend());
} else {
CAFFE_ENFORCE_EQ(X.ndim(), axes_.size());
for (int i = 0; i < new_dims_.size(); ++i) {
new_dims_[i] = X.dim(axes_[i]);
}
}
Y->Resize(new_dims_);
// Do the actual transpose, which is implemented in DoRunWithType().
#if CUDNN_VERSION_MIN(6, 0, 0)
return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
#else
// cuDNN versions before 6.0 do not provide an int32 data type, so only dispatch float.
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
#endif
}
protected:
template <typename T>
bool DoRunWithType() {
const auto& input = Input(0);
auto* output = Output(0);
int ndim = input.ndim();
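// Degenerate cases: nothing to do for a zero-dim input, and a 1-D transpose
// is the identity, which reduces to a copy.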
if (ndim == 0) {
return true;
}
if (ndim == 1) {
output->CopyFrom(input);
return true;
}
cudnnDataType_t typedesc = cudnnTypeWrapper<T>::type;
#if CUDNN_VERSION_MIN(6, 0, 0)
if (typedesc == CUDNN_DATA_INT32) {
// The cuDNN path only supports floating-point data; fall back to the generic
// CUDA transpose kernel for int32.
return TransposeCUDA<int>(
axes_, context_, input, output, buffer_cpu_, buffer_);
}
#endif
CAFFE_ENFORCE(ndim < MAX_DIMS, "Input ndim exceeds compile time max.");
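// stride_y: contiguous (row-major) strides of the output tensor.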
stride_y[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
stride_y[i] = stride_y[i + 1] * output->dim32(i + 1);
}
CHECK(axes_.size() >= ndim);
stride_x[ndim] = 1;
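// stride_x[i]: stride of input dimension axes_[i], i.e. the stride to use when
// the input is viewed in the output's dimension order. dim_y_int holds the
// output dimensions shared by both descriptors.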
for (int i = 0; i < ndim; i++) {
stride_x[i] = 1;
for (int j = axes_[i] + 1; j < ndim; j++) {
stride_x[i] *= input.dim32(j);
}
dim_y_int[i] = output->dim32(i);
}
// Pad unused trailing dimensions with size 1 and stride 1; the descriptors
// below are set with at least 4 dimensions, which cuDNN Nd descriptors require.
for (int i = ndim; i < MAX_DIMS; i++) {
stride_x[i] = 1;
stride_y[i] = 1;
dim_y_int[i] = 1;
}
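// Both descriptors use the output dimensions. xDesc_ pairs them with the
// permuted input strides and yDesc_ with contiguous strides, so the transform
// below reads the input in transposed order and writes the output densely.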
CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
xDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_x));
CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
yDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_y));
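// cudnnTransformTensor computes Y = alpha * X + beta * Y across the two
// layouts; with alpha = 1 and beta = 0 this is a pure strided copy.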
CUDNN_ENFORCE(cudnnTransformTensor(
cudnn_wrapper_.inline_cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
xDesc_,
static_cast<const void*>(input.template data<T>()),
cudnnTypeWrapper<T>::kZero(),
yDesc_,
static_cast<void*>(output->template mutable_data<T>())));
return true;
}
int stride_x[MAX_DIMS];
int stride_y[MAX_DIMS];
int dim_y_int[MAX_DIMS];
cudnnTensorDescriptor_t xDesc_;
cudnnTensorDescriptor_t yDesc_;
CuDNNWrapper cudnn_wrapper_;
std::vector<int> axes_;
std::vector<TIndex> new_dims_;
// These buffers are only needed when falling back to the non-cuDNN transpose implementation.
Tensor<CUDAContext> buffer_;
TensorCPU buffer_cpu_;
};
REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);
} // namespace caffe2