#include "caffe2/operators/transpose_op.h"

#include <algorithm>
#include <limits>
#include <numeric>
#include <vector>

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/math.h"
namespace caffe2 {

namespace {

class CuDNNTransposeOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  template <class... Args>
  explicit CuDNNTransposeOp(Args&&... args)
      : Operator<CUDAContext>(std::forward<Args>(args)...),
        cudnn_wrapper_(&context_),
        axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
    // Check the legality of axes_: it must be a permutation of
    // 0 to axes_.size() - 1.
    std::vector<int> axes_sorted(axes_);
    std::sort(axes_sorted.begin(), axes_sorted.end());
    for (std::size_t i = 0; i < axes_sorted.size(); ++i) {
      if (axes_sorted[i] != static_cast<int>(i)) {
        CAFFE_THROW("Axes should be a permutation of 0 to ndim - 1.");
      }
    }

    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
  }

  ~CuDNNTransposeOp() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
  }

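  // Dispatch on the input's element type. Only float and int are handled
  // here; on cuDNN < 6.0 the int path falls back to the generic CUDA kernel
  // via the specialization below.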
  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    const int ndim = X.dim();
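    // An empty "axes" argument means reverse the dimension order, matching
    // the default behavior of numpy.transpose.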
    if (axes_.empty()) {
      axes_.resize(ndim);
      std::iota(axes_.rbegin(), axes_.rend(), 0);
    } else {
      CAFFE_ENFORCE_EQ(axes_.size(), ndim);
    }
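    // The i-th output dimension is the axes_[i]-th input dimension.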
    std::vector<std::int64_t> X_dims = X.sizes().vec();
    std::vector<std::int64_t> Y_dims(ndim);
    for (int i = 0; i < ndim; ++i) {
      Y_dims[i] = X_dims[axes_[i]];
    }
    auto* Y = Output(0, Y_dims, at::dtype<T>());
    const T* X_data = X.template data<T>();
    T* Y_data = Y->template mutable_data<T>();
    if (X.numel() == 0) {
      return true;
    }
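    // cudnnTransformTensor only handles tensors with 3 to CUDNN_DIM_MAX
    // dimensions and element counts that fit in 32 bits; everything else
    // goes through the generic CUDA transpose kernel.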
    if (ndim < 3 || ndim > CUDNN_DIM_MAX ||
        X.numel() > std::numeric_limits<std::int32_t>::max()) {
      math::Transpose<std::int64_t, T, CUDAContext>(
          ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
      return true;
    }
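    // Rebuild the tensor descriptors only when the input shape changes;
    // repeated runs with the same shape reuse the cached descriptors.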
    if (X_dims != cached_X_dims_) {
      SetTensorDescriptor(cudnnTypeWrapper<T>::type, X_dims, Y_dims);
      cached_X_dims_ = X_dims;
    }
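    // Computes Y = alpha * X + beta * Y with alpha = 1 and beta = 0; the
    // permutation comes entirely from the descriptor strides configured in
    // SetTensorDescriptor.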
    CUDNN_ENFORCE(cudnnTransformTensor(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        X_desc_,
        X_data,
        cudnnTypeWrapper<T>::kZero(),
        Y_desc_,
        Y_data));
    return true;
  }

 private:
  void SetTensorDescriptor(
      const cudnnDataType_t data_type,
      const std::vector<std::int64_t>& X_dims,
      const std::vector<std::int64_t>& Y_dims) {
    const int ndim = X_dims.size();
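    // Both descriptors describe a tensor with Y's dimensions. Y_desc_ uses
    // contiguous row-major strides, while X_desc_ uses X's row-major strides
    // permuted by axes_, so reading X through X_desc_ walks it in transposed
    // order.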
    std::vector<int> dims(Y_dims.cbegin(), Y_dims.cend());
    std::vector<int> X_strides(ndim);
    std::vector<int> X_buff(ndim);
    std::vector<int> Y_strides(ndim);
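    // Fill X_buff with X's contiguous strides and Y_strides with Y's, then
    // permute X's strides by axes_ to form X_strides.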
    X_buff.back() = 1;
    Y_strides.back() = 1;
    for (int i = ndim - 1; i > 0; --i) {
      X_buff[i - 1] = X_buff[i] * X_dims[i];
      Y_strides[i - 1] = Y_strides[i] * Y_dims[i];
    }
    for (int i = 0; i < ndim; ++i) {
      X_strides[i] = X_buff[axes_[i]];
    }
    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
        X_desc_, data_type, ndim, dims.data(), X_strides.data()));
    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
        Y_desc_, data_type, ndim, dims.data(), Y_strides.data()));
  }

  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t X_desc_;
  cudnnTensorDescriptor_t Y_desc_;

  std::vector<std::int64_t> cached_X_dims_;
  std::vector<std::int32_t> axes_;
};

#if !CUDNN_VERSION_MIN(6, 0, 0)

// cuDNN versions before 6.0 do not support int tensors, so the int case
// always falls back to the generic CUDA transpose kernel.
template <>
bool CuDNNTransposeOp::DoRunWithType<int>() {
  const auto& X = Input(0);
  const int ndim = X.dim();
  if (axes_.empty()) {
    axes_.resize(ndim);
    std::iota(axes_.rbegin(), axes_.rend(), 0);
  } else {
    CAFFE_ENFORCE_EQ(axes_.size(), ndim);
  }
  std::vector<std::int64_t> X_dims = X.sizes().vec();
  std::vector<std::int64_t> Y_dims(ndim);
  for (int i = 0; i < ndim; ++i) {
    Y_dims[i] = X_dims[axes_[i]];
  }
  auto* Y = Output(0, Y_dims, at::dtype<int>());
  const int* X_data = X.data<int>();
  int* Y_data = Y->mutable_data<int>();
  math::Transpose<std::int64_t, int, CUDAContext>(
      ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
  return true;
}

#endif // !CUDNN_VERSION_MIN(6, 0, 0)

} // namespace

REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);

} // namespace caffe2