/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/types.h"
#include "caffe2/operators/transpose_op.h"
#include "caffe2/operators/transpose_op_gpu.h"
namespace caffe2 {
#define MAX_DIMS 8
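// Transpose operator backed by cuDNN, for inputs of up to MAX_DIMS dimensions.
// The permutation is expressed entirely through tensor descriptors: both
// descriptors use the output shape, but the input descriptor is paired with
// the permuted input strides, so a single cudnnTransformTensor call performs
// the strided copy. Integer inputs fall back to the generic CUDA kernel.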
class CuDNNTransposeOp final : public Operator<CUDAContext> {
public:
USE_OPERATOR_FUNCTIONS(CUDAContext);
USE_DISPATCH_HELPER;
CuDNNTransposeOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CUDAContext>(operator_def, ws),
cudnn_wrapper_(&context_),
axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
// Check that axes_ is legal: it must be a permutation of 0, 1, ..., axes_.size() - 1.
std::vector<int> axes_sorted(axes_);
std::sort(axes_sorted.begin(), axes_sorted.end());
for (int i = 0; i < axes_sorted.size(); ++i) {
if (axes_sorted[i] != i) {
CAFFE_THROW("Axes should be a permutation of 0 to ndim.");
}
}
CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&xDesc_));
CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&yDesc_));
}
~CuDNNTransposeOp() {
CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(xDesc_));
CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(yDesc_));
}
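// Computes the output shape from axes_ (defaulting to a full reversal of the
// input dimensions when no axes are given) and dispatches on the input type.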
bool RunOnDevice() override {
const auto& X = Input(0);
auto* Y = Output(0);
new_dims_.resize(X.ndim());
if (axes_.size() == 0) {
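// No axes given: default to reversing the dimension order.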
axes_.resize(X.ndim());
for (int i = 0; i < axes_.size(); ++i) {
axes_[i] = axes_.size() - 1 - i;
}
new_dims_.assign(X.dims().rbegin(), X.dims().rend());
} else {
CAFFE_ENFORCE_EQ(X.ndim(), axes_.size());
for (int i = 0; i < new_dims_.size(); ++i) {
new_dims_[i] = X.dim(axes_[i]);
}
}
Y->Resize(new_dims_);
// Do the actual transpose, which is implemented in DoRunWithType().
#if CUDNN_VERSION_MIN(6, 0, 0)
return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
#else
// cuDNN versions before 6.0 do not provide an int32 data type, so only dispatch float.
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
#endif
}
protected:
template <typename T>
bool DoRunWithType() {
const auto& input = Input(0);
auto* output = Output(0);
int ndim = input.ndim();
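// Degenerate cases: nothing to do for a zero-dim input, and a 1-D transpose
// is the identity, which reduces to a copy.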
if (ndim == 0) {
return true;
}
if (ndim == 1) {
output->CopyFrom(input);
return true;
}
cudnnDataType_t typedesc = cudnnTypeWrapper<T>::type;
#if CUDNN_VERSION_MIN(6, 0, 0)
if (typedesc == CUDNN_DATA_INT32) {
// The cuDNN path only supports floating-point data; fall back to the generic
// CUDA transpose kernel for int32.
return TransposeCUDA<int>(
axes_, context_, input, output, buffer_cpu_, buffer_);
}
#endif
CAFFE_ENFORCE(ndim < MAX_DIMS, "Input ndim exceeds compile time max.");
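// stride_y: contiguous (row-major) strides of the output tensor.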
stride_y[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
stride_y[i] = stride_y[i + 1] * output->dim32(i + 1);
}
CHECK(axes_.size() >= ndim);
stride_x[ndim] = 1;
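// stride_x[i]: stride of input dimension axes_[i], i.e. the stride to use when
// the input is viewed in the output's dimension order. dim_y_int holds the
// output dimensions shared by both descriptors.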
for (int i = 0; i < ndim; i++) {
stride_x[i] = 1;
for (int j = axes_[i] + 1; j < ndim; j++) {
stride_x[i] *= input.dim32(j);
}
dim_y_int[i] = output->dim32(i);
}
// Pad unused trailing dimensions with size 1 and stride 1; the descriptors
// below are set with at least 4 dimensions, which cuDNN Nd descriptors require.
for (int i = ndim; i < MAX_DIMS; i++) {
stride_x[i] = 1;
stride_y[i] = 1;
dim_y_int[i] = 1;
}
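// Both descriptors use the output dimensions. xDesc_ pairs them with the
// permuted input strides and yDesc_ with contiguous strides, so the transform
// below reads the input in transposed order and writes the output densely.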
CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
xDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_x));
CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
yDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_y));
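// cudnnTransformTensor computes Y = alpha * X + beta * Y across the two
// layouts; with alpha = 1 and beta = 0 this is a pure strided copy.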
CUDNN_ENFORCE(cudnnTransformTensor(
cudnn_wrapper_.inline_cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
xDesc_,
static_cast<const void*>(input.template data<T>()),
cudnnTypeWrapper<T>::kZero(),
yDesc_,
static_cast<void*>(output->template mutable_data<T>())));
return true;
}
int stride_x[MAX_DIMS];
int stride_y[MAX_DIMS];
int dim_y_int[MAX_DIMS];
cudnnTensorDescriptor_t xDesc_;
cudnnTensorDescriptor_t yDesc_;
CuDNNWrapper cudnn_wrapper_;
std::vector<int> axes_;
std::vector<TIndex> new_dims_;
// These buffers are only needed when falling back to the non-cuDNN transpose implementation.
Tensor<CUDAContext> buffer_;
TensorCPU buffer_cpu_;
};
REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);
} // namespace caffe2