blob: 664fbe25ab4a1b2f538860e55e988755e5a260e1 [file] [log] [blame]
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/operators/transpose_op.h"
#include <limits>
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
// Cuda memory is precious so let's do a lower ndim limit.
#define COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS 6
namespace {
// TODO(jiayq): one possible optimization is to copy the buffer into a shared
// memory location to speed up access.
// Permutes one input element per thread iteration: the flat input index is
// decoded into per-axis coordinates, which are then re-encoded as a flat
// output index using the output's axis order (Horner's scheme).
// `buffer` packs three int arrays of length `num_axes` back to back:
// input dims, output dims, and the output->input axis mapping.
// Assumes num_axes >= 1 and num_axes <= COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS.
template <typename Dtype>
__global__ void transpose_gpu(const int nthreads, const Dtype* from_data,
    Dtype* to_data, const int* buffer, const int num_axes) {
  const int* input_dims = buffer;
  const int* output_dims = buffer + num_axes;
  const int* axis_map = buffer + 2 * num_axes;
  int coords[COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS];
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decode the flat input index into coordinates, innermost axis first.
    int remaining = index;
    for (int axis = num_axes - 1; axis >= 0; --axis) {
      coords[axis] = remaining % input_dims[axis];
      remaining /= input_dims[axis];
    }
    // Re-encode the permuted coordinates as a flat output index.
    int to_index = coords[axis_map[0]];
    for (int axis = 1; axis < num_axes; ++axis) {
      to_index = to_index * output_dims[axis] + coords[axis_map[axis]];
    }
    to_data[to_index] = from_data[index];
  }
}
} // namespace
template <>
template <typename T>
bool TransposeOp<CUDAContext>::DoRunWithType() {
  const auto& input = Input(0);
  auto* output = Output(0);
  // The kernel indexes elements with 32-bit ints, so validate the 64-bit
  // element count *before* narrowing it. The original code assigned to an
  // `int` first, which silently truncated counts >= 2^31 and let them pass
  // the check below.
  const auto size = input.size();
  CAFFE_ENFORCE(
      size < std::numeric_limits<int>::max(),
      "Transpose op on GPU only supports int32");
  const int count = static_cast<int>(size);
  const int ndim = input.ndim();
  CAFFE_ENFORCE(
      ndim <= COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS,
      "Input ndim exceeds compile time max.");
  // Buffer contains the following data:
  // (1) the dimensions of the input
  // (2) the dimensions of the output
  // (3) the axis mapping from input to output
  buffer_cpu_.Resize(3 * ndim);
  int* buffer_data = buffer_cpu_.mutable_data<int>();
  for (int i = 0; i < ndim; ++i) {
    *(buffer_data++) = input.dim32(i);
  }
  for (int i = 0; i < ndim; ++i) {
    *(buffer_data++) = output->dim32(i);
  }
  for (int i = 0; i < ndim; ++i) {
    *(buffer_data++) = axes_[i];
  }
  // Copy the dimension information to GPU.
  buffer_.CopyFrom(buffer_cpu_, &context_);
  // NOTE(review): no error check after the launch; a bad launch config or
  // in-kernel fault surfaces only at the next synchronizing CUDA call.
  transpose_gpu<T><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS,
                     0, context_.cuda_stream()>>>(
      count, input.template data<T>(), output->template mutable_data<T>(),
      buffer_.data<int>(), ndim);
  return true;
}
// Registers the CUDA implementation above under the "Transpose" operator name.
REGISTER_CUDA_OPERATOR(Transpose, TransposeOp<CUDAContext>);
} // namespace caffe2