#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/slice_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
// Copies num_blocks blocks of copy_size bytes each from src to dst, where
// consecutive blocks are src_block_size_bytes (resp. dst_block_size_bytes)
// apart. This lets a single launch copy a strided slice along one dimension.
__global__ void SliceCopyKernel(
    char* src_offset_bytes,
    int src_block_size_bytes,
    char* dst_offset_bytes,
    int dst_block_size_bytes,
    int copy_size,
    int itemsize, // unused in the kernel body
    int num_blocks) {
  if ((copy_size % sizeof(int) == 0) &&
      (src_block_size_bytes % sizeof(int) == 0) &&
      (dst_block_size_bytes % sizeof(int) == 0)) {
    // Fast path: all sizes are int-aligned, so copy int-sized chunks.
    int* src = (int*)src_offset_bytes;
    int* dst = (int*)dst_offset_bytes;
    int src_block_size = src_block_size_bytes / sizeof(int);
    int dst_block_size = dst_block_size_bytes / sizeof(int);
    int copyChunks = copy_size / sizeof(int);
    CUDA_1D_KERNEL_LOOP(index, num_blocks * copyChunks) {
      int chunk = index % copyChunks;
      int block = index / copyChunks;
      dst[block * dst_block_size + chunk] =
          src[block * src_block_size + chunk];
    }
  } else {
    // Fallback: copy byte by byte.
    char* src = src_offset_bytes;
    char* dst = dst_offset_bytes;
    CUDA_1D_KERNEL_LOOP(index, num_blocks * copy_size) {
      int chunk = index % copy_size;
      int block = index / copy_size;
      dst[block * dst_block_size_bytes + chunk] =
          src[block * src_block_size_bytes + chunk];
    }
  }
}
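
// Illustrative example (comments only, not part of the build): slicing a
// 2 x 5 x 3 float tensor along dim 1 with starts = [0, 1, 0] and
// ends = [-1, 3, -1] produces a 2 x 2 x 3 output. SliceImplGpu below would
// then launch SliceCopyKernel with (assuming 4-byte floats):
//   unit                 = 3  (elements per index of dim 1)
//   num_blocks           = 2  (product of dims before dim 1)
//   src_block_size_bytes = 3 * 5 * 4 = 60
//   dst_block_size_bytes = 3 * 2 * 4 = 24
//   copy_size            = 24 (one full destination block per source block)
// All three byte counts are divisible by sizeof(int), so the kernel takes the
// vectorized path and copies 6 ints per block.
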
template <class SIndex, class Context>
bool SliceImplGpu(
    Tensor<Context>* output,
    const Tensor<Context>& data,
    const TensorCPU& starts,
    const TensorCPU& ends,
    Context* context,
    Tensor<Context>* gdata = nullptr,
    const Tensor<Context>* go = nullptr) {
  bool backward = output == nullptr;

  auto* starts_data = starts.template data<SIndex>();
  auto* ends_data = ends.template data<SIndex>();

  CAFFE_ENFORCE_EQ(starts.ndim(), 1);
  CAFFE_ENFORCE_EQ(ends.ndim(), 1);
  CAFFE_ENFORCE_GE(data.ndim(), starts.size());
  CAFFE_ENFORCE_EQ(starts.size(), ends.size());
  std::vector<int> starts_idx(data.ndim());
  std::vector<int> ends_idx(data.ndim());
  std::vector<int> dst_sizes(data.ndim());
  for (int i = 0; i < data.ndim(); ++i) {
    if (i >= starts.size()) {
      starts_idx[i] = 0;
      ends_idx[i] = data.dims()[i];
      continue;
    }
    if (data.dims()[i] > 0) {
      auto start = starts_data[i];
      auto end = ends_data[i];
      // Negative indices count from one past the last element, so an end of
      // -1 keeps the dimension whole.
      if (start < 0) {
        start = data.dims()[i] + 1 + start;
      }
      if (end < 0) {
        end = data.dims()[i] + 1 + end;
      }
      // Clamp out-of-range indices to the dimension size.
      if (start > data.dims()[i]) {
        start = data.dims()[i];
      }
      if (end > data.dims()[i]) {
        end = data.dims()[i];
      }
      CAFFE_ENFORCE_GE(start, 0);
      CAFFE_ENFORCE_GE(end, 0);
      CAFFE_ENFORCE_GE(end, start);
      starts_idx[i] = start;
      ends_idx[i] = end;
      dst_sizes[i] = end - start;
    } else {
      starts_idx[i] = 0;
      ends_idx[i] = 0;
      dst_sizes[i] = 0;
    }
  }
  if (data.size() <= 0) {
    // The input is empty, so there is nothing to copy.
    if (!backward) {
      output->Resize(dst_sizes);
      output->raw_mutable_data(data.meta());
    }
    return true;
  }
  // For now, only slicing along a single dimension is supported.
  int dim = -1;
  for (int i = 0; i < data.ndim(); ++i) {
    if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) {
      CAFFE_ENFORCE_EQ(
          dim, -1, "Currently only possible to slice in 1 dimension.");
      dim = i;
    }
  }
  // No dimension is actually sliced: the result is a plain copy.
  if (dim == -1) {
    if (!backward) {
      output->CopyFrom(data, context);
    } else {
      gdata->CopyFrom(*go, context);
    }
    return true;
  }
  // unit: number of elements per index of the sliced dimension;
  // num_blocks: product of the dimensions preceding it.
  int unit = std::accumulate(
      data.dims().begin() + dim + 1,
      data.dims().end(),
      1,
      std::multiplies<int>());
  int num_blocks = std::accumulate(
      data.dims().begin(),
      data.dims().begin() + dim,
      1,
      std::multiplies<int>());
  if (!backward) {
    output->Resize(dst_sizes);
  } else {
    gdata->ResizeLike(data);
  }

  auto itemsize = data.meta().itemsize();
  if (!backward) {
    char* src_bytes = (char*)data.raw_data();
    char* dst_bytes = (char*)output->raw_mutable_data(data.meta());
    size_t src_block_size = unit * data.dims()[dim];
    size_t dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
    size_t src_offset = unit * starts_idx[dim];
    if (num_blocks == 0 || dst_block_size == 0) {
      return true;
    }
    size_t src_block_size_bytes = itemsize * src_block_size;
    size_t dst_block_size_bytes = itemsize * dst_block_size;
    char* src_offset_bytes = src_bytes + itemsize * src_offset;
    char* dst_offset_bytes = dst_bytes;
    SliceCopyKernel<<<
        std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context->cuda_stream()>>>(
        src_offset_bytes,
        src_block_size_bytes,
        dst_offset_bytes,
        dst_block_size_bytes,
        dst_block_size_bytes, // copy_size: one full destination block
        itemsize,
        num_blocks);
  } else {
    char* src_bytes = (char*)go->raw_data();
    char* dst_bytes = (char*)gdata->raw_mutable_data(go->meta());
    size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
    size_t dst_block_size = unit * data.dims()[dim];
    size_t dst_offset = unit * starts_idx[dim];
    if (num_blocks == 0 || dst_block_size == 0) {
      return true;
    }
    size_t src_block_size_bytes = itemsize * src_block_size;
    size_t dst_block_size_bytes = itemsize * dst_block_size;
    char* src_offset_bytes = src_bytes;
    char* dst_offset_bytes = dst_bytes + itemsize * dst_offset;
    // Zero the gradient tensor before the copy, since the slice gradient
    // fills only part of it. Note that this zeroing assumes float-sized
    // gradient data.
    math::Set<float, CUDAContext>(
        gdata->size(),
        0.0f,
        (float*)gdata->raw_mutable_data(go->meta()),
        context);
    // If the output gradient is empty, the zero-filled tensor is the result.
    if (!src_bytes) {
      return true;
    }
    SliceCopyKernel<<<
        std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context->cuda_stream()>>>(
        src_offset_bytes,
        src_block_size_bytes,
        dst_offset_bytes,
        dst_block_size_bytes,
        src_block_size_bytes, // copy_size: one full source block
        itemsize,
        num_blocks);
  }
  return true;
}

} // namespace

template <>
bool SliceOp<int, CUDAContext>::RunOnDevice() {
  auto* output = Output(0);
  auto& data = Input(0);

  if (InputSize() > 1) {
    starts_host_.CopyFrom<CUDAContext>(Input(1));
    ends_host_.CopyFrom<CUDAContext>(Input(2));
  } else {
    if (!statically_inited_) {
      CAFFE_ENFORCE(HasArgument("starts"));
      CAFFE_ENFORCE(HasArgument("ends"));
      CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());

      starts_host_.Resize(starts_.size());
      ends_host_.Resize(ends_.size());
      memcpy(
          starts_host_.mutable_data<int>(),
          starts_.data(),
          sizeof(int) * starts_.size());
      memcpy(
          ends_host_.mutable_data<int>(),
          ends_.data(),
          sizeof(int) * ends_.size());
      statically_inited_ = true;
    }
  }

  return SliceImplGpu<int, CUDAContext>(
      output, data, starts_host_, ends_host_, &context_);
}

REGISTER_CUDA_OPERATOR(Slice, SliceOp<int, CUDAContext>);
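
// Illustrative usage (hypothetical net snippet, comments only): take columns
// 1..2 of a 2-D tensor X on the GPU. Negative "ends" values count from one
// past the last index, so -1 keeps a dimension whole:
//   op {
//     type: "Slice"
//     input: "X"
//     output: "Y"
//     arg { name: "starts" ints: 0 ints: 1 }
//     arg { name: "ends" ints: -1 ints: 3 }
//     device_option { device_type: 1 } // 1 == CUDA
//   }
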
template <>
bool SliceGradientOp<int, CUDAContext>::RunOnDevice() {
  auto* gdata = Output(0);
  auto& data = Input(0);

  if (InputSize() == 4) {
    starts_host_.CopyFrom<CUDAContext>(Input(1));
    ends_host_.CopyFrom<CUDAContext>(Input(2));

    auto& go = Input(3);
    return SliceImplGpu<int, CUDAContext>(
        nullptr, data, starts_host_, ends_host_, &context_, gdata, &go);
  } else {
    if (!statically_inited_) {
      CAFFE_ENFORCE(HasArgument("starts"));
      CAFFE_ENFORCE(HasArgument("ends"));
      CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());

      starts_host_.Resize(starts_.size());
      ends_host_.Resize(ends_.size());
      memcpy(
          starts_host_.mutable_data<int>(),
          starts_.data(),
          sizeof(int) * starts_.size());
      memcpy(
          ends_host_.mutable_data<int>(),
          ends_.data(),
          sizeof(int) * ends_.size());
      statically_inited_ = true;
    }
    auto& go = Input(1);
    return SliceImplGpu<int, CUDAContext>(
        nullptr, data, starts_host_, ends_host_, &context_, gdata, &go);
  }
}

REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp<int, CUDAContext>);
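
// Illustrative gradient semantics (comments only): if the forward op sliced a
// 1-D tensor of size 5 with starts = [1] and ends = [4], SliceGradient
// scatters the 3-element output gradient back into a zero-filled tensor of
// the input shape:
//   go    = [g1, g2, g3]
//   gdata = [0, g1, g2, g3, 0]
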
} // namespace caffe2