caffe2/operators/slice_op.cu - platform/external/pytorch - Git at Google

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/slice_op.h"
 #include "caffe2/utils/math.h"

 namespace caffe2 {
 namespace {
 __global__ void SliceCopyKernel(
     char* src_offset_bytes,
     int src_block_size_bytes,
     char* dst_offset_bytes,
     int dst_block_size_bytes,
     int copy_size,
     int itemsize,
     int num_blocks) {
   if ((copy_size % sizeof(int) == 0) &&
       (src_block_size_bytes % sizeof(int) == 0) &&
       (dst_block_size_bytes % sizeof(int) == 0)) {
     int* src = (int*)src_offset_bytes;
     int* dst = (int*)dst_offset_bytes;

     int src_block_size = src_block_size_bytes / sizeof(int);
     int dst_block_size = dst_block_size_bytes / sizeof(int);

     int copyChunks = copy_size / sizeof(int);

     CUDA_1D_KERNEL_LOOP(index, num_blocks * copyChunks) {
       int chunk = index % copyChunks;
       int block = index / copyChunks;

       dst[block * dst_block_size + chunk] = src[block * src_block_size + chunk];
     }
   } else {
     char* src = (char*)src_offset_bytes;
     char* dst = (char*)dst_offset_bytes;

     int src_block_size = src_block_size_bytes / sizeof(char);
     int dst_block_size = dst_block_size_bytes / sizeof(char);

     int copyChunks = copy_size / sizeof(char);

     CUDA_1D_KERNEL_LOOP(index, num_blocks * copyChunks) {
       int chunk = index % copyChunks;
       int block = index / copyChunks;

       dst[block * dst_block_size + chunk] = src[block * src_block_size + chunk];
     }
   }
 }

 template <class SIndex, class Context>
 bool SliceImplGpu(
     Tensor<Context>* output,
     const Tensor<Context>& data,
     const TensorCPU& starts,
     const TensorCPU& ends,
     Context* context,
     Tensor<Context>* gdata = nullptr,
     const Tensor<Context>* go = nullptr) {
   bool backward = output == nullptr;

   auto* starts_data = starts.template data<SIndex>();
   auto* ends_data = ends.template data<SIndex>();

   CAFFE_ENFORCE_EQ(starts.ndim(), 1);
   CAFFE_ENFORCE_EQ(ends.ndim(), 1);
   CAFFE_ENFORCE_GE(data.ndim(), starts.size());
   CAFFE_ENFORCE_EQ(starts.size(), ends.size());

   std::vector<int> starts_idx(data.ndim());
   std::vector<int> ends_idx(data.ndim());
   std::vector<int> dst_sizes(data.ndim());

   for (int i = 0; i < data.ndim(); ++i) {
     if (i >= starts.size()) {
       starts_idx[i] = 0;
       ends_idx[i] = data.dims()[i];
       continue;
     }
     if (data.dims()[i] > 0) {
       auto start = starts_data[i];
       auto end = ends_data[i];
       if (start < 0) {
         start = data.dims()[i] + 1 + start;
       }
       if (end < 0) {
         end = data.dims()[i] + 1 + end;
       }
       if (start > data.dims()[i]) {
         start = data.dims()[i];
       }
       if (end > data.dims()[i]) {
         end = data.dims()[i];
       }
       CAFFE_ENFORCE_GE(start, 0);
       CAFFE_ENFORCE_GE(end, 0);
       CAFFE_ENFORCE_GE(end, start);
       starts_idx[i] = start;
       ends_idx[i] = end;
       dst_sizes[i] = end - start;
     } else {
       starts_idx[i] = 0;
       ends_idx[i] = 0;
       dst_sizes[i] = 0;
     }
   }

   if (data.size() <= 0) {
     // When the input is empty, we do not need to do copy.
     if (!backward) {
       output->Resize(dst_sizes);
       output->raw_mutable_data(data.meta());
     }
     return true;
   }
   // for now only supports slicing in 1 dimension
   int dim = -1;
   for (int i = 0; i < data.ndim(); ++i) {
     if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) {
       CAFFE_ENFORCE_EQ(
           dim, -1, "Currently only possible to slice in 1 dimension.");
       dim = i;
     }
   }
   if (dim == -1) {
     if (!backward) {
       output->CopyFrom(data, context);
     } else {
       gdata->CopyFrom(*go, context);
     }
     return true;
   }
   int unit = std::accumulate(
       data.dims().begin() + dim + 1,
       data.dims().end(),
       1,
       std::multiplies<int>());
   int num_blocks = std::accumulate(
       data.dims().begin(),
       data.dims().begin() + dim,
       1,
       std::multiplies<int>());
   if (!backward) {
     output->Resize(dst_sizes);
   } else {
     gdata->ResizeLike(data);
   }

   auto itemsize = data.meta().itemsize();

   if (!backward) {
     char* src_bytes = (char*)data.raw_data();
     char* dst_bytes = (char*)output->raw_mutable_data(data.meta());

     size_t src_nbytes = data.nbytes();
     size_t dst_nbytes = output->nbytes();

     size_t src_block_size = unit * data.dims()[dim];
     size_t dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
     size_t src_offset = unit * starts_idx[dim];

     if (num_blocks == 0 || dst_block_size == 0) {
       return true;
     }

     size_t src_block_size_bytes = itemsize * src_block_size;
     size_t dst_block_size_bytes = itemsize * dst_block_size;
     char* src_offset_bytes = src_bytes + itemsize * src_offset;
     char* dst_offset_bytes = dst_bytes;

     SliceCopyKernel<<<
         std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
         src_offset_bytes,
         src_block_size_bytes,
         dst_offset_bytes,
         dst_block_size_bytes,
         dst_block_size_bytes,
         itemsize,
         num_blocks);
   } else {
     char* src_bytes = (char*)go->raw_data();
     char* dst_bytes = (char*)gdata->raw_mutable_data(go->meta());

     size_t src_nbytes = go->nbytes();
     size_t dst_nbytes = gdata->nbytes();

     size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
     size_t dst_block_size = unit * data.dims()[dim];
     size_t dst_offset = unit * starts_idx[dim];

     if (num_blocks == 0 || dst_block_size == 0) {
       return true;
     }

     size_t src_block_size_bytes = itemsize * src_block_size;
     size_t dst_block_size_bytes = itemsize * dst_block_size;

     char* src_offset_bytes = src_bytes;
     char* dst_offset_bytes = dst_bytes + itemsize * dst_offset;
     // Zero out gradient blob before copy since we copy in fewer items than
     // there is space for
     math::Set<float, CUDAContext>(
         gdata->size(),
         0.0f,
         (float*)gdata->raw_mutable_data(go->meta()),
         context);

     // If output tensor is empty, just return zeroed gradient tensor
     if (!src_bytes) {
       return true;
     }

     SliceCopyKernel<<<
         std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
         src_offset_bytes,
         src_block_size_bytes,
         dst_offset_bytes,
         dst_block_size_bytes,
         src_block_size_bytes,
         itemsize,
         num_blocks);
   }

   return true;
 }

 } // namespace

 template <>
 bool SliceOp<int, CUDAContext>::RunOnDevice() {
   auto* output = Output(0);
   auto& data = Input(0);

   if (InputSize() > 1) {
     starts_host_.CopyFrom<CUDAContext>(Input(1));
     ends_host_.CopyFrom<CUDAContext>(Input(2));
   } else {
     if (!statically_inited_) {
       CAFFE_ENFORCE(HasArgument("starts"));
       CAFFE_ENFORCE(HasArgument("ends"));
       CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());

       starts_host_.Resize(starts_.size());
       ends_host_.Resize(ends_.size());

       memcpy(
           starts_host_.mutable_data<int>(),
           starts_.data(),
           sizeof(int) * starts_.size());
       memcpy(
           ends_host_.mutable_data<int>(),
           ends_.data(),
           sizeof(int) * ends_.size());
       statically_inited_ = true;
     }
   }

   return SliceImplGpu<int, CUDAContext>(
       output, data, starts_host_, ends_host_, &context_);
 }

 REGISTER_CUDA_OPERATOR(Slice, SliceOp<int, CUDAContext>);

 template <>
 bool SliceGradientOp<int, CUDAContext>::RunOnDevice() {
   auto* gdata = Output(0);
   auto& data = Input(0);

   if (InputSize() == 4) {
     starts_host_.CopyFrom<CUDAContext>(Input(1));
     ends_host_.CopyFrom<CUDAContext>(Input(2));

     auto& go = Input(3);

     return SliceImplGpu<int, CUDAContext>(
         nullptr, data, starts_host_, ends_host_, &context_, gdata, &go);
   } else {
     if (!statically_inited_) {
       CAFFE_ENFORCE(HasArgument("starts"));
       CAFFE_ENFORCE(HasArgument("ends"));
       CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());

       starts_host_.Resize(starts_.size());
       ends_host_.Resize(ends_.size());

       memcpy(
           starts_host_.mutable_data<int>(),
           starts_.data(),
           sizeof(int) * starts_.size());
       memcpy(
           ends_host_.mutable_data<int>(),
           ends_.data(),
           sizeof(int) * ends_.size());

       statically_inited_ = true;
     }
     auto& go = Input(1);

     return SliceImplGpu<int, CUDAContext>(
         nullptr, data, starts_host_, ends_host_, &context_, gdata, &go);
   }
 }
 REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp<int, CUDAContext>);
 } // namespace caffe2
	#include "caffe2/core/context_gpu.h"
	#include "caffe2/operators/slice_op.h"
	#include "caffe2/utils/math.h"

	namespace caffe2 {
	namespace {
	__global__ void SliceCopyKernel(
	char* src_offset_bytes,
	int src_block_size_bytes,
	char* dst_offset_bytes,
	int dst_block_size_bytes,
	int copy_size,
	int itemsize,
	int num_blocks) {
	if ((copy_size % sizeof(int) == 0) &&
	(src_block_size_bytes % sizeof(int) == 0) &&
	(dst_block_size_bytes % sizeof(int) == 0)) {
	int* src = (int*)src_offset_bytes;
	int* dst = (int*)dst_offset_bytes;

	int src_block_size = src_block_size_bytes / sizeof(int);
	int dst_block_size = dst_block_size_bytes / sizeof(int);

	int copyChunks = copy_size / sizeof(int);

	CUDA_1D_KERNEL_LOOP(index, num_blocks * copyChunks) {
	int chunk = index % copyChunks;
	int block = index / copyChunks;

	dst[block * dst_block_size + chunk] = src[block * src_block_size + chunk];
	}
	} else {
	char* src = (char*)src_offset_bytes;
	char* dst = (char*)dst_offset_bytes;

	int src_block_size = src_block_size_bytes / sizeof(char);
	int dst_block_size = dst_block_size_bytes / sizeof(char);

	int copyChunks = copy_size / sizeof(char);

	CUDA_1D_KERNEL_LOOP(index, num_blocks * copyChunks) {
	int chunk = index % copyChunks;
	int block = index / copyChunks;

	dst[block * dst_block_size + chunk] = src[block * src_block_size + chunk];
	}
	}
	}

	template <class SIndex, class Context>
	bool SliceImplGpu(
	Tensor<Context>* output,
	const Tensor<Context>& data,
	const TensorCPU& starts,
	const TensorCPU& ends,
	Context* context,
	Tensor<Context>* gdata = nullptr,
	const Tensor<Context>* go = nullptr) {
	bool backward = output == nullptr;

	auto* starts_data = starts.template data<SIndex>();
	auto* ends_data = ends.template data<SIndex>();

	CAFFE_ENFORCE_EQ(starts.ndim(), 1);
	CAFFE_ENFORCE_EQ(ends.ndim(), 1);
	CAFFE_ENFORCE_GE(data.ndim(), starts.size());
	CAFFE_ENFORCE_EQ(starts.size(), ends.size());

	std::vector<int> starts_idx(data.ndim());
	std::vector<int> ends_idx(data.ndim());
	std::vector<int> dst_sizes(data.ndim());

	for (int i = 0; i < data.ndim(); ++i) {
	if (i >= starts.size()) {
	starts_idx[i] = 0;
	ends_idx[i] = data.dims()[i];
	continue;
	}
	if (data.dims()[i] > 0) {
	auto start = starts_data[i];
	auto end = ends_data[i];
	if (start < 0) {
	start = data.dims()[i] + 1 + start;
	}
	if (end < 0) {
	end = data.dims()[i] + 1 + end;
	}
	if (start > data.dims()[i]) {
	start = data.dims()[i];
	}
	if (end > data.dims()[i]) {
	end = data.dims()[i];
	}
	CAFFE_ENFORCE_GE(start, 0);
	CAFFE_ENFORCE_GE(end, 0);
	CAFFE_ENFORCE_GE(end, start);
	starts_idx[i] = start;
	ends_idx[i] = end;
	dst_sizes[i] = end - start;
	} else {
	starts_idx[i] = 0;
	ends_idx[i] = 0;
	dst_sizes[i] = 0;
	}
	}

	if (data.size() <= 0) {
	// When the input is empty, we do not need to do copy.
	if (!backward) {
	output->Resize(dst_sizes);
	output->raw_mutable_data(data.meta());
	}
	return true;
	}
	// for now only supports slicing in 1 dimension
	int dim = -1;
	for (int i = 0; i < data.ndim(); ++i) {
	if (starts_idx[i] > 0 \|\| ends_idx[i] < data.dims()[i]) {
	CAFFE_ENFORCE_EQ(
	dim, -1, "Currently only possible to slice in 1 dimension.");
	dim = i;
	}
	}
	if (dim == -1) {
	if (!backward) {
	output->CopyFrom(data, context);
	} else {
	gdata->CopyFrom(*go, context);
	}
	return true;
	}
	int unit = std::accumulate(
	data.dims().begin() + dim + 1,
	data.dims().end(),
	1,
	std::multiplies<int>());
	int num_blocks = std::accumulate(
	data.dims().begin(),
	data.dims().begin() + dim,
	1,
	std::multiplies<int>());
	if (!backward) {
	output->Resize(dst_sizes);
	} else {
	gdata->ResizeLike(data);
	}

	auto itemsize = data.meta().itemsize();

	if (!backward) {
	char* src_bytes = (char*)data.raw_data();
	char* dst_bytes = (char*)output->raw_mutable_data(data.meta());

	size_t src_nbytes = data.nbytes();
	size_t dst_nbytes = output->nbytes();

	size_t src_block_size = unit * data.dims()[dim];
	size_t dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
	size_t src_offset = unit * starts_idx[dim];

	if (num_blocks == 0 \|\| dst_block_size == 0) {
	return true;
	}

	size_t src_block_size_bytes = itemsize * src_block_size;
	size_t dst_block_size_bytes = itemsize * dst_block_size;
	char* src_offset_bytes = src_bytes + itemsize * src_offset;
	char* dst_offset_bytes = dst_bytes;

	SliceCopyKernel<<<
	std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context->cuda_stream()>>>(
	src_offset_bytes,
	src_block_size_bytes,
	dst_offset_bytes,
	dst_block_size_bytes,
	dst_block_size_bytes,
	itemsize,
	num_blocks);
	} else {
	char* src_bytes = (char*)go->raw_data();
	char* dst_bytes = (char*)gdata->raw_mutable_data(go->meta());

	size_t src_nbytes = go->nbytes();
	size_t dst_nbytes = gdata->nbytes();

	size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
	size_t dst_block_size = unit * data.dims()[dim];
	size_t dst_offset = unit * starts_idx[dim];

	if (num_blocks == 0 \|\| dst_block_size == 0) {
	return true;
	}

	size_t src_block_size_bytes = itemsize * src_block_size;
	size_t dst_block_size_bytes = itemsize * dst_block_size;

	char* src_offset_bytes = src_bytes;
	char* dst_offset_bytes = dst_bytes + itemsize * dst_offset;
	// Zero out gradient blob before copy since we copy in fewer items than
	// there is space for
	math::Set<float, CUDAContext>(
	gdata->size(),
	0.0f,
	(float*)gdata->raw_mutable_data(go->meta()),
	context);

	// If output tensor is empty, just return zeroed gradient tensor
	if (!src_bytes) {
	return true;
	}

	SliceCopyKernel<<<
	std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context->cuda_stream()>>>(
	src_offset_bytes,
	src_block_size_bytes,
	dst_offset_bytes,
	dst_block_size_bytes,
	src_block_size_bytes,
	itemsize,
	num_blocks);
	}

	return true;
	}

	} // namespace

	template <>
	bool SliceOp<int, CUDAContext>::RunOnDevice() {
	auto* output = Output(0);
	auto& data = Input(0);

	if (InputSize() > 1) {
	starts_host_.CopyFrom<CUDAContext>(Input(1));
	ends_host_.CopyFrom<CUDAContext>(Input(2));
	} else {
	if (!statically_inited_) {
	CAFFE_ENFORCE(HasArgument("starts"));
	CAFFE_ENFORCE(HasArgument("ends"));
	CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());

	starts_host_.Resize(starts_.size());
	ends_host_.Resize(ends_.size());

	memcpy(
	starts_host_.mutable_data<int>(),
	starts_.data(),
	sizeof(int) * starts_.size());
	memcpy(
	ends_host_.mutable_data<int>(),
	ends_.data(),
	sizeof(int) * ends_.size());
	statically_inited_ = true;
	}
	}

	return SliceImplGpu<int, CUDAContext>(
	output, data, starts_host_, ends_host_, &context_);
	}

	REGISTER_CUDA_OPERATOR(Slice, SliceOp<int, CUDAContext>);

	template <>
	bool SliceGradientOp<int, CUDAContext>::RunOnDevice() {
	auto* gdata = Output(0);
	auto& data = Input(0);

	if (InputSize() == 4) {
	starts_host_.CopyFrom<CUDAContext>(Input(1));
	ends_host_.CopyFrom<CUDAContext>(Input(2));

	auto& go = Input(3);

	return SliceImplGpu<int, CUDAContext>(
	nullptr, data, starts_host_, ends_host_, &context_, gdata, &go);
	} else {
	if (!statically_inited_) {
	CAFFE_ENFORCE(HasArgument("starts"));
	CAFFE_ENFORCE(HasArgument("ends"));
	CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());

	starts_host_.Resize(starts_.size());
	ends_host_.Resize(ends_.size());

	memcpy(
	starts_host_.mutable_data<int>(),
	starts_.data(),
	sizeof(int) * starts_.size());
	memcpy(
	ends_host_.mutable_data<int>(),
	ends_.data(),
	sizeof(int) * ends_.size());

	statically_inited_ = true;
	}
	auto& go = Input(1);

	return SliceImplGpu<int, CUDAContext>(
	nullptr, data, starts_host_, ends_host_, &context_, gdata, &go);
	}
	}
	REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp<int, CUDAContext>);
	} // namespace caffe2