/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "modules/detectron/batch_permutation_op.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

namespace {
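// Permutes the batch (N) dimension of an NCHW blob according to `indices`.
// With forward == true each output sample n is gathered from input sample
// indices[n]; with forward == false the values are scattered back instead,
// i.e. dst[indices[n]] = src[n], which is the adjoint used for the gradient.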
template <bool forward>
__global__ void BatchPermutationKernel(
    int N,
    int C,
    int H,
    int W,
    const float* src,
    const int* indices,
    float* dst) {
  CUDA_1D_KERNEL_LOOP(index, N * C * H * W) {
    // Decompose the linear index into (n, c, h, w) coordinates.
    int w = index % W;
    int h = (index / W) % H;
    int c = (index / W / H) % C;
    int n = (index / W / H / C);
    int idx = indices[n];
    if (forward) {
      // Gather: output sample n is read from input sample indices[n].
      dst[n * C * H * W + c * H * W + h * W + w] =
          src[idx * C * H * W + c * H * W + h * W + w];
    } else {
      // Scatter: the gradient of output sample n is written back to input
      // sample indices[n].
      dst[idx * C * H * W + c * H * W + h * W + w] =
          src[n * C * H * W + c * H * W + h * W + w];
    }
  }
}
} // namespace
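// Forward op: Y[n] = X[indices[n]] for a 4-D NCHW input X and a 1-D int32
// `indices` tensor whose length matches the batch size of X.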

template <>
bool BatchPermutationOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& indices = Input(1);

  CAFFE_ENFORCE(indices.ndim() == 1, "indices must be 1-d");
  CAFFE_ENFORCE(
      X.dim32(0) == indices.dim32(0),
      "X.dim32(0) must be equal to indices.dim32(0)",
      "(",
      X.dim32(0),
      " vs. ",
      indices.dim32(0),
      ")");

  auto* Y = Output(0, X.sizes(), at::dtype<float>());

  BatchPermutationKernel<true><<<
      CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.dim32(0),
      X.dim32(1),
      X.dim32(2),
      X.dim32(3),
      X.data<float>(),
      indices.data<int>(),
      Y->mutable_data<float>());

  return true;
}
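// Gradient op: scatters dY back through the permutation, dX[indices[n]] = dY[n].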

template <>
bool BatchPermutationGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& indices = Input(0);
  auto& dY = Input(1);

  auto* dX = Output(0, dY.sizes(), at::dtype<float>());

  BatchPermutationKernel<false><<<
      CAFFE_GET_BLOCKS(dY.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      dY.dim32(0),
      dY.dim32(1),
      dY.dim32(2),
      dY.dim32(3),
      dY.data<float>(),
      indices.data<int>(),
      dX->mutable_data<float>());

  return true;
}

REGISTER_CUDA_OPERATOR(
    BatchPermutation,
    BatchPermutationOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    BatchPermutationGradient,
    BatchPermutationGradientOp<float, CUDAContext>);
} // namespace caffe2