/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "modules/detectron/batch_permutation_op.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

namespace {
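// Permutes the batch (N) dimension of an NCHW blob according to `indices`.
// With forward == true each output sample n is gathered from input sample
// indices[n]; with forward == false the values are scattered back instead,
// i.e. dst[indices[n]] = src[n], which is the adjoint used for the gradient.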
template <bool forward>
__global__ void BatchPermutationKernel(
    int N,
    int C,
    int H,
    int W,
    const float* src,
    const int* indices,
    float* dst) {
  CUDA_1D_KERNEL_LOOP(index, N * C * H * W) {
    // Decompose the linear index into (n, c, h, w) coordinates.
    int w = index % W;
    int h = (index / W) % H;
    int c = (index / W / H) % C;
    int n = (index / W / H / C);
    int idx = indices[n];
    if (forward) {
      // Gather: output sample n is read from input sample indices[n].
      dst[n * C * H * W + c * H * W + h * W + w] =
          src[idx * C * H * W + c * H * W + h * W + w];
    } else {
      // Scatter: the gradient of output sample n is written back to input
      // sample indices[n].
      dst[idx * C * H * W + c * H * W + h * W + w] =
          src[n * C * H * W + c * H * W + h * W + w];
    }
  }
}
} // namespace
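// Forward op: Y[n] = X[indices[n]] for a 4-D NCHW input X and a 1-D int32
// `indices` tensor whose length matches the batch size of X.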

template <>
bool BatchPermutationOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& indices = Input(1);

  CAFFE_ENFORCE(indices.ndim() == 1, "indices must be 1-d");
  CAFFE_ENFORCE(
      X.dim32(0) == indices.dim32(0),
      "X.dim32(0) must be equal to indices.dim32(0)",
      "(",
      X.dim32(0),
      " vs. ",
      indices.dim32(0),
      ")");

  auto* Y = Output(0, X.sizes(), at::dtype<float>());

  BatchPermutationKernel<true><<<
      CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.dim32(0),
      X.dim32(1),
      X.dim32(2),
      X.dim32(3),
      X.data<float>(),
      indices.data<int>(),
      Y->mutable_data<float>());

  return true;
}
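// Gradient op: scatters dY back through the permutation, dX[indices[n]] = dY[n].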

template <>
bool BatchPermutationGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& indices = Input(0);
  auto& dY = Input(1);

  auto* dX = Output(0, dY.sizes(), at::dtype<float>());

  BatchPermutationKernel<false><<<
      CAFFE_GET_BLOCKS(dY.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      dY.dim32(0),
      dY.dim32(1),
      dY.dim32(2),
      dY.dim32(3),
      dY.data<float>(),
      indices.data<int>(),
      dX->mutable_data<float>());

  return true;
}

REGISTER_CUDA_OPERATOR(
    BatchPermutation,
    BatchPermutationOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    BatchPermutationGradient,
    BatchPermutationGradientOp<float, CUDAContext>);
} // namespace caffe2