#include <algorithm>

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/boolean_mask_ops.h"
#include <cub/cub.cuh>
#include "caffe2/utils/cub_namespace.cuh"

namespace caffe2 {

namespace {
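// Gathers the rows selected by `indices` into a densely packed output:
// each block copies one output row, with threads striding over its bytes.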
__global__ void BooleanMaskCopyKernel(
    const int64_t numOfOutput,
    const int64_t numBytes,
    const int64_t* indices,
    const uint8_t* src,
    uint8_t* dest) {
  for (int64_t i = blockIdx.x; i < numOfOutput; i += gridDim.x) {
    const auto srcBase = indices[i] * numBytes;
    const auto destBase = i * numBytes;
    for (int64_t j = threadIdx.x; j < numBytes; j += blockDim.x) {
      dest[destBase + j] = src[srcBase + j];
    }
  }
}
} // namespace

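// CUDA specialization of BooleanMask: keeps the rows of Input(0) whose mask
// entry is true, packing them contiguously into Output(0). Row selection is
// done with CUB stream compaction; the byte-wise copy kernel above gathers
// the selected rows.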
template <>
class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
 public:
  BooleanMaskOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws) {}

  bool RunOnDevice() override {
    const auto& src = Input(0);
    const auto& mask = Input(1);
    auto* dest = Output(0);

    CAFFE_ENFORCE(src.dim() >= 1);
    CAFFE_ENFORCE_EQ(mask.dim(), 1);
    CAFFE_ENFORCE_EQ(src.size(0), mask.size(0));

    const auto* maskData = mask.data<bool>();
    const auto outerSize = mask.size(0);
    ReinitializeTensor(
        &indices_, {outerSize}, at::dtype<int64_t>().device(CUDA));
    auto* indicesData = indices_.mutable_data<int64_t>();

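    // CUB's DeviceSelect::Flagged follows the usual two-pass pattern: the
    // first call, with a null temp-storage pointer, only computes the scratch
    // size required (numBytes); the second call performs the compaction,
    // writing the indices of set mask entries and their count.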
    size_t numBytes = 0;
    cub::CountingInputIterator<int> itr(0);
    cub::DeviceSelect::Flagged(
        nullptr,
        numBytes,
        itr,
        maskData,
        indicesData,
        static_cast<int64_t*>(nullptr),
        outerSize,
        context_.cuda_stream());

    auto numint64_t = static_cast<int64_t>(
        (numBytes + sizeof(int64_t) - 1) / sizeof(int64_t));
    // Allocate one extra int64_t at the end of scratch for storing numOfOutput.
    ReinitializeTensor(
        &scratch_, {numint64_t + 1}, at::dtype<int64_t>().device(CUDA));
    auto* scratchData = scratch_.mutable_data<int64_t>();
    auto* numOfOutputData = scratchData + numint64_t;

    cub::DeviceSelect::Flagged(
        static_cast<void*>(scratchData),
        numBytes,
        itr,
        maskData,
        indicesData,
        numOfOutputData,
        outerSize,
        context_.cuda_stream());

    // Copy numOfOutput from GPU to CPU.
    int64_t numOfOutput;
    context_.CopyToCPU(1, numOfOutputData, &numOfOutput);

    indices_.Resize(numOfOutput);
    std::vector<int64_t> dims = src.sizes().vec();
    dims[0] = numOfOutput;
    dest->Resize(dims);
    auto* destData = (uint8_t*)dest->raw_mutable_data(src.meta());
    const auto* srcData = (uint8_t*)src.raw_data();
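    // Pre-allocate the optional indices output so it is defined even when
    // numOfOutput is zero and the copy below is skipped.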
    if (OutputSize() == 2) {
      auto* indicesOut = Output(1, {numOfOutput}, at::dtype<int64_t>());
      indicesOut->template mutable_data<int64_t>();
    }

    if (numOfOutput > 0) {
      BooleanMaskCopyKernel<<<
          std::min(numOfOutput, static_cast<int64_t>(CAFFE_MAXIMUM_NUM_BLOCKS)),
          CAFFE_CUDA_NUM_THREADS,
          0,
          context_.cuda_stream()>>>(
          numOfOutput,
          src.size_from_dim(1) * src.meta().itemsize(),
          indicesData,
          srcData,
          destData);
      C10_CUDA_KERNEL_LAUNCH_CHECK();

      if (OutputSize() == 2) {
        Output(1)->CopyFrom(indices_, /* async */ true);
      }
    }

    return true;
  }

 private:
  Tensor indices_;
  Tensor scratch_;
};

REGISTER_CUDA_OPERATOR(BooleanMask, BooleanMaskOp<CUDAContext>);
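
// Example semantics (illustrative): with src = [[1, 2], [3, 4], [5, 6]] and
// mask = [true, false, true], Output(0) is [[1, 2], [5, 6]], and the optional
// second output holds the kept row indices [0, 2].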

namespace {

#define minf (-1.0f * std::numeric_limits<float>::infinity())

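// Masks positions at or beyond each row's sequence length, writing fill_val
// there. With batching (B >= 0) the input is viewed as B x N x M and the
// sequence length is looked up by the middle (N) index; otherwise it is
// viewed as N x M with one length per row.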
template <typename T>
__global__ void sequenceMaskKernel(
    int N,
    int M,
    int B,
    const T* in,
    const int* seq_lengths,
    T fill_val,
    T* out) {
  if (B >= 0) {
    CUDA_1D_KERNEL_LOOP(index, B * N * M) {
      int k = index % M;
      int j = (index - k) / M % N;
      int i = (index - M * j - k) / (N * M);

      int ind = N * M * i + M * j + k;
      out[ind] = (k >= seq_lengths[j] ? fill_val : in[ind]);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, N * M) {
      int i = index / M;
      int j = index % M;

      out[index] = (j >= seq_lengths[i] ? fill_val : in[index]);
    }
  }
}

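// Variant of the sequence mask where each of the M positions carries D
// repeated trailing elements that all share the same keep/fill decision.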
template <typename T>
__global__ void repeatedSequenceMaskKernel(
    int N,
    int M,
    int D,
    const T* in,
    const int* seq_lengths,
    T fill_val,
    T* out) {
  CUDA_1D_KERNEL_LOOP(index, N * M * D) {
    int i = index / (D * M);
    int j = (index / D) % M;

    out[index] = (j >= seq_lengths[i] ? fill_val : in[index]);
  }
}

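// Masks every position that falls outside the window
// [center - radius, center + radius] around each row's center.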
template <typename T>
__global__ void windowMaskKernel(
    int N,
    int M,
    int B,
    const T* in,
    const int* window_centers,
    const int radius,
    T fill_val,
    T* out) {
  if (B >= 0) {
    CUDA_1D_KERNEL_LOOP(index, B * N * M) {
      int k = index % M;
      int j = (index - k) / M % N;
      int i = (index - M * j - k) / (N * M);

      int ind = N * M * i + M * j + k;
      out[ind] =
          (k < window_centers[j] - radius || k > window_centers[j] + radius
               ? fill_val
               : in[ind]);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, N * M) {
      int i = index / M;
      int j = index % M;

      out[index] =
          (j < window_centers[i] - radius || j > window_centers[i] + radius
               ? fill_val
               : in[index]);
    }
  }
}

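// Masks entries strictly above the main diagonal (col > row).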
template <typename T>
__global__ void
upperMaskKernel(int N, int M, int B, const T* in, T fill_val, T* out) {
  if (B >= 0) {
    CUDA_1D_KERNEL_LOOP(index, B * N * M) {
      int k = index % M;
      int j = (index - k) / M % N;
      int i = (index - M * j - k) / (N * M);

      int ind = N * M * i + M * j + k;
      out[ind] = (k > j ? fill_val : in[ind]);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, N * M) {
      int i = index / M;
      int j = index % M;

      out[index] = (j > i ? fill_val : in[index]);
    }
  }
}

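// Masks entries strictly below the main diagonal (col < row).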
template <typename T>
__global__ void
lowerMaskKernel(int N, int M, int B, const T* in, T fill_val, T* out) {
  if (B >= 0) {
    CUDA_1D_KERNEL_LOOP(index, B * N * M) {
      int k = index % M;
      int j = (index - k) / M % N;
      int i = (index - M * j - k) / (N * M);

      int ind = N * M * i + M * j + k;
      out[ind] = (k < j ? fill_val : in[ind]);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, N * M) {
      int i = index / M;
      int j = index % M;

      out[index] = (j < i ? fill_val : in[index]);
    }
  }
}

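// Masks entries on or above the main diagonal (col >= row).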
template <typename T>
__global__ void
upperDiagMaskKernel(int N, int M, int B, const T* in, T fill_val, T* out) {
  if (B >= 0) {
    CUDA_1D_KERNEL_LOOP(index, B * N * M) {
      int k = index % M;
      int j = (index - k) / M % N;
      int i = (index - M * j - k) / (N * M);

      int ind = N * M * i + M * j + k;
      out[ind] = (k >= j ? fill_val : in[ind]);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, N * M) {
      int i = index / M;
      int j = index % M;

      out[index] = (j >= i ? fill_val : in[index]);
    }
  }
}

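// Masks entries on or below the main diagonal (col <= row).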
template <typename T>
__global__ void
lowerDiagMaskKernel(int N, int M, int B, const T* in, T fill_val, T* out) {
  if (B >= 0) {
    CUDA_1D_KERNEL_LOOP(index, B * N * M) {
      int k = index % M;
      int j = (index - k) / M % N;
      int i = (index - M * j - k) / (N * M);

      int ind = N * M * i + M * j + k;
      out[ind] = (k <= j ? fill_val : in[ind]);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, N * M) {
      int i = index / M;
      int j = index % M;

      out[index] = (j <= i ? fill_val : in[index]);
    }
  }
}

} // namespace

template <>
bool SequenceMaskOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<at::Half, float>>::call(this, Input(0));
}

template <>
template <class T>
bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
  const Tensor* input = &Input(0);
  const Tensor* sequence_lengths = nullptr;
  const Tensor* window_centers = nullptr;

  if (mode_ == "sequence") {
    sequence_lengths = &Input(1);
  } else if (mode_ == "window") {
    window_centers = &Input(1);
  }

  auto* output = Output(0, input->sizes(), at::dtype<T>());

  const auto canonical_axis = input->canonical_axis_index(axis_);

  // canonical_batch is non-negative if batching, -1 otherwise
  int canonical_batch = -1;
  if (HasArgument("batch")) {
    canonical_batch = input->canonical_axis_index(batch_);
  }

  // make sure batch < axis
  if (canonical_batch >= 0) {
    CAFFE_ENFORCE_LT(canonical_batch, canonical_axis);
  }

  // if no batch, left is the product of dims up to axis;
  // otherwise, left is the product of dims between batch and axis
  const int left =
      (canonical_batch >= 0
           ? input->size_between_dim(canonical_batch, canonical_axis)
           : input->size_to_dim(canonical_axis));
  const int right = input->size_from_dim(canonical_axis);
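  // For example, a (2, 3, 4) input with axis = 2 and no batch gives
  // left = 2 * 3 = 6 rows of right = 4 elements each.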

  // product of dims up to and including the batch dim (-1 if not batching)
  const int batch_dim =
      (canonical_batch >= 0
           ? input->size_to_dim(canonical_batch) * input->dim(canonical_batch)
           : -1);

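  // In the gradient pass, masked positions receive 0 rather than fill_val_:
  // the fill value is a constant, so no gradient flows through masked entries.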
  T fill_val = convert::To<float, T>(grad_ ? 0.0f : fill_val_);
  if (mode_ == "sequence") {
    if (HasArgument("repeat_from_axis")) {
      const int canonical_repeat_from =
          input->canonical_axis_index(repeat_from_);
      const int repeated_dims = input->size_from_dim(canonical_repeat_from);
      const int masked_dims = right / repeated_dims;
      repeatedSequenceMaskKernel<<<
          CAFFE_GET_BLOCKS(left * right),
          CAFFE_CUDA_NUM_THREADS,
          0,
          context_.cuda_stream()>>>(
          left,
          masked_dims,
          repeated_dims,
          input->data<T>(),
          sequence_lengths->data<int>(),
          fill_val,
          output->template mutable_data<T>());
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else {
      sequenceMaskKernel<<<
          CAFFE_GET_BLOCKS(left * right),
          CAFFE_CUDA_NUM_THREADS,
          0,
          context_.cuda_stream()>>>(
          left,
          right,
          batch_dim,
          input->data<T>(),
          sequence_lengths->data<int>(),
          fill_val,
          output->template mutable_data<T>());
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  } else if (mode_ == "window") {
    windowMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        left,
        right,
        batch_dim,
        input->data<T>(),
        window_centers->data<int>(),
        radius_,
        fill_val,
        output->template mutable_data<T>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (mode_ == "upper") {
    upperMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        left,
        right,
        batch_dim,
        input->data<T>(),
        fill_val,
        output->template mutable_data<T>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (mode_ == "lower") {
    lowerMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        left,
        right,
        batch_dim,
        input->data<T>(),
        fill_val,
        output->template mutable_data<T>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (mode_ == "upperdiag") {
    upperDiagMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        left,
        right,
        batch_dim,
        input->data<T>(),
        fill_val,
        output->template mutable_data<T>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (mode_ == "lowerdiag") {
    lowerDiagMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        left,
        right,
        batch_dim,
        input->data<T>(),
        fill_val,
        output->template mutable_data<T>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
  }

  return true;
}

REGISTER_CUDA_OPERATOR(SequenceMask, SequenceMaskOp<CUDAContext>);
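
// Example semantics (illustrative): in "sequence" mode, a 2 x 3 input with
// seq_lengths = [2, 1] keeps the first two entries of row 0 and the first
// entry of row 1; every other position is set to fill_val.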

} // namespace caffe2