blob: e6f7614e57d3e66a8f8f350035e7a53a6ec545fc [file] [log] [blame]
#include "caffe2/operators/glu_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
// Gated Linear Unit forward kernel: Y = X1 * sigmoid(X2).
//
// The indexing below treats Xdata as a dense (M, 2 * split_dim_size, N) array
// and Ydata as a dense (M, split_dim_size, N) array. For every outer index i,
// the first split_dim_size * N values of X's slab are the "value" half (X1)
// and the next split_dim_size * N values are the "gate" half (X2).
//
// Parameters:
//   M              - outer extent (number of slabs).
//   split_dim_size - extent of the split axis in the output (half of the
//                    input's extent along that axis).
//   N              - inner extent.
//   Xdata          - input; elements [0, 2 * M * split_dim_size * N) are read.
//   Ydata          - output; elements [0, M * split_dim_size * N) are written.
//
// CUDA_1D_KERNEL_LOOP (defined outside this file) supplies the `index` values
// each thread processes; every output element is computed independently.
//
// NOTE(review): all index arithmetic is 32-bit `int`, so this assumes
// 2 * M * split_dim_size * N fits in an int -- confirm against callers.
__global__ void glu_kernel(
    const int M,
    const int split_dim_size,
    const int N,
    const float* Xdata,
    float* Ydata) {
  // Number of output elements per outer index; X holds twice as many.
  const int yOffset = split_dim_size * N;
  const int total = M * split_dim_size * N;
  CUDA_1D_KERNEL_LOOP(index, total) {
    // `index` is already the flat offset into Y (i * yOffset + j * N + k), so
    // only the outer index i has to be recovered with a division.
    const int i = index / yOffset;
    // Flat offset of the matching "value" element in X:
    //   i * (2 * yOffset) + (j * N + k) == index + i * yOffset.
    const int valueIdx = index + i * yOffset;
    const float x1 = Xdata[valueIdx];
    // The "gate" element sits exactly one half-slab (yOffset) further on.
    const float x2 = Xdata[valueIdx + yOffset];
    // Float literals and expf keep the sigmoid in single precision; the
    // original `1.` literals are doubles and promoted the arithmetic.
    Ydata[index] = x1 * (1.0f / (1.0f + expf(-x2)));
  }
}
} // namespace
// CUDA specialization of GluOp::ComputeGlu for float.
//
// Enqueues glu_kernel on this operator's CUDA stream to compute
// Y = X1 * sigmoid(X2), where X1 and X2 are the two halves of X along the
// split axis.
//
// Parameters:
//   M              - outer extent.
//   split_dim_size - extent of the split axis in the output.
//   N              - inner extent.
//   x_data         - input pointer, handed to the kernel unchanged.
//   y_data         - output pointer, handed to the kernel unchanged.
//
// The launch is asynchronous: this function returns once the kernel has been
// enqueued on context_.cuda_stream(), not when it has finished.
template <>
void GluOp<float, CUDAContext>::ComputeGlu(
    const int M,
    const int split_dim_size,
    const int N,
    const float* x_data,
    float* y_data) {
  // An empty output needs no work. Returning early also avoids issuing a
  // launch whose grid size may come out as zero blocks, depending on how
  // CAFFE_GET_BLOCKS rounds (its definition is not visible in this file).
  if (M <= 0 || split_dim_size <= 0 || N <= 0) {
    return;
  }
  // One logical work item per output element.
  const int output_size = M * N * split_dim_size;
  glu_kernel<<<
      CAFFE_GET_BLOCKS(output_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(M, split_dim_size, N, x_data, y_data);
}
// Register the float/CUDAContext instantiation of GluOp under the operator
// name "Glu" via the REGISTER_CUDA_OPERATOR macro (defined outside this file).
REGISTER_CUDA_OPERATOR(Glu, GluOp<float, CUDAContext>);
} // namespace caffe2