#include <assert.h>
#include <cub/block/block_reduce.cuh>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/cross_entropy_op.h"
#include "caffe2/operators/operator_fallback_gpu.h"
#include "caffe2/utils/cub_namespace.cuh"
namespace caffe2 {
namespace {
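// Computes the per-example loss Y[i] = -log(max(X[i, label[i]], log_threshold))
// for an N x D probability matrix X and integer labels in [0, D).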
__global__ void LabelCrossEntropyKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float log_threshold, float* Ydata) {
CUDA_1D_KERNEL_LOOP(i, N) {
CUDA_KERNEL_ASSERT(labeldata[i] >= 0 && labeldata[i] < D);
Ydata[i] = -logf(fmaxf(Xdata[i * D + labeldata[i]], log_threshold));
}
}
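// Scatters the gradient back into the labeled entry only:
// dX[i, label[i]] = -dY[i] / max(X[i, label[i]], log_threshold).
// All other entries of dX are left at zero by the caller.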
__global__ void LabelCrossEntropyGradientKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float* dYdata, const float log_threshold, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
int idx = i * D + labeldata[i];
dXdata[idx] = - dYdata[i] / fmaxf(Xdata[idx], log_threshold);
}
}
} // namespace
template <>
bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = Input(1);
int N, D;
if (X.dim() > 1) {
N = X.dim32(0);
D = X.size_from_dim(1);
} else {
N = 1;
D = X.dim32(0);
}
CAFFE_ENFORCE(
(label.dim() == 1) || (label.dim() == 2 && label.dim32(1) == 1));
CAFFE_ENFORCE_EQ(label.dim32(0), N);
auto* Y = Output(0, vector<int64_t>(size_t(1), N), at::dtype<float>());
LabelCrossEntropyKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
N,
D,
X.data<float>(),
label.data<int>(),
kLOG_THRESHOLD(),
Y->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template <>
bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = Input(1);
auto& dY = Input(2);
int N, D;
if (X.dim() > 1) {
N = X.dim32(0);
D = X.size_from_dim(1);
} else {
N = 1;
D = X.dim32(0);
}
CAFFE_ENFORCE(
(label.dim() == 1) || (label.dim() == 2 && label.dim32(1) == 1));
CAFFE_ENFORCE_EQ(label.dim32(0), N);
CAFFE_ENFORCE_EQ(dY.dim(), 1);
CAFFE_ENFORCE_EQ(dY.dim32(0), N);
auto* dX = Output(0, X.sizes(), at::dtype<float>());
math::Set<float, CUDAContext>(
dX->numel(), 0.f, dX->template mutable_data<float>(), &context_);
LabelCrossEntropyGradientKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
N,
D,
X.data<float>(),
label.data<int>(),
dY.data<float>(),
kLOG_THRESHOLD(),
dX->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
namespace {
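// Expands a probability p = X[i] into a two-class distribution
// Y[i, :] = {1 - p, p}.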
__global__ void MakeTwoClassKernel(
const int N, const float* Xdata, float* Ydata) {
CUDA_1D_KERNEL_LOOP(i, N) {
Ydata[i * 2] = 1.0 - Xdata[i];
Ydata[i * 2 + 1] = Xdata[i];
}
}
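// Gradient of MakeTwoClassKernel: dX[i] = dY[i, 1] - dY[i, 0].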
__global__ void MakeTwoClassGradientKernel(
const int N, const float* dYdata, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
dXdata[i] = dYdata[i * 2 + 1] - dYdata[i * 2];
}
}
} // namespace
template <>
bool MakeTwoClassOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto shape = X.sizes().vec();
shape.push_back(2);
CAFFE_ENFORCE_LT(X.numel(), std::numeric_limits<int>::max() / 2);
auto* Y = Output(0, shape, at::dtype<float>());
int N = X.numel();
MakeTwoClassKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
N, X.data<float>(), Y->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template <>
bool MakeTwoClassGradientOp<float, CUDAContext>::RunOnDevice() {
auto& dY = Input(0);
auto shape = dY.sizes().vec();
CAFFE_ENFORCE_GE(shape.size(), 1);
CAFFE_ENFORCE_EQ(shape.back(), 2);
shape.pop_back();
CAFFE_ENFORCE_LT(dY.numel(), std::numeric_limits<int>::max());
auto* dX = Output(0, shape, at::dtype<float>());
int N = dX->numel();
MakeTwoClassGradientKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
N, dY.data<float>(), dX->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
namespace {
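// Numerically stable elementwise helpers. Each one rewrites log(1 + exp(x))
// via the identity log(1 + exp(x)) = max(x, 0) + log(1 + exp(-|x|)), so exp()
// is only ever evaluated at a non-positive argument and cannot overflow:
//   sigmoid_xent_forward(x, t)  = t * x - log(1 + exp(x))
//   sigmoid_xent_backward(x, t) = t - sigmoid(x)
//   sigmoid_partition(x)        = log(1 + exp(x))
// The *_with_log_d_trick variants compute (2t - 1) * (x - sigmoid_partition(x))
// and its derivative; the unjoined_* variants compute
// t * x - (1 - t) * log(1 + exp(x)) and its derivative.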
__device__ float sigmoid_xent_forward(float lgt, float tgt) {
return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}
__device__ float sigmoid_xent_backward(float lgt, float tgt) {
return tgt - 1. / (1. + exp(-lgt));
}
__device__ float sigmoid_partition(float lgt) {
  // Computes log(1 + exp(lgt)) so that exp() is only ever evaluated at a
  // non-positive argument, avoiding overflow for large positive lgt.
return lgt * (lgt >= 0) + log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}
__device__ float sigmoid_xent_forward_with_log_d_trick(float lgt, float tgt) {
return (2 * tgt - 1.) * (lgt - sigmoid_partition(lgt));
}
__device__ float sigmoid_xent_backward_with_log_d_trick(float lgt, float tgt) {
return (2 * tgt - 1.) / (1. + exp(lgt));
}
__device__ float unjoined_sigmoid_xent_forward(float lgt, float tgt) {
return lgt * tgt + (tgt - 1) * lgt * (lgt >= 0) -
(1 - tgt) * log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}
__device__ float unjoined_sigmoid_xent_backward(float lgt, float tgt) {
return tgt - (1. - tgt) / (1. + exp(-lgt));
}
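// One CUDA block per outer row. Threads stride over the inner dimension,
// accumulate per-element forward values, reduce them with cub::BlockReduce,
// and thread 0 writes the negated mean: out[i] = -sum / inner_size.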
__global__ void SigmoidCrossEntropyWithLogitsKernel(
const int inner_size,
const bool log_D_trick,
const bool unjoined_lr_loss,
const float* logits_ptr,
const float* targets_ptr,
float* out_ptr) {
int i = blockIdx.x;
int last_idx = (i + 1) * inner_size;
float value = 0;
for (int in_idx = i * inner_size + threadIdx.x; in_idx < last_idx;
in_idx += blockDim.x) {
if (unjoined_lr_loss) {
value += unjoined_sigmoid_xent_forward(
logits_ptr[in_idx], targets_ptr[in_idx]);
} else {
value +=
(log_D_trick
? sigmoid_xent_forward_with_log_d_trick(
logits_ptr[in_idx], targets_ptr[in_idx])
: sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]));
}
}
typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float sum = BlockReduce(temp_storage).Sum(value);
if (threadIdx.x == 0) {
out_ptr[i] = -sum / inner_size;
}
}
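// Flat grid-stride loop over all outer_size * inner_size elements. Each
// element's gradient is the per-element backward value scaled by
// -g[i] / inner_size, mirroring the mean taken in the forward kernel.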
__global__ void SigmoidCrossEntropyGradientWithLogitsKernel(
const int outer_size,
const int inner_size,
const bool log_D_trick,
const bool unjoined_lr_loss,
const float* g_ptr,
const float* logits_ptr,
const float* targets_ptr,
float* out_ptr) {
CUDA_1D_KERNEL_LOOP(in_idx, outer_size * inner_size) {
int i = in_idx / inner_size;
auto g_factor = -g_ptr[i] / inner_size;
if (unjoined_lr_loss) {
out_ptr[in_idx] = g_factor *
unjoined_sigmoid_xent_backward(
logits_ptr[in_idx], targets_ptr[in_idx]);
} else {
out_ptr[in_idx] = g_factor *
(log_D_trick ? sigmoid_xent_backward_with_log_d_trick(
logits_ptr[in_idx], targets_ptr[in_idx])
: sigmoid_xent_backward(
logits_ptr[in_idx], targets_ptr[in_idx]));
}
}
}
} // namespace
template <>
bool SigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::RunOnDevice() {
auto& logits = Input(0);
auto& targets = Input(1);
CAFFE_ENFORCE_EQ(logits.sizes(), targets.sizes());
const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1;
const auto outer_size = logits.numel() / inner_size;
std::vector<int64_t> dims;
if (logits.dim() != 0) {
dims =
std::vector<int64_t>(logits.sizes().begin(), logits.sizes().end() - 1);
}
auto* out = Output(0, dims, at::dtype<float>());
auto* out_ptr = out->template mutable_data<float>();
auto* logits_ptr = logits.data<float>();
auto* targets_ptr = targets.data<float>();
if (logits.numel() <= 0) {
    // Empty input: nothing to do, so skip the kernel launch entirely.
return true;
}
SigmoidCrossEntropyWithLogitsKernel<<<
outer_size,
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
inner_size,
log_D_trick_,
unjoined_lr_loss_,
logits_ptr,
targets_ptr,
out_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template <>
bool SigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>::
RunOnDevice() {
auto& g = Input(0);
auto& logits = Input(1);
auto& targets = Input(2);
CAFFE_ENFORCE_EQ(logits.sizes(), targets.sizes());
const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1;
const auto outer_size = logits.numel() / inner_size;
CAFFE_ENFORCE_EQ(g.numel(), outer_size);
auto* out = Output(0, logits.sizes(), at::dtype<float>());
auto* out_ptr = out->template mutable_data<float>();
auto* logits_ptr = logits.data<float>();
auto* targets_ptr = targets.data<float>();
auto* g_ptr = g.data<float>();
SigmoidCrossEntropyGradientWithLogitsKernel<<<
CAFFE_GET_BLOCKS(outer_size * inner_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
outer_size,
inner_size,
log_D_trick_,
unjoined_lr_loss_,
g_ptr,
logits_ptr,
targets_ptr,
out_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
namespace {
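// Same block-per-row reduction as SigmoidCrossEntropyWithLogitsKernel, but
// each per-element loss is multiplied by its weight before the block sum.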
__global__ void WeightedSigmoidCrossEntropyWithLogitsKernel(
const int inner_size,
const float* logits_ptr,
const float* targets_ptr,
const float* weights_ptr,
float* out_ptr) {
int i = blockIdx.x;
int last_idx = (i + 1) * inner_size;
float value = 0;
for (int in_idx = i * inner_size + threadIdx.x; in_idx < last_idx;
in_idx += blockDim.x) {
value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]) *
weights_ptr[in_idx];
}
typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float sum = BlockReduce(temp_storage).Sum(value);
if (threadIdx.x == 0) {
out_ptr[i] = -sum / inner_size;
}
}
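// Weighted counterpart of SigmoidCrossEntropyGradientWithLogitsKernel:
// each element's gradient is additionally scaled by weights[in_idx].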
__global__ void WeightedSigmoidCrossEntropyGradientWithLogitsKernel(
const int outer_size,
const int inner_size,
const float* g_ptr,
const float* logits_ptr,
const float* targets_ptr,
const float* weights_ptr,
float* out_ptr) {
CUDA_1D_KERNEL_LOOP(in_idx, outer_size * inner_size) {
int i = in_idx / inner_size;
auto g_factor = -g_ptr[i] / inner_size;
out_ptr[in_idx] = g_factor *
sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]) *
weights_ptr[in_idx];
}
}
} // namespace
template <>
bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::
RunOnDevice() {
auto& logits = Input(0);
auto& targets = Input(1);
auto& weights = Input(2);
CAFFE_ENFORCE_EQ(logits.sizes(), targets.sizes());
CAFFE_ENFORCE_EQ(weights.sizes(), targets.sizes());
const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1;
const auto outer_size = logits.numel() / inner_size;
std::vector<int64_t> dims;
if (logits.dim() != 0) {
dims =
std::vector<int64_t>(logits.sizes().begin(), logits.sizes().end() - 1);
}
auto* out = Output(0, dims, at::dtype<float>());
auto* out_ptr = out->template mutable_data<float>();
auto* logits_ptr = logits.data<float>();
auto* targets_ptr = targets.data<float>();
auto* weights_ptr = weights.data<float>();
WeightedSigmoidCrossEntropyWithLogitsKernel<<<
outer_size,
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
inner_size, logits_ptr, targets_ptr, weights_ptr, out_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template <>
bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>::
RunOnDevice() {
auto& g = Input(0);
auto& logits = Input(1);
auto& targets = Input(2);
auto& weights = Input(3);
CAFFE_ENFORCE_EQ(logits.sizes(), targets.sizes());
CAFFE_ENFORCE_EQ(weights.sizes(), targets.sizes());
const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1;
const auto outer_size = logits.numel() / inner_size;
CAFFE_ENFORCE_EQ(g.numel(), outer_size);
auto* out = Output(0, logits.sizes(), at::dtype<float>());
auto* out_ptr = out->template mutable_data<float>();
auto* logits_ptr = logits.data<float>();
auto* targets_ptr = targets.data<float>();
auto* weights_ptr = weights.data<float>();
auto* g_ptr = g.data<float>();
WeightedSigmoidCrossEntropyGradientWithLogitsKernel<<<
CAFFE_GET_BLOCKS(outer_size * inner_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
outer_size,
inner_size,
g_ptr,
logits_ptr,
targets_ptr,
weights_ptr,
out_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
REGISTER_CUDA_OPERATOR(LabelCrossEntropy,
LabelCrossEntropyOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(LabelCrossEntropyGradient,
LabelCrossEntropyGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
SigmoidCrossEntropyWithLogits,
SigmoidCrossEntropyWithLogitsOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
SigmoidCrossEntropyWithLogitsGradient,
SigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
WeightedSigmoidCrossEntropyWithLogits,
WeightedSigmoidCrossEntropyWithLogitsOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
WeightedSigmoidCrossEntropyWithLogitsGradient,
WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MakeTwoClass,
MakeTwoClassOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MakeTwoClassGradient,
MakeTwoClassGradientOp<float, CUDAContext>);
// TODO(surya): Add full GPU/CUDA support for the CrossEntropyOp.
REGISTER_CUDA_OPERATOR(CrossEntropy, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(CrossEntropyGradient, GPUFallbackOp);
} // namespace caffe2