blob: 068784b676de950766bd49c15a391a97d4f5f56e [file] [log] [blame]
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/accuracy_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
__global__ void AccuracyKernel(const int N, const int D, const float* Xdata,
const int* labeldata, float* accuracy) {
int count = 0;
CUDA_1D_KERNEL_LOOP(i, N) {
float maxval = Xdata[i * D];
int maxid = 0;
for (int j = 1; j < D; ++j) {
if (Xdata[i * D + j] > maxval) {
maxval = Xdata[i * D + j];
maxid = j;
}
}
if (maxid == labeldata[i]) {
++count;
}
}
atomicAdd(accuracy, static_cast<float>(count));
}
__global__ void AccuracyDivideKernel(const int N, float* accuracy) {
*accuracy /= N;
}
} // namespace
template <>
bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
CAFFE_ENFORCE_EQ(
top_k_, 1, "Currently only top-1 accuracy supported");
auto& X = Input(PREDICTION);
auto& label = Input(LABEL);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim32(0);
int D = X.dim32(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim32(0), N);
Y->Resize(vector<TIndex>());
float* Ydata = Y->mutable_data<float>();
math::Set<float, CUDAContext>(1, 0, Ydata, &context_);
AccuracyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, X.data<float>(), label.data<int>(), Ydata);
// This is going to be executed only in one single kernel. Not very beautiful,
// but probably we have to do this?
AccuracyDivideKernel<<<1, 1, 0, context_.cuda_stream()>>>(
N, Ydata);
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Accuracy, AccuracyOp<float, CUDAContext>);
} // namespace
} // namespace caffe2