blob: 2b528ef4b99d541bbb5f1a271b066e5c3f6b262c [file] [log] [blame]
#include "THCUNN.h"
#include "common.h"
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#include <thrust/inner_product.h>
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif
struct smoothl1_functor
{
smoothl1_functor() {}
__host__ __device__ float operator()(const float &x, const float &y) const
{
float z = fabsf(x-y);
return z < 1.f ? 0.5f*z*z : z - 0.5f;
}
};
void THNN_CudaSmoothL1Criterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
{
THCUNN_assertSameGPU(state, 2, input, target);
THArgCheck(
THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
"input and target need to have the same number of elements"
);
long size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
float sum = thrust::inner_product(
#if CUDA_VERSION >= 7000
thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
input_data, input_data+size, target_data, (float) 0,
thrust::plus<float>(), smoothl1_functor()
);
if (sizeAverage)
sum /= size;
THCudaTensor_free(state, input);
THCudaTensor_free(state, target);
THCudaTensor_set1d(state, output, 0, sum);
}
struct smoothl1_updateGradInput_functor
{
const float norm;
smoothl1_updateGradInput_functor(float norm_)
: norm(norm_)
{}
__host__ __device__ float operator()(const float &x, const float &y) const
{
float z = x - y;
if (z < -1.f)
return -norm;
else if (z > 1.f)
return norm;
else
return norm * z;
}
};
void THNN_CudaSmoothL1Criterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
{
THCUNN_assertSameGPU(state, 3, input, target, gradInput);
THArgCheck(
THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
"input and target need to have the same number of elements"
);
long size = THCudaTensor_nElement(state, input);
float norm = sizeAverage ? 1./size : 1.;
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
THCudaTensor_resizeAs(state, gradInput, input);
thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
thrust::transform(
#if CUDA_VERSION >= 7000
thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
input_data, input_data+size, target_data, gradInput_data,
smoothl1_updateGradInput_functor(norm)
);
THCudaTensor_free(state, input);
THCudaTensor_free(state, target);
}