| #include "THCUNN.h" |
| #include "common.h" |
| |
| #include <thrust/fill.h> |
| #include <thrust/functional.h> |
| #include <thrust/device_ptr.h> |
| #include <thrust/reduce.h> |
| #include <thrust/inner_product.h> |
| #if CUDA_VERSION >= 7000 |
| #include <thrust/system/cuda/execution_policy.h> |
| #endif |
| |
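// Pointwise Smooth L1 loss (Huber loss with delta = 1):
//   f(x, y) = 0.5 * (x - y)^2    if |x - y| < 1
//           = |x - y| - 0.5      otherwise
// Applied elementwise; the reduction over all elements happens in updateOutput.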
struct smoothl1_functor
{
  smoothl1_functor() {}

  __host__ __device__ float operator()(const float &x, const float &y) const
  {
    float z = fabsf(x-y);
    return z < 1.f ? 0.5f*z*z : z - 0.5f;
  }
};

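// updateOutput: output[0] = sum_i f(input_i, target_i), divided by the number
// of elements when sizeAverage is true. The sum is computed on the GPU with
// thrust::inner_product, running on the current THC stream for CUDA >= 7.0.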
void THNN_CudaSmoothL1Criterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
{
  THCUNN_assertSameGPU(state, 2, input, target);
  THArgCheck(
    THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
    "input and target need to have the same number of elements"
  );

  long size = THCudaTensor_nElement(state, input);

  input = THCudaTensor_newContiguous(state, input);
  target = THCudaTensor_newContiguous(state, target);

  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
  float sum = thrust::inner_product(
#if CUDA_VERSION >= 7000
    thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
    input_data, input_data+size, target_data, (float) 0,
    thrust::plus<float>(), smoothl1_functor()
  );

  if (sizeAverage)
    sum /= size;

  THCudaTensor_free(state, input);
  THCudaTensor_free(state, target);

  THCudaTensor_set1d(state, output, 0, sum);
}

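// Gradient of the pointwise Smooth L1 loss with respect to the input:
//   df/dx = x - y         if |x - y| < 1
//         = sign(x - y)   otherwise
// i.e. clamp(x - y, -1, 1), scaled by norm (1/N when sizeAverage is true).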
struct smoothl1_updateGradInput_functor
{
  const float norm;

  smoothl1_updateGradInput_functor(float norm_)
    : norm(norm_)
  {}

  __host__ __device__ float operator()(const float &x, const float &y) const
  {
    float z = x - y;
    if (z < -1.f)
      return -norm;
    else if (z > 1.f)
      return norm;
    else
      return norm * z;
  }
};

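// updateGradInput: gradInput_i = norm * clamp(input_i - target_i, -1, 1),
// computed elementwise on the GPU with thrust::transform on the current
// THC stream for CUDA >= 7.0.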
void THNN_CudaSmoothL1Criterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
{
  THCUNN_assertSameGPU(state, 3, input, target, gradInput);
  THArgCheck(
    THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
    "input and target need to have the same number of elements"
  );

  long size = THCudaTensor_nElement(state, input);
  float norm = sizeAverage ? 1./size : 1.;

  input = THCudaTensor_newContiguous(state, input);
  target = THCudaTensor_newContiguous(state, target);

  THCudaTensor_resizeAs(state, gradInput, input);

  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));

  thrust::transform(
#if CUDA_VERSION >= 7000
    thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
    input_data, input_data+size, target_data, gradInput_data,
    smoothl1_updateGradInput_functor(norm)
  );

  THCudaTensor_free(state, input);
  THCudaTensor_free(state, target);
}