// torch/lib/THCUNN/MultiMarginCriterion.cu - platform/external/pytorch - Git at Google

 #include "THCUNN.h"
 #include "common.h"
 #include "THCHalf.h"
 #include "THCHalfAutoNumerics.cuh"

 // Capacity of the per-block __shared__ scratch buffer declared in both kernels
 // below. Every thread writes buffer[threadIdx.x], so blockDim.x must not exceed
 // this value.
 // NOTE(review): presumably also used as the launch block size - confirm in
 // generic/MultiMarginCriterion.cu.
 #define MULTIMARGIN_THREADS 128

 // Forward pass of the multi-class margin criterion; one thread block per row.
 //
 // Block k (= blockIdx.x) reads the `dim` contiguous scores of row k of `input`
 // and writes a single scalar to output[k]:
 //
 //   output[k] = sum_{i != t} w * h(margin - x[t] + x[i]) / denom
 //
 // where t = target[k] - TH_INDEX_BASE, h(z) is 0 for z <= 0 and otherwise z
 // (P == 1) or z*z (any other P), w = weights[t] when `weights` is non-null
 // (no scaling otherwise), and denom = dim, or nframe*dim when sizeAverage.
 //
 // Preconditions (not checked here):
 //   - blockDim.x <= MULTIMARGIN_THREADS (size of the shared buffer);
 //   - target[k] - TH_INDEX_BASE lies in [0, dim), otherwise the row is read
 //     out of bounds;
 //   - dim > 0.
 // NOTE(review): presumably launched with one block per sample (nframe blocks)
 // - confirm at the launch site.
 template <int P, typename Dtype, typename Acctype>
 __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
 {
   __shared__ Acctype buffer[MULTIMARGIN_THREADS];
   int k = blockIdx.x;
   // 64-bit row offset: k*dim evaluated in int wraps once the input holds
   // 2^31 or more elements.
   Dtype *input_k = input + static_cast<long long>(k) * dim;
   Dtype *output_k = output + k;
   int target_k = ((int)target[k]) - TH_INDEX_BASE;
   Dtype input_target_k = input_k[target_k];
   // weights[target_k] does not depend on the loop index: load it once.
   // The fallback value is never used (the multiply is guarded by `weights`).
   Dtype weight_k = weights ? weights[target_k] : ScalarConvert<int, Dtype>::to(1);

   int i_start = threadIdx.x;
   int i_end = dim;
   int i_step = blockDim.x;

   // Each thread accumulates a partial sum over columns
   // threadIdx.x, threadIdx.x + blockDim.x, ...
   buffer[threadIdx.x] = 0;
   for (int i = i_start; i < i_end; i += i_step)
   {
     // The target class itself contributes nothing to the loss.
     if (i == target_k)
       continue;

     Dtype z = margin - input_target_k + input_k[i];
     if (z > 0) {
       Dtype h = (P==1) ? z : z*z;
       if(weights)
         h *= weight_k;
       buffer[threadIdx.x] += h;
     }
   }
   // All partial sums must be visible before thread 0 reads them.
   __syncthreads();

   // reduce: thread 0 adds the per-thread partial sums serially
   if (threadIdx.x == 0)
   {
     Acctype sum = 0;
     for (int i=0; i < blockDim.x; i++)
       sum += buffer[i];

     // Normalise entirely in the accumulator type and round to Dtype once.
     // Dividing by nframe after the conversion (as before) did that division
     // in Dtype, which loses precision when Dtype is half.
     Acctype denom = static_cast<Acctype>(sizeAverage ? static_cast<long long>(nframe) * dim
                                                      : static_cast<long long>(dim));
     *output_k = ScalarConvert<Acctype, Dtype>::to(sum / denom);
   }
 }

 // Backward pass of the multi-class margin criterion; one thread block per row.
 //
 // Block k (= blockIdx.x) fills row k of `gradInput` (dim contiguous values).
 // With t = target[k] - TH_INDEX_BASE, z_i = margin - x[t] + x[i] and
 // g = 1/dim (or 1/(nframe*dim) when sizeAverage):
 //
 //   gradInput[k][i] = 0                          if i != t and z_i <= 0
 //   gradInput[k][i] = h_i                        if i != t and z_i >  0
 //   gradInput[k][t] = -sum of all h_i written above
 //
 // where h_i = g (P == 1) or 2*g*z_i (any other P), scaled by weights[t] when
 // `weights` is non-null.
 //
 // Preconditions (not checked here):
 //   - blockDim.x <= MULTIMARGIN_THREADS (size of the shared buffer);
 //   - target[k] - TH_INDEX_BASE lies in [0, dim), otherwise the rows are
 //     accessed out of bounds;
 //   - dim > 0.
 // NOTE(review): presumably launched with one block per sample (nframe blocks)
 // - confirm at the launch site.
 template <int P, typename Dtype, typename Acctype>
 __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
 {
   __shared__ Acctype buffer[MULTIMARGIN_THREADS];
   int k = blockIdx.x;
   // 64-bit row offset: k*dim evaluated in int wraps once the tensors hold
   // 2^31 or more elements.
   const long long row_offset = static_cast<long long>(k) * dim;
   Dtype *input_k = input + row_offset;
   Dtype *gradInput_k = gradInput + row_offset;
   int target_k = ((int)target[k]) - TH_INDEX_BASE;
   Dtype input_target_k = input_k[target_k];
   // Build the normaliser without int overflow (nframe*dim) and without the
   // double literal `1.`, which forced a double-precision division even when
   // Acctype is float.
   const long long denom = sizeAverage ? static_cast<long long>(nframe) * dim
                                       : static_cast<long long>(dim);
   Acctype g = static_cast<Acctype>(1) / static_cast<Acctype>(denom);
   // weights[target_k] does not depend on the loop index: load it once.
   // The fallback value is never used (the multiply is guarded by `weights`).
   Dtype weight_k = weights ? weights[target_k] : ScalarConvert<int, Dtype>::to(1);

   int i_start = threadIdx.x;
   int i_end = dim;
   int i_step = blockDim.x;

   // buffer[threadIdx.x] collects minus the gradients this thread hands out;
   // their total becomes the gradient of the target column.
   buffer[threadIdx.x] = 0;
   for (int i=i_start; i<i_end; i+=i_step)
   {
     // The target column is written by thread 0 after the reduction.
     if (i == target_k)
       continue;

     Dtype z = margin - input_target_k + input_k[i];
     if (z > 0)
     {
       Dtype h = ScalarConvert<Acctype, Dtype>::to((P == 1) ? g : 2*g*z);
       if(weights)
         h *= weight_k;
       buffer[threadIdx.x] -= h;
       gradInput_k[i] = h;
     }
     else
       gradInput_k[i] = ScalarConvert<int, Dtype>::to(0);
   }

   // All partial sums must be visible before thread 0 reads them.
   __syncthreads();

   // reduce: thread 0 adds the per-thread partial sums serially
   if (threadIdx.x == 0)
   {
     Acctype gradInput_target_k = 0;
     for (int i=0; i<blockDim.x; i++)
       gradInput_target_k += buffer[i];
     gradInput_k[target_k] = ScalarConvert<Acctype, Dtype>::to(gradInput_target_k);
   }
 }

 // NOTE(review): presumably the generic file holds the host-side wrappers that
 // launch the kernels above, and THCGenerateFloatTypes.h instantiates it once
 // per floating-point type - confirm against those headers.
 #include "generic/MultiMarginCriterion.cu"
 #include "THCGenerateFloatTypes.h"

 // Remove the macro defined at the top of this file so it does not stay
 // defined past this point.
 #undef MULTIMARGIN_THREADS
	#include "THCUNN.h"
	#include "common.h"
	#include "THCHalf.h"
	#include "THCHalfAutoNumerics.cuh"

	// Capacity of the per-block __shared__ scratch buffer declared in the two
	// kernels that follow. Every thread writes buffer[threadIdx.x], so
	// blockDim.x must not exceed this value.
	// NOTE(review): presumably also used as the launch block size - confirm in
	// generic/MultiMarginCriterion.cu.
	#define MULTIMARGIN_THREADS 128

	// Forward pass of the multi-class margin criterion; one thread block per row.
	//
	// NOTE(review): this copy had lost every `*` (pointer declarators and the
	// `k*dim` product) and could not compile; they are restored here. It still
	// duplicates the definition earlier in this file - one of the two copies
	// should be removed.
	//
	// Block k (= blockIdx.x) reads the `dim` contiguous scores of row k of `input`
	// and writes a single scalar to output[k]:
	//
	//   output[k] = sum_{i != t} w * h(margin - x[t] + x[i]) / denom
	//
	// where t = target[k] - TH_INDEX_BASE, h(z) is 0 for z <= 0 and otherwise z
	// (P == 1) or z*z (any other P), w = weights[t] when `weights` is non-null
	// (no scaling otherwise), and denom = dim, or nframe*dim when sizeAverage.
	//
	// Preconditions (not checked here):
	//   - blockDim.x <= MULTIMARGIN_THREADS (size of the shared buffer);
	//   - target[k] - TH_INDEX_BASE lies in [0, dim), otherwise the row is read
	//     out of bounds;
	//   - dim > 0.
	template <int P, typename Dtype, typename Acctype>
	__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
	{
	  __shared__ Acctype buffer[MULTIMARGIN_THREADS];
	  int k = blockIdx.x;
	  // 64-bit row offset: k*dim evaluated in int wraps once the input holds
	  // 2^31 or more elements.
	  Dtype *input_k = input + static_cast<long long>(k) * dim;
	  Dtype *output_k = output + k;
	  int target_k = ((int)target[k]) - TH_INDEX_BASE;
	  Dtype input_target_k = input_k[target_k];
	  // weights[target_k] does not depend on the loop index: load it once.
	  // The fallback value is never used (the multiply is guarded by `weights`).
	  Dtype weight_k = weights ? weights[target_k] : ScalarConvert<int, Dtype>::to(1);

	  int i_start = threadIdx.x;
	  int i_end = dim;
	  int i_step = blockDim.x;

	  // Each thread accumulates a partial sum over columns
	  // threadIdx.x, threadIdx.x + blockDim.x, ...
	  buffer[threadIdx.x] = 0;
	  for (int i = i_start; i < i_end; i += i_step)
	  {
	    // The target class itself contributes nothing to the loss.
	    if (i == target_k)
	      continue;

	    Dtype z = margin - input_target_k + input_k[i];
	    if (z > 0) {
	      Dtype h = (P==1) ? z : z*z;
	      if(weights)
	        h *= weight_k;
	      buffer[threadIdx.x] += h;
	    }
	  }
	  // All partial sums must be visible before thread 0 reads them.
	  __syncthreads();

	  // reduce: thread 0 adds the per-thread partial sums serially
	  if (threadIdx.x == 0)
	  {
	    Acctype sum = 0;
	    for (int i=0; i < blockDim.x; i++)
	      sum += buffer[i];

	    // Normalise entirely in the accumulator type and round to Dtype once;
	    // dividing by nframe after the conversion would do that division in
	    // Dtype, which loses precision when Dtype is half.
	    Acctype denom = static_cast<Acctype>(sizeAverage ? static_cast<long long>(nframe) * dim
	                                                     : static_cast<long long>(dim));
	    *output_k = ScalarConvert<Acctype, Dtype>::to(sum / denom);
	  }
	}

	// Backward pass of the multi-class margin criterion; one thread block per row.
	//
	// NOTE(review): this copy had lost every `*` (pointer declarators, `k*dim`,
	// and `2*g*z`, which had become `2gz`) and could not compile; they are
	// restored here. It still duplicates the definition earlier in this file -
	// one of the two copies should be removed.
	//
	// Block k (= blockIdx.x) fills row k of `gradInput` (dim contiguous values).
	// With t = target[k] - TH_INDEX_BASE, z_i = margin - x[t] + x[i] and
	// g = 1/dim (or 1/(nframe*dim) when sizeAverage):
	//
	//   gradInput[k][i] = 0                          if i != t and z_i <= 0
	//   gradInput[k][i] = h_i                        if i != t and z_i >  0
	//   gradInput[k][t] = -sum of all h_i written above
	//
	// where h_i = g (P == 1) or 2*g*z_i (any other P), scaled by weights[t] when
	// `weights` is non-null.
	//
	// Preconditions (not checked here):
	//   - blockDim.x <= MULTIMARGIN_THREADS (size of the shared buffer);
	//   - target[k] - TH_INDEX_BASE lies in [0, dim), otherwise the rows are
	//     accessed out of bounds;
	//   - dim > 0.
	template <int P, typename Dtype, typename Acctype>
	__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
	{
	  __shared__ Acctype buffer[MULTIMARGIN_THREADS];
	  int k = blockIdx.x;
	  // 64-bit row offset: k*dim evaluated in int wraps once the tensors hold
	  // 2^31 or more elements.
	  const long long row_offset = static_cast<long long>(k) * dim;
	  Dtype *input_k = input + row_offset;
	  Dtype *gradInput_k = gradInput + row_offset;
	  int target_k = ((int)target[k]) - TH_INDEX_BASE;
	  Dtype input_target_k = input_k[target_k];
	  // Build the normaliser without int overflow (nframe*dim) and without a
	  // double literal, which would force a double-precision division even
	  // when Acctype is float.
	  const long long denom = sizeAverage ? static_cast<long long>(nframe) * dim
	                                      : static_cast<long long>(dim);
	  Acctype g = static_cast<Acctype>(1) / static_cast<Acctype>(denom);
	  // weights[target_k] does not depend on the loop index: load it once.
	  // The fallback value is never used (the multiply is guarded by `weights`).
	  Dtype weight_k = weights ? weights[target_k] : ScalarConvert<int, Dtype>::to(1);

	  int i_start = threadIdx.x;
	  int i_end = dim;
	  int i_step = blockDim.x;

	  // buffer[threadIdx.x] collects minus the gradients this thread hands out;
	  // their total becomes the gradient of the target column.
	  buffer[threadIdx.x] = 0;
	  for (int i=i_start; i<i_end; i+=i_step)
	  {
	    // The target column is written by thread 0 after the reduction.
	    if (i == target_k)
	      continue;

	    Dtype z = margin - input_target_k + input_k[i];
	    if (z > 0)
	    {
	      Dtype h = ScalarConvert<Acctype, Dtype>::to((P == 1) ? g : 2*g*z);
	      if(weights)
	        h *= weight_k;
	      buffer[threadIdx.x] -= h;
	      gradInput_k[i] = h;
	    }
	    else
	      gradInput_k[i] = ScalarConvert<int, Dtype>::to(0);
	  }

	  // All partial sums must be visible before thread 0 reads them.
	  __syncthreads();

	  // reduce: thread 0 adds the per-thread partial sums serially
	  if (threadIdx.x == 0)
	  {
	    Acctype gradInput_target_k = 0;
	    for (int i=0; i<blockDim.x; i++)
	      gradInput_target_k += buffer[i];
	    gradInput_k[target_k] = ScalarConvert<Acctype, Dtype>::to(gradInput_target_k);
	  }
	}

	// NOTE(review): presumably the generic file holds the host-side wrappers that
	// launch the kernels above, and THCGenerateFloatTypes.h instantiates it once
	// per floating-point type - confirm against those headers.
	#include "generic/MultiMarginCriterion.cu"
	#include "THCGenerateFloatTypes.h"

	// Remove the macro defined above so it does not stay defined past this point.
	#undef MULTIMARGIN_THREADS