torch/lib/THCUNN/VolumetricAveragePooling.cu - platform/external/pytorch - Git at Google

 #include "THCUNN.h"
 #include "common.h"
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
 #include "THCHalf.h"
 #include "THCHalfAutoNumerics.cuh"
 #include "THCAtomics.cuh"

 template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateOutput(
   THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
   int kT, int kH, int kW, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow   = blockIdx.y * blockDim.y + threadIdx.y;
   int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
   int slice  = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

   if (oRow < output.getSize(2) && oCol < output.getSize(3))
   {
     Acctype sum = 0.0;

     int iColumn = oCol * dW;
     int iRow    = oRow    * dH;
     int iFrame  = oFrame  * dT;

     for (int frame = 0; frame < kT; ++frame)
     {
       if (iFrame + frame < input.getSize(1))
       {
         for (int row = 0; row < kH; ++row)
         {
           if (iRow + row < input.getSize(2))
           {
             for (int column = 0; column < kW; ++column)
             {
               if (iColumn + column < input.getSize(3))
               {
                 Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
                 sum += val;
               }
             }
           }
         }
       }
     }

     output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
   }
 }

 // Inner-most loop size (kW) passed as template parameter for
 // performance reasons.
 //
 template<int KERNEL_WIDTH, typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateOutput(
   THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
   int kT, int kH, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow   = blockIdx.y * blockDim.y + threadIdx.y;
   int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
   int slice  = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

   if (oRow < output.getSize(2) && oCol < output.getSize(3))
   {
     Acctype sum = 0.0;

     int iColumn = oCol * dW;
     int iRow    = oRow    * dH;
     int iFrame  = oFrame  * dT;

     for (int frame = 0; frame < kT; ++frame)
     {
       if (iFrame + frame < input.getSize(1))
       {
         for (int row = 0; row < kH; ++row)
         {
           if (iRow + row < input.getSize(2))
           {
             for (int column = 0; column < KERNEL_WIDTH; ++column)
             {
               if (iColumn + column < input.getSize(3))
               {
                 Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
                 sum += val;
               }
             }
           }
         }
       }
     }

     output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
   }
 }

 #define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW:                  \
   cuda_VolumetricAveragePooling_updateOutput<KW><<<grid, block>>>(      \
     cudaInput, cudaOutput, kT, kH, dT, dH, dW, normFactor, offsetZ); \
   break

 template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
   THCDeviceTensor<Dtype, 4> gradOutput,
   THCDeviceTensor<Dtype, 4> gradInput,
   int kT, int kH, int kW, Acctype normFactor, int offsetZ)
 {
   int iCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int iRow   = blockIdx.y * blockDim.y + threadIdx.y;
   int iFrame = (blockIdx.z + offsetZ) % gradInput.getSize(1); // input frame/time
   int slice  = (blockIdx.z + offsetZ) / gradInput.getSize(1); // input slice/feature

   // guard against over-tiled threads
   if (iRow < gradInput.getSize(2) && iCol < gradInput.getSize(3))
   {
     Acctype sum = 0.0;
     Dtype *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)]
       [max(0, iRow - kH + 1)][max(0, iCol - kW + 1)];
     int frameOffset = 0;
     for (int oFrame  = max(0, iFrame - kT + 1);
          oFrame < min(iFrame + 1, gradOutput.getSize(1));
          ++oFrame)
     {
       int rowOffset = frameOffset;
       for (int oRow = max(0, iRow - kH + 1);
            oRow < min(iRow + 1, gradOutput.getSize(2));
            ++oRow)
       {
         int colOffset = rowOffset;
         for (int oCol = max(0, iCol - kW + 1);
              oCol < min(iCol + 1, gradOutput.getSize(3));
              ++oCol)
         {
           sum += gOut[colOffset];
           ++colOffset;
         }
         rowOffset += gradOutput.getSize(3);
       }
       frameOffset += gradOutput.getSize(2) * gradOutput.getSize(3);
     }
     gradInput[slice][iFrame][iRow][iCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
   }
 }

 template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
   THCDeviceTensor<Dtype, 4> gradOutput,
   THCDeviceTensor<Dtype, 4> gradInput,
   int kT, int kH, int kW, int dT, int dH, int dW, int offsetZ)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow   = blockIdx.y * blockDim.y + threadIdx.y;
   int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time
   int slice  = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature

   // guard against over-tiled threads
   if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
   {
     Dtype val = ScalarConvert<Acctype, Dtype>::to(
       ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
     for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
     {
       for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
       {
         for (int iCol = oCol * dW; iCol < oCol * dW + kW; ++iCol)
         {
           atomicAdd(&gradInput[slice][iFrame][iRow][iCol], val);
         }
       }
     }
   }
 }

 template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateGradInput(
   THCDeviceTensor<Dtype, 4> gradOutput,
   THCDeviceTensor<Dtype, 4> gradInput,
   int kT, int kH, int kW,
   int dT, int dH, int dW, int offsetZ)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow   = blockIdx.y * blockDim.y + threadIdx.y;
   int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time
   int slice  = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature

   // guard against over-tiled threads
   if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
   {
     Dtype val = ScalarConvert<Acctype, Dtype>::to(
       ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
     for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
     {
       for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
       {
         for (int iCol = oCol * dW; iCol < oCol * dW + kW; ++iCol)
         {
           gradInput[slice][iFrame][iRow][iCol] = val;
         }
       }
     }
   }
 }

 #include "generic/VolumetricAveragePooling.cu"
 #include "THCGenerateFloatTypes.h"
	#include "THCUNN.h"
	#include "common.h"
	#include "THCDeviceTensor.cuh"
	#include "THCDeviceTensorUtils.cuh"
	#include "THCDeviceUtils.cuh"
	#include "THCHalf.h"
	#include "THCHalfAutoNumerics.cuh"
	#include "THCAtomics.cuh"

	template <typename Dtype, typename Acctype>
	__global__ void cuda_VolumetricAveragePooling_updateOutput(
	THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
	int kT, int kH, int kW, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
	{
	int oCol = blockIdx.x * blockDim.x + threadIdx.x;
	int oRow = blockIdx.y * blockDim.y + threadIdx.y;
	int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
	int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

	if (oRow < output.getSize(2) && oCol < output.getSize(3))
	{
	Acctype sum = 0.0;

	int iColumn = oCol * dW;
	int iRow = oRow * dH;
	int iFrame = oFrame * dT;

	for (int frame = 0; frame < kT; ++frame)
	{
	if (iFrame + frame < input.getSize(1))
	{
	for (int row = 0; row < kH; ++row)
	{
	if (iRow + row < input.getSize(2))
	{
	for (int column = 0; column < kW; ++column)
	{
	if (iColumn + column < input.getSize(3))
	{
	Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
	sum += val;
	}
	}
	}
	}
	}
	}

	output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
	}
	}

	// Inner-most loop size (kW) passed as template parameter for
	// performance reasons.
	//
	template<int KERNEL_WIDTH, typename Dtype, typename Acctype>
	__global__ void cuda_VolumetricAveragePooling_updateOutput(
	THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
	int kT, int kH, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
	{
	int oCol = blockIdx.x * blockDim.x + threadIdx.x;
	int oRow = blockIdx.y * blockDim.y + threadIdx.y;
	int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
	int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

	if (oRow < output.getSize(2) && oCol < output.getSize(3))
	{
	Acctype sum = 0.0;

	int iColumn = oCol * dW;
	int iRow = oRow * dH;
	int iFrame = oFrame * dT;

	for (int frame = 0; frame < kT; ++frame)
	{
	if (iFrame + frame < input.getSize(1))
	{
	for (int row = 0; row < kH; ++row)
	{
	if (iRow + row < input.getSize(2))
	{
	for (int column = 0; column < KERNEL_WIDTH; ++column)
	{
	if (iColumn + column < input.getSize(3))
	{
	Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
	sum += val;
	}
	}
	}
	}
	}
	}

	output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
	}
	}

	#define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \
	cuda_VolumetricAveragePooling_updateOutput<KW><<<grid, block>>>( \
	cudaInput, cudaOutput, kT, kH, dT, dH, dW, normFactor, offsetZ); \
	break

	template <typename Dtype, typename Acctype>
	__global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
	THCDeviceTensor<Dtype, 4> gradOutput,
	THCDeviceTensor<Dtype, 4> gradInput,
	int kT, int kH, int kW, Acctype normFactor, int offsetZ)
	{
	int iCol = blockIdx.x * blockDim.x + threadIdx.x;
	int iRow = blockIdx.y * blockDim.y + threadIdx.y;
	int iFrame = (blockIdx.z + offsetZ) % gradInput.getSize(1); // input frame/time
	int slice = (blockIdx.z + offsetZ) / gradInput.getSize(1); // input slice/feature

	// guard against over-tiled threads
	if (iRow < gradInput.getSize(2) && iCol < gradInput.getSize(3))
	{
	Acctype sum = 0.0;
	Dtype *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)]
	[max(0, iRow - kH + 1)][max(0, iCol - kW + 1)];
	int frameOffset = 0;
	for (int oFrame = max(0, iFrame - kT + 1);
	oFrame < min(iFrame + 1, gradOutput.getSize(1));
	++oFrame)
	{
	int rowOffset = frameOffset;
	for (int oRow = max(0, iRow - kH + 1);
	oRow < min(iRow + 1, gradOutput.getSize(2));
	++oRow)
	{
	int colOffset = rowOffset;
	for (int oCol = max(0, iCol - kW + 1);
	oCol < min(iCol + 1, gradOutput.getSize(3));
	++oCol)
	{
	sum += gOut[colOffset];
	++colOffset;
	}
	rowOffset += gradOutput.getSize(3);
	}
	frameOffset += gradOutput.getSize(2) * gradOutput.getSize(3);
	}
	gradInput[slice][iFrame][iRow][iCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
	}
	}

	template <typename Dtype, typename Acctype>
	__global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
	THCDeviceTensor<Dtype, 4> gradOutput,
	THCDeviceTensor<Dtype, 4> gradInput,
	int kT, int kH, int kW, int dT, int dH, int dW, int offsetZ)
	{
	int oCol = blockIdx.x * blockDim.x + threadIdx.x;
	int oRow = blockIdx.y * blockDim.y + threadIdx.y;
	int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time
	int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature

	// guard against over-tiled threads
	if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
	{
	Dtype val = ScalarConvert<Acctype, Dtype>::to(
	ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
	for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
	{
	for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
	{
	for (int iCol = oCol * dW; iCol < oCol * dW + kW; ++iCol)
	{
	atomicAdd(&gradInput[slice][iFrame][iRow][iCol], val);
	}
	}
	}
	}
	}

	template <typename Dtype, typename Acctype>
	__global__ void cuda_VolumetricAveragePooling_updateGradInput(
	THCDeviceTensor<Dtype, 4> gradOutput,
	THCDeviceTensor<Dtype, 4> gradInput,
	int kT, int kH, int kW,
	int dT, int dH, int dW, int offsetZ)
	{
	int oCol = blockIdx.x * blockDim.x + threadIdx.x;
	int oRow = blockIdx.y * blockDim.y + threadIdx.y;
	int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time
	int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature

	// guard against over-tiled threads
	if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
	{
	Dtype val = ScalarConvert<Acctype, Dtype>::to(
	ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
	for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
	{
	for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
	{
	for (int iCol = oCol * dW; iCol < oCol * dW + kW; ++iCol)
	{
	gradInput[slice][iFrame][iRow][iCol] = val;
	}
	}
	}
	}
	}

	#include "generic/VolumetricAveragePooling.cu"
	#include "THCGenerateFloatTypes.h"