torch/lib/THCUNN/VolumetricDilatedMaxPooling.cu - platform/external/pytorch - Git at Google

 #include "THCUNN.h"
 #include "common.h"
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
 #include "THCHalf.h"
 #include "THCHalfAutoNumerics.cuh"
 #include "THCAtomics.cuh"

 #include <cfloat>

 template <typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
   THCDeviceTensor<Dtype, 4> input,
   THCDeviceTensor<THCIndex_t, 4> indices,
   THCDeviceTensor<Dtype, 4> output,
   int kT, int kH, int kW,
   int dT, int dH, int dW,
   int padT, int padH, int padW,
   int dilationT, int dilationH, int dilationW,
   int offsetZ)
 {
   int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow    = blockIdx.y * blockDim.y + threadIdx.y;
   int oFrame  = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
   int slice   = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

   if (oRow < output.getSize(2) && oColumn < output.getSize(3))
   {
     int iColumn = oColumn * dW - padW;
     int iRow    = oRow    * dH - padH;
     int iFrame  = oFrame  * dT - padT;

     int maxColumn = 0;
     int maxRow = 0;
     int maxFrame = 0;

     Dtype max = THCNumerics<Dtype>::min();

     for (int frame = 0; frame < kT; ++frame)
     {
       if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
       {
         for (int row = 0; row < kH; ++row)
         {
           if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
           {
             for (int column = 0; column < kW; ++column)
             {
               if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
               {
                 Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];

                 if (max < val)
                 {
                   max = val;
                   maxColumn = column;
                   maxRow    = row;
                   maxFrame  = frame;
                 }
               }
             }
           }
         }
       }
     }

     output[slice][oFrame][oRow][oColumn] = max;
     THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
     ((unsigned char*)(idx))[0] = maxFrame;
     ((unsigned char*)(idx))[1] = maxRow;
     ((unsigned char*)(idx))[2] = maxColumn;
     ((unsigned char*)(idx))[3] = 0;
   }
 }

 template <int KERNEL_WIDTH, typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
   THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices,
   THCDeviceTensor<Dtype, 4> output,
   int kT, int kH,
   int dT, int dH, int dW,
   int padT, int padH, int padW,
   int dilationT, int dilationH, int dilationW,
   int offsetZ)
 {
   int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow    = blockIdx.y * blockDim.y + threadIdx.y;
   int oFrame  = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
   int slice   = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

   if (oRow < output.getSize(2) && oColumn < output.getSize(3))
   {
     int iColumn = oColumn * dW - padW;
     int iRow    = oRow    * dH - padH;
     int iFrame  = oFrame  * dT - padT;

     int maxColumn = 0;
     int maxRow = 0;
     int maxFrame;

     Dtype max = THCNumerics<Dtype>::min();

     for (int frame = 0; frame < kT; ++frame)
     {
       if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
       {
         for (int row = 0; row < kH; ++row)
         {
           if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
           {
             for (int column = 0; column < KERNEL_WIDTH; ++column)
             {
               if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
               {
                 Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];

                 if (max < val)
                 {
                   max = val;
                   maxColumn = column;
                   maxRow    = row;
                   maxFrame  = frame;
                 }
               }
             }
           }
         }
       }
     }

     output[slice][oFrame][oRow][oColumn] = max;
     THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
     ((unsigned char*)(idx))[0] = maxFrame;
     ((unsigned char*)(idx))[1] = maxRow;
     ((unsigned char*)(idx))[2] = maxColumn;
     ((unsigned char*)(idx))[3] = 0;
   }
 }

 template <typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
   THCDeviceTensor<Dtype, 4> gradOutput,
   THCDeviceTensor<THCIndex_t, 4> indices,
   THCDeviceTensor<Dtype, 4> gradInput,
   int dT, int dH, int dW,
   int padT, int padH, int padW,
   int dilationT, int dilationH, int dilationW,
   int offsetZ)
 {
   int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow    = blockIdx.y * blockDim.y + threadIdx.y;
   int oFrame  = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // output frame/time
   int slice   = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // output slice/feature

   if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3))
   {
     THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
     int iFrame  = ((unsigned char*)(idx))[0] * dilationT + oFrame  * dT - padT;
     int iRow    = ((unsigned char*)(idx))[1] * dilationH + oRow    * dH - padH;
     int iColumn = ((unsigned char*)(idx))[2] * dilationW + oColumn * dW - padW;
     atomicAdd(&gradInput[slice][iFrame][iRow][iColumn],
               gradOutput[slice][oFrame][oRow][oColumn]);
   }
 }

 #include "generic/VolumetricDilatedMaxPooling.cu"
 #include "THCGenerateFloatTypes.h"
	#include "THCUNN.h"
	#include "common.h"
	#include "THCDeviceTensor.cuh"
	#include "THCDeviceTensorUtils.cuh"
	#include "THCDeviceUtils.cuh"
	#include "THCHalf.h"
	#include "THCHalfAutoNumerics.cuh"
	#include "THCAtomics.cuh"

	#include <cfloat>

	template <typename Dtype>
	__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
	THCDeviceTensor<Dtype, 4> input,
	THCDeviceTensor<THCIndex_t, 4> indices,
	THCDeviceTensor<Dtype, 4> output,
	int kT, int kH, int kW,
	int dT, int dH, int dW,
	int padT, int padH, int padW,
	int dilationT, int dilationH, int dilationW,
	int offsetZ)
	{
	int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
	int oRow = blockIdx.y * blockDim.y + threadIdx.y;
	int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
	int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

	if (oRow < output.getSize(2) && oColumn < output.getSize(3))
	{
	int iColumn = oColumn * dW - padW;
	int iRow = oRow * dH - padH;
	int iFrame = oFrame * dT - padT;

	int maxColumn = 0;
	int maxRow = 0;
	int maxFrame = 0;

	Dtype max = THCNumerics<Dtype>::min();

	for (int frame = 0; frame < kT; ++frame)
	{
	if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
	{
	for (int row = 0; row < kH; ++row)
	{
	if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
	{
	for (int column = 0; column < kW; ++column)
	{
	if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
	{
	Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];

	if (max < val)
	{
	max = val;
	maxColumn = column;
	maxRow = row;
	maxFrame = frame;
	}
	}
	}
	}
	}
	}
	}

	output[slice][oFrame][oRow][oColumn] = max;
	THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
	((unsigned char*)(idx))[0] = maxFrame;
	((unsigned char*)(idx))[1] = maxRow;
	((unsigned char*)(idx))[2] = maxColumn;
	((unsigned char*)(idx))[3] = 0;
	}
	}

	template <int KERNEL_WIDTH, typename Dtype>
	__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
	THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices,
	THCDeviceTensor<Dtype, 4> output,
	int kT, int kH,
	int dT, int dH, int dW,
	int padT, int padH, int padW,
	int dilationT, int dilationH, int dilationW,
	int offsetZ)
	{
	int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
	int oRow = blockIdx.y * blockDim.y + threadIdx.y;
	int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
	int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature

	if (oRow < output.getSize(2) && oColumn < output.getSize(3))
	{
	int iColumn = oColumn * dW - padW;
	int iRow = oRow * dH - padH;
	int iFrame = oFrame * dT - padT;

	int maxColumn = 0;
	int maxRow = 0;
	int maxFrame;

	Dtype max = THCNumerics<Dtype>::min();

	for (int frame = 0; frame < kT; ++frame)
	{
	if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
	{
	for (int row = 0; row < kH; ++row)
	{
	if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
	{
	for (int column = 0; column < KERNEL_WIDTH; ++column)
	{
	if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
	{
	Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];

	if (max < val)
	{
	max = val;
	maxColumn = column;
	maxRow = row;
	maxFrame = frame;
	}
	}
	}
	}
	}
	}
	}

	output[slice][oFrame][oRow][oColumn] = max;
	THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
	((unsigned char*)(idx))[0] = maxFrame;
	((unsigned char*)(idx))[1] = maxRow;
	((unsigned char*)(idx))[2] = maxColumn;
	((unsigned char*)(idx))[3] = 0;
	}
	}

	template <typename Dtype>
	__global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
	THCDeviceTensor<Dtype, 4> gradOutput,
	THCDeviceTensor<THCIndex_t, 4> indices,
	THCDeviceTensor<Dtype, 4> gradInput,
	int dT, int dH, int dW,
	int padT, int padH, int padW,
	int dilationT, int dilationH, int dilationW,
	int offsetZ)
	{
	int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
	int oRow = blockIdx.y * blockDim.y + threadIdx.y;
	int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // output frame/time
	int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // output slice/feature

	if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3))
	{
	THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
	int iFrame = ((unsigned char)(idx))[0] dilationT + oFrame * dT - padT;
	int iRow = ((unsigned char)(idx))[1] dilationH + oRow * dH - padH;
	int iColumn = ((unsigned char)(idx))[2] dilationW + oColumn * dW - padW;
	atomicAdd(&gradInput[slice][iFrame][iRow][iColumn],
	gradOutput[slice][oFrame][oRow][oColumn]);
	}
	}

	#include "generic/VolumetricDilatedMaxPooling.cu"
	#include "THCGenerateFloatTypes.h"