blob: 0cff32c975799d8e503ea42f307f1df5d617a8f5 [file] [log] [blame]
#include "THCUNN.h"
#include "common.h"
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCAtomics.cuh"
#include <cfloat>
template <typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
THCDeviceTensor<Dtype, 4> input,
THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> output,
int kT, int kH, int kW,
int dT, int dH, int dW,
int padT, int padH, int padW,
int dilationT, int dilationH, int dilationW,
int offsetZ)
{
int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
int oRow = blockIdx.y * blockDim.y + threadIdx.y;
int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature
if (oRow < output.getSize(2) && oColumn < output.getSize(3))
{
int iColumn = oColumn * dW - padW;
int iRow = oRow * dH - padH;
int iFrame = oFrame * dT - padT;
int maxColumn = 0;
int maxRow = 0;
int maxFrame = 0;
Dtype max = THCNumerics<Dtype>::min();
for (int frame = 0; frame < kT; ++frame)
{
if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
{
for (int row = 0; row < kH; ++row)
{
if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
{
for (int column = 0; column < kW; ++column)
{
if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
{
Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
if (max < val)
{
max = val;
maxColumn = column;
maxRow = row;
maxFrame = frame;
}
}
}
}
}
}
}
output[slice][oFrame][oRow][oColumn] = max;
THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
((unsigned char*)(idx))[0] = maxFrame;
((unsigned char*)(idx))[1] = maxRow;
((unsigned char*)(idx))[2] = maxColumn;
((unsigned char*)(idx))[3] = 0;
}
}
template <int KERNEL_WIDTH, typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> output,
int kT, int kH,
int dT, int dH, int dW,
int padT, int padH, int padW,
int dilationT, int dilationH, int dilationW,
int offsetZ)
{
int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
int oRow = blockIdx.y * blockDim.y + threadIdx.y;
int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time
int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature
if (oRow < output.getSize(2) && oColumn < output.getSize(3))
{
int iColumn = oColumn * dW - padW;
int iRow = oRow * dH - padH;
int iFrame = oFrame * dT - padT;
int maxColumn = 0;
int maxRow = 0;
int maxFrame;
Dtype max = THCNumerics<Dtype>::min();
for (int frame = 0; frame < kT; ++frame)
{
if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
{
for (int row = 0; row < kH; ++row)
{
if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
{
for (int column = 0; column < KERNEL_WIDTH; ++column)
{
if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
{
Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
if (max < val)
{
max = val;
maxColumn = column;
maxRow = row;
maxFrame = frame;
}
}
}
}
}
}
}
output[slice][oFrame][oRow][oColumn] = max;
THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
((unsigned char*)(idx))[0] = maxFrame;
((unsigned char*)(idx))[1] = maxRow;
((unsigned char*)(idx))[2] = maxColumn;
((unsigned char*)(idx))[3] = 0;
}
}
template <typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
THCDeviceTensor<Dtype, 4> gradOutput,
THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> gradInput,
int dT, int dH, int dW,
int padT, int padH, int padW,
int dilationT, int dilationH, int dilationW,
int offsetZ)
{
int oColumn = blockIdx.x * blockDim.x + threadIdx.x;
int oRow = blockIdx.y * blockDim.y + threadIdx.y;
int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // output frame/time
int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // output slice/feature
if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3))
{
THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
int iFrame = ((unsigned char*)(idx))[0] * dilationT + oFrame * dT - padT;
int iRow = ((unsigned char*)(idx))[1] * dilationH + oRow * dH - padH;
int iColumn = ((unsigned char*)(idx))[2] * dilationW + oColumn * dW - padW;
atomicAdd(&gradInput[slice][iFrame][iRow][iColumn],
gradOutput[slice][oFrame][oRow][oColumn]);
}
}
#include "generic/VolumetricDilatedMaxPooling.cu"
#include "THCGenerateFloatTypes.h"