#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SpatialDilatedMaxPooling.cu"
#else
#include "../common.h"
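
// Forward pass of dilated 2D max pooling: every output element is the maximum
// over a kW x kH window of the (implicitly padded) input, with dilationW/dilationH
// spacing between window taps; the location of that maximum is recorded in
// `indices` so the backward pass can route gradients.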
void THNN_(SpatialDilatedMaxPooling_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           THCIndexTensor *indices,
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
           int dilationW, int dilationH,
           bool ceil_mode)
{
  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  }
  else
  {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
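
  // Output size follows the usual dilated-pooling formula,
  //   out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1,
  // rounded up in ceil_mode and down otherwise.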
  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
  }
  else {
    nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
  }

  if (nOutputCols < 1 || nOutputRows < 1)
    THError("Given input size: (%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ld). Output size is too small",
            nInputPlane, nInputRows, nInputCols, nInputPlane, nOutputRows, nOutputCols);

  if (padW || padH)
  {
    // ensure that the last pooling starts inside the image
    if ((nOutputRows - 1)*dH >= nInputRows + padH)
      --nOutputRows;
    if ((nOutputCols - 1)*dW >= nInputCols + padW)
      --nOutputCols;
  }
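
  // Make the input contiguous and size the output and indices buffers
  // (always 4D here; squeezed back to 3D after the kernel if needed).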
  input = THCTensor_(newContiguous)(state, input);
  real* input_data = THCTensor_(data)(state, input);

  THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
  THCUNN_resizeAs_indices(state, indices, output);

  THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices);
  real* output_data = THCTensor_(data)(state, output);

  int count = THCTensor_(nElement)(state, output);
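
  // Launch one CUDA thread per output element; each thread scans its pooling
  // window, writes the maximum to output_data and its argmax to indices_data.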
  MaxPoolForward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
      (count, input_data,
       batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
       kH, kW, dH, dW, padH, padW, dilationH, dilationW,
       output_data, indices_data);
  THCudaCheck(cudaGetLastError());

  if(input->nDimension == 3)
    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);

  THCTensor_(free)(state, input);
}
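
// Backward pass: each gradOutput value is routed back to the input position
// recorded in `indices` by the forward pass, and gradInput accumulates it.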
void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCIndexTensor *indices,
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
           int dilationW, int dilationH,
           bool ceil_mode)
{
  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, indices, gradInput);

  input = THCTensor_(newContiguous)(state, input);
  gradOutput = THCTensor_(newContiguous)(state, gradOutput);

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  }
  else
  {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }
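
  // Recompute the output shape with the same formula as updateOutput so the
  // kernel indexes gradOutput and indices consistently with the forward pass.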
  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
  }
  else {
    nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
  }

  if (nOutputCols < 1 || nOutputRows < 1)
    THError("Given input size: (%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ld). Output size is too small",
            nInputPlane, nInputRows, nInputCols, nInputPlane, nOutputRows, nOutputCols);

  if (padW || padH)
  {
    // ensure that the last pooling starts inside the image, as in updateOutput
    if ((nOutputRows - 1)*dH >= nInputRows + padH)
      --nOutputRows;
    if ((nOutputCols - 1)*dW >= nInputCols + padW)
      --nOutputCols;
  }
  THCTensor_(resizeAs)(state, gradInput, input);

  int count = THCTensor_(nElement)(state, input);
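
  // Launch one CUDA thread per gradInput element; each thread sums the gradient
  // from every output location whose saved argmax points at that input position.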
  MaxPoolBackward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
      (count,
       THCTensor_(data)(state, gradOutput),
       THCIndexTensor_(data)(state, indices),
       batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
       kH, kW, dH, dW, padH, padW, dilationH, dilationW,
       THCTensor_(data)(state, gradInput));
  THCudaCheck(cudaGetLastError());

  // clean up the temporaries created by newContiguous
  THCTensor_(free)(state, input);
  THCTensor_(free)(state, gradOutput);
}
#endif