blob: fb080dac698a676cf58ecdaac3385db4fc3c76b6 [file] [log] [blame]
#include "THCUNN.h"
#include "common.h"
template <typename Dtype>
__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const Dtype* bottom_mask,
const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) {
CUDA_KERNEL_LOOP(index, nthreads) { //index here indices the input pixels
int c = (index / iwidth / iheight) % channels;
int n = index / iwidth / iheight / channels;
top_data += (n*channels + c)*oheight*owidth;
int maxind = bottom_mask[index]-1;
top_data[maxind] = bottom_data[index];
}
}
template <typename Dtype>
__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const Dtype* bottom_mask,
const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
int c = (index / iwidth / iheight) % channels;
int n = index / iwidth / iheight / channels;
top_diff += (n*channels + c)*oheight*owidth;
int maxind = bottom_mask[index]-1;
bottom_diff[index] = top_diff[maxind];
}
}
void THNN_CudaSpatialMaxUnpooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int owidth, int oheight)
{
THCUNN_assertSameGPU(state, 3, input, output, indices);
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
long nInputCols, nInputRows, nInputPlane, batchSize;
if (input->nDimension == 3) {
nInputCols = input->size[2];
nInputRows = input->size[1];
nInputPlane = input->size[0];
batchSize = 1;
}
else
{
nInputCols = input->size[3];
nInputRows = input->size[2];
nInputPlane = input->size[1];
batchSize = input->size[0];
}
input = THCudaTensor_newContiguous(state, input);
indices = THCudaTensor_newContiguous(state, indices);
THCudaTensor_resize4d(state, output, batchSize, nInputPlane, oheight, owidth);
THCudaTensor_zero(state, output);
int count = THCudaTensor_nElement(state, input);
MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
(count, THCudaTensor_data(state, input), THCudaTensor_data(state, indices),
batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, output));
if(input->nDimension == 3)
THCudaTensor_resize3d(state, output, nInputPlane, oheight, owidth);
THCudaTensor_free(state, input);
// check for errors
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in SpatialMaxUnpooling.updateOutput: %s\n", cudaGetErrorString(err));
THError("aborting");
}
}
void THNN_CudaSpatialMaxUnpooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int owidth, int oheight)
{
THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);
long nInputCols, nInputRows, nInputPlane, batchSize;
if (input->nDimension == 3) {
nInputCols = input->size[2];
nInputRows = input->size[1];
nInputPlane = input->size[0];
batchSize = 1;
}
else
{
nInputCols = input->size[3];
nInputRows = input->size[2];
nInputPlane = input->size[1];
batchSize = input->size[0];
}
input = THCudaTensor_newContiguous(state, input);
indices = THCudaTensor_newContiguous(state, indices);
gradOutput = THCudaTensor_newContiguous(state, gradOutput);
THCudaTensor_resizeAs(state, gradInput, input);
int count = THCudaTensor_nElement(state, input);
MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
(count, THCudaTensor_data(state, gradOutput), THCudaTensor_data(state, indices),
batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, gradInput));
// check for errors
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in SpatialMaxUnpooling.updateGradInput: %s\n", cudaGetErrorString(err));
THError("aborting");
}
// clean
THCudaTensor_free(state, input);
THCudaTensor_free(state, gradOutput);
}