blob: b91896227cc9f894f6f5aabccc4060ba7c8733c4 [file] [log] [blame]
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SpatialSubSampling.cu"
#else
#include "../common.h"
static inline void THNN_(SpatialSubSampling_shapeCheck)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *weight,
int kW, int kH) {
THCUNN_argCheck(state, input->nDimension == 3 || input->nDimension == 4, 2, input,
"3D or 4D input tensor expected but got: %s");
int nInputPlane = THCTensor_(size)(state, weight, 0);
int dimc = 2;
int dimr = 1;
int dimp = 0;
if (input->nDimension == 4) {
dimc++;
dimr++;
dimp++;
}
long nInputCols = input->size[dimc];
long nInputRows = input->size[dimr];
THArgCheck(input->size[dimp] == nInputPlane, 2, "invalid number of input planes");
THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
}
void THNN_(SpatialSubSampling_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias,
int kW, int kH,
int dW, int dH)
{
real *weight_data = THCTensor_(data)(state, weight);
real *bias_data = THCTensor_(data)(state, bias);
real *output_data;
real *input_data;
int nInputPlane = THCTensor_(size)(state, weight, 0);
THCUNN_assertSameGPU(state, 4, input, output, weight, bias);
THNN_(SpatialSubSampling_shapeCheck)(state, input, NULL, weight, kW, kH);
if (input->nDimension == 3) {
long nInputCols = input->size[2];
long nInputRows = input->size[1];
long nOutputCols = (nInputCols - kW) / dW + 1;
long nOutputRows = (nInputRows - kH) / dH + 1;
input = THCTensor_(newContiguous)(state, input);
input_data = THCTensor_(data)(state, input);
THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
output_data = THCTensor_(data)(state, output);
// cuda blocks & threads:
int yblocks = (int)(16L / nInputPlane);
yblocks = yblocks < 1 ? 1 : yblocks;
dim3 blocks(nInputPlane,yblocks);
dim3 threads(32,8);
// run subsample kernel
subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
input_data, output_data, weight_data, bias_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
THCudaCheck(cudaGetLastError());
} else {
long nInputCols = input->size[3];
long nInputRows = input->size[2];
long nbatch = input->size[0];
long nOutputCols = (nInputCols - kW) / dW + 1;
long nOutputRows = (nInputRows - kH) / dH + 1;
input = THCTensor_(newContiguous)(state, input);
input_data = THCTensor_(data)(state, input);
THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
output_data = THCTensor_(data)(state, output);
// cuda blocks & threads:
int yblocks = (int)(16L / nInputPlane);
yblocks = yblocks < 1 ? 1 : yblocks;
dim3 blocks(nInputPlane*nbatch,yblocks);
dim3 threads(32,8);
// run subsample kernel
subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
input_data, output_data, weight_data, bias_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
THCudaCheck(cudaGetLastError());
}
// clean
THCTensor_(free)(state, input);
}
void THNN_(SpatialSubSampling_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *weight,
int kW, int kH,
int dW, int dH)
{
THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput);
THNN_(SpatialSubSampling_shapeCheck)(state, input, gradOutput, weight, kW, kH);
int nInputPlane = THCTensor_(size)(state, weight, 0);
if (input->nDimension == 3) {
long nInputCols = input->size[2];
long nInputRows = input->size[1];
real *weight_data = THCTensor_(data)(state, weight);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
real *gradOutput_data = THCTensor_(data)(state, gradOutput);
real *gradInput_data;
THCTensor_(resizeAs)(state, gradInput, input);
THCTensor_(zero)(state, gradInput);
gradInput_data = THCTensor_(data)(state, gradInput);
// cuda blocks & threads:
int yblocks = (int)(16L / nInputPlane);
yblocks = yblocks < 1 ? 1 : yblocks;
dim3 blocks(nInputPlane,yblocks);
dim3 threads(32,8);
// run updateGradInput kernel
if (kH <= dH && kW <= dW) {
subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
gradInput_data, gradOutput_data, weight_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
} else {
subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
gradInput_data, gradOutput_data, weight_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
}
THCudaCheck(cudaGetLastError());
} else {
long nInputCols = input->size[3];
long nInputRows = input->size[2];
long nbatch = input->size[0];
real *weight_data = THCTensor_(data)(state, weight);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
real *gradOutput_data = THCTensor_(data)(state, gradOutput);
real *gradInput_data;
THCTensor_(resizeAs)(state, gradInput, input);
THCTensor_(zero)(state, gradInput);
gradInput_data = THCTensor_(data)(state, gradInput);
// cuda blocks & threads:
int yblocks = (int)(16L / nInputPlane);
yblocks = yblocks < 1 ? 1 : yblocks;
dim3 blocks(nInputPlane*nbatch,yblocks);
dim3 threads(32,8);
// run updateGradInput kernel
if (kH <= dH && kW <= dW) {
subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
gradInput_data, gradOutput_data, weight_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
} else {
subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
gradInput_data, gradOutput_data, weight_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
}
THCudaCheck(cudaGetLastError());
}
THCTensor_(free)(state, gradOutput);
}
void THNN_(SpatialSubSampling_accGradParameters)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias,
int kW, int kH,
int dW, int dH,
float scale)
{
THCUNN_assertSameGPU(state, 4, input, gradOutput, gradWeight, gradBias);
THNN_(SpatialSubSampling_shapeCheck)(state, input, gradOutput, gradWeight, kW, kH);
int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
if (input->nDimension == 3) {
long nInputCols = input->size[2];
long nInputRows = input->size[1];
real *gradWeight_data = THCTensor_(data)(state, gradWeight);
real *gradBias_data = THCTensor_(data)(state, gradBias);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
real *gradOutput_data = THCTensor_(data)(state, gradOutput);
real *input_data;
input = THCTensor_(newContiguous)(state, input);
input_data = THCTensor_(data)(state, input);
// cuda blocks & threads:
dim3 blocks(nInputPlane);
dim3 threads(32,8);
// run gradweight kernel
subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
input_data, gradOutput_data, gradWeight_data, gradBias_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
THCudaCheck(cudaGetLastError());
} else {
long nInputCols = input->size[3];
long nInputRows = input->size[2];
long nbatch = input->size[0];
real *gradWeight_data = THCTensor_(data)(state, gradWeight);
real *gradBias_data = THCTensor_(data)(state, gradBias);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
real *gradOutput_data = THCTensor_(data)(state, gradOutput);
real *input_data;
input = THCTensor_(newContiguous)(state, input);
input_data = THCTensor_(data)(state, input);
// cuda blocks & threads:
dim3 blocks(nInputPlane);
dim3 threads(32,8);
// run gradweight kernel
long sl;
for (sl=0; sl<nbatch; sl++) {
subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
input_data + sl*input->stride[0],
gradOutput_data + sl*gradOutput->stride[0],
gradWeight_data, gradBias_data,
nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
}
THCudaCheck(cudaGetLastError());
}
// clean
THCTensor_(free)(state, input);
THCTensor_(free)(state, gradOutput);
}
#endif