blob: d807abcdfeb33619940287067226af0bfdf00897 [file] [log] [blame]
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/TemporalMaxPooling.cu"
#else
void THNN_(TemporalMaxPooling_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCIndexTensor *indices,
int kW, int dW) {
int dimT = 0; // Temporal dimension
int dimF = 1; // Feature dimension
int batch = 1;
int input_w;
int input_n;
int output_w;
int nthreads;
real *input_data;
real *output_data;
THCIndex_t *indices_data;
THCUNN_assertSameGPU(state, 3, input, output, indices);
THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
if (input->nDimension == 3)
{
dimT = 1;
dimF = 2;
batch = input->size[0];
}
THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
input = THCTensor_(newContiguous)(state, input);
input_w = input->size[dimT];
input_n = input->size[dimF];
output_w = (input_w - kW) / dW + 1;
if (input->nDimension == 2)
{
THCTensor_(resize2d)(state, output, output_w, input->size[dimF]);
THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]);
}
else
{
THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]);
THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]);
}
input_data = THCTensor_(data)(state, input);
output_data = THCTensor_(data)(state, output);
indices_data = THCIndexTensor_(data)(state, indices);
dim3 blocks(batch);
nthreads = (output_w / 32) * 32;
if (output_w % 32 > 0) {
nthreads += 32;
}
if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
blocks.y += 1;
}
nthreads = TEMPORAL_MAX_POOLING_THREADS;
}
dim3 threads(nthreads);
cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW);
THCudaCheck(cudaGetLastError());
THCTensor_(free)(state, input);
}
void THNN_(TemporalMaxPooling_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCIndexTensor *indices,
int kW, int dW) {
int dimT = 0; // Temporal dimension
int dimF = 1; // Feature dimension
int batch = 1;
int input_w;
int input_n;
int output_w;
int nthreads;
real *gradInput_data;
real *gradOutput_data;
THCIndex_t *indices_data;
THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, indices);
THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
THCTensor_(resizeAs)(state, gradInput, input);
THCTensor_(zero)(state, gradInput);
if (input->nDimension == 3)
{
dimT = 1;
dimF = 2;
batch = input->size[0];
}
THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
input_w = input->size[dimT];
input_n = input->size[dimF];
output_w = (input_w - kW) / dW + 1;
gradInput_data = THCTensor_(data)(state, gradInput);
gradOutput_data = THCTensor_(data)(state, gradOutput);
indices_data = THCIndexTensor_(data)(state, indices);
dim3 blocks(batch);
nthreads = (output_w / 32) * 32;
if (output_w % 32 > 0) {
nthreads += 32;
}
if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
blocks.y += 1;
}
nthreads = TEMPORAL_MAX_POOLING_THREADS;
}
dim3 threads(nthreads);
if (kW <= dW) {
cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
} else {
cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
}
THCudaCheck(cudaGetLastError());
THCTensor_(free)(state, gradOutput);
}
#endif