#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu"
#else
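// Forward pass. Each batch element is handled as a GEMM between the
// transposed weight matrix and the input, producing a column buffer that
// col2im then scatters into the output plane; the bias is added afterwards
// via a rank-1 GEMM against a buffer of ones.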
void THNN_(SpatialFullConvolution_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias,
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH)
{
int nInputPlane = THCTensor_(size)(state, weight, 0);
int nOutputPlane = THCTensor_(size)(state, weight, 1);
THCUNN_assertSameGPU_generic(state, 6, input, output, weight,
bias, columns, ones);
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
int batch = 1;
if (input->nDimension == 3) {
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
// Force batch
batch = 0;
THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
} else {
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
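// Output size of the transposed convolution: the inverse of the forward
// relation out = (in + 2*pad - kernel) / stride + 1, with adjW/adjH selecting
// among the sizes that a strided forward pass would collapse to the same shape.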
long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
// Batch size + input planes
long batchSize = input->size[0];
// Resize output
THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
// Resize temporary columns
THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
// Define a buffer of ones, for bias accumulation
// Note: this buffer can be shared with other modules, it only ever gets increased,
// and always contains ones.
if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
// Resize plane and fill with ones...
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
}
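// The ones tensor is consumed below as a flat vector of length
// outputHeight*outputWidth when broadcasting the bias over each output plane.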
// Helpers
THCTensor *input_n = THCTensor_(new)(state);
THCTensor *output_n = THCTensor_(new)(state);
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt ++) {
// Matrix multiply per output:
THCTensor_(select)(state, input_n, input, 0, elt);
THCTensor_(select)(state, output_n, output, 0, elt);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m = weight->size[1] * weight->size[2] * weight->size[3];
long n = columns->size[1];
long k = weight->size[0];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
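// Computes columns = weight^T * input_n in row-major terms:
// (nOutputPlane*kH*kW x inputHeight*inputWidth) =
// (nOutputPlane*kH*kW x nInputPlane) * (nInputPlane x inputHeight*inputWidth).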
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemm(
#elif defined(THC_REAL_IS_HALF)
THCudaBlas_Hgemm(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
'n', 't',
n, m, k,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, input_n), n,
THCTensor_(data)(state, weight), m,
ScalarConvert<int, real>::to(0),
THCTensor_(data)(state, columns), n
);
// Unpack columns back into output:
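// col2im accumulates the overlapping kernel contributions held in the column
// buffer into output_n; this scatter step is what realizes the transposed
// (full) convolution.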
col2im(
THCState_getCurrentStream(state),
THCTensor_(data)(state, columns),
nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
1, 1, THCTensor_(data)(state, output_n)
);
// Do Bias after:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m_ = nOutputPlane;
long n_ = outputHeight * outputWidth;
long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
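// Rank-1 update: output_n += bias * ones^T, i.e. each bias value is broadcast
// over its outputHeight x outputWidth plane.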
if (bias) {
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemm(
#elif defined(THC_REAL_IS_HALF)
THCudaBlas_Hgemm(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
't', 'n',
n_, m_, k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, ones), k_,
THCTensor_(data)(state, bias), k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, output_n), n_
);
}
}
// Free
THCTensor_(free)(state, input_n);
THCTensor_(free)(state, output_n);
// Resize back to the non-batched shapes
if (batch == 0) {
THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
}
}
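// Backward pass with respect to the input. In matrix terms this is
// gradInput_n = weight * im2col(gradOutput_n), i.e. an ordinary convolution
// of gradOutput with the same weights.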
void THNN_(SpatialFullConvolution_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *weight,
THCTensor *gradColumns,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH)
{
int nInputPlane = THCTensor_(size)(state, weight, 0);
int nOutputPlane = THCTensor_(size)(state, weight, 1);
THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
gradColumns, gradInput);
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
int batch = 1;
if (input->nDimension == 3) {
// Force batch
batch = 0;
THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
// Batch size + input planes
long batchSize = input->size[0];
// Resize gradInput
THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
// Resize temporary columns
THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
// Helpers
THCTensor *gradInput_n = THCTensor_(new)(state);
THCTensor *gradOutput_n = THCTensor_(new)(state);
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt ++) {
// Matrix multiply per sample:
THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
// Extract columns:
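// im2col gathers every kH x kW patch of gradOutput_n into a column of
// gradColumns, the inverse layout transform of the col2im used in the
// forward pass.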
im2col(
THCState_getCurrentStream(state),
THCTensor_(data)(state, gradOutput_n),
nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
1, 1, THCTensor_(data)(state, gradColumns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m = weight->size[0];
long n = gradColumns->size[1];
long k = weight->size[1] * weight->size[2] * weight->size[3];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
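// gradInput_n (nInputPlane x inputHeight*inputWidth) = weight * gradColumns,
// contracting over the nOutputPlane*kH*kW dimension.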
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemm(
#elif defined(THC_REAL_IS_HALF)
THCudaBlas_Hgemm(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
'n', 'n',
n, m, k,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradColumns), n,
THCTensor_(data)(state, weight), k,
ScalarConvert<int, real>::to(0),
THCTensor_(data)(state, gradInput_n), n
);
}
// Free
THCTensor_(free)(state, gradInput_n);
THCTensor_(free)(state, gradOutput_n);
// Resize back to the non-batched shapes
if (batch == 0) {
THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
}
}
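// Gradient accumulation for the parameters. gradWeight is accumulated (scaled
// by `scale`) as the product of the input with the im2col'd gradOutput;
// gradBias accumulates the spatial sums of gradOutput via the ones buffer.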
void THNN_(SpatialFullConvolution_accGradParameters)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias,
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH,
real scale)
{
int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
int nOutputPlane = THCTensor_(size)(state, gradWeight, 1);
THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight,
gradBias, columns, ones);
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
int batch = 1;
if (input->nDimension == 3) {
// Force batch
batch = 0;
THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
// Batch size + input planes
long batchSize = input->size[0];
// Define a buffer of ones, for bias accumulation
if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
// Resize plane and fill with ones...
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
}
// Resize temporary columns
THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
// Helpers
THCTensor *input_n = THCTensor_(new)(state);
THCTensor *gradOutput_n = THCTensor_(new)(state);
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt ++) {
// Matrix multiply per output:
THCTensor_(select)(state, input_n, input, 0, elt);
THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
// Extract columns:
im2col(
THCState_getCurrentStream(state),
THCTensor_(data)(state, gradOutput_n),
nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
1, 1, THCTensor_(data)(state, columns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long n = columns->size[0]; // nOutputPlane * kh * kw
long m = input_n->size[0]; // nInputPlane
long k = columns->size[1]; // inputHeight * inputWidth
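// Accumulates gradWeight += scale * input_n * columns^T in row-major terms:
// (nInputPlane x nOutputPlane*kH*kW) +=
// (nInputPlane x inputHeight*inputWidth) * (inputHeight*inputWidth x nOutputPlane*kH*kW).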
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemm(
#elif defined(THC_REAL_IS_HALF)
THCudaBlas_Hgemm(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
't', 'n',
n, m, k,
scale,
THCTensor_(data)(state, columns), k,
THCTensor_(data)(state, input_n), k,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradWeight), n
);
// Do Bias:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m_ = nOutputPlane;
long k_ = outputHeight * outputWidth;
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
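// gradBias += scale * gradOutput_n * ones: a matrix-vector product that sums
// each output plane over its outputHeight*outputWidth positions.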
if (gradBias) {
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemv(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemv(
#endif
state,
't',
k_, m_,
scale,
THCTensor_(data)(state, gradOutput_n), k_,
THCTensor_(data)(state, ones), 1,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), 1
);
#endif
#ifdef THC_REAL_IS_HALF
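// The half path has no gemv counterpart here, so the same reduction is
// expressed as a GEMM with a single output column.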
THCudaBlas_Hgemm(
state,
't', 'n',
m_, 1, k_,
scale,
THCTensor_(data)(state, gradOutput_n), k_,
THCTensor_(data)(state, ones), k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), m_
);
#endif
}
}
// Free
THCTensor_(free)(state, input_n);
THCTensor_(free)(state, gradOutput_n);
// Resize
if (batch == 0) {
THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
}
}
#endif