#include "THCUNN.h"
#include "common.h"
#include "vol2col.h"
void THNN_CudaVolumetricDilatedConvolution_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *columns,
THCudaTensor *ones,
int kT, int kW, int kH,
int dT, int dW, int dH,
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH) {
THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
if (bias) {
THCUNN_assertSameGPU(state, 2, weight, bias);
}
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
// Params:
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int batch = 1;
if (input->nDimension == 4) {
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
// Force batch
batch = 0;
THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
} else {
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
}
long inputDepth = input->size[2];
long inputHeight = input->size[3];
long inputWidth = input->size[4];
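// Output size of a dilated convolution: the effective kernel extent along a
// dimension is dilation*(k-1)+1, so out = (in + 2*pad - (dilation*(k-1)+1)) / stride + 1
// (integer division = floor). E.g. in=16, k=3, dilation=2 gives extent 5;
// with pad=2, stride=1 the output is (16+4-5)/1+1 = 16.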
long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
THError("Given input size: (%dx%ldx%ldx%ld). Calculated output size: (%dx%ldx%ldx%ld). Output size is too small",
nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
// Batch size + input planes
long batchSize = input->size[0];
// Resize output
THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
// Resize temporary columns
THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
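// columns has one row per (input plane, kernel offset) pair and one column
// per output location, so a single GEMM against the flattened weight tensor
// computes every output location at once.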
// Define a buffer of ones, for bias accumulation
// Note: this buffer can be shared with other modules, it only ever gets increased,
// and always contains ones.
if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
// Resize plane and fill with ones...
THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
THCudaTensor_fill(state, ones, 1);
}
// Helpers
THCudaTensor *input_n = THCudaTensor_new(state);
THCudaTensor *output_n = THCudaTensor_new(state);
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt++) {
// Matrix multiply per output:
THCudaTensor_select(state, input_n, input, 0, elt);
THCudaTensor_select(state, output_n, output, 0, elt);
// Do Bias first:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m_ = nOutputPlane;
long n_ = outputDepth * outputHeight * outputWidth;
long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
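// Broadcast the bias as a rank-1 product: output_n (m_ x n_, row-major)
// = bias (m_ x 1) * ones^T (1 x n_), expressed below in cuBLAS
// column-major terms.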
if (bias) {
THCudaBlas_Sgemm(
state,
't', 'n',
n_, m_, k_,
1,
THCudaTensor_data(state, ones), k_,
THCudaTensor_data(state, bias), k_,
0,
THCudaTensor_data(state, output_n), n_
);
} else {
THCudaTensor_zero(state, output_n);
}
// Extract columns:
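// vol2col unrolls every (dilated) kT x kH x kW receptive field of input_n
// into one column of `columns`.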
vol2col(
THCState_getCurrentStream(state),
THCudaTensor_data(state, input_n),
nInputPlane, inputDepth, inputHeight, inputWidth,
kT, kH, kW, padT, padH, padW, dT, dH, dW,
dilationT, dilationH, dilationW,
THCudaTensor_data(state, columns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m = nOutputPlane;
long n = columns->size[1];
long k = nInputPlane*kT*kH*kW;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
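// In row-major terms this computes output_n (m x n) += weight (m x k) * columns (k x n);
// cuBLAS sees the transposed (column-major) views, so 'n','n' with the
// operand order swapped yields output^T = columns^T * weight^T.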
THCudaBlas_Sgemm(
state,
'n', 'n',
n, m, k,
1,
THCudaTensor_data(state, columns), n,
THCudaTensor_data(state, weight), k,
1,
THCudaTensor_data(state, output_n), n
);
}
// Free
THCudaTensor_free(state, input_n);
THCudaTensor_free(state, output_n);
// Restore the original non-batch shapes if the input was 4D
if (batch == 0) {
THCudaTensor_resize4d(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
void THNN_CudaVolumetricDilatedConvolution_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *gradColumns,
int kT, int kW, int kH,
int dT, int dW, int dH,
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH) {
THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
gradColumns, gradInput);
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
// Params
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int batch = 1;
if (input->nDimension == 4) {
// Force batch
batch = 0;
THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
}
long inputDepth = input->size[2];
long inputWidth = input->size[4];
long inputHeight = input->size[3];
long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
// Batch size + input planes
long batchSize = input->size[0];
// Resize output
THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
// Resize temporary columns
THCudaTensor_resize2d(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
// Helpers
THCudaTensor *gradInput_n = THCudaTensor_new(state);
THCudaTensor *gradOutput_n = THCudaTensor_new(state);
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt++) {
// Matrix multiply per sample:
THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m = nInputPlane*kT*kW*kH;
long n = gradColumns->size[1];
long k = nOutputPlane;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
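// Row-major view: gradColumns (m x n) = weight^T (m x k) * gradOutput_n (k x n),
// i.e. each output gradient is routed back to every kernel tap that produced it.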
THCudaBlas_Sgemm(
state,
'n', 't',
n, m, k,
1,
THCudaTensor_data(state, gradOutput_n), n,
THCudaTensor_data(state, weight), m,
0,
THCudaTensor_data(state, gradColumns), n
);
// Unpack columns back into input:
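// col2vol is the adjoint of vol2col: entries of overlapping columns are
// accumulated back into the corresponding gradInput_n voxels.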
col2vol(
THCState_getCurrentStream(state),
THCudaTensor_data(state, gradColumns),
nInputPlane, inputDepth, inputHeight, inputWidth,
kT, kH, kW, padT, padH, padW, dT, dH, dW,
dilationT, dilationH, dilationW,
THCudaTensor_data(state, gradInput_n)
);
}
// Free
THCudaTensor_free(state, gradInput_n);
THCudaTensor_free(state, gradOutput_n);
// Restore the original non-batch shapes if the input was 4D
if (batch == 0) {
THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
THCudaTensor_resize4d(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
void THNN_CudaVolumetricDilatedConvolution_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *columns,
THCudaTensor *ones,
int kT, int kW, int kH,
int dT, int dW, int dH,
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH,
float scale) {
THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
if (gradBias) {
THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
}
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
// Params
int nInputPlane = gradWeight->size[1];
int nOutputPlane = gradWeight->size[0];
int batch = 1;
if (input->nDimension == 4) {
// Force batch
batch = 0;
THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
}
long inputDepth = input->size[2];
long inputWidth = input->size[4];
long inputHeight = input->size[3];
long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
// Batch size + input planes
long batchSize = input->size[0];
// Define a buffer of ones, for bias accumulation
if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
// Resize plane and fill with ones...
THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
THCudaTensor_fill(state, ones, 1);
}
// Resize temporary columns
THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
// Helpers
THCudaTensor *input_n = THCudaTensor_new(state);
THCudaTensor *gradOutput_n = THCudaTensor_new(state);
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt++) {
// Matrix multiply per output:
THCudaTensor_select(state, input_n, input, 0, elt);
THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
// Extract columns:
vol2col(
THCState_getCurrentStream(state),
THCudaTensor_data(state, input_n),
nInputPlane, inputDepth, inputHeight, inputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
dilationT, dilationH, dilationW,
THCudaTensor_data(state, columns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m = nOutputPlane;
long n = nInputPlane*kT*kW*kH;
long k = columns->size[1];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
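// Row-major view: gradWeight (m x n) += scale * gradOutput_n (m x k) * columns^T (k x n).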
THCudaBlas_Sgemm(
state,
't', 'n',
n, m, k,
scale,
THCudaTensor_data(state, columns), k,
THCudaTensor_data(state, gradOutput_n), k,
1,
THCudaTensor_data(state, gradWeight), n
);
// Do Bias:
// m_, k_ are dims of matrix A
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m_ = nOutputPlane;
long k_ = outputDepth * outputHeight * outputWidth;
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
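// gradBias[p] += scale * sum of gradOutput_n[p] over all output locations,
// computed as gradOutput_n (m_ x k_) times the ones vector (length k_).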
if (gradBias) {
THCudaBlas_Sgemv(
state,
't',
k_, m_,
scale,
THCudaTensor_data(state, gradOutput_n), k_,
THCudaTensor_data(state, ones), 1,
1,
THCudaTensor_data(state, gradBias), 1
);
}
}
// Free
THCudaTensor_free(state, input_n);
THCudaTensor_free(state, gradOutput_n);
// Restore the original non-batch shapes if the input was 4D
if (batch == 0) {
THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}