// conv_op_impl.h is the templated implementation of the conv_op.h file.
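//
// Overview: ConvOp lowers convolution to matrix multiplication. For each
// image (and each group), the input patches seen by every output pixel are
// unrolled into a column buffer with math::Im2Col / math::Im2ColNd, and a
// single math::Gemm against the filter matrix produces all output channels at
// once. The bias, when present, is added with a second GEMM against an
// all-ones "bias multiplier" vector. 1x1 convolutions with no padding and
// unit stride skip the im2col step entirely and run as (batched) GEMMs on the
// raw input. ConvGradientOp follows the same structure in reverse: GEMMs
// against the column buffer produce the filter gradient, a GEMV against the
// all-ones vector produces the bias gradient, and Col2Im scatters the
// column-space gradient back into the input gradient.
//
// Spatial output sizes are set by ConvPoolOpBase<Context>::SetOutputSize and
// follow the usual convolution arithmetic, roughly
//   out = (in + pad_begin + pad_end - (dilation * (kernel - 1) + 1)) / stride + 1.
// For example, a 3x3 kernel with stride 1, pad 1 and dilation 1 preserves the
// spatial size, while stride 2 roughly halves it.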
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#include "caffe2/operators/conv_op.h"
#include <array>
#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
const Tensor& X = Input(INPUT);
auto& filter = Input(FILTER);
Tensor* Y = Output(0);
const int N = X.dim32(0), C = X.dim32(1);
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(
C == filter.dim32(1) * group_,
"Convolution op: input channels does not match: # of input channels ",
C,
" is not equal to kernel channels * group:",
filter.dim32(1),
"*",
group_);
CAFFE_ENFORCE(
M % group_ == 0,
"The number of output channels is not divisible by group.");
int kernel_dims_size = 1;
for (int i = 0; i < kernel_.size(); ++i) {
CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
kernel_dims_size *= kernel_[i];
}
ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
const vector<int> input_dims = GetDims(X);
const vector<int> output_dims = GetDims(*Y);
const int input_image_size = this->GetDimsSize(X);
const int output_image_size = this->GetDimsSize(*Y);
vector<int> img_shape;
img_shape.assign(X.dims().begin() + 1, X.dims().end());
vector<int> buffer_shape;
buffer_shape.push_back(C / group_ * kernel_dims_size);
buffer_shape.insert(
buffer_shape.end(), output_dims.begin(), output_dims.end());
if (kernel_.size() != 2) {
SetDeviceTensor(img_shape, &img_shape_device_);
SetDeviceTensor(buffer_shape, &col_buffer_shape_device_);
}
const int col_buffer_size =
(C / group_) * kernel_dims_size * output_image_size;
// The dimension of each kernel
const int kernel_dim = C / group_ * kernel_dims_size;
// The offsets corresponding to a single input image and a single output
// image.
const int input_offset = C / group_ * input_image_size;
const int output_offset = Y->size() / Y->dim32(0) / group_;
const int filter_offset = filter.size() / group_;
// The col buffer is stored in CHW order as well: kernel_dim x output height
// x output width.
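// With that layout, the per-group weight GEMM below computes
//   Y_g (M/group_ x output_image_size) =
//       filter_g (M/group_ x kernel_dim) * col_buffer (kernel_dim x output_image_size),
// and the bias GEMM adds bias (M x 1) * bias_multiplier (1 x output_image_size),
// i.e. the same bias value broadcast to every output pixel.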
const T* X_data = X.template data<T>();
const T* filter_data = filter.template data<T>();
const T* bias_data = nullptr;
if (InputSize() == 3) {
const auto& bias = Input(BIAS);
CAFFE_ENFORCE_EQ(bias.ndim(), 1);
CAFFE_ENFORCE_EQ(bias.dim32(0), M);
bias_data = bias.template data<T>();
ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
output_image_size, &bias_multiplier_);
}
T* Y_data = Y->template mutable_data<T>();
// Shortcut for 1x1 conv.
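// With a 1x1 kernel, no padding and unit stride, im2col is the identity: the
// column buffer would just be the input itself, so the convolution reduces to
// a plain matrix multiply of the filter with each input image (see
// Run1x1ConvOnDeviceWithOrderNCHW below).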
if (kernel_dims_size == 1 && !HasPad() && !HasStride()) {
const int HxW = X.size() / (N * C);
return Run1x1ConvOnDeviceWithOrderNCHW(
N, C, HxW, M, X_data, filter_data, bias_data, Y_data);
}
auto f = [&](Tensor* col_buffer) {
col_buffer->Resize(buffer_shape);
T* col_buffer_data = col_buffer->template mutable_data<T>();
// Im2Col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
for (int group_id = 0; group_id < group_; ++group_id) {
if (kernel_.size() == 2) {
math::Im2Col<T, Context, StorageOrder::NCHW>(
C / group_,
input_dims[0],
input_dims[1],
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
X_data + group_id * input_offset,
col_buffer_data,
&context_);
} else {
math::Im2ColNd<T, Context, StorageOrder::NCHW>(
kernel_.size(),
C * input_image_size,
col_buffer_size,
img_shape.data(),
buffer_shape.data(),
kernel_.data(),
stride_.data(),
dilation_.data(),
pads_.data(),
X_data + group_id * input_offset,
col_buffer_data,
&context_);
}
// Weight term
math::Gemm<T, Context>(
CblasNoTrans,
CblasNoTrans,
M / group_,
output_image_size,
kernel_dim,
1,
filter_data + group_id * filter_offset,
col_buffer_data,
0,
Y_data + group_id * output_offset,
&context_);
}
if (bias_data != nullptr) {
// The bias term can be applied outside the group loop, which is more
// efficient.
math::Gemm<T, Context>(
CblasNoTrans,
CblasNoTrans,
M,
output_image_size,
1,
1,
bias_data,
bias_multiplier_.template data<T>(),
1,
Y_data,
&context_);
}
X_data += input_offset * group_;
Y_data += output_offset * group_;
}
};
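// The column buffer can be large (kernel_dim * output_image_size elements),
// so it is either taken from a buffer shared through the workspace (when
// caffe2_force_shared_col_buffer or shared_buffer_ is set) or kept as this
// operator's own col_buffer_.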
if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
runWithSharedBuffer<Context>(ws_, f);
} else {
f(&col_buffer_);
}
return true;
}
// The implementations.
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
const Tensor& X = Input(INPUT);
auto& filter = Input(FILTER);
Tensor* Y = Output(0);
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
CAFFE_ENFORCE_EQ(
kernel_.size(),
2,
"Only 2d convolution is supported for NHWC storage type");
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
CAFFE_ENFORCE(filter.dim32(3) == C);
ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
// The dimension of each kernel
const int kernel_dim = kernel_h() * kernel_w() * C;
// The offsets corresponding to a single input image and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = Y->size() / Y->dim32(0);
// The output image size is the spatial size of the output.
const int output_image_size = Y->dim32(1) * Y->dim32(2);
// The col buffer is stored in HWC order as well: output height x output
// width x kernel_dim.
const T* X_data = X.template data<T>();
const T* filter_data = filter.template data<T>();
const T* bias_data = nullptr;
T* Y_data = Y->template mutable_data<T>();
if (InputSize() == 3) {
const auto& bias = Input(BIAS);
CAFFE_ENFORCE_EQ(bias.ndim(), 1);
CAFFE_ENFORCE_EQ(bias.dim32(0), M);
bias_data = bias.template data<T>();
}
// Specialized path for 1 by 1 convolution with stride 1, pad 0 - we
// can skip im2col.
if (kernel_dim == C && !HasPad() && !HasStride()) {
const int HxW = X.size() / (N * C);
if (bias_data != nullptr) {
ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
N * HxW, &bias_multiplier_);
}
return Run1x1ConvOnDeviceWithOrderNHWC(
N, C, HxW, M, X_data, filter_data, bias_data, Y_data);
}
if (bias_data != nullptr) {
ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
output_image_size, &bias_multiplier_);
}
auto f = [&](Tensor* col_buffer) {
col_buffer->Resize(
vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
T* col_buffer_data = col_buffer->template mutable_data<T>();
// Im2Col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
math::Im2Col<T, Context, StorageOrder::NHWC>(
C,
H,
W,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
X_data,
col_buffer_data,
&context_);
// Weight term
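// In NHWC the column buffer has one row per output pixel, so this computes
//   Y_n (output_image_size x M) =
//       col_buffer (output_image_size x kernel_dim) * filter^T (kernel_dim x M),
// hence the CblasTrans on the filter argument.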
math::Gemm<T, Context>(
CblasNoTrans,
CblasTrans,
output_image_size,
M,
kernel_dim,
1,
col_buffer_data,
filter_data,
0,
Y_data,
&context_);
if (bias_data != nullptr) {
// Bias term
math::Gemm<T, Context>(
CblasNoTrans,
CblasNoTrans,
output_image_size,
M,
1,
1,
bias_multiplier_.template data<T>(),
bias_data,
1,
Y_data,
&context_);
}
X_data += input_offset;
Y_data += output_offset;
}
};
if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
runWithSharedBuffer<Context>(ws_, f);
} else {
f(&col_buffer_);
}
return true;
}
template <typename T, class Context>
bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNCHW(
const int N,
const int C,
const int HxW,
const int M,
const T* X,
const T* filter,
const T* bias,
T* Y) {
const int G = group_;
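// For group_ == 1, each image is one GEMM:
//   Y_n (M x HxW) = filter (M x C) * X_n (C x HxW),
// expressed as a strided-batched GEMM with a stride of 0 on the filter so the
// same weights are reused for every image in the batch.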
if (G == 1) {
math::GemmStridedBatched<T, Context>(
CblasNoTrans,
CblasNoTrans,
N,
M,
HxW,
C,
1.0f,
filter,
0,
X,
C * HxW,
0.0f,
Y,
M * HxW,
&context_);
} else {
const int batch_size = N * G;
const int D_X = C / G;
const int D_Y = M / G;
const int X_stride = D_X * HxW;
const int W_stride = D_Y * D_X;
const int Y_stride = D_Y * HxW;
std::vector<const T*> X_ptr(N * G);
std::vector<const T*> W_ptr(N * G);
std::vector<T*> Y_ptr(N * G);
for (int i = 0; i < N; ++i) {
for (int j = 0; j < G; ++j) {
const int index = i * G + j;
X_ptr[index] = X + index * X_stride;
W_ptr[index] = filter + j * W_stride;
Y_ptr[index] = Y + index * Y_stride;
}
}
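// With groups, batch the N * G smaller GEMMs through pointer arrays:
//   Y[i][g] (M/G x HxW) = filter[g] (M/G x C/G) * X[i][g] (C/G x HxW).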
math::GemmBatched<T, Context>(
CblasNoTrans,
CblasNoTrans,
batch_size,
D_Y,
HxW,
D_X,
1.0f,
W_ptr.data(),
X_ptr.data(),
0.0f,
Y_ptr.data(),
&context_);
}
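// Add the bias as a rank-1 update per image:
//   Y_n += bias (M x 1) * bias_multiplier (1 x HxW),
// again batched over N with a stride of 0 on both the bias and the multiplier.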
if (bias != nullptr) {
const T* bias_multiplier_data = bias_multiplier_.template data<T>();
math::GemmStridedBatched<T, Context>(
CblasNoTrans,
CblasNoTrans,
N,
M,
HxW,
1,
1.0f,
bias,
0,
bias_multiplier_data,
0,
1.0f,
Y,
M * HxW,
&context_);
}
return true;
}
template <typename T, class Context>
bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNHWC(
const int N,
const int C,
const int HxW,
const int M,
const T* X,
const T* filter,
const T* bias,
T* Y) {
const int G = group_;
CAFFE_ENFORCE_EQ(G, 1);
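// In NHWC every output pixel is a row, so the whole batch is one GEMM:
//   Y (N*HxW x M) = X (N*HxW x C) * filter^T (C x M),
// followed by an optional rank-1 bias update using the all-ones multiplier.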
math::Gemm<T, Context>(
CblasNoTrans,
CblasTrans,
N * HxW,
M,
C,
1.0f,
X,
filter,
0.0f,
Y,
&context_);
if (bias != nullptr) {
const T* bias_multiplier_data = bias_multiplier_.template data<T>();
math::Gemm<T, Context>(
CblasNoTrans,
CblasNoTrans,
N * HxW,
M,
1,
1.0f,
bias_multiplier_data,
bias,
1.0f,
Y,
&context_);
}
return true;
}
template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
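// This op produces the filter gradient, optionally the bias gradient, and
// optionally the input gradient:
//   dfilter_g += dY_g * col_buffer^T      (GEMM, accumulated over images)
//   dbias     += dY_n * ones              (GEMV over output pixels)
//   dX         = col2im(filter_g^T * dY_g)
// where col_buffer is the im2col expansion of the corresponding input image.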
const int N = X.dim32(0), C = X.dim32(1);
const vector<int> input_dims = this->GetDims(X);
const int input_image_size = this->GetDimsSize(X);
const vector<int> output_dims = this->GetDims(dY);
// The output image size is the spatial size of the output.
const int output_image_size = this->GetDimsSize(dY);
ConvPoolOpBase<Context>::ComputePads(input_dims);
CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) * group_ == C);
int kernel_dims_size = 1;
for (int i = 0; i < kernel_.size(); ++i) {
CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
kernel_dims_size *= kernel_[i];
}
CAFFE_ENFORCE(M % group_ == 0);
dfilter->ResizeLike(filter);
// The dimension of each kernel
const int kernel_dim = C / group_ * kernel_dims_size;
// The offsets corresponding to a single input image and a single output
// image.
const int input_offset = C / group_ * input_image_size;
const int output_offset = dY.size() / dY.dim32(0) / group_;
const int filter_offset = filter.size() / group_;
// The col buffer is stored in CHW order as well: kernel_dim x output height
// x output width.
vector<int> img_shape;
img_shape.assign(X.dims().begin() + 1, X.dims().end());
vector<int> col_buffer_shape;
col_buffer_shape.push_back(C / group_ * kernel_dims_size);
col_buffer_shape.insert(
col_buffer_shape.end(), output_dims.begin(), output_dims.end());
col_buffer_.Resize(col_buffer_shape);
if (kernel_.size() != 2) {
SetDeviceTensor(img_shape, &img_shape_device_);
SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
}
const int col_buffer_size =
(C / group_) * kernel_dims_size * output_image_size;
const T* Xdata = X.template data<T>();
const T* filter_data = filter.template data<T>();
const T* dYdata = dY.template data<T>();
T* col_buffer_data = col_buffer_.template mutable_data<T>();
T* dfilter_data = dfilter->template mutable_data<T>();
// Pre-setting the gradients to zero.
math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);
T* dbias_data = nullptr;
if (!no_bias_) {
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
dbias->Resize(M);
if (bias_multiplier_.size() != output_image_size) {
// If the helper bias multiplier does not match output_image_size, reshape
// it and fill it with ones.
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
math::Set<T, Context>(
output_image_size,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
}
dbias_data = dbias->template mutable_data<T>();
math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
}
for (int image_id = 0; image_id < N; ++image_id) {
for (int group_id = 0; group_id < group_; ++group_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
if (kernel_.size() == 2) {
math::Im2Col<T, Context, StorageOrder::NCHW>(
C / group_,
input_dims[0],
input_dims[1],
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
Xdata + group_id * input_offset,
col_buffer_data,
&context_);
} else {
math::Im2ColNd<T, Context, StorageOrder::NCHW>(
kernel_.size(),
C * input_image_size,
col_buffer_size,
img_shape.data(),
col_buffer_shape.data(),
kernel_.data(),
stride_.data(),
dilation_.data(),
pads_.data(),
Xdata + group_id * input_offset,
col_buffer_data,
&context_);
}
// Gradient with respect to filter.
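// dfilter_g (M/group_ x kernel_dim) += dY_g (M/group_ x output_image_size)
//     * col_buffer^T (output_image_size x kernel_dim).
// beta = 1 accumulates the contribution of every image into the pre-zeroed
// dfilter buffer.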
math::Gemm<T, Context>(
CblasNoTrans,
CblasTrans,
M / group_,
kernel_dim,
output_image_size,
1,
dYdata + group_id * output_offset,
col_buffer_data,
1,
dfilter_data + group_id * filter_offset,
&context_);
}
if (!no_bias_) {
// The gradient with respect to the bias can be computed independently of the
// groups.
math::Gemv<T, Context>(
CblasNoTrans,
M,
output_image_size,
1,
dYdata,
bias_multiplier_.template data<T>(),
1,
dbias_data,
&context_);
}
Xdata += input_offset * group_;
dYdata += output_offset * group_;
}
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
// Compute the gradient w.r.t. the input.
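// First recover the gradient in column space,
//   dcol (kernel_dim x output_image_size) = filter_g^T * dY_g,
// then Col2Im scatters it back into dX, summing the contributions of
// overlapping receptive fields.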
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
dX->ResizeLike(X);
T* dXdata = dX->template mutable_data<T>();
dYdata = dY.template data<T>();
for (int image_id = 0; image_id < N; ++image_id) {
for (int group_id = 0; group_id < group_; ++group_id) {
// Compute gradient into col_buffer.
math::Gemm<T, Context>(
CblasTrans,
CblasNoTrans,
kernel_dim,
output_image_size,
M / group_,
1,
filter_data + group_id * filter_offset,
dYdata,
0,
col_buffer_data,
&context_);
if (kernel_.size() == 2) {
math::Col2Im<T, Context, StorageOrder::NCHW>(
C / group_,
input_dims[0],
input_dims[1],
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
col_buffer_data,
dXdata,
&context_);
} else {
math::Col2ImNd<T, Context, StorageOrder::NCHW>(
kernel_.size(),
C * input_image_size,
col_buffer_size,
img_shape.data(),
col_buffer_shape.data(),
kernel_.data(),
stride_.data(),
dilation_.data(),
pads_.data(),
col_buffer_data,
dXdata,
&context_);
}
dXdata += input_offset;
dYdata += output_offset;
}
}
}
return true;
}
template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
ConvPoolOpBase<Context>::ComputePads({H, W});
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
CAFFE_ENFORCE(filter.dim32(3) == C);
dfilter->ResizeLike(filter);
// The dimension of each kernel
const int kernel_dim = kernel_h() * kernel_w() * C;
// The offsets corresponding to a single input image and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = dY.size() / dY.dim32(0);
// The output image size is the spatial size of the output.
const int output_image_size = dY.dim32(1) * dY.dim32(2);
// The col buffer is stored in HWC order as well: output height x output
// width x kernel_dim.
col_buffer_.Resize(output_image_size, kernel_dim);
const T* Xdata = X.template data<T>();
const T* const filter_data = filter.template data<T>();
const T* const dYdata = dY.template data<T>();
T* col_buffer_data = col_buffer_.template mutable_data<T>();
T* dfilter_data = dfilter->template mutable_data<T>();
// Pre-setting the gradients to zero.
math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);
T* dbias_data = nullptr;
if (!no_bias_) {
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
dbias->Resize(M);
dbias_data = dbias->template mutable_data<T>();
math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
if (bias_multiplier_.size() != output_image_size) {
// If the helper bias multiplier does not match output_image_size, reshape
// it and fill it with ones.
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
math::Set<T, Context>(
output_image_size,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
}
}
for (int image_id = 0; image_id < N; ++image_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
math::Im2Col<T, Context, StorageOrder::NHWC>(
C,
H,
W,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
Xdata,
col_buffer_data,
&context_);
// Gradient with respect to filter.
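// dfilter (M x kernel_dim) += dY_n^T (M x output_image_size)
//     * col_buffer (output_image_size x kernel_dim), accumulated over images;
// the bias gradient below sums dY_n over all output pixels via a transposed
// GEMV.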
math::Gemm<T, Context>(
CblasTrans,
CblasNoTrans,
M,
kernel_dim,
output_image_size,
1,
dYdata + output_offset * image_id,
col_buffer_data,
1,
dfilter_data,
&context_);
if (!no_bias_) {
// Gradient with respect to bias
math::Gemv<T, Context>(
CblasTrans,
output_image_size,
M,
1,
dYdata + output_offset * image_id,
bias_multiplier_.template data<T>(),
1,
dbias_data,
&context_);
}
Xdata += input_offset;
}
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
// Compute the gradient w.r.t. the input.
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
dX->ResizeLike(X);
T* dXdata = dX->template mutable_data<T>();
for (int image_id = 0; image_id < N; ++image_id) {
// Compute gradient into col_buffer.
math::Gemm<T, Context>(
CblasNoTrans,
CblasNoTrans,
output_image_size,
kernel_dim,
M,
1,
dYdata + output_offset * image_id,
filter_data,
0,
col_buffer_data,
&context_);
math::Col2Im<T, Context, StorageOrder::NHWC>(
C,
H,
W,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
col_buffer_data,
dXdata,
&context_);
dXdata += input_offset;
}
}
return true;
}
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_