/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/mkl/mkl_utils.h"
#include "caffe2/operators/conv_pool_op_base.h"
#ifdef CAFFE2_HAS_MKL_DNN
namespace caffe2 {
namespace mkl {
template <typename T>
class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS(MKLContext);
MKLConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<MKLContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
dilation_h() == 1 && dilation_w() == 1, "Dilation not supported.");
OPERATOR_NEEDS_FEATURE(
pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
OPERATOR_NEEDS_FEATURE(
order_ == StorageOrder::NCHW, "Only NCHW order supported.");
}
~MKLConvOp() {}
// TODO(jiayq): support double if needed.
bool RunOnDeviceWithOrderNCHW() override {
const auto& X = OperatorBase::Input<MKLMemory<float>>(INPUT);
const auto& filter = OperatorBase::Input<MKLMemory<float>>(FILTER);
const int M = filter.dim32(0);
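// If no bias input is provided, lazily create a zero-filled bias once:
// the MKL-DNN forward-with-bias primitives used below always expect a
// bias resource.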
if (InputSize() == 2 && !zero_bias_) {
TensorCPU cpu_zero_bias;
cpu_zero_bias.Resize(M);
CPUContext ctx;
math::Set<T, CPUContext>(
M, 0.0, cpu_zero_bias.template mutable_data<float>(), &ctx);
zero_bias_.reset(new MKLMemory<T>(std::vector<TIndex>{M}));
zero_bias_->CopyFrom(cpu_zero_bias);
}
const auto& bias = InputSize() == 2
? *zero_bias_
: OperatorBase::Input<MKLMemory<float>>(BIAS);
MKLMemory<float>* Y = OperatorBase::Output<MKLMemory<float>>(0);
CAFFE_ENFORCE(4 == X.ndim());
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(4 == filter.ndim());
bool dims_changed;
CHECK_INPUT_FILTER_DIMS(X, filter, dims_changed);
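// Recreate the primitive, layouts and output buffers only when the input
// or filter shapes change (tracked by CHECK_INPUT_FILTER_DIMS via the
// cached_*_dims_ members), or when the memonger may have recycled buffers.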
if (dims_changed || FLAGS_caffe2_mkl_memonger_in_use) {
CAFFE_ENFORCE(
C == filter.dim32(1) * group_,
"Convolution op: input channels does not match: # of input channels ",
C,
" is not equal to kernel channels * group:",
filter.dim32(1),
"*",
group_);
CAFFE_ENFORCE(
M % group_ == 0,
"The number of output channels is not divisible by group.");
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
CAFFE_ENFORCE(bias.ndim() == 1);
CAFFE_ENFORCE(bias.dim32(0) == M);
size_t dimension = 4;
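// MKL-DNN size arrays are ordered from the innermost to the outermost
// dimension, i.e. {W, H, C, N} for NCHW data.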
size_t bdata_sizes[4] = {W, H, C, N};
// We will utilize the SetOutputSize() function in the base class
// with dummy TensorCPU input and output to calculate the sizes.
TensorCPU dummy_input(X.dims());
TensorCPU dummy_output;
ConvPoolOpBase<MKLContext>::SetOutputSize(
dummy_input, &dummy_output, M);
size_t tdata_sizes[4] = {
dummy_output.dim(3), dummy_output.dim(2),
dummy_output.dim(1), dummy_output.dim(0)};
size_t fdata_sizes[5] = {
kernel_w(), kernel_h(), C / group_, M / group_, group_};
size_t strides[2] = {stride_w(), stride_h()};
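// MKL-DNN takes input offsets rather than padding amounts, hence the
// negated values.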
int pads[2] = {-pad_l(), -pad_t()};
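// Grouped convolutions use the dnnGroupsConvolution* primitive; otherwise
// the plain dnnConvolution* primitive is created.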
if (group_ > 1) {
primitive_.Reset(
dnnGroupsConvolutionCreateForwardBias<float>,
nullptr,
dnnAlgorithmConvolutionDirect,
group_,
dimension,
bdata_sizes,
tdata_sizes,
fdata_sizes,
strides,
pads,
dnnBorderZeros);
} else {
primitive_.Reset(
dnnConvolutionCreateForwardBias<float>,
nullptr,
dnnAlgorithmConvolutionDirect,
dimension,
bdata_sizes,
tdata_sizes,
fdata_sizes,
strides,
pads,
dnnBorderZeros);
}
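// Allocate the output and the intermediate buffer in the primitive's
// preferred dst layout, and record the layouts the primitive expects for
// src, filter and bias so the inputs can be converted into views below.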
Y->Reset(dummy_output.dims(), primitive_, dnnResourceDst);
buffer_.Reset(dummy_output.dims(), primitive_, dnnResourceDst, true);
input_layout_.Reset(primitive_, dnnResourceSrc);
filter_layout_.Reset(primitive_, dnnResourceFilter);
bias_layout_.Reset(primitive_, dnnResourceBias);
}
// Try to share the buffer with the output: this lets us avoid an
// unnecessary copy when the output is already allocated and has the same
// layout as the buffer.
bool shared = buffer_.ShareFrom(*Y);
std::shared_ptr<void> X_view = X.View(
input_layout_, primitive_, dnnResourceSrc);
std::shared_ptr<void> bias_view =
bias.View(bias_layout_, primitive_, dnnResourceBias);
std::shared_ptr<void> filter_view;
if (group_ > 1) {
// Explicitly reformat the filter into a [group, M/group, C/group, kH, kW]
// buffer before creating the MKL view, since the grouped primitive
// expects the group dimension to be explicit.
MKLMemory<float> group_filter(
std::vector<TIndex>{TIndex(group_),
TIndex(filter.dim32(0) / group_),
TIndex(filter.dim32(1)),
TIndex(filter.dim32(2)),
TIndex(filter.dim32(3))},
nullptr,
dnnResourceFilter,
/*share_memory_if_possible=*/true);
group_filter.CopyFrom(filter.buffer());
filter_view =
group_filter.View(filter_layout_, primitive_, dnnResourceFilter);
} else {
filter_view = filter.View(filter_layout_, primitive_, dnnResourceFilter);
}
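// Bind all resources to the primitive and run it, writing into buffer_;
// the result is then copied (or already shared) into Y.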
resources_[dnnResourceSrc] = X_view.get(); // X.buffer();
resources_[dnnResourceFilter] = filter_view.get();
resources_[dnnResourceBias] = bias_view.get();
resources_[dnnResourceDst] = buffer_.buffer();
MKLDNN_SAFE_CALL(mkl::dnnExecute<T>(primitive_, resources_));
buffer_.CopyTo(Y, primitive_, dnnResourceDst);
if (FLAGS_caffe2_mkl_memonger_in_use && !shared) {
// buffer_ is not shared with Y. Free memory since it'll
// be re-allocated in the next run anyway due to memonger in use.
buffer_.Reset();
}
return true;
}
bool RunOnDeviceWithOrderNHWC() override {
CAFFE_NOT_IMPLEMENTED;
}
private:
// Input: X, W, b
// Output: Y
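// Zero-filled bias, lazily created when the operator is run without a
// bias input.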
std::unique_ptr<MKLMemory<T>> zero_bias_;
vector<TIndex> cached_input_dims_;
vector<TIndex> cached_filter_dims_;
PrimitiveWrapper<T> primitive_;
LayoutWrapper<T> input_layout_;
LayoutWrapper<T> filter_layout_;
LayoutWrapper<T> bias_layout_;
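// Intermediate output buffer in the primitive's dst layout; shared with
// the output Y when their layouts match.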
MKLMemory<T> buffer_;
void* resources_[dnnResourceNumber] = {0};
INPUT_TAGS(INPUT, FILTER, BIAS);
};
} // namespace mkl
REGISTER_MKL_OPERATOR(Conv, mkl::MKLConvOp<float>);
} // namespace caffe2
#endif // CAFFE2_HAS_MKL_DNN