#ifdef USE_XNNPACK

#include <vector>

#include <ATen/native/ConvUtils.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Convolution.h>
#include <ATen/native/xnnpack/Factory.h>
namespace at {
namespace native {
namespace xnnpack {
namespace internal {
namespace convolution2d {
namespace {
// Supports NHWC and NCHW FP32 convolutions with any valid
// - kernel size
// - padding
// - stride
// - dilation
// - grouping
// TODO: Decouple and improve error handling and messages.
bool available(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups,
    const float output_min,
    const float output_max) {
  // XNNPACK
  return xnnpack::internal::available() &&
         // Weight
         (4 == weight.ndimension()) &&
         (weight.size(Layout::Filter::height) > 0) &&
         (weight.size(Layout::Filter::width) > 0) &&
         (c10::DeviceType::CPU == weight.device().type()) &&
         (kFloat == weight.scalar_type()) &&
         // Bias
         ((bias && bias->defined())
              ? ((1 == bias->ndimension()) &&
                 (c10::DeviceType::CPU == bias->device().type()) &&
                 (kFloat == bias->scalar_type()) &&
                 (weight.size(Layout::Filter::output) == bias->size(0)))
              : true) &&
         // Padding
         (padding[Layout::Parameter::height] >= 0) &&
         (padding[Layout::Parameter::width] >= 0) &&
         // Stride
         (stride[Layout::Parameter::height] > 0) &&
         (stride[Layout::Parameter::width] > 0) &&
         // Dilation
         (dilation[Layout::Parameter::height] > 0) &&
         (dilation[Layout::Parameter::width] > 0) &&
         // Groups
         (groups > 0) &&
         // Input
         (weight.size(Layout::Filter::input) > 0) &&
         // Output
         (weight.size(Layout::Filter::output) > 0) &&
         // Output - Groups
         ((weight.size(Layout::Filter::output) % groups) == 0) &&
         // Output Min / Max
         (output_max > output_min) &&
         true;
}
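// Note: available() validates the static, weight-side configuration, while
// usable() below validates the runtime input tensor. Both checks must pass
// before dispatching to XNNPACK; see use_convolution2d() at the bottom of
// this file.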
// TODO: Decouple and improve error handling and messages.
bool usable(const Tensor& input) {
  // Input
  return (4 == input.ndimension()) &&
         (c10::DeviceType::CPU == input.device().type()) &&
         (kFloat == input.scalar_type()) &&
         (input.size(Layout::Activation4D::batch) >= 0) &&
         (input.size(Layout::Activation4D::channels) > 0) &&
         (input.size(Layout::Activation4D::height) > 0) &&
         (input.size(Layout::Activation4D::width) > 0) &&
         true;
}
Tensor create_and_run(
    const Tensor& input,
    const Tensor& weight,
    const Tensor& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups,
    const float output_min,
    const float output_max) {
  return run(
      create(
          weight,
          bias,
          padding,
          stride,
          dilation,
          groups,
          output_min,
          output_max),
      input);
}
} // namespace
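// Usage sketch (illustrative only; shapes and parameter values are
// hypothetical): the create()/run() split lets a caller prepack the weights
// into an XNNPACK operator once, then reuse that operator across many inputs
// with the same configuration:
//
//   ContextConv2D ctx = create(
//       weight, bias, /*padding=*/{1, 1}, /*stride=*/{1, 1},
//       /*dilation=*/{1, 1}, /*groups=*/1,
//       ContextConv2D::kMin, ContextConv2D::kMax);
//   Tensor y0 = run(ctx, x0);
//   Tensor y1 = run(ctx, x1);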
ContextConv2D create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups,
    const float output_min,
    const float output_max) {
  const auto padding_expanded = expand_param_if_needed(padding, "padding", 2);
  const auto stride_expanded = expand_param_if_needed(stride, "stride", 2);
  const auto dilation_expanded =
      expand_param_if_needed(dilation, "dilation", 2);
  const Tensor weight_nhwc = weight.contiguous(MemoryFormat::ChannelsLast);
  TORCH_CHECK(
      available(
          weight_nhwc,
          bias,
          padding_expanded,
          stride_expanded,
          dilation_expanded,
          groups,
          output_min,
          output_max),
      "xnnpack::convolution not available! "
      "Reason: The provided (weight, bias, padding, stride, dilation, groups, "
      "output_min, output_max) parameters are either invalid individually or "
      "their combination is not supported by XNNPACK.");
  xnn_operator_t convolution_op{};
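  // XNNPACK expects per-group channel counts. The filter is indexed as
  // (output, input, height, width), where the input dimension already holds
  // input_channels / groups, so in the call below:
  //   group_input_channels  = size(input)
  //   group_output_channels = size(output) / groups
  //   input_pixel_stride    = size(input) * groups  (total input channels)
  //   output_pixel_stride   = size(output)          (total output channels)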
  const xnn_status create_status = xnn_create_convolution2d_nhwc_f32(
      padding_expanded[Layout::Parameter::height], // input_padding_top
      padding_expanded[Layout::Parameter::width], // input_padding_right
      padding_expanded[Layout::Parameter::height], // input_padding_bottom
      padding_expanded[Layout::Parameter::width], // input_padding_left
      weight_nhwc.size(Layout::Filter::height), // kernel_height
      weight_nhwc.size(Layout::Filter::width), // kernel_width
      stride_expanded[Layout::Parameter::height], // subsampling_height
      stride_expanded[Layout::Parameter::width], // subsampling_width
      dilation_expanded[Layout::Parameter::height], // dilation_height
      dilation_expanded[Layout::Parameter::width], // dilation_width
      groups, // groups
      weight_nhwc.size(Layout::Filter::input), // group_input_channels
      weight_nhwc.size(Layout::Filter::output) / groups, // group_output_channels
      weight_nhwc.size(Layout::Filter::input) * groups, // input_pixel_stride
      weight_nhwc.size(Layout::Filter::output), // output_pixel_stride
      weight_nhwc.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &convolution_op); // operator
  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_convolution2d_nhwc_f32 failed!");
  return ContextConv2D{
      Operator(convolution_op),
      {weight_nhwc.sizes()[0], weight_nhwc.sizes()[1],
       weight_nhwc.sizes()[2], weight_nhwc.sizes()[3]},
      {padding_expanded[0], padding_expanded[1]},
      {stride_expanded[0], stride_expanded[1]},
      {dilation_expanded[0], dilation_expanded[1]}};
}
Tensor run(
    const ContextConv2D& context,
    const Tensor& input) {
  using namespace internal;
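  // XNNPACK kernels may read slightly past the end of their buffers, so the
  // input is copied into a tail-padded NHWC allocation when needed, and the
  // output below is allocated with tail padding via empty_with_tail_padding.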
  const Tensor padded_input_nhwc = allocate_padded_contiguous_if_needed(
      input, MemoryFormat::ChannelsLast);
  TORCH_CHECK(
      usable(padded_input_nhwc),
      "XNNPACK Convolution not usable! "
      "Reason: The provided input tensor is either invalid or unsupported by "
      "XNNPACK.");
  Tensor output = empty_with_tail_padding(
      conv_output_size(
          padded_input_nhwc.sizes(),
          context.weight_size_,
          context.padding_,
          context.stride_,
          context.dilation_),
      padded_input_nhwc.options().dtype(),
      MemoryFormat::ChannelsLast);
  const xnn_status setup_status = xnn_setup_convolution2d_nhwc_f32(
      context.op.get(), // operator
      padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
      padded_input_nhwc.size(Layout::Activation4D::height), // input_height
      padded_input_nhwc.size(Layout::Activation4D::width), // input_width
      padded_input_nhwc.data_ptr<float>(), // input
      output.data_ptr<float>(), // output
      caffe2::xnnpack_threadpool()); // threadpool
  TORCH_CHECK(
      xnn_status_success == setup_status,
      "xnn_setup_convolution2d_nhwc_f32 failed!");
  const xnn_status run_status = xnn_run_operator(
      context.op.get(), // operator
      caffe2::xnnpack_threadpool()); // threadpool
  TORCH_INTERNAL_ASSERT(
      xnn_status_success == run_status,
      "xnn_run_operator failed!");
  return output.contiguous(input.suggest_memory_format());
}
c10::intrusive_ptr<xnnpack::Conv2dOpContext>
    createConv2dClampPrePackOpContext(
        Tensor weight,
        c10::optional<Tensor> bias,
        std::vector<int64_t> stride,
        std::vector<int64_t> padding,
        std::vector<int64_t> dilation,
        int64_t groups,
        c10::optional<Scalar> output_min,
        c10::optional<Scalar> output_max) {
  // Note: create_context() is called with padding before stride, the reverse
  // of this function's (stride, padding) parameter order.
  return xnnpack::XNNPackConv2dOpContext::create_context(
      std::move(weight),
      std::move(bias),
      std::move(padding),
      std::move(stride),
      std::move(dilation),
      groups,
      output_min,
      output_max);
}
Tensor Conv2dClampRun::operator()(
    const Tensor& input,
    const c10::intrusive_ptr<xnnpack::Conv2dOpContext>& op_context) {
  return op_context->run(input);
}
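// Sketch of the intended end-to-end flow from the Python side, assuming the
// standard prepacked:: op registration, which lives outside this file:
//
//   ctx = torch.ops.prepacked.conv2d_clamp_prepack(
//       w, b, stride, padding, dilation, groups, None, None)
//   y = torch.ops.prepacked.conv2d_clamp_run(x, ctx)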
} // namespace convolution2d
} // namespace internal
bool use_convolution2d(
    const Tensor& input,
    const Tensor& weight,
    const Tensor& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups) {
  return internal::convolution2d::available(
             weight,
             bias,
             padding,
             stride,
             dilation,
             groups,
             ContextConv2D::kMin,
             ContextConv2D::kMax) &&
         internal::convolution2d::usable(input);
}
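// Typical call-site pattern (illustrative; the actual dispatch happens in the
// generic convolution code outside this file):
//
//   if (xnnpack::use_convolution2d(
//           input, weight, bias, padding, stride, dilation, groups)) {
//     return xnnpack::convolution2d(
//         input, weight, bias, padding, stride, dilation, groups);
//   }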
Tensor convolution2d(
    const Tensor& input,
    const Tensor& weight,
    const Tensor& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups) {
  return internal::convolution2d::create_and_run(
      input,
      weight,
      bias,
      padding,
      stride,
      dilation,
      groups,
      ContextConv2D::kMin,
      ContextConv2D::kMax);
}
} // namespace xnnpack
} // namespace native
} // namespace at
#endif /* USE_XNNPACK */