/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <executorch/backends/qnnpack/executor/QNNExecutor.h>
#include <executorch/backends/qnnpack/qnnpack_schema_generated.h>
#include <executorch/backends/qnnpack/utils/utils.h>
#include <executorch/extension/fb/threadpool/threadpool.h>
#include <executorch/runtime/backend/backend_registry.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/platform/profiler.h>
#include <executorch/util/memory_utils.h>
#include <pytorch_qnnpack.h>
#include <qnnpack_func.h>
#include <string>
namespace torch {
namespace executor {
// On x86, the bias tensor data is loaded using the 128-bit MOVAPS instruction
// ("Move Aligned Packed Single-Precision Floating-Point Values"), which will
// generate an exception if it does not receive 16-byte-aligned data.
static constexpr size_t kTensorDataAlignment = 16;
namespace {
Error allocate_tensor(
const size_t ndim,
const exec_aten::SizesType* sizes,
const ScalarType type,
MemoryAllocator* runtime_allocator,
const size_t pad_bytes,
TensorImpl** tensor_impl_ptr) {
exec_aten::SizesType* tensor_sizes = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
runtime_allocator, exec_aten::SizesType, ndim);
std::memcpy(tensor_sizes, sizes, ndim * sizeof(exec_aten::SizesType));
// We don't strictly need to allocate strides here, but resize operations
// modify strides. The TensorImpl constructor is fine taking nullptr as
// strides, so the resize implementation needs to account for this difference.
exec_aten::DimOrderType* tensor_dim_order = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
runtime_allocator, exec_aten::DimOrderType, ndim);
exec_aten::StridesType* tensor_strides = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
runtime_allocator, exec_aten::StridesType, ndim);
for (size_t i = 0; i < ndim; ++i) {
tensor_dim_order[i] = static_cast<exec_aten::DimOrderType>(i);
}
tensor_strides[ndim - 1] = 1;
for (size_t i = ndim - 1; i > 0; --i) {
// For sizes[i] == 0, treat it as 1 to be consistent with core PyTorch.
if (sizes[i] == 0) {
tensor_strides[i - 1] = tensor_strides[i];
} else {
tensor_strides[i - 1] = tensor_strides[i] * sizes[i];
}
}
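  // Illustration: for a fully contiguous layout, sizes {2, 3, 4} yield
  // strides {12, 4, 1}.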
auto tensor_impl =
ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, TensorImpl);
new (tensor_impl) TensorImpl(
type,
ndim,
tensor_sizes,
nullptr,
tensor_dim_order,
tensor_strides,
TensorShapeDynamism::DYNAMIC_BOUND);
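  // DYNAMIC_BOUND lets later resize_tensor() calls change the shape, but only
  // within the capacity allocated below.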
size_t nbytes = tensor_impl->nbytes();
void* tensor_storage =
runtime_allocator->allocate(nbytes + pad_bytes, kTensorDataAlignment);
if (tensor_storage == nullptr) {
return Error::MemoryAllocationFailed;
}
tensor_impl->set_data(tensor_storage);
*tensor_impl_ptr = tensor_impl;
return Error::Ok;
}
Error allocate_and_copy_tensor(
const size_t ndim,
const exec_aten::SizesType* sizes,
const void* data,
const ScalarType type,
MemoryAllocator* runtime_allocator,
const size_t pad_bytes,
TensorImpl** tensor_impl_ptr) {
ET_CHECK_MSG(
*tensor_impl_ptr == nullptr,
"Tensor impl pointer must be null initialized");
ET_CHECK_MSG(
allocate_tensor(
ndim, sizes, type, runtime_allocator, pad_bytes, tensor_impl_ptr) ==
Error::Ok,
"Could not allocate tensor in QNNPACK backend.");
TensorImpl* tensor_impl = *tensor_impl_ptr;
std::memcpy(tensor_impl->mutable_data(), data, tensor_impl->nbytes());
return Error::Ok;
}
} // namespace
using namespace qnnpack_utils;
class QnnpackBackend final : public PyTorchBackendInterface {
public:
~QnnpackBackend() = default;
bool is_available() const override {
return pytorch_qnnp_status_success == pytorch_qnnp_initialize();
}
Result<DelegateHandle*> init(
FreeableBuffer* processed,
ArrayRef<CompileSpec> compile_specs,
MemoryAllocator* runtime_allocator) const override {
auto dynamic_linear = fb_qnnpack::GetQNNDynamicLinear(processed->data());
auto bias = dynamic_linear->bias();
constexpr size_t pre_pad_bytes = 16;
// Create + copy Bias Tensor
TensorImpl* bias_buf = nullptr;
allocate_and_copy_tensor(
bias->shape()->size(),
bias->shape()->data(),
bias->buffer()->data(),
ScalarType::Float,
runtime_allocator,
pre_pad_bytes,
&bias_buf);
// Create + copy Weight Zero-Points Tensor
auto weights_zp = dynamic_linear->weights_zero_point();
TensorImpl* zp_buf = nullptr;
allocate_and_copy_tensor(
weights_zp->shape()->size(),
weights_zp->shape()->data(),
weights_zp->buffer()->data(),
ScalarType::QUInt8,
runtime_allocator,
0,
&zp_buf);
// Create + copy Weight Scales Tensor
auto weights_scale = dynamic_linear->weights_scale();
TensorImpl* scale_buf = nullptr;
allocate_and_copy_tensor(
weights_scale->shape()->size(),
weights_scale->shape()->data(),
weights_scale->buffer()->data(),
ScalarType::Float,
runtime_allocator,
0,
&scale_buf);
// Create Quantized Input Tensor
auto input_shape = dynamic_linear->input_shape();
TensorImpl* qinput_buf = nullptr;
allocate_tensor(
input_shape->size(),
input_shape->data(),
ScalarType::QUInt8,
runtime_allocator,
// Add pre-padding in front of the quantized input data, as QNNPACK expects
pre_pad_bytes,
&qinput_buf);
qinput_buf->set_data(
static_cast<uint8_t*>(qinput_buf->mutable_data()) + pre_pad_bytes);
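    // Assumption behind the pre-padding: QNNPACK micro-kernels may read a few
    // bytes before the input pointer, so advancing the data pointer past the
    // padding keeps those reads inside this allocation.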
// Pack Weights
auto weights = dynamic_linear->weights();
auto packed_weights = std::make_unique<qnnpack::PackBMatrix>(
weights->shape()->Get(0), /* input_channels */
weights->shape()->Get(1), /* output_channels */
weights_zp->buffer()->data(), /* kernel_zero_points */
reinterpret_cast<const float*>(
weights_scale->buffer()->data()), /* requantization_scales */
weights->buffer()->data(), /* kernel */
nullptr /* bias */
);
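    // Packing the weight (B) matrix once here lets every execute() call reuse
    // QNNPACK's blocked layout instead of re-packing per inference.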
// Create QNNExecutor
QNNExecutor* executor =
ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, QNNExecutor);
// NOTE: Since we use placement new and since this type is not trivially
// destructible, we must call the destructor manually in destroy().
new (executor) QNNExecutor(
std::move(packed_weights), bias_buf, qinput_buf, scale_buf, zp_buf);
// TODO(T144120904): Remove this MMAP block once all users switch to
// MmapDataLoader.
#if defined(ET_MMAP_SUPPORTED)
torch::executor::util::mark_memory_as_unused(
const_cast<void*>(processed->data()), processed->size());
#endif
processed->Free();
return executor;
}
Error execute(DelegateHandle* handle, EValue** args) const override {
static constexpr size_t kMaxDims = 16;
QNNExecutor* etor = static_cast<QNNExecutor*>(handle);
const Tensor rinput = args[0]->toTensor();
ET_CHECK_OR_RETURN_ERROR(
rinput.dim() <= kMaxDims,
Internal,
"rinput.dim() %u > kMaxDims %zu",
(unsigned int)rinput.dim(),
kMaxDims);
Tensor::SizesType expected_output_size[kMaxDims];
for (int32_t i = 0; i < rinput.dim() - 1; ++i) {
expected_output_size[i] = rinput.size(i);
}
expected_output_size[rinput.dim() - 1] = etor->bias_.size(0);
Tensor output = args[1]->toTensor();
auto error = resize_tensor(
output, {expected_output_size, static_cast<size_t>(rinput.dim())});
if (error != Error::Ok) {
std::string message("Failed to resize output tensor for size:{");
for (int32_t i = 0; i < rinput.dim(); i++) {
message += std::to_string(expected_output_size[i]) + ", ";
}
message += "}";
ET_CHECK_MSG(false, "%s", message.c_str());
}
float rinput_min, rinput_max;
std::tie(rinput_min, rinput_max) = GetMinMax(rinput);
QuantizationParams input_qparam;
uint8_t qmin = std::numeric_limits<uint8_t>::min();
uint8_t qmax = std::numeric_limits<uint8_t>::max();
Error e = ChooseQuantizationParams(
rinput_min,
rinput_max,
qmin,
qmax,
input_qparam,
false, /* preserve_sparsity */
false, /* force_scale_power_of_two */
false /* reduce_range */
);
ET_CHECK_OR_RETURN_ERROR(
e == Error::Ok, Internal, "ChooseQuantizationParams() failed");
ET_CHECK_OR_RETURN_ERROR(
input_qparam.zero_point <= qmax && input_qparam.zero_point >= qmin,
Internal,
"ChooseQuantizationParams() selected invalid input_zero_point: %d",
input_qparam.zero_point);
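    // Sketch of the chosen affine mapping (standard asymmetric quantization):
    //   scale      ~= (rinput_max - rinput_min) / (qmax - qmin)
    //   zero_point  = the integer in [qmin, qmax] that 0.0f maps onto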
std::vector<float> dequantization_scales;
e = GenerateRequantizationScale(
etor->weight_scales_,
input_qparam.scale,
1.0f /* output_scale */,
dequantization_scales);
ET_CHECK_OR_RETURN_ERROR(
e == Error::Ok, Internal, "GenerateRequantizationScale() failed");
    // Pad with a few extra entries so that reads past the logical end of the
    // scales stay within the allocation.
dequantization_scales.resize(dequantization_scales.size() + 4);
// Need to resize the quantized tensor to match the fp32 tensor sizes.
// This is conditional because the ranks do not always match (e.g. for the
// joiner of the ASR model).
if (etor->qinput_.dim() == rinput.dim()) {
resize(etor->qinput_, rinput.sizes());
}
e = QuantizePerTensor(
rinput, etor->qinput_, input_qparam.scale, input_qparam.zero_point);
ET_CHECK_OR_RETURN_ERROR(
e == Error::Ok, Internal, "QuantizePerTensor() failed");
size_t rows_weight = etor->bias_.size(0);
size_t rows_input = 1;
size_t cols_input = rinput.size(rinput.dim() - 1);
size_t cols_weight = etor->packed_weight_.get()->getInputChannels();
for (int i = 0; i < rinput.dim() - 1; ++i) {
rows_input *= rinput.size(i);
}
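    // The (possibly >2-D) input is treated as a 2-D matrix: all leading
    // dimensions fold into rows_input and the innermost dimension is
    // cols_input, e.g. a {2, 3, 64} input becomes a 6 x 64 matrix.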
ET_CHECK_OR_RETURN_ERROR(
cols_input == cols_weight,
Internal,
"Can not multiple matrices, size mismatch input[-1]: %zd, weight[1]: %zd",
cols_input,
cols_weight);
auto pthreadpool = torch::executorch::threadpool::get_pthreadpool();
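    // qnnpackLinearDynamic computes, per output element (sketch):
    //   output[m][n] = bias[n] + dequantization_scales[n] *
    //       sum_k (input[m][k] - input_zero_point) *
    //             (weights[n][k] - kernel_zero_points[n])
    // using the packed weights and the shared threadpool.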
enum pytorch_qnnp_status status = qnnpack::qnnpackLinearDynamic(
rows_input, /* const size_t batch_size */
cols_input, /* const size_t input_channels */
rows_weight, /* const size_t output_channels */
input_qparam.zero_point, /* const uint8_t input_zero_point */
etor->weight_zero_points_
.const_data_ptr<uint8_t>(), /* const uint8_t* kernel_zero_points */
dequantization_scales.data(), /* const float* dequantization_scales */
etor->qinput_.const_data_ptr<uint8_t>(), /* const uint8_t* input */
cols_input, /* const size_t input_stride */
etor->packed_weight_.get()
->getPackedWeights(), /* void* packed_weights */
etor->bias_.const_data_ptr<float>(), /* const float* bias */
output.mutable_data_ptr<float>(), /* float* output */
rows_weight, /* const size_t output_stride */
pthreadpool /* pthreadpool_t threadpool */
);
ET_CHECK_OR_RETURN_ERROR(
status == pytorch_qnnp_status_success,
Internal,
"qnnpackLinearDynamic failed");
return Error::Ok;
}
void destroy(DelegateHandle* handle) const override {
if (handle != nullptr) {
auto executor = static_cast<QNNExecutor*>(handle);
// QNNExecutor is not trivially destructible. Since this was constructed
// manually in init(), we must destroy it manually here.
executor->~QNNExecutor();
}
}
};
namespace {
auto cls = QnnpackBackend();
Backend backend{"QnnpackBackend", &cls};
static auto success_with_compiler = register_backend(backend);
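// Registering under the id "QnnpackBackend" lets the runtime route delegate
// calls whose backend id matches this string to the instance above.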
} // namespace
} // namespace executor
} // namespace torch