/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/qnnpack/executor/QNNExecutor.h>
#include <executorch/backends/qnnpack/qnnpack_schema_generated.h>
#include <executorch/backends/qnnpack/utils/utils.h>
#include <executorch/extension/fb/threadpool/threadpool.h>
#include <executorch/runtime/backend/backend_registry.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/platform/profiler.h>
#include <executorch/util/memory_utils.h>
#include <pytorch_qnnpack.h>
#include <qnnpack_func.h>
#include <string>

namespace torch {
namespace executor {

// On x86, the bias tensor data is loaded using the 128-bit MOVAPS instruction
// ("Move Aligned Packed Single-Precision Floating-Point Values"), which will
// generate an exception if it does not receive 16-byte-aligned data.
static constexpr size_t kTensorDataAlignment = 16;

namespace {
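// Allocates a TensorImpl with the given sizes and dtype from
// runtime_allocator, along with a contiguous dim order and strides, plus an
// extra pad_bytes of data storage for kernels that read past the tensor's
// end. On success, *tensor_impl_ptr points at the new TensorImpl.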
Error allocate_tensor(
    const size_t ndim,
    const exec_aten::SizesType* sizes,
    const ScalarType type,
    MemoryAllocator* runtime_allocator,
    const size_t pad_bytes,
    TensorImpl** tensor_impl_ptr) {
  exec_aten::SizesType* tensor_sizes = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
      runtime_allocator, exec_aten::SizesType, ndim);
  std::memcpy(tensor_sizes, sizes, ndim * sizeof(exec_aten::SizesType));
  // Strides are not strictly required here, but resizing modifies them. The
  // TensorImpl constructor does accept nullptr strides, so the resize
  // implementation would otherwise have to account for that difference;
  // allocating strides up front keeps resizing simple.
  exec_aten::DimOrderType* tensor_dim_order = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
      runtime_allocator, exec_aten::DimOrderType, ndim);
  exec_aten::StridesType* tensor_strides = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
      runtime_allocator, exec_aten::StridesType, ndim);
  for (size_t i = 0; i < ndim; ++i) {
    tensor_dim_order[i] = static_cast<exec_aten::DimOrderType>(i);
  }
  // Compute contiguous (row-major) strides. Guard against ndim == 0, which
  // would otherwise index tensor_strides out of bounds.
  if (ndim > 0) {
    tensor_strides[ndim - 1] = 1;
    for (size_t i = ndim - 1; i > 0; --i) {
      // Treat sizes[i] == 0 as 1 to stay consistent with core PyTorch.
      if (sizes[i] == 0) {
        tensor_strides[i - 1] = tensor_strides[i];
      } else {
        tensor_strides[i - 1] = tensor_strides[i] * sizes[i];
      }
    }
  }
  auto tensor_impl =
      ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, TensorImpl);
  new (tensor_impl) TensorImpl(
      type,
      ndim,
      tensor_sizes,
      nullptr,
      tensor_dim_order,
      tensor_strides,
      TensorShapeDynamism::DYNAMIC_BOUND);
  size_t nbytes = tensor_impl->nbytes();
  void* tensor_storage =
      runtime_allocator->allocate(nbytes + pad_bytes, kTensorDataAlignment);
  if (tensor_storage == nullptr) {
    return Error::MemoryAllocationFailed;
  }
  tensor_impl->set_data(tensor_storage);
  *tensor_impl_ptr = tensor_impl;
  return Error::Ok;
}

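// Same as allocate_tensor(), but also copies nbytes() of `data` into the
// newly allocated storage. Used for the constant tensors (bias, weight
// zero-points, weight scales) carried in the delegate payload.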
Error allocate_and_copy_tensor(
    const size_t ndim,
    const exec_aten::SizesType* sizes,
    const void* data,
    const ScalarType type,
    MemoryAllocator* runtime_allocator,
    const size_t pad_bytes,
    TensorImpl** tensor_impl_ptr) {
  ET_CHECK_MSG(
      *tensor_impl_ptr == nullptr,
      "Tensor impl pointer must be initialized to nullptr");
  ET_CHECK_MSG(
      allocate_tensor(
          ndim, sizes, type, runtime_allocator, pad_bytes, tensor_impl_ptr) ==
          Error::Ok,
      "Could not allocate tensor in QNNPACK backend.");
  TensorImpl* tensor_impl = *tensor_impl_ptr;
  std::memcpy(tensor_impl->mutable_data(), data, tensor_impl->nbytes());
  return Error::Ok;
}
} // namespace

using namespace qnnpack_utils;

class QnnpackBackend final : public PyTorchBackendInterface {
 public:
  ~QnnpackBackend() = default;

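  // The backend is available only if QNNPACK initializes successfully on
  // this platform.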
  bool is_available() const override {
    return pytorch_qnnp_status_success == pytorch_qnnp_initialize();
  }

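  // Deserializes the QNNDynamicLinear flatbuffer in `processed`, copies its
  // constant tensors into runtime-allocated storage, pre-packs the quantized
  // weights, and wraps everything in a QNNExecutor for execute() to run.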
  Result<DelegateHandle*> init(
      FreeableBuffer* processed,
      ArrayRef<CompileSpec> compile_specs,
      MemoryAllocator* runtime_allocator) const override {
    auto dynamic_linear = fb_qnnpack::GetQNNDynamicLinear(processed->data());
    auto bias = dynamic_linear->bias();

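    // Extra bytes allocated around tensor data: QNNPACK's vectorized kernels
    // may read slightly outside the exact tensor bounds, and this padding
    // keeps those reads inside the allocation.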
    constexpr size_t pre_pad_bytes = 16;
    // Create + copy Bias Tensor
    TensorImpl* bias_buf = nullptr;
    allocate_and_copy_tensor(
        bias->shape()->size(),
        bias->shape()->data(),
        bias->buffer()->data(),
        ScalarType::Float,
        runtime_allocator,
        pre_pad_bytes,
        &bias_buf);

    // Create + copy Weight Zero-Points Tensor
    auto weights_zp = dynamic_linear->weights_zero_point();
    TensorImpl* zp_buf = nullptr;
    allocate_and_copy_tensor(
        weights_zp->shape()->size(),
        weights_zp->shape()->data(),
        weights_zp->buffer()->data(),
        ScalarType::QUInt8,
        runtime_allocator,
        0,
        &zp_buf);

    // Create + copy Weight Scales Tensor
    auto weights_scale = dynamic_linear->weights_scale();
    TensorImpl* scale_buf = nullptr;
    allocate_and_copy_tensor(
        weights_scale->shape()->size(),
        weights_scale->shape()->data(),
        weights_scale->buffer()->data(),
        ScalarType::Float,
        runtime_allocator,
        0,
        &scale_buf);

    // Create Quantized Input Tensor
    auto input_shape = dynamic_linear->input_shape();
    TensorImpl* qinput_buf = nullptr;
    Error err = allocate_tensor(
        input_shape->size(),
        input_shape->data(),
        ScalarType::QUInt8,
        runtime_allocator,
        // Pre-padding for QNNPACK; see the set_data() offset below.
        pre_pad_bytes,
        &qinput_buf);
    if (err != Error::Ok) {
      return err;
    }
    // Shift the data pointer forward so the padding sits before the tensor
    // data, which is where QNNPACK needs it for the input.
    qinput_buf->set_data(
        static_cast<uint8_t*>(qinput_buf->mutable_data()) + pre_pad_bytes);

    // Pre-pack the quantized weights into QNNPACK's blocked layout so that
    // execute() can run the dynamic linear kernel without repacking.
    auto weights = dynamic_linear->weights();
    auto packed_weights = std::make_unique<qnnpack::PackBMatrix>(
        weights->shape()->Get(0), /* input_channels */
        weights->shape()->Get(1), /* output_channels */
        weights_zp->buffer()->data(), /* kernel_zero_points */
        reinterpret_cast<const float*>(
            weights_scale->buffer()->data()), /* requantization_scales */
        weights->buffer()->data(), /* kernel */
        nullptr /* bias */
    );

    // Create QNNExecutor
    QNNExecutor* executor =
        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, QNNExecutor);

    // NOTE: Since we use placement new and since this type is not trivially
    // destructible, we must call the destructor manually in destroy().
    new (executor) QNNExecutor(
        std::move(packed_weights), bias_buf, qinput_buf, scale_buf, zp_buf);

    // TODO(T144120904): Remove this MMAP block once all users switch to
    // MmapDataLoader.
#if defined(ET_MMAP_SUPPORTED)
    torch::executor::util::mark_memory_as_unused(
        const_cast<void*>(processed->data()), processed->size());
#endif
    processed->Free();

    return executor;
  }

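  // Runs the dynamic quantized linear op: quantizes the fp32 input
  // per-tensor on the fly, multiplies it against the pre-packed quantized
  // weights, and writes a dequantized fp32 output with the bias added.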
  Error execute(DelegateHandle* handle, EValue** args) const override {
    static constexpr size_t kMaxDims = 16;

    QNNExecutor* etor = static_cast<QNNExecutor*>(handle);

    const Tensor rinput = args[0]->toTensor();
    ET_CHECK_OR_RETURN_ERROR(
        rinput.dim() <= kMaxDims,
        Internal,
        "rinput.dim() %u > kMaxDims %zu",
        (unsigned int)rinput.dim(),
        kMaxDims);
    Tensor::SizesType expected_output_size[kMaxDims];
    for (int32_t i = 0; i < rinput.dim() - 1; ++i) {
      expected_output_size[i] = rinput.size(i);
    }
    expected_output_size[rinput.dim() - 1] = etor->bias_.size(0);

    Tensor output = args[1]->toTensor();
    auto error = resize_tensor(
        output, {expected_output_size, static_cast<size_t>(rinput.dim())});
    if (error != Error::Ok) {
      std::string message("Failed to resize output tensor to size {");
      for (int32_t i = 0; i < rinput.dim(); i++) {
        if (i > 0) {
          message += ", ";
        }
        message += std::to_string(expected_output_size[i]);
      }
      message += "}";
      ET_CHECK_MSG(false, "%s", message.c_str());
    }

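    // Dynamic quantization: derive an asymmetric uint8 scale/zero-point for
    // the input from its observed min/max.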
    float rinput_min, rinput_max;
    std::tie(rinput_min, rinput_max) = GetMinMax(rinput);
    QuantizationParams input_qparam;

    uint8_t qmin = std::numeric_limits<uint8_t>::min();
    uint8_t qmax = std::numeric_limits<uint8_t>::max();
    Error e = ChooseQuantizationParams(
        rinput_min,
        rinput_max,
        qmin,
        qmax,
        input_qparam,
        false, /* preserve_sparsity */
        false, /* force_scale_power_of_two */
        false /* reduce_range */
    );
    ET_CHECK_OR_RETURN_ERROR(
        e == Error::Ok, Internal, "ChooseQuantizationParams() failed");

    ET_CHECK_OR_RETURN_ERROR(
        input_qparam.zero_point <= qmax && input_qparam.zero_point >= qmin,
        Internal,
        "ChooseQuantizationParams() selected invalid input_zero_point: %d",
        input_qparam.zero_point);

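    // Fold the per-channel weight scales together with the input scale
    // (output_scale is fixed at 1.0f) so that the int32 accumulators can be
    // dequantized directly to fp32.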
    std::vector<float> dequantization_scales;
    e = GenerateRequantizationScale(
        etor->weight_scales_,
        input_qparam.scale,
        1.0f /* output_scale */,
        dequantization_scales);

    ET_CHECK_OR_RETURN_ERROR(
        e == Error::Ok, Internal, "GenerateRequantizationScale() failed");

    // Pad the scales with a few extra entries so that QNNPACK's vectorized
    // loads past the logical end of the array stay in bounds.
    dequantization_scales.resize(dequantization_scales.size() + 4);

    // Resize the quantized tensor to match the fp32 tensor's sizes. This has
    // to be conditional because only some inputs (e.g. the joiner of the ASR
    // model) have a rank that matches the preallocated quantized tensor.
    if (etor->qinput_.dim() == rinput.dim()) {
      resize(etor->qinput_, rinput.sizes());
    }
    e = QuantizePerTensor(
        rinput, etor->qinput_, input_qparam.scale, input_qparam.zero_point);

    ET_CHECK_OR_RETURN_ERROR(
        e == Error::Ok, Internal, "QuantizePerTensor() failed");

    size_t rows_weight = etor->bias_.size(0);
    size_t rows_input = 1;
    size_t cols_input = rinput.size(rinput.dim() - 1);
    size_t cols_weight = etor->packed_weight_->getInputChannels();
    for (int i = 0; i < rinput.dim() - 1; ++i) {
      rows_input *= rinput.size(i);
    }

    ET_CHECK_OR_RETURN_ERROR(
        cols_input == cols_weight,
        Internal,
        "Cannot multiply matrices; size mismatch input[-1]: %zu, weight input_channels: %zu",
        cols_input,
        cols_weight);

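    // Run the dynamically quantized GEMM: the input is treated as a
    // (rows_input x cols_input) uint8 matrix, multiplied by the pre-packed
    // weights, with the fp32 bias added and the fp32 result written to
    // output. Work is parallelized over the shared threadpool.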
    auto pthreadpool = torch::executorch::threadpool::get_pthreadpool();
    enum pytorch_qnnp_status status = qnnpack::qnnpackLinearDynamic(
        rows_input, /* const size_t batch_size */
        cols_input, /* const size_t input_channels */
        rows_weight, /* const size_t output_channels */
        input_qparam.zero_point, /* const uint8_t input_zero_point */
        etor->weight_zero_points_
            .const_data_ptr<uint8_t>(), /* const uint8_t* kernel_zero_points */
        dequantization_scales.data(), /* const float* dequantization_scales */
        etor->qinput_.const_data_ptr<uint8_t>(), /* const uint8_t* input */
        cols_input, /* const size_t input_stride */
        etor->packed_weight_->getPackedWeights(), /* void* packed_weights */
        etor->bias_.const_data_ptr<float>(), /* const float* bias */
        output.mutable_data_ptr<float>(), /* float* output */
        rows_weight, /* const size_t output_stride */
        pthreadpool /* pthreadpool_t threadpool */
    );

    ET_CHECK_OR_RETURN_ERROR(
        status == pytorch_qnnp_status_success,
        Internal,
        "qnnpackLinearDynamic() failed");

    return Error::Ok;
  }

  void destroy(DelegateHandle* handle) const override {
    if (handle != nullptr) {
      auto executor = static_cast<QNNExecutor*>(handle);
      // QNNExecutor is not trivially destructible. Since this was constructed
      // manually in init(), we must destroy it manually here.
      executor->~QNNExecutor();
    }
  }
};

namespace {
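// Statically register this backend under the name "QnnpackBackend" so that
// delegate payloads compiled for it are routed here at load time.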
auto cls = QnnpackBackend();
Backend backend{"QnnpackBackend", &cls};
static auto success_with_compiler = register_backend(backend);
} // namespace

} // namespace executor
} // namespace torch