kernels/quantized/cpu/embeddingxb.cpp - platform/external/executorch - Git at Google

 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */

 #include <executorch/kernels/quantized/cpu/embeddingxb.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
 #include <cmath>

 namespace torch {
 namespace executor {
 namespace native {

 using Tensor = exec_aten::Tensor;
 using Scalar = exec_aten::Scalar;
 using ScalarType = exec_aten::ScalarType;

 namespace {

 /**
  * Extracts the `index`-th packed value from `w_data` and recenters it to a
  * signed integer.
  *
  * - 2-bit: four values per byte, value 0 in the two lowest bits; the stored
  *   field (0..3) is returned minus 2.
  * - 4-bit: two values per byte, even indices in the high nibble and odd
  *   indices in the low nibble; the stored field (0..15) is returned minus 8.
  *
  * Any other `weight_nbit` reaches the failing ET_CHECK_MSG at the end.
  */
 static inline int32_t
 weight_value(const unsigned char* w_data, int32_t index, int32_t weight_nbit) {
   if (weight_nbit == 2) {
     const int32_t slot = index % 4;
     // A negative index gives a negative remainder, which selects no field and
     // drops through to the check below.
     if (slot >= 0) {
       const int32_t field = (w_data[index >> 2] >> (2 * slot)) & 0x3;
       return field - 2;
     }
   } else if (weight_nbit == 4) {
     // Odd positions live in the low nibble, even positions in the high one.
     const int32_t shift = (index & 1) ? 0 : 4;
     const int32_t field = (w_data[index >> 1] >> shift) & 0x0F;
     return field - 8;
   }

   ET_CHECK_MSG(false, "invalid weight_nbit");
 }

 /**
  * Returns the number of unpacked embedding values held in `packed_dim` bytes
  * when each value occupies `weight_nbit` bits.
  *
  * Fails an ET_CHECK_MSG unless `weight_nbit` is a positive divisor of 8.
  */
 static inline int32_t get_embedding_dim(
     int32_t packed_dim,
     int32_t weight_nbit) {
   // Test positivity first: `8 % 0` is undefined behaviour, and a negative
   // divisor such as -1 would otherwise pass and produce a negative dimension.
   ET_CHECK_MSG(
       weight_nbit > 0 && 8 % weight_nbit == 0, "invalid embedding dim");
   const int32_t packed_values_per_byte = 8 / weight_nbit;
   return packed_dim * packed_values_per_byte;
 }

 /**
  * Validates the arguments of the x-bit embedding operators, failing an
  * ET_CHECK_MSG on the first violation:
  *  - `weight_nbit` is a positive divisor of 8;
  *  - `weight` is a 2D Byte tensor;
  *  - `weight_scales` is 1D or 2D, Float or Half, with size(0) equal to
  *    weight.size(0); when 2D, size(1) (the group count) is positive and
  *    divides the unpacked embedding dimension;
  *  - `out` is Float or Half and, if `out_dtype` is set, has that dtype;
  *  - `opt_weight_zero_points`, when present, has the same rank, sizes and
  *    dtype as `weight_scales`;
  *  - `indices` is Long;
  *  - `weight_quant_min <= weight_quant_max`.
  */
 void check_embedding_xbit_args(
     const Tensor& weight,
     const Tensor& weight_scales,
     const exec_aten::optional<Tensor>& opt_weight_zero_points,
     const int64_t weight_quant_min,
     const int64_t weight_quant_max,
     const Tensor& indices,
     exec_aten::optional<ScalarType> out_dtype,
     Tensor& out,
     int weight_nbit) {
   // Guard positivity before the modulo: `8 % 0` is undefined behaviour.
   ET_CHECK_MSG(
       weight_nbit > 0 && 8 % weight_nbit == 0, "nbit must divide 8");

   ET_CHECK_MSG(
       weight.dim() == 2, "weight must be 2D but got() %zd dims", weight.dim());

   ET_CHECK_MSG(
       weight_scales.dim() == 1 || weight_scales.dim() == 2,
       "weight_scales must be 1D or 2D but got() %zd dims",
       weight_scales.dim());

   // Arguments follow the format string: the expected count (weight.size(0))
   // first, then the number of scales actually supplied.
   ET_CHECK_MSG(
       weight_scales.size(0) == weight.size(0),
       "Number of scales must be == weight.size(0)=%zd"
       ", but got %zd",
       weight.size(0),
       weight_scales.size(0));

   if (weight_scales.dim() == 2) {
     auto num_groups = weight_scales.size(1);
     // A zero group count would make the modulo below divide by zero.
     ET_CHECK_MSG(
         num_groups > 0,
         "Number of groups must be positive, but got %zd",
         num_groups);
     ET_CHECK_MSG(
         // each 8b uint8 column is packed_values_per_byte columns
         get_embedding_dim(weight.size(1), weight_nbit) % num_groups == 0,
         "Number of groups must divide weight.size(1)=%zd"
         ", but got # of groups = %zd",
         weight.size(1),
         num_groups);
   }

   ET_CHECK_MSG(
       weight.scalar_type() == ScalarType::Byte,
       "weight.scalar_type() %" PRId8 " is not supported:",
       static_cast<int8_t>(weight.scalar_type()));

   ET_CHECK_MSG(
       out.scalar_type() == ScalarType::Float ||
           out.scalar_type() == ScalarType::Half,
       "out.scalar_type() %" PRId8 " is not supported:",
       static_cast<int8_t>(out.scalar_type()));

   ET_CHECK_MSG(
       weight_scales.scalar_type() == ScalarType::Float ||
           weight_scales.scalar_type() == ScalarType::Half,
       "weight_scales.scalar_type() %" PRId8 " is not supported:",
       static_cast<int8_t>(weight_scales.scalar_type()));

   if (opt_weight_zero_points.has_value()) {
     const Tensor& weight_zero_points = opt_weight_zero_points.value();

     ET_CHECK_MSG(
         weight_zero_points.dim() == weight_scales.dim(),
         "weight_zero_points's rank match that of weight_scales. "
         "weight_zero_points rank: %" PRId8 ", weight_scales rank: %" PRId8,
         static_cast<int8_t>(weight_zero_points.dim()),
         static_cast<int8_t>(weight_scales.dim()));

     // embedding_xbit_per_channel reads scales and zero points through the
     // same CTYPE_PARAMS element type, so the two tensors must share a dtype;
     // comparing against `out` would let a mismatched buffer be reinterpreted.
     ET_CHECK_MSG(
         weight_zero_points.scalar_type() == weight_scales.scalar_type(),
         "weight zero points scalar type %" PRId8
         " does not match weight_scales.scalar_type()",
         static_cast<int8_t>(weight_zero_points.scalar_type()));

     for (int32_t i = 0; i < weight_scales.dim(); ++i) {
       ET_CHECK_MSG(
           weight_zero_points.size(i) == weight_scales.size(i),
           "Dimension size mismatch at dim %" PRIi32
           ". Weight_zero_point size = %zd"
           ", weight_scales size = %zd.",
           i,
           weight_zero_points.size(i),
           weight_scales.size(i));
     }
   }

   ET_CHECK_MSG(
       indices.scalar_type() == ScalarType::Long,
       "indices.scalar_type() %" PRId8 " is not Long only Long is supported:",
       static_cast<int8_t>(indices.scalar_type()));

   ET_CHECK_MSG(
       weight_quant_min <= weight_quant_max,
       "weight quant min: %" PRId64
       " is greater than weight quant max: %" PRId64,
       weight_quant_min,
       weight_quant_max);

   if (out_dtype.has_value()) {
     ET_CHECK_MSG(
         out.scalar_type() == out_dtype.value(),
         "output_dtype must match the dtype of the out tensor");
   }
 }

 /**
  * Looks up the rows of `weight` selected by `indices`, dequantizes them and
  * writes them consecutively into `out`. `weight` is read as packed uint8.
  *
  * Each output element is (unpacked_value - zero_point) * scale, computed in
  * float and cast to CTYPE_OUT. Scales (and zero points, when present) are
  * read as CTYPE_PARAMS; a missing zero-point tensor means zero_point == 0.
  * With 2D `weight_scales`, each row is split into weight_scales.size(1)
  * equally sized groups, each with its own scale / zero point.
  *
  * Fails an ET_CHECK_MSG if the group count is not positive or if an index is
  * outside [0, weight.size(0)).
  */
 template <typename CTYPE_PARAMS, typename CTYPE_OUT>
 void embedding_xbit_per_channel(
     const Tensor& weight,
     const Tensor& weight_scales,
     const exec_aten::optional<Tensor>& opt_weight_zero_points,
     const Tensor& indices,
     Tensor& out,
     int weight_nbit) {
   const int32_t embedding_dim =
       get_embedding_dim(weight.size(1), weight_nbit);

   int32_t num_groups_per_channel = 1;
   if (weight_scales.dim() == 2) {
     num_groups_per_channel = weight_scales.size(1);
   }
   // Protects the division below from a zero-column scale tensor.
   ET_CHECK_MSG(
       num_groups_per_channel > 0,
       "Number of groups must be positive, but got %" PRId32,
       num_groups_per_channel);
   const int32_t group_size = embedding_dim / num_groups_per_channel;

   CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
   const int64_t* indices_ptr = indices.const_data_ptr<int64_t>();

   const CTYPE_PARAMS* scales = weight_scales.const_data_ptr<CTYPE_PARAMS>();
   const bool has_zero_points = opt_weight_zero_points.has_value();
   const CTYPE_PARAMS* zero_points = nullptr;
   if (has_zero_points) {
     zero_points = opt_weight_zero_points.value().const_data_ptr<CTYPE_PARAMS>();
   }

   // Loop-invariant weight geometry.
   const uint8_t* weight_data = weight.const_data_ptr<uint8_t>();
   const int64_t num_embeddings = weight.size(0);
   const int64_t packed_row_bytes = weight.size(1);
   const int64_t num_indices = indices.numel();

   for (int64_t i = 0; i < num_indices; ++i) {
     const int64_t index = indices_ptr[i];
     // An unchecked index would read past the weight / scale buffers.
     ET_CHECK_MSG(
         index >= 0 && index < num_embeddings,
         "index %" PRId64 " is out of range for %" PRId64 " embeddings",
         index,
         num_embeddings);

     // Offset of this row's first scale / zero point. Kept in 64 bits so a
     // large table times several groups cannot overflow.
     const int64_t qparams_index = index * num_groups_per_channel;
     CTYPE_PARAMS zp = 0.0;
     const CTYPE_PARAMS* scale_ptr = scales + qparams_index;
     const CTYPE_PARAMS* zero_points_ptr = nullptr;
     if (has_zero_points) {
       zero_points_ptr = zero_points + qparams_index;
     }

     const uint8_t* w_data = weight_data + packed_row_bytes * index;

     for (int32_t j = 0; j < embedding_dim; ++j) {
       const int32_t group_id = j / group_size;
       const CTYPE_PARAMS scale = scale_ptr[group_id];
       if (has_zero_points) {
         zp = zero_points_ptr[group_id];
       }
       out_data[j] = static_cast<CTYPE_OUT>(
           (static_cast<float>(weight_value(w_data, j, weight_nbit)) -
            static_cast<float>(zp)) *
           static_cast<float>(scale));
     }
     out_data += embedding_dim;
   }
 }

 void resize_out_tensor(
     const Tensor& weight,
     const Tensor& indices,
     Tensor& out,
     int weight_nbit) {
   exec_aten::SizesType expected_output_size[kTensorDimensionLimit];
   for (size_t i = 0; i < indices.dim(); i++) {
     expected_output_size[i] = indices.size(i);
   }
   const size_t embedding_dim = get_embedding_dim(weight.size(1), weight_nbit);
   expected_output_size[out.dim() - 1] = embedding_dim;

   exec_aten::ArrayRef<exec_aten::SizesType> output_size{
       expected_output_size, static_cast<size_t>(out.dim())};

   torch::executor::Error err = resize_tensor(out, output_size);
   ET_CHECK_MSG(
       err == torch::executor::Error::Ok,
       "Failed to resize out Tensor in quantized_embedding_xbit_out");
 }

 } // namespace

 /**
  * Retrieves the embeddings specified by indices, dequantizes them, and stores
  * them in out. The weight is quantized per channel, with a scale and zero_point
  * for each embedding.
  *
  * Corresponds as the out variant to torch.ops.quantized.embedding_xbit
  *
  * This overload reads the quantization parameters with the same element type
  * as `out`, so `weight_scales` must have out's dtype; a mismatch fails an
  * ET_CHECK_MSG. It does not resize `out`.
  *
  * NOTE: quant_min, quant_max, and Dtype are not used in computation, but rather
  * metadata that is passed around which can be useful for pattern matching. See
  * https://github.com/pytorch/pytorch/pull/87093#discussion_r1000841181 for more
  * info.
  */
 Tensor& quantized_embedding_xbit_out(
     // TODO Evaluate whether this name is appropriate for an operator that takes
     // non quant input and returns fp output
     const Tensor& weight,
     const Tensor& weight_scales,
     const exec_aten::optional<Tensor>& opt_weight_zero_points,
     const int64_t weight_quant_min,
     const int64_t weight_quant_max,
     const Tensor& indices,
     Tensor& out,
     int weight_nbit) {
   ScalarType out_type = out.scalar_type();

   // TODO (jakeszwe): improve these to account for the size of out in relation
   // to weight and indices accounting for a possible batch dimension
   check_embedding_xbit_args(
       weight,
       weight_scales,
       opt_weight_zero_points,
       weight_quant_min,
       weight_quant_max,
       indices,
       out_type,
       out,
       weight_nbit);

   // The kernel below is instantiated with CTYPE_PARAMS == CTYPE_OUT, i.e. the
   // scale buffer is read as out's element type. The shared argument check
   // only requires Float-or-Half scales, so reject a differing dtype here
   // instead of silently reinterpreting the buffer.
   ET_CHECK_MSG(
       weight_scales.scalar_type() == out_type,
       "weight_scales.scalar_type() %" PRId8
       " does not match out.scalar_type() %" PRId8,
       static_cast<int8_t>(weight_scales.scalar_type()),
       static_cast<int8_t>(out_type));

   constexpr auto name = "quantized_decomposed::embedding_xbit.out";
   ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() {
     embedding_xbit_per_channel<CTYPE_OUT, CTYPE_OUT>(
         weight,
         weight_scales,
         opt_weight_zero_points,
         indices,
         out,
         weight_nbit);
   });

   return out;
 }

 /**
  * Context-taking overload of quantized_embedding_xbit_out: calls
  * resize_out_tensor on `out`, then forwards every remaining argument to the
  * context-free overload and returns its result. `context` is not used.
  */
 Tensor& quantized_embedding_xbit_out(
     KernelRuntimeContext& context,
     const Tensor& weight,
     const Tensor& weight_scales,
     const exec_aten::optional<Tensor>& opt_weight_zero_points,
     int64_t weight_quant_min,
     int64_t weight_quant_max,
     const Tensor& indices,
     Tensor& out,
     int weight_nbit) {
   // TODO(larryliu): Add a context arg to the real op function and remove this
   // wrapper
   static_cast<void>(context);

   resize_out_tensor(weight, indices, out, weight_nbit);

   Tensor& result = quantized_embedding_xbit_out(
       weight, weight_scales, opt_weight_zero_points,
       weight_quant_min, weight_quant_max,
       indices, out, weight_nbit);
   return result;
 }

 /**
  * dtype_out variant of the x-bit embedding lookup. After validating the
  * arguments (including that `out` has `out_dtype` when it is set), dispatches
  * the kernel on two independent element types: the dtype of `weight_scales`
  * for the quantization parameters and the dtype of `out` for the result.
  * Returns `out`; does not resize it.
  */
 Tensor& quantized_embedding_xbit_dtype_out(
     // TODO Evaluate whether this name is appropriate for an operator that takes
     // non quant input and returns fp output
     const Tensor& weight,
     const Tensor& weight_scales,
     const exec_aten::optional<Tensor>& opt_weight_zero_points,
     const int64_t weight_quant_min,
     const int64_t weight_quant_max,
     const Tensor& indices,
     exec_aten::optional<ScalarType> out_dtype,
     Tensor& out,
     int weight_nbit) {
   const ScalarType scales_dtype = weight_scales.scalar_type();
   const ScalarType result_dtype = out.scalar_type();

   // TODO (jakeszwe): improve these to account for the size of out in relation
   // to weight and indices accounting for a possible batch dimension
   check_embedding_xbit_args(
       weight, weight_scales, opt_weight_zero_points,
       weight_quant_min, weight_quant_max,
       indices, out_dtype, out, weight_nbit);

   constexpr auto op_name = "quantized_decomposed::embedding_xbit.dtype_out";
   // Outer switch picks the parameter type, inner switch the output type.
   ET_SWITCH_TWO_TYPES(Float, Half, scales_dtype, ctx, op_name, CTYPE_P, [&]() {
     ET_SWITCH_TWO_TYPES(
         Float, Half, result_dtype, ctx, op_name, CTYPE_OUT, [&]() {
           embedding_xbit_per_channel<CTYPE_P, CTYPE_OUT>(
               weight, weight_scales, opt_weight_zero_points,
               indices, out, weight_nbit);
         });
   });

   return out;
 }

 /**
  * Context-taking overload of quantized_embedding_xbit_dtype_out: calls
  * resize_out_tensor on `out`, then forwards every remaining argument to the
  * context-free overload and returns its result. `context` is not used.
  */
 Tensor& quantized_embedding_xbit_dtype_out(
     KernelRuntimeContext& context,
     const Tensor& weight,
     const Tensor& weight_scales,
     const exec_aten::optional<Tensor>& opt_weight_zero_points,
     int64_t weight_quant_min,
     int64_t weight_quant_max,
     const Tensor& indices,
     exec_aten::optional<ScalarType> out_dtype,
     Tensor& out,
     int weight_nbit) {
   // TODO(larryliu): Add a context arg to the real op function and remove this
   // wrapper
   static_cast<void>(context);

   resize_out_tensor(weight, indices, out, weight_nbit);

   Tensor& result = quantized_embedding_xbit_dtype_out(
       weight, weight_scales, opt_weight_zero_points,
       weight_quant_min, weight_quant_max,
       indices, out_dtype, out, weight_nbit);
   return result;
 }

 } // namespace native
 } // namespace executor
 } // namespace torch
	/*
	* Copyright (c) Meta Platforms, Inc. and affiliates.
	* All rights reserved.
	*
	* This source code is licensed under the BSD-style license found in the
	* LICENSE file in the root directory of this source tree.
	*/

	#include <executorch/kernels/quantized/cpu/embeddingxb.h>
	#include <executorch/runtime/kernel/kernel_includes.h>
	#include <algorithm>
	#include <cassert>
	#include <cinttypes>
	#include <cmath>

	namespace torch {
	namespace executor {
	namespace native {

	using Tensor = exec_aten::Tensor;
	using Scalar = exec_aten::Scalar;
	using ScalarType = exec_aten::ScalarType;

	namespace {

	/**
	 * Extracts the `index`-th packed value from `w_data` and recenters it to a
	 * signed integer.
	 *
	 * - 2-bit: four values per byte, value 0 in the two lowest bits; the stored
	 *   field (0..3) is returned minus 2.
	 * - 4-bit: two values per byte, even indices in the high nibble and odd
	 *   indices in the low nibble; the stored field (0..15) is returned minus 8.
	 *
	 * Any other `weight_nbit` reaches the failing ET_CHECK_MSG at the end.
	 */
	static inline int32_t
	weight_value(const unsigned char* w_data, int32_t index, int32_t weight_nbit) {
	  if (weight_nbit == 2) {
	    const int32_t slot = index % 4;
	    // A negative index gives a negative remainder, which selects no field and
	    // drops through to the check below.
	    if (slot >= 0) {
	      const int32_t field = (w_data[index >> 2] >> (2 * slot)) & 0x3;
	      return field - 2;
	    }
	  } else if (weight_nbit == 4) {
	    // Odd positions live in the low nibble, even positions in the high one.
	    const int32_t shift = (index & 1) ? 0 : 4;
	    const int32_t field = (w_data[index >> 1] >> shift) & 0x0F;
	    return field - 8;
	  }

	  ET_CHECK_MSG(false, "invalid weight_nbit");
	}

	/**
	 * Returns the number of unpacked embedding values held in `packed_dim` bytes
	 * when each value occupies `weight_nbit` bits.
	 *
	 * Fails an ET_CHECK_MSG unless `weight_nbit` is a positive divisor of 8.
	 */
	static inline int32_t get_embedding_dim(
	    int32_t packed_dim,
	    int32_t weight_nbit) {
	  // Test positivity first: `8 % 0` is undefined behaviour, and a negative
	  // divisor such as -1 would otherwise pass and produce a negative dimension.
	  ET_CHECK_MSG(
	      weight_nbit > 0 && 8 % weight_nbit == 0, "invalid embedding dim");
	  const int32_t packed_values_per_byte = 8 / weight_nbit;
	  return packed_dim * packed_values_per_byte;
	}

	/**
	 * Validates the arguments of the x-bit embedding operators, failing an
	 * ET_CHECK_MSG on the first violation:
	 *  - `weight_nbit` is a positive divisor of 8;
	 *  - `weight` is a 2D Byte tensor;
	 *  - `weight_scales` is 1D or 2D, Float or Half, with size(0) equal to
	 *    weight.size(0); when 2D, size(1) (the group count) is positive and
	 *    divides the unpacked embedding dimension;
	 *  - `out` is Float or Half and, if `out_dtype` is set, has that dtype;
	 *  - `opt_weight_zero_points`, when present, has the same rank, sizes and
	 *    dtype as `weight_scales`;
	 *  - `indices` is Long;
	 *  - `weight_quant_min <= weight_quant_max`.
	 */
	void check_embedding_xbit_args(
	    const Tensor& weight,
	    const Tensor& weight_scales,
	    const exec_aten::optional<Tensor>& opt_weight_zero_points,
	    const int64_t weight_quant_min,
	    const int64_t weight_quant_max,
	    const Tensor& indices,
	    exec_aten::optional<ScalarType> out_dtype,
	    Tensor& out,
	    int weight_nbit) {
	  // Guard positivity before the modulo: `8 % 0` is undefined behaviour.
	  ET_CHECK_MSG(
	      weight_nbit > 0 && 8 % weight_nbit == 0, "nbit must divide 8");

	  ET_CHECK_MSG(
	      weight.dim() == 2, "weight must be 2D but got() %zd dims", weight.dim());

	  ET_CHECK_MSG(
	      weight_scales.dim() == 1 || weight_scales.dim() == 2,
	      "weight_scales must be 1D or 2D but got() %zd dims",
	      weight_scales.dim());

	  // Arguments follow the format string: the expected count (weight.size(0))
	  // first, then the number of scales actually supplied.
	  ET_CHECK_MSG(
	      weight_scales.size(0) == weight.size(0),
	      "Number of scales must be == weight.size(0)=%zd"
	      ", but got %zd",
	      weight.size(0),
	      weight_scales.size(0));

	  if (weight_scales.dim() == 2) {
	    auto num_groups = weight_scales.size(1);
	    // A zero group count would make the modulo below divide by zero.
	    ET_CHECK_MSG(
	        num_groups > 0,
	        "Number of groups must be positive, but got %zd",
	        num_groups);
	    ET_CHECK_MSG(
	        // each 8b uint8 column is packed_values_per_byte columns
	        get_embedding_dim(weight.size(1), weight_nbit) % num_groups == 0,
	        "Number of groups must divide weight.size(1)=%zd"
	        ", but got # of groups = %zd",
	        weight.size(1),
	        num_groups);
	  }

	  ET_CHECK_MSG(
	      weight.scalar_type() == ScalarType::Byte,
	      "weight.scalar_type() %" PRId8 " is not supported:",
	      static_cast<int8_t>(weight.scalar_type()));

	  ET_CHECK_MSG(
	      out.scalar_type() == ScalarType::Float ||
	          out.scalar_type() == ScalarType::Half,
	      "out.scalar_type() %" PRId8 " is not supported:",
	      static_cast<int8_t>(out.scalar_type()));

	  ET_CHECK_MSG(
	      weight_scales.scalar_type() == ScalarType::Float ||
	          weight_scales.scalar_type() == ScalarType::Half,
	      "weight_scales.scalar_type() %" PRId8 " is not supported:",
	      static_cast<int8_t>(weight_scales.scalar_type()));

	  if (opt_weight_zero_points.has_value()) {
	    const Tensor& weight_zero_points = opt_weight_zero_points.value();

	    ET_CHECK_MSG(
	        weight_zero_points.dim() == weight_scales.dim(),
	        "weight_zero_points's rank match that of weight_scales. "
	        "weight_zero_points rank: %" PRId8 ", weight_scales rank: %" PRId8,
	        static_cast<int8_t>(weight_zero_points.dim()),
	        static_cast<int8_t>(weight_scales.dim()));

	    // embedding_xbit_per_channel reads scales and zero points through the
	    // same CTYPE_PARAMS element type, so the two tensors must share a dtype;
	    // comparing against `out` would let a mismatched buffer be reinterpreted.
	    ET_CHECK_MSG(
	        weight_zero_points.scalar_type() == weight_scales.scalar_type(),
	        "weight zero points scalar type %" PRId8
	        " does not match weight_scales.scalar_type()",
	        static_cast<int8_t>(weight_zero_points.scalar_type()));

	    for (int32_t i = 0; i < weight_scales.dim(); ++i) {
	      ET_CHECK_MSG(
	          weight_zero_points.size(i) == weight_scales.size(i),
	          "Dimension size mismatch at dim %" PRIi32
	          ". Weight_zero_point size = %zd"
	          ", weight_scales size = %zd.",
	          i,
	          weight_zero_points.size(i),
	          weight_scales.size(i));
	    }
	  }

	  ET_CHECK_MSG(
	      indices.scalar_type() == ScalarType::Long,
	      "indices.scalar_type() %" PRId8 " is not Long only Long is supported:",
	      static_cast<int8_t>(indices.scalar_type()));

	  ET_CHECK_MSG(
	      weight_quant_min <= weight_quant_max,
	      "weight quant min: %" PRId64
	      " is greater than weight quant max: %" PRId64,
	      weight_quant_min,
	      weight_quant_max);

	  if (out_dtype.has_value()) {
	    ET_CHECK_MSG(
	        out.scalar_type() == out_dtype.value(),
	        "output_dtype must match the dtype of the out tensor");
	  }
	}

	/**
	 * Looks up the rows of `weight` selected by `indices`, dequantizes them and
	 * writes them consecutively into `out`. `weight` is read as packed uint8.
	 *
	 * Each output element is (unpacked_value - zero_point) * scale, computed in
	 * float and cast to CTYPE_OUT. Scales (and zero points, when present) are
	 * read as CTYPE_PARAMS; a missing zero-point tensor means zero_point == 0.
	 * With 2D `weight_scales`, each row is split into weight_scales.size(1)
	 * equally sized groups, each with its own scale / zero point.
	 *
	 * Fails an ET_CHECK_MSG if the group count is not positive or if an index is
	 * outside [0, weight.size(0)).
	 */
	template <typename CTYPE_PARAMS, typename CTYPE_OUT>
	void embedding_xbit_per_channel(
	    const Tensor& weight,
	    const Tensor& weight_scales,
	    const exec_aten::optional<Tensor>& opt_weight_zero_points,
	    const Tensor& indices,
	    Tensor& out,
	    int weight_nbit) {
	  const int32_t embedding_dim =
	      get_embedding_dim(weight.size(1), weight_nbit);

	  int32_t num_groups_per_channel = 1;
	  if (weight_scales.dim() == 2) {
	    num_groups_per_channel = weight_scales.size(1);
	  }
	  // Protects the division below from a zero-column scale tensor.
	  ET_CHECK_MSG(
	      num_groups_per_channel > 0,
	      "Number of groups must be positive, but got %" PRId32,
	      num_groups_per_channel);
	  const int32_t group_size = embedding_dim / num_groups_per_channel;

	  CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
	  const int64_t* indices_ptr = indices.const_data_ptr<int64_t>();

	  const CTYPE_PARAMS* scales = weight_scales.const_data_ptr<CTYPE_PARAMS>();
	  const bool has_zero_points = opt_weight_zero_points.has_value();
	  const CTYPE_PARAMS* zero_points = nullptr;
	  if (has_zero_points) {
	    zero_points = opt_weight_zero_points.value().const_data_ptr<CTYPE_PARAMS>();
	  }

	  // Loop-invariant weight geometry.
	  const uint8_t* weight_data = weight.const_data_ptr<uint8_t>();
	  const int64_t num_embeddings = weight.size(0);
	  const int64_t packed_row_bytes = weight.size(1);
	  const int64_t num_indices = indices.numel();

	  for (int64_t i = 0; i < num_indices; ++i) {
	    const int64_t index = indices_ptr[i];
	    // An unchecked index would read past the weight / scale buffers.
	    ET_CHECK_MSG(
	        index >= 0 && index < num_embeddings,
	        "index %" PRId64 " is out of range for %" PRId64 " embeddings",
	        index,
	        num_embeddings);

	    // Offset of this row's first scale / zero point. Kept in 64 bits so a
	    // large table times several groups cannot overflow.
	    const int64_t qparams_index = index * num_groups_per_channel;
	    CTYPE_PARAMS zp = 0.0;
	    const CTYPE_PARAMS* scale_ptr = scales + qparams_index;
	    const CTYPE_PARAMS* zero_points_ptr = nullptr;
	    if (has_zero_points) {
	      zero_points_ptr = zero_points + qparams_index;
	    }

	    const uint8_t* w_data = weight_data + packed_row_bytes * index;

	    for (int32_t j = 0; j < embedding_dim; ++j) {
	      const int32_t group_id = j / group_size;
	      const CTYPE_PARAMS scale = scale_ptr[group_id];
	      if (has_zero_points) {
	        zp = zero_points_ptr[group_id];
	      }
	      out_data[j] = static_cast<CTYPE_OUT>(
	          (static_cast<float>(weight_value(w_data, j, weight_nbit)) -
	           static_cast<float>(zp)) *
	          static_cast<float>(scale));
	    }
	    out_data += embedding_dim;
	  }
	}

	void resize_out_tensor(
	const Tensor& weight,
	const Tensor& indices,
	Tensor& out,
	int weight_nbit) {
	exec_aten::SizesType expected_output_size[kTensorDimensionLimit];
	for (size_t i = 0; i < indices.dim(); i++) {
	expected_output_size[i] = indices.size(i);
	}
	const size_t embedding_dim = get_embedding_dim(weight.size(1), weight_nbit);
	expected_output_size[out.dim() - 1] = embedding_dim;

	exec_aten::ArrayRef<exec_aten::SizesType> output_size{
	expected_output_size, static_cast<size_t>(out.dim())};

	torch::executor::Error err = resize_tensor(out, output_size);
	ET_CHECK_MSG(
	err == torch::executor::Error::Ok,
	"Failed to resize out Tensor in quantized_embedding_xbit_out");
	}

	} // namespace

	/**
	 * Retrieves the embeddings specified by indices, dequantizes them, and stores
	 * them in out. The weight is quantized per channel, with a scale and zero_point
	 * for each embedding.
	 *
	 * Corresponds as the out variant to torch.ops.quantized.embedding_xbit
	 *
	 * This overload reads the quantization parameters with the same element type
	 * as `out`, so `weight_scales` must have out's dtype; a mismatch fails an
	 * ET_CHECK_MSG. It does not resize `out`.
	 *
	 * NOTE: quant_min, quant_max, and Dtype are not used in computation, but rather
	 * metadata that is passed around which can be useful for pattern matching. See
	 * https://github.com/pytorch/pytorch/pull/87093#discussion_r1000841181 for more
	 * info.
	 */
	Tensor& quantized_embedding_xbit_out(
	    // TODO Evaluate whether this name is appropriate for an operator that takes
	    // non quant input and returns fp output
	    const Tensor& weight,
	    const Tensor& weight_scales,
	    const exec_aten::optional<Tensor>& opt_weight_zero_points,
	    const int64_t weight_quant_min,
	    const int64_t weight_quant_max,
	    const Tensor& indices,
	    Tensor& out,
	    int weight_nbit) {
	  ScalarType out_type = out.scalar_type();

	  // TODO (jakeszwe): improve these to account for the size of out in relation
	  // to weight and indices accounting for a possible batch dimension
	  check_embedding_xbit_args(
	      weight,
	      weight_scales,
	      opt_weight_zero_points,
	      weight_quant_min,
	      weight_quant_max,
	      indices,
	      out_type,
	      out,
	      weight_nbit);

	  // The kernel below is instantiated with CTYPE_PARAMS == CTYPE_OUT, i.e. the
	  // scale buffer is read as out's element type. The shared argument check
	  // only requires Float-or-Half scales, so reject a differing dtype here
	  // instead of silently reinterpreting the buffer.
	  ET_CHECK_MSG(
	      weight_scales.scalar_type() == out_type,
	      "weight_scales.scalar_type() %" PRId8
	      " does not match out.scalar_type() %" PRId8,
	      static_cast<int8_t>(weight_scales.scalar_type()),
	      static_cast<int8_t>(out_type));

	  constexpr auto name = "quantized_decomposed::embedding_xbit.out";
	  ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() {
	    embedding_xbit_per_channel<CTYPE_OUT, CTYPE_OUT>(
	        weight,
	        weight_scales,
	        opt_weight_zero_points,
	        indices,
	        out,
	        weight_nbit);
	  });

	  return out;
	}

	/**
	 * Context-taking overload of quantized_embedding_xbit_out: calls
	 * resize_out_tensor on `out`, then forwards every remaining argument to the
	 * context-free overload and returns its result. `context` is not used.
	 */
	Tensor& quantized_embedding_xbit_out(
	    KernelRuntimeContext& context,
	    const Tensor& weight,
	    const Tensor& weight_scales,
	    const exec_aten::optional<Tensor>& opt_weight_zero_points,
	    int64_t weight_quant_min,
	    int64_t weight_quant_max,
	    const Tensor& indices,
	    Tensor& out,
	    int weight_nbit) {
	  // TODO(larryliu): Add a context arg to the real op function and remove this
	  // wrapper
	  static_cast<void>(context);

	  resize_out_tensor(weight, indices, out, weight_nbit);

	  Tensor& result = quantized_embedding_xbit_out(
	      weight, weight_scales, opt_weight_zero_points,
	      weight_quant_min, weight_quant_max,
	      indices, out, weight_nbit);
	  return result;
	}

	/**
	 * dtype_out variant of the x-bit embedding lookup. After validating the
	 * arguments (including that `out` has `out_dtype` when it is set), dispatches
	 * the kernel on two independent element types: the dtype of `weight_scales`
	 * for the quantization parameters and the dtype of `out` for the result.
	 * Returns `out`; does not resize it.
	 */
	Tensor& quantized_embedding_xbit_dtype_out(
	    // TODO Evaluate whether this name is appropriate for an operator that takes
	    // non quant input and returns fp output
	    const Tensor& weight,
	    const Tensor& weight_scales,
	    const exec_aten::optional<Tensor>& opt_weight_zero_points,
	    const int64_t weight_quant_min,
	    const int64_t weight_quant_max,
	    const Tensor& indices,
	    exec_aten::optional<ScalarType> out_dtype,
	    Tensor& out,
	    int weight_nbit) {
	  const ScalarType scales_dtype = weight_scales.scalar_type();
	  const ScalarType result_dtype = out.scalar_type();

	  // TODO (jakeszwe): improve these to account for the size of out in relation
	  // to weight and indices accounting for a possible batch dimension
	  check_embedding_xbit_args(
	      weight, weight_scales, opt_weight_zero_points,
	      weight_quant_min, weight_quant_max,
	      indices, out_dtype, out, weight_nbit);

	  constexpr auto op_name = "quantized_decomposed::embedding_xbit.dtype_out";
	  // Outer switch picks the parameter type, inner switch the output type.
	  ET_SWITCH_TWO_TYPES(Float, Half, scales_dtype, ctx, op_name, CTYPE_P, [&]() {
	    ET_SWITCH_TWO_TYPES(
	        Float, Half, result_dtype, ctx, op_name, CTYPE_OUT, [&]() {
	          embedding_xbit_per_channel<CTYPE_P, CTYPE_OUT>(
	              weight, weight_scales, opt_weight_zero_points,
	              indices, out, weight_nbit);
	        });
	  });

	  return out;
	}

	/**
	 * Context-taking overload of quantized_embedding_xbit_dtype_out: calls
	 * resize_out_tensor on `out`, then forwards every remaining argument to the
	 * context-free overload and returns its result. `context` is not used.
	 */
	Tensor& quantized_embedding_xbit_dtype_out(
	    KernelRuntimeContext& context,
	    const Tensor& weight,
	    const Tensor& weight_scales,
	    const exec_aten::optional<Tensor>& opt_weight_zero_points,
	    int64_t weight_quant_min,
	    int64_t weight_quant_max,
	    const Tensor& indices,
	    exec_aten::optional<ScalarType> out_dtype,
	    Tensor& out,
	    int weight_nbit) {
	  // TODO(larryliu): Add a context arg to the real op function and remove this
	  // wrapper
	  static_cast<void>(context);

	  resize_out_tensor(weight, indices, out, weight_nbit);

	  Tensor& result = quantized_embedding_xbit_dtype_out(
	      weight, weight_scales, opt_weight_zero_points,
	      weight_quant_min, weight_quant_max,
	      indices, out_dtype, out, weight_nbit);
	  return result;
	}

	} // namespace native
	} // namespace executor
	} // namespace torch