Roll back XNNPack delegate support for delegating the grouped conv2d op.

PiperOrigin-RevId: 436813859
diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
index f18e7f8..837e102 100644
--- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
@@ -105,36 +105,6 @@
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
-  std::random_device random_device;
-  auto rng = std::mt19937(random_device());
-  auto batch_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 4), std::ref(rng));
-  auto input_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(5, 25), std::ref(rng));
-  auto channel_per_group_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
-  auto groups_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 8), std::ref(rng));
-
-  auto groups = groups_rng();
-  Conv2DTester()
-      .BatchSize(batch_rng())
-      .InputHeight(input_rng())
-      .InputWidth(input_rng())
-      .InputChannels(groups * channel_per_group_rng())
-      .OutputChannels(groups * channel_per_group_rng())
-      .Groups(groups)
-      .KernelHeight(3)
-      .KernelWidth(3)
-      .SamePadding()
-      .Test(xnnpack_delegate.get());
-}
-
 TEST(Conv2D, SmallKernelWithSamePadding) {
   std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
       xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc
index 082a27c..222fdac 100644
--- a/tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc
+++ b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc
@@ -140,8 +140,8 @@
                                       densify_filter_outputs.size())));
   }
 
-  const std::vector<int32_t> filter_shape = {
-      OutputChannels(), KernelHeight(), KernelWidth(), KernelInputChannels()};
+  const std::vector<int32_t> filter_shape = {OutputChannels(), KernelHeight(),
+                                             KernelWidth(), InputChannels()};
   const std::vector<int32_t> bias_shape = {OutputChannels()};
   std::vector<float> filter_scales;
   std::vector<int64_t> filter_zero_points;
@@ -151,7 +151,7 @@
         CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE));
 
     std::vector<uint16_t> filter_data(OutputChannels() * KernelHeight() *
-                                      KernelWidth() * KernelInputChannels());
+                                      KernelWidth() * InputChannels());
     std::vector<uint16_t> bias_data(OutputChannels());
     for (int32_t oc = 0; oc < OutputChannels(); oc++) {
       // Use the same range of all-positive or all-negative values to generate
@@ -165,12 +165,12 @@
                                   std::min(range, 0.0f), std::max(range, 0.0f)),
                               std::ref(rng)));
       bias_data[oc] = value_rng();
-      for (int32_t ic = 0; ic < KernelInputChannels(); ic++) {
+      for (int32_t ic = 0; ic < InputChannels(); ic++) {
         for (int32_t y = 0; y < KernelHeight(); y++) {
           for (int32_t x = 0; x < KernelWidth(); x++) {
             const int32_t index =
                 ((oc * KernelHeight() + y) * KernelWidth() + x) *
-                    KernelInputChannels() +
+                    InputChannels() +
                 ic;
             filter_data[index] = value_rng();
           }
@@ -209,7 +209,7 @@
                                       dequantize_bias_outputs.size())));
   } else {
     std::vector<float> filter_data(OutputChannels() * KernelHeight() *
-                                   KernelWidth() * KernelInputChannels());
+                                   KernelWidth() * InputChannels());
     std::vector<float> bias_data(OutputChannels());
     for (int32_t oc = 0; oc < OutputChannels(); oc++) {
       // Use the same range of all-positive or all-negative values to generate
@@ -222,12 +222,12 @@
                         std::min(range, 0.0f), std::max(range, 0.0f)),
                     std::ref(rng));
       bias_data[oc] = value_rng();
-      for (int32_t ic = 0; ic < KernelInputChannels(); ic++) {
+      for (int32_t ic = 0; ic < InputChannels(); ic++) {
         for (int32_t y = 0; y < KernelHeight(); y++) {
           for (int32_t x = 0; x < KernelWidth(); x++) {
             const int32_t index =
                 ((oc * KernelHeight() + y) * KernelWidth() + x) *
-                    KernelInputChannels() +
+                    InputChannels() +
                 ic;
             filter_data[index] = value_rng();
           }
diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_tester.h b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.h
index 16144a2..a0034db 100644
--- a/tensorflow/lite/delegates/xnnpack/conv_2d_tester.h
+++ b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.h
@@ -56,19 +56,6 @@
 
   inline int32_t OutputChannels() const { return output_channels_; }
 
-  inline Conv2DTester& Groups(int32_t groups) {
-    EXPECT_EQ(InputChannels() % groups, 0);
-    EXPECT_EQ(OutputChannels() % groups, 0);
-    groups_ = groups;
-    return *this;
-  }
-
-  inline int32_t Groups() const { return groups_; }
-
-  inline int32_t KernelInputChannels() const {
-    return input_channels_ / groups_;
-  }
-
   inline Conv2DTester& InputHeight(int32_t input_height) {
     EXPECT_GT(input_height, 0);
     input_height_ = input_height;
@@ -240,7 +227,6 @@
   int32_t batch_size_ = 1;
   int32_t input_channels_ = 1;
   int32_t output_channels_ = 1;
-  int32_t groups_ = 1;
   int32_t input_height_ = 1;
   int32_t input_width_ = 1;
   int32_t kernel_height_ = 1;
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.cc b/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.cc
index 8d4faf7..1ae2119 100644
--- a/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.cc
+++ b/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.cc
@@ -138,7 +138,7 @@
       {CreateOperatorCode(builder, BuiltinOperator_CONV_2D)}};
 
   std::vector<int8_t> filter_data(OutputChannels() * KernelHeight() *
-                                  KernelWidth() * KernelInputChannels());
+                                  KernelWidth() * InputChannels());
   std::generate(filter_data.begin(), filter_data.end(), std::ref(filter_rng));
   std::vector<int32_t> bias_data(OutputChannels());
   std::generate(bias_data.begin(), bias_data.end(), std::ref(bias_rng));
@@ -160,7 +160,7 @@
   const std::array<int32_t, 4> output_shape{
       {BatchSize(), OutputHeight(), OutputWidth(), OutputChannels()}};
   const std::array<int32_t, 4> filter_shape{
-      {OutputChannels(), KernelHeight(), KernelWidth(), KernelInputChannels()}};
+      {OutputChannels(), KernelHeight(), KernelWidth(), InputChannels()}};
   const std::array<int32_t, 1> bias_shape{{OutputChannels()}};
 
   flatbuffers::Offset<flatbuffers::Vector<float>> filter_scale_offset = 0;
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h b/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h
index 8200fd2..9f6215d 100644
--- a/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h
+++ b/tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h
@@ -152,19 +152,6 @@
     return (KernelWidth() - 1) * DilationWidth() + 1;
   }
 
-  inline QuantizedConv2DTester& Groups(int32_t groups) {
-    EXPECT_EQ(InputChannels() % groups, 0);
-    EXPECT_EQ(OutputChannels() % groups, 0);
-    groups_ = groups;
-    return *this;
-  }
-
-  inline int32_t Groups() const { return groups_; }
-
-  inline int32_t KernelInputChannels() const {
-    return input_channels_ / groups_;
-  }
-
   inline QuantizedConv2DTester& InputZeroPoint(int32_t input_zero_point) {
     input_zero_point_ = input_zero_point;
     return *this;
@@ -269,7 +256,6 @@
   int32_t batch_size_ = 1;
   int32_t input_channels_ = 1;
   int32_t output_channels_ = 1;
-  int32_t groups_ = 1;
   int32_t input_height_ = 1;
   int32_t input_width_ = 1;
   int32_t kernel_height_ = 1;
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
index 20740cd..26377db 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
@@ -124,44 +124,6 @@
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
-  std::random_device random_device;
-  auto rng = std::mt19937(random_device());
-  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
-                                      std::numeric_limits<int8_t>::min(),
-                                      std::numeric_limits<int8_t>::max()),
-                                  std::ref(rng));
-  auto batch_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 4), std::ref(rng));
-  auto input_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(5, 25), std::ref(rng));
-  auto channel_per_group_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
-  auto groups_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 8), std::ref(rng));
-
-  auto groups = groups_rng();
-  QuantizedConv2DTester()
-      .InputZeroPoint(zero_point_rng())
-      .OutputZeroPoint(zero_point_rng())
-      .BatchSize(batch_rng())
-      .InputHeight(input_rng())
-      .InputWidth(input_rng())
-      .InputChannels(groups * channel_per_group_rng())
-      .OutputChannels(groups * channel_per_group_rng())
-      .Groups(groups)
-      .KernelHeight(3)
-      .KernelWidth(3)
-      .StrideHeight(2)
-      .StrideWidth(2)
-      .SamePadding()
-      .Test(xnnpack_delegate.get());
-}
-
 TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
   std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
       xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
index 19e6ea1..87d2e16 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
@@ -113,47 +113,6 @@
       std::bind(std::uniform_int_distribution<int32_t>(2, 4), std::ref(rng));
   auto input_rng =
       std::bind(std::uniform_int_distribution<int32_t>(5, 25), std::ref(rng));
-  auto channel_per_group_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
-  auto groups_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 8), std::ref(rng));
-
-  auto groups = groups_rng();
-  QuantizedConv2DTester()
-      .InputZeroPoint(zero_point_rng())
-      .OutputZeroPoint(zero_point_rng())
-      .KernelZeroPoint(kernel_zero_point_rng())
-      .BatchSize(batch_rng())
-      .InputHeight(input_rng())
-      .InputWidth(input_rng())
-      .InputChannels(groups * channel_per_group_rng())
-      .OutputChannels(groups * channel_per_group_rng())
-      .Groups(groups)
-      .KernelHeight(3)
-      .KernelWidth(3)
-      .StrideHeight(2)
-      .StrideWidth(2)
-      .SamePadding()
-      .Test(xnnpack_delegate.get());
-}
-
-TEST(UnsignedQuantizedConv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
-  std::random_device random_device;
-  auto rng = std::mt19937(random_device());
-  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
-                                      std::numeric_limits<uint8_t>::min(),
-                                      std::numeric_limits<uint8_t>::max()),
-                                  std::ref(rng));
-  auto kernel_zero_point_rng = std::bind(
-      std::uniform_int_distribution<int32_t>(100, 150), std::ref(rng));
-  auto batch_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(2, 4), std::ref(rng));
-  auto input_rng =
-      std::bind(std::uniform_int_distribution<int32_t>(5, 25), std::ref(rng));
   auto channel_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
 
diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
index ef80445..c8abc64 100644
--- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
+++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
@@ -2321,7 +2321,6 @@
     const int kernel_height = SizeOfDimension(&filter_tensor, 1);
     const int kernel_width = SizeOfDimension(&filter_tensor, 2);
     const int input_channels = SizeOfDimension(&filter_tensor, 3);
-    const int groups = SizeOfDimension(&input_tensor, 3) / input_channels;
 
     uint32_t flags;
     TF_LITE_ENSURE_STATUS(CalculatePadding(
@@ -2344,9 +2343,9 @@
           static_cast<uint32_t>(conv_params->stride_height),
           static_cast<uint32_t>(conv_params->stride_width),
           static_cast<uint32_t>(conv_params->dilation_height_factor),
-          static_cast<uint32_t>(conv_params->dilation_width_factor), groups,
-          static_cast<size_t>(input_channels),
-          static_cast<size_t>(output_channels) / groups, output_min, output_max,
+          static_cast<uint32_t>(conv_params->dilation_width_factor),
+          /*groups=*/1, static_cast<size_t>(input_channels),
+          static_cast<size_t>(output_channels), output_min, output_max,
           /*input_id=*/xnnpack_tensors[node->inputs->data[0]],
           /*filter_id=*/xnnpack_tensors[node->inputs->data[1]],
           /*bias_id=*/xnnpack_tensors[node->inputs->data[2]],