[XNNPACK] Support quantized SUB operator

PiperOrigin-RevId: 395516269
Change-Id: I3e2881d5f50a4db9239dd0af676df9c91d71974c
diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 0c3272f..81a528f 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -1037,6 +1037,21 @@
 )
 
 cc_test(
+    name = "signed_quantized_sub_test",  # builds the SignedQuantizedSub test cases
+    srcs = ["signed_quantized_sub_test.cc"],
+    linkopts = select({  # extra link options only for the emscripten config
+        "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":quantized_binary_elementwise_tester",
+        ":test_main",
+        ":xnnpack_delegate_test_mode",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
     name = "softmax_test",
     srcs = ["softmax_test.cc"],
     linkopts = select({
@@ -1216,4 +1231,19 @@
     ],
 )
 
+cc_test(
+    name = "unsigned_quantized_sub_test",  # unsigned counterpart of signed_quantized_sub_test
+    srcs = ["unsigned_quantized_sub_test.cc"],
+    linkopts = select({  # extra link options only for the emscripten config
+        "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":quantized_binary_elementwise_tester",
+        ":test_main",
+        ":xnnpack_delegate_test_mode",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tflite_portable_test_suite_combined(combine_conditions = {"deps": [":test_main"]})
diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md
index 184ec3f..8ae8a08 100644
--- a/tensorflow/lite/delegates/xnnpack/README.md
+++ b/tensorflow/lite/delegates/xnnpack/README.md
@@ -399,6 +399,13 @@
   (use `kTfLiteMmapRo` allocation type).
 * The numbers of padding elements must be non-negative.
 
+#### `SUB`
+
+* Inputs and outputs must be in 8-bit quantized format.
+* Only subtraction with two inputs is supported.
+* Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
+  but fused `TANH` and `SIGN_BIT` activations are not.
+
 ### Sparse Inference
 
 XNNPACK backend supports sparse inference for CNN models described in the
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_sub_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_sub_test.cc
new file mode 100644
index 0000000..c2ef856
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_sub_test.cc
@@ -0,0 +1,1067 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/quantized_binary_elementwise_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite {
+namespace xnnpack {
+
+TEST(SignedQuantizedSub, 4DBy4D) {  // SUB on two 4D inputs of equal shape
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy4DBroadcastChannels) {  // one input is {1, 1, 1, channels}
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, 1, 1, channels})  // size 1 in batch, height, width
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, 1, channels})  // size 1 in batch, height, width
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy4DBroadcastWidth) {  // one input is {1, 1, width, 1}
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, 1, width, 1})  // size 1 in batch, height, channels
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, width, 1})  // size 1 in batch, height, channels
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy4DBroadcastHeight) {  // one input is {1, height, 1, 1}
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, height, 1, 1})  // size 1 in batch, width, channels
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, 1, 1})  // size 1 in batch, width, channels
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy4DBroadcastBatch) {  // one input is {batch, 1, 1, 1}
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, 1, 1, 1})  // size 1 in height, width, channels
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, 1, 1, 1})  // size 1 in height, width, channels
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy4DBroadcastHeightWidthChannels) {  // one input has batch size 1
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, height, width, channels})  // size 1 in batch only
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, width, channels})  // size 1 in batch only
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy3D) {  // 4D input paired with a 3D input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({height, width, channels})  // rank 3: no batch dimension
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({height, width, channels})  // rank 3: no batch dimension
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy2D) {  // 4D input paired with a 2D input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({width, channels})  // rank 2: trailing two dimensions only
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({width, channels})  // rank 2: trailing two dimensions only
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy1D) {  // 4D input paired with a 1D input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({channels})  // rank 1: channels only
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({channels})  // rank 1: channels only
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DBy0D) {  // 4D input paired with an empty-shape input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({})  // empty dimension list (the 0D operand)
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({})  // empty dimension list (the 0D operand)
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 2DBy2D) {  // SUB on two 2D inputs of equal shape
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, channels})
+      .Input2Shape({batch, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 2DBy1D) {  // 2D input paired with a 1D input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({channels})  // rank 1: channels only
+      .Input2Shape({batch, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, channels})
+      .Input2Shape({channels})  // rank 1: channels only
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 2DBy0D) {  // 2D input paired with an empty-shape input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({})  // empty dimension list (the 0D operand)
+      .Input2Shape({batch, channels})
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, channels})
+      .Input2Shape({})  // empty dimension list (the 0D operand)
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic4D) {  // equal 4D shapes, one input flagged static
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same shapes, static flag moved to input 2
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input2Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic4DBroadcastChannels) {  // static {1, 1, 1, channels} input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, 1, 1, channels})  // size 1 in batch, height, width
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, 1, channels})  // size 1 in batch, height, width
+      .Input2Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic4DBroadcastWidth) {  // static {1, 1, width, 1} input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, 1, width, 1})  // size 1 in batch, height, channels
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, width, 1})  // size 1 in batch, height, channels
+      .Input2Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic4DBroadcastHeight) {  // static {1, height, 1, 1} input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, height, 1, 1})  // size 1 in batch, width, channels
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, 1, 1})  // size 1 in batch, width, channels
+      .Input2Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic4DBroadcastBatch) {  // static {batch, 1, 1, 1} input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, 1, 1, 1})  // size 1 in height, width, channels
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, 1, 1, 1})  // size 1 in height, width, channels
+      .Input2Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic4DBroadcastHeightWidthChannels) {  // static input has batch size 1
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({1, height, width, channels})  // size 1 in batch only
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, width, channels})  // size 1 in batch only
+      .Input2Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic3D) {  // 4D input paired with a static 3D input
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);  // deleter runs at scope exit
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());  // seeded from std::random_device
+  auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
+                                      std::numeric_limits<int8_t>::min(),
+                                      std::numeric_limits<int8_t>::max()),
+                                  std::ref(rng));  // spans the int8_t range
+  auto shape_rng =  // every dimension size is drawn from [2, 5]
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({height, width, channels})  // rank 3: no batch dimension
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+
+  QuantizedBinaryElementwiseTester()  // same case with the operands swapped
+      .Input1ZeroPoint(zero_point_rng())
+      .Input2ZeroPoint(zero_point_rng())
+      .OutputZeroPoint(zero_point_rng())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({height, width, channels})  // rank 3: no batch dimension
+      .Input2Static(true)  // NOTE(review): presumably a constant input - confirm
+      .Test(BuiltinOperator_SUB, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic2D) {
+  // Runs SUB on a {width, channels} operand flagged static against a
+  // {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({width, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic1D) {
+  // Runs SUB on a {channels} operand flagged static against a
+  // {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, 4DByStatic0D) {
+  // Runs SUB on an empty-shape ({}) operand flagged static against a
+  // {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, 2DByStatic2D) {
+  // Runs SUB on two {batch, channels} operands, flagging input 1 as static in
+  // the first run and input 2 as static in the second.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, channels})
+      .Input2Shape({batch, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, channels})
+      .Input2Shape({batch, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, 2DByStatic1D) {
+  // Runs SUB on a {channels} operand flagged static against a
+  // {batch, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({channels})
+      .Input2Shape({batch, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, channels})
+      .Input2Shape({channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, 2DByStatic0D) {
+  // Runs SUB on an empty-shape ({}) operand flagged static against a
+  // {batch, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({})
+      .Input2Shape({batch, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, channels})
+      .Input2Shape({})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, ReluActivation) {
+  // Runs SUB on two {batch, height, width, channels} operands with
+  // ReluActivation() requested on the tester.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  // Cap the output zero point below the int8_t maximum to avoid the degenerate
+  // case output_min == output_max == std::numeric_limits<int8_t>::max().
+  std::uniform_int_distribution<int32_t> output_zero_point_dist(
+      std::numeric_limits<int8_t>::min(),
+      std::numeric_limits<int8_t>::max() - 1);
+  const auto next_output_zero_point =
+      [&] { return output_zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_output_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .ReluActivation()
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, Relu6Activation) {
+  // Runs SUB on two {batch, height, width, channels} operands with
+  // Relu6Activation() requested on the tester.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  // Cap the output zero point below the int8_t maximum to avoid the degenerate
+  // case output_min == output_max == std::numeric_limits<int8_t>::max().
+  std::uniform_int_distribution<int32_t> output_zero_point_dist(
+      std::numeric_limits<int8_t>::min(),
+      std::numeric_limits<int8_t>::max() - 1);
+  const auto next_output_zero_point =
+      [&] { return output_zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_output_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Relu6Activation()
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// TODO(b/195554527): Re-enable this test
+TEST(SignedQuantizedSub, DISABLED_ReluMinus1To1Activation) {
+  // Runs SUB on two {batch, height, width, channels} operands with
+  // ReluMinus1To1Activation() requested on the tester.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .ReluMinus1To1Activation()
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(SignedQuantizedSub, MultiThreading) {
+  // Runs SUB on two {batch, height, width, channels} operands through a
+  // delegate created with num_threads set to 2.
+  TfLiteXNNPackDelegateOptions options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  options.num_threads = 2;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(&options),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_sub_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_sub_test.cc
new file mode 100644
index 0000000..8485d1b
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_sub_test.cc
@@ -0,0 +1,1121 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/quantized_binary_elementwise_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite {
+namespace xnnpack {
+
+TEST(UnsignedQuantizedSub, 4DBy4D) {
+  // Runs SUB on two {batch, height, width, channels} operands with
+  // Unsigned(true) set on the tester.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy4DBroadcastChannels) {
+  // With Unsigned(true) set, runs SUB on a {1, 1, 1, channels} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({1, 1, 1, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, 1, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy4DBroadcastWidth) {
+  // With Unsigned(true) set, runs SUB on a {1, 1, width, 1} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({1, 1, width, 1})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, width, 1})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy4DBroadcastHeight) {
+  // With Unsigned(true) set, runs SUB on a {1, height, 1, 1} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({1, height, 1, 1})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, 1, 1})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy4DBroadcastBatch) {
+  // With Unsigned(true) set, runs SUB on a {batch, 1, 1, 1} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, 1, 1, 1})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, 1, 1, 1})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy4DBroadcastHeightWidthChannels) {
+  // With Unsigned(true) set, runs SUB on a {1, height, width, channels} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({1, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy3D) {
+  // With Unsigned(true) set, runs SUB on a {height, width, channels} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy2D) {
+  // With Unsigned(true) set, runs SUB on a {width, channels} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy1D) {
+  // With Unsigned(true) set, runs SUB on a {channels} operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 4DBy0D) {
+  // With Unsigned(true) set, runs SUB on an empty-shape ({}) operand
+  // against a {batch, height, width, channels} operand, in both orders.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto height = next_dim();
+  const auto width = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+TEST(UnsignedQuantizedSub, 2DBy2D) {
+  // Runs SUB on two {batch, channels} operands with Unsigned(true) set on
+  // the tester.
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  const auto next_zero_point = [&] { return zero_point_dist(rng); };
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const auto next_dim = [&] { return dim_dist(rng); };
+  const auto batch = next_dim();
+  const auto channels = next_dim();
+
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(next_zero_point())
+      .Input2ZeroPoint(next_zero_point())
+      .OutputZeroPoint(next_zero_point())
+      .Input1Shape({batch, channels})
+      .Input2Shape({batch, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 1D and a 2D tensor, in both operand orders.
+TEST(UnsignedQuantizedSub, 2DBy1D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Input 1 is the 1D {channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({channels})
+      .Input2Shape({batch, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Input 2 is the 1D {channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, channels})
+      .Input2Shape({channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 0D and a 2D tensor, in both operand orders.
+TEST(UnsignedQuantizedSub, 2DBy0D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Input 1 has the empty (0D) shape.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({})
+      .Input2Shape({batch, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Input 2 has the empty (0D) shape.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, channels})
+      .Input2Shape({})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of same-shape 4D tensors, one of them marked static.
+TEST(UnsignedQuantizedSub, 4DByStatic4D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Input 1 is marked static.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Input 2 is marked static.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static {1, 1, 1, channels} one.
+TEST(UnsignedQuantizedSub, 4DByStatic4DBroadcastChannels) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 has extent 1 in every dimension except channels.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({1, 1, 1, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 has extent 1 in every dimension except channels.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, 1, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static {1, 1, width, 1} one.
+TEST(UnsignedQuantizedSub, 4DByStatic4DBroadcastWidth) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 has extent 1 in every dimension except width.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({1, 1, width, 1})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 has extent 1 in every dimension except width.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, 1, width, 1})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static {1, height, 1, 1} one.
+TEST(UnsignedQuantizedSub, 4DByStatic4DBroadcastHeight) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 has extent 1 in every dimension except height.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({1, height, 1, 1})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 has extent 1 in every dimension except height.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, 1, 1})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static {batch, 1, 1, 1} one.
+TEST(UnsignedQuantizedSub, 4DByStatic4DBroadcastBatch) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 has extent 1 in every dimension except batch.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, 1, 1, 1})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 has extent 1 in every dimension except batch.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, 1, 1, 1})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static 4D tensor of batch 1.
+TEST(UnsignedQuantizedSub, 4DByStatic4DBroadcastHeightWidthChannels) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 is {1, height, width, channels}.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({1, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 is {1, height, width, channels}.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({1, height, width, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static 3D tensor.
+TEST(UnsignedQuantizedSub, 4DByStatic3D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 is the 3D {height, width, channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 is the 3D {height, width, channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({height, width, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static 2D tensor.
+TEST(UnsignedQuantizedSub, 4DByStatic2D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 is the 2D {width, channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 is the 2D {width, channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({width, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static 1D tensor.
+TEST(UnsignedQuantizedSub, 4DByStatic1D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 is the 1D {channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({channels})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 is the 1D {channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 4D tensor and a static 0D tensor.
+TEST(UnsignedQuantizedSub, 4DByStatic0D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 has the empty (0D) shape.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({})
+      .Input2Shape({batch, height, width, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 has the empty (0D) shape.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of same-shape 2D tensors, one of them marked static.
+TEST(UnsignedQuantizedSub, 2DByStatic2D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Input 1 is marked static.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, channels})
+      .Input2Shape({batch, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Input 2 is marked static.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, channels})
+      .Input2Shape({batch, channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 2D tensor and a static 1D {channels} tensor.
+TEST(UnsignedQuantizedSub, 2DByStatic1D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 is the 1D {channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({channels})
+      .Input2Shape({batch, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 is the 1D {channels} tensor.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, channels})
+      .Input2Shape({channels})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of a 2D tensor and a static 0D tensor.
+TEST(UnsignedQuantizedSub, 2DByStatic0D) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Static input 1 has the empty (0D) shape.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({})
+      .Input2Shape({batch, channels})
+      .Input1Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+
+  // Static input 2 has the empty (0D) shape.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, channels})
+      .Input2Shape({})
+      .Input2Static(true)
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of same-shape 4D tensors with the activation
+// selected through ReluActivation().
+TEST(UnsignedQuantizedSub, ReluActivation) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  // Avoid the degenerate situation where
+  // output_min == output_max == std::numeric_limits<uint8_t>::max()
+  // by drawing the output zero point from [0, 254] instead of [0, 255].
+  std::uniform_int_distribution<int32_t> output_zero_point_dist(
+      std::numeric_limits<uint8_t>::min(),
+      std::numeric_limits<uint8_t>::max() - 1);
+  // Every tensor dimension is drawn from [2, 5].
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Only the output zero point uses the restricted distribution.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(output_zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .ReluActivation()
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of same-shape 4D tensors with the activation
+// selected through Relu6Activation().
+TEST(UnsignedQuantizedSub, Relu6Activation) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  // Avoid the degenerate situation where
+  // output_min == output_max == std::numeric_limits<uint8_t>::max()
+  // by drawing the output zero point from [0, 254] instead of [0, 255].
+  std::uniform_int_distribution<int32_t> output_zero_point_dist(
+      std::numeric_limits<uint8_t>::min(),
+      std::numeric_limits<uint8_t>::max() - 1);
+  // Every tensor dimension is drawn from [2, 5].
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Only the output zero point uses the restricted distribution.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(output_zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Relu6Activation()
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// TODO(b/195554527): Re-enable this test
+// Covers SUB with ReluMinus1To1Activation(); the DISABLED_ name prefix keeps
+// googletest from running it by default.
+TEST(UnsignedQuantizedSub, DISABLED_ReluMinus1To1Activation) {
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(nullptr),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Both operands use the same 4D shape; all zero points span [0, 255].
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .ReluMinus1To1Activation()
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+// Unsigned-quantized SUB of same-shape 4D tensors, run through a delegate
+// created with num_threads set to 2.
+TEST(UnsignedQuantizedSub, MultiThreading) {
+  // Start from the default options and change only the thread count.
+  TfLiteXNNPackDelegateOptions options = TfLiteXNNPackDelegateOptionsDefault();
+  options.num_threads = 2;
+  const std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      delegate(TfLiteXNNPackDelegateCreate(&options),
+               TfLiteXNNPackDelegateDelete);
+
+  // Seeded from std::random_device, so drawn values may differ between runs.
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int32_t> zero_point_dist(
+      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+  std::uniform_int_distribution<int32_t> dim_dist(2, 5);
+  const int32_t batch = dim_dist(rng);
+  const int32_t height = dim_dist(rng);
+  const int32_t width = dim_dist(rng);
+  const int32_t channels = dim_dist(rng);
+
+  // Both operands use the same {batch, height, width, channels} shape.
+  QuantizedBinaryElementwiseTester()
+      .Unsigned(true)
+      .Input1ZeroPoint(zero_point_dist(rng))
+      .Input2ZeroPoint(zero_point_dist(rng))
+      .OutputZeroPoint(zero_point_dist(rng))
+      .Input1Shape({batch, height, width, channels})
+      .Input2Shape({batch, height, width, channels})
+      .Test(BuiltinOperator_SUB, delegate.get());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
index 540a53e..87dddcf 100644
--- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
+++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
@@ -3391,19 +3391,19 @@
         CheckNumInputsAndOutputs(logging_context, node, 2, 1, node_index));
 
     const TfLiteTensor& input1_tensor = tensors[node->inputs->data[0]];
-    TF_LITE_ENSURE_STATUS(CheckTensorFloat32Type(
+    TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type(
         logging_context, input1_tensor, node->inputs->data[0], node_index));
     TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation(
         logging_context, input1_tensor, node->inputs->data[0], node_index));
 
     const TfLiteTensor& input2_tensor = tensors[node->inputs->data[1]];
-    TF_LITE_ENSURE_STATUS(CheckTensorFloat32Type(
+    TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type(
         logging_context, input2_tensor, node->inputs->data[1], node_index));
     TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation(
         logging_context, input2_tensor, node->inputs->data[1], node_index));
 
     const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]];
-    TF_LITE_ENSURE_STATUS(CheckTensorFloat32Type(
+    TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type(
         logging_context, output_tensor, node->outputs->data[0], node_index));
     TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation(
         logging_context, output_tensor, node->outputs->data[0], node_index));
diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index c602d60..9205256 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG 0f6613555829d59cbc165f1be87bdcd5137e23d2
+  GIT_TAG dfe763f462d3569323de6caa085d8b06ce38eb7b
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index ecab631..6be1218 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -135,11 +135,11 @@
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "f3b3256b6dcde8002159df50380b86087ae9ee927464b4179a22028be8a5ac20",
-        strip_prefix = "XNNPACK-0f6613555829d59cbc165f1be87bdcd5137e23d2",
+        sha256 = "a9ad81f50c3bc3b1795403012f20f31fd3171d5eef7e98f3287cc6e950405c94",
+        strip_prefix = "XNNPACK-dfe763f462d3569323de6caa085d8b06ce38eb7b",
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/0f6613555829d59cbc165f1be87bdcd5137e23d2.zip",
-            "https://github.com/google/XNNPACK/archive/0f6613555829d59cbc165f1be87bdcd5137e23d2.zip",
+            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/dfe763f462d3569323de6caa085d8b06ce38eb7b.zip",
+            "https://github.com/google/XNNPACK/archive/dfe763f462d3569323de6caa085d8b06ce38eb7b.zip",
         ],
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)