Support 2-input concatenation nodes using existing copy operators.

A 2-input concatenation is implemented as two strided copy operators, one per input, each writing into the shared output buffer at a different offset. xnn_operator_data now holds an array of operator objects (operator_objects) instead of a single xnn_operator_t, since a node may now create and run two operators; runtime setup, invocation, and deletion iterate over all XNN_MAX_OPERATOR_OBJECTS entries.
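
For intuition, a minimal plain-C sketch of the decomposition (illustrative only;
the node actually creates xnn_create_copy_nc_xN operators whose channels and
output-stride parameters encode the same arithmetic, and the helper name and
float element type below are hypothetical):

  #include <stddef.h>
  #include <string.h>

  // Concatenate A (batch x c1) and B (batch x c2) along the last axis into
  // an output whose rows have stride c1 + c2.
  static void concat2_rows(const float* a, const float* b, float* out,
                           size_t batch, size_t c1, size_t c2) {
    const size_t stride = c1 + c2;
    for (size_t i = 0; i < batch; i++) {
      memcpy(out + i * stride, a + i * c1, c1 * sizeof(float));       // first copy operator
      memcpy(out + i * stride + c1, b + i * c2, c2 * sizeof(float));  // second copy operator
    }
  }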

PiperOrigin-RevId: 431550347
diff --git a/BUILD.bazel b/BUILD.bazel
index 7ca704e..e8662fe 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -84,6 +84,7 @@
     "src/subgraph/bankers-rounding.c",
     "src/subgraph/ceiling.c",
     "src/subgraph/clamp.c",
+    "src/subgraph/concatenate2.c",
     "src/subgraph/convert.c",
     "src/subgraph/convolution-2d.c",
     "src/subgraph/deconvolution-2d.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7fee0bd..a392084 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -199,6 +199,7 @@
   src/subgraph/bankers-rounding.c
   src/subgraph/ceiling.c
   src/subgraph/clamp.c
+  src/subgraph/concatenate2.c
   src/subgraph/convert.c
   src/subgraph/convolution-2d.c
   src/subgraph/deconvolution-2d.c
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 9f9898b..ddff0dc 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -906,6 +906,30 @@
   uint32_t output_id,
   uint32_t flags);
 
+/// Define a 2-Input Concatenate Node and add it to a Subgraph.
+///
+/// The 2-Input Concatenate Node concatenates two tensors along a specified axis.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param axis - the axis to concatenate the two input tensors along.
+/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
+///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
+///                    second input.
+/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
+///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
+///                    first input.
+/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
+///                    in the @a subgraph with each dimension equal to the dimension of both inputs, except the axis
+///                    dimension, where it is the sum of the corresponding dimensions of both inputs.
+/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
+enum xnn_status xnn_define_concatenate2(
+  xnn_subgraph_t subgraph,
+  size_t axis,
+  uint32_t input1_id,
+  uint32_t input2_id,
+  uint32_t output_id,
+  uint32_t flags);
+
 /// Define a Reshape Node with static shape specification and add it to a Subgraph.
 ///
 /// @param subgraph - a Subgraph object that will own the created Node.
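
A minimal usage sketch of the new API (the value IDs below are hypothetical and
assumed to come from earlier xnn_define_tensor_value calls on this subgraph;
error handling elided):

  // Hypothetical example: concatenate a {2, 3} and a {2, 5} f32 tensor along
  // axis 1 into a {2, 8} output.
  enum xnn_status status = xnn_define_concatenate2(
    subgraph,
    /*axis=*/1,
    input1_id,   // dense f32 tensor, shape {2, 3}
    input2_id,   // dense f32 tensor, shape {2, 5}
    output_id,   // dense f32 tensor, shape {2, 8}
    /*flags=*/0);
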
diff --git a/src/runtime.c b/src/runtime.c
index 6e3b9d1..5324072 100644
--- a/src/runtime.c
+++ b/src/runtime.c
@@ -201,7 +201,7 @@
 
   for (size_t i = 0; i < runtime->num_ops; i++) {
     const struct xnn_operator_data* opdata = &runtime->opdata[i];
-    if (opdata->operator_object == NULL) {
+    if (opdata->operator_objects[0] == NULL) {
       // Operator was removed during optimization
       continue;
     }
@@ -221,15 +221,18 @@
   xnn_runtime_t runtime)
 {
   for (size_t i = 0; i < runtime->num_ops; i++) {
-    if (runtime->opdata[i].operator_object == NULL) {
-      // Operator was removed after fusion
-      continue;
+    for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
+      if (runtime->opdata[i].operator_objects[j] == NULL) {
+        // Operator was removed after fusion
+        continue;
+      }
+
+      const enum xnn_status status = xnn_run_operator(runtime->opdata[i].operator_objects[j], runtime->threadpool);
+      if (status != xnn_status_success) {
+        return status;
+      }
     }
 
-    const enum xnn_status status = xnn_run_operator(runtime->opdata[i].operator_object, runtime->threadpool);
-    if (status != xnn_status_success) {
-      return status;
-    }
   }
   return xnn_status_success;
 }
@@ -240,7 +243,9 @@
   if (runtime != NULL) {
     if (runtime->opdata != NULL) {
       for (size_t i = 0; i < runtime->num_ops; i++) {
-        xnn_delete_operator(runtime->opdata[i].operator_object);
+        for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
+          xnn_delete_operator(runtime->opdata[i].operator_objects[j]);
+        }
       }
       xnn_release_memory(runtime->opdata);
 
diff --git a/src/subgraph-strings.c b/src/subgraph-strings.c
index 12916eb..e123340 100644
--- a/src/subgraph-strings.c
+++ b/src/subgraph-strings.c
@@ -32,6 +32,8 @@
       return "Ceiling";
     case xnn_node_type_clamp:
       return "Clamp";
+    case xnn_node_type_concatenate2:
+      return "Concatenate2";
     case xnn_node_type_convert:
       return "Convert";
     case xnn_node_type_convolution_2d:
diff --git a/src/subgraph/abs.c b/src/subgraph/abs.c
index 377a240..7a2a939 100644
--- a/src/subgraph/abs.c
+++ b/src/subgraph/abs.c
@@ -39,7 +39,7 @@
   const enum xnn_status status = xnn_create_abs_nc_f32(
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
     opdata->inputs[0] = input_id;
@@ -71,7 +71,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_abs_nc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     input_data,
     output_data,
diff --git a/src/subgraph/add2.c b/src/subgraph/add2.c
index 8cad5be..4aefd96 100644
--- a/src/subgraph/add2.c
+++ b/src/subgraph/add2.c
@@ -42,7 +42,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_compute_type_fp32:
@@ -50,7 +50,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -68,7 +68,7 @@
         values[input2_id].quantization.scale,
         (int8_t) output_zero_point,
         output_scale, output_min, output_max, node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -88,7 +88,7 @@
         values[input2_id].quantization.scale,
         (uint8_t) output_zero_point,
         output_scale, output_min, output_max, node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -155,10 +155,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_add_nd_f32:
       return xnn_setup_add_nd_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -168,7 +168,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_add_nd_f16:
       return xnn_setup_add_nd_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -179,7 +179,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_add_nd_qs8:
       return xnn_setup_add_nd_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -190,7 +190,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_add_nd_qu8:
       return xnn_setup_add_nd_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
diff --git a/src/subgraph/argmax-pooling-2d.c b/src/subgraph/argmax-pooling-2d.c
index 1d8d2b2..28fd3ae 100644
--- a/src/subgraph/argmax-pooling-2d.c
+++ b/src/subgraph/argmax-pooling-2d.c
@@ -48,7 +48,7 @@
     node->params.pooling_2d.pooling_width,
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = values[input_id].shape.dim[0];
     opdata->input_height = values[input_id].shape.dim[1];
@@ -91,7 +91,7 @@
   assert(output_index_data != NULL);
 
   return xnn_setup_argmax_pooling2d_nhwc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     opdata->input_height,
     opdata->input_width,
diff --git a/src/subgraph/average-pooling-2d.c b/src/subgraph/average-pooling-2d.c
index 24dd46b..160af85 100644
--- a/src/subgraph/average-pooling-2d.c
+++ b/src/subgraph/average-pooling-2d.c
@@ -48,7 +48,7 @@
     node->activation.output_min,
     node->activation.output_max,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = values[input_id].shape.dim[0];
     opdata->input_height = values[input_id].shape.dim[1];
@@ -82,7 +82,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_average_pooling2d_nhwc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     opdata->input_height,
     opdata->input_width,
diff --git a/src/subgraph/bankers-rounding.c b/src/subgraph/bankers-rounding.c
index 9341de5..f2b16d6 100644
--- a/src/subgraph/bankers-rounding.c
+++ b/src/subgraph/bankers-rounding.c
@@ -39,7 +39,7 @@
   const enum xnn_status status = xnn_create_bankers_rounding_nc_f32(
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
     opdata->inputs[0] = input_id;
@@ -71,7 +71,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_bankers_rounding_nc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     input_data,
     output_data,
diff --git a/src/subgraph/ceiling.c b/src/subgraph/ceiling.c
index 3a3d49c..b69eeae 100644
--- a/src/subgraph/ceiling.c
+++ b/src/subgraph/ceiling.c
@@ -38,7 +38,7 @@
   const enum xnn_status status = xnn_create_ceiling_nc_f32(
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
     opdata->inputs[0] = input_id;
@@ -70,7 +70,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_ceiling_nc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     input_data,
     output_data,
diff --git a/src/subgraph/clamp.c b/src/subgraph/clamp.c
index 357654c..6a45135 100644
--- a/src/subgraph/clamp.c
+++ b/src/subgraph/clamp.c
@@ -42,7 +42,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // XNN_NO_F16_OPERATORS
     case xnn_compute_type_fp32:
@@ -51,7 +51,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_S8_OPERATORS
     case xnn_compute_type_qs8:
@@ -67,7 +67,7 @@
         output_min,
         output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_S8_OPERATORS)
@@ -85,7 +85,7 @@
         output_min,
         output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_U8_OPERATORS)
@@ -122,11 +122,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_clamp_nc_f16:
       return xnn_setup_clamp_nc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -134,7 +134,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_clamp_nc_f32:
       return xnn_setup_clamp_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -142,7 +142,7 @@
 #ifndef XNN_NO_S8_OPERATORS
     case xnn_operator_type_clamp_nc_s8:
       return xnn_setup_clamp_nc_s8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -151,7 +151,7 @@
 #ifndef XNN_NO_U8_OPERATORS
     case xnn_operator_type_clamp_nc_u8:
       return xnn_setup_clamp_nc_u8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/concatenate2.c b/src/subgraph/concatenate2.c
new file mode 100644
index 0000000..69d8ab4
--- /dev/null
+++ b/src/subgraph/concatenate2.c
@@ -0,0 +1,456 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+
+#include <xnnpack.h>
+#include <xnnpack/log.h>
+#include <xnnpack/params.h>
+#include <xnnpack/subgraph.h>
+
+static enum xnn_status create_concatenate_operator(
+  const struct xnn_node* node,
+  const struct xnn_value* values,
+  size_t num_values,
+  struct xnn_operator_data* opdata,
+  struct xnn_code_cache* code_cache)
+{
+  assert(node->num_inputs == 2);
+  const uint32_t input1_id = node->inputs[0];
+  assert(input1_id != XNN_INVALID_VALUE_ID);
+  assert(input1_id < num_values);
+  const uint32_t input2_id = node->inputs[1];
+  assert(input2_id != XNN_INVALID_VALUE_ID);
+  assert(input2_id < num_values);
+
+  assert(node->num_outputs == 1);
+  const uint32_t output_id = node->outputs[0];
+  assert(output_id != XNN_INVALID_VALUE_ID);
+  assert(output_id < num_values);
+
+  const size_t axis = node->params.concatenate.axis;
+  size_t batch_size = 1, channels_1 = 1, channels_2 = 1;
+  for (size_t i = 0; i < axis; i++) {
+    batch_size *= values[input1_id].shape.dim[i];
+  }
+  for (size_t i = axis; i < values[input1_id].shape.num_dims; i++) {
+    channels_1 *= values[input1_id].shape.dim[i];
+    channels_2 *= values[input2_id].shape.dim[i];
+  }
+  const size_t output_stride = channels_1 + channels_2;
+
+  enum xnn_status status;
+  switch (node->compute_type) {
+#ifndef XNN_NO_F16_OPERATORS
+    case xnn_compute_type_fp16:
+    {
+      status = xnn_create_copy_nc_x16(channels_1, channels_1, output_stride, node->flags, &opdata->operator_objects[0]);
+      if (status != xnn_status_success) {
+        break;
+      }
+      status = xnn_create_copy_nc_x16(channels_2, channels_2, output_stride, node->flags, &opdata->operator_objects[1]);
+      break;
+    }
+#endif  // !defined(XNN_NO_F16_OPERATORS)
+    case xnn_compute_type_fp32:
+    {
+      status = xnn_create_copy_nc_x32(channels_1, channels_1, output_stride, node->flags, &opdata->operator_objects[0]);
+      if (status != xnn_status_success) {
+        break;
+      }
+      status = xnn_create_copy_nc_x32(channels_2, channels_2, output_stride, node->flags, &opdata->operator_objects[1]);
+      break;
+    }
+#ifndef XNN_NO_QS8_OPERATORS
+    case xnn_compute_type_qs8:
+#endif  // !defined(XNN_NO_QS8_OPERATORS)
+#ifndef XNN_NO_QU8_OPERATORS
+    case xnn_compute_type_qu8:
+#endif  // !defined(XNN_NO_QU8_OPERATORS)
+#if !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
+    {
+      status = xnn_create_copy_nc_x8(channels_1, channels_1, output_stride, node->flags, &opdata->operator_objects[0]);
+      if (status != xnn_status_success) {
+        break;
+      }
+      status = xnn_create_copy_nc_x8(channels_2, channels_2, output_stride, node->flags, &opdata->operator_objects[1]);
+      break;
+    }
+#endif  // !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
+    default:
+      XNN_UNREACHABLE;
+  }
+
+  if (status == xnn_status_success) {
+    opdata->inputs[0] = input1_id;
+    opdata->inputs[1] = input2_id;
+    opdata->outputs[0] = output_id;
+    opdata->batch_size = batch_size;
+  }
+
+  return status;
+}
+
+static enum xnn_status setup_concatenate_operator(
+  const struct xnn_operator_data* opdata,
+  const struct xnn_blob* blobs,
+  size_t num_blobs,
+  pthreadpool_t threadpool)
+{
+  const uint32_t input1_id = opdata->inputs[0];
+  assert(input1_id != XNN_INVALID_VALUE_ID);
+  assert(input1_id < num_blobs);
+
+  const uint32_t input2_id = opdata->inputs[1];
+  assert(input2_id != XNN_INVALID_VALUE_ID);
+  assert(input2_id < num_blobs);
+
+  const uint32_t output_id = opdata->outputs[0];
+  assert(output_id != XNN_INVALID_VALUE_ID);
+  assert(output_id < num_blobs);
+
+  const struct xnn_blob* input1_blob = blobs + input1_id;
+  const void* input1_data = input1_blob->data;
+  assert(input1_data != NULL);
+
+  const struct xnn_blob* input2_blob = blobs + input2_id;
+  const void* input2_data = input2_blob->data;
+  assert(input2_data != NULL);
+
+  const struct xnn_blob* output_blob = blobs + output_id;
+  void* output_data = output_blob->data;
+  assert(output_data != NULL);
+
+  enum xnn_status status;
+  size_t channels = opdata->operator_objects[0]->channels;
+
+  switch (opdata->operator_objects[0]->type) {
+#ifndef XNN_NO_F16_OPERATORS
+    case xnn_operator_type_copy_nc_x16: {
+      status = xnn_setup_copy_nc_x16(
+          opdata->operator_objects[0],
+          opdata->batch_size,
+          input1_data,
+          output_data,
+          threadpool);
+      if (status != xnn_status_success) {
+        return status;
+      }
+      status = xnn_setup_copy_nc_x16(
+          opdata->operator_objects[1],
+          opdata->batch_size,
+          input2_data,
+          (uint16_t*) output_data + channels,
+          threadpool);
+      return status;
+    }
+#endif  // !defined(XNN_NO_F16_OPERATORS)
+    case xnn_operator_type_copy_nc_x32: {
+      status = xnn_setup_copy_nc_x32(
+          opdata->operator_objects[0],
+          opdata->batch_size,
+          input1_data,
+          output_data,
+          threadpool);
+      if (status != xnn_status_success) {
+        return status;
+      }
+      status = xnn_setup_copy_nc_x32(
+          opdata->operator_objects[1],
+          opdata->batch_size,
+          input2_data,
+          (uint32_t*) output_data + channels,
+          threadpool);
+      return status;
+    }
+#if !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
+    case xnn_operator_type_copy_nc_x8: {
+      status = xnn_setup_copy_nc_x8(
+          opdata->operator_objects[0],
+          opdata->batch_size,
+          input1_data,
+          output_data,
+          threadpool);
+      if (status != xnn_status_success) {
+        return status;
+      }
+      status = xnn_setup_copy_nc_x8(
+          opdata->operator_objects[1],
+          opdata->batch_size,
+          input2_data,
+          (uint8_t*) output_data + channels,
+          threadpool);
+      return status;
+    }
+#endif  // !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
+    default:
+      XNN_UNREACHABLE;
+  }
+}
+
+enum xnn_status xnn_define_concatenate2(
+  xnn_subgraph_t subgraph,
+  size_t axis,
+  uint32_t input1_id,
+  uint32_t input2_id,
+  uint32_t output_id,
+  uint32_t flags)
+{
+  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
+    xnn_log_error("failed to define %s operator: XNNPACK is not initialized",
+      xnn_node_type_to_string(xnn_node_type_concatenate2));
+    return xnn_status_uninitialized;
+  }
+
+  if (input1_id >= subgraph->num_values) {
+    xnn_log_error(
+      "failed to define %s operator with the first input ID #%" PRIu32 ": invalid Value ID",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id);
+    return xnn_status_invalid_parameter;
+  }
+
+  const struct xnn_value* input1_value = &subgraph->values[input1_id];
+  if (input1_value->type != xnn_value_type_dense_tensor) {
+    xnn_log_error(
+      "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value type %d (expected dense tensor)",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, input1_value->type);
+    return xnn_status_invalid_parameter;
+  }
+
+  switch (input1_value->datatype) {
+    case xnn_datatype_fp32:
+#ifndef XNN_NO_QS8_OPERATORS
+    case xnn_datatype_qint8:
+#endif  // !defined(XNN_NO_QS8_OPERATORS)
+#ifndef XNN_NO_QU8_OPERATORS
+    case xnn_datatype_quint8:
+#endif  // !defined(XNN_NO_QU8_OPERATORS)
+      break;
+    default:
+      xnn_log_error(
+        "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)",
+        xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id,
+        xnn_datatype_to_string(input1_value->datatype), input1_value->datatype);
+      return xnn_status_invalid_parameter;
+  }
+
+  if (axis >= input1_value->shape.num_dims) {
+    xnn_log_error(
+      "failed to define %s operator with the first input ID #%" PRIu32
+      ": axis (%zu) exceeds the number of dimensions (%zu)",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, axis, input1_value->shape.num_dims);
+    return xnn_status_invalid_parameter;
+  }
+
+  if (input2_id >= subgraph->num_values) {
+    xnn_log_error(
+      "failed to define %s operator with the second input ID #%" PRIu32 ": invalid Value ID",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input2_id);
+    return xnn_status_invalid_parameter;
+  }
+
+  const struct xnn_value* input2_value = &subgraph->values[input2_id];
+  if (input2_value->type != xnn_value_type_dense_tensor) {
+    xnn_log_error(
+      "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value type %d (expected dense tensor)",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input2_id, input2_value->type);
+    return xnn_status_invalid_parameter;
+  }
+
+  switch (input2_value->datatype) {
+    case xnn_datatype_fp32:
+#ifndef XNN_NO_QS8_OPERATORS
+    case xnn_datatype_qint8:
+#endif  // !defined(XNN_NO_QS8_OPERATORS)
+#ifndef XNN_NO_QU8_OPERATORS
+    case xnn_datatype_quint8:
+#endif  // !defined(XNN_NO_QU8_OPERATORS)
+      break;
+    default:
+      xnn_log_error(
+        "failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)",
+        xnn_node_type_to_string(xnn_node_type_concatenate2), input2_id,
+        xnn_datatype_to_string(input2_value->datatype), input2_value->datatype);
+      return xnn_status_invalid_parameter;
+  }
+
+  if (axis >= input2_value->shape.num_dims) {
+    xnn_log_error(
+      "failed to define %s operator with the second input ID #%" PRIu32
+      ": axis (%zu) exceeds the number of dimensions (%zu)",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input2_id, axis, input2_value->shape.num_dims);
+    return xnn_status_invalid_parameter;
+  }
+
+  if (input1_value->shape.num_dims != input2_value->shape.num_dims) {
+    xnn_log_error(
+      "failed to define %s operator with input IDs #%" PRIu32 " and #%" PRIu32
+      ": mismatching number of input dimensions %zu and %zu",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, input2_id,
+      input1_value->shape.num_dims, input2_value->shape.num_dims);
+    return xnn_status_invalid_parameter;
+  }
+
+  for (size_t i = 0; i < input1_value->shape.num_dims; i++) {
+    if (i == axis) {
+      continue;
+    }
+
+    if (input1_value->shape.dim[i] != input2_value->shape.dim[i]) {
+      xnn_log_error(
+          "failed to define %s operator with input IDs #%" PRIu32 " and #%" PRIu32
+          ": mismatch dimension %zu, first input has %zu, second input has %zu",
+          xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, input2_id,
+          i, input1_value->shape.dim[i], input2_value->shape.dim[i]);
+      return xnn_status_invalid_parameter;
+    }
+  }
+
+  if (output_id >= subgraph->num_values) {
+    xnn_log_error(
+      "failed to define %s operator with output ID #%" PRIu32 ": invalid Value ID",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), output_id);
+    return xnn_status_invalid_parameter;
+  }
+
+  const struct xnn_value* output_value = &subgraph->values[output_id];
+  if (output_value->type != xnn_value_type_dense_tensor) {
+    xnn_log_error(
+      "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value type %d (expected dense tensor)",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), output_id, output_value->type);
+    return xnn_status_invalid_parameter;
+  }
+
+  if (input1_value->shape.num_dims != output_value->shape.num_dims) {
+    xnn_log_error(
+      "failed to define %s operator with output ID #%" PRIu32
+      ": mismatching number of dimensions, first input has %zu, output has %zu",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), output_id,
+      input1_value->shape.num_dims, output_value->shape.num_dims);
+    return xnn_status_invalid_parameter;
+  }
+
+  for (size_t i = 0; i < output_value->shape.num_dims; i++) {
+    if (i == axis) {
+      if (output_value->shape.dim[i] != input1_value->shape.dim[i] + input2_value->shape.dim[i]) {
+        xnn_log_error(
+            "failed to define %s operator with output ID #%" PRIu32
+            ": mismatch axis dimension %zu, output has %zu, sum of input dimensions is %zu",
+            xnn_node_type_to_string(xnn_node_type_concatenate2), output_id,
+            i, output_value->shape.dim[i], input1_value->shape.dim[i] + input2_value->shape.dim[i]);
+        return xnn_status_invalid_parameter;
+      }
+      continue;
+    }
+
+    if (output_value->shape.dim[i] != input1_value->shape.dim[i]) {
+      xnn_log_error(
+          "failed to define %s operator with output ID #%" PRIu32
+          ": mismatch dimension %zu, output has %zu, input has %zu",
+          xnn_node_type_to_string(xnn_node_type_concatenate2), output_id,
+          i, output_value->shape.dim[i], input1_value->shape.dim[i]);
+      return xnn_status_invalid_parameter;
+    }
+  }
+
+  enum xnn_compute_type compute_type = xnn_compute_type_invalid;
+  switch (output_value->datatype) {
+#ifndef XNN_NO_F16_OPERATORS
+    case xnn_datatype_fp16:
+      compute_type = xnn_compute_type_fp16;
+      break;
+#endif  // !defined(XNN_NO_F16_OPERATORS)
+    case xnn_datatype_fp32:
+      compute_type = xnn_compute_type_fp32;
+      break;
+#ifndef XNN_NO_QS8_OPERATORS
+    case xnn_datatype_qint8:
+      compute_type = xnn_compute_type_qs8;
+      break;
+#endif  // !defined(XNN_NO_QS8_OPERATORS)
+#ifndef XNN_NO_QU8_OPERATORS
+    case xnn_datatype_quint8:
+      compute_type = xnn_compute_type_qu8;
+      break;
+#endif  // !defined(XNN_NO_QU8_OPERATORS)
+    default:
+      xnn_log_error(
+        "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)",
+        xnn_node_type_to_string(xnn_node_type_concatenate2), output_id,
+        xnn_datatype_to_string(output_value->datatype), output_value->datatype);
+      return xnn_status_invalid_parameter;
+  }
+
+  if (input1_value->datatype != input2_value->datatype ||
+      input1_value->datatype != output_value->datatype)
+  {
+    xnn_log_error(
+      "failed to define %s operator with input IDs #%" PRIu32 " and #%" PRIu32 " and output ID #%" PRIu32
+      ": mismatching datatypes across the first input (%s), the second input (%s), and output (%s)",
+      xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, input2_id, output_id,
+      xnn_datatype_to_string(input1_value->datatype),
+      xnn_datatype_to_string(input2_value->datatype),
+      xnn_datatype_to_string(output_value->datatype));
+    return xnn_status_invalid_parameter;
+  }
+
+#if !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
+  if (compute_type == xnn_compute_type_qs8 || compute_type == xnn_compute_type_qu8) {
+    if (input1_value->quantization.zero_point != input2_value->quantization.zero_point) {
+      xnn_log_error(
+          "failed to define %s operator with input IDs #%" PRIu32 " and #%" PRIu32
+          ": mismatching quantization zero point across the first input (%d) and second input (%d)",
+          xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, input2_id,
+          input1_value->quantization.zero_point, input2_value->quantization.zero_point);
+      return xnn_status_invalid_parameter;
+    }
+    if (input1_value->quantization.zero_point != output_value->quantization.zero_point) {
+      xnn_log_error(
+          "failed to define %s operator with input ID #%" PRIu32 " and output ID #%" PRIu32
+          ": mismatching quantization zero point across the first input (%d) and the output (%d)",
+          xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, output_id,
+          input1_value->quantization.zero_point, output_value->quantization.zero_point);
+      return xnn_status_invalid_parameter;
+    }
+    if (input1_value->quantization.scale != input2_value->quantization.scale) {
+      xnn_log_error(
+          "failed to define %s operator with input IDs #%" PRIu32 " and #%" PRIu32
+          ": mismatching quantization scale across the first input (%.7g) and second input (%.7g)",
+          xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, input2_id,
+          input1_value->quantization.scale, input2_value->quantization.scale);
+      return xnn_status_invalid_parameter;
+    }
+    if (input1_value->quantization.scale != output_value->quantization.scale) {
+      xnn_log_error(
+          "failed to define %s operator with input ID #%" PRIu32 " and output ID #%" PRIu32
+          ": mismatching quantization scale across the first input (%.7g) and the output (%.7g)",
+          xnn_node_type_to_string(xnn_node_type_concatenate2), input1_id, output_id,
+          input1_value->quantization.scale, output_value->quantization.scale);
+      return xnn_status_invalid_parameter;
+    }
+  }
+#endif  // !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
+
+  struct xnn_node* node = xnn_subgraph_new_node(subgraph);
+  if (node == NULL) {
+    return xnn_status_out_of_memory;
+  }
+
+  node->params.concatenate.axis = axis;
+  node->type = xnn_node_type_concatenate2;
+  node->compute_type = compute_type;
+  node->num_inputs = 2;
+  node->inputs[0] = input1_id;
+  node->inputs[1] = input2_id;
+  node->num_outputs = 1;
+  node->outputs[0] = output_id;
+  node->flags = flags;
+
+  node->create = create_concatenate_operator;
+  node->setup = setup_concatenate_operator;
+
+  return xnn_status_success;
+}
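
To make the stride arithmetic in create_concatenate_operator concrete:
concatenating shapes {2, 4, 3} and {2, 4, 5} along axis 2 gives
batch_size = 2 * 4 = 8, channels_1 = 3, channels_2 = 5, and
output_stride = 3 + 5 = 8. The first copy operator then writes 8 rows of 3
elements starting at output_data, and the second writes 8 rows of 5 elements
starting at output_data + channels_1, with both operators advancing by
output_stride elements per row (the second operator's offset is applied in
setup_concatenate_operator via the typed pointer cast, e.g.
(uint32_t*) output_data + channels).
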
diff --git a/src/subgraph/convert.c b/src/subgraph/convert.c
index c118fa4..b692fdb 100644
--- a/src/subgraph/convert.c
+++ b/src/subgraph/convert.c
@@ -40,7 +40,7 @@
       status = xnn_create_convert_nc_f32_f16(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     case xnn_compute_type_fp32_to_qs8:
       status = xnn_create_convert_nc_f32_qs8(
@@ -49,7 +49,7 @@
         (int8_t) values[output_id].quantization.zero_point,
         INT8_MIN, INT8_MAX,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     case xnn_compute_type_fp32_to_qu8:
       status = xnn_create_convert_nc_f32_qu8(
@@ -58,13 +58,13 @@
         (uint8_t) values[output_id].quantization.zero_point,
         0, UINT8_MAX,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     case xnn_compute_type_fp16_to_fp32:
       status = xnn_create_convert_nc_f16_f32(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     case xnn_compute_type_qs8_to_fp32:
       status = xnn_create_convert_nc_qs8_f32(
@@ -72,7 +72,7 @@
         values[input_id].quantization.scale,
         (int8_t) values[input_id].quantization.zero_point,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     case xnn_compute_type_qu8_to_fp32:
       status = xnn_create_convert_nc_qu8_f32(
@@ -80,7 +80,7 @@
         values[input_id].quantization.scale,
         (uint8_t) values[input_id].quantization.zero_point,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     default:
       XNN_UNREACHABLE;
@@ -115,45 +115,45 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_convert_nc_f32_f16:
       return xnn_setup_convert_nc_f32_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
         threadpool);
     case xnn_operator_type_convert_nc_f32_qs8:
       return xnn_setup_convert_nc_f32_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
         threadpool);
     case xnn_operator_type_convert_nc_f32_qu8:
       return xnn_setup_convert_nc_f32_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
         threadpool);
     case xnn_operator_type_convert_nc_f16_f32:
       return xnn_setup_convert_nc_f16_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
         threadpool);
     case xnn_operator_type_convert_nc_qs8_f32:
       return xnn_setup_convert_nc_qs8_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
         threadpool);
     case xnn_operator_type_convert_nc_qu8_f32:
       return xnn_setup_convert_nc_qu8_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/convolution-2d.c b/src/subgraph/convolution-2d.c
index 5da0e55..4b4fc52 100644
--- a/src/subgraph/convolution-2d.c
+++ b/src/subgraph/convolution-2d.c
@@ -71,7 +71,7 @@
       node->activation.output_min,
       node->activation.output_max,
       node->flags | (values[input_id].layout == xnn_layout_type_nhwc ? XNN_FLAG_INPUT_NHWC : 0),
-      &opdata->operator_object);
+      &opdata->operator_objects[0]);
   } else {
     assert(values[input_id].layout == xnn_layout_type_nhwc);
     assert(values[output_id].layout == xnn_layout_type_nhwc);
@@ -99,7 +99,7 @@
           node->activation.output_max,
           node->flags,
           code_cache,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #ifndef XNN_NO_F16_OPERATORS
       case xnn_compute_type_fp16:
@@ -125,7 +125,7 @@
           node->activation.output_max,
           node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #endif  // XNN_NO_F16_OPERATORS
 #ifndef XNN_NO_QS8_OPERATORS
@@ -162,7 +162,7 @@
           output_scale, output_min, output_max,
           node->flags,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
       case xnn_compute_type_qc8:
@@ -198,7 +198,7 @@
           output_scale, output_min, output_max,
           node->flags,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -237,7 +237,7 @@
           output_scale, output_min, output_max,
           node->flags,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -277,10 +277,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_convolution_nchw_f32:
       return xnn_setup_convolution2d_nchw_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -290,7 +290,7 @@
       break;
     case xnn_operator_type_convolution_nhwc_f32:
       return xnn_setup_convolution2d_nhwc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -301,7 +301,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_convolution_nhwc_f16:
       return xnn_setup_convolution2d_nhwc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -313,7 +313,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_convolution_nhwc_qc8:
       return xnn_setup_convolution2d_nhwc_qc8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -323,7 +323,7 @@
       break;
     case xnn_operator_type_convolution_nhwc_qs8:
       return xnn_setup_convolution2d_nhwc_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -335,7 +335,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_convolution_nhwc_qu8:
       return xnn_setup_convolution2d_nhwc_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
diff --git a/src/subgraph/deconvolution-2d.c b/src/subgraph/deconvolution-2d.c
index 225f1e2..84b4956 100644
--- a/src/subgraph/deconvolution-2d.c
+++ b/src/subgraph/deconvolution-2d.c
@@ -77,7 +77,7 @@
           node->activation.output_max,
           node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS,
           code_cache,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_compute_type_fp32:
@@ -103,7 +103,7 @@
           node->activation.output_max,
           node->flags,
           code_cache,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -141,7 +141,7 @@
           output_max,
           node->flags,
           code_cache,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -182,7 +182,7 @@
           output_max,
           node->flags,
           code_cache,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -223,11 +223,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_deconvolution_nhwc_f16:
       return xnn_setup_deconvolution2d_nhwc_f16(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
@@ -240,7 +240,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_deconvolution_nhwc_f32:
       return xnn_setup_deconvolution2d_nhwc_f32(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
@@ -253,7 +253,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_deconvolution_nhwc_qs8:
       return xnn_setup_deconvolution2d_nhwc_qs8(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
@@ -267,7 +267,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_deconvolution_nhwc_qu8:
       return xnn_setup_deconvolution2d_nhwc_qu8(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
diff --git a/src/subgraph/depth-to-space.c b/src/subgraph/depth-to-space.c
index a65780f..f6f2c64 100644
--- a/src/subgraph/depth-to-space.c
+++ b/src/subgraph/depth-to-space.c
@@ -42,7 +42,7 @@
         output_channel_dim /* output stride */,
         node->params.depth_to_space.block_size,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
   } else {
     assert(values[input_id].layout == xnn_layout_type_nhwc);
     assert(values[output_id].layout == xnn_layout_type_nhwc);
@@ -55,7 +55,7 @@
             output_channel_dim /* output stride */,
             node->params.depth_to_space.block_size,
             node->flags,
-            &opdata->operator_object);
+            &opdata->operator_objects[0]);
         break;
 #endif  // XNN_NO_F16_OPERATORS
       case xnn_compute_type_fp32:
@@ -65,7 +65,7 @@
             output_channel_dim /* output stride */,
             node->params.depth_to_space.block_size,
             node->flags,
-            &opdata->operator_object);
+            &opdata->operator_objects[0]);
         break;
 #if !defined(XNN_NO_S8_OPERATORS) && !defined(XNN_NO_U8_OPERATORS)
       case xnn_compute_type_qs8:
@@ -76,7 +76,7 @@
             output_channel_dim /* output stride */,
             node->params.depth_to_space.block_size,
             node->flags,
-            &opdata->operator_object);
+            &opdata->operator_objects[0]);
         break;
 #endif  // !defined(XNN_NO_S8_OPERATORS) && !defined(XNN_NO_U8_OPERATORS)
       default:
@@ -117,10 +117,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_depth_to_space_nchw2nhwc_x32:
       return xnn_setup_depth_to_space_nchw2nhwc_x32(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
@@ -130,7 +130,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_depth_to_space_nhwc_x16:
       return xnn_setup_depth_to_space_nhwc_x16(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
@@ -140,7 +140,7 @@
 #endif  // XNN_NO_F16_OPERATORS
     case xnn_operator_type_depth_to_space_nhwc_x32:
       return xnn_setup_depth_to_space_nhwc_x32(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
@@ -150,7 +150,7 @@
 #if !defined(XNN_NO_S8_OPERATORS) && !defined(XNN_NO_U8_OPERATORS)
     case xnn_operator_type_depth_to_space_nhwc_x8:
       return xnn_setup_depth_to_space_nhwc_x8(
-          opdata->operator_object,
+          opdata->operator_objects[0],
           opdata->batch_size,
           opdata->input_height,
           opdata->input_width,
diff --git a/src/subgraph/depthwise-convolution-2d.c b/src/subgraph/depthwise-convolution-2d.c
index 4f19139..d6e1d71 100644
--- a/src/subgraph/depthwise-convolution-2d.c
+++ b/src/subgraph/depthwise-convolution-2d.c
@@ -72,7 +72,7 @@
       node->activation.output_min,
       node->activation.output_max,
       node->flags | XNN_FLAG_DEPTHWISE_CONVOLUTION,
-      &opdata->operator_object);
+      &opdata->operator_objects[0]);
   } else {
     assert(values[input_id].layout == xnn_layout_type_nhwc);
     assert(values[output_id].layout == xnn_layout_type_nhwc);
@@ -100,7 +100,7 @@
           node->activation.output_max,
           node->flags | XNN_FLAG_DEPTHWISE_CONVOLUTION,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #ifndef XNN_NO_F16_OPERATORS
       case xnn_compute_type_fp16:
@@ -126,7 +126,7 @@
           node->activation.output_max,
           node->flags | XNN_FLAG_DEPTHWISE_CONVOLUTION | XNN_FLAG_FP32_STATIC_WEIGHTS,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #endif  // XNN_NO_F16_OPERATORS
 #ifndef XNN_NO_QS8_OPERATORS
@@ -163,7 +163,7 @@
           output_scale, output_min, output_max,
           node->flags | XNN_FLAG_DEPTHWISE_CONVOLUTION,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
       case xnn_compute_type_qc8:
@@ -199,7 +199,7 @@
           output_scale, output_min, output_max,
           node->flags | XNN_FLAG_DEPTHWISE_CONVOLUTION,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -238,7 +238,7 @@
           output_scale, output_min, output_max,
           node->flags | XNN_FLAG_DEPTHWISE_CONVOLUTION,
           NULL,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -278,10 +278,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_convolution_nchw_f32:
       return xnn_setup_convolution2d_nchw_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -291,7 +291,7 @@
       break;
     case xnn_operator_type_convolution_nhwc_f32:
       return xnn_setup_convolution2d_nhwc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -302,7 +302,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_convolution_nhwc_f16:
       return xnn_setup_convolution2d_nhwc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -314,7 +314,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_convolution_nhwc_qc8:
       return xnn_setup_convolution2d_nhwc_qc8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -324,7 +324,7 @@
       break;
     case xnn_operator_type_convolution_nhwc_qs8:
       return xnn_setup_convolution2d_nhwc_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -336,7 +336,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_convolution_nhwc_qu8:
       return xnn_setup_convolution2d_nhwc_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
diff --git a/src/subgraph/divide.c b/src/subgraph/divide.c
index a15e5a7..1faadb1 100644
--- a/src/subgraph/divide.c
+++ b/src/subgraph/divide.c
@@ -40,7 +40,7 @@
     node->activation.output_min,
     node->activation.output_max,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->shape1.num_dims = values[input1_id].shape.num_dims;
     opdata->shape2.num_dims = values[input2_id].shape.num_dims;
@@ -102,7 +102,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_divide_nd_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->shape1.num_dims,
     opdata->shape1.dim,
     opdata->shape2.num_dims,
diff --git a/src/subgraph/elu.c b/src/subgraph/elu.c
index 5503081..2cd15cd 100644
--- a/src/subgraph/elu.c
+++ b/src/subgraph/elu.c
@@ -40,7 +40,7 @@
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->params.elu.alpha,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -53,7 +53,7 @@
         values[output_id].quantization.scale,
         INT8_MIN, INT8_MAX,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // XNN_NO_QS8_OPERATORS
     default:
@@ -89,10 +89,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_elu_nc_f32:
       return xnn_setup_elu_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -100,7 +100,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_elu_nc_qs8:
       return xnn_setup_elu_nc_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/floor.c b/src/subgraph/floor.c
index 62b61a4..7bd84f4 100644
--- a/src/subgraph/floor.c
+++ b/src/subgraph/floor.c
@@ -38,7 +38,7 @@
   const enum xnn_status status = xnn_create_floor_nc_f32(
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
     opdata->inputs[0] = input_id;
@@ -70,7 +70,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_floor_nc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     input_data,
     output_data,
diff --git a/src/subgraph/fully-connected.c b/src/subgraph/fully-connected.c
index 01f3268..a3bb835 100644
--- a/src/subgraph/fully-connected.c
+++ b/src/subgraph/fully-connected.c
@@ -71,7 +71,7 @@
         node->activation.output_max,
         node->flags /* flags */,
         code_cache,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -96,7 +96,7 @@
         output_scale, output_min, output_max,
         node->flags /* flags */,
         code_cache,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -124,7 +124,7 @@
         output_scale, output_min, output_max,
         node->flags /* flags */,
         code_cache,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -161,10 +161,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_fully_connected_nc_f32:
       return xnn_setup_fully_connected_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -172,7 +172,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_fully_connected_nc_qs8:
       return xnn_setup_fully_connected_nc_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -181,7 +181,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_fully_connected_nc_qu8:
       return xnn_setup_fully_connected_nc_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/global-average-pooling-2d.c b/src/subgraph/global-average-pooling-2d.c
index c517330..b05d94e 100644
--- a/src/subgraph/global-average-pooling-2d.c
+++ b/src/subgraph/global-average-pooling-2d.c
@@ -42,7 +42,7 @@
       node->activation.output_min,
       node->activation.output_max,
       node->flags,
-      &opdata->operator_object);
+      &opdata->operator_objects[0]);
   } else {
     assert(values[node->inputs[0]].layout == xnn_layout_type_nhwc);
     assert(values[node->outputs[0]].layout == xnn_layout_type_nhwc);
@@ -53,7 +53,7 @@
           node->activation.output_min,
           node->activation.output_max,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #ifndef XNN_NO_F16_OPERATORS
       case xnn_compute_type_fp16:
@@ -62,7 +62,7 @@
           node->activation.output_min,
           node->activation.output_max,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
 #ifndef XNN_NO_QS8_OPERATORS
@@ -81,7 +81,7 @@
           output_min,
           output_max,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -101,7 +101,7 @@
           output_min,
           output_max,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
       }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -140,10 +140,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_global_average_pooling_ncw_f32:
       return xnn_setup_global_average_pooling_ncw_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_width,
         input_data,
@@ -152,7 +152,7 @@
       break;
     case xnn_operator_type_global_average_pooling_nwc_f32:
       return xnn_setup_global_average_pooling_nwc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_width,
         input_data,
@@ -162,7 +162,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_global_average_pooling_nwc_f16:
       return xnn_setup_global_average_pooling_nwc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_width,
         input_data,
@@ -173,7 +173,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_global_average_pooling_nwc_qs8:
       return xnn_setup_global_average_pooling_nwc_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_width,
         input_data,
@@ -184,7 +184,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_global_average_pooling_nwc_qu8:
       return xnn_setup_global_average_pooling_nwc_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_width,
         input_data,
diff --git a/src/subgraph/hardswish.c b/src/subgraph/hardswish.c
index 367c640..a5aad4c 100644
--- a/src/subgraph/hardswish.c
+++ b/src/subgraph/hardswish.c
@@ -41,14 +41,14 @@
       status = xnn_create_hardswish_nc_f32(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_compute_type_fp16:
       status = xnn_create_hardswish_nc_f16(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     default:
@@ -84,10 +84,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_hardswish_nc_f32:
       return xnn_setup_hardswish_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -95,7 +95,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_hardswish_nc_f16:
       return xnn_setup_hardswish_nc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/leaky-relu.c b/src/subgraph/leaky-relu.c
index 3fc987d..0a70a0e 100644
--- a/src/subgraph/leaky-relu.c
+++ b/src/subgraph/leaky-relu.c
@@ -41,7 +41,7 @@
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->params.leaky_relu.negative_slope,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // XNN_NO_F16_OPERATORS
     case xnn_compute_type_fp32:
@@ -49,7 +49,7 @@
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->params.leaky_relu.negative_slope,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     default:
       XNN_UNREACHABLE;
@@ -84,11 +84,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_leaky_relu_nc_f16:
       return xnn_setup_leaky_relu_nc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -96,7 +96,7 @@
 #endif  // XNN_NO_F16_OPERATORS
     case xnn_operator_type_leaky_relu_nc_f32:
       return xnn_setup_leaky_relu_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/max-pooling-2d.c b/src/subgraph/max-pooling-2d.c
index cb9535f..4133370 100644
--- a/src/subgraph/max-pooling-2d.c
+++ b/src/subgraph/max-pooling-2d.c
@@ -52,7 +52,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_compute_type_fp32:
@@ -71,7 +71,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_S8_OPERATORS
     case xnn_compute_type_qs8:
@@ -97,7 +97,7 @@
         output_min,
         output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_S8_OPERATORS)
@@ -125,7 +125,7 @@
         output_min,
         output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_U8_OPERATORS)
@@ -164,11 +164,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_max_pooling_nhwc_f16:
       return xnn_setup_max_pooling2d_nhwc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -178,7 +178,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_max_pooling_nhwc_f32:
       return xnn_setup_max_pooling2d_nhwc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -188,7 +188,7 @@
 #ifndef XNN_NO_S8_OPERATORS
     case xnn_operator_type_max_pooling_nhwc_s8:
       return xnn_setup_max_pooling2d_nhwc_s8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -199,7 +199,7 @@
 #ifndef XNN_NO_U8_OPERATORS
     case xnn_operator_type_max_pooling_nhwc_u8:
       return xnn_setup_max_pooling2d_nhwc_u8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
diff --git a/src/subgraph/maximum2.c b/src/subgraph/maximum2.c
index f8ca58f..0b7f6ff 100644
--- a/src/subgraph/maximum2.c
+++ b/src/subgraph/maximum2.c
@@ -38,7 +38,7 @@
 
   const enum xnn_status status = xnn_create_maximum_nd_f32(
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->shape1.num_dims = values[input1_id].shape.num_dims;
     opdata->shape2.num_dims = values[input2_id].shape.num_dims;
@@ -100,7 +100,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_maximum_nd_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->shape1.num_dims,
     opdata->shape1.dim,
     opdata->shape2.num_dims,
diff --git a/src/subgraph/minimum2.c b/src/subgraph/minimum2.c
index d6c99af..5f71606 100644
--- a/src/subgraph/minimum2.c
+++ b/src/subgraph/minimum2.c
@@ -38,7 +38,7 @@
 
   const enum xnn_status status = xnn_create_minimum_nd_f32(
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->shape1.num_dims = values[input1_id].shape.num_dims;
     opdata->shape2.num_dims = values[input2_id].shape.num_dims;
@@ -100,7 +100,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_minimum_nd_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->shape1.num_dims,
     opdata->shape1.dim,
     opdata->shape2.num_dims,
diff --git a/src/subgraph/multiply2.c b/src/subgraph/multiply2.c
index 2dfd304..21ed368 100644
--- a/src/subgraph/multiply2.c
+++ b/src/subgraph/multiply2.c
@@ -42,7 +42,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // XNN_NO_F16_OPERATORS
     case xnn_compute_type_fp32:
@@ -50,7 +50,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -68,7 +68,7 @@
         values[input2_id].quantization.scale,
         (int8_t) output_zero_point,
         output_scale, output_min, output_max, node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -88,7 +88,7 @@
         values[input2_id].quantization.scale,
         (uint8_t) output_zero_point,
         output_scale, output_min, output_max, node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -155,11 +155,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_multiply_nd_f16:
       return xnn_setup_multiply_nd_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -170,7 +170,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_multiply_nd_f32:
       return xnn_setup_multiply_nd_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -181,7 +181,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_multiply_nd_qs8:
       return xnn_setup_multiply_nd_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -193,7 +193,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_multiply_nd_qu8:
       return xnn_setup_multiply_nd_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
diff --git a/src/subgraph/negate.c b/src/subgraph/negate.c
index acc3fd7..655950f 100644
--- a/src/subgraph/negate.c
+++ b/src/subgraph/negate.c
@@ -38,7 +38,7 @@
   const enum xnn_status status = xnn_create_negate_nc_f32(
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
     opdata->inputs[0] = input_id;
@@ -70,7 +70,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_negate_nc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     input_data,
     output_data,
diff --git a/src/subgraph/prelu.c b/src/subgraph/prelu.c
index c535056..ed6ea10 100644
--- a/src/subgraph/prelu.c
+++ b/src/subgraph/prelu.c
@@ -44,7 +44,7 @@
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         values[slope_id].data /* negative slope */,
         node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // XNN_NO_F16_OPERATORS
     case xnn_compute_type_fp32:
@@ -52,7 +52,7 @@
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         values[slope_id].data /* negative slope */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     default:
       XNN_UNREACHABLE;
@@ -87,11 +87,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_prelu_nc_f16:
       return xnn_setup_prelu_nc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -99,7 +99,7 @@
 #endif  // XNN_NO_F16_OPERATORS
     case xnn_operator_type_prelu_nc_f32:
       return xnn_setup_prelu_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/sigmoid.c b/src/subgraph/sigmoid.c
index 87caec2..60a1ffe 100644
--- a/src/subgraph/sigmoid.c
+++ b/src/subgraph/sigmoid.c
@@ -40,14 +40,14 @@
       status = xnn_create_sigmoid_nc_f16(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_compute_type_fp32:
       status = xnn_create_sigmoid_nc_f32(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -60,7 +60,7 @@
         values[output_id].quantization.scale,
         INT8_MIN, INT8_MAX,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -75,7 +75,7 @@
         values[output_id].quantization.scale,
         0, UINT8_MAX,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -112,11 +112,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_sigmoid_nc_f16:
       return xnn_setup_sigmoid_nc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -124,7 +124,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_sigmoid_nc_f32:
       return xnn_setup_sigmoid_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -132,7 +132,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_sigmoid_nc_qs8:
       return xnn_setup_sigmoid_nc_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -141,7 +141,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_sigmoid_nc_qu8:
       return xnn_setup_sigmoid_nc_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/softmax.c b/src/subgraph/softmax.c
index 4c9bc6d..f3eff15 100644
--- a/src/subgraph/softmax.c
+++ b/src/subgraph/softmax.c
@@ -39,14 +39,14 @@
       status = xnn_create_softmax_nc_f32(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_datatype_fp16:
       status = xnn_create_softmax_nc_f16(
         channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     default:
@@ -82,10 +82,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_softmax_nc_f32:
       return xnn_setup_softmax_nc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -93,7 +93,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_softmax_nc_f16:
       return xnn_setup_softmax_nc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/square-root.c b/src/subgraph/square-root.c
index cf75607..12a2db1 100644
--- a/src/subgraph/square-root.c
+++ b/src/subgraph/square-root.c
@@ -38,7 +38,7 @@
   const enum xnn_status status = xnn_create_square_root_nc_f32(
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
     opdata->inputs[0] = input_id;
@@ -70,7 +70,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_square_root_nc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     input_data,
     output_data,
diff --git a/src/subgraph/square.c b/src/subgraph/square.c
index c91f0d1..8aa6c66 100644
--- a/src/subgraph/square.c
+++ b/src/subgraph/square.c
@@ -38,7 +38,7 @@
   const enum xnn_status status = xnn_create_square_nc_f32(
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
     opdata->inputs[0] = input_id;
@@ -70,7 +70,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_square_nc_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     input_data,
     output_data,
diff --git a/src/subgraph/squared-difference.c b/src/subgraph/squared-difference.c
index f0788f4..5f48cd8 100644
--- a/src/subgraph/squared-difference.c
+++ b/src/subgraph/squared-difference.c
@@ -38,7 +38,7 @@
 
   const enum xnn_status status = xnn_create_squared_difference_nd_f32(
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->shape1.num_dims = values[input1_id].shape.num_dims;
     opdata->shape2.num_dims = values[input2_id].shape.num_dims;
@@ -100,7 +100,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_squared_difference_nd_f32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->shape1.num_dims,
     opdata->shape1.dim,
     opdata->shape2.num_dims,
diff --git a/src/subgraph/static-constant-pad.c b/src/subgraph/static-constant-pad.c
index b19bf6a..f4eb9ff 100644
--- a/src/subgraph/static-constant-pad.c
+++ b/src/subgraph/static-constant-pad.c
@@ -40,14 +40,14 @@
       status = xnn_create_constant_pad_nd_x16(
         &node->params.static_pad.padding_value,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_compute_type_fp32:
       status = xnn_create_constant_pad_nd_x32(
         &node->params.static_pad.padding_value,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -59,7 +59,7 @@
       status = xnn_create_constant_pad_nd_x8(
         &node->params.static_pad.padding_value,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
     default:
@@ -97,11 +97,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #if !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
     case xnn_operator_type_constant_pad_nd_x8:
       return xnn_setup_constant_pad_nd_x8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->pre_paddings,
@@ -114,7 +114,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_constant_pad_nd_x16:
       return xnn_setup_constant_pad_nd_x16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->pre_paddings,
@@ -126,7 +126,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_constant_pad_nd_x32:
       return xnn_setup_constant_pad_nd_x32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->pre_paddings,
diff --git a/src/subgraph/static-reshape.c b/src/subgraph/static-reshape.c
index 9e777c1..0648574 100644
--- a/src/subgraph/static-reshape.c
+++ b/src/subgraph/static-reshape.c
@@ -38,14 +38,14 @@
       status = xnn_create_copy_nc_x16(
         1 /* channels */, 1 /* input stride */, 1 /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_compute_type_fp32:
       status = xnn_create_copy_nc_x32(
         1 /* channels */, 1 /* input stride */, 1 /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -57,7 +57,7 @@
       status = xnn_create_copy_nc_x8(
         1 /* channels */, 1 /* input stride */, 1 /* output stride */,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #endif  // !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
     default:
@@ -93,11 +93,11 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
 #if !defined(XNN_NO_QS8_OPERATORS) || !defined(XNN_NO_QU8_OPERATORS)
     case xnn_operator_type_copy_nc_x8:
       return xnn_setup_copy_nc_x8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -107,7 +107,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_copy_nc_x16:
       return xnn_setup_copy_nc_x16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
@@ -116,7 +116,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_copy_nc_x32:
       return xnn_setup_copy_nc_x32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         input_data,
         output_data,
diff --git a/src/subgraph/static-resize-bilinear-2d.c b/src/subgraph/static-resize-bilinear-2d.c
index 90f0ab8..361d800 100644
--- a/src/subgraph/static-resize-bilinear-2d.c
+++ b/src/subgraph/static-resize-bilinear-2d.c
@@ -40,7 +40,7 @@
     status = xnn_create_resize_bilinear2d_nchw_f32(
       channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
       node->flags,
-      &opdata->operator_object);
+      &opdata->operator_objects[0]);
   } else {
     assert(values[input_id].layout == xnn_layout_type_nhwc);
     assert(values[output_id].layout == xnn_layout_type_nhwc);
@@ -50,21 +50,21 @@
         status = xnn_create_resize_bilinear2d_nhwc_f16(
           channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #endif  // !defined(XNN_NO_F16_OPERATORS)
       case xnn_compute_type_fp32:
         status = xnn_create_resize_bilinear2d_nhwc_f32(
           channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #ifndef XNN_NO_S8_OPERATORS
       case xnn_compute_type_qs8:
         status = xnn_create_resize_bilinear2d_nhwc_s8(
           channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #endif  // !defined(XNN_NO_S8_OPERATORS)
 #ifndef XNN_NO_U8_OPERATORS
@@ -72,7 +72,7 @@
         status = xnn_create_resize_bilinear2d_nhwc_u8(
           channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
           node->flags,
-          &opdata->operator_object);
+          &opdata->operator_objects[0]);
         break;
 #endif  // !defined(XNN_NO_U8_OPERATORS)
       default:
@@ -113,10 +113,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_resize_bilinear_nchw_f32:
       return xnn_setup_resize_bilinear2d_nchw_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -129,7 +129,7 @@
 #ifndef XNN_NO_F16_OPERATORS
     case xnn_operator_type_resize_bilinear_nhwc_f16:
       return xnn_setup_resize_bilinear2d_nhwc_f16(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -142,7 +142,7 @@
 #endif  // !defined(XNN_NO_F16_OPERATORS)
     case xnn_operator_type_resize_bilinear_nhwc_f32:
       return xnn_setup_resize_bilinear2d_nhwc_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -155,7 +155,7 @@
 #ifndef XNN_NO_S8_OPERATORS
     case xnn_operator_type_resize_bilinear_nhwc_s8:
       return xnn_setup_resize_bilinear2d_nhwc_s8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
@@ -169,7 +169,7 @@
 #ifndef XNN_NO_U8_OPERATORS
     case xnn_operator_type_resize_bilinear_nhwc_u8:
       return xnn_setup_resize_bilinear2d_nhwc_u8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->batch_size,
         opdata->input_height,
         opdata->input_width,
diff --git a/src/subgraph/subtract.c b/src/subgraph/subtract.c
index 0a0518d..e9aab94 100644
--- a/src/subgraph/subtract.c
+++ b/src/subgraph/subtract.c
@@ -41,7 +41,7 @@
         node->activation.output_min,
         node->activation.output_max,
         node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_compute_type_qs8:
@@ -59,7 +59,7 @@
         values[input2_id].quantization.scale,
         (int8_t) output_zero_point,
         output_scale, output_min, output_max, node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QS8_OPERATORS)
@@ -79,7 +79,7 @@
         values[input2_id].quantization.scale,
         (uint8_t) output_zero_point,
         output_scale, output_min, output_max, node->flags,
-        &opdata->operator_object);
+        &opdata->operator_objects[0]);
       break;
     }
 #endif  // !defined(XNN_NO_QU8_OPERATORS)
@@ -146,10 +146,10 @@
   void* output_data = output_blob->data;
   assert(output_data != NULL);
 
-  switch (opdata->operator_object->type) {
+  switch (opdata->operator_objects[0]->type) {
     case xnn_operator_type_subtract_nd_f32:
       return xnn_setup_subtract_nd_f32(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -160,7 +160,7 @@
 #ifndef XNN_NO_QS8_OPERATORS
     case xnn_operator_type_subtract_nd_qs8:
       return xnn_setup_subtract_nd_qs8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
@@ -172,7 +172,7 @@
 #ifndef XNN_NO_QU8_OPERATORS
     case xnn_operator_type_subtract_nd_qu8:
       return xnn_setup_subtract_nd_qu8(
-        opdata->operator_object,
+        opdata->operator_objects[0],
         opdata->shape1.num_dims,
         opdata->shape1.dim,
         opdata->shape2.num_dims,
diff --git a/src/subgraph/unpooling-2d.c b/src/subgraph/unpooling-2d.c
index 79d93fa..05e2e47 100644
--- a/src/subgraph/unpooling-2d.c
+++ b/src/subgraph/unpooling-2d.c
@@ -48,7 +48,7 @@
     node->params.pooling_2d.pooling_width,
     channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
     node->flags,
-    &opdata->operator_object);
+    &opdata->operator_objects[0]);
   if (status == xnn_status_success) {
     opdata->batch_size = values[input_value_id].shape.dim[0];
     opdata->input_height = values[input_value_id].shape.dim[1];
@@ -91,7 +91,7 @@
   assert(output_data != NULL);
 
   return xnn_setup_unpooling2d_nhwc_x32(
-    opdata->operator_object,
+    opdata->operator_objects[0],
     opdata->batch_size,
     opdata->input_height,
     opdata->input_width,
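
Before the subgraph.h changes below, it is worth sketching how two strided copies can realize the concatenation itself. The following is illustrative, not the verbatim concatenate2.c code: it assumes f32 tensors, reuses the xnn_create_copy_nc_x32/xnn_setup_copy_nc_x32 operators that already appear in static-reshape.c above, and takes the axis from the concatenate params struct added below; x1, x2, y, shape1, shape2, num_dims, and threadpool are stand-in names. Everything before the axis folds into the batch size; everything from the axis onward folds into each copy's channel count:

    /* Hypothetical sketch: concatenate x1 and x2 along `axis` into y using two
     * strided copies. c1/c2 are the per-batch element counts of each input from
     * the axis onward; each output row holds c1 + c2 elements. */
    size_t batch = 1, c1 = 1, c2 = 1;
    for (size_t i = 0; i < axis; i++) {
      batch *= shape1[i];                  /* leading dims of both inputs match */
    }
    for (size_t i = axis; i < num_dims; i++) {
      c1 *= shape1[i];
      c2 *= shape2[i];                     /* dims after the axis also match */
    }

    /* One copy operator per input, both writing rows of width c1 + c2. */
    xnn_create_copy_nc_x32(c1, /*input stride=*/c1, /*output stride=*/c1 + c2,
                           /*flags=*/0, &opdata->operator_objects[0]);
    xnn_create_copy_nc_x32(c2, /*input stride=*/c2, /*output stride=*/c1 + c2,
                           /*flags=*/0, &opdata->operator_objects[1]);

    /* The second copy starts c1 elements into every output row. */
    xnn_setup_copy_nc_x32(opdata->operator_objects[0], batch, x1, y, threadpool);
    xnn_setup_copy_nc_x32(opdata->operator_objects[1], batch, x2,
                          (float*) y + c1, threadpool);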
diff --git a/src/xnnpack/subgraph.h b/src/xnnpack/subgraph.h
index fba88e3..ac71d07 100644
--- a/src/xnnpack/subgraph.h
+++ b/src/xnnpack/subgraph.h
@@ -20,6 +20,8 @@
 
 #define XNN_INVALID_NODE_ID UINT32_MAX
 
+#define XNN_MAX_OPERATOR_OBJECTS 2
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -143,6 +145,7 @@
   xnn_node_type_bankers_rounding,
   xnn_node_type_ceiling,
   xnn_node_type_clamp,
+  xnn_node_type_concatenate2,
   xnn_node_type_convert,
   xnn_node_type_convolution_2d,
   xnn_node_type_deconvolution_2d,
@@ -258,6 +261,9 @@
       size_t new_height;
       size_t new_width;
     } static_resize;
+    struct {
+      size_t axis;
+    } concatenate;
   } params;
   struct {
     float output_min;
@@ -285,7 +291,7 @@
 };
 
 struct xnn_operator_data {
-  xnn_operator_t operator_object;
+  xnn_operator_t operator_objects[XNN_MAX_OPERATOR_OBJECTS];
   xnn_setup_operator_fn setup;
   size_t batch_size;
   size_t input_height;
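
With operator_object widened to an array, every site that creates, sets up, or destroys operators loops over the slots. A minimal sketch of the teardown side, assuming (as the hunks above guarantee) that unused slots are left NULL:

    /* Illustrative teardown: release every populated slot of every node.
     * xnn_delete_operator is the public destructor for xnn_operator_t. */
    for (size_t i = 0; i < runtime->num_ops; i++) {
      struct xnn_operator_data* opdata = &runtime->opdata[i];
      for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
        if (opdata->operator_objects[j] != NULL) {
          xnn_delete_operator(opdata->operator_objects[j]);
          opdata->operator_objects[j] = NULL;
        }
      }
    }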