[tf.unique()] Optimize the hash table implementation in `UniqueOp::Compute()`.

This change makes two improvements to the `UniqueOp` implementation:

1. Use `absl::flat_hash_map` instead of `std::unordered_map`.
2. For the `tstring` implementation, use `StringPiece` as the key instead of `tstring`, which avoids copying the strings into the map.

In addition, this change switches the microbenchmarks in unique_op_test.cc to use the SINGLE_THREADED_EXECUTOR, which removes thread scheduling overhead from the microbenchmark, and reduces noise in the results.

Microbenchmark results show a saving of between 0% and 65% on BM_Unique_INT32, between 8% and 26% on BM_Unique_INT32_Repeat, and between 17% and 40% on BM_Unique_STRING.

PiperOrigin-RevId: 307647292
Change-Id: If4367df37b856bf1c4cf91fcb34eea479014077f
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 0510730..2016612 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -1371,7 +1371,9 @@
 tf_kernel_library(
     name = "unique_op",
     prefix = "unique_op",
-    deps = ARRAY_DEPS,
+    deps = ARRAY_DEPS + [
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
 )
 
 tf_kernel_library(
@@ -2335,6 +2337,7 @@
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels/data:single_threaded_executor",
     ],
 )
 
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 8a9965f..2c1505f 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,10 +27,19 @@
 #include "tensorflow/core/lib/hash/hash.h"
 
 namespace tensorflow {
+namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-template <typename T, typename TIndex>
+// `UniqueOp` computes the unique elements in the input tensor.
+//
+// * `T` is the element type.
+// * `TKey` is the key type used in a local hash map. It must be explicitly
+//   convertible from `T`. For POD inputs, `TKey = T`. For `tstring` inputs,
+//   `TKey = absl::string_view` avoids copying the input strings into the map.
+// * `TIndex` is the type used to represent indices in the output, either
+//   `int32` or `int64`.
+template <typename T, typename TKey, typename TIndex>
 class UniqueOp : public OpKernel {
  public:
   explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -106,10 +116,10 @@
       auto Tin = input.flat<T>();
       const int64 N = static_cast<int64>(Tin.size());
 
-      std::unordered_map<T, TIndex> uniq;
+      absl::flat_hash_map<TKey, TIndex> uniq;
       uniq.reserve(2 * N);
       for (Eigen::Index i = 0, j = 0; i < N; ++i) {
-        auto it = uniq.insert(std::make_pair(Tin(i), j));
+        auto it = uniq.emplace(TKey(Tin(i)), j);
         idx_vec(i) = it.first->second;
         if (it.second) {
           ++j;
@@ -153,13 +163,14 @@
         return true;
       };
 
-      std::unordered_map<int64, int64, decltype(hash_fn), decltype(equal_to_fn)>
+      absl::flat_hash_map<int64, int64, decltype(hash_fn),
+                          decltype(equal_to_fn)>
           uniq(0, hash_fn, equal_to_fn);
 
       uniq.reserve(2 * Tin.dimension(1));
 
       for (int64 i = 0, j = 0; i < Tin.dimension(1); ++i) {
-        auto it = uniq.insert(std::make_pair(i, j));
+        auto it = uniq.emplace(i, j);
         idx_vec(i) = it.first->second;
         if (it.second) {
           ++j;
@@ -194,51 +205,56 @@
   }
 };
 
-#define REGISTER_UNIQUE(type)                                    \
+#define REGISTER_UNIQUE_WITH_KEY_TYPE(type, key_type)            \
   REGISTER_KERNEL_BUILDER(Name("Unique")                         \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueOp<type, int32>);                \
+                          UniqueOp<type, key_type, int32>);      \
   REGISTER_KERNEL_BUILDER(Name("Unique")                         \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int64>("out_idx"), \
-                          UniqueOp<type, int64>);                \
+                          UniqueOp<type, key_type, int64>);      \
   REGISTER_KERNEL_BUILDER(Name("UniqueV2")                       \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueOp<type, int32>);                \
+                          UniqueOp<type, key_type, int32>);      \
   REGISTER_KERNEL_BUILDER(Name("UniqueV2")                       \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int64>("out_idx"), \
-                          UniqueOp<type, int64>);                \
+                          UniqueOp<type, key_type, int64>);      \
   REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts")               \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueOp<type, int32>)                 \
+                          UniqueOp<type, key_type, int32>)       \
   REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts")               \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int64>("out_idx"), \
-                          UniqueOp<type, int64>);                \
+                          UniqueOp<type, key_type, int64>);      \
   REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2")             \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueOp<type, int32>)                 \
+                          UniqueOp<type, key_type, int32>)       \
   REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2")             \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int64>("out_idx"), \
-                          UniqueOp<type, int64>)
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
-REGISTER_UNIQUE(tstring)
-REGISTER_UNIQUE(bool)
-#undef REGISTER_UNIQUE
+                          UniqueOp<type, key_type, int64>)
+#define REGISTER_UNIQUE_WITH_SAME_KEY_TYPE(type) \
+  REGISTER_UNIQUE_WITH_KEY_TYPE(type, type)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE_WITH_SAME_KEY_TYPE);
+REGISTER_UNIQUE_WITH_SAME_KEY_TYPE(bool)
+#undef REGISTER_UNIQUE_WITH_SAME_KEY_TYPE
+
+REGISTER_UNIQUE_WITH_KEY_TYPE(tstring, absl::string_view)
+#undef REGISTER_UNIQUE_WITH_KEY_TYPE
 
 // Fake integer GPU kernels so that the use of Unique in optimizers (to
 // de-duplicate sparse gradient indices) does not conflict with gradients being
@@ -251,7 +267,7 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int32, int32>);
+                        UniqueOp<int32, int32, int32>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
@@ -259,7 +275,7 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int32, int64>);
+                        UniqueOp<int32, int32, int64>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int64>("T")
@@ -267,7 +283,7 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int64, int32>);
+                        UniqueOp<int64, int64, int32>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int64>("T")
@@ -275,7 +291,7 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int64, int64>);
+                        UniqueOp<int64, int64, int64>);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Unique")
@@ -285,7 +301,7 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int32, int32>);
+                        UniqueOp<int32, int32, int32>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int64>("T")
@@ -293,7 +309,7 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int64, int32>);
+                        UniqueOp<int64, int64, int32>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
@@ -301,7 +317,7 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int32, int64>);
+                        UniqueOp<int32, int32, int64>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int64>("T")
@@ -309,6 +325,8 @@
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int64, int64>);
+                        UniqueOp<int64, int64, int64>);
 #endif  // TENSORFLOW_USE_SYCL
+
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc
index 4861a45..a0249d9 100644
--- a/tensorflow/core/kernels/unique_op_test.cc
+++ b/tensorflow/core/kernels/unique_op_test.cc
@@ -22,6 +22,7 @@
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
@@ -75,11 +76,14 @@
                   .Input(test::graph::Constant(g, input))
                   .Attr("T", DT_INT32)
                   .Finalize(g, &node));
+  FixupSourceAndSinkEdges(g);
 
   testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(int32));
   testing::UseRealTime();
   testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
+                  "SINGLE_THREADED_EXECUTOR")
+      .Run(iters);
 }
 
 static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) {
@@ -95,12 +99,15 @@
                   .Input(test::graph::Constant(g, input))
                   .Attr("T", DT_INT32)
                   .Finalize(g, &node));
+  FixupSourceAndSinkEdges(g);
 
   testing::BytesProcessed(static_cast<int64>(iters) * dim * 200 *
                           sizeof(int32));
   testing::UseRealTime();
   testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
+                  "SINGLE_THREADED_EXECUTOR")
+      .Run(iters);
 }
 
 TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) {
@@ -132,11 +139,14 @@
                   .Input(test::graph::Constant(g, input))
                   .Attr("T", DT_STRING)
                   .Finalize(g, &node));
+  FixupSourceAndSinkEdges(g);
 
   testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(tstring));
   testing::UseRealTime();
   testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
+                  "SINGLE_THREADED_EXECUTOR")
+      .Run(iters);
 }
 
 BENCHMARK(BM_Unique_INT32)