Simplify logic for growing single group table.

Instead of swapping first and second halves, we just shift elements by 1.

That algorithm derives "random" order from the previous state and leave a freedom for new element to be added at the beginning or at the end.

For tables resizing from capacity 1 to 3, the logic is practically unchanged (although we remove 0 size memcpy).

For tables resizing from capacity 3 to 7

1. Original order `012S`.
2. Old algorithm order: `2E01EEES`
3. New algorithm order: `E012EEES`

Adding the forth element at the beginning or at the end with ~50% probability will add enough randomization.

Benefits:
1. Simplify and speed up `GrowIntoSingleGroupShuffleControlBytes`. On ARM we just have two store operations.
2. `GrowIntoSingleGroupShuffleTransferableSlots` would become a single memcpy (instead of 2) that is faster.
3. `GrowSizeIntoSingleGroup` for non transferable objects is a bit simpler and moving data in order.

On x86.
```
name                                                                           old cpu/op   new cpu/op   delta
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:1         20.1ns ± 2%  20.2ns ± 1%   +0.30%  (p=0.002 n=63+67)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:2         23.2ns ± 1%  23.3ns ± 1%   +0.43%  (p=0.000 n=71+61)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:4         23.3ns ± 2%  22.5ns ± 1%   -3.37%  (p=0.000 n=68+63)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:8         20.5ns ± 2%  19.5ns ± 1%   -4.79%  (p=0.000 n=66+67)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:16        20.7ns ± 2%  20.3ns ± 2%   -1.60%  (p=0.000 n=70+67)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:32        19.1ns ± 2%  19.5ns ± 3%   +1.86%  (p=0.000 n=69+76)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:1        44.0ns ± 2%  44.0ns ± 1%     ~     (p=0.673 n=67+63)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:2        39.6ns ± 2%  37.6ns ± 1%   -5.01%  (p=0.000 n=65+62)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:4        32.4ns ± 1%  30.3ns ± 1%   -6.56%  (p=0.000 n=64+62)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:8        26.1ns ± 1%  24.3ns ± 2%   -6.91%  (p=0.000 n=67+63)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:16       24.0ns ± 2%  23.4ns ± 2%   -2.56%  (p=0.000 n=64+64)
```

On ARM:
```
name                                                                           old cpu/op   new cpu/op   delta
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:1         18.3ns ± 1%  18.4ns ± 1%    ~     (p=0.059 n=76+77)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:2         24.7ns ± 0%  24.6ns ± 0%    ~     (p=0.719 n=65+59)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:4         23.2ns ± 2%  22.6ns ± 0%  -2.81%  (p=0.000 n=71+51)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:8         21.7ns ± 2%  21.4ns ± 1%  -1.36%  (p=0.000 n=76+50)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 4>/set_size:16        19.9ns ± 2%  19.7ns ± 1%  -1.03%  (p=0.000 n=75+65)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:1        45.5ns ± 1%  45.6ns ± 1%  +0.36%  (p=0.000 n=58+59)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:2        39.3ns ± 1%  38.3ns ± 1%  -2.54%  (p=0.000 n=63+60)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:4        31.6ns ± 2%  30.0ns ± 2%  -4.99%  (p=0.000 n=76+73)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:8        26.4ns ± 2%  25.6ns ± 2%  -3.00%  (p=0.000 n=73+66)
BM_SWISSMAP_InsertManyToEmpty_Hot<::absl::flat_hash_set, 64>/set_size:16       23.1ns ± 2%  22.7ns ± 1%  -1.92%  (p=0.000 n=67+64)
```

PiperOrigin-RevId: 702065961
Change-Id: I725475815e017a29384a733f73717050c3aad5b8
diff --git a/absl/container/internal/raw_hash_set.cc b/absl/container/internal/raw_hash_set.cc
index e8a5432..30307b6 100644
--- a/absl/container/internal/raw_hash_set.cc
+++ b/absl/container/internal/raw_hash_set.cc
@@ -346,124 +346,124 @@
     ctrl_t* __restrict new_ctrl, size_t new_capacity) const {
   assert(is_single_group(new_capacity));
   constexpr size_t kHalfWidth = Group::kWidth / 2;
-  constexpr size_t kQuarterWidth = Group::kWidth / 4;
-  assert(old_capacity_ < kHalfWidth);
-  static_assert(sizeof(uint64_t) >= kHalfWidth,
-                "Group size is too large. The ctrl bytes for half a group must "
-                "fit into a uint64_t for this implementation.");
-  static_assert(sizeof(uint64_t) <= Group::kWidth,
-                "Group size is too small. The ctrl bytes for a group must "
-                "cover a uint64_t for this implementation.");
+  ABSL_ASSUME(old_capacity_ < kHalfWidth);
+  ABSL_ASSUME(old_capacity_ > 0);
+  static_assert(Group::kWidth == 8 || Group::kWidth == 16,
+                "Group size is not supported.");
 
-  const size_t half_old_capacity = old_capacity_ / 2;
-
-  // NOTE: operations are done with compile time known size = kHalfWidth.
+  // NOTE: operations are done with compile time known size = 8.
   // Compiler optimizes that into single ASM operation.
 
-  // Load the bytes from half_old_capacity + 1. This contains the last half of
-  // old_ctrl bytes, followed by the sentinel byte, and then the first half of
-  // the cloned bytes. This effectively shuffles the control bytes.
-  uint64_t copied_bytes = 0;
-  copied_bytes =
-      absl::little_endian::Load64(old_ctrl() + half_old_capacity + 1);
+  // Load the bytes from old_capacity. This contains
+  // - the sentinel byte
+  // - all the old control bytes
+  // - the rest is filled with kEmpty bytes
+  // Example:
+  // old_ctrl =     012S012EEEEEEEEE...
+  // copied_bytes = S012EEEE
+  uint64_t copied_bytes =
+      absl::little_endian::Load64(old_ctrl() + old_capacity_);
 
   // We change the sentinel byte to kEmpty before storing to both the start of
   // the new_ctrl, and past the end of the new_ctrl later for the new cloned
   // bytes. Note that this is faster than setting the sentinel byte to kEmpty
   // after the copy directly in new_ctrl because we are limited on store
   // bandwidth.
-  constexpr uint64_t kEmptyXorSentinel =
+  static constexpr uint64_t kEmptyXorSentinel =
       static_cast<uint8_t>(ctrl_t::kEmpty) ^
       static_cast<uint8_t>(ctrl_t::kSentinel);
-  const uint64_t mask_convert_old_sentinel_to_empty =
-      kEmptyXorSentinel << (half_old_capacity * 8);
-  copied_bytes ^= mask_convert_old_sentinel_to_empty;
 
-  // Copy second half of bytes to the beginning. This correctly sets the bytes
-  // [0, old_capacity]. We potentially copy more bytes in order to have compile
-  // time known size. Mirrored bytes from the old_ctrl() will also be copied. In
-  // case of old_capacity_ == 3, we will copy 1st element twice.
-  // Examples:
-  // (old capacity = 1)
-  // old_ctrl = 0S0EEEEEEE...
-  // new_ctrl = E0EEEEEE??...
-  //
-  // (old capacity = 3)
-  // old_ctrl = 012S012EEEEE...
-  // new_ctrl = 12E012EE????...
-  //
-  // (old capacity = 7)
-  // old_ctrl = 0123456S0123456EE...
-  // new_ctrl = 456E0123?????????...
-  absl::little_endian::Store64(new_ctrl, copied_bytes);
+  // Replace the first byte kSentinel with kEmpty.
+  // Resulting bytes will be shifted by one byte old control blocks.
+  // Example:
+  // old_ctrl = 012S012EEEEEEEEE...
+  // before =   S012EEEE
+  // after  =   E012EEEE
+  copied_bytes ^= kEmptyXorSentinel;
 
-  // Set the space [old_capacity + 1, new_capacity] to empty as these bytes will
-  // not be written again. This is safe because
-  // NumControlBytes = new_capacity + kWidth and new_capacity >=
-  // old_capacity+1.
-  // Examples:
-  // (old_capacity = 3, new_capacity = 15)
-  // new_ctrl  = 12E012EE?????????????...??
-  // *new_ctrl = 12E0EEEEEEEEEEEEEEEE?...??
-  // position        /          S
-  //
-  // (old_capacity = 7, new_capacity = 15)
-  // new_ctrl  = 456E0123?????????????????...??
-  // *new_ctrl = 456E0123EEEEEEEEEEEEEEEE?...??
-  // position            /      S
-  std::memset(new_ctrl + old_capacity_ + 1, static_cast<int8_t>(ctrl_t::kEmpty),
-              Group::kWidth);
+  if (Group::kWidth == 8) {
+    // With group size 8, we can grow with two write operations.
+    assert(old_capacity_ < 8 && "old_capacity_ is too large for group size 8");
+    absl::little_endian::Store64(new_ctrl, copied_bytes);
 
-  // Set the last kHalfWidth bytes to empty, to ensure the bytes all the way to
-  // the end are initialized.
-  // Examples:
-  // new_ctrl  = 12E0EEEEEEEEEEEEEEEE?...???????
-  // *new_ctrl = 12E0EEEEEEEEEEEEEEEE???EEEEEEEE
-  // position                   S       /
-  //
-  // new_ctrl  = 456E0123EEEEEEEEEEEEEEEE???????
-  // *new_ctrl = 456E0123EEEEEEEEEEEEEEEEEEEEEEE
-  // position                   S       /
-  std::memset(new_ctrl + NumControlBytes(new_capacity) - kHalfWidth,
+    static constexpr uint64_t kSentinal64 =
+        static_cast<uint8_t>(ctrl_t::kSentinel);
+
+    // Prepend kSentinel byte to the beginning of copied_bytes.
+    // We have maximum 3 non-empty bytes at the beginning of copied_bytes for
+    // group size 8.
+    // Example:
+    // old_ctrl = 012S012EEEE
+    // before =   E012EEEE
+    // after  =   SE012EEE
+    copied_bytes = (copied_bytes << 8) ^ kSentinal64;
+    absl::little_endian::Store64(new_ctrl + new_capacity, copied_bytes);
+    // Example for capacity 3:
+    // old_ctrl = 012S012EEEE
+    // After the first store:
+    //           >!
+    // new_ctrl = E012EEEE???????
+    // After the second store:
+    //                  >!
+    // new_ctrl = E012EEESE012EEE
+    return;
+  }
+
+  assert(Group::kWidth == 16);
+
+  // Fill the second half of the main control bytes with kEmpty.
+  // For small capacity that may write into mirrored control bytes.
+  // It is fine as we will overwrite all the bytes later.
+  std::memset(new_ctrl + kHalfWidth, static_cast<int8_t>(ctrl_t::kEmpty),
+              kHalfWidth);
+  // Fill the second half of the mirrored control bytes with kEmpty.
+  std::memset(new_ctrl + new_capacity + kHalfWidth,
               static_cast<int8_t>(ctrl_t::kEmpty), kHalfWidth);
-
-  // Copy the first bytes to the end (starting at new_capacity +1) to set the
-  // cloned bytes. Note that we use the already copied bytes from old_ctrl here
-  // rather than copying from new_ctrl to avoid a Read-after-Write hazard, since
-  // new_ctrl was just written to. The first old_capacity-1 bytes are set
-  // correctly. Then there may be up to old_capacity bytes that need to be
-  // overwritten, and any remaining bytes will be correctly set to empty. This
-  // sets [new_capacity + 1, new_capacity +1 + old_capacity] correctly.
-  // Examples:
-  // new_ctrl  = 12E0EEEEEEEEEEEEEEEE?...???????
-  // *new_ctrl = 12E0EEEEEEEEEEEE12E012EEEEEEEEE
-  // position                   S/
-  //
-  // new_ctrl  = 456E0123EEEEEEEE?...???EEEEEEEE
-  // *new_ctrl = 456E0123EEEEEEEE456E0123EEEEEEE
-  // position                   S/
+  // Copy the first half of the non-mirrored control bytes.
+  absl::little_endian::Store64(new_ctrl, copied_bytes);
+  new_ctrl[new_capacity] = ctrl_t::kSentinel;
+  // Copy the first half of the mirrored control bytes.
   absl::little_endian::Store64(new_ctrl + new_capacity + 1, copied_bytes);
 
-  // Set The remaining bytes at the end past the cloned bytes to empty. The
-  // incorrectly set bytes are [new_capacity + old_capacity + 2,
-  // min(new_capacity + 1 + kHalfWidth, new_capacity + old_capacity + 2 +
-  // half_old_capacity)]. Taking the difference, we need to set min(kHalfWidth -
-  // (old_capacity + 1), half_old_capacity)]. Since old_capacity < kHalfWidth,
-  // half_old_capacity < kQuarterWidth, so we set kQuarterWidth beginning at
-  // new_capacity + old_capacity + 2 to kEmpty.
-  // Examples:
-  // new_ctrl  = 12E0EEEEEEEEEEEE12E012EEEEEEEEE
-  // *new_ctrl = 12E0EEEEEEEEEEEE12E0EEEEEEEEEEE
-  // position                   S    /
-  //
-  // new_ctrl  = 456E0123EEEEEEEE456E0123EEEEEEE
-  // *new_ctrl = 456E0123EEEEEEEE456E0123EEEEEEE (no change)
-  // position                   S        /
-  std::memset(new_ctrl + new_capacity + old_capacity_ + 2,
-              static_cast<int8_t>(ctrl_t::kEmpty), kQuarterWidth);
+  // Example for growth capacity 1->3:
+  // old_ctrl =                  0S0EEEEEEEEEEEEEE
+  // new_ctrl at the end =       E0ESE0EEEEEEEEEEEEE
+  //                                    >!
+  // new_ctrl after 1st memset = ????????EEEEEEEE???
+  //                                       >!
+  // new_ctrl after 2nd memset = ????????EEEEEEEEEEE
+  //                            >!
+  // new_ctrl after 1st store =  E0EEEEEEEEEEEEEEEEE
+  // new_ctrl after kSentinel =  E0ESEEEEEEEEEEEEEEE
+  //                                >!
+  // new_ctrl after 2nd store =  E0ESE0EEEEEEEEEEEEE
 
-  // Finally, we set the new sentinel byte.
-  new_ctrl[new_capacity] = ctrl_t::kSentinel;
+  // Example for growth capacity 3->7:
+  // old_ctrl =                  012S012EEEEEEEEEEEE
+  // new_ctrl at the end =       E012EEESE012EEEEEEEEEEE
+  //                                    >!
+  // new_ctrl after 1st memset = ????????EEEEEEEE???????
+  //                                           >!
+  // new_ctrl after 2nd memset = ????????EEEEEEEEEEEEEEE
+  //                            >!
+  // new_ctrl after 1st store =  E012EEEEEEEEEEEEEEEEEEE
+  // new_ctrl after kSentinel =  E012EEESEEEEEEEEEEEEEEE
+  //                                >!
+  // new_ctrl after 2nd store =  E012EEESE012EEEEEEEEEEE
+
+
+  // Example for growth capacity 7->15:
+  // old_ctrl =                  0123456S0123456EEEEEEEE
+  // new_ctrl at the end =       E0123456EEEEEEESE0123456EEEEEEE
+  //                                    >!
+  // new_ctrl after 1st memset = ????????EEEEEEEE???????????????
+  //                                                   >!
+  // new_ctrl after 2nd memset = ????????EEEEEEEE???????EEEEEEEE
+  //                            >!
+  // new_ctrl after 1st store =  E0123456EEEEEEEE???????EEEEEEEE
+  // new_ctrl after kSentinel =  E0123456EEEEEEES???????EEEEEEEE
+  //                                            >!
+  // new_ctrl after 2nd store =  E0123456EEEEEEESE0123456EEEEEEE
 }
 
 void HashSetResizeHelper::InitControlBytesAfterSoo(ctrl_t* new_ctrl, ctrl_t h2,
@@ -480,15 +480,10 @@
 
 void HashSetResizeHelper::GrowIntoSingleGroupShuffleTransferableSlots(
     void* new_slots, size_t slot_size) const {
-  assert(old_capacity_ > 0);
-  const size_t half_old_capacity = old_capacity_ / 2;
-
+  ABSL_ASSUME(old_capacity_ > 0);
   SanitizerUnpoisonMemoryRegion(old_slots(), slot_size * old_capacity_);
-  std::memcpy(new_slots,
-              SlotAddress(old_slots(), half_old_capacity + 1, slot_size),
-              slot_size * half_old_capacity);
-  std::memcpy(SlotAddress(new_slots, half_old_capacity + 1, slot_size),
-              old_slots(), slot_size * (half_old_capacity + 1));
+  std::memcpy(SlotAddress(new_slots, 1, slot_size), old_slots(),
+              slot_size * old_capacity_);
 }
 
 void HashSetResizeHelper::GrowSizeIntoSingleGroupTransferable(
diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
index 76e12d7..a6f2658 100644
--- a/absl/container/internal/raw_hash_set.h
+++ b/absl/container/internal/raw_hash_set.h
@@ -2014,14 +2014,14 @@
     }
     // Find a location for the new element non-deterministically.
     // Note that any position is correct.
-    // It will located at `half_old_capacity` or one of the other
+    // It will be located at `0` or one of the other
     // empty slots with approximately 50% probability each.
     size_t offset = probe(c, hash).offset();
 
     // Note that we intentionally use unsigned int underflow.
     if (offset - (old_capacity + 1) >= old_capacity) {
       // Offset fall on kSentinel or into the mostly occupied first half.
-      offset = old_capacity / 2;
+      offset = 0;
     }
     ABSL_SWISSTABLE_ASSERT(IsEmpty(c.control()[offset]));
     return FindInfo{offset, 0};
@@ -2148,16 +2148,14 @@
     using slot_type = typename PolicyTraits::slot_type;
     ABSL_SWISSTABLE_ASSERT(is_single_group(c.capacity()));
 
-    auto* new_slots = static_cast<slot_type*>(c.slot_array());
+    auto* new_slots = static_cast<slot_type*>(c.slot_array()) + 1;
     auto* old_slots_ptr = static_cast<slot_type*>(old_slots());
+    auto* old_ctrl_ptr = old_ctrl();
 
-    size_t shuffle_bit = old_capacity_ / 2 + 1;
-    for (size_t i = 0; i < old_capacity_; ++i) {
-      if (IsFull(old_ctrl()[i])) {
-        size_t new_i = i ^ shuffle_bit;
-        SanitizerUnpoisonMemoryRegion(new_slots + new_i, sizeof(slot_type));
-        PolicyTraits::transfer(&alloc_ref, new_slots + new_i,
-                               old_slots_ptr + i);
+    for (size_t i = 0; i < old_capacity_; ++i, ++new_slots) {
+      if (IsFull(old_ctrl_ptr[i])) {
+        SanitizerUnpoisonMemoryRegion(new_slots, sizeof(slot_type));
+        PolicyTraits::transfer(&alloc_ref, new_slots, old_slots_ptr + i);
       }
     }
     PoisonSingleGroupEmptySlots(c, sizeof(slot_type));
@@ -2199,27 +2197,25 @@
   // 1. new_ctrl is allocated for new_capacity,
   //    but not initialized.
   // 2. new_capacity is a single group.
+  // 3. old_capacity > 0.
   //
   // All elements are transferred into the first `old_capacity + 1` positions
-  // of the new_ctrl. Elements are rotated by `old_capacity_ / 2 + 1` positions
-  // in order to change an order and keep it non deterministic.
-  // Although rotation itself deterministic, position of the new added element
-  // will be based on `H1` and is not deterministic.
+  // of the new_ctrl. Elements are shifted by 1 in order to keep a space at the
+  // beginning for the new element.
+  // Position of the new added element will be based on `H1` and is not
+  // deterministic.
   //
   // Examples:
   // S = kSentinel, E = kEmpty
   //
-  // old_ctrl = SEEEEEEEE...
-  // new_ctrl = ESEEEEEEE...
-  //
   // old_ctrl = 0SEEEEEEE...
   // new_ctrl = E0ESE0EEE...
   //
   // old_ctrl = 012S012EEEEEEEEE...
-  // new_ctrl = 2E01EEES2E01EEE...
+  // new_ctrl = E012EEESE012EEE...
   //
   // old_ctrl = 0123456S0123456EEEEEEEEEEE...
-  // new_ctrl = 456E0123EEEEEES456E0123EEE...
+  // new_ctrl = E0123456EEEEEESE0123456EEE...
   void GrowIntoSingleGroupShuffleControlBytes(ctrl_t* new_ctrl,
                                               size_t new_capacity) const;