| author | Abseil Team <absl-team@google.com> | 2024-02-01 11:43:33 -0800 |
|---|---|---|
| committer | Copybara-Service <copybara-worker@google.com> | 2024-02-01 11:44:43 -0800 |
| commit | 7339447a7f457f1d8efa6322c971e71afc304d32 (patch) | |
| tree | 8e7116a29f274315e0257d33e79720a29363165a /absl | |
| parent | a3ee6ce2e6eee06941a1e9479720cecbd898f4ca (diff) | |
Optimize raw_hash_set destructor.
There are three optimizations here:
1. Early exit in case all slots were already destroyed; this is especially useful for empty tables.
2. `MaskFull` is used in order to iterate over all full slots (see the sketch below).
3. The portable group is used for `MaskFull` in the case of small tables.
PiperOrigin-RevId: 603434899
Change-Id: I40bc90d17331d579cfbb1b4e0693f0913e5c38e4
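As a rough illustration of optimization 2, here is a standalone sketch (not Abseil's code; `kEmpty`, `kMsbs`, and `VisitFullGroup` are made-up names for this example) of how a 64-bit mask over eight control bytes lets a caller visit only the full slots of a group: full control bytes have their high bit clear, so one XOR plus one AND produces a bitmask whose set bits point at the full slots.

```cpp
// Sketch of visiting only the full slots in one 8-byte control group, in the
// spirit of GroupPortableImpl::MaskFull. Assumes little-endian byte order.
#include <bit>
#include <cstdint>
#include <cstdio>
#include <cstring>

constexpr uint8_t kEmpty = 0x80;                   // high bit set => not full
constexpr uint64_t kMsbs = 0x8080808080808080ULL;  // MSB of every byte

// Calls `cb(i)` for each index i in [0, 8) whose control byte marks a full slot.
template <class Callback>
void VisitFullGroup(const uint8_t* ctrl, Callback cb) {
  uint64_t group;
  std::memcpy(&group, ctrl, sizeof(group));
  // Full bytes have MSB 0, so XOR with the MSB pattern sets their MSB;
  // the AND keeps only those marker bits.
  uint64_t full = (group ^ kMsbs) & kMsbs;
  while (full != 0) {
    cb(std::countr_zero(full) >> 3);  // bit position / 8 == slot index
    full &= full - 1;                 // clear the lowest set bit
  }
}

int main() {
  // Slots 1, 2, and 5 hold 7-bit hashes (< 0x80); the rest are empty.
  uint8_t ctrl[8] = {kEmpty, 0x12, 0x34, kEmpty, kEmpty, 0x56, kEmpty, kEmpty};
  VisitFullGroup(ctrl, [](int i) { std::printf("full slot %d\n", i); });
}
```

In the diff itself, `MaskFull()` returns an iterable bitmask that a range-for loops over (and the SSE2 group computes the equivalent mask with vector instructions); the sketch replaces that with a callback only to stay short.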
Diffstat (limited to 'absl')
| mode | file | changes |
|---|---|---|
| -rw-r--r-- | absl/container/internal/raw_hash_set.h | 70 |
| -rw-r--r-- | absl/container/internal/raw_hash_set_test.cc | 35 |

2 files changed, 87 insertions, 18 deletions
diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
index 3b793825..0bb77bda 100644
--- a/absl/container/internal/raw_hash_set.h
+++ b/absl/container/internal/raw_hash_set.h
@@ -822,21 +822,21 @@ struct GroupPortableImpl {
 
 #ifdef ABSL_INTERNAL_HAVE_SSE2
 using Group = GroupSse2Impl;
-using GroupEmptyOrDeleted = GroupSse2Impl;
+using GroupFullEmptyOrDeleted = GroupSse2Impl;
 #elif defined(ABSL_INTERNAL_HAVE_ARM_NEON) && defined(ABSL_IS_LITTLE_ENDIAN)
 using Group = GroupAArch64Impl;
 // For Aarch64, we use the portable implementation for counting and masking
-// empty or deleted group elements. This is to avoid the latency of moving
+// full, empty or deleted group elements. This is to avoid the latency of moving
 // between data GPRs and Neon registers when it does not provide a benefit.
 // Using Neon is profitable when we call Match(), but is not when we don't,
-// which is the case when we do *EmptyOrDeleted operations. It is difficult to
-// make a similar approach beneficial on other architectures such as x86 since
-// they have much lower GPR <-> vector register transfer latency and 16-wide
-// Groups.
-using GroupEmptyOrDeleted = GroupPortableImpl;
+// which is the case when we do *EmptyOrDeleted and MaskFull operations.
+// It is difficult to make a similar approach beneficial on other architectures
+// such as x86 since they have much lower GPR <-> vector register transfer
+// latency and 16-wide Groups.
+using GroupFullEmptyOrDeleted = GroupPortableImpl;
 #else
 using Group = GroupPortableImpl;
-using GroupEmptyOrDeleted = GroupPortableImpl;
+using GroupFullEmptyOrDeleted = GroupPortableImpl;
 #endif
 
 // When there is an insertion with no reserved growth, we rehash with
@@ -1463,7 +1463,7 @@ inline FindInfo find_first_non_full(const CommonFields& common, size_t hash) {
   auto seq = probe(common, hash);
   const ctrl_t* ctrl = common.control();
   while (true) {
-    GroupEmptyOrDeleted g{ctrl + seq.offset()};
+    GroupFullEmptyOrDeleted g{ctrl + seq.offset()};
     auto mask = g.MaskEmptyOrDeleted();
     if (mask) {
 #if !defined(NDEBUG)
@@ -1545,6 +1545,44 @@ inline void* SlotAddress(void* slot_array, size_t slot, size_t slot_size) {
                                  (slot * slot_size));
 }
 
+// Iterates over all full slots and calls `cb(SlotType*)`.
+// NOTE: no erasure from this table allowed during Callback call.
+template <class SlotType, class Callback>
+ABSL_ATTRIBUTE_ALWAYS_INLINE inline void IterateOverFullSlots(
+    const CommonFields& c, SlotType* slot, Callback cb) {
+  const size_t cap = c.capacity();
+  const ctrl_t* ctrl = c.control();
+  if (is_small(cap)) {
+    // Mirrored/cloned control bytes in small table are also located in the
+    // first group (starting from position 0). We are taking group from
+    // position `capacity` in order to avoid duplicates.
+
+    // Small tables capacity fits into portable group, where
+    // GroupPortableImpl::MaskFull is more efficient for the
+    // capacity <= GroupPortableImpl::kWidth.
+    assert(cap <= GroupPortableImpl::kWidth &&
+           "unexpectedly large small capacity");
+    static_assert(Group::kWidth >= GroupPortableImpl::kWidth,
+                  "unexpected group width");
+    // Group starts from kSentinel slot, so indices in the mask will
+    // be increased by 1.
+    --slot;
+    for (uint32_t i : GroupPortableImpl(ctrl + cap).MaskFull()) {
+      cb(slot + i);
+    }
+    return;
+  }
+  size_t remaining = c.size();
+  while (remaining != 0) {
+    for (uint32_t i : GroupFullEmptyOrDeleted(ctrl).MaskFull()) {
+      cb(slot + i);
+      --remaining;
+    }
+    slot += Group::kWidth;
+    ctrl += Group::kWidth;
+  }
+}
+
 // Helper class to perform resize of the hash set.
 //
 // It contains special optimizations for small group resizes.
@@ -2028,7 +2066,7 @@ class raw_hash_set {
     void skip_empty_or_deleted() {
       while (IsEmptyOrDeleted(*ctrl_)) {
         uint32_t shift =
-            GroupEmptyOrDeleted{ctrl_}.CountLeadingEmptyOrDeleted();
+            GroupFullEmptyOrDeleted{ctrl_}.CountLeadingEmptyOrDeleted();
         ctrl_ += shift;
         slot_ += shift;
       }
@@ -2889,14 +2927,10 @@ class raw_hash_set {
 
   inline void destroy_slots() {
     if (PolicyTraits::template destroy_is_trivial<Alloc>()) return;
-    const size_t cap = capacity();
-    const ctrl_t* ctrl = control();
-    slot_type* slot = slot_array();
-    for (size_t i = 0; i != cap; ++i) {
-      if (IsFull(ctrl[i])) {
-        destroy(slot + i);
-      }
-    }
+    IterateOverFullSlots(common(), slot_array(),
+                         [&](slot_type* slot) ABSL_ATTRIBUTE_ALWAYS_INLINE {
+                           this->destroy(slot);
+                         });
   }
 
   inline void dealloc() {
diff --git a/absl/container/internal/raw_hash_set_test.cc b/absl/container/internal/raw_hash_set_test.cc
index 7ec72b22..5852904f 100644
--- a/absl/container/internal/raw_hash_set_test.cc
+++ b/absl/container/internal/raw_hash_set_test.cc
@@ -64,6 +64,10 @@ namespace container_internal {
 
 struct RawHashSetTestOnlyAccess {
   template <typename C>
+  static auto GetCommon(const C& c) -> decltype(c.common()) {
+    return c.common();
+  }
+  template <typename C>
   static auto GetSlots(const C& c) -> decltype(c.slot_array()) {
     return c.slot_array();
   }
@@ -2741,6 +2745,37 @@ TEST(Table, CountedHash) {
   }
 }
 
+TEST(Table, IterateOverFullSlotsEmpty) {
+  IntTable t;
+  auto fail_if_any = [](int64_t* i) { FAIL() << "expected no slots " << i; };
+  container_internal::IterateOverFullSlots(
+      RawHashSetTestOnlyAccess::GetCommon(t),
+      RawHashSetTestOnlyAccess::GetSlots(t), fail_if_any);
+  for (size_t i = 0; i < 256; ++i) {
+    t.reserve(i);
+    container_internal::IterateOverFullSlots(
+        RawHashSetTestOnlyAccess::GetCommon(t),
+        RawHashSetTestOnlyAccess::GetSlots(t), fail_if_any);
+  }
+}
+
+TEST(Table, IterateOverFullSlotsFull) {
+  IntTable t;
+
+  std::vector<int64_t> expected_slots;
+  for (int64_t i = 0; i < 128; ++i) {
+    t.insert(i);
+    expected_slots.push_back(i);
+
+    std::vector<int64_t> slots;
+    container_internal::IterateOverFullSlots(
+        RawHashSetTestOnlyAccess::GetCommon(t),
+        RawHashSetTestOnlyAccess::GetSlots(t),
+        [&slots](int64_t* i) { slots.push_back(*i); });
+    EXPECT_THAT(slots, testing::UnorderedElementsAreArray(expected_slots));
+  }
+}
+
 }  // namespace
 }  // namespace container_internal
 ABSL_NAMESPACE_END
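For readers skimming the destructor change above, the following is a standalone analogue (not Abseil's code; `kEmpty`, `kMsbs`, `kWidth`, and `ForEachFullSlot` are illustrative names) of the non-small path of `IterateOverFullSlots`: the control bytes are walked one 8-byte group at a time and the loop stops as soon as `remaining` reaches zero, so an empty table does no per-slot work at all (optimization 1).

```cpp
// Sketch of group-at-a-time traversal with an early exit once all elements
// have been visited. Assumes little-endian byte order.
#include <bit>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

constexpr uint8_t kEmpty = 0x80;
constexpr uint64_t kMsbs = 0x8080808080808080ULL;
constexpr size_t kWidth = 8;

// Precondition: `size` equals the number of full control bytes in `ctrl`,
// and the array length is a multiple of kWidth.
template <class Callback>
void ForEachFullSlot(const uint8_t* ctrl, size_t size, Callback cb) {
  size_t remaining = size;  // early exit: zero for an empty table
  for (size_t base = 0; remaining != 0; base += kWidth) {
    uint64_t group;
    std::memcpy(&group, ctrl + base, sizeof(group));
    uint64_t full = (group ^ kMsbs) & kMsbs;  // MSB set for each full byte
    for (; full != 0; full &= full - 1) {
      cb(base + (std::countr_zero(full) >> 3));
      --remaining;
    }
  }
}

int main() {
  // 16 control bytes; slots 2, 5, and 11 are full.
  std::vector<uint8_t> ctrl(16, kEmpty);
  ctrl[2] = 0x01;
  ctrl[5] = 0x42;
  ctrl[11] = 0x7f;
  ForEachFullSlot(ctrl.data(), /*size=*/3,
                  [](size_t i) { std::printf("full slot %zu\n", i); });
}
```

The real `IterateOverFullSlots` additionally special-cases small tables by masking a single portable group taken at offset `capacity` (optimization 3), which this sketch omits.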