summaryrefslogtreecommitdiff
path: root/absl/container/internal/raw_hash_set.h
diff options
context:
space:
mode:
authorGravatar Abseil Team <absl-team@google.com>2022-05-26 08:32:42 -0700
committerGravatar Copybara-Service <copybara-worker@google.com>2022-05-26 08:33:47 -0700
commit591a2cdacb728f7c4f29782c31d7b694bb3a0782 (patch)
tree8ee2578ab7b20ac855778565503be39ef68fddf7 /absl/container/internal/raw_hash_set.h
parent4bb6a0e045c211576c7066e7862ae212e8333639 (diff)
Optimize SwissMap iteration for aarch64 by 5-6%
Benchmarks: https://pastebin.com/tZ7dr67W. Works well especially on smaller ranges. After a week spent optimizing NEON SIMD where I almost managed to make hash tables work with NEON SIMD without performance hits (still 1 cycle to optimize and I gave up a little), I found an interesting optimization for aarch64 to use cls instruction (count leading sign bits). The loop has a property that ctrl_ group is not matched against count when the first slot is empty or deleted. ``` void skip_empty_or_deleted() { while (IsEmptyOrDeleted(*ctrl_)) { uint32_t shift = Group{ctrl_}.CountLeadingEmptyOrDeleted(); ctrl_ += shift; slot_ += shift; } ... } ``` However, `kEmpty` and `kDeleted` have format of `1xxxxxx0` and `~ctrl & (ctrl >> 7)` always sets the lowest bit to 1. In naive implementation, it does +1 to start counting zero bits, however, in aarch64 we may start counting one bits immediately. This saves 1 cycle and 5% of iteration performance. Then it becomes hard to find a supported and sustainable C++ version of it. `__clsll` is not supported by GCC and was supported only since clang 8, `__builtin_clrsb` is not producing optimal codegen for clang. `__rbit` is not supported by GCC and there is no intrinsic to do that, however, in clang we have `__builtin_bitreverse{32,64}`. For now I decided to enable this only for clang, only if they have appropriate builtins. PiperOrigin-RevId: 451168570 Change-Id: I7e9256a60aecdc88ced4e6eb15ebc257281b6664
Diffstat (limited to 'absl/container/internal/raw_hash_set.h')
-rw-r--r--absl/container/internal/raw_hash_set.h36
1 files changed, 36 insertions, 0 deletions
diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
index 769af50f..d503bc00 100644
--- a/absl/container/internal/raw_hash_set.h
+++ b/absl/container/internal/raw_hash_set.h
@@ -211,6 +211,10 @@
#include "absl/numeric/bits.h"
#include "absl/utility/utility.h"
+#ifdef ABSL_INTERNAL_HAVE_ARM_ACLE
+#include <arm_acle.h>
+#endif
+
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace container_internal {
@@ -627,8 +631,40 @@ struct GroupPortableImpl {
uint64_t ctrl;
};
+#ifdef ABSL_INTERNAL_HAVE_ARM_ACLE
+struct GroupAArch64Impl : public GroupPortableImpl {
+  static constexpr size_t kWidth = GroupPortableImpl::kWidth;
+
+  using GroupPortableImpl::GroupPortableImpl;
+
+  uint32_t CountLeadingEmptyOrDeleted() const {
+    assert(IsEmptyOrDeleted(static_cast<ctrl_t>(ctrl & 0xff)));
+    constexpr uint64_t gaps = 0x00FEFEFEFEFEFEFEULL;
+    // cls: Count leading sign bits.
+    // clsll(1ull << 63) -> 0
+    // clsll((1ull << 63) | (1ull << 62)) -> 1
+    // clsll((1ull << 63) | (1ull << 61)) -> 0
+    // clsll(~0ull) -> 63
+    // clsll(1) -> 62
+    // clsll(3) -> 61
+    // clsll(5) -> 60
+    // Note that CountLeadingEmptyOrDeleted is only called when the first
+    // control byte is kDeleted or kEmpty. The implementation mirrors
+    // GroupPortableImpl but avoids the +1: __clsll does not include the high
+    // bit in its count, which saves one cycle.
+    // kEmpty = -128, // 0b10000000
+    // kDeleted = -2, // 0b11111110
+    // ~ctrl & (ctrl >> 7) will have the lowest bit set to 1. After rbit,
+    // it will be the highest one.
+    return (__clsll(__rbitll((~ctrl & (ctrl >> 7)) | gaps)) + 8) >> 3;
+  }
+};
+#endif
+
#ifdef ABSL_INTERNAL_HAVE_SSE2
using Group = GroupSse2Impl;
+#elif defined(ABSL_INTERNAL_HAVE_ARM_ACLE)
+using Group = GroupAArch64Impl;
#else
using Group = GroupPortableImpl;
#endif