From 591a2cdacb728f7c4f29782c31d7b694bb3a0782 Mon Sep 17 00:00:00 2001 From: Abseil Team <absl-team@google.com> Date: Thu, 26 May 2022 08:32:42 -0700 Subject: Optimize SwissMap iteration for aarch64 by 5-6% Benchmarks: https://pastebin.com/tZ7dr67W. Works well especially on smaller ranges. After spending a week optimizing NEON SIMD, where I almost managed to make hash tables work with NEON SIMD without performance hits (still 1 cycle to optimize, and I gave up a little), I found an interesting optimization for aarch64: using the cls instruction (count leading sign bits). The loop has a property that ctrl_ group is not matched against count when the first slot is empty or deleted. ``` void skip_empty_or_deleted() { while (IsEmptyOrDeleted(*ctrl_)) { uint32_t shift = Group{ctrl_}.CountLeadingEmptyOrDeleted(); ctrl_ += shift; slot_ += shift; } ... } ``` However, `kEmpty` and `kDeleted` have the format `1xxxxxx0`, and `~ctrl & (ctrl >> 7)` always sets the lowest bit to 1. In a naive implementation, it does +1 to start counting zero bits; however, on aarch64 we may start counting one bits immediately. This saves 1 cycle and 5% of iteration performance. Then it becomes hard to find a supported and sustainable C++ version of it. `__clsll` is not supported by GCC and has been supported only since clang 8; `__builtin_clrsb` is not producing optimal codegen for clang. `__rbit` is not supported by GCC and there is no intrinsic to do that; however, in clang we have `__builtin_bitreverse{32,64}`. For now I decided to enable this only for clang, and only if it has the appropriate builtins. 
PiperOrigin-RevId: 451168570 Change-Id: I7e9256a60aecdc88ced4e6eb15ebc257281b6664 --- absl/base/config.h | 15 ++++++++++++ absl/container/internal/raw_hash_set.h | 36 ++++++++++++++++++++++++++++ absl/container/internal/raw_hash_set_test.cc | 6 ++++- 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/absl/base/config.h b/absl/base/config.h index a0d599fe..5985358f 100644 --- a/absl/base/config.h +++ b/absl/base/config.h @@ -882,4 +882,19 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || #define ABSL_INTERNAL_HAVE_SSSE3 1 #endif +// ABSL_INTERNAL_HAVE_ARM_ACLE is used for compile-time detection of ACLE (ARM +// C language extensions). +#ifdef ABSL_INTERNAL_HAVE_ARM_ACLE +#error ABSL_INTERNAL_HAVE_ARM_ACLE cannot be directly set +// __cls, __rbit were added quite late in clang. They are not supported +// by GCC as well. __cls can be replaced with __builtin_clrsb but clang does +// not recognize cls instruction in latest versions. +// TODO(b/233604649): Relax to __builtin_clrsb and __builtin_bitreverse64 (note +// that the latter is not supported by GCC). 
+#elif defined(__ARM_ACLE) && defined(__clang__) && \ + ABSL_HAVE_BUILTIN(__builtin_arm_cls64) && \ + ABSL_HAVE_BUILTIN(__builtin_arm_rbit64) +#define ABSL_INTERNAL_HAVE_ARM_ACLE 1 +#endif + #endif // ABSL_BASE_CONFIG_H_ diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h index 769af50f..d503bc00 100644 --- a/absl/container/internal/raw_hash_set.h +++ b/absl/container/internal/raw_hash_set.h @@ -211,6 +211,10 @@ #include "absl/numeric/bits.h" #include "absl/utility/utility.h" +#ifdef ABSL_INTERNAL_HAVE_ARM_ACLE +#include <arm_acle.h> +#endif + namespace absl { ABSL_NAMESPACE_BEGIN namespace container_internal { @@ -627,8 +631,40 @@ struct GroupPortableImpl { uint64_t ctrl; }; +#ifdef ABSL_INTERNAL_HAVE_ARM_ACLE +struct GroupAArch64Impl : public GroupPortableImpl { + static constexpr size_t kWidth = GroupPortableImpl::kWidth; + + using GroupPortableImpl::GroupPortableImpl; + + uint32_t CountLeadingEmptyOrDeleted() const { + assert(IsEmptyOrDeleted(static_cast<ctrl_t>(ctrl & 0xff))); + constexpr uint64_t gaps = 0x00FEFEFEFEFEFEFEULL; + // cls: Count leading sign bits. + // clsll(1ull << 63) -> 0 + // clsll((1ull << 63) | (1ull << 62)) -> 1 + // clsll((1ull << 63) | (1ull << 61)) -> 0 + // clsll(~0ull) -> 63 + // clsll(1) -> 62 + // clsll(3) -> 61 + // clsll(5) -> 60 + // Note that CountLeadingEmptyOrDeleted is called when first control block + // is kDeleted or kEmpty. The implementation is similar to GroupPortableImpl + // but avoids +1 and __clsll returns result not including the high bit. Thus + // saves one cycle. + // kEmpty = -128, // 0b10000000 + // kDeleted = -2, // 0b11111110 + // ~ctrl & (ctrl >> 7) will have the lowest bit set to 1. After rbit, + // it will be the highest one. 
+ return (__clsll(__rbitll((~ctrl & (ctrl >> 7)) | gaps)) + 8) >> 3; + } +}; +#endif + #ifdef ABSL_INTERNAL_HAVE_SSE2 using Group = GroupSse2Impl; +#elif defined(ABSL_INTERNAL_HAVE_ARM_ACLE) +using Group = GroupAArch64Impl; #else using Group = GroupPortableImpl; #endif diff --git a/absl/container/internal/raw_hash_set_test.cc b/absl/container/internal/raw_hash_set_test.cc index 914ec0c9..c79f8641 100644 --- a/absl/container/internal/raw_hash_set_test.cc +++ b/absl/container/internal/raw_hash_set_test.cc @@ -262,16 +262,20 @@ TEST(Group, CountLeadingEmptyOrDeleted) { for (ctrl_t empty : empty_examples) { std::vector<ctrl_t> e(Group::kWidth, empty); + EXPECT_TRUE(IsEmptyOrDeleted(e[0])); EXPECT_EQ(Group::kWidth, Group{e.data()}.CountLeadingEmptyOrDeleted()); for (ctrl_t full : full_examples) { - for (size_t i = 0; i != Group::kWidth; ++i) { + // First is always kEmpty or kDeleted. + for (size_t i = 1; i != Group::kWidth; ++i) { std::vector<ctrl_t> f(Group::kWidth, empty); f[i] = full; + EXPECT_TRUE(IsEmptyOrDeleted(f[0])); EXPECT_EQ(i, Group{f.data()}.CountLeadingEmptyOrDeleted()); } std::vector<ctrl_t> f(Group::kWidth, empty); f[Group::kWidth * 2 / 3] = full; f[Group::kWidth / 2] = full; + EXPECT_TRUE(IsEmptyOrDeleted(f[0])); EXPECT_EQ( Group::kWidth / 2, Group{f.data()}.CountLeadingEmptyOrDeleted()); } -- cgit v1.2.3