From 7383f346c9e33a08ed2132f117b3de6b13eac173 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Thu, 9 Jun 2022 03:12:43 -0700 Subject: Optimize SwissMap iteration by another 5-10% for ARM https://pastebin.com/fDvgWgHe After having a chat with Dougall Johnson (https://twitter.com/dougallj/status/1534213050944802816), we realized that __clzll is well-defined for a zero argument per the documentation: https://developer.arm.com/documentation/101028/0009/Data-processing-intrinsics ``` Returns the number of leading zero bits in x. When x is zero it returns the argument width, i.e. 32 or 64. ``` Codegen improves https://godbolt.org/z/ebadf717Y Thus we can use a slightly different construction not involving CLS but using the more understandable CLZ and removing some operations. PiperOrigin-RevId: 453879080 Change-Id: Ie2d7f834f63364d7bd50dd6a682c107985f21942 --- absl/container/internal/raw_hash_set.h | 39 +++++++++------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) (limited to 'absl/container/internal/raw_hash_set.h') diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h index 2756ce1b..cd31b870 100644 --- a/absl/container/internal/raw_hash_set.h +++ b/absl/container/internal/raw_hash_set.h @@ -188,10 +188,6 @@ #include <arm_neon.h> #endif -#ifdef __ARM_ACLE -#include <arm_acle.h> -#endif - #include <algorithm> #include <cmath> #include <cstdint> @@ -634,29 +630,12 @@ struct GroupAArch64Impl { uint32_t CountLeadingEmptyOrDeleted() const { uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(ctrl), 0); - assert(IsEmptyOrDeleted(static_cast<ctrl_t>(mask & 0xff))); - constexpr uint64_t gaps = 0x00FEFEFEFEFEFEFEULL; -#if defined(ABSL_INTERNAL_HAVE_ARM_ACLE) - // cls: Count leading sign bits. - // clsll(1ull << 63) -> 0 - // clsll((1ull << 63) | (1ull << 62)) -> 1 - // clsll((1ull << 63) | (1ull << 61)) -> 0 - // clsll(~0ull) -> 63 - // clsll(1) -> 62 - // clsll(3) -> 61 - // clsll(5) -> 60 - // Note that CountLeadingEmptyOrDeleted is called when first control block - // is kDeleted or kEmpty.
The implementation is similar to GroupPortableImpl - // but avoids +1 and __clsll returns result not including the high bit. Thus - // saves one cycle. - // kEmpty = -128, // 0b10000000 - // kDeleted = -2, // 0b11111110 - // ~ctrl & (ctrl >> 7) will have the lowest bit set to 1. After rbit, - // it will the highest one. - return (__clsll(__rbitll((~mask & (mask >> 7)) | gaps)) + 8) >> 3; -#else - return (TrailingZeros(((~mask & (mask >> 7)) | gaps) + 1) + 7) >> 3; -#endif // ABSL_INTERNAL_HAVE_ARM_ACLE + // ctrl | ~(ctrl >> 7) will have the lowest bit set to zero for kEmpty and + // kDeleted. We lower all other bits and count number of trailing zeros. + // Clang and GCC optimize countr_zero to rbit+clz without any check for 0, + // so we should be fine. + constexpr uint64_t bits = 0x0101010101010101ULL; + return countr_zero((mask | ~(mask >> 7)) & bits) >> 3; } void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const { @@ -711,8 +690,10 @@ struct GroupPortableImpl { } uint32_t CountLeadingEmptyOrDeleted() const { - constexpr uint64_t gaps = 0x00FEFEFEFEFEFEFEULL; - return (TrailingZeros(((~ctrl & (ctrl >> 7)) | gaps) + 1) + 7) >> 3; + // ctrl | ~(ctrl >> 7) will have the lowest bit set to zero for kEmpty and + // kDeleted. We lower all other bits and count number of trailing zeros. + constexpr uint64_t bits = 0x0101010101010101ULL; + return countr_zero((ctrl | ~(ctrl >> 7)) & bits) >> 3; } void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const { -- cgit v1.2.3