From cee0d5b4c38d6a3488e6dae2d7674ecbcff3f6c1 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Thu, 14 Dec 2023 12:11:16 -0800 Subject: Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower() PiperOrigin-RevId: 591015112 Change-Id: I3e654433f0b0a4ea02ee10e0894e70738e730782 --- absl/strings/ascii.cc | 62 +++++++++++++++++++++++++++++++++++++++-- absl/strings/ascii_benchmark.cc | 4 +-- 2 files changed, 61 insertions(+), 5 deletions(-) diff --git a/absl/strings/ascii.cc b/absl/strings/ascii.cc index df905e05..c696a5b1 100644 --- a/absl/strings/ascii.cc +++ b/absl/strings/ascii.cc @@ -15,8 +15,10 @@ #include "absl/strings/ascii.h" #include +#include #include #include +#include #include "absl/base/config.h" #include "absl/base/nullability.h" @@ -160,6 +162,19 @@ ABSL_DLL const char kToUpper[256] = { }; // clang-format on +template +static constexpr T BroadcastByte(unsigned char value) { + static_assert(std::is_integral::value && sizeof(T) <= sizeof(uint64_t) && + std::is_unsigned::value, + "only unsigned integers up to 64-bit allowed"); + T result = value; + constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT; + result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1)); + result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1)); + result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1)); + return result; +} + // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`). // Implemented by: // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26). @@ -175,8 +190,42 @@ constexpr bool AsciiInAZRange(unsigned char c) { } template -constexpr void AsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { +static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull p, + absl::Nonnull end) { + using vec_t = size_t; + const size_t n = static_cast(end - p); + + // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z'; + char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t); + while (p < swar_end) { + vec_t v = vec_t(); + + // memcpy the vector, but constexpr + for (size_t i = 0; i < sizeof(vec_t); ++i) { + v |= static_cast(static_cast(p[i])) + << (i * CHAR_BIT); + } + + constexpr unsigned int msb = 1u << 7; + vec_t a = v + ascii_internal::BroadcastByte(msb - ch_a - 0), + z = v + ascii_internal::BroadcastByte(msb - ch_z - 1); + v ^= ((a & ~z) & ascii_internal::BroadcastByte(msb)) >> 2; + + // memcpy the vector, but constexpr + for (size_t i = 0; i < sizeof(vec_t); ++i) { + p[i] = static_cast(v >> (i * CHAR_BIT)); + } + + p += sizeof(v); + } + + return p; +} + +template +static constexpr void AsciiStrCaseFold(absl::Nonnull p, + absl::Nonnull end) { // The upper- and lowercase versions of ASCII characters differ by only 1 bit. // When we need to flip the case, we can xor with this bit to achieve the // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We @@ -184,10 +233,17 @@ constexpr void AsciiStrCaseFold(absl::Nonnull p, // have the same single bit difference. constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A'; - for (; p < end; ++p) { + using vec_t = size_t; + // TODO(b/316380338): When FDO becomes able to vectorize these, + // revert this manual optimization and just leave the naive loop. + if (static_cast(end - p) >= sizeof(vec_t)) { + p = ascii_internal::PartialAsciiStrCaseFold(p, end); + } + while (p < end) { unsigned char v = static_cast(*p); v ^= AsciiInAZRange(v) ? kAsciiCaseBitFlip : 0; *p = static_cast(v); + ++p; } } diff --git a/absl/strings/ascii_benchmark.cc b/absl/strings/ascii_benchmark.cc index 3aecdb8b..4ae73174 100644 --- a/absl/strings/ascii_benchmark.cc +++ b/absl/strings/ascii_benchmark.cc @@ -113,7 +113,7 @@ static void BM_StrToLower(benchmark::State& state) { BENCHMARK(BM_StrToLower) ->DenseRange(0, 32) ->RangeMultiplier(2) - ->Range(64, 1 << 20); + ->Range(64, 1 << 26); static void BM_StrToUpper(benchmark::State& state) { const int size = state.range(0); @@ -127,6 +127,6 @@ static void BM_StrToUpper(benchmark::State& state) { BENCHMARK(BM_StrToUpper) ->DenseRange(0, 32) ->RangeMultiplier(2) - ->Range(64, 1 << 20); + ->Range(64, 1 << 26); } // namespace -- cgit v1.2.3