From 15d26cdd0179f83b328f3b62ae7d08388ffeef0b Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Fri, 19 May 2023 16:17:08 -0700 Subject: Optimize absl::StrCat for integers Previous implementation used a lookup table of 200 bytes which, if not in L1 cache, could cause memory stalls and consumes 3-4 cache lines. This is suboptimal in real world scenarious. We are going to use a purely scalar version of merging 2/4/8 bytes together. PiperOrigin-RevId: 533576619 Change-Id: Icd730d18536f7eb35979b62582f6edf86b786019 --- absl/strings/numbers.cc | 245 ++++++++++++++++++++++++++++-------------------- 1 file changed, 143 insertions(+), 102 deletions(-) diff --git a/absl/strings/numbers.cc b/absl/strings/numbers.cc index c2b861ae..8c05ff0e 100644 --- a/absl/strings/numbers.cc +++ b/absl/strings/numbers.cc @@ -32,6 +32,7 @@ #include "absl/base/attributes.h" #include "absl/base/internal/raw_logging.h" +#include "absl/base/optimization.h" #include "absl/numeric/bits.h" #include "absl/strings/ascii.h" #include "absl/strings/charconv.h" @@ -136,82 +137,123 @@ bool SimpleAtob(absl::string_view str, bool* out) { namespace { -// Used to optimize printing a decimal number's final digit. -const char one_ASCII_final_digits[10][2] { - {'0', 0}, {'1', 0}, {'2', 0}, {'3', 0}, {'4', 0}, - {'5', 0}, {'6', 0}, {'7', 0}, {'8', 0}, {'9', 0}, -}; +// Various routines to encode integers to strings. + +// We split data encodings into a group of 2 digits, 4 digits, 8 digits as +// it's easier to combine powers of two into scalar arithmetic. + +// Previous implementation used a lookup table of 200 bytes for every 2 bytes +// and it was memory bound, any L1 cache miss would result in a much slower +// result. When benchmarking with a cache eviction rate of several percent, +// this implementation proved to be better. + +// These constants represent '00', '0000' and '00000000' as ascii strings in +// integers. We can add these numbers if we encode to bytes from 0 to 9. as +// 'i' = '0' + i for 0 <= i <= 9. +constexpr uint32_t kTwoZeroBytes = 0x0101 * '0'; +constexpr uint64_t kFourZeroBytes = 0x01010101 * '0'; +constexpr uint64_t kEightZeroBytes = 0x0101010101010101ull * '0'; + +// * 103 / 1024 is a division by 10 for values from 0 to 99. It's also a +// division of a structure [k takes 2 bytes][m takes 2 bytes], then * 103 / 1024 +// will be [k / 10][m / 10]. It allows parallel division. +constexpr uint64_t kDivisionBy10Mul = 103u; +constexpr uint64_t kDivisionBy10Div = 1 << 10; + +// * 10486 / 1048576 is a division by 100 for values from 0 to 9999. +constexpr uint64_t kDivisionBy100Mul = 10486u; +constexpr uint64_t kDivisionBy100Div = 1 << 20; + +// Encode functions write the ASCII output of input `n` to `out_str`. +inline char* EncodeHundred(uint32_t n, char* out_str) { + int num_digits = static_cast(n - 10) >> 8; + uint32_t base = kTwoZeroBytes; + uint32_t div10 = (n * kDivisionBy10Mul) / kDivisionBy10Div; + uint32_t mod10 = n - 10u * div10; + base += div10 + (mod10 << 8); + base >>= num_digits & 8; + memcpy(out_str, &base, 2); + return out_str + 2 + num_digits; +} -} // namespace +inline char* EncodeTenThousand(uint32_t n, char* out_str) { + // We split lower 2 digits and upper 2 digits of n into 2 byte consecutive + // blocks. 123 -> [\0\1][\0\23]. We divide by 10 both blocks + // (it's 1 division + zeroing upper bits), and compute modulo 10 as well "in + // parallel". Then we combine both results to have both ASCII digits, + // strip trailing zeros, add ASCII '0000' and return. + uint32_t div100 = (n * kDivisionBy100Mul) / kDivisionBy100Div; + uint32_t mod100 = n - 100ull * div100; + uint32_t hundreds = (mod100 << 16) + div100; + uint32_t tens = (hundreds * kDivisionBy10Mul) / kDivisionBy10Div; + tens &= (0xFull << 16) | 0xFull; + tens += (hundreds - 10ull * tens) << 8; + ABSL_ASSUME(tens != 0); + // The result can contain trailing zero bits, we need to strip them to a first + // significant byte in a final representation. For example, for n = 123, we + // have tens to have representation \0\1\2\3. We do `& -8` to round + // to a multiple to 8 to strip zero bytes, not all zero bits. + // countr_zero to help. + // 0 minus 8 to make MSVC happy. + uint32_t zeroes = static_cast(absl::countr_zero(tens)) & (0 - 8ull); + tens += kFourZeroBytes; + tens >>= zeroes; + memcpy(out_str, &tens, sizeof(tens)); + return out_str + sizeof(tens) - zeroes / 8; +} -char* numbers_internal::FastIntToBuffer(uint32_t i, char* buffer) { - uint32_t digits; - // The idea of this implementation is to trim the number of divides to as few - // as possible, and also reducing memory stores and branches, by going in - // steps of two digits at a time rather than one whenever possible. - // The huge-number case is first, in the hopes that the compiler will output - // that case in one branch-free block of code, and only output conditional - // branches into it from below. - if (i >= 1000000000) { // >= 1,000,000,000 - digits = i / 100000000; // 100,000,000 - i -= digits * 100000000; - PutTwoDigits(digits, buffer); - buffer += 2; - lt100_000_000: - digits = i / 1000000; // 1,000,000 - i -= digits * 1000000; - PutTwoDigits(digits, buffer); - buffer += 2; - lt1_000_000: - digits = i / 10000; // 10,000 - i -= digits * 10000; - PutTwoDigits(digits, buffer); - buffer += 2; - lt10_000: - digits = i / 100; - i -= digits * 100; - PutTwoDigits(digits, buffer); - buffer += 2; - lt100: - digits = i; - PutTwoDigits(digits, buffer); - buffer += 2; - *buffer = 0; - return buffer; - } +// Prepare functions return an integer that should be written to out_str +// (but possibly include trailing zeros). +// For hi < 10000, lo < 10000 returns uint64_t as encoded in ASCII with +// possibly trailing zeroes of the number hi * 10000 + lo. +inline uint64_t PrepareTenThousands(uint64_t hi, uint64_t lo) { + uint64_t merged = hi | (lo << 32); + uint64_t div100 = ((merged * kDivisionBy100Mul) / kDivisionBy100Div) & + ((0x7Full << 32) | 0x7Full); + uint64_t mod100 = merged - 100ull * div100; + uint64_t hundreds = (mod100 << 16) + div100; + uint64_t tens = (hundreds * kDivisionBy10Mul) / kDivisionBy10Div; + tens &= (0xFull << 48) | (0xFull << 32) | (0xFull << 16) | 0xFull; + tens += (hundreds - 10ull * tens) << 8; + return tens; +} - if (i < 100) { - digits = i; - if (i >= 10) goto lt100; - memcpy(buffer, one_ASCII_final_digits[i], 2); - return buffer + 1; - } - if (i < 10000) { // 10,000 - if (i >= 1000) goto lt10_000; - digits = i / 100; - i -= digits * 100; - *buffer++ = '0' + static_cast(digits); - goto lt100; +inline char* EncodeFullU32(uint32_t n, char* out_str) { + if (n < 100'000'000) { + uint64_t bottom = PrepareTenThousands(n / 10000, n % 10000); + ABSL_ASSUME(bottom != 0); + // 0 minus 8 to make MSVC happy. + uint32_t zeroes = static_cast(absl::countr_zero(bottom)) + & (0 - 8ull); + uint64_t bottom_res = bottom + kEightZeroBytes; + bottom_res >>= zeroes; + memcpy(out_str, &bottom_res, sizeof(bottom)); + return out_str + sizeof(bottom) - zeroes / 8; } - if (i < 1000000) { // 1,000,000 - if (i >= 100000) goto lt1_000_000; - digits = i / 10000; // 10,000 - i -= digits * 10000; - *buffer++ = '0' + static_cast(digits); - goto lt10_000; + uint32_t top = n / 100'000'000; + n %= 100'000'000; + uint64_t bottom = PrepareTenThousands(n / 10000, n % 10000); + uint64_t bottom_res = bottom + kEightZeroBytes; + out_str = EncodeHundred(top, out_str); + memcpy(out_str, &bottom_res, sizeof(bottom)); + return out_str + sizeof(bottom); +} + +} // namespace + +char* numbers_internal::FastIntToBuffer(uint32_t n, char* out_str) { + if (n < 100) { + out_str = EncodeHundred(n, out_str); + goto set_last_zero; } - if (i < 100000000) { // 100,000,000 - if (i >= 10000000) goto lt100_000_000; - digits = i / 1000000; // 1,000,000 - i -= digits * 1000000; - *buffer++ = '0' + static_cast(digits); - goto lt1_000_000; + if (n < 10000) { + out_str = EncodeTenThousand(n, out_str); + goto set_last_zero; } - // we already know that i < 1,000,000,000 - digits = i / 100000000; // 100,000,000 - i -= digits * 100000000; - *buffer++ = '0' + static_cast(digits); - goto lt100_000_000; + out_str = EncodeFullU32(n, out_str); +set_last_zero: + *out_str = '\0'; + return out_str; } char* numbers_internal::FastIntToBuffer(int32_t i, char* buffer) { @@ -230,41 +272,40 @@ char* numbers_internal::FastIntToBuffer(uint64_t i, char* buffer) { uint32_t u32 = static_cast(i); if (u32 == i) return numbers_internal::FastIntToBuffer(u32, buffer); - // Here we know i has at least 10 decimal digits. - uint64_t top_1to11 = i / 1000000000; - u32 = static_cast(i - top_1to11 * 1000000000); - uint32_t top_1to11_32 = static_cast(top_1to11); + // 10**9 < 2**32 <= i < 10**10, we can do 2+8 + uint64_t div08 = i / 100'000'000ull; + uint64_t mod08 = i % 100'000'000ull; + uint64_t mod_result = + PrepareTenThousands(mod08 / 10000, mod08 % 10000) + kEightZeroBytes; + if (i < 10'000'000'000ull) { + buffer = EncodeHundred(static_cast(div08), buffer); + memcpy(buffer, &mod_result, 8); + buffer += 8; + goto set_last_zero; + } - if (top_1to11_32 == top_1to11) { - buffer = numbers_internal::FastIntToBuffer(top_1to11_32, buffer); + // i < 10**16, in this case 8+8 + if (i < 10'000'000'000'000'000ull) { + buffer = EncodeFullU32(static_cast(div08), buffer); + memcpy(buffer, &mod_result, 8); + buffer += 8; + goto set_last_zero; } else { - // top_1to11 has more than 32 bits too; print it in two steps. - uint32_t top_8to9 = static_cast(top_1to11 / 100); - uint32_t mid_2 = static_cast(top_1to11 - top_8to9 * 100); - buffer = numbers_internal::FastIntToBuffer(top_8to9, buffer); - PutTwoDigits(mid_2, buffer); - buffer += 2; + // 4 + 8 + 8 + uint64_t div016 = i / 10'000'000'000'000'000ull; + buffer = EncodeTenThousand(static_cast(div016), buffer); + uint64_t mid_result = div08 - div016 * 100'000'000ull; + mid_result = PrepareTenThousands(mid_result / 10000, mid_result % 10000) + + kEightZeroBytes; + memcpy(buffer, &mid_result, 8); + buffer += 8; + memcpy(buffer, &mod_result, 8); + buffer += 8; + goto set_last_zero; } - - // We have only 9 digits now, again the maximum uint32_t can handle fully. - uint32_t digits = u32 / 10000000; // 10,000,000 - u32 -= digits * 10000000; - PutTwoDigits(digits, buffer); - buffer += 2; - digits = u32 / 100000; // 100,000 - u32 -= digits * 100000; - PutTwoDigits(digits, buffer); - buffer += 2; - digits = u32 / 1000; // 1,000 - u32 -= digits * 1000; - PutTwoDigits(digits, buffer); - buffer += 2; - digits = u32 / 10; - u32 -= digits * 10; - PutTwoDigits(digits, buffer); - buffer += 2; - memcpy(buffer, one_ASCII_final_digits[u32], 2); - return buffer + 1; +set_last_zero: + *buffer = '\0'; + return buffer; } char* numbers_internal::FastIntToBuffer(int64_t i, char* buffer) { -- cgit v1.2.3