// Copyright 2017 The Abseil Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "absl/strings/ascii.h" #include #include #include #include #include #include "absl/base/config.h" #include "absl/base/nullability.h" namespace absl { ABSL_NAMESPACE_BEGIN namespace ascii_internal { // # Table generated by this Python code (bit 0x02 is currently unused): // TODO(mbar) Move Python code for generation of table to BUILD and link here. // NOTE: The kAsciiPropertyBits table used within this code was generated by // Python code of the following form. (Bit 0x02 is currently unused and // available.) // // def Hex2(n): // return '0x' + hex(n/16)[2:] + hex(n%16)[2:] // def IsPunct(ch): // return (ord(ch) >= 32 and ord(ch) < 127 and // not ch.isspace() and not ch.isalnum()) // def IsBlank(ch): // return ch in ' \t' // def IsCntrl(ch): // return ord(ch) < 32 or ord(ch) == 127 // def IsXDigit(ch): // return ch.isdigit() or ch.lower() in 'abcdef' // for i in range(128): // ch = chr(i) // mask = ((ch.isalpha() and 0x01 or 0) | // (ch.isalnum() and 0x04 or 0) | // (ch.isspace() and 0x08 or 0) | // (IsPunct(ch) and 0x10 or 0) | // (IsBlank(ch) and 0x20 or 0) | // (IsCntrl(ch) and 0x40 or 0) | // (IsXDigit(ch) and 0x80 or 0)) // print Hex2(mask) + ',', // if i % 16 == 7: // print ' //', Hex2(i & 0x78) // elif i % 16 == 15: // print // clang-format off // Array of bitfields holding character information. Each bit value corresponds // to a particular character feature. For readability, and because the value // of these bits is tightly coupled to this implementation, the individual bits // are not named. Note that bitfields for all characters above ASCII 127 are // zero-initialized. ABSL_DLL const unsigned char kPropertyBits[256] = { 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00 0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30 0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40, }; // Array of characters for the ascii_tolower() function. For values 'A' // through 'Z', return the lower-case character; otherwise, return the // identity of the passed character. ABSL_DLL const char kToLower[256] = { '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f', '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f', '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f', '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67', '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f', '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77', '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f', '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', }; // Array of characters for the ascii_toupper() function. For values 'a' // through 'z', return the upper-case character; otherwise, return the // identity of the passed character. ABSL_DLL const char kToUpper[256] = { '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f', '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f', '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47', '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f', '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57', '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f', '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f', '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', }; // clang-format on template static constexpr T BroadcastByte(unsigned char value) { static_assert(std::is_integral::value && sizeof(T) <= sizeof(uint64_t) && std::is_unsigned::value, "only unsigned integers up to 64-bit allowed"); T result = value; constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT; result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1)); result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1)); result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1)); return result; } // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`). // Implemented by: // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26). // 2. Comparing to SCHAR_MIN + 26. template constexpr bool AsciiInAZRange(unsigned char c) { constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN; constexpr signed char threshold = SCHAR_MIN + 26; // 26 = alphabet size. // Using unsigned arithmetic as overflows/underflows are well defined. unsigned char u = c - sub; // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms. return static_cast(u) < threshold; } template static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull p, absl::Nonnull end) { using vec_t = size_t; const size_t n = static_cast(end - p); // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z'; char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t); while (p < swar_end) { vec_t v = vec_t(); // memcpy the vector, but constexpr for (size_t i = 0; i < sizeof(vec_t); ++i) { v |= static_cast(static_cast(p[i])) << (i * CHAR_BIT); } constexpr unsigned int msb = 1u << (CHAR_BIT - 1); const vec_t v_msb = v & BroadcastByte(msb); const vec_t v_nonascii_mask = (v_msb << 1) - (v_msb >> (CHAR_BIT - 1)); const vec_t v_nonascii = v & v_nonascii_mask; const vec_t v_ascii = v & ~v_nonascii_mask; const vec_t a = v_ascii + BroadcastByte(msb - ch_a - 0), z = v_ascii + BroadcastByte(msb - ch_z - 1); v = v_nonascii | (v_ascii ^ ((a ^ z) & BroadcastByte(msb)) >> 2); // memcpy the vector, but constexpr for (size_t i = 0; i < sizeof(vec_t); ++i) { p[i] = static_cast(v >> (i * CHAR_BIT)); } p += sizeof(v); } return p; } template static constexpr void AsciiStrCaseFold(absl::Nonnull p, absl::Nonnull end) { // The upper- and lowercase versions of ASCII characters differ by only 1 bit. // When we need to flip the case, we can xor with this bit to achieve the // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We // could have chosen 'z' and 'Z', or any other pair of characters as they all // have the same single bit difference. constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A'; using vec_t = size_t; // TODO(b/316380338): When FDO becomes able to vectorize these, // revert this manual optimization and just leave the naive loop. if (static_cast(end - p) >= sizeof(vec_t)) { p = ascii_internal::PartialAsciiStrCaseFold(p, end); } while (p < end) { unsigned char v = static_cast(*p); v ^= AsciiInAZRange(v) ? kAsciiCaseBitFlip : 0; *p = static_cast(v); ++p; } } static constexpr size_t ValidateAsciiCasefold() { constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN; size_t incorrect_index = 0; char lowered[num_chars] = {}; char uppered[num_chars] = {}; for (unsigned int i = 0; i < num_chars; ++i) { uppered[i] = lowered[i] = static_cast(i); } AsciiStrCaseFold(&lowered[0], &lowered[num_chars]); AsciiStrCaseFold(&uppered[0], &uppered[num_chars]); for (size_t i = 0; i < num_chars; ++i) { const char ch = static_cast(i), ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch), ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch); if (uppered[i] != ch_upper || lowered[i] != ch_lower) { incorrect_index = i > 0 ? i : num_chars; break; } } return incorrect_index; } static_assert(ValidateAsciiCasefold() == 0, "error in case conversion"); } // namespace ascii_internal void AsciiStrToLower(absl::Nonnull s) { char* p = &(*s)[0]; // Guaranteed to be valid for empty strings return ascii_internal::AsciiStrCaseFold(p, p + s->size()); } void AsciiStrToUpper(absl::Nonnull s) { char* p = &(*s)[0]; // Guaranteed to be valid for empty strings return ascii_internal::AsciiStrCaseFold(p, p + s->size()); } void RemoveExtraAsciiWhitespace(absl::Nonnull str) { auto stripped = StripAsciiWhitespace(*str); if (stripped.empty()) { str->clear(); return; } auto input_it = stripped.begin(); auto input_end = stripped.end(); auto output_it = &(*str)[0]; bool is_ws = false; for (; input_it < input_end; ++input_it) { if (is_ws) { // Consecutive whitespace? Keep only the last. is_ws = absl::ascii_isspace(static_cast(*input_it)); if (is_ws) --output_it; } else { is_ws = absl::ascii_isspace(static_cast(*input_it)); } *output_it = *input_it; ++output_it; } str->erase(static_cast(output_it - &(*str)[0])); } ABSL_NAMESPACE_END } // namespace absl