diff options
Diffstat (limited to 'absl/crc/internal/crc.cc')
-rw-r--r-- | absl/crc/internal/crc.cc | 468 |
1 files changed, 468 insertions, 0 deletions
diff --git a/absl/crc/internal/crc.cc b/absl/crc/internal/crc.cc new file mode 100644 index 00000000..bb8936e3 --- /dev/null +++ b/absl/crc/internal/crc.cc @@ -0,0 +1,468 @@ +// Copyright 2022 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implementation of CRCs (aka Rabin Fingerprints). +// Treats the input as a polynomial with coefficients in Z(2), +// and finds the remainder when divided by an irreducible polynomial +// of the appropriate length. +// It handles all CRC sizes from 8 to 128 bits. +// It's somewhat complicated by having separate implementations optimized for +// CRC's <=32 bits, <= 64 bits, and <= 128 bits. +// The input string is prefixed with a "1" bit, and has "degree" "0" bits +// appended to it before the remainder is found. This ensures that +// short strings are scrambled somewhat and that strings consisting +// of all nulls have a non-zero CRC. +// +// Uses the "interleaved word-by-word" method from +// "Everything we know about CRC but afraid to forget" by Andrew Kadatch +// and Bob Jenkins, +// http://crcutil.googlecode.com/files/crc-doc.1.0.pdf +// +// The idea is to compute kStride CRCs simultaneously, allowing the +// processor to more effectively use multiple execution units. Each of +// the CRCs is calculated on one word of data followed by kStride - 1 +// words of zeroes; the CRC starting points are staggered by one word. +// Assuming a stride of 4 with data words "ABCDABCDABCD", the first +// CRC is over A000A000A, the second over 0B000B000B, and so on. +// The CRC of the whole data is then calculated by properly aligning the +// CRCs by appending zeroes until the data lengths agree then XORing +// the CRCs. + +#include "absl/crc/internal/crc.h" + +#include <cstdint> + +#include "absl/base/internal/endian.h" +#include "absl/base/internal/prefetch.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/crc/internal/crc_internal.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace crc_internal { + +namespace { + +// Constants +#if defined(__i386__) || defined(__x86_64__) +constexpr bool kNeedAlignedLoads = false; +#else +constexpr bool kNeedAlignedLoads = true; +#endif + +// We express the number of zeroes as a number in base ZEROES_BASE. By +// pre-computing the zero extensions for all possible components of such an +// expression (numbers in a form a*ZEROES_BASE**b), we can calculate the +// resulting extension by multiplying the extensions for individual components +// using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables of +// zero extensions contain (ZEROES_BASE - 1) * (log_{ZEROES_BASE}(64)) entries. +constexpr int ZEROES_BASE_LG = 4; // log_2(ZEROES_BASE) +constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG); // must be a power of 2 + +constexpr uint32_t kCrc32cPoly = 0x82f63b78; + +uint32_t ReverseBits(uint32_t bits) { + bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1; + bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2; + bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4; + return absl::gbswap_32(bits); +} + +// Polynomial long multiplication mod the polynomial of degree 32. +void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) { + uint32_t l = *val; + uint32_t result = 0; + auto onebit = uint32_t{0x80000000u}; + for (uint32_t one = onebit; one != 0; one >>= 1) { + if ((l & one) != 0) { + result ^= m; + } + if (m & 1) { + m = (m >> 1) ^ poly; + } else { + m >>= 1; + } + } + *val = result; +} +} // namespace + +void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size, + Uint32By256* t) { + for (int j = 0; j != word_size; j++) { // for each byte of extension.... + t[j][0] = 0; // a zero has no effect + for (int i = 128; i != 0; i >>= 1) { // fill in entries for powers of 2 + if (j == 0 && i == 128) { + t[j][i] = last; // top bit in last byte is given + } else { + // each successive power of two is derived from the previous + // one, either in this table, or the last table + uint32_t pred; + if (i == 128) { + pred = t[j - 1][1]; + } else { + pred = t[j][i << 1]; + } + // Advance the CRC by one bit (multiply by X, and take remainder + // through one step of polynomial long division) + if (pred & 1) { + t[j][i] = (pred >> 1) ^ poly; + } else { + t[j][i] = pred >> 1; + } + } + } + // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b) + // so we can make all the tables for non-powers of two by + // xoring previously created entries. + for (int i = 2; i != 256; i <<= 1) { + for (int k = i + 1; k != (i << 1); k++) { + t[j][k] = t[j][i] ^ t[j][k - i]; + } + } + } +} + +int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) { + uint32_t inc = 1; + inc <<= 31; + + // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0. + inc >>= 1; + + // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte. + for (int i = 0; i < 3; ++i) { + PolyMultiply(&inc, inc, poly); + } + + int j = 0; + for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) { + // Every entry in the table adds an additional inc_len zeroes. + uint32_t v = inc; + for (int a = 1; a != ZEROES_BASE; a++) { + t[0][j] = v; + PolyMultiply(&v, inc, poly); + j++; + } + inc = v; + } + ABSL_RAW_CHECK(j <= 256, ""); + return j; +} + +// Internal version of the "constructor". +CRCImpl* CRCImpl::NewInternal() { + // Find an accelearated implementation first. + CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined(); + + // Fall back to generic implementions if no acceleration is available. + if (result == nullptr) { + result = new CRC32(); + } + + result->InitTables(); + + return result; +} + +// The CRC of the empty string is always the CRC polynomial itself. +void CRCImpl::Empty(uint32_t* crc) const { *crc = kCrc32cPoly; } + +// The 32-bit implementation + +void CRC32::InitTables() { + // Compute the table for extending a CRC by one byte. + Uint32By256* t = new Uint32By256[4]; + FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t); + for (int i = 0; i != 256; i++) { + this->table0_[i] = t[0][i]; + } + + // Construct a table for updating the CRC by 4 bytes data followed by + // 12 bytes of zeroes. + // + // Note: the data word size could be larger than the CRC size; it might + // be slightly faster to use a 64-bit data word, but doing so doubles the + // table size. + uint32_t last = kCrc32cPoly; + const size_t size = 12; + for (size_t i = 0; i < size; ++i) { + last = (last >> 8) ^ this->table0_[last & 0xff]; + } + FillWordTable(kCrc32cPoly, last, 4, t); + for (size_t b = 0; b < 4; ++b) { + for (int i = 0; i < 256; ++i) { + this->table_[b][i] = t[b][i]; + } + } + + int j = FillZeroesTable(kCrc32cPoly, t); + ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->zeroes_)), ""); + for (int i = 0; i < j; i++) { + this->zeroes_[i] = t[0][i]; + } + + delete[] t; + + // Build up tables for _reversing_ the operation of doing CRC operations on + // zero bytes. + + // In C++, extending `crc` by a single zero bit is done by the following: + // (A) bool low_bit_set = (crc & 1); + // crc >>= 1; + // if (low_bit_set) crc ^= kCrc32cPoly; + // + // In particular note that the high bit of `crc` after this operation will be + // set if and only if the low bit of `crc` was set before it. This means that + // no information is lost, and the operation can be reversed, as follows: + // (B) bool high_bit_set = (crc & 0x80000000u); + // if (high_bit_set) crc ^= kCrc32cPoly; + // crc <<= 1; + // if (high_bit_set) crc ^= 1; + // + // Or, equivalently: + // (C) bool high_bit_set = (crc & 0x80000000u); + // crc <<= 1; + // if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1); + // + // The last observation is, if we store our checksums in variable `rcrc`, + // with order of the bits reversed, the inverse operation becomes: + // (D) bool low_bit_set = (rcrc & 1); + // rcrc >>= 1; + // if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1) + // + // This is the same algorithm (A) that we started with, only with a different + // polynomial bit pattern. This means that by building up our tables with + // this alternate polynomial, we can apply the CRC algorithms to a + // bit-reversed CRC checksum to perform inverse zero-extension. + + const uint32_t kCrc32cUnextendPoly = + ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1)); + FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_); + + j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_); + ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->reverse_zeroes_)), + ""); +} + +void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const { + const uint8_t* p = static_cast<const uint8_t*>(bytes); + const uint8_t* e = p + length; + uint32_t l = *crc; + + auto step_one_byte = [this, &p, &l] () { + int c = (l & 0xff) ^ *p++; + l = this->table0_[c] ^ (l >> 8); + }; + + if (kNeedAlignedLoads) { + // point x at first 4-byte aligned byte in string. this might be past the + // end of the string. + const uint8_t* x = RoundUp<4>(p); + if (x <= e) { + // Process bytes until finished or p is 4-byte aligned + while (p != x) { + step_one_byte(); + } + } + } + + const size_t kSwathSize = 16; + if (static_cast<size_t>(e - p) >= kSwathSize) { + // Load one swath of data into the operating buffers. + uint32_t buf0 = absl::little_endian::Load32(p) ^ l; + uint32_t buf1 = absl::little_endian::Load32(p + 4); + uint32_t buf2 = absl::little_endian::Load32(p + 8); + uint32_t buf3 = absl::little_endian::Load32(p + 12); + p += kSwathSize; + + // Increment a CRC value by a "swath"; this combines the four bytes + // starting at `ptr` and twelve zero bytes, so that four CRCs can be + // built incrementally and combined at the end. + const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) { + return absl::little_endian::Load32(ptr) ^ + this->table_[3][crc_in & 0xff] ^ + this->table_[2][(crc_in >> 8) & 0xff] ^ + this->table_[1][(crc_in >> 16) & 0xff] ^ + this->table_[0][crc_in >> 24]; + }; + + // Run one CRC calculation step over all swaths in one 16-byte stride + const auto step_stride = [&]() { + buf0 = step_swath(buf0, p); + buf1 = step_swath(buf1, p + 4); + buf2 = step_swath(buf2, p + 8); + buf3 = step_swath(buf3, p + 12); + p += 16; + }; + + // Process kStride interleaved swaths through the data in parallel. + while ((e - p) > kPrefetchHorizon) { + base_internal::PrefetchNta( + reinterpret_cast<const void*>(p + kPrefetchHorizon)); + // Process 64 bytes at a time + step_stride(); + step_stride(); + step_stride(); + step_stride(); + } + while (static_cast<size_t>(e - p) >= kSwathSize) { + step_stride(); + } + + // Now advance one word at a time as far as possible. This isn't worth + // doing if we have word-advance tables. + while (static_cast<size_t>(e - p) >= 4) { + buf0 = step_swath(buf0, p); + uint32_t tmp = buf0; + buf0 = buf1; + buf1 = buf2; + buf2 = buf3; + buf3 = tmp; + p += 4; + } + + // Combine the results from the different swaths. This is just a CRC + // on the data values in the bufX words. + auto combine_one_word = [this](uint32_t crc_in, uint32_t w) { + w ^= crc_in; + for (size_t i = 0; i < 4; ++i) { + w = (w >> 8) ^ this->table0_[w & 0xff]; + } + return w; + }; + + l = combine_one_word(0, buf0); + l = combine_one_word(l, buf1); + l = combine_one_word(l, buf2); + l = combine_one_word(l, buf3); + } + + // Process the last few bytes + while (p != e) { + step_one_byte(); + } + + *crc = l; +} + +void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length, + const uint32_t zeroes_table[256], + const uint32_t poly_table[256]) const { + if (length != 0) { + uint32_t l = *crc; + // For each ZEROES_BASE_LG bits in length + // (after the low-order bits have been removed) + // we lookup the appropriate polynomial in the zeroes_ array + // and do a polynomial long multiplication (mod the CRC polynomial) + // to extend the CRC by the appropriate number of bits. + for (int i = 0; length != 0; + i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) { + int c = length & (ZEROES_BASE - 1); // pick next ZEROES_BASE_LG bits + if (c != 0) { // if they are not zero, + // multiply by entry in table + // Build a table to aid in multiplying 2 bits at a time. + // It takes too long to build tables for more bits. + uint64_t m = zeroes_table[c + i - 1]; + m <<= 1; + uint64_t m2 = m << 1; + uint64_t mtab[4] = {0, m, m2, m2 ^ m}; + + // Do the multiply one byte at a time. + uint64_t result = 0; + for (int x = 0; x < 32; x += 8) { + // The carry-less multiply. + result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^ + (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6); + l >>= 8; + + // Reduce modulo the polynomial + result = (result >> 8) ^ poly_table[result & 0xff]; + } + l = static_cast<uint32_t>(result); + } + } + *crc = l; + } +} + +void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const { + return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_); +} + +void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const { + // See the comment in CRC32::InitTables() for an explanation of the algorithm + // below. + *crc = ReverseBits(*crc); + ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_); + *crc = ReverseBits(*crc); +} + +void CRC32::Scramble(uint32_t* crc) const { + // Rotate by near half the word size plus 1. See the scramble comment in + // crc_internal.h for an explanation. + constexpr int scramble_rotate = (32 / 2) + 1; + *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo), + 32, scramble_rotate) & + MaskOfLength<uint32_t>(32); +} + +void CRC32::Unscramble(uint32_t* crc) const { + constexpr int scramble_rotate = (32 / 2) + 1; + uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32, + 32 - scramble_rotate); + *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32); +} + +// Constructor and destructor for base class CRC. +CRC::~CRC() {} +CRC::CRC() {} + +// The "constructor" for a CRC32C with a standard polynomial. +CRC* CRC::Crc32c() { + static CRC* singleton = CRCImpl::NewInternal(); + return singleton; +} + +// This Concat implementation works for arbitrary polynomials. +void CRC::Concat(uint32_t* px, uint32_t y, size_t ylen) { + // https://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks + // The CRC of a message M is the remainder of polynomial divison modulo G, + // where the coefficient arithmetic is performed modulo 2 (so +/- are XOR): + // R(x) = M(x) x**n (mod G) + // (n is the degree of G) + // In practice, we use an initial value A and a bitmask B to get + // R = (A ^ B)x**|M| ^ Mx**n ^ B (mod G) + // If M is the concatenation of two strings S and T, and Z is the string of + // len(T) 0s, then the remainder CRC(ST) can be expressed as: + // R = (A ^ B)x**|ST| ^ STx**n ^ B + // = (A ^ B)x**|SZ| ^ SZx**n ^ B ^ Tx**n + // = CRC(SZ) ^ Tx**n + // CRC(Z) = (A ^ B)x**|T| ^ B + // CRC(T) = (A ^ B)x**|T| ^ Tx**n ^ B + // So R = CRC(SZ) ^ CRC(Z) ^ CRC(T) + // + // And further, since CRC(SZ) = Extend(CRC(S), Z), + // CRC(SZ) ^ CRC(Z) = Extend(CRC(S) ^ CRC(''), Z). + uint32_t z; + uint32_t t; + Empty(&z); + t = *px ^ z; + ExtendByZeroes(&t, ylen); + *px = t ^ y; +} + +} // namespace crc_internal +ABSL_NAMESPACE_END +} // namespace absl |