1 files changed, 468 insertions, 0 deletions
diff --git a/absl/crc/internal/crc.cc b/absl/crc/internal/crc.cc
new file mode 100644
index 00000000..bb8936e3
--- /dev/null
+++ b/absl/crc/internal/crc.cc
@@ -0,0 +1,468 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Implementation of CRCs (aka Rabin Fingerprints).
+// Treats the input as a polynomial with coefficients in Z(2),
+// and finds the remainder when divided by an irreducible polynomial
+// of the appropriate length.
+// It handles all CRC sizes from 8 to 128 bits.
+// It's somewhat complicated by having separate implementations optimized for
+// CRC's <=32 bits, <= 64 bits, and <= 128 bits.
+// The input string is prefixed with a "1" bit, and has "degree" "0" bits
+// appended to it before the remainder is found.   This ensures that
+// short strings are scrambled somewhat and that strings consisting
+// of all nulls have a non-zero CRC.
+//
+// Uses the "interleaved word-by-word" method from
+// "Everything we know about CRC but afraid to forget" by Andrew Kadatch
+// and Bob Jenkins,
+// http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+//
+// The idea is to compute kStride CRCs simultaneously, allowing the
+// processor to more effectively use multiple execution units. Each of
+// the CRCs is calculated on one word of data followed by kStride - 1
+// words of zeroes; the CRC starting points are staggered by one word.
+// Assuming a stride of 4 with data words "ABCDABCDABCD", the first
+// CRC is over A000A000A, the second over 0B000B000B, and so on.
+// The CRC of the whole data is then calculated by properly aligning the
+// CRCs by appending zeroes until the data lengths agree then XORing
+// the CRCs.
+
+#include "absl/crc/internal/crc.h"
+
+#include <cstdint>
+
+#include "absl/base/internal/endian.h"
+#include "absl/base/internal/prefetch.h"
+#include "absl/base/internal/raw_logging.h"
+#include "absl/crc/internal/crc_internal.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+namespace {
+
+// Constants
+#if defined(__i386__) || defined(__x86_64__)
+constexpr bool kNeedAlignedLoads = false;
+#else
+constexpr bool kNeedAlignedLoads = true;
+#endif
+
+// We express the number of zeroes as a number in base ZEROES_BASE. By
+// pre-computing the zero extensions for all possible components of such an
+// expression (numbers in a form a*ZEROES_BASE**b), we can calculate the
+// resulting extension by multiplying the extensions for individual components
+// using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables of
+// zero extensions contain (ZEROES_BASE - 1) * (log_{ZEROES_BASE}(64)) entries.
+constexpr int ZEROES_BASE_LG = 4;                   // log_2(ZEROES_BASE)
+constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG);  // must be a power of 2
+
+constexpr uint32_t kCrc32cPoly = 0x82f63b78;
+
+uint32_t ReverseBits(uint32_t bits) {
+  bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1;
+  bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2;
+  bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4;
+  return absl::gbswap_32(bits);
+}
+
+// Polynomial long multiplication mod the polynomial of degree 32.
+void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) {
+  uint32_t l = *val;
+  uint32_t result = 0;
+  auto onebit = uint32_t{0x80000000u};
+  for (uint32_t one = onebit; one != 0; one >>= 1) {
+    if ((l & one) != 0) {
+      result ^= m;
+    }
+    if (m & 1) {
+      m = (m >> 1) ^ poly;
+    } else {
+      m >>= 1;
+    }
+  }
+  *val = result;
+}
+}  // namespace
+
+void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size,
+                            Uint32By256* t) {
+  for (int j = 0; j != word_size; j++) {  // for each byte of extension....
+    t[j][0] = 0;                          // a zero has no effect
+    for (int i = 128; i != 0; i >>= 1) {  // fill in entries for powers of 2
+      if (j == 0 && i == 128) {
+        t[j][i] = last;  // top bit in last byte is given
+      } else {
+        // each successive power of two is derived from the previous
+        // one, either in this table, or the last table
+        uint32_t pred;
+        if (i == 128) {
+          pred = t[j - 1][1];
+        } else {
+          pred = t[j][i << 1];
+        }
+        // Advance the CRC by one bit (multiply by X, and take remainder
+        // through one step of polynomial long division)
+        if (pred & 1) {
+          t[j][i] = (pred >> 1) ^ poly;
+        } else {
+          t[j][i] = pred >> 1;
+        }
+      }
+    }
+    // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b)
+    // so we can make all the tables for non-powers of two by
+    // xoring previously created entries.
+    for (int i = 2; i != 256; i <<= 1) {
+      for (int k = i + 1; k != (i << 1); k++) {
+        t[j][k] = t[j][i] ^ t[j][k - i];
+      }
+    }
+  }
+}
+
+int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) {
+  uint32_t inc = 1;
+  inc <<= 31;
+
+  // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0.
+  inc >>= 1;
+
+  // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte.
+  for (int i = 0; i < 3; ++i) {
+    PolyMultiply(&inc, inc, poly);
+  }
+
+  int j = 0;
+  for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) {
+    // Every entry in the table adds an additional inc_len zeroes.
+    uint32_t v = inc;
+    for (int a = 1; a != ZEROES_BASE; a++) {
+      t[0][j] = v;
+      PolyMultiply(&v, inc, poly);
+      j++;
+    }
+    inc = v;
+  }
+  ABSL_RAW_CHECK(j <= 256, "");
+  return j;
+}
+
+// Internal version of the "constructor".
+CRCImpl* CRCImpl::NewInternal() {
+  // Find an accelearated implementation first.
+  CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined();
+
+  // Fall back to generic implementions if no acceleration is available.
+  if (result == nullptr) {
+    result = new CRC32();
+  }
+
+  result->InitTables();
+
+  return result;
+}
+
+// The CRC of the empty string is always the CRC polynomial itself.
+void CRCImpl::Empty(uint32_t* crc) const { *crc = kCrc32cPoly; }
+
+//  The 32-bit implementation
+
+void CRC32::InitTables() {
+  // Compute the table for extending a CRC by one byte.
+  Uint32By256* t = new Uint32By256[4];
+  FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t);
+  for (int i = 0; i != 256; i++) {
+    this->table0_[i] = t[0][i];
+  }
+
+  // Construct a table for updating the CRC by 4 bytes data followed by
+  // 12 bytes of zeroes.
+  //
+  // Note: the data word size could be larger than the CRC size; it might
+  // be slightly faster to use a 64-bit data word, but doing so doubles the
+  // table size.
+  uint32_t last = kCrc32cPoly;
+  const size_t size = 12;
+  for (size_t i = 0; i < size; ++i) {
+    last = (last >> 8) ^ this->table0_[last & 0xff];
+  }
+  FillWordTable(kCrc32cPoly, last, 4, t);
+  for (size_t b = 0; b < 4; ++b) {
+    for (int i = 0; i < 256; ++i) {
+      this->table_[b][i] = t[b][i];
+    }
+  }
+
+  int j = FillZeroesTable(kCrc32cPoly, t);
+  ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->zeroes_)), "");
+  for (int i = 0; i < j; i++) {
+    this->zeroes_[i] = t[0][i];
+  }
+
+  delete[] t;
+
+  // Build up tables for _reversing_ the operation of doing CRC operations on
+  // zero bytes.
+
+  // In C++, extending `crc` by a single zero bit is done by the following:
+  // (A)  bool low_bit_set = (crc & 1);
+  //      crc >>= 1;
+  //      if (low_bit_set) crc ^= kCrc32cPoly;
+  //
+  // In particular note that the high bit of `crc` after this operation will be
+  // set if and only if the low bit of `crc` was set before it.  This means that
+  // no information is lost, and the operation can be reversed, as follows:
+  // (B)  bool high_bit_set = (crc & 0x80000000u);
+  //      if (high_bit_set) crc ^= kCrc32cPoly;
+  //      crc <<= 1;
+  //      if (high_bit_set) crc ^= 1;
+  //
+  // Or, equivalently:
+  // (C)  bool high_bit_set = (crc & 0x80000000u);
+  //      crc <<= 1;
+  //      if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1);
+  //
+  // The last observation is, if we store our checksums in variable `rcrc`,
+  // with order of the bits reversed, the inverse operation becomes:
+  // (D)  bool low_bit_set = (rcrc & 1);
+  //      rcrc >>= 1;
+  //      if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1)
+  //
+  // This is the same algorithm (A) that we started with, only with a different
+  // polynomial bit pattern.  This means that by building up our tables with
+  // this alternate polynomial, we can apply the CRC algorithms to a
+  // bit-reversed CRC checksum to perform inverse zero-extension.
+
+  const uint32_t kCrc32cUnextendPoly =
+      ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1));
+  FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_);
+
+  j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_);
+  ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->reverse_zeroes_)),
+                 "");
+}
+
+void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
+  const uint8_t* p = static_cast<const uint8_t*>(bytes);
+  const uint8_t* e = p + length;
+  uint32_t l = *crc;
+
+  auto step_one_byte = [this, &p, &l] () {
+    int c = (l & 0xff) ^ *p++;
+    l = this->table0_[c] ^ (l >> 8);
+  };
+
+  if (kNeedAlignedLoads) {
+    // point x at first 4-byte aligned byte in string. this might be past the
+    // end of the string.
+    const uint8_t* x = RoundUp<4>(p);
+    if (x <= e) {
+      // Process bytes until finished or p is 4-byte aligned
+      while (p != x) {
+        step_one_byte();
+      }
+    }
+  }
+
+  const size_t kSwathSize = 16;
+  if (static_cast<size_t>(e - p) >= kSwathSize) {
+    // Load one swath of data into the operating buffers.
+    uint32_t buf0 = absl::little_endian::Load32(p) ^ l;
+    uint32_t buf1 = absl::little_endian::Load32(p + 4);
+    uint32_t buf2 = absl::little_endian::Load32(p + 8);
+    uint32_t buf3 = absl::little_endian::Load32(p + 12);
+    p += kSwathSize;
+
+    // Increment a CRC value by a "swath"; this combines the four bytes
+    // starting at `ptr` and twelve zero bytes, so that four CRCs can be
+    // built incrementally and combined at the end.
+    const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) {
+      return absl::little_endian::Load32(ptr) ^
+             this->table_[3][crc_in & 0xff] ^
+             this->table_[2][(crc_in >> 8) & 0xff] ^
+             this->table_[1][(crc_in >> 16) & 0xff] ^
+             this->table_[0][crc_in >> 24];
+    };
+
+    // Run one CRC calculation step over all swaths in one 16-byte stride
+    const auto step_stride = [&]() {
+      buf0 = step_swath(buf0, p);
+      buf1 = step_swath(buf1, p + 4);
+      buf2 = step_swath(buf2, p + 8);
+      buf3 = step_swath(buf3, p + 12);
+      p += 16;
+    };
+
+    // Process kStride interleaved swaths through the data in parallel.
+    while ((e - p) > kPrefetchHorizon) {
+      base_internal::PrefetchNta(
+          reinterpret_cast<const void*>(p + kPrefetchHorizon));
+      // Process 64 bytes at a time
+      step_stride();
+      step_stride();
+      step_stride();
+      step_stride();
+    }
+    while (static_cast<size_t>(e - p) >= kSwathSize) {
+      step_stride();
+    }
+
+    // Now advance one word at a time as far as possible. This isn't worth
+    // doing if we have word-advance tables.
+    while (static_cast<size_t>(e - p) >= 4) {
+      buf0 = step_swath(buf0, p);
+      uint32_t tmp = buf0;
+      buf0 = buf1;
+      buf1 = buf2;
+      buf2 = buf3;
+      buf3 = tmp;
+      p += 4;
+    }
+
+    // Combine the results from the different swaths. This is just a CRC
+    // on the data values in the bufX words.
+    auto combine_one_word = [this](uint32_t crc_in, uint32_t w) {
+      w ^= crc_in;
+      for (size_t i = 0; i < 4; ++i) {
+        w = (w >> 8) ^ this->table0_[w & 0xff];
+      }
+      return w;
+    };
+
+    l = combine_one_word(0, buf0);
+    l = combine_one_word(l, buf1);
+    l = combine_one_word(l, buf2);
+    l = combine_one_word(l, buf3);
+  }
+
+  // Process the last few bytes
+  while (p != e) {
+    step_one_byte();
+  }
+
+  *crc = l;
+}
+
+void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length,
+                               const uint32_t zeroes_table[256],
+                               const uint32_t poly_table[256]) const {
+  if (length != 0) {
+    uint32_t l = *crc;
+    // For each ZEROES_BASE_LG bits in length
+    // (after the low-order bits have been removed)
+    // we lookup the appropriate polynomial in the zeroes_ array
+    // and do a polynomial long multiplication (mod the CRC polynomial)
+    // to extend the CRC by the appropriate number of bits.
+    for (int i = 0; length != 0;
+         i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) {
+      int c = length & (ZEROES_BASE - 1);  // pick next ZEROES_BASE_LG bits
+      if (c != 0) {                        // if they are not zero,
+                                           // multiply by entry in table
+        // Build a table to aid in multiplying 2 bits at a time.
+        // It takes too long to build tables for more bits.
+        uint64_t m = zeroes_table[c + i - 1];
+        m <<= 1;
+        uint64_t m2 = m << 1;
+        uint64_t mtab[4] = {0, m, m2, m2 ^ m};
+
+        // Do the multiply one byte at a time.
+        uint64_t result = 0;
+        for (int x = 0; x < 32; x += 8) {
+          // The carry-less multiply.
+          result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^
+                    (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6);
+          l >>= 8;
+
+          // Reduce modulo the polynomial
+          result = (result >> 8) ^ poly_table[result & 0xff];
+        }
+        l = static_cast<uint32_t>(result);
+      }
+    }
+    *crc = l;
+  }
+}
+
+void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const {
+  return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_);
+}
+
+void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const {
+  // See the comment in CRC32::InitTables() for an explanation of the algorithm
+  // below.
+  *crc = ReverseBits(*crc);
+  ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_);
+  *crc = ReverseBits(*crc);
+}
+
+void CRC32::Scramble(uint32_t* crc) const {
+  // Rotate by near half the word size plus 1.  See the scramble comment in
+  // crc_internal.h for an explanation.
+  constexpr int scramble_rotate = (32 / 2) + 1;
+  *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo),
+                               32, scramble_rotate) &
+         MaskOfLength<uint32_t>(32);
+}
+
+void CRC32::Unscramble(uint32_t* crc) const {
+  constexpr int scramble_rotate = (32 / 2) + 1;
+  uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32,
+                                           32 - scramble_rotate);
+  *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32);
+}
+
+// Constructor and destructor for base class CRC.
+CRC::~CRC() {}
+CRC::CRC() {}
+
+// The "constructor" for a CRC32C with a standard polynomial.
+CRC* CRC::Crc32c() {
+  static CRC* singleton = CRCImpl::NewInternal();
+  return singleton;
+}
+
+// This Concat implementation works for arbitrary polynomials.
+void CRC::Concat(uint32_t* px, uint32_t y, size_t ylen) {
+  // https://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks
+  // The CRC of a message M is the remainder of polynomial divison modulo G,
+  // where the coefficient arithmetic is performed modulo 2 (so +/- are XOR):
+  //   R(x) = M(x) x**n (mod G)
+  // (n is the degree of G)
+  // In practice, we use an initial value A and a bitmask B to get
+  //   R = (A ^ B)x**|M| ^ Mx**n ^ B (mod G)
+  // If M is the concatenation of two strings S and T, and Z is the string of
+  // len(T) 0s, then the remainder CRC(ST) can be expressed as:
+  //   R = (A ^ B)x**|ST| ^ STx**n ^ B
+  //     = (A ^ B)x**|SZ| ^ SZx**n ^ B ^ Tx**n
+  //     = CRC(SZ) ^ Tx**n
+  // CRC(Z) = (A ^ B)x**|T| ^ B
+  // CRC(T) = (A ^ B)x**|T| ^ Tx**n ^ B
+  // So R = CRC(SZ) ^ CRC(Z) ^ CRC(T)
+  //
+  // And further, since CRC(SZ) = Extend(CRC(S), Z),
+  //  CRC(SZ) ^ CRC(Z) = Extend(CRC(S) ^ CRC(''), Z).
+  uint32_t z;
+  uint32_t t;
+  Empty(&z);
+  t = *px ^ z;
+  ExtendByZeroes(&t, ylen);
+  *px = t ^ y;
+}
+
+}  // namespace crc_internal
+ABSL_NAMESPACE_END
+}  // namespace absl