Diffstat (limited to 'absl/crc/internal/crc_memcpy_x86_64.cc')
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_64.cc | 109
1 file changed, 44 insertions(+), 65 deletions(-)
diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc
index 1c3299ee..a148fe17 100644
--- a/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/absl/crc/internal/crc_memcpy_x86_64.cc
@@ -82,21 +82,18 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32};
}
-constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions.
-template <int vec_regions, int int_regions>
+template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
std::array<__m128i, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) {
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < vec_regions; i++) {
- int region = i;
+ for (size_t i = 0; i < vec_regions; i++) {
+ size_t region = i;
auto* vsrc =
reinterpret_cast<const __m128i*>(*src + region_size * region);
@@ -109,27 +106,23 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
_mm_store_si128(vdst, data[i]);
// Compute the running CRC
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 0)))};
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 1)))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
}
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < int_regions; i++) {
- int region = vec_regions + i;
+ for (size_t i = 0; i < int_regions; i++) {
+ size_t region = vec_regions + i;
auto* usrc =
reinterpret_cast<const uint64_t*>(*src + region_size * region);
auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int j = 0; j < kIntLoadsPerVec; j++) {
- int data_index = i * kIntLoadsPerVec + j;
+ for (size_t j = 0; j < kIntLoadsPerVec; j++) {
+ size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
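The hunk above folds each 16-byte vector into the running CRC one 64-bit lane at a time; the new static_cast<uint64_t> makes the conversion from the signed result of _mm_extract_epi64 explicit before it reaches _mm_crc32_u64. A minimal standalone sketch of the same update, outside the diff (the Crc32cOfVector name is illustrative and not part of this file; it needs SSE4.2, e.g. -msse4.2):

#include <cstdint>
#include <nmmintrin.h>  // _mm_crc32_u64 (SSE4.2); pulls in _mm_extract_epi64 (SSE4.1)

// Fold one 16-byte vector into a running CRC32C state, one 64-bit lane at a time.
inline uint32_t Crc32cOfVector(uint32_t crc, __m128i data) {
  // _mm_extract_epi64 yields a signed 64-bit integer, so cast it explicitly
  // before handing it to _mm_crc32_u64, mirroring the change in this patch.
  crc = static_cast<uint32_t>(_mm_crc32_u64(
      crc, static_cast<uint64_t>(_mm_extract_epi64(data, 0))));
  crc = static_cast<uint32_t>(_mm_crc32_u64(
      crc, static_cast<uint64_t>(_mm_extract_epi64(data, 1))));
  return crc;
}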
@@ -148,7 +141,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
} // namespace
-template <int vec_regions, int int_regions>
+template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
public:
AcceleratedCrcMemcpyEngine() = default;
@@ -160,12 +153,12 @@ class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
std::size_t length, crc32c_t initial_crc) const override;
};
-template <int vec_regions, int int_regions>
+template <size_t vec_regions, size_t int_regions>
crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
void* __restrict dst, const void* __restrict src, std::size_t length,
crc32c_t initial_crc) const {
constexpr std::size_t kRegions = vec_regions + int_regions;
- constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff};
+ constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(__m128i);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
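kCrcDataXor, now a plain uint32_t, is the standard CRC32C inversion mask: the SSE4.2 crc32 instruction works on a raw CRC state, while a finalized crc32c_t value has already been XORed with 0xffffffff, so the mask is applied before extending a CRC and again when finishing it. A rough standalone sketch of that convention (RawExtendCrc32c and its byte-at-a-time tail loop are illustrative, not the engine's actual code path):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>  // _mm_crc32_u64 / _mm_crc32_u8 (SSE4.2)

// Extend a finalized CRC32C over len more bytes.  The XOR with 0xffffffff
// before and after plays the same role as kCrcDataXor above: it converts
// between the finalized CRC value and the raw state the crc32 instruction
// operates on.
inline uint32_t RawExtendCrc32c(uint32_t finalized_crc, const char* data,
                                std::size_t len) {
  uint32_t crc = finalized_crc ^ 0xffffffffu;  // undo the final inversion
  while (len >= sizeof(uint64_t)) {
    uint64_t word;
    std::memcpy(&word, data, sizeof(word));
    crc = static_cast<uint32_t>(_mm_crc32_u64(crc, word));
    data += sizeof(word);
    len -= sizeof(word);
  }
  for (; len > 0; --len) {
    crc = _mm_crc32_u8(crc, static_cast<uint8_t>(*data++));
  }
  return crc ^ 0xffffffffu;  // re-apply the final inversion
}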
@@ -201,7 +194,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Start work on the CRC: undo the XOR from the previous calculation or set up
// the initial value of the CRC.
// initial_crc ^= kCrcDataXor;
- initial_crc = initial_crc ^ kCrcDataXor;
+ initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};
// Do an initial alignment copy, so we can use aligned store instructions to
// the destination pointer. We align the destination pointer because the
@@ -229,13 +222,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Initialize CRCs for kRegions regions.
crc32c_t crcs[kRegions];
crcs[0] = initial_crc;
- for (int i = 1; i < kRegions; i++) {
- crcs[i] = kCrcDataXor;
+ for (size_t i = 1; i < kRegions; i++) {
+ crcs[i] = crc32c_t{kCrcDataXor};
}
// Find the number of rounds to copy and the region size. Also compute the
// tail size here.
- int64_t copy_rounds = length / kCopyRoundSize;
+ size_t copy_rounds = length / kCopyRoundSize;
// Find the size of each region and the size of the tail.
const std::size_t region_size = copy_rounds * kBlockSize;
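As a worked example of the sizing math above, take a hypothetical three-region engine (kBlockSize == 16, so kCopyRoundSize == 48) copying 1000 bytes; the numbers below also assume the tail is simply whatever remains after kRegions equal regions, which is what the elided tail-size line is taken to compute:

#include <cstddef>

// Hypothetical numbers only; the real kRegions depends on the template arguments.
constexpr std::size_t kBlockSize = 16;                               // sizeof(__m128i)
constexpr std::size_t kRegions = 3;                                  // vec_regions + int_regions
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;        // 48
constexpr std::size_t kLength = 1000;                                // example input size
constexpr std::size_t kCopyRounds = kLength / kCopyRoundSize;        // 20
constexpr std::size_t kRegionSize = kCopyRounds * kBlockSize;        // 320 bytes per region
constexpr std::size_t kTailSize = kLength - kRegions * kRegionSize;  // 40 bytes
static_assert(kCopyRounds == 20 && kRegionSize == 320 && kTailSize == 40,
              "worked example");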
@@ -248,10 +241,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Main loop.
while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < kRegions; i++) {
+ for (size_t i = 0; i < kRegions; i++) {
absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
region_size * i);
absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
@@ -259,58 +249,46 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
}
// Load and store data, computing CRC on the way.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < kBlocksPerCacheLine; i++) {
+ for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
// Copy and CRC the data for the CRC regions.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int j = 0; j < vec_regions; j++) {
+ for (size_t j = 0; j < vec_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
- int region = (j + i) % kRegions;
+ size_t region = (j + i) % kRegions;
- auto* src =
+ auto* vsrc =
reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
- auto* dst =
+ auto* vdst =
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
// Load and CRC data.
- vec_data[j] = _mm_loadu_si128(src + i);
- crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- _mm_extract_epi64(vec_data[j], 0)))};
- crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- _mm_extract_epi64(vec_data[j], 1)))};
+ vec_data[j] = _mm_loadu_si128(vsrc + i);
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
// Store the data.
- _mm_store_si128(dst + i, vec_data[j]);
+ _mm_store_si128(vdst + i, vec_data[j]);
}
// Preload the partial CRCs for the CLMUL subregions.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int j = 0; j < int_regions; j++) {
+ for (size_t j = 0; j < int_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
- int region = (j + vec_regions + i) % kRegions;
+ size_t region = (j + vec_regions + i) % kRegions;
auto* usrc =
reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
auto* udst =
reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int k = 0; k < kIntLoadsPerVec; k++) {
- int data_index = j * kIntLoadsPerVec + k;
+ for (size_t k = 0; k < kIntLoadsPerVec; k++) {
+ size_t data_index = j * kIntLoadsPerVec + k;
// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
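The (j + i) % kRegions and (j + vec_regions + i) % kRegions indices above rotate which region is handled by the vector unit on each block of a cache line, so every region alternates between the SSE path and the integer path. A toy program illustrating the rotation, assuming a hypothetical one-vector/two-integer configuration:

#include <cstdio>

int main() {
  constexpr int kVecRegions = 1;          // hypothetical
  constexpr int kIntRegions = 2;          // hypothetical
  constexpr int kRegions = kVecRegions + kIntRegions;
  constexpr int kBlocksPerCacheLine = 4;  // 64-byte cache line / 16-byte block
  for (int i = 0; i < kBlocksPerCacheLine; ++i) {
    // j == 0 is the lone vector region in this example; the rest are integer.
    std::printf("block %d: vector path -> region %d; integer path -> regions", i,
                (0 + i) % kRegions);
    for (int j = 0; j < kIntRegions; ++j) {
      std::printf(" %d", (j + kVecRegions + i) % kRegions);
    }
    std::printf("\n");
  }
  return 0;
}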
@@ -339,13 +317,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
// XOR done before doing block copy + CRCs.
- for (int i = 0; i < kRegions - 1; i++) {
- crcs[i] = crcs[i] ^ kCrcDataXor;
+ for (size_t i = 0; i + 1 < kRegions; i++) {
+ crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
}
// Build a CRC of the first kRegions - 1 regions.
crc32c_t full_crc = crcs[0];
- for (int i = 1; i < kRegions - 1; i++) {
+ for (size_t i = 1; i + 1 < kRegions; i++) {
full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
}
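The loop above stitches the independently computed per-region CRCs back together with ConcatCrc32c (the helper already referenced by this file, declared in absl/crc/crc32c.h), passing the length of the region being appended. A hedged sketch of that combination step; CombineRegionCrcs is an illustrative helper, not part of this file, and assumes at least one region and equally sized regions, as holds here:

#include <cstddef>
#include "absl/crc/crc32c.h"

// Merge CRCs of n adjacent, equally sized regions into one CRC of the whole
// span, left to right, mirroring the loop over crcs[0 .. kRegions-2] above.
inline absl::crc32c_t CombineRegionCrcs(const absl::crc32c_t* crcs,
                                        std::size_t n,
                                        std::size_t region_size) {
  absl::crc32c_t full_crc = crcs[0];
  for (std::size_t i = 1; i < n; ++i) {
    full_crc = absl::ConcatCrc32c(full_crc, crcs[i], region_size);
  }
  return full_crc;
}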
@@ -360,7 +338,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
crcs[kRegions - 1]);
// Finalize and concatenate the final CRC, then return.
- crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor;
+ crcs[kRegions - 1] =
+ crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}