absl/crc/internal/crc_memcpy_x86_64.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434

// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Simultaneous memcopy and CRC-32C for x86-64.  Uses integer registers because
// XMM registers do not support the CRC instruction (yet).  While copying,
// compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C).  This file will do nothing if that is not true.
//
// The CRC instruction has a 3-byte latency, and we are stressing the ALU ports
// here (unlike a traditional memcopy, which has almost no ALU use), so we will
// need to copy in such a way that the CRC unit is used efficiently. We have two
// regimes in this code:
//  1. For operations of size < kCrcSmallSize, do the CRC then the memcpy
//  2. For operations of size > kCrcSmallSize:
//      a) compute an initial CRC + copy on a small amount of data to align the
//         destination pointer on a 16-byte boundary.
//      b) Split the data into 3 main regions and a tail (smaller than 48 bytes)
//      c) Do the copy and CRC of the 3 main regions, interleaving (start with
//         full cache line copies for each region, then move to single 16 byte
//         pieces per region).
//      d) Combine the CRCs with CRC32C::Concat.
//      e) Copy the tail and extend the CRC with the CRC of the tail.
// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat
// takes a significant amount of time.  A medium-sized approach could be added
// using 3 CRCs over fixed-size blocks where the zero-extensions required for
// CRC32C::Concat can be precomputed.

#ifdef __SSE4_2__
#include <immintrin.h>
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/optimization.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"

#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

namespace {

inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
                             crc32c_t crc) {
  // Small copy: just go 1 byte at a time: being nice to the branch predictor
  // is more important here than anything else
  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
  for (std::size_t i = 0; i < length; i++) {
    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
    crc_uint32 = _mm_crc32_u8(crc_uint32, data);
    *reinterpret_cast<uint8_t*>(dst) = data;
    ++src;
    ++dst;
  }
  return crc32c_t{crc_uint32};
}

constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);

// Common function for copying the tails of multiple large regions.
template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                          size_t region_size, size_t copy_rounds) {
  std::array<__m128i, vec_regions> data;
  std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;

  while (copy_rounds > 0) {
    for (size_t i = 0; i < vec_regions; i++) {
      size_t region = i;

      auto* vsrc =
          reinterpret_cast<const __m128i*>(*src + region_size * region);
      auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);

      // Load the blocks, unaligned
      data[i] = _mm_loadu_si128(vsrc);

      // Store the blocks, aligned
      _mm_store_si128(vdst, data[i]);

      // Compute the running CRC
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
    }

    for (size_t i = 0; i < int_regions; i++) {
      size_t region = vec_regions + i;

      auto* usrc =
          reinterpret_cast<const uint64_t*>(*src + region_size * region);
      auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);

      for (size_t j = 0; j < kIntLoadsPerVec; j++) {
        size_t data_index = i * kIntLoadsPerVec + j;

        int_data[data_index] = *(usrc + j);
        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
            static_cast<uint32_t>(crcs[region]), int_data[data_index]))};

        *(udst + j) = int_data[data_index];
      }
    }

    // Increment pointers
    *src += sizeof(__m128i);
    *dst += sizeof(__m128i);
    --copy_rounds;
  }
}

}  // namespace

template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
 public:
  AcceleratedCrcMemcpyEngine() = default;
  AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
  AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
      delete;

  crc32c_t Compute(void* __restrict dst, const void* __restrict src,
                   std::size_t length, crc32c_t initial_crc) const override;
};

template <size_t vec_regions, size_t int_regions>
crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
    void* __restrict dst, const void* __restrict src, std::size_t length,
    crc32c_t initial_crc) const {
  constexpr std::size_t kRegions = vec_regions + int_regions;
  constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
  constexpr std::size_t kBlockSize = sizeof(__m128i);
  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;

  // Number of blocks per cacheline.
  constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;

  char* dst_bytes = static_cast<char*>(dst);
  const char* src_bytes = static_cast<const char*>(src);

  // Make sure that one prefetch per big block is enough to cover the whole
  // dataset, and we don't prefetch too much.
  static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
                "Cache lines are not divided evenly into blocks, may have "
                "unintended behavior!");

  // Experimentally-determined boundary between a small and large copy.
  // Below this number, spin-up and concatenation of CRCs takes enough time that
  // it kills the throughput gains of using 3 regions and wide vectors.
  constexpr size_t kCrcSmallSize = 256;

  // Experimentally-determined prefetch distance.  Main loop copies will
  // prefeth data 2 cache lines ahead.
  constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;

  // Small-size CRC-memcpy : just do CRC + memcpy
  if (length < kCrcSmallSize) {
    crc32c_t crc =
        ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
    memcpy(dst, src, length);
    return crc;
  }

  // Start work on the CRC: undo the XOR from the previous calculation or set up
  // the initial value of the CRC.
  // initial_crc ^= kCrcDataXor;
  initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};

  // Do an initial alignment copy, so we can use aligned store instructions to
  // the destination pointer.  We align the destination pointer because the
  // penalty for an unaligned load is small compared to the penalty of an
  // unaligned store on modern CPUs.
  std::size_t bytes_from_last_aligned =
      reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
  if (bytes_from_last_aligned != 0) {
    std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;

    // Do the short-sized copy and CRC.
    initial_crc =
        ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
    src_bytes += bytes_for_alignment;
    dst_bytes += bytes_for_alignment;
    length -= bytes_for_alignment;
  }

  // We are going to do the copy and CRC in kRegions regions to make sure that
  // we can saturate the CRC unit.  The CRCs will be combined at the end of the
  // run.  Copying will use the SSE registers, and we will extract words from
  // the SSE registers to add to the CRC.  Initially, we run the loop one full
  // cache line per region at a time, in order to insert prefetches.

  // Initialize CRCs for kRegions regions.
  crc32c_t crcs[kRegions];
  crcs[0] = initial_crc;
  for (size_t i = 1; i < kRegions; i++) {
    crcs[i] = crc32c_t{kCrcDataXor};
  }

  // Find the number of rounds to copy and the region size.  Also compute the
  // tail size here.
  size_t copy_rounds = length / kCopyRoundSize;

  // Find the size of each region and the size of the tail.
  const std::size_t region_size = copy_rounds * kBlockSize;
  const std::size_t tail_size = length - (kRegions * region_size);

  // Holding registers for data in each region.
  std::array<__m128i, vec_regions> vec_data;
  std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;

  // Main loop.
  while (copy_rounds > kBlocksPerCacheLine) {
    // Prefetch kPrefetchAhead bytes ahead of each pointer.
    for (size_t i = 0; i < kRegions; i++) {
      absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
                                      region_size * i);
      absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
                                      region_size * i);
    }

    // Load and store data, computing CRC on the way.
    for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
      // Copy and CRC the data for the CRC regions.
      for (size_t j = 0; j < vec_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + i) % kRegions;

        auto* vsrc =
            reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
        auto* vdst =
            reinterpret_cast<__m128i*>(dst_bytes + region_size * region);

        // Load and CRC data.
        vec_data[j] = _mm_loadu_si128(vsrc + i);
        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
            static_cast<uint32_t>(crcs[region]),
            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
            static_cast<uint32_t>(crcs[region]),
            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};

        // Store the data.
        _mm_store_si128(vdst + i, vec_data[j]);
      }

      // Preload the partial CRCs for the CLMUL subregions.
      for (size_t j = 0; j < int_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + vec_regions + i) % kRegions;

        auto* usrc =
            reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
        auto* udst =
            reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);

        for (size_t k = 0; k < kIntLoadsPerVec; k++) {
          size_t data_index = j * kIntLoadsPerVec + k;

          // Load and CRC the data.
          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
          crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
              static_cast<uint32_t>(crcs[region]), int_data[data_index]))};

          // Store the data.
          *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
        }
      }
    }

    // Increment pointers
    src_bytes += kBlockSize * kBlocksPerCacheLine;
    dst_bytes += kBlockSize * kBlocksPerCacheLine;
    copy_rounds -= kBlocksPerCacheLine;
  }

  // Copy and CRC the tails of each region.
  LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
                                          region_size, copy_rounds);

  // Move the source and destination pointers to the end of the region
  src_bytes += region_size * (kRegions - 1);
  dst_bytes += region_size * (kRegions - 1);

  // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
  // XOR done before doing block copy + CRCs.
  for (size_t i = 0; i + 1 < kRegions; i++) {
    crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
  }

  // Build a CRC of the first kRegions - 1 regions.
  crc32c_t full_crc = crcs[0];
  for (size_t i = 1; i + 1 < kRegions; i++) {
    full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
  }

  // Copy and CRC the tail through the XMM registers.
  std::size_t tail_blocks = tail_size / kBlockSize;
  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
                      tail_blocks);

  // Final tail copy for under 16 bytes.
  crcs[kRegions - 1] =
      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
                   crcs[kRegions - 1]);

  // Finalize and concatenate the final CRC, then return.
  crcs[kRegions - 1] =
      crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
  return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}

CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
#ifdef UNDEFINED_BEHAVIOR_SANITIZER
  // UBSAN does not play nicely with unaligned loads (which we use a lot).
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    case CpuType::kUnknown:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          .temporal = new FallbackCrcMemcpyEngine(),
          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          .temporal = new FallbackCrcMemcpyEngine(),
          .non_temporal = new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {.temporal = new FallbackCrcMemcpyEngine(),
              .non_temporal = new FallbackCrcMemcpyEngine()};
  }
#else
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store port
    // which data movement from the vector registers to the integer registers
    // (where CRC32C happens) to crowd the same units as vector stores.  As a
    // result, using that path exclusively causes bottlenecking on this port.
    // We can avoid this bottleneck by using the integer side of the CPU for
    // most operations rather than the vector side.  We keep a vector region to
    // engage some of the prefetching logic in the cache hierarchy which seems
    // to give vector instructions special treatment.  These prefetch units see
    // strided access to each region, and do the right thing.
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
      return {
          .temporal = new AcceleratedCrcMemcpyEngine<1, 2>(),
          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
      };
    // PCLMULQDQ is slow and we don't have wide enough issue width to take
    // advantage of it.  For an unknown architecture, don't risk using CLMULs.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
          .non_temporal = new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {.temporal = new FallbackCrcMemcpyEngine(),
              .non_temporal = new FallbackCrcMemcpyEngine()};
  }
#endif  // UNDEFINED_BEHAVIOR_SANITIZER
}

// For testing, allow the user to specify which engine they want.
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
                                                          int integer) {
  if (vector == 3 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
  } else if (vector == 1 && integer == 2) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
  }
  return nullptr;
}

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE