Diffstat (limited to 'absl/crc/internal/non_temporal_memcpy.h')
-rw-r--r--  absl/crc/internal/non_temporal_memcpy.h  172
1 file changed, 172 insertions, 0 deletions
diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h
new file mode 100644
index 00000000..0c6d7655
--- /dev/null
+++ b/absl/crc/internal/non_temporal_memcpy.h
@@ -0,0 +1,172 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
+#define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "absl/base/config.h"
+#include "absl/base/optimization.h"
+
+#ifdef __SSE__
+// Only include if the target supports the SSE ISA, needed for _mm_sfence.
+#include <immintrin.h> // IWYU pragma: keep
+#endif
+#ifdef __SSE2__
+// Only include if the target supports the SSE2 ISA, needed for movdqa,
+// movdqu, and movntdq.
+#include <emmintrin.h> // IWYU pragma: keep
+#endif
+#ifdef __aarch64__
+// Only include if the target supports the ARM NEON ISA; this header provides
+// ARM equivalents of _mm_sfence, _mm_lddqu_si128, and _mm_stream_si128.
+#include "absl/crc/internal/non_temporal_arm_intrinsics.h"
+#endif
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+// This non-temporal memcpy copies memory with regular loads and non-temporal
+// stores. It works with both aligned and unaligned source and destination
+// addresses. If the data at the destination is not accessed again soon, using
+// a non-temporal memcpy can save one DRAM load of the destination cacheline.
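+//
+// Illustrative usage (a sketch; the buffer names below are hypothetical):
+//
+//   std::vector<char> staging(1 << 20);
+//   non_temporal_store_memcpy(staging.data(), source_bytes, staging.size());
+//
+// Because the destination is written with streaming stores, it is unlikely
+// to be cache-resident afterwards, so this helps only when the copied data
+// is not read again soon.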
+
+constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;
+
+// If the source and destination overlap, the behavior is undefined.
+// MSVC does not have proper header support for some of these intrinsics,
+// so MSVC builds fall back to plain memcpy.
+inline void *non_temporal_store_memcpy(void *__restrict dst,
+ const void *__restrict src, size_t len) {
+#if (defined(__SSE3__) || defined(__aarch64__)) && !defined(_MSC_VER)
+ uint8_t *d = reinterpret_cast<uint8_t *>(dst);
+ const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
+
+ // memcpy() the misaligned header. At the end of this if block, <d> is
+ // aligned to a 64-byte cacheline boundary or <len> == 0.
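+  // For example (illustrative values, assuming a 64-byte cacheline): if
+  // d == 0x1010, then d & 63 == 0x10, bytes_before_alignment_boundary == 48,
+  // and copying 48 header bytes advances d to the aligned address 0x1040.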
+ if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
+ uintptr_t bytes_before_alignment_boundary =
+ kCacheLineSize -
+ (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
+    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
+ assert(bytes_before_alignment_boundary < kCacheLineSize);
+ memcpy(d, s, header_len);
+ d += header_len;
+ s += header_len;
+ len -= header_len;
+ }
+
+ if (len >= kCacheLineSize) {
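+    // Non-temporal stores are weakly ordered with respect to other stores,
+    // so fence before and after the streaming loop to keep this copy ordered
+    // with surrounding memory operations.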
+ _mm_sfence();
+ __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
+ const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
+ constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
+ uint64_t loops = len / kCacheLineSize;
+
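+    // Each iteration copies one cacheline: four unaligned 16-byte loads
+    // followed by four aligned non-temporal 16-byte stores (the loop body
+    // assumes kOpsPerCacheLine == 4, i.e. a 64-byte cacheline).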
+ while (len >= kCacheLineSize) {
+ __m128i temp1, temp2, temp3, temp4;
+ temp1 = _mm_lddqu_si128(src_cacheline + 0);
+ temp2 = _mm_lddqu_si128(src_cacheline + 1);
+ temp3 = _mm_lddqu_si128(src_cacheline + 2);
+ temp4 = _mm_lddqu_si128(src_cacheline + 3);
+ _mm_stream_si128(dst_cacheline + 0, temp1);
+ _mm_stream_si128(dst_cacheline + 1, temp2);
+ _mm_stream_si128(dst_cacheline + 2, temp3);
+ _mm_stream_si128(dst_cacheline + 3, temp4);
+ src_cacheline += kOpsPerCacheLine;
+ dst_cacheline += kOpsPerCacheLine;
+ len -= kCacheLineSize;
+ }
+ d += loops * kCacheLineSize;
+ s += loops * kCacheLineSize;
+ _mm_sfence();
+ }
+
+ // memcpy the tail.
+ if (len) {
+ memcpy(d, s, len);
+ }
+ return dst;
+#else
+  // Fall back to regular memcpy when neither SSE3 nor aarch64 support is
+  // available.
+ return memcpy(dst, src, len);
+#endif // __SSE3__ || __aarch64__
+}
+
+// Same as non_temporal_store_memcpy, but uses 32-byte AVX loads and stores
+// when available. MSVC does not have proper header support for some of these
+// intrinsics, so MSVC builds fall back to plain memcpy.
+inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
+ const void *__restrict src,
+ size_t len) {
+#if defined(__AVX__) && !defined(_MSC_VER)
+ uint8_t *d = reinterpret_cast<uint8_t *>(dst);
+ const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
+
+ // memcpy() the misaligned header. At the end of this if block, <d> is
+ // aligned to a 64-byte cacheline boundary or <len> == 0.
+ if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
+ uintptr_t bytes_before_alignment_boundary =
+ kCacheLineSize -
+ (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
+    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
+ assert(bytes_before_alignment_boundary < kCacheLineSize);
+ memcpy(d, s, header_len);
+ d += header_len;
+ s += header_len;
+ len -= header_len;
+ }
+
+ if (len >= kCacheLineSize) {
+ _mm_sfence();
+ __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
+ const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
+ constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
+    size_t loops = len / kCacheLineSize;
+
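+    // Each iteration copies one cacheline: two unaligned 32-byte loads
+    // followed by two aligned non-temporal 32-byte stores (the loop body
+    // assumes kOpsPerCacheLine == 2, i.e. a 64-byte cacheline).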
+ while (len >= kCacheLineSize) {
+ __m256i temp1, temp2;
+ temp1 = _mm256_lddqu_si256(src_cacheline + 0);
+ temp2 = _mm256_lddqu_si256(src_cacheline + 1);
+ _mm256_stream_si256(dst_cacheline + 0, temp1);
+ _mm256_stream_si256(dst_cacheline + 1, temp2);
+ src_cacheline += kOpsPerCacheLine;
+ dst_cacheline += kOpsPerCacheLine;
+ len -= kCacheLineSize;
+ }
+ d += loops * kCacheLineSize;
+ s += loops * kCacheLineSize;
+ _mm_sfence();
+ }
+
+ // memcpy the tail.
+ if (len) {
+ memcpy(d, s, len);
+ }
+ return dst;
+#else
+  // Fall back to regular memcpy when AVX is not available.
+ return memcpy(dst, src, len);
+#endif // __AVX__
+}
+
+} // namespace crc_internal
+ABSL_NAMESPACE_END
+} // namespace absl
+
+#endif // ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_