summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--absl/crc/internal/non_temporal_memcpy.h35
1 files changed, 18 insertions, 17 deletions
diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h
index b3d94bad..7ae83bdc 100644
--- a/absl/crc/internal/non_temporal_memcpy.h
+++ b/absl/crc/internal/non_temporal_memcpy.h
@@ -19,19 +19,8 @@
#include <intrin.h>
#endif
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
-
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-
-#ifdef __SSE3__
-#include <pmmintrin.h>
-#endif
-
-#ifdef __AVX__
+#if defined(__SSE__) || defined(__AVX__)
+// Pulls in both SSE and AVX intrinsics.
#include <immintrin.h>
#endif
@@ -44,6 +33,7 @@
#include <cstdint>
#include <cstring>
+#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"
@@ -57,7 +47,9 @@ namespace crc_internal {
// memcpy can save 1 DRAM load of the destination cacheline.
constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;
-// If the objects overlap, the behavior is undefined.
+// If the objects overlap, the behavior is undefined. Uses regular memcpy
+// instead of non-temporal memcpy if the required CPU intrinsics are unavailable
+// at compile time.
inline void *non_temporal_store_memcpy(void *__restrict dst,
const void *__restrict src, size_t len) {
#if defined(__SSE3__) || defined(__aarch64__) || \
@@ -119,10 +111,20 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
#endif // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
}
+// If the objects overlap, the behavior is undefined. Uses regular memcpy
+// instead of non-temporal memcpy if the required CPU intrinsics are unavailable
+// at compile time.
+#if ABSL_HAVE_CPP_ATTRIBUTE(gnu::target) && \
+ (defined(__x86_64__) || defined(__i386__))
+[[gnu::target("avx")]]
+#endif
inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
const void *__restrict src,
size_t len) {
-#ifdef __AVX__
+ // This function requires AVX. For clang and gcc we compile it with AVX even
+ // if the translation unit isn't built with AVX support. This works because we
+ // only select this implementation at runtime if the CPU supports AVX.
+#if defined(__SSE3__) || (defined(_MSC_VER) && defined(__AVX__))
uint8_t *d = reinterpret_cast<uint8_t *>(dst);
const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
@@ -168,9 +170,8 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
}
return dst;
#else
- // Fallback to regular memcpy when AVX is not available.
return memcpy(dst, src, len);
-#endif // __AVX__
+#endif // __SSE3__ || (_MSC_VER && __AVX__)
}
} // namespace crc_internal