summaryrefslogtreecommitdiff
path: root/absl/crc/internal/non_temporal_arm_intrinsics.h
diff options
context:
space:
mode:
Diffstat (limited to 'absl/crc/internal/non_temporal_arm_intrinsics.h')
-rw-r--r--absl/crc/internal/non_temporal_arm_intrinsics.h77
1 files changed, 77 insertions, 0 deletions
diff --git a/absl/crc/internal/non_temporal_arm_intrinsics.h b/absl/crc/internal/non_temporal_arm_intrinsics.h
new file mode 100644
index 00000000..92632a33
--- /dev/null
+++ b/absl/crc/internal/non_temporal_arm_intrinsics.h
@@ -0,0 +1,77 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
+#define ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
+
+#ifdef __aarch64__
+#include <arm_neon.h>
+
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+static inline __attribute__((always_inline)) void _mm_sfence(void) {
+ __sync_synchronize();
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(
+ const __m128i *p) {
+ return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
+}
+
+// Stores the data in a to the address p without polluting the caches. If the
+// cache line containing address p is already in the cache, the cache will be
+// updated.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+static inline __attribute__((always_inline)) void _mm_stream_si128(__m128i *p,
+ __m128i a) {
+#if __has_builtin(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(a, p);
+#else
+ vst1q_s64((int64_t *)p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Sets the 16 signed 8-bit integer values.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+static inline __attribute__((always_inline)) __m128i _mm_set_epi8(
+ signed char b15, signed char b14, signed char b13, signed char b12,
+ signed char b11, signed char b10, signed char b9, signed char b8,
+ signed char b7, signed char b6, signed char b5, signed char b4,
+ signed char b3, signed char b2, signed char b1, signed char b0) {
+ int8_t __attribute__((aligned(16)))
+ data[16] = {(int8_t)b0, (int8_t)b1, (int8_t)b2, (int8_t)b3,
+ (int8_t)b4, (int8_t)b5, (int8_t)b6, (int8_t)b7,
+ (int8_t)b8, (int8_t)b9, (int8_t)b10, (int8_t)b11,
+ (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
+ return (__m128i)vld1q_s8(data);
+}
+#endif // __aarch64__
+
+#endif // ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_