summaryrefslogtreecommitdiff
path: root/absl/crc/internal/non_temporal_arm_intrinsics.h
blob: 92632a3341c3c947a72c427c566cb418bf75adc8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
#define ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_

#ifdef __aarch64__
#include <arm_neon.h>

typedef int64x2_t __m128i; /* 128-bit vector containing integers */
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_s64_m128i(x) (x)

// Guarantees that every preceding store is globally visible before any
// subsequent store.
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
static inline __attribute__((always_inline)) void _mm_sfence(void) {
  __sync_synchronize();
}

// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
// may perform better than _mm_loadu_si128 when the data crosses a cache line
// boundary.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
#define _mm_lddqu_si128 _mm_loadu_si128

// Loads 128-bit value. :
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(
    const __m128i *p) {
  return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
}

// Stores the data in a to the address p without polluting the caches.  If the
// cache line containing address p is already in the cache, the cache will be
// updated.
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
static inline __attribute__((always_inline)) void _mm_stream_si128(__m128i *p,
                                                                   __m128i a) {
#if __has_builtin(__builtin_nontemporal_store)
  __builtin_nontemporal_store(a, p);
#else
  vst1q_s64((int64_t *)p, vreinterpretq_s64_m128i(a));
#endif
}

// Sets the 16 signed 8-bit integer values.
// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
static inline __attribute__((always_inline)) __m128i _mm_set_epi8(
    signed char b15, signed char b14, signed char b13, signed char b12,
    signed char b11, signed char b10, signed char b9, signed char b8,
    signed char b7, signed char b6, signed char b5, signed char b4,
    signed char b3, signed char b2, signed char b1, signed char b0) {
  int8_t __attribute__((aligned(16)))
  data[16] = {(int8_t)b0,  (int8_t)b1,  (int8_t)b2,  (int8_t)b3,
              (int8_t)b4,  (int8_t)b5,  (int8_t)b6,  (int8_t)b7,
              (int8_t)b8,  (int8_t)b9,  (int8_t)b10, (int8_t)b11,
              (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
  return (__m128i)vld1q_s8(data);
}
#endif  // __aarch64__

#endif  // ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_