Diffstat (limited to 'absl/crc/internal/crc32_x86_arm_combined_simd.h')
-rw-r--r-- | absl/crc/internal/crc32_x86_arm_combined_simd.h | 260
1 file changed, 260 insertions, 0 deletions
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
new file mode 100644
index 00000000..59d71fd4
--- /dev/null
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -0,0 +1,260 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
+#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
+
+#include <cstdint>
+
+#include "absl/base/config.h"
+
+// -------------------------------------------------------------------------
+// Many x86 and ARM machines have CRC acceleration hardware.
+// We can do a faster version of Extend() on such machines.
+// We define a translation layer for both x86 and ARM for ease of use and to
+// capture most of the performance gains.
+
+// We need CRC (part of SSE4.2) and PCLMULQDQ instructions.
+#if defined(__SSE4_2__) && defined(__PCLMUL__)
+
+#include <x86intrin.h>
+#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
+
+#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
+    defined(__ARM_FEATURE_CRC32) && defined(__ARM_NEON)
+
+#include <arm_acle.h>
+#include <arm_neon.h>
+#define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD
+
+#endif
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
+    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
+
+#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+using V128 = uint64x2_t;
+#else
+using V128 = __m128i;
+#endif
+
+// Starting with the initial value in |crc|, accumulates a CRC32 value for
+// unsigned integers of different sizes.
+uint32_t CRC32_u8(uint32_t crc, uint8_t v);
+
+uint32_t CRC32_u16(uint32_t crc, uint16_t v);
+
+uint32_t CRC32_u32(uint32_t crc, uint32_t v);
+
+uint32_t CRC32_u64(uint32_t crc, uint64_t v);
+
+// Loads 128 bits of integer data. |src| must be 16-byte aligned.
+V128 V128_Load(const V128* src);
+
+// Loads 128 bits of integer data. |src| does not need to be aligned.
+V128 V128_LoadU(const V128* src);
+
+// Polynomially multiplies the high 64 bits of |l| and |r|.
+V128 V128_PMulHi(const V128 l, const V128 r);
+
+// Polynomially multiplies the low 64 bits of |l| and |r|.
+V128 V128_PMulLow(const V128 l, const V128 r);
+
+// Polynomially multiplies the low 64 bits of |r| and the high 64 bits of |l|.
+V128 V128_PMul01(const V128 l, const V128 r);
+
+// Polynomially multiplies the low 64 bits of |l| and the high 64 bits of |r|.
+V128 V128_PMul10(const V128 l, const V128 r);
+
+// Produces the bitwise XOR of |l| and |r|.
+V128 V128_Xor(const V128 l, const V128 r);
+
+// Produces the bitwise AND of |l| and |r|.
+V128 V128_And(const V128 l, const V128 r);
+
+// Combines two 64-bit integers into one 128-bit vector; order is reversed:
+// dst[63:0] := |r|
+// dst[127:64] := |l|
+V128 V128_From2x64(const uint64_t l, const uint64_t r);
+
+// Shifts |l| right by |imm| bytes while shifting in zeros.
+template <int imm>
+V128 V128_ShiftRight(const V128 l);
+
+// Extracts a 32-bit integer from |l|, selected with |imm|.
+template <int imm>
+int V128_Extract32(const V128 l);
+
+// Extracts the low 64 bits from V128.
+int64_t V128_Low64(const V128 l);
+
+// Left-shifts packed 64-bit integers in |l| by |r|.
+V128 V128_ShiftLeft64(const V128 l, const V128 r);
+
+#endif
+
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
+
+inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
+  return _mm_crc32_u8(crc, v);
+}
+
+inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
+  return _mm_crc32_u16(crc, v);
+}
+
+inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
+  return _mm_crc32_u32(crc, v);
+}
+
+inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
+  return _mm_crc32_u64(crc, v);
+}
+
+inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
+
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+
+inline V128 V128_PMulHi(const V128 l, const V128 r) {
+  return _mm_clmulepi64_si128(l, r, 0x11);
+}
+
+inline V128 V128_PMulLow(const V128 l, const V128 r) {
+  return _mm_clmulepi64_si128(l, r, 0x00);
+}
+
+inline V128 V128_PMul01(const V128 l, const V128 r) {
+  return _mm_clmulepi64_si128(l, r, 0x01);
+}
+
+inline V128 V128_PMul10(const V128 l, const V128 r) {
+  return _mm_clmulepi64_si128(l, r, 0x10);
+}
+
+inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
+
+inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
+
+inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
+  return _mm_set_epi64x(l, r);
+}
+
+template <int imm>
+inline V128 V128_ShiftRight(const V128 l) {
+  return _mm_srli_si128(l, imm);
+}
+
+template <int imm>
+inline int V128_Extract32(const V128 l) {
+  return _mm_extract_epi32(l, imm);
+}
+
+inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
+
+inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
+  return _mm_sll_epi64(l, r);
+}
+
+#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+
+inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }
+
+inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
+  return __crc32ch(crc, v);
+}
+
+inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
+  return __crc32cw(crc, v);
+}
+
+inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
+  return __crc32cd(crc, v);
+}
+
+inline V128 V128_Load(const V128* src) {
+  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
+}
+
+inline V128 V128_LoadU(const V128* src) {
+  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
+}
+
+// Using inline assembly as clang does not generate the pmull2 instruction and
+// performance drops by 15-20%.
+// TODO(b/193678732): Investigate why the compiler decides not to generate
+// such instructions and why it becomes so much worse.
+inline V128 V128_PMulHi(const V128 l, const V128 r) {
+  uint64x2_t res;
+  __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
+                       : "=w"(res)
+                       : "w"(l), "w"(r));
+  return res;
+}
+
+inline V128 V128_PMulLow(const V128 l, const V128 r) {
+  return reinterpret_cast<V128>(vmull_p64(
+      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
+      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
+}
+
+inline V128 V128_PMul01(const V128 l, const V128 r) {
+  return reinterpret_cast<V128>(vmull_p64(
+      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
+      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
+}
+
+inline V128 V128_PMul10(const V128 l, const V128 r) {
+  return reinterpret_cast<V128>(vmull_p64(
+      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
+      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
+}
+
+inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
+
+inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
+
+inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
+  return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
+}
+
+template <int imm>
+inline V128 V128_ShiftRight(const V128 l) {
+  return vreinterpretq_u64_s8(
+      vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
+}
+
+template <int imm>
+inline int V128_Extract32(const V128 l) {
+  return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
+}
+
+inline int64_t V128_Low64(const V128 l) {
+  return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
+}
+
+inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
+  return vshlq_u64(l, r);
+}
+
+#endif
+
+}  // namespace crc_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
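For orientation, below is a minimal sketch of how the scalar CRC32 helpers declared in this header might be consumed. The function ExtendWithHardwareCrc and its 8-byte/byte-wise loop structure are illustrative assumptions, not part of this commit; the real Extend() path in absl/crc also handles the usual bit-inversion conventions and additionally uses the V128 carryless-multiply helpers (V128_PMulLow, V128_PMulHi, etc.) on larger blocks.

#include <cstddef>
#include <cstdint>
#include <cstring>

#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"

namespace {

// Illustrative only: accumulates a hardware CRC32 over |n| bytes starting at
// |data|, 8 bytes at a time with a byte-wise tail. When neither SIMD path is
// compiled in, it returns |crc| unchanged; a real caller would dispatch to a
// table-based implementation instead.
uint32_t ExtendWithHardwareCrc(uint32_t crc, const uint8_t* data, size_t n) {
#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
  while (n >= sizeof(uint64_t)) {
    uint64_t word;
    std::memcpy(&word, data, sizeof(word));  // unaligned-safe 64-bit load
    crc = absl::crc_internal::CRC32_u64(crc, word);
    data += sizeof(word);
    n -= sizeof(word);
  }
  while (n > 0) {
    crc = absl::crc_internal::CRC32_u8(crc, *data++);
    --n;
  }
#else
  (void)data;
  (void)n;
#endif
  return crc;
}

}  // namespace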