author    David Tellenbach <david.tellenbach@me.com>  2021-01-21 21:11:57 +0000
committer David Tellenbach <david.tellenbach@me.com>  2021-01-21 21:11:57 +0000
commit    65e2169c4521660d30f4d90df61da5f3dd9f45bd (patch)
tree      dcea1ca38e2a77b630dd7159dace922f9169c40d
parent    b2126fd6b5e232d072ceadb1abb6695ae3352e2e (diff)
Add support for Arm SVE
This patch adds support for Arm's new vector extension SVE (Scalable Vector Extension). In contrast to other vector extensions supported by Eigen, SVE types are inherently *sizeless*. For use in Eigen we fix their size at compile time (note that this is not necessary in general; SVE is *length agnostic*).

During compilation the flag `-msve-vector-bits=N` has to be set, where `N` is a power of two in the range `128` to `2048`, indicating the length of an SVE vector in bits. Since SVE is rather young, we decided to disable it by default even if it is available; a user has to enable it explicitly by defining `EIGEN_ARM64_USE_SVE`.

This patch introduces the packet types `PacketXf` and `PacketXi` for packets of `float` and `int32_t`, respectively. The size of these packets depends on the SVE vector length; e.g. if `-msve-vector-bits=512` is set, `PacketXf` will contain `512/32 = 16` elements.

This MR is joint work with Miguel Tairum <miguel.tairum@arm.com>.
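As a usage sketch (illustrative values, not part of the patch), a fixed 512-bit vector length makes the packet sizes compile-time constants:

    // Sketch: assumes a build with -march=armv8-a+sve -msve-vector-bits=512
    // -DEIGEN_ARM64_USE_SVE. The 512 below mirrors the hypothetical flag value.
    #include <Eigen/Core>
    static_assert(Eigen::internal::packet_traits<float>::size == 512 / 32,
                  "PacketXf holds VL/32 = 16 floats");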
-rw-r--r--  Eigen/Core                                    |   4
-rw-r--r--  Eigen/src/Core/arch/SVE/MathFunctions.h       |  44
-rw-r--r--  Eigen/src/Core/arch/SVE/PacketMath.h          | 756
-rw-r--r--  Eigen/src/Core/arch/SVE/TypeCasting.h         |  49
-rw-r--r--  Eigen/src/Core/util/ConfigureVectorization.h  |  55
-rw-r--r--  Eigen/src/Core/util/Constants.h               |   4
6 files changed, 894 insertions(+), 18 deletions(-)
diff --git a/Eigen/Core b/Eigen/Core
index ef778511c..4d9a3309c 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -208,6 +208,10 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/TypeCasting.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_SVE
+ #include "src/Core/arch/SVE/PacketMath.h"
+ #include "src/Core/arch/SVE/TypeCasting.h"
+ #include "src/Core/arch/SVE/MathFunctions.h"
#elif defined EIGEN_VECTORIZE_ZVECTOR
#include "src/Core/arch/ZVector/PacketMath.h"
#include "src/Core/arch/ZVector/MathFunctions.h"
diff --git a/Eigen/src/Core/arch/SVE/MathFunctions.h b/Eigen/src/Core/arch/SVE/MathFunctions.h
new file mode 100644
index 000000000..b139ea2e4
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/MathFunctions.h
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_SVE_H
+#define EIGEN_MATH_FUNCTIONS_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pexp<PacketXf>(const PacketXf& x) {
+ return pexp_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf plog<PacketXf>(const PacketXf& x) {
+ return plog_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf psin<PacketXf>(const PacketXf& x) {
+ return psin_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pcos<PacketXf>(const PacketXf& x) {
+ return pcos_float(x);
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf ptanh<PacketXf>(const PacketXf& x) {
+ return internal::generic_fast_tanh_float(x);
+}
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_SVE_H
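A minimal sketch of how these kernels get exercised (not part of the patch; the lane count assumes a hypothetical 128-bit vector length):

    // Element-wise exp on one SVE packet; assumes a build with
    // EIGEN_ARM64_USE_SVE and -msve-vector-bits=128, i.e. 4 float lanes.
    #include <Eigen/Core>

    void exp_demo(const float in[4], float out[4]) {
      using namespace Eigen::internal;
      PacketXf p = ploadu<PacketXf>(in);  // unaligned load of 4 floats
      pstoreu(out, pexp<PacketXf>(p));    // out[i] ~= std::exp(in[i])
    }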
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
new file mode 100644
index 000000000..98585e6a9
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -0,0 +1,756 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_SVE_H
+#define EIGEN_PACKET_MATH_SVE_H
+
+namespace Eigen
+{
+namespace internal
+{
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+
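+// Number of lanes in a packet: the fixed vector length in bits divided by the
+// bit width of the scalar type (e.g. a 512-bit VL gives 512/32 = 16 float lanes).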
+template <typename Scalar, int SVEVectorLength>
+struct sve_packet_size_selector {
+ enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
+};
+
+/********************************* int32 **************************************/
+typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+ typedef PacketXi type;
+ typedef PacketXi half; // Half not implemented yet
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+ HasHalfPacket = 0,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
+ HasSetLinear = 0,
+ HasBlend = 0,
+ HasReduxp = 0 // Not implemented in SVE
+ };
+};
+
+template <>
+struct unpacket_traits<PacketXi> {
+ typedef numext::int32_t type;
+ typedef PacketXi half; // Half not yet implemented
+ enum {
+ size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr)
+{
+ svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from)
+{
+ return svdup_n_s32(from);
+}
+
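+// Build {a, a+1, a+2, ...} by adding a linear index ramp to the broadcast of a.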
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a)
+{
+ numext::int32_t c[packet_traits<numext::int32_t>::size];
+ for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
+ return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svadd_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svsub_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a)
+{
+ return svneg_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a)
+{
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svmul_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svdiv_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c)
+{
+ return svmla_s32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svmin_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svmax_s32_z(svptrue_b32(), a, b);
+}
+
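+// Integer comparisons in SVE return an svbool_t predicate; expand it to a full
+// vector by setting active lanes to all-ones (0xffffffffu) and inactive lanes to 0.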
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/)
+{
+ return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/)
+{
+ return svdup_n_s32_z(svptrue_b32(), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svand_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svorr_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return sveor_s32_z(svptrue_b32(), a, b);
+}
+
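+// pandnot(a, b) computes a & ~b, which maps directly onto SVE's BIC instruction.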
+template <>
+EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+ return svbic_s32_z(svptrue_b32(), a, b);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
+{
+ return svasrd_n_s32_z(svptrue_b32(), a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
+{
+ return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
+{
+ return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from)
+{
+ EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from)
+{
+ EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
+{
+ svuint32_t indices = svindex_u32(0, 1); // linear indices {0, 1, 2, ...}
+ indices = svzip1_u32(indices, indices); // zip with itself: {0, 0, 1, 1, 2, 2, ...}, so the gather yields {a0, a0, a1, a1, ...}
+ return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
+{
+ svuint32_t indices = svindex_u32(0, 1); // linear indices {0, 1, 2, ...}
+ indices = svzip1_u32(indices, indices); // {0, 0, 1, 1, 2, 2, ...}
+ indices = svzip1_u32(indices, indices); // {0, 0, 0, 0, 1, 1, 1, 1, ...}, so the gather yields {a0, a0, a0, a0, a1, ...}
+ return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+ EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+ EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride)
+{
+ // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+ svint32_t indices = svindex_s32(0, stride);
+ return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
+{
+ // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+ svint32_t indices = svindex_s32(0, stride);
+ svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
+{
+ // svlasta returns the first element if all predicate bits are 0
+ return svlasta_s32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a)
+{
+ return svrev_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a)
+{
+ return svabs_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a)
+{
+ return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a)
+{
+ EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+ EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+
+ // Multiply the vector by its reverse
+ svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
+ svint32_t half_prod;
+
+ // Extract the high half of the vector. Depending on the VL, more reduction steps are needed
+ if (EIGEN_ARM64_SVE_VL >= 2048) {
+ half_prod = svtbl_s32(prod, svindex_u32(32, 1));
+ prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+ }
+ if (EIGEN_ARM64_SVE_VL >= 1024) {
+ half_prod = svtbl_s32(prod, svindex_u32(16, 1));
+ prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+ }
+ if (EIGEN_ARM64_SVE_VL >= 512) {
+ half_prod = svtbl_s32(prod, svindex_u32(8, 1));
+ prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+ }
+ if (EIGEN_ARM64_SVE_VL >= 256) {
+ half_prod = svtbl_s32(prod, svindex_u32(4, 1));
+ prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+ }
+ // Last reduction
+ half_prod = svtbl_s32(prod, svindex_u32(2, 1));
+ prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+
+ // The reduction result is now in the first element
+ return pfirst<PacketXi>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a)
+{
+ return svminv_s32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a)
+{
+ return svmaxv_s32(svptrue_b32(), a);
+}
+
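+// Transpose N packets by scattering each packet into a buffer with stride N
+// (packet i fills every N-th slot starting at offset i), then reloading the
+// buffer in contiguous runs of one packet length.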
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
+ int buffer[packet_traits<numext::int32_t>::size * N] = {0};
+ int i = 0;
+
+ PacketXi stride_index = svindex_s32(0, N);
+
+ for (i = 0; i < N; i++) {
+ svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+ }
+ for (i = 0; i < N; i++) {
+ kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
+ }
+}
+
+/********************************* float32 ************************************/
+
+typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+ typedef PacketXf type;
+ typedef PacketXf half;
+
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+ HasHalfPacket = 0,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
+ HasSetLinear = 0,
+ HasBlend = 0,
+ HasReduxp = 0, // Not implemented in SVE
+
+ HasDiv = 1,
+ HasFloor = 1,
+
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasLog = 1,
+ HasExp = 1,
+ HasSqrt = 0,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH
+ };
+};
+
+template <>
+struct unpacket_traits<PacketXf> {
+ typedef float type;
+ typedef PacketXf half; // Half not yet implemented
+ typedef PacketXi integer_packet;
+
+ enum {
+ size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from)
+{
+ return svdup_n_f32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
+{
+ return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a)
+{
+ float c[packet_traits<float>::size];
+ for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
+ return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svadd_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svsub_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a)
+{
+ return svneg_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a)
+{
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svmul_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svdiv_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
+{
+ return svmla_f32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svmin_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return pmin<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svminnm_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svmax_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return pmax<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svmaxnm_f32_z(svptrue_b32(), a, b);
+}
+
+// Float comparisons in SVE return an svbool_t (predicate). Use svdup_n_u32_z to
+// set active lanes to all-ones (0xffffffffu) and inactive lanes to 0.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+// Invert the predicate produced by the greater-than-or-equal comparison
+// (svcmpge_f32) with svnot_b_z, then fill the active lanes of a float vector
+// with all-ones. The result is true where a < b or where either input is NaN.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a)
+{
+ return svrintm_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/)
+{
+ return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
+}
+
+// Bitwise operations are not defined for floats in SVE, so we use reinterpret
+// casts to and from unsigned integers
+template <>
+EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+ return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from)
+{
+ EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from)
+{
+ EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
+{
+ svuint32_t indices = svindex_u32(0, 1); // linear indices {0, 1, 2, ...}
+ indices = svzip1_u32(indices, indices); // zip with itself: {0, 0, 1, 1, 2, 2, ...}, so the gather yields {a0, a0, a1, a1, ...}
+ return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
+{
+ svuint32_t indices = svindex_u32(0, 1); // linear indices {0, 1, 2, ...}
+ indices = svzip1_u32(indices, indices); // {0, 0, 1, 1, 2, 2, ...}
+ indices = svzip1_u32(indices, indices); // {0, 0, 0, 0, 1, 1, 1, 1, ...}, so the gather yields {a0, a0, a0, a0, a1, ...}
+ return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
+{
+ EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
+{
+ EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride)
+{
+ // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+ svint32_t indices = svindex_s32(0, stride);
+ return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
+{
+ // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+ svint32_t indices = svindex_s32(0, stride);
+ svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
+{
+ // svlasta returns the first element if all predicate bits are 0
+ return svlasta_f32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a)
+{
+ return svrev_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a)
+{
+ return svabs_f32_z(svptrue_b32(), a);
+}
+
+// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
+// all vector extensions and the generic version.
+template <>
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
+{
+ return pfrexp_float(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a)
+{
+ return svaddv_f32(svptrue_b32(), a);
+}
+
+// Other reduction functions:
+// mul
+// Only works for SVE vector lengths that are a multiple of 128 bits
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
+{
+ EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+ EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+ // Multiply the vector by its reverse
+ svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
+ svfloat32_t half_prod;
+
+ // Extract the high half of the vector. Depending on the VL, more reduction steps are needed
+ if (EIGEN_ARM64_SVE_VL >= 2048) {
+ half_prod = svtbl_f32(prod, svindex_u32(32, 1));
+ prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+ }
+ if (EIGEN_ARM64_SVE_VL >= 1024) {
+ half_prod = svtbl_f32(prod, svindex_u32(16, 1));
+ prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+ }
+ if (EIGEN_ARM64_SVE_VL >= 512) {
+ half_prod = svtbl_f32(prod, svindex_u32(8, 1));
+ prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+ }
+ if (EIGEN_ARM64_SVE_VL >= 256) {
+ half_prod = svtbl_f32(prod, svindex_u32(4, 1));
+ prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+ }
+ // Last reduction
+ half_prod = svtbl_f32(prod, svindex_u32(2, 1));
+ prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+
+ // The reduction result is now in the first element
+ return pfirst<PacketXf>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a)
+{
+ return svminv_f32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a)
+{
+ return svmaxv_f32(svptrue_b32(), a);
+}
+
+template<int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
+{
+ float buffer[packet_traits<float>::size * N] = {0};
+ int i = 0;
+
+ PacketXi stride_index = svindex_s32(0, N);
+
+ for (i = 0; i < N; i++) {
+ svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+ }
+
+ for (i = 0; i < N; i++) {
+ kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
+ }
+}
+
+template<>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
+{
+ return pldexp_float(a, exponent);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_SVE_H
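The horizontal reductions above collapse a packet to a scalar; a short usage sketch (not part of the patch, assuming 4 float lanes at a hypothetical 128-bit VL):

    // Horizontal sum and max of one packet.
    #include <Eigen/Core>

    float redux_demo(const float v[4]) {
      using namespace Eigen::internal;
      PacketXf p = ploadu<PacketXf>(v);
      return predux<PacketXf>(p) + predux_max<PacketXf>(p);  // sum(v) + max(v)
    }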
diff --git a/Eigen/src/Core/arch/SVE/TypeCasting.h b/Eigen/src/Core/arch/SVE/TypeCasting.h
new file mode 100644
index 000000000..7ba5d9cd1
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/TypeCasting.h
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SVE_H
+#define EIGEN_TYPE_CASTING_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
+ return svcvt_f32_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
+ return svcvt_s32_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
+ return svreinterpret_f32_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
+ return svreinterpret_s32_f32(a);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SVE_H
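A conversion sketch for the casts above (not part of the patch, assuming 4 lanes at a hypothetical 128-bit VL):

    // int32 -> float conversion on whole packets; pcast uses svcvt under the
    // hood, while preinterpret is a pure bit-level reinterpretation.
    #include <Eigen/Core>

    void cast_demo(const Eigen::numext::int32_t in[4], float out[4]) {
      using namespace Eigen::internal;
      PacketXi pi = ploadu<PacketXi>(in);
      pstoreu(out, pcast<PacketXi, PacketXf>(pi));  // {1, 2, ...} -> {1.f, 2.f, ...}
    }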
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index f07a284f6..af4e69623 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2020, Arm Limited and Contributors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -384,34 +385,50 @@
#undef vector
#undef pixel
- #elif (defined __ARM_NEON) || (defined __ARM_NEON__)
+ #elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_NEON
#include <arm_neon.h>
- #elif (defined __s390x__ && defined __VEC__)
+ // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
+ // will not select the backend automatically
+ #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)
#define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_ZVECTOR
- #include <vecintrin.h>
+ #define EIGEN_VECTORIZE_SVE
+ #include <arm_sve.h>
+
+ // Since we depend on knowing SVE vector lengths at compile-time, we need
+ // to ensure a fixed length is set
+ #if defined __ARM_FEATURE_SVE_BITS
+ #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
+ #else
+ #error "Eigen requires a fixed SVE vector length, but __ARM_FEATURE_SVE_BITS is not set. Pass -msve-vector-bits=<N> to the compiler."
+ #endif
- #elif defined __mips_msa
+#elif (defined __s390x__ && defined __VEC__)
- // Limit MSA optimizations to little-endian CPUs for now.
- // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
- #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
- #if defined(__LP64__)
- #define EIGEN_MIPS_64
- #else
- #define EIGEN_MIPS_32
- #endif
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_MSA
- #include <msa.h>
- #endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ZVECTOR
+#include <vecintrin.h>
- #endif
+#elif defined __mips_msa
+
+// Limit MSA optimizations to little-endian CPUs for now.
+// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if defined(__LP64__)
+#define EIGEN_MIPS_64
+#else
+#define EIGEN_MIPS_32
+#endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_MSA
+#include <msa.h>
+#endif
+
+#endif
#endif
// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
@@ -478,6 +495,8 @@ inline static const char *SimdInstructionSetsInUse(void) {
return "VSX";
#elif defined(EIGEN_VECTORIZE_NEON)
return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_SVE)
+ return "ARM SVE";
#elif defined(EIGEN_VECTORIZE_ZVECTOR)
return "S390X ZVECTOR";
#elif defined(EIGEN_VECTORIZE_MSA)
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index ad9af5727..f7f907ab7 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -3,6 +3,7 @@
//
// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2020, Arm Limited and Contributors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -474,6 +475,7 @@ namespace Architecture
VSX = 0x3,
NEON = 0x4,
MSA = 0x5,
+ SVE = 0x6,
#if defined EIGEN_VECTORIZE_SSE
Target = SSE
#elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -482,6 +484,8 @@ namespace Architecture
Target = VSX
#elif defined EIGEN_VECTORIZE_NEON
Target = NEON
+#elif defined EIGEN_VECTORIZE_SVE
+ Target = SVE
#elif defined EIGEN_VECTORIZE_MSA
Target = MSA
#else
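A quick runtime check of the new backend selection (sketch; assumes an SVE-enabled build as described in the commit message):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      std::cout << Eigen::SimdInstructionSetsInUse() << "\n";  // "ARM SVE"
      // Architecture::Target now resolves to Architecture::SVE.
      return Eigen::Architecture::Target == Eigen::Architecture::SVE ? 0 : 1;
    }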