From 87524922dcb1e23eccccdf8ff6df2a1b9bee6308 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Oct 2014 18:08:50 +0000
Subject: check for __ARM_NEON instead as it's defined in arm64 as well

---
 Eigen/Core | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/Core b/Eigen/Core
index adab50b4a..2174c6bf8 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -180,7 +180,7 @@
     #undef bool
     #undef vector
     #undef pixel
-  #elif defined __ARM_NEON__
+  #elif defined __ARM_NEON
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_NEON
     #include <arm_neon.h>
--
cgit v1.2.3


From 0f65f2762de6f2da4d88d19175560a1a2788ebd7 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Oct 2014 18:10:01 +0000
Subject: add EIGEN_TEST_NEON64, but it's a dummy, AArch64 implies NEON support
 so no extra CXXFLAGS are needed

---
 CMakeLists.txt           | 8 ++++++++
 cmake/EigenTesting.cmake | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea42cc8db..05d92babe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -231,6 +231,14 @@ if(NOT MSVC)
     message(STATUS "Enabling NEON in tests/examples")
   endif()
 
+  option(EIGEN_TEST_NEON64 "Enable/Disable Neon in tests/examples" OFF)
+  if(EIGEN_TEST_NEON64)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    message(STATUS "Enabling NEON in tests/examples")
+  endif()
+
+
+
   check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
   if(COMPILER_SUPPORT_OPENMP)
     option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 3ed002a87..3eb4e4c07 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -294,6 +294,12 @@ macro(ei_testing_print_summary)
       message(STATUS "ARM NEON: Using architecture defaults")
     endif()
 
+    if(EIGEN_TEST_NEON64)
+      message(STATUS "ARMv8 NEON: ON")
+    else()
+      message(STATUS "ARMv8 NEON: Using architecture defaults")
+    endif()
+
   endif() # vectorization / alignment options
 
   message(STATUS "\n${EIGEN_TESTING_SUMMARY}")
@@ -424,6 +430,8 @@ macro(ei_get_cxxflags VAR)
   ei_is_64bit_env(IS_64BIT_ENV)
   if(EIGEN_TEST_NEON)
     set(${VAR} NEON)
+  elseif(EIGEN_TEST_NEON64)
+    set(${VAR} NEON)
   elseif(EIGEN_TEST_VSX)
     set(${VAR} VSX)
   elseif(EIGEN_TEST_ALTIVEC)
--
cgit v1.2.3
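A quick note on the two build patches above: __ARM_NEON (without trailing underscores) is the ACLE macro that both 32-bit ARM and AArch64 compilers define when Advanced SIMD is available, while the older __ARM_NEON__ spelling is generally not defined for AArch64 targets; and since AArch64 has no -mfpu=neon style switch, EIGEN_TEST_NEON64 only needs to toggle the test configuration, not the compiler flags. The following standalone probe is not part of the patch series (file name and messages are made up) but shows how the macro check behaves:

// neon_probe.cpp -- compile with any 32-bit ARM or AArch64 C++ compiler.
#include <cstdio>

int main()
{
#if defined(__ARM_NEON)
  // Defined on both 32-bit ARM (with NEON enabled) and on AArch64,
  // which is why Eigen/Core now tests this spelling.
  std::printf("NEON/Advanced SIMD available\n");
#elif defined(__ARM_NEON__)
  // Legacy spelling: typically only seen on 32-bit ARM toolchains.
  std::printf("NEON available (legacy macro only)\n");
#else
  std::printf("NEON not available\n");
#endif
  return 0;
}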
From b50861939217d2c6c92362e2eea59866b5386b33 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Oct 2014 18:10:33 +0000
Subject: working 64-bit support in PacketMath.h, Complex.h needed

---
 Eigen/src/Core/arch/NEON/PacketMath.h | 174 +++++++++++++++++++++++++++++++++-
 1 file changed, 173 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 0504c095c..e3a13d93a 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -20,10 +20,18 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
+#ifndef EIGEN_HAS_FUSED_MADD
+#define EIGEN_HAS_FUSED_MADD 1
+#endif
+
+#ifndef EIGEN_HAS_FUSE_CJMADD
+#define EIGEN_HAS_FUSE_CJMADD 1
+#endif
+
 // FIXME NEON has 16 quad registers, but since the current register allocator
 // is so bad, it is much better to reduce it to 8
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
 
 typedef float32x4_t Packet4f;
@@ -71,6 +79,7 @@ template<> struct packet_traits<float>  : default_packet_traits
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
+    HasHalfPacket=0,
 
     HasDiv  = 1,
     // FIXME check the Has*
@@ -136,6 +145,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
+#ifndef __aarch64__
   Packet4f inv, restep, div;
 
   // NEON does not offer a divide instruction, we have to do a reciprocal approximation
@@ -154,7 +164,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
   div = vmulq_f32(a, inv);
 
   return div;
+#else
+  return vdivq_f32(a,b);
+#endif
 }
+
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { eigen_assert(false && "packet integer division are not supported by NEON");
   return pset1<Packet4i>(0);
@@ -472,6 +486,164 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
 }
 
+//---------- double ----------
+#ifdef __aarch64__
+
+typedef float64x2_t Packet2d;
+
+template<> struct packet_traits<double>  : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket=0,
+
+    HasDiv  = 1,
+    // FIXME check the Has*
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a)
+{
+  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
+  return vaddq_f64(pset1<Packet2d>(a), countdown);
+}
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+
+// for some weird reasons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
+{
+  return vld1q_dup_f64(from);
+}
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride)
+{
+  Packet2d res;
+  res = vsetq_lane_f64(from[0*stride], res, 0);
+  res = vsetq_lane_f64(from[1*stride], res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride)
+{
+  to[stride*0] = vgetq_lane_f64(from, 0);
+  to[stride*1] = vgetq_lane_f64(from, 1);
+}
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+
+// FIXME only store the 2 first elements ?
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  float64x2_t trn1, trn2;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  trn1 = vzip1q_f64(vecs[0], vecs[1]);
+  trn2 = vzip2q_f64(vecs[0], vecs[1]);
+
+  // Do the addition of the resulting vectors
+  return vaddq_f64(trn1, trn2);
+}
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); }
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
+
+// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
+// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet2d,vextq_f64)
+PALIGN_NEON(1,Packet2d,vextq_f64)
+#undef PALIGN_NEON
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
+}
+#endif // __aarch64__
+
 } // end namespace internal
 
 } // end namespace Eigen
--
cgit v1.2.3
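The Packet2d block added above maps Eigen's packet primitives directly onto AArch64 Advanced SIMD intrinsics (vaddq_f64, vdivq_f64, vminq_f64, ...), and both preduxp<Packet2d> and the new ptranspose(PacketBlock<Packet2d,2>&) lean on the vzip1q_f64/vzip2q_f64 pair to transpose a 2x2 block. A minimal standalone sketch of that interleave idiom, assuming an AArch64 toolchain (it is not part of the patch):

// zip_transpose.cpp -- the 2x2 transpose idiom used by preduxp/ptranspose.
#include <arm_neon.h>
#include <cstdio>

int main()
{
  const double row0[2] = {1.0, 2.0};
  const double row1[2] = {3.0, 4.0};
  float64x2_t a = vld1q_f64(row0);      // [1 2]
  float64x2_t b = vld1q_f64(row1);      // [3 4]

  float64x2_t c0 = vzip1q_f64(a, b);    // interleave low lanes  -> [1 3]
  float64x2_t c1 = vzip2q_f64(a, b);    // interleave high lanes -> [2 4]

  // c0/c1 are the columns of the 2x2 block; adding them is exactly the
  // horizontal reduction preduxp performs on two packets at once.
  float64x2_t sums = vaddq_f64(c0, c1); // [1+2, 3+4]
  std::printf("%g %g\n", vgetq_lane_f64(sums, 0), vgetq_lane_f64(sums, 1));
  return 0;
}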
From fae4fd7a26ee31fbf1eaa5d3581916ccee3c004f Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Wed, 22 Oct 2014 07:39:49 +0000
Subject: Added ARMv8 support

---
 Eigen/src/Core/arch/NEON/Complex.h    | 194 +++++++++++++++++++++++++++++++++-
 Eigen/src/Core/arch/NEON/PacketMath.h |  16 +--
 2 files changed, 201 insertions(+), 9 deletions(-)

diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 42e7733d7..57de400e5 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -33,6 +33,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -88,7 +89,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
 
 template<> EIGEN_STRONG_INLINE Packet2cf pand  <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf por  <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@@ -252,7 +253,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for AltiVec
+  // TODO optimize it for NEON
   Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
   Packet4f s, rev_s;
 
@@ -265,11 +266,198 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2cf,2>& kernel) {
-  float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
+  Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
   kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
 }
 
+//---------- double ----------
+#ifdef __aarch64__
+
+static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
+
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d v1, v2;
+
+  // Get the real values of a
+  v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
+  // Get the imag values of a
+  v2 = vdupq_lane_f64(vget_high_f64(a.v), 1);
+  // Multiply the real a with b
+  v1 = vmulq_f64(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f64(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  v2 = preverse<Packet2d>(v2);
+  // Add and return the result
+  return Packet1cd(vaddq_f64(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand  <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd por  <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pxor  <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, DenseIndex stride)
+{
+  Packet2d res;
+  res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
+  res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
+  return Packet1cd(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, DenseIndex stride)
+{
+  to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
+}
+
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for NEON
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  Packet2d rev_s = preverse<Packet2d>(s);
+
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s)));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+#endif // __aarch64__
 
 } // end namespace internal
 
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index e3a13d93a..472f7c0fe 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -34,8 +34,10 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
 
+typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
 typedef int32x4_t   Packet4i;
+typedef int32x2_t   Packet2i;
 typedef uint32x4_t  Packet4ui;
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
@@ -74,12 +76,12 @@ typedef uint32x4_t  Packet4ui;
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
-  typedef Packet4f half;
+  typedef Packet2f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket=0,
+    HasHalfPacket=1,
 
     HasDiv  = 1,
     // FIXME check the Has*
@@ -93,11 +95,12 @@ template<> struct packet_traits<float>  : default_packet_traits
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
-  typedef Packet4i half;
+  typedef Packet2i half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size=4,
+    HasHalfPacket=1
     // FIXME check the Has*
   };
 };
@@ -490,16 +493,17 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
 #ifdef __aarch64__
 
 typedef float64x2_t Packet2d;
+typedef float64x1_t Packet1d;
 
 template<> struct packet_traits<double>  : default_packet_traits
 {
   typedef Packet2d type;
-  typedef Packet2d half;
+  typedef Packet1d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket=0,
+    HasHalfPacket=1,
 
     HasDiv  = 1,
     // FIXME check the Has*
--
cgit v1.2.3
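For reference, the pmul for Packet1cd added in the last patch builds a complex product by splatting the real and imaginary parts of the first operand, doing two full-width multiplies, negating one lane with p2ul_CONJ_XOR and swapping lanes before the final add. The sketch below reproduces that decomposition with raw Advanced SIMD intrinsics; it assumes an AArch64 compiler and uses vdupq_laneq_f64/vextq_f64 rather than the vget_low/high_f64 route taken in the patch, so it is an illustration of the idea, not the patch's exact code:

// cplx_mul.cpp -- lane-shuffle form of (1+2i)*(3+4i), expected result -5+10i.
#include <arm_neon.h>
#include <cstdio>

int main()
{
  const double a[2] = {1.0, 2.0};            // a = 1 + 2i, stored [re, im]
  const double b[2] = {3.0, 4.0};            // b = 3 + 4i
  float64x2_t av = vld1q_f64(a);
  float64x2_t bv = vld1q_f64(b);

  float64x2_t v1 = vdupq_laneq_f64(av, 0);   // [a.re, a.re]
  float64x2_t v2 = vdupq_laneq_f64(av, 1);   // [a.im, a.im]
  v1 = vmulq_f64(v1, bv);                    // [a.re*b.re, a.re*b.im]
  v2 = vmulq_f64(v2, bv);                    // [a.im*b.re, a.im*b.im]

  // Flip the sign of the high lane of v2 (the role of p2ul_CONJ_XOR),
  // then swap its lanes, giving [-a.im*b.im, a.im*b.re].
  uint64x2_t conj_xor = vcombine_u64(vcreate_u64(0x0ULL),
                                     vcreate_u64(0x8000000000000000ULL));
  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), conj_xor));
  v2 = vextq_f64(v2, v2, 1);

  float64x2_t r = vaddq_f64(v1, v2);         // [-5, 10] == -5 + 10i
  std::printf("%g + %gi\n", vgetq_lane_f64(r, 0), vgetq_lane_f64(r, 1));
  return 0;
}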