diff options
-rw-r--r-- | CMakeLists.txt | 7 | ||||
-rw-r--r-- | Eigen/Core | 8 | ||||
-rw-r--r-- | Eigen/src/Core/MatrixStorage.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/PacketMath.h | 5 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CMakeLists.txt | 3 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/CMakeLists.txt | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 371 | ||||
-rw-r--r-- | Eigen/src/Core/util/Macros.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/util/Memory.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/util/XprHelper.h | 5 | ||||
-rw-r--r-- | cmake/EigenTesting.cmake | 8 | ||||
-rw-r--r-- | test/packetmath.cpp | 5 |
12 files changed, 423 insertions, 5 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index eefaf4b47..7162e4457 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,13 @@ if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") message("Enabling AltiVec in tests/examples") endif() + + option(EIGEN_TEST_NEON "Enable/Disable altivec in tests/examples" OFF) + if(EIGEN_TEST_NEON) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon -mcpu=cortex-a8") + message("Enabling NEON in tests/examples") + endif() + endif(CMAKE_COMPILER_IS_GNUCXX) if(MSVC) diff --git a/Eigen/Core b/Eigen/Core index f984a96c6..2908fef2e 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -111,6 +111,10 @@ #undef bool #undef vector #undef pixel + #elif defined __ARM_NEON__ + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_NEON + #include "arm_neon.h" #endif #endif @@ -165,6 +169,8 @@ inline static const char *SimdInstructionSetsInUse(void) { return "SSE, SSE2"; #elif defined(EIGEN_VECTORIZE_ALTIVEC) return "AltiVec"; +#elif defined(EIGEN_VECTORIZE_NEON) + return "ARM NEON"; #else return "None"; #endif @@ -204,6 +210,8 @@ struct Dense {}; #include "src/Core/arch/SSE/MathFunctions.h" #elif defined EIGEN_VECTORIZE_ALTIVEC #include "src/Core/arch/AltiVec/PacketMath.h" +#elif defined EIGEN_VECTORIZE_NEON + #include "src/Core/arch/NEON/PacketMath.h" #endif #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index 3303b2663..ece603ffa 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -50,6 +50,12 @@ struct ei_matrix_array ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} }; +// FIXME!!! This is a hack because ARM gcc does not honour __attribute__((aligned(16))) properly +#ifdef __ARM_NEON__ + #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT + #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT + #endif +#endif #ifdef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #else diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 1526a4b97..449de2078 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -169,6 +169,11 @@ template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) { return res; } +template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) +{ ei_assert(false && "packet integer division are not supported by AltiVec"); + return ei_pset1<int>(0); +} + template<> inline v4f ei_pmadd(const v4f& a, const v4f& b, const v4f& c) { return vec_madd(a, b, c); } template<> inline v4f ei_pmin(const v4f& a, const v4f& b) { return vec_min(a,b); } diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt index 8ddba284e..5470ed8f3 100644 --- a/Eigen/src/Core/arch/CMakeLists.txt +++ b/Eigen/src/Core/arch/CMakeLists.txt @@ -1,2 +1,3 @@ ADD_SUBDIRECTORY(SSE) -ADD_SUBDIRECTORY(AltiVec)
\ No newline at end of file +ADD_SUBDIRECTORY(AltiVec) +ADD_SUBDIRECTORY(NEON) diff --git a/Eigen/src/Core/arch/NEON/CMakeLists.txt b/Eigen/src/Core/arch/NEON/CMakeLists.txt new file mode 100644 index 000000000..fd4d4af50 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_NEON_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h new file mode 100644 index 000000000..9df485105 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -0,0 +1,371 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr> +// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr> +// Heavily based on Gael's SSE version. +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_PACKET_MATH_NEON_H +#define EIGEN_PACKET_MATH_NEON_H + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE +#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*96*96 +#endif + +typedef float32x4_t Packet4f; +typedef int32x4_t Packet4i; + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) + +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + +template<> struct ei_packet_traits<float> : ei_default_packet_traits +{ + typedef Packet4f type; enum {size=4}; + enum { + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; +}; +template<> struct ei_packet_traits<int> : ei_default_packet_traits +{ typedef Packet4i type; enum {size=4}; }; + +template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; +template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; + +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vdupq_n_s32(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) +{ + Packet4f countdown = { 3, 2, 1, 0 }; + return vaddq_f32(ei_pset1(a), countdown); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) +{ + Packet4i countdown = { 3, 2, 1, 0 }; + return vaddq_s32(ei_pset1(a), countdown); +} + +template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a) { return vnegq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a) { return vnegq_s32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + Packet4f inv, restep, div; + + // NEON does not offer a divide instruction, we have to do a reciprocal approximation + // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers + // a reciprocal estimate AND a reciprocal step -which saves a few instructions + // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with + // Newton-Raphson and vrecpsq_f32() + inv = vrecpeq_f32(b); + + // This returns a differential, by which we will have to multiply inv to get a better + // approximation of 1/b. + restep = vrecpsq_f32(b, inv); + inv = vmulq_f32(restep, inv); + + // Finally, multiply a by 1/b and get the wanted result of the division. + div = vmulq_f32(a, inv); + + return div; +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) +{ ei_assert(false && "packet integer division are not supported by NEON"); + return ei_pset1<int>(0); +} + +// for some weird raisons, it has to be overloaded for packet of integers +template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet4f ei_pand<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_por<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pxor<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } + +template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } + +template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } + +template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { + float32x2_t a_lo, a_hi; + Packet4f a_r64, a_r128; + + a_r64 = vrev64q_f32(a); + a_lo = vget_low_f32(a_r64); + a_hi = vget_high_f32(a_r64); + a_r128 = vcombine_f32(a_hi, a_lo); + + return a_r128; +} +template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) { + int32x2_t a_lo, a_hi; + Packet4i a_r64, a_r128; + + a_r64 = vrev64q_s32(a); + a_lo = vget_low_s32(a_r64); + a_hi = vget_high_s32(a_r64); + a_r128 = vcombine_s32(a_hi, a_lo); + + return a_r128; +} +template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vabsq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vabsq_s32(a); } + +template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a) +{ + float32x2_t a_lo, a_hi, sum; + float s[2]; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + sum = vpadd_f32(a_lo, a_hi); + sum = vpadd_f32(sum, sum); + vst1_f32(s, sum); + + return s[0]; +} + +template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs) +{ + float32x4x2_t vtrn1, vtrn2, res1, res2; + Packet4f sum1, sum2, sum; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + vtrn1 = vzipq_f32(vecs[0], vecs[2]); + vtrn2 = vzipq_f32(vecs[1], vecs[3]); + res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); + res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); + + // Do the addition of the resulting vectors + sum1 = vaddq_f32(res1.val[0], res1.val[1]); + sum2 = vaddq_f32(res2.val[0], res2.val[1]); + sum = vaddq_f32(sum1, sum2); + + return sum; +} + +template<> EIGEN_STRONG_INLINE int ei_predux<Packet4i>(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, sum; + int32_t s[2]; + + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + sum = vpadd_s32(a_lo, a_hi); + sum = vpadd_s32(sum, sum); + vst1_s32(s, sum); + + return s[0]; +} + +template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp<Packet4i>(const Packet4i* vecs) +{ + int32x4x2_t vtrn1, vtrn2, res1, res2; + Packet4i sum1, sum2, sum; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + vtrn1 = vzipq_s32(vecs[0], vecs[2]); + vtrn2 = vzipq_s32(vecs[1], vecs[3]); + res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); + res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); + + // Do the addition of the resulting vectors + sum1 = vaddq_s32(res1.val[0], res1.val[1]); + sum2 = vaddq_s32(res2.val[0], res2.val[1]); + sum = vaddq_s32(sum1, sum2); + + return sum; +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float ei_predux_mul<Packet4f>(const Packet4f& a) +{ + float32x2_t a_lo, a_hi, prod; + float s[2]; + + // Get a_lo = |a1|a2| and a_hi = |a3|a4| + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| + prod = vmul_f32(a_lo, a_hi); + // Multiply prod with its swapped value |a2*a4|a1*a3| + prod = vmul_f32(prod, vrev64_f32(prod)); + vst1_f32(s, prod); + + return s[0]; +} +template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, prod; + int32_t s[2]; + + // Get a_lo = |a1|a2| and a_hi = |a3|a4| + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| + prod = vmul_s32(a_lo, a_hi); + // Multiply prod with its swapped value |a2*a4|a1*a3| + prod = vmul_s32(prod, vrev64_s32(prod)); + vst1_s32(s, prod); + + return s[0]; +} + +// min +template<> EIGEN_STRONG_INLINE float ei_predux_min<Packet4f>(const Packet4f& a) +{ + float32x2_t a_lo, a_hi, min; + float s[2]; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + min = vpmin_f32(a_lo, a_hi); + min = vpmin_f32(min, min); + vst1_f32(s, min); + + return s[0]; +} +template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, min; + int32_t s[2]; + + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + min = vpmin_s32(a_lo, a_hi); + min = vpmin_s32(min, min); + vst1_s32(s, min); + + return s[0]; +} + +// max +template<> EIGEN_STRONG_INLINE float ei_predux_max<Packet4f>(const Packet4f& a) +{ + float32x2_t a_lo, a_hi, max; + float s[2]; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + max = vpmax_f32(a_lo, a_hi); + max = vpmax_f32(max, max); + vst1_f32(s, max); + + return s[0]; +} +template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, max; + int32_t s[2]; + + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + max = vpmax_s32(a_lo, a_hi); + max = vpmax_s32(max, max); + vst1_s32(s, max); + + return s[0]; +} + +template<int Offset> +struct ei_palign_impl<Offset,Packet4f> +{ + EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second) + { + if (Offset!=0) + first = vextq_f32(first, second, Offset); + } +}; + +template<int Offset> +struct ei_palign_impl<Offset,Packet4i> +{ + EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second) + { + if (Offset!=0) + first = vextq_s32(first, second, Offset); + } +}; +#endif // EIGEN_PACKET_MATH_NEON_H diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 37ccef047..7970b3bb0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -39,7 +39,7 @@ // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable 16 byte alignment on all // platforms where vectorization might be enabled. In theory we could always enable alignment, but it can be a cause of problems // on some platforms, so we just disable it in certain common platform (compiler+architecture combinations) to avoid these problems. -#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__)) +#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__) || defined(__ARM_NEON__)) #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 1 #else #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 0 diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index fbb1ef4d6..4d037b998 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -424,7 +424,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) * ei_aligned_stack_free(data,float,array.size()); * \endcode */ -#ifdef __linux__ +#if (defined __linux__) && !(defined __ARM_NEON__) #define ei_aligned_stack_alloc(SIZE) (SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) \ ? alloca(SIZE) \ : ei_aligned_malloc(SIZE) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 69c63e7bd..67665d91d 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -88,8 +88,13 @@ class ei_compute_matrix_flags enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, +#if !defined(__ARM_NEON__) is_fixed_size_aligned = (!is_dynamic_size_storage) +#else +// FIXME!!! This is a hack because ARM gcc does not honour __attribute__((aligned(16))) properly + is_fixed_size_aligned = 0 +#endif && (((MaxCols*MaxRows) % ei_packet_traits<Scalar>::size) == 0), aligned_bit = (((Options&DontAlign)==0) && (is_dynamic_size_storage || is_fixed_size_aligned)) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 7d90882a2..b08f8c340 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -185,11 +185,17 @@ macro(ei_testing_print_summary) endif() if(EIGEN_TEST_ALTIVEC) - message("Altivec: Using architecture defaults") + message("Altivec: ON") else() message("Altivec: Using architecture defaults") endif() + if(EIGEN_TEST_NEON) + message("ARM NEON: ON") + else() + message("ARM NEON: Using architecture defaults") + endif() + if(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION) message("Explicit vec: OFF") else() diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 7d863e616..e0cb61525 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -32,7 +32,10 @@ template<typename T> T ei_negate(const T& x) { return -x; } template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int size) { for (int i=0; i<size; ++i) - if (!ei_isApprox(a[i],b[i])) return false; + if (!ei_isApprox(a[i],b[i])) { + std::cout << "a[" << i << "]: " << a[i] << ", b[" << i << "]: " << b[i] << std::endl; + return false; + } return true; } |