merge with default branch

author: Gael Guennebaud <g.gael@free.fr> 2010-03-04 18:47:52 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2010-03-04 18:47:52 +0100
commit: cefd9b888868bca6b23d67c0e6c69c49582508c3 (patch)
tree: e7cd032e6475afcee813cbefdf01d3c5bda33d9a /Eigen/src/Core/arch
parent: 65eba35f98941a1d5c7ff6f854ed17224ef65b40 (diff)
parent: 8ed1ef446998dc35f738ad9984cf479dbfc2cc6c (diff)
6 files changed, 454 insertions, 5 deletions
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 1526a4b97..449de2078 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -169,6 +169,11 @@ template<> inline v4f  ei_pdiv(const v4f&   a, const v4f&   b) {
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{ ei_assert(false && "packet integer division are not supported by AltiVec");
+  return ei_pset1<int>(0);
+}
+
 template<> inline v4f  ei_pmadd(const v4f&  a, const v4f&   b, const v4f&  c) { return vec_madd(a, b, c); }
 
 template<> inline v4f  ei_pmin(const v4f&   a, const v4f&   b) { return vec_min(a,b); }
diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt
index 8ddba284e..5470ed8f3 100644
--- a/Eigen/src/Core/arch/CMakeLists.txt
+++ b/Eigen/src/Core/arch/CMakeLists.txt
@@ -1,2 +1,3 @@
 ADD_SUBDIRECTORY(SSE)
-ADD_SUBDIRECTORY(AltiVec)
-\ No newline at end of file
+ADD_SUBDIRECTORY(AltiVec)
+ADD_SUBDIRECTORY(NEON)
diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h
new file mode 100644
index 000000000..1e7cebdba
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/Settings.h
@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+
+/* All the parameters defined in this file can be specialized in the
+ * architecture specific files, and/or by the user.
+ * More to come... */
+
+#ifndef EIGEN_DEFAULT_SETTINGS_H
+#define EIGEN_DEFAULT_SETTINGS_H
+
+/** Defines the maximal loop size to enable meta unrolling of loops.
+  * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
+  * it does not correspond to the number of iterations or the number of instructions
+  */
+#ifndef EIGEN_UNROLLING_LIMIT
+#define EIGEN_UNROLLING_LIMIT 100
+#endif
+
+/** Defines the threshold between a "small" and a "large" matrix.
+  * This threshold is mainly used to select the proper product implementation.
+  */
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+/** Defines the maximal size in Bytes of blocks fitting in CPU cache.
+  * The current value is set to generate blocks of 256x256 for float
+  *
+  * Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes
+  */
+#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
+#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256)
+#endif
+
+/** Defines the maximal width of the blocks used in the triangular product and solver
+  * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
+  */
+#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
+#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
+#endif
+
+#endif // EIGEN_DEFAULT_SETTINGS_H
diff --git a/Eigen/src/Core/arch/NEON/CMakeLists.txt b/Eigen/src/Core/arch/NEON/CMakeLists.txt
new file mode 100644
index 000000000..fd4d4af50
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_arch_NEON_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
+)
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
new file mode 100644
index 000000000..f71b92a75
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -0,0 +1,372 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Heavily based on Gael's SSE version.
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_PACKET_MATH_NEON_H
+#define EIGEN_PACKET_MATH_NEON_H
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
+#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*96*96
+#endif
+
+typedef float32x4_t Packet4f;
+typedef int32x4_t   Packet4i;
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+
+template<> struct ei_packet_traits<float>  : ei_default_packet_traits
+{
+  typedef Packet4f type; enum {size=4};
+  enum {
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+};
+template<> struct ei_packet_traits<int>    : ei_default_packet_traits
+{ typedef Packet4i type; enum {size=4}; };
+
+template<> struct ei_unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
+template<> struct ei_unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) { return vdupq_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from)   { return vdupq_n_s32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a)
+{
+  Packet4f countdown = { 3, 2, 1, 0 };
+  return vaddq_f32(ei_pset1(a), countdown);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a)
+{
+  Packet4i countdown = { 3, 2, 1, 0 };
+  return vaddq_s32(ei_pset1(a), countdown);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a) { return vnegq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a) { return vnegq_s32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f inv, restep, div;
+
+  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
+  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
+  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
+  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
+  // Newton-Raphson and vrecpsq_f32()
+  inv = vrecpeq_f32(b);
+
+  // This returns a differential, by which we will have to multiply inv to get a better
+  // approximation of 1/b.
+  restep = vrecpsq_f32(b, inv);
+  inv = vmulq_f32(restep, inv);
+
+  // Finally, multiply a by 1/b and get the wanted result of the division.
+  div = vmulq_f32(a, inv);
+
+  return div;
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{ ei_assert(false && "packet integer division are not supported by NEON");
+  return ei_pset1<int>(0);
+}
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet4f ei_pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)   { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+
+// FIXME only store the 2 first elements ?
+template<> EIGEN_STRONG_INLINE float  ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int    ei_pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) {
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r64, a_r128;
+
+  a_r64 = vrev64q_f32(a);
+  a_lo = vget_low_f32(a_r64);
+  a_hi = vget_high_f32(a_r64);
+  a_r128 = vcombine_f32(a_hi, a_lo);
+
+  return a_r128;
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) {
+  int32x2_t a_lo, a_hi;
+  Packet4i a_r64, a_r128;
+
+  a_r64 = vrev64q_s32(a);
+  a_lo = vget_low_s32(a_r64);
+  a_hi = vget_high_s32(a_r64);
+  a_r128 = vcombine_s32(a_hi, a_lo);
+
+  return a_r128;
+}
+template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vabsq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vabsq_s32(a); }
+
+template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, sum;
+  float s[2];
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  sum = vpadd_f32(a_lo, a_hi);
+  sum = vpadd_f32(sum, sum);
+  vst1_f32(s, sum);
+
+  return s[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
+{
+  float32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4f sum1, sum2, sum;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
+  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
+
+  // Do the addition of the resulting vectors
+  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
+  sum = vaddq_f32(sum1, sum2);
+
+  return sum;
+}
+
+template<> EIGEN_STRONG_INLINE int ei_predux<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, sum;
+  int32_t s[2];
+
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  sum = vpadd_s32(a_lo, a_hi);
+  sum = vpadd_s32(sum, sum);
+  vst1_s32(s, sum);
+
+  return s[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp<Packet4i>(const Packet4i* vecs)
+{
+  int32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4i sum1, sum2, sum;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
+  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
+
+  // Do the addition of the resulting vectors
+  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
+  sum = vaddq_s32(sum1, sum2);
+
+  return sum;
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float ei_predux_mul<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, prod;
+  float s[2];
+
+  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
+  prod = vmul_f32(a_lo, a_hi);
+  // Multiply prod with its swapped value |a2*a4|a1*a3|
+  prod = vmul_f32(prod, vrev64_f32(prod));
+  vst1_f32(s, prod);
+
+  return s[0];
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, prod;
+  int32_t s[2];
+
+  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
+  prod = vmul_s32(a_lo, a_hi);
+  // Multiply prod with its swapped value |a2*a4|a1*a3|
+  prod = vmul_s32(prod, vrev64_s32(prod));
+  vst1_s32(s, prod);
+
+  return s[0];
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float ei_predux_min<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, min;
+  float s[2];
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  min = vpmin_f32(a_lo, a_hi);
+  min = vpmin_f32(min, min);
+  vst1_f32(s, min);
+
+  return s[0];
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, min;
+  int32_t s[2];
+
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  min = vpmin_s32(a_lo, a_hi);
+  min = vpmin_s32(min, min);
+  vst1_s32(s, min);
+
+  return s[0];
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float ei_predux_max<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, max;
+  float s[2];
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  max = vpmax_f32(a_lo, a_hi);
+  max = vpmax_f32(max, max);
+  vst1_f32(s, max);
+
+  return s[0];
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, max;
+  int32_t s[2];
+
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  max = vpmax_s32(a_lo, a_hi);
+  max = vpmax_s32(max, max);
+  vst1_s32(s, max);
+
+  return s[0];
+}
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet4f>
+{
+  EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset!=0)
+      first = vextq_f32(first, second, Offset);
+  }
+};
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet4i>
+{
+  EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset!=0)
+      first = vextq_s32(first, second, Offset);
+  }
+};
+#endif // EIGEN_PACKET_MATH_NEON_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index de96aaffa..282a1971c 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -122,7 +122,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, con
 template<> EIGEN_STRONG_INLINE Packet2d ei_pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
-#ifdef __SSE4_1__
+#ifdef EIGEN_VECTORIZE_SSE4_1
   return _mm_mullo_epi32(a,b);
 #else
   // this version is slightly faster than 4 scalar products
@@ -269,7 +269,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_pabs(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a)
 {
-  #ifdef __SSSE3__
+  #ifdef EIGEN_VECTORIZE_SSSE3
   return _mm_abs_epi32(a);
   #else
   Packet4i aux = _mm_srai_epi32(a,31);
@@ -285,7 +285,7 @@ EIGEN_STRONG_INLINE void ei_punpackp(Packet4f* vecs)
   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
 }
 
-#ifdef __SSE3__
+#ifdef EIGEN_VECTORIZE_SSE3
 // TODO implement SSE2 versions as well as integer versions
 template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
 {
@@ -446,7 +446,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
 // }
 #endif
 
-#ifdef __SSSE3__
+#ifdef EIGEN_VECTORIZE_SSSE3
 // SSSE3 versions
 template<int Offset>
 struct ei_palign_impl<Offset,Packet4f>
author	Gael Guennebaud <g.gael@free.fr>	2010-03-04 18:47:52 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2010-03-04 18:47:52 +0100
commit	cefd9b888868bca6b23d67c0e6c69c49582508c3 (patch)
tree	e7cd032e6475afcee813cbefdf01d3c5bda33d9a /Eigen/src/Core/arch
parent	65eba35f98941a1d5c7ff6f854ed17224ef65b40 (diff)
parent	8ed1ef446998dc35f738ad9984cf479dbfc2cc6c (diff)