-rw-r--r--  CMakeLists.txt                                     |  15
-rw-r--r--  Eigen/Core                                         |  26
-rw-r--r--  Eigen/Geometry                                     |   4
-rw-r--r--  Eigen/LU                                           |   4
-rw-r--r--  Eigen/src/Core/Block.h                             |   2
-rw-r--r--  Eigen/src/Core/DenseStorage.h                      |   8
-rw-r--r--  Eigen/src/Core/GeneralProduct.h                    |   2
-rwxr-xr-x  Eigen/src/Core/GenericPacketMath.h                 |  23
-rw-r--r--  Eigen/src/Core/Map.h                               |   2
-rw-r--r--  Eigen/src/Core/MapBase.h                           |   2
-rw-r--r--  Eigen/src/Core/Redux.h                             |  13
-rw-r--r--  Eigen/src/Core/arch/AVX/CMakeLists.txt             |   6
-rw-r--r--  Eigen/src/Core/arch/AVX/Complex.h                  | 473
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h               | 509
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h                  |  36
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h               |  70
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h  |  80
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h      |   4
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector.h      |  15
-rw-r--r--  Eigen/src/Core/util/Macros.h                       |  19
-rw-r--r--  Eigen/src/Core/util/Memory.h                       |  38
-rw-r--r--  Eigen/src/Core/util/XprHelper.h                    |   2
-rw-r--r--  cmake/EigenTesting.cmake                           |  16
-rw-r--r--  test/geo_hyperplane.cpp                            |   8
-rw-r--r--  test/geo_parametrizedline.cpp                      |   8
-rw-r--r--  test/geo_quaternion.cpp                            |  14
-rw-r--r--  test/geo_transformations.cpp                       |   6
-rw-r--r--  test/mapped_matrix.cpp                             |   6
-rw-r--r--  test/packetmath.cpp                                |  87
-rw-r--r--  test/unalignedcount.cpp                            |   9
30 files changed, 1415 insertions(+), 92 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4731c8c1..838a41b79 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -196,6 +196,21 @@ if(NOT MSVC)
message(STATUS "Enabling SSE4.2 in tests/examples")
endif()
+ option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+ if(EIGEN_TEST_AVX)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+ if(CMAKE_COMPILER_IS_GNUCXX)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
+ endif()
+ message(STATUS "Enabling AVX in tests/examples")
+ endif()
+
+ option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+ if(EIGEN_TEST_FMA)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+ message(STATUS "Enabling FMA in tests/examples")
+ endif()
+
option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
if(EIGEN_TEST_ALTIVEC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
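To exercise the new options, configure with cmake -DEIGEN_TEST_AVX=ON -DEIGEN_TEST_FMA=ON. A minimal sketch, assuming GCC/Clang/MSVC semantics for the informational pragma, to confirm the flags actually reached the compiler:

    #ifdef __AVX__
    #  pragma message("AVX is enabled for this translation unit")
    #endif
    #ifdef __FMA__
    #  pragma message("FMA is enabled for this translation unit")
    #endif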
diff --git a/Eigen/Core b/Eigen/Core
index 412409497..661c7812e 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -110,7 +110,16 @@
#ifdef __SSE4_2__
#define EIGEN_VECTORIZE_SSE4_2
#endif
-
+ #ifdef __AVX__
+ #define EIGEN_VECTORIZE_AVX
+ #define EIGEN_VECTORIZE_SSE3
+ #define EIGEN_VECTORIZE_SSSE3
+ #define EIGEN_VECTORIZE_SSE4_1
+ #define EIGEN_VECTORIZE_SSE4_2
+ #endif
+ #ifdef __FMA__
+ #define EIGEN_VECTORIZE_FMA
+ #endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue
@@ -140,6 +149,9 @@
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
+ #ifdef EIGEN_VECTORIZE_AVX
+ #include <immintrin.h>
+ #endif
#endif
} // end extern "C"
#elif defined __ALTIVEC__
@@ -209,7 +221,9 @@
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_AVX)
+ return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@@ -287,7 +301,13 @@ using std::ptrdiff_t;
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
-#if defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_AVX
+ // Use AVX for floats and doubles, SSE for integers
+ #include "src/Core/arch/SSE/PacketMath.h"
+ #include "src/Core/arch/SSE/Complex.h"
+ #include "src/Core/arch/AVX/PacketMath.h"
+ #include "src/Core/arch/AVX/Complex.h"
+#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/SSE/Complex.h"
diff --git a/Eigen/Geometry b/Eigen/Geometry
index efd9d4504..f9bc6fc57 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -47,7 +47,9 @@
#include "src/Geometry/AlignedBox.h"
#include "src/Geometry/Umeyama.h"
- #if defined EIGEN_VECTORIZE_SSE
+ // Use the SSE optimized version whenever possible. At the moment the
+ // SSE version doesn't compile when AVX is enabled
+ #if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#include "src/Geometry/arch/Geometry_SSE.h"
#endif
#endif
diff --git a/Eigen/LU b/Eigen/LU
index db5795504..e5c3f32f7 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -27,7 +27,9 @@
#include "src/LU/Determinant.h"
#include "src/LU/Inverse.h"
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#include "src/LU/arch/Inverse_SSE.h"
#endif
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 31cd5c72c..e948e14aa 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -83,7 +83,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
&& (InnerStrideAtCompileTime == 1)
? PacketAccessBit : 0,
- MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+ MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index b9dd75ade..7264b44c7 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -40,7 +40,7 @@ void check_static_allocation_size()
*/
template <typename T, int Size, int MatrixOrArrayOptions,
int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
- : (((Size*sizeof(T))%16)==0) ? 16
+ : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
: 0 >
struct plain_array
{
@@ -81,9 +81,9 @@ struct plain_array
#endif
template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
{
- EIGEN_USER_ALIGN16 T array[Size];
+ EIGEN_USER_ALIGN32 T array[Size];
EIGEN_DEVICE_FUNC
plain_array()
@@ -102,7 +102,7 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
template <typename T, int MatrixOrArrayOptions, int Alignment>
struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
{
- EIGEN_USER_ALIGN16 T array[1];
+ EIGEN_USER_ALIGN32 T array[1];
EIGEN_DEVICE_FUNC plain_array() {}
EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
};
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index e3a165ac6..adda6f784 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -397,7 +397,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
EIGEN_STRONG_INLINE Scalar* data() {
return ForceAlignment
- ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+ ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
: m_data.array;
}
#endif
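The rounding used above generalizes the old hard-coded 16: for a pointer value p and a power-of-two alignment A, (p & ~(A-1)) + A is the next A-byte boundary strictly above p. A worked check, assuming EIGEN_ALIGN_BYTES is 32:

    size_t p = 0x1008;
    size_t aligned = (p & ~size_t(32 - 1)) + 32;  // 0x1020, the next 32-byte boundary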
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index d07541285..82eeeed4a 100755
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -217,7 +217,13 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
-{ (*to) = from; }
+{ (*to) = from; }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/)
+{ return ploadu<Packet>(from); }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/)
+{ pstore(to, from); }
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> inline void prefetch(const Scalar* addr)
@@ -386,9 +392,22 @@ template<> inline std::complex<double> pmul(const std::complex<double>& a, const
#endif
+
+/***************************************************************************
+ * A Kernel is a collection of N packets, where N is the number of words
+ * held by each packet.
+***************************************************************************/
+template <typename Packet> struct Kernel {
+ Packet packet[unpacket_traits<Packet>::size];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet>& /*kernel*/) {
+ // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GENERIC_PACKET_MATH_H
-
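The two new primitives give packet kernels strided memory access: pgather packs every stride-th scalar into one packet, and pscatter writes a packet back with the same spacing. A usage sketch against the Packet4f specializations added further down in this patch:

    using namespace Eigen::internal;
    float col[12] = { 0.f,9.f,9.f, 1.f,9.f,9.f, 2.f,9.f,9.f, 3.f,9.f,9.f };
    Packet4f p = pgather<float, Packet4f>(col, 3);  // packs {0, 1, 2, 3}
    float out[12] = {0.f};
    pscatter<float, Packet4f>(out, p, 3);           // writes out[0], out[3], out[6], out[9]

ptranspose(Kernel<Packet>&) transposes N packets of N words in place; the scalar overload above is a no-op because a 1x1 block is its own transpose.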
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 8ea13cfb7..c75a5e95f 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -88,7 +88,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
&& ( bool(IsDynamicSize)
|| HasNoOuterStride
|| ( OuterStrideAtCompileTime!=Dynamic
- && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+ && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index ffa1371c2..a45a0b374 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -164,7 +164,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
internal::inner_stride_at_compile_time<Derived>::ret==1),
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
- eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
+ eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0)
&& "data is not aligned");
}
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index b2c775d90..5b82c9a65 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -303,10 +303,15 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
- Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
- if (VectorizedSize != Size)
- res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
- return res;
+ if (VectorizedSize > 0) {
+ Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+ if (VectorizedSize != Size)
+ res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+ return res;
+ }
+ else {
+ return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+ }
}
};
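The new guard matters once AVX widens the packets: for float, PacketSize becomes 8, so a fixed-size type such as Vector4f has Size == 4 < PacketSize and VectorizedSize == (4/8)*8 == 0, and the old code would instantiate the vectorized unroller over zero packets. A sketch of a reduction that would take the new scalar branch on an AVX build:

    Eigen::Vector4f v(1.f, 2.f, 3.f, 4.f);
    float s = v.sum();  // Size=4 < PacketSize=8: handled by redux_novec_unroller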
diff --git a/Eigen/src/Core/arch/AVX/CMakeLists.txt b/Eigen/src/Core/arch/AVX/CMakeLists.txt
new file mode 100644
index 000000000..bdb71ab99
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
+
+INSTALL(FILES
+ ${Eigen_Core_arch_AVX_SRCS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
+)
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
new file mode 100644
index 000000000..ec9c861f9
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -0,0 +1,473 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf
+{
+ EIGEN_STRONG_INLINE Packet4cf() {}
+ EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+ __m256 v;
+};
+
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+ typedef Packet4cf type;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 4,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0
+ };
+};
+
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; };
+
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{
+ return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{
+ const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+ return Packet4cf(_mm256_xor_ps(a.v,mask));
+}
+
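+// Complex multiply via moveldup/movehdup + addsub:
+// tmp1 = {ar*br, ar*bi, ...} and tmp2 = {ai*bi, ai*br, ...}; addsub then
+// subtracts in even slots (real part: ar*br - ai*bi) and adds in odd
+// slots (imaginary part: ar*bi + ai*br).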
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+ __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
+ __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+ __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+ return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{
+ const float r = std::real(from);
+ const float i = std::imag(from);
+ // Beware, _mm256_set_ps expects the scalar values in reverse order (i.e. 7 to 0)
+ const __m256 result = _mm256_set_ps(i, r, i, r, i, r, i, r);
+ return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+ // This should be optimized.
+ __m128 complex1 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)from);
+ complex1 = _mm_movelh_ps(complex1, complex1);
+ __m128 complex2 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from+1));
+ complex2 = _mm_movelh_ps(complex2, complex2);
+ __m256 result = _mm256_setzero_ps();
+ result = _mm256_insertf128_ps(result, complex1, 0);
+ result = _mm256_insertf128_ps(result, complex2, 1);
+ return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
+{
+ return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+ std::imag(from[2*stride]), std::real(from[2*stride]),
+ std::imag(from[1*stride]), std::real(from[1*stride]),
+ std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
+{
+ __m128 low = _mm256_extractf128_ps(from.v, 0);
+ to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
+ _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+ to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
+ _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+
+ __m128 high = _mm256_extractf128_ps(from.v, 1);
+ to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
+ _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+ to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
+ _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
+{
+ __m128 low = _mm256_extractf128_ps(a.v, 0);
+ std::complex<float> res;
+ _mm_storel_pi((__m64*)&res, low);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+ __m128 low = _mm256_extractf128_ps(a.v, 0);
+ __m128 high = _mm256_extractf128_ps(a.v, 1);
+ __m128d lowd = _mm_castps_pd(low);
+ __m128d highd = _mm_castps_pd(high);
+ low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
+ high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+ __m256 result = _mm256_setzero_ps();
+ result = _mm256_insertf128_ps(result, low, 1);
+ result = _mm256_insertf128_ps(result, high, 0);
+ return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
+{
+ return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+ Packet2cf(_mm256_extractf128_ps(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
+{
+ Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
+ Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
+ t0 = _mm256_hadd_ps(t0,t1);
+ Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
+ Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
+ t2 = _mm256_hadd_ps(t2,t3);
+
+ t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
+ t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
+
+ return Packet4cf(_mm256_add_ps(t1,t3));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
+{
+ return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
+ Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cf>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
+ {
+ if (Offset==0) return;
+ palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
+ }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
+{
+ EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
+{
+ EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
+{
+ EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+template<> struct conj_helper<Packet8f, Packet4cf, false,false>
+{
+ EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
+ { return padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
+ { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet4cf, Packet8f, false,false>
+{
+ EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
+ { return padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
+ { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
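+ // a/b = a*conj(b) / |b|^2: tmp2 swaps the squared real/imaginary parts so
+ // that denom holds |b|^2 duplicated in both slots of each complex number.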
+ Packet4cf num = pmul(a, pconj(b));
+ __m256 tmp = _mm256_mul_ps(b.v, b.v);
+ __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1);
+ __m256 denom = _mm256_add_ps(tmp, tmp2);
+ return Packet4cf(_mm256_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
+{
+ return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet2cd
+{
+ EIGEN_STRONG_INLINE Packet2cd() {}
+ EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
+ __m256d v;
+};
+
+template<> struct packet_traits<std::complex<double> > : default_packet_traits
+{
+ typedef Packet2cd type;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 0,
+ size = 2,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0
+ };
+};
+
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
+{
+ const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+ return Packet2cd(_mm256_xor_pd(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
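+ // Same addsub trick as the float path: even = {ar*br, ar*bi} and
+ // odd = {ai*bi, ai*br}, so addsub yields {ar*br - ai*bi, ar*bi + ai*br}.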
+ __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
+ __m256d even = _mm256_mul_pd(tmp1, b.v);
+ __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
+ __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
+ __m256d odd = _mm256_mul_pd(tmp2, tmp3);
+ return Packet2cd(_mm256_addsub_pd(even, odd));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
+{
+ const double r = std::real(from);
+ const double i = std::imag(from);
+ // Beware, _mm256_set_pd expects the scalar values in reverse order (i.e. 3 to 0)
+ const __m256d result = _mm256_set_pd(i, r, i, r);
+ return Packet2cd(result);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, int stride)
+{
+ return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
+ std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, int stride)
+{
+ __m128d low = _mm256_extractf128_pd(from.v, 0);
+ to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+ __m128d high = _mm256_extractf128_pd(from.v, 1);
+ to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
+{
+ __m128d low = _mm256_extractf128_pd(a.v, 0);
+ EIGEN_ALIGN16 double res[2];
+ _mm_store_pd(res, low);
+ return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+ __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
+ return Packet2cd(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
+{
+ return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+ Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
+{
+ Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
+ Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
+
+ return Packet2cd(_mm256_add_pd(t0,t1));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
+{
+ return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+ Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cd>
+{
+ static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
+ {
+ if (Offset==0) return;
+ palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
+ }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
+{
+ EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
+{
+ EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
+{
+ EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+template<> struct conj_helper<Packet4d, Packet2cd, false,false>
+{
+ EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
+ { return padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
+ { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cd, Packet4d, false,false>
+{
+ EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
+ { return padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
+ { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+ Packet2cd num = pmul(a, pconj(b));
+ __m256d tmp = _mm256_mul_pd(b.v, b.v);
+ __m256d denom = _mm256_hadd_pd(tmp, tmp);
+ return Packet2cd(_mm256_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
+{
+ return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4cf>& kernel) {
+ __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
+ __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+ __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+ __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+
+ __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
+ __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
+ __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
+ __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
+
+ kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
+ kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
+ kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
+ kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2cd>& kernel) {
+ __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
+ kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
+ kernel.packet[0].v = tmp;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX_H
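A usage sketch for the float packet defined above (assumes an AVX build; the values are only for illustration):

    using namespace Eigen::internal;
    std::complex<float> cs[4] = { {1,2}, {3,4}, {5,6}, {7,8} };
    Packet4cf p = ploadu<Packet4cf>(cs);
    std::complex<float> s = predux(p);   // (16,20): component-wise sum
    std::complex<float> f = pfirst(p);   // (1,2): first element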
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
new file mode 100644
index 000000000..6d5a5f53f
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -0,0 +1,509 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX_H
+#define EIGEN_PACKET_MATH_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+
+typedef __m256 Packet8f;
+typedef __m256i Packet8i;
+typedef __m256d Packet4d;
+
+template<> struct is_arithmetic<__m256> { enum { value = true }; };
+template<> struct is_arithmetic<__m256i> { enum { value = true }; };
+template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+
+#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
+ const Packet8f p8f_##NAME = pset1<Packet8f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
+ const Packet4d p4d_##NAME = pset1<Packet4d>(X)
+
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+ typedef Packet8f type;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size=8,
+
+ HasDiv = 1,
+ HasSin = 0,
+ HasCos = 0,
+ HasLog = 0,
+ HasExp = 0,
+ HasSqrt = 0
+ };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+ typedef Packet4d type;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size=4,
+
+ HasDiv = 1,
+ HasExp = 0
+ };
+};
+
+/* Proper support for integers is only provided by AVX2. In the meantime, we'll
+ use SSE instructions and packets to deal with integers.
+template<> struct packet_traits<int> : default_packet_traits
+{
+ typedef Packet8i type;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size=8
+ };
+};
+*/
+
+template<> struct unpacket_traits<Packet8f> { typedef float type; enum {size=8}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; enum {size=4}; };
+template<> struct unpacket_traits<Packet8i> { typedef int type; enum {size=8}; };
+
+template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
+{
+ return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
+{
+ return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
+{ eigen_assert(false && "packet integer division is not supported by AVX");
+ return pset1<Packet8i>(0);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if defined(__clang__) || defined(__GNUC__)
+ // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+ // and gcc stupidly generates a vfmadd132ps instruction,
+ // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
+ // the result of the product.
+ Packet8f res = c;
+ asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
+ return _mm256_fmadd_ps(a,b,c);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+
+// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
+template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
+{
+ Packet8f tmp = ploadu<Packet8f>(from);
+ Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+ Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0));
+ return _mm256_blend_ps(_mm256_permute2f128_ps(tmp1,tmp1,1),tmp2,15);
+}
+// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
+{
+ Packet4d tmp = ploadu<Packet4d>(from);
+ Packet4d tmp1 = _mm256_permute_pd(tmp,0);
+ Packet4d tmp2 = _mm256_permute_pd(tmp,3);
+ return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+// TODO: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
+{
+ return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+ from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
+{
+ return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, int stride)
+{
+ __m128 low = _mm256_extractf128_ps(from, 0);
+ to[stride*0] = _mm_cvtss_f32(low);
+ to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+ to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+ to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+
+ __m128 high = _mm256_extractf128_ps(from, 1);
+ to[stride*4] = _mm_cvtss_f32(high);
+ to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+ to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+ to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, int stride)
+{
+ __m128d low = _mm256_extractf128_pd(from, 0);
+ to[stride*0] = _mm_cvtsd_f64(low);
+ to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+ __m128d high = _mm256_extractf128_pd(from, 1);
+ to[stride*2] = _mm_cvtsd_f64(high);
+ to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
+{
+ Packet8f pa = pset1<Packet8f>(a);
+ pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
+{
+ Packet4d pa = pset1<Packet4d>(a);
+ pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
+{
+ Packet8i pa = pset1<Packet8i>(a);
+ pstore(to, pa);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
+ return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+ return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
+ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
+{
+ __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+ return _mm256_permute2f128_ps(tmp, tmp, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
+{
+ __m256d tmp = _mm256_shuffle_pd(a,a,5);
+ return _mm256_permute2f128_pd(tmp, tmp, 1);
+ // Equivalent alternative: swap the 128-bit halves first, then the pairs:
+ // __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
+ // return _mm256_permute_pd(swap_halves,5);
+}
+
+// pabs should be ok
+template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
+{
+ const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+ return _mm256_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
+{
+ const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+ return _mm256_and_pd(a,mask);
+}
+
+// preduxp should be ok
+// FIXME: why is this ok? why isn't the simple implementation working as expected?
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
+{
+ __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
+ __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
+ __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
+ __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
+
+ __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+ __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+ __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+ __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+ __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+ __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+ __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+ __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+ __m256 sum1 = _mm256_add_ps(perm1, hsum5);
+ __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+ __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+ __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+
+ __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+ __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+ __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
+ return final;
+}
+template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
+{
+ Packet4d tmp0, tmp1;
+
+ tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
+ tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+ tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
+ tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+ return _mm256_blend_pd(tmp0, tmp1, 0xC);
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
+{
+ Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
+ tmp0 = _mm256_hadd_ps(tmp0,tmp0);
+ return pfirst(_mm256_hadd_ps(tmp0, tmp0));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
+{
+ Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
+ return pfirst(_mm256_hadd_pd(tmp0,tmp0));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
+{
+ Packet8f tmp;
+ tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
+ tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+ return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
+{
+ Packet4d tmp;
+ tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
+ return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
+{
+ Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
+ tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+ return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
+{
+ Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+ return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
+{
+ Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
+ tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+ return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
+{
+ Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+ return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+
+template<int Offset>
+struct palign_impl<Offset,Packet8f>
+{
+ static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
+ {
+ if (Offset==1)
+ {
+ first = _mm256_blend_ps(first, second, 1);
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+ first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
+ }
+ else if (Offset==2)
+ {
+ first = _mm256_blend_ps(first, second, 3);
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+ first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
+ }
+ else if (Offset==3)
+ {
+ first = _mm256_blend_ps(first, second, 7);
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+ first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
+ }
+ else if (Offset==4)
+ {
+ first = _mm256_blend_ps(first, second, 15);
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+ first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
+ }
+ else if (Offset==5)
+ {
+ first = _mm256_blend_ps(first, second, 31);
+ first = _mm256_permute2f128_ps(first, first, 1);
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+ first = _mm256_permute2f128_ps(tmp, tmp, 1);
+ first = _mm256_blend_ps(tmp, first, 0x88);
+ }
+ else if (Offset==6)
+ {
+ first = _mm256_blend_ps(first, second, 63);
+ first = _mm256_permute2f128_ps(first, first, 1);
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+ first = _mm256_permute2f128_ps(tmp, tmp, 1);
+ first = _mm256_blend_ps(tmp, first, 0xcc);
+ }
+ else if (Offset==7)
+ {
+ first = _mm256_blend_ps(first, second, 127);
+ first = _mm256_permute2f128_ps(first, first, 1);
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+ first = _mm256_permute2f128_ps(tmp, tmp, 1);
+ first = _mm256_blend_ps(tmp, first, 0xee);
+ }
+ }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4d>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
+ {
+ if (Offset==1)
+ {
+ first = _mm256_blend_pd(first, second, 1);
+ __m256d tmp = _mm256_permute_pd(first, 5);
+ first = _mm256_permute2f128_pd(tmp, tmp, 1);
+ first = _mm256_blend_pd(tmp, first, 0xA);
+ }
+ else if (Offset==2)
+ {
+ first = _mm256_blend_pd(first, second, 3);
+ first = _mm256_permute2f128_pd(first, first, 1);
+ }
+ else if (Offset==3)
+ {
+ first = _mm256_blend_pd(first, second, 7);
+ __m256d tmp = _mm256_permute_pd(first, 5);
+ first = _mm256_permute2f128_pd(tmp, tmp, 1);
+ first = _mm256_blend_pd(tmp, first, 5);
+ }
+ }
+};
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet8f>& kernel) {
+ __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+ __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+ __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+ __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+ __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+ __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+ __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+ __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+ __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+ __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+ __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+ __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+ __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
+ __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
+ __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
+ __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
+ kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
+ kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
+ kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
+ kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
+ kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
+ kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
+ kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
+ kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4d>& kernel) {
+ __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
+ __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+ __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
+ __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+
+ kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
+ kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
+ kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
+ kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_AVX_H
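A short sketch of the horizontal reductions this file adds (assumes an AVX build; ploadu is used so the buffer needs no particular alignment):

    using namespace Eigen::internal;
    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    Packet8f p = ploadu<Packet8f>(data);
    float sum = predux(p);                // 36
    float mn  = predux_min<Packet8f>(p);  // 1
    float mx  = predux_max<Packet8f>(p);  // 8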
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 91bba5e38..86b90b2ee 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -22,6 +22,9 @@ struct Packet2cf
__m128 v;
};
+// Use the packet_traits defined in AVX/Complex.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
@@ -42,6 +45,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasSetLinear = 0
};
};
+#endif
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
@@ -107,6 +111,24 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
+{
+ return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+ std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
+{
+ to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+ _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+ to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+ _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}
+
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
@@ -248,6 +270,9 @@ struct Packet1cd
__m128d v;
};
+// Use the packet_traits defined in AVX/Complex.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet1cd type;
@@ -268,6 +293,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
HasSetLinear = 0
};
};
+#endif
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
@@ -435,6 +461,16 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
return Packet1cd(preverse(x.v));
}
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2cf>& kernel) {
+ __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+ __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+ __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+ kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+ kernel.packet[1].v = tmp;
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b1f7b7717..a98ca6bea 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -58,6 +58,9 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
@@ -87,6 +90,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasSqrt = 1
};
};
+#endif
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
@@ -126,8 +130,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
}
#endif
+#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+#endif
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
@@ -184,6 +190,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
// for some weird reason, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
+#endif
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@@ -329,6 +339,39 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
+{
+ return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
+{
+ return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
+{
+ return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
+{
+ to[stride*0] = _mm_cvtss_f32(from);
+ to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+ to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+ to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
+{
+ to[stride*0] = _mm_cvtsd_f64(from);
+ to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
+{
+ to[stride*0] = _mm_cvtsi128_si32(from);
+ to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+ to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+ to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+
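+// Usage sketch, assuming buf holds at least ten floats: with stride 3 the
+// gather packs buf[0], buf[3], buf[6], buf[9], and the scatter is its inverse:
+//   Packet4f p = pgather<float, Packet4f>(buf, 3); // {buf[0], buf[3], buf[6], buf[9]}
+//   pscatter<float, Packet4f>(buf, p, 3);          // stores back to the same slots
+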
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
@@ -342,9 +385,11 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, vec2d_swizzle1(pa,0,0));
}
+#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
@@ -695,6 +740,31 @@ struct palign_impl<Offset,Packet2d>
};
#endif
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4f>& kernel) {
+ _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2d>& kernel) {
+ __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+ kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+ kernel.packet[1] = tmp;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4i>& kernel) {
+ __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+ __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+ __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+ __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+ kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+ kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+ kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+ kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
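+// Usage sketch, assuming mat is a 16-byte aligned float[16] holding a
+// row-major 4x4 matrix:
+//   Kernel<Packet4f> kernel;
+//   for (int i = 0; i < 4; ++i) kernel.packet[i] = pload<Packet4f>(mat + 4*i);
+//   ptranspose(kernel); // kernel.packet[i] now holds column i of mat
+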
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index cba76edfe..0f47f6de5 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -208,7 +208,16 @@ public:
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
{
+ // It would be a lot cleaner to call pmadd all the time. Unfortunately, if we
+ // let gcc allocate the register in which to store the result of the pmul
+ // (in the case where there is no FMA), gcc fails to figure out how to avoid
+ // spilling registers.
+#ifdef EIGEN_VECTORIZE_FMA
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c = pmadd(a,b,c);
+#else
tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+#endif
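+ // Either way the result is c += a*b, computed elementwise: a single fused
+ // multiply-add with FMA, or a multiply followed by an add without it.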
}
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
@@ -287,7 +296,12 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
+#ifdef EIGEN_VECTORIZE_FMA
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a.v,b,c.v);
+#else
tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
}
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -983,9 +997,22 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder,
}
else
{
- for(Index k=0; k<depth; k++)
- {
- // TODO add a vectorized transpose here
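+ // Vectorized transpose: load PacketSize rows of the lhs panel, transpose
+ // the PacketSize x PacketSize tile in registers, then store the columns
+ // contiguously into blockA.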
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
+ Index k=0;
+ for(; k<peeled_k; k+=PacketSize) {
+ for (Index m = 0; m < Pack1; m += PacketSize) {
+ Kernel<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) {
+ kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
+ }
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) {
+ pstore(blockA+count+m+Pack1*p, cj.pconj(kernel.packet[p]));
+ }
+ }
+ count += PacketSize*Pack1;
+ }
+ for(; k<depth; k++) {
Index w=0;
for(; w<Pack1-3; w+=4)
{
@@ -1050,6 +1077,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
Index count = 0;
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
if(nr>=8)
{
for(Index j2=0; j2<packet_cols8; j2+=8)
@@ -1064,7 +1092,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
const Scalar* b5 = &rhs[(j2+5)*rhsStride];
const Scalar* b6 = &rhs[(j2+6)*rhsStride];
const Scalar* b7 = &rhs[(j2+7)*rhsStride];
- for(Index k=0; k<depth; k++)
+ Index k=0;
+ if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+ {
+ for(; k<peeled_k; k+=PacketSize) {
+ Kernel<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) {
+ kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+ }
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) {
+ pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+ count+=PacketSize;
+ }
+ }
+ }
+ for(; k<depth; k++)
{
blockB[count+0] = cj(b0[k]);
blockB[count+1] = cj(b1[k]);
@@ -1091,7 +1134,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
- for(Index k=0; k<depth; k++)
+ Index k=0;
+ if(PacketSize==4) // TODO enable vectorized transposition for PacketSize==2 ??
+ {
+ for(; k<peeled_k; k+=PacketSize) {
+ Kernel<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) {
+ kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+ }
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) {
+ pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+ count+=PacketSize;
+ }
+ }
+ }
+ for(; k<depth; k++)
{
blockB[count+0] = cj(b0[k]);
blockB[count+1] = cj(b1[k]);
@@ -1148,10 +1206,14 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
if(PanelMode) count += 8 * offset;
for(Index k=0; k<depth; k++)
{
- if (8 == PacketSize) {
+ if (PacketSize==8) {
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
pstoreu(blockB+count, cj.pconj(A));
- count += PacketSize;
+ } else if (PacketSize==4) {
+ Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+ Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+ pstoreu(blockB+count, cj.pconj(A));
+ pstoreu(blockB+count+PacketSize, cj.pconj(B));
} else {
const Scalar* b0 = &rhs[k*rhsStride + j2];
blockB[count+0] = cj(b0[0]);
@@ -1162,8 +1224,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
blockB[count+5] = cj(b0[5]);
blockB[count+6] = cj(b0[6]);
blockB[count+7] = cj(b0[7]);
- count += 8;
}
+ count += 8;
}
// skip what we have after
if(PanelMode) count += 8 * (stride-offset-depth);
@@ -1177,7 +1239,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
if(PanelMode) count += 4 * offset;
for(Index k=0; k<depth; k++)
{
- if (4 == PacketSize) {
+ if (PacketSize==4) {
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
pstoreu(blockB+count, cj.pconj(A));
count += PacketSize;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 19991fa3f..dd9d79657 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -281,8 +281,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
SizeB = ActualCols * MaxDepth
};
- EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
- EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
+ EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+ EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
public:
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index a73ce5ff0..340c51394 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
alignedSize = 0;
alignedStart = 0;
}
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ // Currently, it seems to be better to perform unaligned loads anyway
+ alignmentPattern = NoneAligned;
+ }
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
@@ -405,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
alignedSize = 0;
alignedStart = 0;
}
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ alignmentPattern = NoneAligned;
+ }
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
@@ -442,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
{
- EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generating good binary code
@@ -551,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
for (Index i=start; i<end; ++i)
{
- EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride;
// process first unaligned result's coeffs
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index f69fc5ec4..bfd6ba7de 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -66,13 +66,24 @@
#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
#endif
+// Defines the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+#define EIGEN_ALIGN_BYTES 16
+
#ifdef EIGEN_DONT_ALIGN
#ifndef EIGEN_DONT_ALIGN_STATICALLY
#define EIGEN_DONT_ALIGN_STATICALLY
#endif
#define EIGEN_ALIGN 0
-#else
+#elif !defined(EIGEN_DONT_VECTORIZE)
+ #if defined(__AVX__)
+ #undef EIGEN_ALIGN_BYTES
+ #define EIGEN_ALIGN_BYTES 32
+ #endif
#define EIGEN_ALIGN 1
+#else
+ #define EIGEN_ALIGN 0
#endif
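+
+// For instance, compiling with -mavx and vectorization enabled yields
+// EIGEN_ALIGN == 1 and EIGEN_ALIGN_BYTES == 32; with EIGEN_DONT_ALIGN or
+// EIGEN_DONT_VECTORIZE defined, EIGEN_ALIGN is 0 and the 16-byte default remains.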
// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
@@ -286,13 +297,19 @@ namespace Eigen {
#endif
#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)
#if EIGEN_ALIGN_STATICALLY
#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
+#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT
#else
#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
#define EIGEN_USER_ALIGN16
+#define EIGEN_USER_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT
#endif
#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 0f8ab065a..a4d7e454d 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -32,7 +32,7 @@
// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
// quite safe, at least within the context of glibc, to equate 64-bit with LP64.
#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16)
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
#else
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@@ -42,14 +42,14 @@
// See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
// See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
+#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16)
#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
#else
#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
#endif
-#if defined(__APPLE__) \
- || defined(_WIN64) \
+#if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \
+ || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \
|| EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
|| EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
#define EIGEN_MALLOC_ALREADY_ALIGNED 1
@@ -73,7 +73,7 @@
#define EIGEN_HAS_POSIX_MEMALIGN 0
#endif
-#ifdef EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
#define EIGEN_HAS_MM_MALLOC 1
#else
#define EIGEN_HAS_MM_MALLOC 0
@@ -105,9 +105,9 @@ inline void throw_std_bad_alloc()
*/
inline void* handmade_aligned_malloc(std::size_t size)
{
- void *original = std::malloc(size+16);
+ void *original = std::malloc(size+EIGEN_ALIGN_BYTES);
if (original == 0) return 0;
- void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+ void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
*(reinterpret_cast<void**>(aligned) - 1) = original;
return aligned;
}
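+// Worked example with EIGEN_ALIGN_BYTES == 32: if std::malloc returns 0x1007,
+// then (0x1007 & ~31) + 32 == 0x1020, and the original pointer is stashed in
+// the word just below 0x1020 so it can be recovered when freeing.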
@@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
if (ptr == 0) return handmade_aligned_malloc(size);
void *original = *(reinterpret_cast<void**>(ptr) - 1);
std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
- original = std::realloc(original,size+16);
+ original = std::realloc(original,size+EIGEN_ALIGN_BYTES);
if (original == 0) return 0;
- void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+ void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
void *previous_aligned = static_cast<char *>(original)+previous_offset;
if(aligned!=previous_aligned)
std::memmove(aligned, previous_aligned, size);
@@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed()
{}
#endif
-/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to be aligned on an EIGEN_ALIGN_BYTES boundary, i.e. on 16 or 32 bytes depending on the configuration.
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
*/
inline void* aligned_malloc(size_t size)
@@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size)
#elif EIGEN_MALLOC_ALREADY_ALIGNED
result = std::malloc(size);
#elif EIGEN_HAS_POSIX_MEMALIGN
- if(posix_memalign(&result, 16, size)) result = 0;
+ if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0;
#elif EIGEN_HAS_MM_MALLOC
- result = _mm_malloc(size, 16);
+ result = _mm_malloc(size, EIGEN_ALIGN_BYTES);
#elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
- result = _aligned_malloc(size, 16);
+ result = _aligned_malloc(size, EIGEN_ALIGN_BYTES);
#else
result = handmade_aligned_malloc(size);
#endif
@@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
// implements _mm_malloc/_mm_free based on the corresponding _aligned_
// functions. This may not always be the case and we just try to be safe.
#if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free)
- result = _aligned_realloc(ptr,new_size,16);
+ result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
#else
result = generic_aligned_realloc(ptr,new_size,old_size);
#endif
#elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
- result = _aligned_realloc(ptr,new_size,16);
+ result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
#else
result = handmade_aligned_realloc(ptr,new_size,old_size);
#endif
@@ -607,9 +607,9 @@ template<typename T> class aligned_stack_memory_handler
* The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token.
*/
#ifdef EIGEN_ALLOCA
-
- #if defined(__arm__) || defined(_WIN32)
- #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
+ // The native alloca() that comes with llvm aligns the buffer to 16 bytes even when AVX is enabled.
+ #if defined(__arm__) || defined(_WIN32) || EIGEN_ALIGN_BYTES > 16
+ #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
#else
#define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
#endif
@@ -679,7 +679,7 @@ template<typename T> class aligned_stack_memory_handler
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
- EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
+ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0)))
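+
+// For example, with EIGEN_ALIGN_BYTES == 32 (AVX) a Matrix4f (4*4*sizeof(float)
+// == 64 bytes) still gets an aligned operator new, whereas a Vector4f (16 bytes)
+// no longer qualifies, since 16 % 32 != 0.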
/****************************************************************************/
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 195d9e2e1..a08538aff 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -136,7 +136,7 @@ class compute_matrix_flags
((Options&DontAlign)==0)
&& (
#if EIGEN_ALIGN_STATICALLY
- ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
+ ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
#else
0
#endif
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index fdc166bf8..9b9776894 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -264,6 +264,18 @@ macro(ei_testing_print_summary)
message(STATUS "SSE4.2: Using architecture defaults")
endif()
+ if(EIGEN_TEST_AVX)
+ message(STATUS "AVX: ON")
+ else()
+ message(STATUS "AVX: Using architecture defaults")
+ endif()
+
+ if(EIGEN_TEST_FMA)
+ message(STATUS "FMA: ON")
+ else()
+ message(STATUS "FMA: Using architecture defaults")
+ endif()
+
if(EIGEN_TEST_ALTIVEC)
message(STATUS "Altivec: ON")
else()
@@ -408,6 +420,10 @@ macro(ei_get_cxxflags VAR)
set(${VAR} NEON)
elseif(EIGEN_TEST_ALTIVEC)
set(${VAR} ALVEC)
+ elseif(EIGEN_TEST_FMA)
+ set(${VAR} FMA)
+ elseif(EIGEN_TEST_AVX)
+ set(${VAR} AVX)
elseif(EIGEN_TEST_SSE4_2)
set(${VAR} SSE42)
elseif(EIGEN_TEST_SSE4_1)
diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp
index befd7d483..ed5928f10 100644
--- a/test/geo_hyperplane.cpp
+++ b/test/geo_hyperplane.cpp
@@ -129,9 +129,9 @@ template<typename Scalar> void hyperplane_alignment()
typedef Hyperplane<Scalar,3,AutoAlign> Plane3a;
typedef Hyperplane<Scalar,3,DontAlign> Plane3u;
- EIGEN_ALIGN16 Scalar array1[4];
- EIGEN_ALIGN16 Scalar array2[4];
- EIGEN_ALIGN16 Scalar array3[4+1];
+ EIGEN_ALIGN_DEFAULT Scalar array1[4];
+ EIGEN_ALIGN_DEFAULT Scalar array2[4];
+ EIGEN_ALIGN_DEFAULT Scalar array3[4+1];
Scalar* array3u = array3+1;
Plane3a *p1 = ::new(reinterpret_cast<void*>(array1)) Plane3a;
@@ -146,7 +146,7 @@ template<typename Scalar> void hyperplane_alignment()
VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs());
#if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY
- if(internal::packet_traits<Scalar>::Vectorizable)
+ if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
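+    // (when the packet holds more than 4 scalars, e.g. 8 floats with AVX, a
+    // 4-scalar object carries no packet-alignment requirement, so constructing
+    // it at an unaligned address would not assert)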
VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Plane3a));
#endif
}
diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp
index f0462d40a..427327cd7 100644
--- a/test/geo_parametrizedline.cpp
+++ b/test/geo_parametrizedline.cpp
@@ -66,9 +66,9 @@ template<typename Scalar> void parametrizedline_alignment()
typedef ParametrizedLine<Scalar,4,AutoAlign> Line4a;
typedef ParametrizedLine<Scalar,4,DontAlign> Line4u;
- EIGEN_ALIGN16 Scalar array1[8];
- EIGEN_ALIGN16 Scalar array2[8];
- EIGEN_ALIGN16 Scalar array3[8+1];
+ EIGEN_ALIGN_DEFAULT Scalar array1[8];
+ EIGEN_ALIGN_DEFAULT Scalar array2[8];
+ EIGEN_ALIGN_DEFAULT Scalar array3[8+1];
Scalar* array3u = array3+1;
Line4a *p1 = ::new(reinterpret_cast<void*>(array1)) Line4a;
@@ -86,7 +86,7 @@ template<typename Scalar> void parametrizedline_alignment()
VERIFY_IS_APPROX(p1->direction(), p3->direction());
#if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY
- if(internal::packet_traits<Scalar>::Vectorizable)
+ if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Line4a));
#endif
}
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index 1694b32c7..de0f2aeda 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -181,9 +181,9 @@ template<typename Scalar> void mapQuaternion(void){
v1 = Vector3::Random();
Scalar a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
- EIGEN_ALIGN16 Scalar array1[4];
- EIGEN_ALIGN16 Scalar array2[4];
- EIGEN_ALIGN16 Scalar array3[4+1];
+ EIGEN_ALIGN_DEFAULT Scalar array1[4];
+ EIGEN_ALIGN_DEFAULT Scalar array2[4];
+ EIGEN_ALIGN_DEFAULT Scalar array3[4+1];
Scalar* array3unaligned = array3+1;
MQuaternionA mq1(array1);
@@ -232,9 +232,9 @@ template<typename Scalar> void quaternionAlignment(void){
typedef Quaternion<Scalar,AutoAlign> QuaternionA;
typedef Quaternion<Scalar,DontAlign> QuaternionUA;
- EIGEN_ALIGN16 Scalar array1[4];
- EIGEN_ALIGN16 Scalar array2[4];
- EIGEN_ALIGN16 Scalar array3[4+1];
+ EIGEN_ALIGN_DEFAULT Scalar array1[4];
+ EIGEN_ALIGN_DEFAULT Scalar array2[4];
+ EIGEN_ALIGN_DEFAULT Scalar array3[4+1];
Scalar* arrayunaligned = array3+1;
QuaternionA *q1 = ::new(reinterpret_cast<void*>(array1)) QuaternionA;
@@ -248,7 +248,7 @@ template<typename Scalar> void quaternionAlignment(void){
VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs());
VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs());
#if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY
- if(internal::packet_traits<Scalar>::Vectorizable)
+ if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(arrayunaligned)) QuaternionA));
#endif
}
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index ee3030b5d..7d9080333 100644
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -404,9 +404,9 @@ template<typename Scalar> void transform_alignment()
typedef Transform<Scalar,3,Projective,AutoAlign> Projective3a;
typedef Transform<Scalar,3,Projective,DontAlign> Projective3u;
- EIGEN_ALIGN16 Scalar array1[16];
- EIGEN_ALIGN16 Scalar array2[16];
- EIGEN_ALIGN16 Scalar array3[16+1];
+ EIGEN_ALIGN_DEFAULT Scalar array1[16];
+ EIGEN_ALIGN_DEFAULT Scalar array2[16];
+ EIGEN_ALIGN_DEFAULT Scalar array3[16+1];
Scalar* array3u = array3+1;
Projective3a *p1 = ::new(reinterpret_cast<void*>(array1)) Projective3a;
diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
index c18e687a5..5eba3ecb3 100644
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@@ -26,7 +26,7 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
Scalar* array1 = internal::aligned_new<Scalar>(size);
Scalar* array2 = internal::aligned_new<Scalar>(size);
Scalar* array3 = new Scalar[size+1];
- Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+ Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3;
Scalar array4[EIGEN_TESTMAP_MAX_SIZE];
Map<VectorType, Aligned>(array1, size) = VectorType::Random(size);
@@ -64,7 +64,7 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
for(int i = 0; i < size; i++) array2[i] = Scalar(1);
Scalar* array3 = new Scalar[size+1];
for(int i = 0; i < size+1; i++) array3[i] = Scalar(1);
- Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+ Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3;
Map<MatrixType, Aligned>(array1, rows, cols) = MatrixType::Ones(rows,cols);
Map<MatrixType>(array2, rows, cols) = Map<MatrixType>(array1, rows, cols);
Map<MatrixType>(array3unaligned, rows, cols) = Map<MatrixType>(array1, rows, cols);
@@ -90,7 +90,7 @@ template<typename VectorType> void map_static_methods(const VectorType& m)
Scalar* array1 = internal::aligned_new<Scalar>(size);
Scalar* array2 = internal::aligned_new<Scalar>(size);
Scalar* array3 = new Scalar[size+1];
- Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+ Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3;
VectorType::MapAligned(array1, size) = VectorType::Random(size);
VectorType::Map(array2, size) = VectorType::Map(array1, size);
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 2c0519c41..317b1c8fe 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -104,11 +104,12 @@ template<typename Scalar> void packetmath()
const int PacketSize = internal::packet_traits<Scalar>::size;
typedef typename NumTraits<Scalar>::Real RealScalar;
- const int size = PacketSize*4;
- EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
- EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
- EIGEN_ALIGN16 Packet packets[PacketSize*2];
- EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
+ const int max_size = PacketSize > 4 ? PacketSize : 4;
+ const int size = PacketSize*max_size;
+ EIGEN_ALIGN_DEFAULT Scalar data1[size];
+ EIGEN_ALIGN_DEFAULT Scalar data2[size];
+ EIGEN_ALIGN_DEFAULT Packet packets[PacketSize*2];
+ EIGEN_ALIGN_DEFAULT Scalar ref[size];
RealScalar refvalue = 0;
for (int i=0; i<size; ++i)
{
@@ -140,6 +141,10 @@ template<typename Scalar> void packetmath()
else if (offset==1) internal::palign<1>(packets[0], packets[1]);
else if (offset==2) internal::palign<2>(packets[0], packets[1]);
else if (offset==3) internal::palign<3>(packets[0], packets[1]);
+ else if (offset==4) internal::palign<4>(packets[0], packets[1]);
+ else if (offset==5) internal::palign<5>(packets[0], packets[1]);
+ else if (offset==6) internal::palign<6>(packets[0], packets[1]);
+ else if (offset==7) internal::palign<7>(packets[0], packets[1]);
internal::pstore(data2, packets[0]);
for (int i=0; i<PacketSize; ++i)
@@ -203,6 +208,18 @@ template<typename Scalar> void packetmath()
ref[i] = data1[PacketSize-i-1];
internal::pstore(data2, internal::preverse(internal::pload<Packet>(data1)));
VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse");
+
+ internal::Kernel<Packet> kernel;
+ for (int i=0; i<PacketSize; ++i) {
+ kernel.packet[i] = internal::pload<Packet>(data1+i*PacketSize);
+ }
+ ptranspose(kernel);
+ for (int i=0; i<PacketSize; ++i) {
+ internal::pstore(data2, kernel.packet[i]);
+ for (int j = 0; j < PacketSize; ++j) {
+ VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue));
+ }
+ }
}
template<typename Scalar> void packetmath_real()
@@ -212,9 +229,9 @@ template<typename Scalar> void packetmath_real()
const int PacketSize = internal::packet_traits<Scalar>::size;
const int size = PacketSize*4;
- EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
- EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
- EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
+ EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits<Scalar>::size*4];
+ EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits<Scalar>::size*4];
+ EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits<Scalar>::size*4];
for (int i=0; i<size; ++i)
{
@@ -257,9 +274,9 @@ template<typename Scalar> void packetmath_notcomplex()
typedef typename internal::packet_traits<Scalar>::type Packet;
const int PacketSize = internal::packet_traits<Scalar>::size;
- EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
- EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
- EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
+ EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits<Scalar>::size*4];
+ EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits<Scalar>::size*4];
+ EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits<Scalar>::size*4];
Array<Scalar,Dynamic,1>::Map(data1, internal::packet_traits<Scalar>::size*4).setRandom();
@@ -317,10 +334,10 @@ template<typename Scalar> void packetmath_complex()
const int PacketSize = internal::packet_traits<Scalar>::size;
const int size = PacketSize*4;
- EIGEN_ALIGN16 Scalar data1[PacketSize*4];
- EIGEN_ALIGN16 Scalar data2[PacketSize*4];
- EIGEN_ALIGN16 Scalar ref[PacketSize*4];
- EIGEN_ALIGN16 Scalar pval[PacketSize*4];
+ EIGEN_ALIGN_DEFAULT Scalar data1[PacketSize*4];
+ EIGEN_ALIGN_DEFAULT Scalar data2[PacketSize*4];
+ EIGEN_ALIGN_DEFAULT Scalar ref[PacketSize*4];
+ EIGEN_ALIGN_DEFAULT Scalar pval[PacketSize*4];
for (int i=0; i<size; ++i)
{
@@ -339,8 +356,38 @@ template<typename Scalar> void packetmath_complex()
internal::pstore(pval,internal::pcplxflip(internal::pload<Packet>(data1)));
VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip");
}
-
-
+}
+
+template<typename Scalar> void packetmath_scatter_gather() {
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ const int PacketSize = internal::packet_traits<Scalar>::size;
+ EIGEN_ALIGN_DEFAULT Scalar data1[PacketSize]; // pload/pstore below require an aligned buffer
+ RealScalar refvalue = 0;
+ for (int i=0; i<PacketSize; ++i) {
+ data1[i] = internal::random<Scalar>()/RealScalar(PacketSize);
+ }
+ Scalar buffer[PacketSize*11];
+ memset(buffer, 0, 11*sizeof(Packet));
+ Packet packet = internal::pload<Packet>(data1);
+ internal::pscatter<Scalar, Packet>(buffer, packet, 11);
+
+ for (int i = 0; i < PacketSize*11; ++i) {
+ if ((i%11) == 0) {
+ VERIFY(isApproxAbs(buffer[i], data1[i/11], refvalue));
+ } else {
+ VERIFY(isApproxAbs(buffer[i], Scalar(0), refvalue));
+ }
+ }
+
+ for (int i=0; i<PacketSize*7; ++i) {
+ buffer[i] = internal::random<Scalar>()/RealScalar(PacketSize);
+ }
+ packet = internal::pgather<Scalar, Packet>(buffer, 7);
+ internal::pstore(data1, packet);
+ for (int i = 0; i < PacketSize; ++i) {
+ VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue));
+ }
}
void test_packetmath()
@@ -361,5 +408,11 @@ void test_packetmath()
CALL_SUBTEST_1( packetmath_complex<std::complex<float> >() );
CALL_SUBTEST_2( packetmath_complex<std::complex<double> >() );
+
+ CALL_SUBTEST_1( packetmath_scatter_gather<float>() );
+ CALL_SUBTEST_2( packetmath_scatter_gather<double>() );
+ CALL_SUBTEST_3( packetmath_scatter_gather<int>() );
+ CALL_SUBTEST_3( packetmath_scatter_gather<std::complex<float> >() );
+ CALL_SUBTEST_3( packetmath_scatter_gather<std::complex<double> >() );
}
}
diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp
index ca7e159f3..d6ffeafdf 100644
--- a/test/unalignedcount.cpp
+++ b/test/unalignedcount.cpp
@@ -30,7 +30,14 @@ static int nb_storeu;
void test_unalignedcount()
{
- #ifdef EIGEN_VECTORIZE_SSE
+ #if defined(EIGEN_VECTORIZE_AVX)
+ VectorXf a(40), b(40);
+ VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0);
+ VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0);
+ VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 5, 5, 5, 0);
+ VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) *= 3.5, 5, 0, 5, 0);
+ VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 5, 0, 5, 0);
+ #elif defined(EIGEN_VECTORIZE_SSE)
VectorXf a(40), b(40);
VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 20, 0, 10, 0);
VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 10, 10, 10, 0);