From 64a85800bd3573a7da7a396fde9707dce87a58d9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 29 Jan 2014 11:43:05 -0800 Subject: Added support for AVX to Eigen. --- CMakeLists.txt | 6 + Eigen/Core | 22 +- Eigen/Geometry | 4 +- Eigen/LU | 4 +- Eigen/src/Core/DenseStorage.h | 4 +- Eigen/src/Core/Redux.h | 13 +- Eigen/src/Core/arch/AVX/CMakeLists.txt | 6 + Eigen/src/Core/arch/AVX/Complex.h | 443 ++++++++++++++++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 423 ++++++++++++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 8 + Eigen/src/Core/products/GeneralMatrixMatrix.h | 6 +- Eigen/src/Core/products/GeneralMatrixVector.h | 11 +- Eigen/src/Core/util/Macros.h | 9 +- Eigen/src/Core/util/Memory.h | 38 +-- cmake/EigenTesting.cmake | 6 + test/packetmath.cpp | 35 +- test/unalignedcount.cpp | 9 +- 17 files changed, 993 insertions(+), 54 deletions(-) create mode 100644 Eigen/src/Core/arch/AVX/CMakeLists.txt create mode 100644 Eigen/src/Core/arch/AVX/Complex.h create mode 100644 Eigen/src/Core/arch/AVX/PacketMath.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a4731c8c1..16a9b5bcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,12 @@ if(NOT MSVC) message(STATUS "Enabling SSE4.2 in tests/examples") endif() + option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" ON) + if(EIGEN_TEST_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + message(STATUS "Enabling AVX in tests/examples") + endif() + option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF) if(EIGEN_TEST_ALTIVEC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") diff --git a/Eigen/Core b/Eigen/Core index 468ae0c76..bd20d5ac5 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -110,7 +110,13 @@ #ifdef __SSE4_2__ #define EIGEN_VECTORIZE_SSE4_2 #endif - + #ifdef __AVX__ + #define EIGEN_VECTORIZE_AVX + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif // include files // This extern "C" works around a MINGW-w64 compilation issue @@ -140,6 +146,9 @@ #ifdef EIGEN_VECTORIZE_SSE4_2 #include <nmmintrin.h> #endif + #ifdef EIGEN_VECTORIZE_AVX + #include <immintrin.h> + #endif #endif } // end extern "C" #elif defined __ALTIVEC__ @@ -209,7 +218,9 @@ namespace Eigen { inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_SSE4_2) +#if defined(EIGEN_VECTORIZE_AVX) + return "AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; #elif defined(EIGEN_VECTORIZE_SSE4_1) return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; @@ -287,7 +298,12 @@ using std::ptrdiff_t; #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" -#if defined EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_AVX + // Use AVX for floats and doubles, SSE for integers + #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/SSE/PacketMath.h" // For integers +#elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" diff --git a/Eigen/Geometry b/Eigen/Geometry index efd9d4504..f9bc6fc57 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -47,7 +47,9 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" - #if defined EIGEN_VECTORIZE_SSE + // Use the SSE optimized version whenever possible.
At the moment the + // SSE version doesn't compile when AVX is enabled + #if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX #include "src/Geometry/arch/Geometry_SSE.h" #endif #endif diff --git a/Eigen/LU b/Eigen/LU index db5795504..e5c3f32f7 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -27,7 +27,9 @@ #include "src/LU/Determinant.h" #include "src/LU/Inverse.h" -#if defined EIGEN_VECTORIZE_SSE +// Use the SSE optimized version whenever possible. At the moment the +// SSE version doesn't compile when AVX is enabled +#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX #include "src/LU/arch/Inverse_SSE.h" #endif diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index b9dd75ade..2342b08a1 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -83,7 +83,7 @@ struct plain_array template struct plain_array { - EIGEN_USER_ALIGN16 T array[Size]; + EIGEN_USER_ALIGN32 T array[Size]; EIGEN_DEVICE_FUNC plain_array() @@ -102,7 +102,7 @@ struct plain_array template struct plain_array { - EIGEN_USER_ALIGN16 T array[1]; + EIGEN_USER_ALIGN32 T array[1]; EIGEN_DEVICE_FUNC plain_array() {} EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index b2c775d90..5b82c9a65 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -303,10 +303,15 @@ struct redux_impl static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func) { eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); - if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(mat,func)); - return res; + if (VectorizedSize > 0) { + Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + if (VectorizedSize != Size) + res = func(res,redux_novec_unroller::run(mat,func)); + return res; + } + else { + return redux_novec_unroller::run(mat,func); + } } }; diff --git a/Eigen/src/Core/arch/AVX/CMakeLists.txt b/Eigen/src/Core/arch/AVX/CMakeLists.txt new file mode 100644 index 000000000..bdb71ab99 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_AVX_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h new file mode 100644 index 000000000..9fb44ecab --- /dev/null +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -0,0 +1,443 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
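+// AVX packet types for complex arithmetic: Packet4cf carries four std::complex<float> values in +// one __m256 register, and Packet2cd carries two std::complex<double> values in one __m256d.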
+ +#ifndef EIGEN_COMPLEX_AVX_H +#define EIGEN_COMPLEX_AVX_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet4cf +{ + EIGEN_STRONG_INLINE Packet4cf() {} + EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {} + __m256 v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet4cf type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=4}; }; + +template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) +{ + return Packet4cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) +{ + const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet4cf(_mm256_xor_ps(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) +{ + __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v); + __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + __m256 result = _mm256_addsub_ps(tmp1, tmp2); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) +{ + __m256 result; + for (int i = 0; i < 8; i+=2) { + result[i] = std::real(from); + result[i+1] = std::imag(from); + } + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) +{ + __m256 result; + for (int i = 0; i < 2; ++i) { + result[4*i] = std::real(from[i]); + result[4*i+1] = std::imag(from[i]); + result[4*i+2] = std::real(from[i]); + result[4*i+3] = std::imag(from[i]); + } + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + + +template<> EIGEN_STRONG_INLINE 
std::complex pfirst(const Packet4cf& a) +{ + return std::complex(a.v[0], a.v[1]); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { + __m256 result; + result[0] = a.v[6]; + result[1] = a.v[7]; + result[2] = a.v[4]; + result[3] = a.v[5]; + result[4] = a.v[2]; + result[5] = a.v[3]; + result[6] = a.v[0]; + result[7] = a.v[1]; + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cf& a) +{ + return std::complex(a.v[0]+a.v[2]+a.v[4]+a.v[6], a.v[1]+a.v[3]+a.v[5]+a.v[7]); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) +{ + __m256 result = _mm256_setzero_ps(); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 8; j+=2) { + result[2*i] += vecs[i].v[j]; + result[2*i+1] += vecs[i].v[j+1]; + } + } + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) +{ + std::complex result(a.v[0], a.v[1]); + for (int i = 2; i < 8; i+=2) { + result *= std::complex(a.v[i], a.v[i+1]); + } + return result; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) + { + if (Offset==0) return; + for (int i = 0; i < 4-Offset; ++i) + { + first.v[2*i] = first.v[2*(i+Offset)]; + first.v[2*i+1] = first.v[2*(i+Offset)+1]; + } + for (int i = 4-Offset; i < 4; ++i) + { + first.v[2*i] = second.v[2*(i-4+Offset)]; + first.v[2*i+1] = second.v[2*(i-4+Offset)+1]; + } + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const + { return Packet4cf(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const + { return Packet4cf(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) +{ + Packet4cf res; + for (int i = 0; i < 8; i+=2) { + std::complex result = std::complex(a.v[i], a.v[i+1]) / std::complex(b.v[i], b.v[i+1]); + res.v[i] = std::real(result); + res.v[i+1] = std::imag(result); + } + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) +{ + Packet4cf res; + for (int i = 0; i < 8; i+=2) { + res.v[i] = x.v[i+1]; + res.v[i+1] = x.v[i]; + } + return res; +} + +//---------- 
double ---------- +struct Packet2cd +{ + EIGEN_STRONG_INLINE Packet2cd() {} + EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {} + __m256d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cd type; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 2, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; + +template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) +{ + const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet2cd(_mm256_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) +{ + __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0), b.v); + // FIXME: _mm256_permute_pd(b.v, _MM_SHUFFLE2(1,0) won't work as expected, figure out an alternative. + __m256d op = {b.v[1], b.v[0], b.v[3], b.v[2]}; + __m256d tmp2 = _mm256_mul_pd(_mm256_permute_pd(a.v, 15), op); + __m256d result = _mm256_addsub_pd(tmp1, tmp2); + + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex& from) +{ + __m256d result; + for (int i = 0; i < 4; i+=2) { + result[i] = std::real(from); + result[i+1] = std::imag(from); + } + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) +{ + return std::complex(a.v[0],a.v[1]); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { + __m256d result; + result[0] = a.v[2]; + result[1] = a.v[3]; + result[2] = a.v[0]; + result[3] = a.v[1]; + return Packet2cd(result); 
+} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cd& a) +{ + return std::complex(a.v[0]+a.v[2], a.v[1]+a.v[3]); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) +{ + __m256d result = _mm256_setzero_pd(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; j+=2) { + result[2*i] += vecs[i].v[j]; + result[2*i+1] += vecs[i].v[j+1]; + } + } + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) +{ + return std::complex(a.v[0], a.v[1]) * std::complex(a.v[2], a.v[3]); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) + { + if (Offset==0) return; + first.v[0] = first.v[2]; + first.v[1] = first.v[3]; + first.v[2] = second.v[0]; + first.v[3] = second.v[1]; + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const + { return Packet2cd(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const + { return Packet2cd(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) +{ + Packet2cd res; + for (int i = 0; i < 4; i+=2) { + std::complex result = std::complex(a.v[i], a.v[i+1]) / std::complex(b.v[i], b.v[i+1]); + res.v[i] = std::real(result); + res.v[i+1] = std::imag(result); + } + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x) +{ + Packet2cd res; + for (int i = 0; i < 4; i+=2) { + res.v[i] = x.v[i+1]; + res.v[i+1] = x.v[i]; + } + return res; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX_H diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h new file mode 100644 index 000000000..244e63e74 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -0,0 +1,423 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_AVX_H +#define EIGEN_PACKET_MATH_AVX_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif + +typedef __m256 Packet8f; +typedef __m256i Packet8i; +typedef __m256d Packet4d; + +template<> struct is_arithmetic<__m256> { enum { value = true }; }; +template<> struct is_arithmetic<__m256i> { enum { value = true }; }; +template<> struct is_arithmetic<__m256d> { enum { value = true }; }; + +#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ + const Packet8f p8f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ + const Packet4d p4d_##NAME = pset1(X) + + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8f type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; + }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4d type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + + HasDiv = 1, + HasExp = 0 + }; +}; + +/* Proper support for integers is only provided by AVX2. In the meantime, we'll + use SSE instructions and packets to deal with integers. +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8i type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8 + }; +}; +*/ + +template<> struct unpacket_traits { typedef float type; enum {size=8}; }; +template<> struct unpacket_traits { typedef double type; enum {size=4}; }; +template<> struct unpacket_traits { typedef int type; enum {size=8}; }; + +template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } + +template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } + +template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) +{ + return _mm256_sub_ps(_mm256_set1_ps(0.0),a); +} +template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) +{ + return _mm256_sub_pd(_mm256_set1_pd(0.0),a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8f pmul(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } 
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/) +{ eigen_assert(false && "packet integer division is not supported by AVX"); + return pset1<Packet8i>(0); +} + +template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); } + +// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3} +template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) +{ + Packet8f tmp = ploadu<Packet8f>(from); + Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); + Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0)); + return _mm256_blend_ps(_mm256_permute2f128_ps(tmp1,tmp1,1),tmp2,15); +} +// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1} +template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) +{ + Packet4d tmp = ploadu<Packet4d>(from); + Packet4d tmp1 = _mm256_permute_pd(tmp,0); + Packet4d tmp2 = _mm256_permute_pd(tmp,3); + return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12); +} + +template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); } + +template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } + +template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) +{ + Packet8f pa = pset1<Packet8f>(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) +{ + Packet4d pa = pset1<Packet4d>(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) +{ + Packet8i pa = pset1<Packet8i>(a); + pstore(to, pa); +} + +template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) { + return _mm_cvtss_f32(_mm256_castps256_ps128(a)); +} +template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); +} +template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} + + +template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) +{ + __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); + return _mm256_permute2f128_ps(tmp, tmp, 1); +} +template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) +{ + __m256d tmp = _mm256_shuffle_pd(a,a,5); + return _mm256_permute2f128_pd(tmp, tmp, 1); + + // Equivalent alternative: swap the 128-bit lanes first, then the doubles within each lane: + // __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); + // return _mm256_permute_pd(swap_halves,5); +} + +// pabs should be ok +template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) +{ + const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm256_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) +{ + const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm256_and_pd(a,mask); +} + +// preduxp should be ok +// FIXME: why is this ok? why isn't the simple implementation working as expected?
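+// What the sequence below computes: each _mm256_hadd_ps pass sums adjacent pairs inside the two +// 128-bit lanes, the permute2f128/add step folds the two lanes together, and the final blends +// gather the eight per-input totals, so element i of the result holds the full sum of vecs[i].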
+template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) +{ + __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); + __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); + __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); + __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); + + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + return final; +} +template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) +{ + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + return _mm256_blend_pd(tmp0, tmp1, 0xC); +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) +{ + Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1)); + tmp0 = _mm256_hadd_ps(tmp0,tmp0); + return pfirst(_mm256_hadd_ps(tmp0, tmp0)); +} +template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) +{ + Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_hadd_pd(tmp0,tmp0)); +} + +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) +{ + Packet8f tmp; + tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) +{ + Packet4d tmp; + tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) +{ + float result = a[0]; + for (int i = 1; i < 8; ++i) { + if (a[i] < result) result = a[i]; + } + return result; +} +template<> EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) +{ + double result = a[0]; + for (int i = 1; i < 4; ++i) { + if (a[i] < result) result = a[i]; + } + return result; +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) +{ + float result = a[0]; + for (int i = 1; i < 8; ++i) { + if (a[i] > result) result = a[i]; + } + return result; +} + +template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) +{ + double result = a[0]; + for (int i = 1; i < 4; ++i) { + if (a[i] > result) result = a[i]; + } + return result; +} + + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) + { + if (Offset==1) + { + first = _mm256_blend_ps(first, second, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88); + } + else if (Offset==2) + { + first = _mm256_blend_ps(first, second, 3); + Packet8f tmp = 
_mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc); + } + else if (Offset==3) + { + first = _mm256_blend_ps(first, second, 7); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee); + } + else if (Offset==4) + { + first = _mm256_blend_ps(first, second, 15); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); + first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0)); + } + else if (Offset==5) + { + first = _mm256_blend_ps(first, second, 31); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0x88); + } + else if (Offset==6) + { + first = _mm256_blend_ps(first, second, 63); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_AVX_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f5a3dab52..e913af650 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -58,6 +58,9 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet4i p4i_##NAME = pset1(X) +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. 
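+// Under AVX, float and double are handled by the wider Packet8f and Packet4d types defined there, +// while the Packet4i specializations below stay in use: 256-bit integer support only arrives with AVX2.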
+#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits : default_packet_traits { typedef Packet4f type; @@ -87,6 +90,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1 }; }; +#endif template<> struct packet_traits : default_packet_traits { typedef Packet4i type; @@ -115,8 +119,10 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { re template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } +#endif template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } @@ -331,9 +337,11 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, vec2d_swizzle1(pa,0,0)); } +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3f5ffcf51..eb399a824 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -286,9 +286,9 @@ class gemm_blocking_space 4. + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) || LhsPacketSize > 4) { alignedSize = 0; alignedStart = 0; @@ -404,7 +405,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4. + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) || + (LhsPacketSize > 4)) { alignedSize = 0; alignedStart = 0; @@ -446,7 +449,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(tmp0); const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index debc04f3f..787f800b8 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -72,7 +72,11 @@ #endif #define EIGEN_ALIGN 0 #else - #define EIGEN_ALIGN 1 + #if !defined(EIGEN_DONT_VECTORIZE) && defined(__AVX__) + #define EIGEN_ALIGN 32 + #else + #define EIGEN_ALIGN 16 + #endif #endif // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. 
It takes into account both the user choice to explicitly disable @@ -281,13 +285,16 @@ #endif #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) #if EIGEN_ALIGN_STATICALLY #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16 +#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32 #else #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 +#define EIGEN_USER_ALIGN32 #endif #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index d177e8b5a..76bdb6cfc 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -32,7 +32,7 @@ // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed // quite safe, at least within the context of glibc, to equate 64-bit with LP64. #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ - && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) + && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN == 16) #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 @@ -42,14 +42,14 @@ // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup -#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) +#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN == 16) #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 #endif -#if defined(__APPLE__) \ - || defined(_WIN64) \ +#if (defined(__APPLE__) && (EIGEN_ALIGN == 16)) \ + || (defined(_WIN64) && (EIGEN_ALIGN == 16)) \ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED #define EIGEN_MALLOC_ALREADY_ALIGNED 1 @@ -73,7 +73,7 @@ #define EIGEN_HAS_POSIX_MEMALIGN 0 #endif -#ifdef EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX #define EIGEN_HAS_MM_MALLOC 1 #else #define EIGEN_HAS_MM_MALLOC 0 @@ -105,9 +105,9 @@ inline void throw_std_bad_alloc() */ inline void* handmade_aligned_malloc(std::size_t size) { - void *original = std::malloc(size+16); + void *original = std::malloc(size+EIGEN_ALIGN); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN); *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = if (ptr == 0) return handmade_aligned_malloc(size); void *original = *(reinterpret_cast(ptr) - 1); std::ptrdiff_t previous_offset = static_cast(ptr)-static_cast(original); - original = std::realloc(original,size+16); + original = std::realloc(original,size+EIGEN_ALIGN); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN); void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); @@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed() {} 
#endif -/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. +/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ inline void* aligned_malloc(size_t size) @@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size) #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); #elif EIGEN_HAS_POSIX_MEMALIGN - if(posix_memalign(&result, 16, size)) result = 0; + if(posix_memalign(&result, EIGEN_ALIGN, size)) result = 0; #elif EIGEN_HAS_MM_MALLOC - result = _mm_malloc(size, 16); + result = _mm_malloc(size, EIGEN_ALIGN); #elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) - result = _aligned_malloc(size, 16); + result = _aligned_malloc(size, EIGEN_ALIGN); #else result = handmade_aligned_malloc(size); #endif @@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) // implements _mm_malloc/_mm_free based on the corresponding _aligned_ // functions. This may not always be the case and we just try to be safe. #if defined(_MSC_VER) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN); #else result = generic_aligned_realloc(ptr,new_size,old_size); #endif #elif defined(_MSC_VER) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN); #else result = handmade_aligned_realloc(ptr,new_size,old_size); #endif @@ -607,9 +607,9 @@ template class aligned_stack_memory_handler * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. */ #ifdef EIGEN_ALLOCA - - #ifdef __arm__ - #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16) + // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled. 
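+ // In that case we over-allocate by EIGEN_ALIGN bytes, mask the address down to a multiple of + // EIGEN_ALIGN, and advance one full boundary: the result is aligned and stays inside the buffer.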
+ #if defined(__arm__) || EIGEN_ALIGN > 16 + #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN)) & ~(size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN) #else #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA #endif @@ -679,7 +679,7 @@ template class aligned_stack_memory_handler #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0))) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN==0))) /****************************************************************************/ diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index fdc166bf8..d1ea4b9d2 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -264,6 +264,12 @@ macro(ei_testing_print_summary) message(STATUS "SSE4.2: Using architecture defaults") endif() + if(EIGEN_TEST_AVX) + message(STATUS "AVX: ON") + else() + message(STATUS "AVX: Using architecture defaults") + endif() + if(EIGEN_TEST_ALTIVEC) message(STATUS "Altivec: ON") else() diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 2c0519c41..d7c336c22 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -104,11 +104,12 @@ template void packetmath() const int PacketSize = internal::packet_traits::size; typedef typename NumTraits::Real RealScalar; - const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Packet packets[PacketSize*2]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + const int max_size = PacketSize > 4 ? 
PacketSize : 4; + const int size = PacketSize*max_size; + EIGEN_ALIGN32 Scalar data1[size]; + EIGEN_ALIGN32 Scalar data2[size]; + EIGEN_ALIGN32 Packet packets[PacketSize*2]; + EIGEN_ALIGN32 Scalar ref[size]; RealScalar refvalue = 0; for (int i=0; i void packetmath() else if (offset==1) internal::palign<1>(packets[0], packets[1]); else if (offset==2) internal::palign<2>(packets[0], packets[1]); else if (offset==3) internal::palign<3>(packets[0], packets[1]); + else if (offset==4) internal::palign<4>(packets[0], packets[1]); + else if (offset==5) internal::palign<5>(packets[0], packets[1]); + else if (offset==6) internal::palign<6>(packets[0], packets[1]); + else if (offset==7) internal::palign<7>(packets[0], packets[1]); internal::pstore(data2, packets[0]); for (int i=0; i void packetmath_real() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar ref[internal::packet_traits::size*4]; for (int i=0; i void packetmath_notcomplex() typedef typename internal::packet_traits::type Packet; const int PacketSize = internal::packet_traits::size; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar ref[internal::packet_traits::size*4]; Array::Map(data1, internal::packet_traits::size*4).setRandom(); @@ -317,10 +322,10 @@ template void packetmath_complex() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[PacketSize*4]; - EIGEN_ALIGN16 Scalar data2[PacketSize*4]; - EIGEN_ALIGN16 Scalar ref[PacketSize*4]; - EIGEN_ALIGN16 Scalar pval[PacketSize*4]; + EIGEN_ALIGN32 Scalar data1[PacketSize*4]; + EIGEN_ALIGN32 Scalar data2[PacketSize*4]; + EIGEN_ALIGN32 Scalar ref[PacketSize*4]; + EIGEN_ALIGN32 Scalar pval[PacketSize*4]; for (int i=0; i