diff options
-rw-r--r-- | Eigen/Core | 1 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 144 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 10 | ||||
-rw-r--r-- | Eigen/src/Core/util/BlasUtil.h | 12 | ||||
-rw-r--r-- | bench/bench_gemm.cpp | 17 | ||||
-rw-r--r-- | test/packetmath.cpp | 41 |
6 files changed, 195 insertions, 30 deletions
diff --git a/Eigen/Core b/Eigen/Core index 5e3e0960a..3135d7530 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -221,6 +221,7 @@ using std::size_t; #if defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/SSE/Complex.h" #elif defined EIGEN_VECTORIZE_ALTIVEC #include "src/Core/arch/AltiVec/PacketMath.h" #elif defined EIGEN_VECTORIZE_NEON diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h new file mode 100644 index 000000000..ab8bf8b84 --- /dev/null +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -0,0 +1,144 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef EIGEN_COMPLEX_SSE_H +#define EIGEN_COMPLEX_SSE_H + +struct Packet2cf +{ + EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} + __m128 v; +}; + +typedef __m128d Packet1cd; + +template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_traits +{ + typedef Packet2cf type; enum {size=2}; +}; + +template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +{ + Packet2cf res; + res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + return Packet2cf(_mm_movelh_ps(res.v,res.v)); +} + +// template<> EIGEN_STRONG_INLINE Packet4f ei_plset<std::complex<float> >(const std::complex<float> & a) { } + +template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a) +{ + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); + return Packet2cf(_mm_xor_ps(a.v,mask)); + +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for SSE3 and 4 + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); + return Packet2cf(_mm_add_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xa0)), b.v), + _mm_xor_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xf5)), + _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xb1 ))), mask))); +} + +// template<> EIGEN_STRONG_INLINE Packet2cf ei_pmadd<Packet2cf>(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) +// 
{std::cerr << __LINE__ << "\n"; +// // TODO optimize it for SSE3 and 4 +// const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); +// return Packet2cf(_mm_add_ps(c.v, +// _mm_add_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xa0)), b.v), +// _mm_xor_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xf5)), +// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xb1 ))), mask)))); +// } + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for SSE3 and 4 + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + Packet2cf res(_mm_add_ps(_mm_mul_ps(a.v, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xa0))), + _mm_xor_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xb1)), + _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xf5 ))), mask))); + __m128 s = _mm_mul_ps(b.v,b.v); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf 
ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); } + +template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((float*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE std::complex<float> ei_pfirst<Packet2cf>(const Packet2cf& a) +{ + std::complex<float> res; + _mm_storel_pi((__m64*)&res, a.v); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(ei_preverse(_mm_castps_pd(a.v)))); } + +// template<> EIGEN_STRONG_INLINE Packet2cf ei_pabs(const Packet2cf& a) {} + +template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a) +{ + return ei_pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs) +{ + return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); +} + +template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux_mul<Packet2cf>(const Packet2cf& a) +{ + return ei_pfirst(ei_pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); +} + +template<int Offset> +struct ei_palign_impl<Offset,Packet2cf> +{ + EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = _mm_movehl_ps(first.v, first.v); + first.v = _mm_movelh_ps(first.v, second.v); + } + } +}; + +#endif // EIGEN_COMPLEX_SSE_H diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h 
b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 2c42ad5b6..4d9a09708 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -259,7 +259,7 @@ struct ei_gebp_kernel #ifndef EIGEN_HAS_FUSE_CJMADD PacketType T0; #endif - +EIGEN_ASM_COMMENT("mybegin"); A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); @@ -295,6 +295,7 @@ struct ei_gebp_kernel B0 = ei_pload(&blB[7*PacketSize]); CJMADD(A0,B0,C1,T0); CJMADD(A1,B0,C5,B0); +EIGEN_ASM_COMMENT("myend"); } else { @@ -302,7 +303,7 @@ struct ei_gebp_kernel #ifndef EIGEN_HAS_FUSE_CJMADD PacketType T0; #endif - +EIGEN_ASM_COMMENT("mybegin"); A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); @@ -361,6 +362,7 @@ struct ei_gebp_kernel CJMADD(A1,B2,C6,B2); CJMADD(A0,B3,C3,T0); CJMADD(A1,B3,C7,B3); +EIGEN_ASM_COMMENT("myend"); } blB += 4*nr*PacketSize; @@ -683,7 +685,9 @@ struct ei_gebp_kernel const Scalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0); + PacketType T0; + CJMADD(ei_pload(blA), ei_pload(blB), C0, T0); + //C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0); blB += PacketSize; blA += PacketSize; } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 8bcd8c95f..139ea73d2 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -140,6 +140,18 @@ struct ei_product_blocking_traits }; }; +template<typename Real> +struct ei_product_blocking_traits<std::complex<Real> > +{ + typedef std::complex<Real> Scalar; + typedef typename ei_packet_traits<Scalar>::type PacketType; + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + nr = 2, + mr = 2 * PacketSize + }; +}; + /* Helper class to analyze the factors of a Product expression. 
* In particular it allows to pop out operator-, scalar multiples, * and conjugate */ diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 4142236e9..0da87b583 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -10,7 +10,7 @@ using namespace std; using namespace Eigen; #ifndef SCALAR -#define SCALAR float +#define SCALAR std::complex<float> #endif typedef SCALAR Scalar; @@ -26,6 +26,8 @@ static float fone = 1; static float fzero = 0; static double done = 1; static double szero = 0; +static std::complex<float> cfone = 1; +static std::complex<float> cfzero = 0; static char notrans = 'N'; static char trans = 'T'; static char nonunit = 'N'; @@ -44,6 +46,17 @@ void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) c.data(),&ldc); } +void blas_gemm(const MatrixXcf& a, const MatrixXcf& b, MatrixXcf& c) +{ + int M = c.rows(); int N = c.cols(); int K = a.cols(); + int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + + cgemm_(&notrans,&notrans,&M,&N,&K,(float*)&cfone, + const_cast<float*>((const float*)a.data()),&lda, + const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone, + (float*)c.data(),&ldc); +} + void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); @@ -98,7 +111,7 @@ int main(int argc, char ** argv) } if(cache_size>0) - setCpuCacheSizes(cache_size,32*cache_size); + setCpuCacheSizes(cache_size,96*cache_size); int m = s; int n = s; diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9218b6736..f8fb044cd 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -108,16 +108,6 @@ struct packet_helper<false,Packet> #define REF_MUL(a,b) ((a)*(b)) #define REF_DIV(a,b) ((a)/(b)) -namespace std { - -template<> const complex<float>& min(const complex<float>& a, const complex<float>& b) -{ return a.real() < b.real() ? a : b; } - -template<> const complex<float>& max(const complex<float>& a, const complex<float>& b) -{ return a.real() < b.real() ?
b : a; } - -} - template<typename Scalar> void packetmath() { typedef typename ei_packet_traits<Scalar>::type Packet; @@ -176,9 +166,6 @@ template<typename Scalar> void packetmath() if (!ei_is_same_type<Scalar,int>::ret) CHECK_CWISE2(REF_DIV, ei_pdiv); #endif - CHECK_CWISE2(std::min, ei_pmin); - CHECK_CWISE2(std::max, ei_pmax); - CHECK_CWISE1(ei_abs, ei_pabs); CHECK_CWISE1(ei_negate, ei_pnegate); for (int i=0; i<PacketSize; ++i) @@ -198,16 +185,6 @@ template<typename Scalar> void packetmath() ref[0] *= data1[i]; VERIFY(ei_isApprox(ref[0], ei_predux_mul(ei_pload(data1))) && "ei_predux_mul"); - ref[0] = data1[0]; - for (int i=0; i<PacketSize; ++i) - ref[0] = std::min(ref[0],data1[i]); - VERIFY(ei_isApprox(ref[0], ei_predux_min(ei_pload(data1))) && "ei_predux_min"); - - ref[0] = data1[0]; - for (int i=0; i<PacketSize; ++i) - ref[0] = std::max(ref[0],data1[i]); - VERIFY(ei_isApprox(ref[0], ei_predux_max(ei_pload(data1))) && "ei_predux_max"); - for (int j=0; j<PacketSize; ++j) { ref[j] = 0; @@ -256,17 +233,31 @@ template<typename Scalar> void packetmath_real() } CHECK_CWISE1_IF(ei_packet_traits<Scalar>::HasLog, ei_log, ei_plog); CHECK_CWISE1_IF(ei_packet_traits<Scalar>::HasSqrt, ei_sqrt, ei_psqrt); + + ref[0] = data1[0]; + for (int i=0; i<PacketSize; ++i) + ref[0] = std::min(ref[0],data1[i]); + VERIFY(ei_isApprox(ref[0], ei_predux_min(ei_pload(data1))) && "ei_predux_min"); + + CHECK_CWISE2(std::min, ei_pmin); + CHECK_CWISE2(std::max, ei_pmax); + CHECK_CWISE1(ei_abs, ei_pabs); + + ref[0] = data1[0]; + for (int i=0; i<PacketSize; ++i) + ref[0] = std::max(ref[0],data1[i]); + VERIFY(ei_isApprox(ref[0], ei_predux_max(ei_pload(data1))) && "ei_predux_max"); } void test_packetmath() { for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_1( packetmath<float>() ); +// CALL_SUBTEST_1( packetmath<float>() ); CALL_SUBTEST_2( packetmath<double>() ); CALL_SUBTEST_3( packetmath<int>() ); CALL_SUBTEST_1( packetmath<std::complex<float> >() ); - CALL_SUBTEST_1( packetmath_real<float>() ); 
+// CALL_SUBTEST_1( packetmath_real<float>() ); CALL_SUBTEST_2( packetmath_real<double>() ); } } |