diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-07-07 16:37:20 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-07-07 16:37:20 +0200 |
commit | a2415388ef05154ca5f655a58694ce908e21213a (patch) | |
tree | c4afc8c9cfaede8dbd770e3d6e580322ad1b9f60 /Eigen | |
parent | 65257f6b29362bbd4b45faa6ba957d53096e2f11 (diff) |
optimized conjugate products for SSE3
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 24 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 18 |
2 files changed, 36 insertions, 6 deletions
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 5c35a84fd..4ecfc2f43 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -147,10 +147,14 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true>
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
   {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(a, ei_pconj(b));
+    #else
     const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
     return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
                                 _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
                                            ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+    #endif
   }
 };
@@ -161,10 +165,14 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false>
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
   {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(ei_pconj(a), b);
+    #else
     const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
     return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
                                 _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
                                                       ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+    #endif
   }
 };
@@ -175,10 +183,14 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true>
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
   {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pconj(ei_pmul(a, b));
+    #else
     const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
     return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
                                 _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
                                            ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+    #endif
   }
 };
@@ -300,10 +312,14 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, false,true>
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
   {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(a, ei_pconj(b));
+    #else
     const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
     return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask),
                                 _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
                                            ei_vec2d_swizzle1(b.v, 1, 0))));
+    #endif
   }
 };
@@ -314,10 +330,14 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,false>
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
   {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(ei_pconj(a), b);
+    #else
     const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
     return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
                                 _mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
                                                       ei_vec2d_swizzle1(b.v, 1, 0)), mask)));
+    #endif
   }
 };
@@ -328,10 +348,14 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,true>
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
   {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pconj(ei_pmul(a, b));
+    #else
     const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
     return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask),
                                 _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
                                            ei_vec2d_swizzle1(b.v, 1, 0))));
+    #endif
   }
 };
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 6a9402cba..2ae78c1e7 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -73,9 +73,6 @@ static void run(Index rows, Index cols, Index depth,
     ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
     ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-    if (ConjugateRhs)
-      alpha = ei_conj(alpha);
-
     typedef typename ei_packet_traits<Scalar>::type PacketType;
     typedef ei_product_blocking_traits<Scalar> Blocking;
@@ -83,9 +80,18 @@ static void run(Index rows, Index cols, Index depth,
     Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction
     //Index nc = blocking.nc(); // cache block size along the N direction
-    ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
-    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
+    // FIXME starting from SSE3, normal complex product cannot be optimized as well as
+    // conjugate product, therefore it is better to conjugate during the copies.
+    // With SSE2, this is the other way round.
+    ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs;
+    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr> gebp;
+
+// if (ConjugateRhs)
+// alpha = ei_conj(alpha);
+// ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
+// ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
+// ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
 #ifdef EIGEN_HAS_OPENMP
     if(info)