From 64c49de7baf6b843739b170d61aebc744d943ba9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 5 May 2008 17:19:47 +0000 Subject: * split PacketMath.h into SSE- and Altivec-specific files * improved the flexibility of the new product implementation, now all sizes seem to be properly handled. --- Eigen/Core | 6 + Eigen/src/Core/PacketMath.h | 205 +++------------------------- Eigen/src/Core/PacketMath_Altivec.h | 113 ++++++++++++++++ Eigen/src/Core/PacketMath_SSE.h | 154 +++++++++++++++++++++ Eigen/src/Core/ProductWIP.h | 260 +++++++++++++++++++++++++++++++----- Eigen/src/Core/util/Macros.h | 6 + Eigen/src/Core/util/Meta.h | 3 + 7 files changed, 527 insertions(+), 220 deletions(-) create mode 100644 Eigen/src/Core/PacketMath_Altivec.h create mode 100644 Eigen/src/Core/PacketMath_SSE.h diff --git a/Eigen/Core b/Eigen/Core index cfe7a1059..999bb96f2 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -39,6 +39,12 @@ namespace Eigen { #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/PacketMath.h" +#if defined EIGEN_VECTORIZE_SSE +#include "src/Core/PacketMath_SSE.h" +#elif defined EIGEN_VECTORIZE_ALTIVEC +#include "src/Core/PacketMath_Altivec.h" +#endif + #include "src/Core/Functors.h" #include "src/Core/MatrixBase.h" #include "src/Core/Coeffs.h" diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h index cfa19eb6a..59c143ede 100644 --- a/Eigen/src/Core/PacketMath.h +++ b/Eigen/src/Core/PacketMath.h @@ -26,7 +26,7 @@ #define EIGEN_PACKET_MATH_H #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 #endif // Default implementation for types not supported by the vectorization. @@ -35,211 +35,46 @@ // called, TODO so should we raise an assertion or not?
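The pattern being split out here is simple: PacketMath.h keeps generic scalar fallbacks for the ei_p* primitives (packet size 1), while the new PacketMath_SSE.h and PacketMath_Altivec.h provide SIMD versions of the same names, selected by the #if block added to Eigen/Core above. A minimal standalone sketch of that idea, not Eigen code (plain overloads instead of the template<> specializations the patch uses, names invented for illustration, assuming GCC on an SSE-capable x86 target):

  #include <xmmintrin.h>  // SSE intrinsics
  #include <cstdio>

  // Generic fallback: the "packet" is just the scalar itself (packet size 1).
  template<typename Scalar>
  inline Scalar padd(const Scalar& a, const Scalar& b) { return a + b; }

  // SIMD version of the same name: the packet is a __m128 holding 4 floats.
  inline __m128 padd(const __m128& a, const __m128& b) { return _mm_add_ps(a, b); }

  int main()
  {
    float out[4] __attribute__((aligned(16)));
    __m128 p = padd(_mm_set1_ps(1.f), _mm_set1_ps(2.f)); // resolves to the SSE version
    _mm_store_ps(out, p);
    std::printf("packet: %g %g %g %g, scalar fallback: %g\n",
                out[0], out[1], out[2], out[3], padd(1.f, 2.f));
    return 0;
  }

Generic code written against these primitives compiles unchanged whether the packet type is a plain scalar or a __m128; the patch itself uses template<> specializations and overloads of the ei_* functions together with ei_packet_traits, which maps a scalar type to its packet type and size, and that is what lets the product kernel below be expressed once in terms of ei_pload, ei_pmadd and ei_predux.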
/** \internal \returns a + b (coeff-wise) */ template inline Scalar ei_padd(const Scalar& a, const Scalar& b) { return a + b; } + /** \internal \returns a - b (coeff-wise) */ template inline Scalar ei_psub(const Scalar& a, const Scalar& b) { return a - b; } + /** \internal \returns a * b (coeff-wise) */ template inline Scalar ei_pmul(const Scalar& a, const Scalar& b) { return a * b; } + /** \internal \returns a * b - c (coeff-wise) */ template inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c) { return ei_padd(ei_pmul(a, b),c); } + /** \internal \returns the min of \a a and \a b (coeff-wise) */ template inline Scalar ei_pmin(const Scalar& a, const Scalar& b) { return std::min(a,b); } + /** \internal \returns the max of \a a and \a b (coeff-wise) */ template inline Scalar ei_pmax(const Scalar& a, const Scalar& b) { return std::max(a,b); } + /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template inline Scalar ei_pload(const Scalar* from) { return *from; } + +/** \internal \returns a packet version of \a *from, (un-aligned load) */ +template inline Scalar ei_ploadu(const Scalar* from) { return *from; } + /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template inline Scalar ei_pset1(const Scalar& a) { return a; } + /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ template inline void ei_pstore(Scalar* to, const Scalar& from) { (*to) = from; } -/** \internal \returns the first element of a packet */ -template inline Scalar ei_pfirst(const Scalar& a) { return a; } -/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ -template inline Scalar ei_predux(const Scalar vecs[1]) { return vecs[0]; } -/** \internal \returns the sum of the elements of \a a*/ -template inline Scalar ei_predux(const Scalar& a) { return a; } - -#ifdef EIGEN_VECTORIZE_SSE - -#ifdef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#undef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 -#endif -template<> struct ei_packet_traits { typedef __m128 type; enum {size=4}; }; -template<> struct ei_packet_traits { typedef __m128d type; enum {size=2}; }; -template<> struct ei_packet_traits { typedef __m128i type; enum {size=4}; }; - -template<> inline __m128 ei_padd(const __m128& a, const __m128& b) { return _mm_add_ps(a,b); } -template<> inline __m128d ei_padd(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); } -template<> inline __m128i ei_padd(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); } - -template<> inline __m128 ei_psub(const __m128& a, const __m128& b) { return _mm_sub_ps(a,b); } -template<> inline __m128d ei_psub(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); } -template<> inline __m128i ei_psub(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); } - -template<> inline __m128 ei_pmul(const __m128& a, const __m128& b) { return _mm_mul_ps(a,b); } -template<> inline __m128d ei_pmul(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); } -template<> inline __m128i ei_pmul(const __m128i& a, const __m128i& b) -{ - return _mm_or_si128( - _mm_and_si128( - _mm_mul_epu32(a,b), - _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), - _mm_slli_si128( - _mm_and_si128( - _mm_mul_epu32(_mm_srli_si128(a,4),_mm_srli_si128(b,4)), - _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4)); -} - -// for some weird raisons, it has to be overloaded for packet integer -template<> inline __m128i 
ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); } - -template<> inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); } -template<> inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); } -// FIXME this vectorized min operator is likely to be slower than the standard one -template<> inline __m128i ei_pmin(const __m128i& a, const __m128i& b) -{ - __m128i mask = _mm_cmplt_epi32(a,b); - return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); -} - -template<> inline __m128 ei_pmax(const __m128& a, const __m128& b) { return _mm_max_ps(a,b); } -template<> inline __m128d ei_pmax(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); } -// FIXME this vectorized max operator is likely to be slower than the standard one -template<> inline __m128i ei_pmax(const __m128i& a, const __m128i& b) -{ - __m128i mask = _mm_cmpgt_epi32(a,b); - return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); -} - -inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); } -inline __m128d ei_pload(const double* from) { return _mm_load_pd(from); } -inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast(from)); } - -inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); } -inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); } -inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast(from)); } - -inline __m128 ei_pset1(const float& from) { return _mm_set1_ps(from); } -inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); } -inline __m128i ei_pset1(const int& from) { return _mm_set1_epi32(from); } - -inline void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); } -inline void ei_pstore(double* to, const __m128d& from) { _mm_store_pd(to, from); } -inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } - -inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); } -inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); } -inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } - -inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); } -inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); } -inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); } - -#ifdef __SSE3__ -// TODO implement SSE2 versions as well as integer versions -inline __m128 ei_predux(const __m128* vecs) -{ - return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); -} -inline __m128d ei_predux(const __m128d* vecs) -{ - return _mm_hadd_pd(vecs[0], vecs[1]); -} - -inline float ei_predux(const __m128& a) -{ - __m128 tmp0 = _mm_hadd_ps(a,a); - return ei_pfirst(_mm_hadd_ps(tmp0, tmp0)); -} - -inline double ei_predux(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); } -#endif +/** \internal copy the packet \a from to \a *to, (un-aligned store) */ +template inline void ei_pstoreu(Scalar* to, const Scalar& from) { (*to) = from; } -#elif defined(EIGEN_VECTORIZE_ALTIVEC) +/** \internal \returns the first element of a packet */ +template inline Scalar ei_pfirst(const Scalar& a) { return a; } -#ifdef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#undef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 -#endif +/** \internal \returns 
a packet where the element i contains the sum of the packet of \a vec[i] */ +// template inline Scalar ei_predux(const Scalar* vecs) { return vecs[0]; } -static const vector int v0i = vec_splat_u32(0); -static const vector int v16i_ = vec_splat_u32(-16); -static const vector float v0f = (vector float) v0i; - -template<> struct ei_packet_traits { typedef vector float type; enum {size=4}; }; -template<> struct ei_packet_traits { typedef vector int type; enum {size=4}; }; - -inline vector float ei_padd(const vector float a, const vector float b) { return vec_add(a,b); } -inline vector int ei_padd(const vector int a, const vector int b) { return vec_add(a,b); } - -inline vector float ei_psub(const vector float a, const vector float b) { return vec_sub(a,b); } -inline vector int ei_psub(const vector int a, const vector int b) { return vec_sub(a,b); } - -inline vector float ei_pmul(const vector float a, const vector float b) { return vec_madd(a,b, v0f); } -inline vector int ei_pmul(const vector int a, const vector int b) -{ - // Taken from http:// - - //Set up constants - vector int bswap, lowProduct, highProduct; - - //Do real work - bswap = vec_rl( (vector unsigned int)b, (vector unsigned int)v16i_ ); - lowProduct = vec_mulo( (vector short)a,(vector short)b ); - highProduct = vec_msum((vector short)a,(vector short)bswap, v0i); - highProduct = vec_sl( (vector unsigned int)highProduct, (vector unsigned int)v16i_ ); - return vec_add( lowProduct, highProduct ); -} - -inline vector float ei_pmadd(const vector float a, const vector float b, const vector float c) { return vec_madd(a, b, c); } - -inline vector float ei_pmin(const vector float a, const vector float b) { return vec_min(a,b); } -inline vector int ei_pmin(const vector int a, const vector int b) { return vec_min(a,b); } - -inline vector float ei_pmax(const vector float a, const vector float b) { return vec_max(a,b); } -inline vector int ei_pmax(const vector int a, const vector int b) { return vec_max(a,b); } - -inline vector float ei_pload(const float* from) { return vec_ld(0, from); } -inline vector int ei_pload(const int* from) { return vec_ld(0, from); } - -inline vector float ei_pset1(const float& from) -{ - static float __attribute__(aligned(16)) af[4]; - af[0] = from; - vector float vc = vec_ld(0, af); - vc = vec_splat(vc, 0); - return vc; -} - -inline vector int ei_pset1(const int& from) -{ - static int __attribute__(aligned(16)) ai[4]; - ai[0] = from; - vector int vc = vec_ld(0, ai); - vc = vec_splat(vc, 0); - return vc; -} - -inline void ei_pstore(float* to, const vector float from) { vec_st(from, 0, to); } -inline void ei_pstore(int* to, const vector int from) { vec_st(from, 0, to); } - -inline float ei_pfirst(const vector float a) -{ - static float __attribute__(aligned(16)) af[4]; - vec_st(a, 0, af); - return af[0]; -} - -inline int ei_pfirst(const vector int a) -{ - static int __attribute__(aligned(16)) ai[4]; - vec_st(a, 0, ai); - return ai[0]; -} - -#endif // EIGEN_VECTORIZE_ALTIVEC & SSE +/** \internal \returns the sum of the elements of \a a*/ +// template inline Scalar ei_predux(const Scalar& a) { return a; } #endif // EIGEN_PACKET_MATH_H diff --git a/Eigen/src/Core/PacketMath_Altivec.h b/Eigen/src/Core/PacketMath_Altivec.h new file mode 100644 index 000000000..eb702af8c --- /dev/null +++ b/Eigen/src/Core/PacketMath_Altivec.h @@ -0,0 +1,113 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. Eigen itself is part of the KDE project. 
+// +// Copyright (C) 2008 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_PACKET_MATH_ALTIVEC_H +#define EIGEN_PACKET_MATH_ALTIVEC_H + +#ifndef EIGEN_VECTORIZE_ALTIVEC +#error include PacketMath_Altivec without EIGEN_VECTORIZE_ALTIVEC +#endif + +#ifdef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#undef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#endif + +static const vector int v0i = vec_splat_u32(0); +static const vector int v16i_ = vec_splat_u32(-16); +static const vector float v0f = (vector float) v0i; + +template<> struct ei_packet_traits { typedef vector float type; enum {size=4}; }; +template<> struct ei_packet_traits { typedef vector int type; enum {size=4}; }; + +inline vector float ei_padd(const vector float a, const vector float b) { return vec_add(a,b); } +inline vector int ei_padd(const vector int a, const vector int b) { return vec_add(a,b); } + +inline vector float ei_psub(const vector float a, const vector float b) { return vec_sub(a,b); } +inline vector int ei_psub(const vector int a, const vector int b) { return vec_sub(a,b); } + +inline vector float ei_pmul(const vector float a, const vector float b) { return vec_madd(a,b, v0f); } +inline vector int ei_pmul(const vector int a, const vector int b) +{ + // Taken from http:// + + //Set up constants + vector int bswap, lowProduct, highProduct; + + //Do real work + bswap = vec_rl( (vector unsigned int)b, (vector unsigned int)v16i_ ); + lowProduct = vec_mulo( (vector short)a,(vector short)b ); + highProduct = vec_msum((vector short)a,(vector short)bswap, v0i); + highProduct = vec_sl( (vector unsigned int)highProduct, (vector unsigned int)v16i_ ); + return vec_add( lowProduct, highProduct ); +} + +inline vector float ei_pmadd(const vector float a, const vector float b, const vector float c) { return vec_madd(a, b, c); } + +inline vector float ei_pmin(const vector float a, const vector float b) { return vec_min(a,b); } +inline vector int ei_pmin(const vector int a, const vector int b) { return vec_min(a,b); } + +inline vector float ei_pmax(const vector float a, const vector float b) { return vec_max(a,b); } +inline vector int ei_pmax(const vector int a, const vector int b) { return vec_max(a,b); } + +inline vector float ei_pload(const float* from) { return vec_ld(0, from); } +inline vector int ei_pload(const int* from) { return vec_ld(0, from); } + +inline vector float ei_pset1(const float& from) +{ + static float __attribute__(aligned(16)) af[4]; + af[0] = from; + vector float vc = vec_ld(0, af); + vc = vec_splat(vc, 0); + return vc; +} + +inline vector int 
ei_pset1(const int& from) +{ + static int __attribute__(aligned(16)) ai[4]; + ai[0] = from; + vector int vc = vec_ld(0, ai); + vc = vec_splat(vc, 0); + return vc; +} + +inline void ei_pstore(float* to, const vector float from) { vec_st(from, 0, to); } +inline void ei_pstore(int* to, const vector int from) { vec_st(from, 0, to); } + +inline float ei_pfirst(const vector float a) +{ + static float __attribute__(aligned(16)) af[4]; + vec_st(a, 0, af); + return af[0]; +} + +inline int ei_pfirst(const vector int a) +{ + static int __attribute__(aligned(16)) ai[4]; + vec_st(a, 0, ai); + return ai[0]; +} + +#endif // EIGEN_PACKET_MATH_ALTIVEC_H + diff --git a/Eigen/src/Core/PacketMath_SSE.h b/Eigen/src/Core/PacketMath_SSE.h new file mode 100644 index 000000000..1872affea --- /dev/null +++ b/Eigen/src/Core/PacketMath_SSE.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. Eigen itself is part of the KDE project. +// +// Copyright (C) 2008 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . 
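A note on the emulated integer product that appears both in the SSE block removed above and again in the new PacketMath_SSE.h below: SSE2 has no packed 32x32->32 multiply (that only arrives with SSE4.1's _mm_mullo_epi32), so ei_pmul for __m128i builds it from two _mm_mul_epu32 calls on the even and odd lanes. A small standalone self-check of that trick, with invented names and assuming an SSE2 target:

  #include <emmintrin.h>  // SSE2 intrinsics
  #include <cstdio>

  // Lane-wise 32-bit product built from _mm_mul_epu32 on the even and odd lanes,
  // mirroring the __m128i ei_pmul of this patch (names invented for the check).
  inline __m128i pmul_epi32(__m128i a, __m128i b)
  {
    __m128i mask = _mm_setr_epi32(0xffffffff, 0, 0xffffffff, 0); // keep low 32 bits of each 64-bit product
    __m128i even = _mm_and_si128(_mm_mul_epu32(a, b), mask);     // products of lanes 0 and 2
    __m128i odd  = _mm_slli_si128(                               // products of lanes 1 and 3, shifted back in place
        _mm_and_si128(_mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), mask), 4);
    return _mm_or_si128(even, odd);
  }

  int main()
  {
    int av[4] = { 3, -7, 12345, 42 };
    int bv[4] = { 5,  9,  6789, -6 };
    int cv[4];
    __m128i c = pmul_epi32(_mm_loadu_si128((const __m128i*)av),
                           _mm_loadu_si128((const __m128i*)bv));
    _mm_storeu_si128((__m128i*)cv, c);
    for (int i = 0; i < 4; ++i)  // compare against plain scalar multiplication
      std::printf("%d * %d = %d (scalar: %d)\n", av[i], bv[i], cv[i], av[i] * bv[i]);
    return 0;
  }

The low 32 bits of the unsigned products coincide with the signed results modulo 2^32, which is why the comparison against plain int multiplication also holds for negative operands.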
+ +#ifndef EIGEN_PACKET_MATH_SSE_H +#define EIGEN_PACKET_MATH_SSE_H + +#ifndef EIGEN_VECTORIZE_SSE +#error include PacketMath_SSE without EIGEN_VECTORIZE_SSE +#endif + +template<> struct ei_packet_traits { typedef __m128 type; enum {size=4}; }; +template<> struct ei_packet_traits { typedef __m128d type; enum {size=2}; }; +template<> struct ei_packet_traits { typedef __m128i type; enum {size=4}; }; + +template<> inline __m128 ei_padd(const __m128& a, const __m128& b) { return _mm_add_ps(a,b); } +template<> inline __m128d ei_padd(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); } +template<> inline __m128i ei_padd(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); } + +template<> inline __m128 ei_psub(const __m128& a, const __m128& b) { return _mm_sub_ps(a,b); } +template<> inline __m128d ei_psub(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); } +template<> inline __m128i ei_psub(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); } + +template<> inline __m128 ei_pmul(const __m128& a, const __m128& b) { return _mm_mul_ps(a,b); } +template<> inline __m128d ei_pmul(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); } +template<> inline __m128i ei_pmul(const __m128i& a, const __m128i& b) +{ + return _mm_or_si128( + _mm_and_si128( + _mm_mul_epu32(a,b), + _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), + _mm_slli_si128( + _mm_and_si128( + _mm_mul_epu32(_mm_srli_si128(a,4),_mm_srli_si128(b,4)), + _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4)); +} + +// for some weird raisons, it has to be overloaded for packet integer +template<> inline __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); } + +template<> inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); } +template<> inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); } +// FIXME this vectorized min operator is likely to be slower than the standard one +template<> inline __m128i ei_pmin(const __m128i& a, const __m128i& b) +{ + __m128i mask = _mm_cmplt_epi32(a,b); + return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); +} + +template<> inline __m128 ei_pmax(const __m128& a, const __m128& b) { return _mm_max_ps(a,b); } +template<> inline __m128d ei_pmax(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); } +// FIXME this vectorized max operator is likely to be slower than the standard one +template<> inline __m128i ei_pmax(const __m128i& a, const __m128i& b) +{ + __m128i mask = _mm_cmpgt_epi32(a,b); + return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); +} + +inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); } +inline __m128d ei_pload(const double* from) { return _mm_load_pd(from); } +inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast(from)); } + +inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); } +inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); } +inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast(from)); } + +inline __m128 ei_pset1(const float& from) { return _mm_set1_ps(from); } +inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); } +inline __m128i ei_pset1(const int& from) { return _mm_set1_epi32(from); } + +inline void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); } +inline void ei_pstore(double* to, const __m128d& from) { _mm_store_pd(to, from); } +inline 
void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } + +inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); } +inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); } +inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } + +inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); } +inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); } +inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); } + +#ifdef __SSE3__ +// TODO implement SSE2 versions as well as integer versions +inline __m128 ei_predux(const __m128* vecs) +{ + return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); +} +inline __m128d ei_predux(const __m128d* vecs) +{ + return _mm_hadd_pd(vecs[0], vecs[1]); +} + +inline float ei_predux(const __m128& a) +{ + __m128 tmp0 = _mm_hadd_ps(a,a); + return ei_pfirst(_mm_hadd_ps(tmp0, tmp0)); +} + +inline double ei_predux(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); } +#else +// SSE2 versions +inline float ei_predux(const __m128& a) +{ + __m128 tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); + return ei_pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); +} +inline double ei_predux(const __m128d& a) +{ + return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); +} + +inline __m128 ei_predux(const __m128* vecs) +{ + __m128 tmp0, tmp1, tmp2; + tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); + tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]); + tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]); + tmp0 = _mm_add_ps(tmp0, tmp1); + tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]); + tmp1 = _mm_add_ps(tmp1, tmp2); + tmp2 = _mm_movehl_ps(tmp1, tmp0); + tmp0 = _mm_movelh_ps(tmp0, tmp1); + return _mm_add_ps(tmp0, tmp2); +} + +inline __m128d ei_predux(const __m128d* vecs) +{ + return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); +} +#endif // SSE3 + +#endif // EIGEN_PACKET_MATH_SSE_H + diff --git a/Eigen/src/Core/ProductWIP.h b/Eigen/src/Core/ProductWIP.h index a1c10d5d8..57bb899d6 100644 --- a/Eigen/src/Core/ProductWIP.h +++ b/Eigen/src/Core/ProductWIP.h @@ -193,6 +193,17 @@ template class Product : ei_no_assignm typedef typename ei_traits::_LhsNested _LhsNested; typedef typename ei_traits::_RhsNested _RhsNested; + enum { + PacketSize = ei_packet_traits::size, + #if (defined __i386__) + // i386 architectures provides only 8 xmmm register, + // so let's reduce the max number of rows processed at once + MaxBlockRows = 4, + #else + MaxBlockRows = 8, + #endif + }; + Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) { @@ -200,7 +211,18 @@ template class Product : ei_no_assignm } /** \internal */ - template void _cacheFriendlyEval(DestDerived& res) const; + template + void _cacheFriendlyEval(DestDerived& res) const; + + /** \internal */ + template + void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline)); + + /** \internal */ + template + void _cacheFriendlyEvalKernel(DestDerived& res, + int l2i, int l2j, int l2k, int l1i, + int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE; private: @@ -299,13 +321,134 @@ template template Derived& MatrixBase::lazyAssign(const Product& product) { - product._cacheFriendlyEval(*this); + product._cacheFriendlyEval(derived()); return derived(); } template template void Product::_cacheFriendlyEval(DestDerived& res) const +{ + const bool rhsIsAligned 
= (m_lhs.cols()%PacketSize == 0); + const bool resIsAligned = ((_rows()%PacketSize) == 0); + + if (rhsIsAligned && resIsAligned) + _cacheFriendlyEvalImpl(res); + else if (rhsIsAligned && (!resIsAligned)) + _cacheFriendlyEvalImpl(res); + else if ((!rhsIsAligned) && resIsAligned) + _cacheFriendlyEvalImpl(res); + else + _cacheFriendlyEvalImpl(res); + +} + +template +template +void Product::_cacheFriendlyEvalKernel(DestDerived& res, + int l2i, int l2j, int l2k, int l1i, + int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const +{ + asm("#eigen begin kernel"); + + ei_internal_assert(BlockRows<=8); + + // NOTE: sounds like we cannot rely on meta-unrolling to access dst[I] without enforcing GCC + // to create the dst's elements in memory, hence killing the performance. + + for(int l1j=l2j; l1j(k, l1j); + if (RhsAlignment==Aligned) + { + tmp = ei_pload(&m_rhs.data()[l1jsize + k]); + } + else + { + tmp = tmp1; + if (k+PacketSize=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+ PacketSize])), dst[1]); + if (BlockRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+2*PacketSize])), dst[2]); + if (BlockRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+3*PacketSize])), dst[3]); + if (BlockRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+4*PacketSize])), dst[4]); + if (BlockRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+5*PacketSize])), dst[5]); + if (BlockRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+6*PacketSize])), dst[6]); + if (BlockRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+7*PacketSize])), dst[7]); + } + + enum { + // Number of rows we can reduce per packet + PacketRows = (ResAlignment==Aligned && PacketSize>1) ? (BlockRows / PacketSize) : 0, + // First row index from which we have to to do redux once at a time + RemainingStart = PacketSize * PacketRows + }; + + // we have up to 4 packets (for doubles: 8 rows / 2) + if (PacketRows>=1) + res.template writePacketCoeff(l1i, l1j, + ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(&(dst[0])))); + if (PacketRows>=2) + res.template writePacketCoeff(l1i+PacketSize, l1j, + ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_predux(&(dst[PacketSize])))); + if (PacketRows>=3) + res.template writePacketCoeff(l1i+2*PacketSize, l1j, + ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_predux(&(dst[2*PacketSize])))); + if (PacketRows>=4) + res.template writePacketCoeff(l1i+3*PacketSize, l1j, + ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_predux(&(dst[3*PacketSize])))); + + // process the remaining rows one at a time + if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]); + if (RemainingStart<=1 && BlockRows>=2) res.coeffRef(l1i+1, l1j) += ei_predux(dst[1]); + if (RemainingStart<=2 && BlockRows>=3) res.coeffRef(l1i+2, l1j) += ei_predux(dst[2]); + if (RemainingStart<=3 && BlockRows>=4) res.coeffRef(l1i+3, l1j) += ei_predux(dst[3]); + if (RemainingStart<=4 && BlockRows>=5) res.coeffRef(l1i+4, l1j) += ei_predux(dst[4]); + if (RemainingStart<=5 && BlockRows>=6) res.coeffRef(l1i+5, l1j) += ei_predux(dst[5]); + if (RemainingStart<=6 && BlockRows>=7) res.coeffRef(l1i+6, l1j) += ei_predux(dst[6]); + if (RemainingStart<=7 && BlockRows>=8) res.coeffRef(l1i+7, l1j) += ei_predux(dst[7]); + + asm("#eigen end kernel"); + } +} + +template +template +void Product::_cacheFriendlyEvalImpl(DestDerived& res) const { // allow direct access to data for benchmark purpose const Scalar* __restrict__ a 
= m_lhs.derived().data(); @@ -316,21 +459,13 @@ void Product::_cacheFriendlyEval(DestDerived& res) const // then we don't need to clear res and avoid and additional mat-mat sum // res.setZero(); - const int ps = ei_packet_traits::size; // size of a packet - #if (defined __i386__) - // i386 architectures provides only 8 xmmm register, - // so let's reduce the max number of rows processed at once - const int bw = 4; // number of rows treated at once - #else - const int bw = 8; // number of rows treated at once - #endif - const int bs = ps * bw; // total number of elements treated at once + const int bs = PacketSize * MaxBlockRows; // total number of elements treated at once const int rows = _rows(); const int cols = _cols(); - const int size = m_lhs.cols(); // third dimension of the product + const int remainingSize = m_lhs.cols()%PacketSize; + const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries const int l2blocksize = 256 > _cols() ? _cols() : 256; - const bool rhsIsAligned = ((size%ps) == 0); - const bool resIsAligned = ((cols%ps) == 0); + // FIXME use calloca ?? (allocation on the stack) Scalar* __restrict__ block = new Scalar[l2blocksize*size]; // loops on each L2 cache friendly blocks of the result @@ -348,23 +483,23 @@ void Product::_cacheFriendlyEval(DestDerived& res) const { const int l2blockSizeEnd = std::min(l2k+l2blocksize, size); - for (int i = l2i; i0) { - for (int k=l2k; k::_cacheFriendlyEval(DestDerived& res) const for(int l2k=0; l2k( + res, l2i, l2j, l2k, l1i, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); +#if 0 for(int l1j=l2j; l1j::_cacheFriendlyEval(DestDerived& res) const asm("#eigen begincore"); for(int k=l2k; k(k, l1j); + // TODO make this branching compile time (costly for doubles) if (rhsIsAligned) tmp = ei_pload(&m_rhs.derived().data()[l1jsize + k]); else @@ -436,21 +574,61 @@ void Product::_cacheFriendlyEval(DestDerived& res) const } } - res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(dst))); - if (ps==2) - res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_predux(&(dst[2])))); - if (bw==8) +// if (resIsAligned) { - res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_predux(&(dst[4])))); + res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(dst))); if (ps==2) - res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_predux(&(dst[6])))); + res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_predux(&(dst[2])))); + if (bw==8) + { + res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_predux(&(dst[4])))); + if (ps==2) + res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_predux(&(dst[6])))); + } } +// else +// { +// // TODO uncommenting this code kill the perf, even though it is never called !! +// // TODO optimize this loop +// // TODO is it better to do one redux at once or packet reduxes + unaligned store ? 
+// for (int w = 0; w0) { + // this is an attempt to build an array of kernels, but I did not manage to get it compiles +// typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*); +// Kernel kernels[8]; +// kernels[0] = (Kernel)(&Product::template _cacheFriendlyEvalKernel); +// kernels[l2blockRowRemaining](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); + + switch(l2blockRowRemaining) + { + case 1:_cacheFriendlyEvalKernel( + res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; + case 2:_cacheFriendlyEvalKernel( + res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; + case 3:_cacheFriendlyEvalKernel( + res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; + case 4:_cacheFriendlyEvalKernel( + res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; + case 5:_cacheFriendlyEvalKernel( + res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; + case 6:_cacheFriendlyEvalKernel( + res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; + case 7:_cacheFriendlyEvalKernel( + res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; + default: + ei_internal_assert(false && "internal error"); break; + } + +#if 0 // TODO optimize this part using a generic templated function that processes N rows // here we process the remaining l2blockRowRemaining rows for(int l1j=l2j; l1j::_cacheFriendlyEval(DestDerived& res) const int l1jsize = l1j * size; - PacketScalar dst[bw]; + PacketScalar dst[MaxBlockRows]; dst[0] = ei_pset1(Scalar(0.)); for (int w = 1; w::_cacheFriendlyEval(DestDerived& res) const // TODO optimize this loop for (int w = 0; w::_cacheFriendlyEval(DestDerived& res) const asm("#eigen endcore dynamic"); } +#endif } } } } + // handle the part which cannot be processed by the vectorized path + if (remainingSize) + { + res += Product< + Block::type,Dynamic,Dynamic>, + Block::type,Dynamic,Dynamic>, + NormalProduct>( + m_lhs.block(0,size, _rows(), remainingSize), + m_rhs.block(size,0, remainingSize, _cols())).lazy(); + } + delete[] block; } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index be5e7bba5..613ab9617 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -84,6 +84,12 @@ using Eigen::MatrixBase; #define EIGEN_ALWAYS_INLINE #endif +#if (defined __GNUC__) +#define EIGEN_DONT_INLINE __attribute__((noinline)) +#else +#define EIGEN_DONT_INLINE +#endif + #if (defined __GNUC__) #define EIGEN_ALIGN_128 __attribute__ ((aligned(16))) #else diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 19768c1ca..264802b9b 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -189,6 +189,9 @@ template class ei_eval template struct ei_unref { typedef T type; }; template struct ei_unref { typedef T type; }; +template struct ei_unconst { typedef T type; }; +template struct ei_unconst { typedef T type; }; + template struct ei_is_temporary { enum { ret = 0 }; -- cgit v1.2.3
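The remainder handling added at the end of _cacheFriendlyEvalImpl rests on a block identity: with K the depth of the product and K0 = K - K%PacketSize the packet-aligned part, A*B equals A(:,0..K0)*B(0..K0,:) plus A(:,K0..K)*B(K0..K,:), so the vectorized kernel covers depth 0..K0 and the product of the leftover lhs columns with the leftover rhs rows is accumulated afterwards through the two Block expressions. A plain scalar check of that identity (standalone C++, nothing Eigen-specific, sizes chosen arbitrarily):

  #include <cstdio>

  int main()
  {
    const int M = 2, K = 5, N = 2, K0 = 4;  // K0: the packet-aligned part of the depth K
    double A[M][K] = { {1, 2, 3, 4, 5}, {6, 7, 8, 9, 10} };
    double B[K][N] = { {1, 0}, {0, 1}, {2, 3}, {4, 5}, {6, 7} };
    double C[M][N] = {}, Csplit[M][N] = {};

    // reference product over the full depth
    for (int i = 0; i < M; ++i)
      for (int j = 0; j < N; ++j)
        for (int k = 0; k < K; ++k)
          C[i][j] += A[i][k] * B[k][j];

    // split product: the "vectorized" depth 0..K0 first, then the remainder K0..K
    for (int i = 0; i < M; ++i)
      for (int j = 0; j < N; ++j)
      {
        for (int k = 0; k < K0; ++k)
          Csplit[i][j] += A[i][k] * B[k][j];
        for (int k = K0; k < K; ++k)
          Csplit[i][j] += A[i][k] * B[k][j];
      }

    for (int i = 0; i < M; ++i)
      for (int j = 0; j < N; ++j)
        std::printf("C[%d][%d] = %g, split = %g\n", i, j, C[i][j], Csplit[i][j]);
    return 0;
  }

The same split is what justifies clamping `size` to a packet boundary earlier in the function instead of requiring the third dimension of the product to be a multiple of the packet size.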