aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt3
-rw-r--r--Eigen/src/Core/arch/SSE/Complex.h22
-rw-r--r--Eigen/src/Core/arch/SSE/MathFunctions.h10
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h68
4 files changed, 63 insertions, 40 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 838a41b79..fb13769f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -199,9 +199,6 @@ if(NOT MSVC)
option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
if(EIGEN_TEST_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
- if(CMAKE_COMPILER_IS_GNUCXX)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
- endif()
message(STATUS "Enabling AVX in tests/examples")
endif()
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 715e5a13c..d0c080c4f 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -110,8 +110,8 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
@@ -145,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
@@ -235,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
- { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+ { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@@ -244,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
- { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+ { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -300,7 +300,7 @@ template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
{
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@@ -338,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
// FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@@ -437,7 +437,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
- { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+ { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@@ -446,7 +446,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
- { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+ { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@@ -459,7 +459,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
- return Packet1cd(preverse(x.v));
+ return Packet1cd(preverse(Packet2d(x.v)));
}
template<> EIGEN_DEVICE_FUNC inline void
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 09f74c651..8f78b3a6c 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
x = _mm_or_ps(x, p4f_half);
emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
- Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+ Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
/* part2:
if( x < SQRTHF ) {
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
} else { x = x - 1.0; }
*/
Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
- Packet4f tmp = _mm_and_ps(x, mask);
+ Packet4f tmp = pand(x, mask);
x = psub(x, p4f_1);
- e = psub(e, _mm_and_ps(p4f_1, mask));
+ e = psub(e, pand(p4f_1, mask));
x = padd(x, tmp);
Packet4f x2 = pmul(x,x);
@@ -167,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
- return pmul(y, _mm_castsi128_ps(emm0));
+ return pmul(y, Packet4f(_mm_castsi128_ps(emm0)));
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
@@ -241,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
emm0 = _mm_add_epi32(emm0, p4i_1023_0);
emm0 = _mm_slli_epi32(emm0, 20);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
- return pmul(x, _mm_castsi128_pd(emm0));
+ return pmul(x, Packet2d(_mm_castsi128_pd(emm0)));
}
/* evaluation of 4 sines at onces, using SSE2 intrinsics.
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 89dfa6975..e7e035f4e 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -22,9 +22,35 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
+#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER)
+// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
+// have overloads for both types without linking error.
+// One solution is to increase ABI version using -fabi-version=4 (or greater).
+// To workaround this inconvenince, we rather wrap 128bit types into the following helper
+// structure:
+// TODO disable this wrapper if abi-versio>=4, but to detect that without asking the user to define a macro?
+template<typename T>
+struct eigen_packet_wrapper
+{
+ EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+ EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+ EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+ EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+ EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+ m_val = v;
+ return *this;
+ }
+
+ T m_val;
+};
+typedef eigen_packet_wrapper<__m128> Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
typedef __m128 Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;
+#endif
template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
@@ -161,7 +187,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
- return psub(_mm_setr_epi32(0,0,0,0), a);
+ return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
}
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@@ -244,7 +270,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
#if defined(_MSC_VER)
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
@@ -262,7 +288,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
#else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and leads to invalid code
@@ -312,7 +338,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
- return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
#else
__m128d res;
res = _mm_load_sd((const double*)(from)) ;
@@ -331,13 +357,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
Packet4i tmp;
- tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+ tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
EIGEN_DEBUG_UNALIGNED_STORE
@@ -348,8 +374,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
_mm_storeh_pd((to+1), from);
#endif
}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
@@ -388,13 +414,13 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
Packet4f pa = _mm_set_ss(a);
- pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+ pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
Packet2d pa = _mm_set_sd(a);
- pstore(to, vec2d_swizzle1(pa,0,0));
+ pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}
#ifndef EIGEN_VECTORIZE_AVX
@@ -507,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f tmp0 = _mm_hadd_ps(a,a);
- return pfirst(_mm_hadd_ps(tmp0, tmp0));
+ return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
// SSSE3 version:
// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
@@ -553,7 +579,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
- return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+ return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -576,11 +602,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
- return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+ return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
- return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+ return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
@@ -596,17 +622,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
- return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+ return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
- return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+ return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+ return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it is seems this is the fastest way to implement it
// for GCC (eg., it does not like using std::min after the pstore !!)
@@ -622,17 +648,17 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
- return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+ return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
- return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+ return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+ return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it is seems this is the fastest way to implement it
// for GCC (eg., it does not like using std::min after the pstore !!)