diff options
Diffstat (limited to 'Eigen/src/Core')
-rwxr-xr-x | Eigen/src/Core/GenericPacketMath.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 18 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 33 |
3 files changed, 52 insertions, 1 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 03e7f410c..82eeeed4a 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -223,7 +223,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu { return ploadu<Packet>(from); } template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/) -{ (*to) = from; } + { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ template<typename Scalar> inline void prefetch(const Scalar* addr) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index b92919b75..86b90b2ee 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -111,6 +111,24 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex< template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride) +{ + return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride) +{ + /* for (int i = 0; i < 2; i+=2) { + to[stride*i] = std::complex<float>(from.v[i], from.v[i+1]); + }*/ + to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); +} + template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 937f63f88..a98ca6bea 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -339,6 +339,39 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); } template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride) +{ + return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride) +{ + return _mm_set_pd(from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride) +{ + return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + } + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride) +{ + to[stride*0] = _mm_cvtss_f32(from); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride) +{ + to[stride*0] = _mm_cvtsd_f64(from); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride) +{ + to[stride*0] = _mm_cvtsi128_si32(from); + to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); +} + // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) { |