From 6d6df90c9a813287d18fe045b78a7f4e3996ee5f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Apr 2014 12:28:01 -0700 Subject: Implemented the pgather/pscatter packet primitives for the arm/NEON architecture --- Eigen/src/Core/arch/NEON/Complex.h | 16 ++++++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'Eigen/src/Core/arch') diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 7ca76714f..7b94733ab 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -111,6 +111,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); + res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); + res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); + return Packet2cf(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) +{ + to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); + to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); +} + template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 83150507a..7f3301f51 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -221,6 +221,40 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(from[0*stride], res, 0); + res = vsetq_lane_f32(from[1*stride], res, 1); + res = vsetq_lane_f32(from[2*stride], res, 2); + res = vsetq_lane_f32(from[3*stride], res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, int stride) +{ + Packet4i res; + res = vsetq_lane_s32(from[0*stride], res, 0); + res = vsetq_lane_s32(from[1*stride], res, 1); + res = vsetq_lane_s32(from[2*stride], res, 2); + res = vsetq_lane_s32(from[3*stride], res, 3); + return res; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, int stride) +{ + to[stride*0] = vgetq_lane_f32(from, 0); + to[stride*1] = vgetq_lane_f32(from, 1); + to[stride*2] = vgetq_lane_f32(from, 2); + to[stride*3] = vgetq_lane_f32(from, 3); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, int stride) +{ + to[stride*0] = vgetq_lane_s32(from, 0); + to[stride*1] = vgetq_lane_s32(from, 1); + to[stride*2] = vgetq_lane_s32(from, 2); + to[stride*3] = vgetq_lane_s32(from, 3); +} + template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ARM_PREFETCH(addr); } -- cgit v1.2.3