aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-04-23 18:22:10 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-04-23 18:22:10 -0700
commitccb4dec719c1dffbd3565497cd897abb1b9f730f (patch)
tree4a469e085ec21d31968f08e3a99e26c56d04bf2c /Eigen/src/Core
parent82b09fcb911c5a0734ccf441504269f33034194a (diff)
Created a NEON version of the ptranspose packet primitives
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r--Eigen/src/Core/arch/NEON/Complex.h8
-rw-r--r--Eigen/src/Core/arch/NEON/PacketMath.h23
2 files changed, 30 insertions, 1 deletions
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 7b94733ab..a668382e2 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -263,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
}
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2cf>& kernel) {
+ float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
+ kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
+ kernel.packet[1].v = tmp;
+}
+
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index c9b9e5e9b..6a6876e69 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -447,9 +447,30 @@ PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)
-
+
#undef PALIGN_NEON
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4f>& kernel) {
+ float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
+ float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
+
+ kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
+ kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
+ kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
+ kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4i>& kernel) {
+ int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
+ int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
+ kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
+ kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
+ kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
+ kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
+}
+
} // end namespace internal
} // end namespace Eigen