aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/NEON/PacketMath.h
diff options
context:
space:
mode:
author: Kan Chen <chenkan0734@163.com> 2020-05-28 20:26:09 +0800
committer: Kan Chen <chenkan5@huawei.com> 2020-05-29 00:33:45 +0000
commit 8d1302f566f70c29d0ba864972ab50460a96cf2d (patch)
tree 674936e743a34b74e596cf7df27c9467acd9884a /Eigen/src/Core/arch/NEON/PacketMath.h
parent 8719b9c5bc1a97e62d675c02495ed72dda6fae73 (diff)
Add support for PacketBlock<Packet8s,4> and PacketBlock<Packet16uc,4> ptranspose on NEON
Diffstat (limited to 'Eigen/src/Core/arch/NEON/PacketMath.h')
-rw-r--r-- Eigen/src/Core/arch/NEON/PacketMath.h 29
1 files changed, 29 insertions, 0 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index e11af1dca..065c8100f 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -2869,6 +2869,35 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4s, 4>& kernel)
kernel.packet[2] = vreinterpret_s16_u32(zip32_2.val[0]);
kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]);
}
+// Rearranges four Packet8s rows in place by interleaving them at 16-bit, then 32-bit granularity.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel)
+{  // Step 1: zip rows (0,1) and (2,3) lane by lane; val[0]/val[1] hold the low/high halves.
+ const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]);
+ const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]);
+ // Step 2: zip the row pairs as 32-bit units, giving lane groups {r0[i], r1[i], r2[i], r3[i]}.
+ const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0]));
+ const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1]));
+ // Store back: packet[j] ends up holding source columns 2j and 2j+1 of all four rows.
+ kernel.packet[0] = vreinterpretq_s16_u32(zip32_1.val[0]);
+ kernel.packet[1] = vreinterpretq_s16_u32(zip32_1.val[1]);
+ kernel.packet[2] = vreinterpretq_s16_u32(zip32_2.val[0]);
+ kernel.packet[3] = vreinterpretq_s16_u32(zip32_2.val[1]);
+}
+// Rearranges four Packet16uc rows in place by interleaving them at 8-bit, then 16-bit granularity.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel)
+{  // Step 1: zip rows (0,1) and (2,3) byte by byte; val[0]/val[1] hold the low/high halves.
+ const uint8x16x2_t zip8_1 = vzipq_u8(kernel.packet[0], kernel.packet[1]);
+ const uint8x16x2_t zip8_2 = vzipq_u8(kernel.packet[2], kernel.packet[3]);
+ // Step 2: zip the row pairs as 16-bit units, giving byte groups {r0[i], r1[i], r2[i], r3[i]}.
+ const uint16x8x2_t zip16_1 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[0]), vreinterpretq_u16_u8(zip8_2.val[0]));
+ const uint16x8x2_t zip16_2 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[1]), vreinterpretq_u16_u8(zip8_2.val[1]));
+ // Store back: packet[j] ends up holding source columns 4j..4j+3 of all four rows.
+ kernel.packet[0] = vreinterpretq_u8_u16(zip16_1.val[0]);
+ kernel.packet[1] = vreinterpretq_u8_u16(zip16_1.val[1]);
+ kernel.packet[2] = vreinterpretq_u8_u16(zip16_2.val[0]);
+ kernel.packet[3] = vreinterpretq_u8_u16(zip16_2.val[1]);
+}
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel)
{
const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]);