diff options
author | Ashutosh Sharma <ashutosh.sharma@amperecomputing.com> | 2021-02-10 10:21:37 -0800 |
---|---|---|
committer | Ashutosh Sharma <ashutosh.sharma@amperecomputing.com> | 2021-02-10 10:21:37 -0800 |
commit | 7eb07da538ecc1b8937bfb5dac0d071067728397 (patch) | |
tree | 1c9127795176a363f8a431a25bac6754397985a3 /Eigen/src/Core/arch/NEON/PacketMath.h | |
parent | 36200b7855580cd90801d07c8c538649a4e24554 (diff) |
loop less ptranspose
Diffstat (limited to 'Eigen/src/Core/arch/NEON/PacketMath.h')
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 31 |
1 files changed, 8 insertions, 23 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 1f34faae0..f038a8ffb 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -4511,31 +4511,16 @@ EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
 {
-  EIGEN_ALIGN16 Eigen::half in[4][8];
+  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
+  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
 
-  pstore<Eigen::half>(in[0], kernel.packet[0]);
-  pstore<Eigen::half>(in[1], kernel.packet[1]);
-  pstore<Eigen::half>(in[2], kernel.packet[2]);
-  pstore<Eigen::half>(in[3], kernel.packet[3]);
+  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
+  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
 
-  EIGEN_ALIGN16 Eigen::half out[4][8];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 4; ++i) {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][2 * i];
-    }
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j < 4; ++j) {
-      out[i][j + 4] = in[j][2 * i + 1];
-    }
-  }
-
-  kernel.packet[0] = pload<Packet8hf>(out[0]);
-  kernel.packet[1] = pload<Packet8hf>(out[1]);
-  kernel.packet[2] = pload<Packet8hf>(out[2]);
-  kernel.packet[3] = pload<Packet8hf>(out[3]);
+  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
+  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
+  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
+  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {