about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/arch/NEON/PacketMath.h
diff options
context:
space:
mode:
author: Ashutosh Sharma <ashutosh.sharma@amperecomputing.com>, 2021-02-10 10:21:37 -0800
committer: Ashutosh Sharma <ashutosh.sharma@amperecomputing.com>, 2021-02-10 10:21:37 -0800
commit: 7eb07da538ecc1b8937bfb5dac0d071067728397 (patch)
tree: 1c9127795176a363f8a431a25bac6754397985a3 /Eigen/src/Core/arch/NEON/PacketMath.h
parent: 36200b7855580cd90801d07c8c538649a4e24554 (diff)
loop less ptranspose
Diffstat (limited to 'Eigen/src/Core/arch/NEON/PacketMath.h')
-rw-r--r-- Eigen/src/Core/arch/NEON/PacketMath.h | 31
1 files changed, 8 insertions, 23 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 1f34faae0..f038a8ffb 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -4511,31 +4511,16 @@ EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
{
- EIGEN_ALIGN16 Eigen::half in[4][8];
+ const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
+ const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
- pstore<Eigen::half>(in[0], kernel.packet[0]);
- pstore<Eigen::half>(in[1], kernel.packet[1]);
- pstore<Eigen::half>(in[2], kernel.packet[2]);
- pstore<Eigen::half>(in[3], kernel.packet[3]);
+ const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
+ const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
- EIGEN_ALIGN16 Eigen::half out[4][8];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 4; ++i) {
- EIGEN_UNROLL_LOOP
- for (int j = 0; j < 4; ++j) {
- out[i][j] = in[j][2 * i];
- }
- EIGEN_UNROLL_LOOP
- for (int j = 0; j < 4; ++j) {
- out[i][j + 4] = in[j][2 * i + 1];
- }
- }
-
- kernel.packet[0] = pload<Packet8hf>(out[0]);
- kernel.packet[1] = pload<Packet8hf>(out[1]);
- kernel.packet[2] = pload<Packet8hf>(out[2]);
- kernel.packet[3] = pload<Packet8hf>(out[3]);
+ kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
+ kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
+ kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
+ kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {