aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2018-11-23 10:25:19 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2018-11-23 10:25:19 +0100
commita4760548793811ee1accf8de05ff791a43d54be5 (patch)
tree7a993f1c1636e72567412ff95a524d636df4f08a /Eigen
parentc685fe98381cb0005ff4074d8b91b70559a89b1a (diff)
bug #1624: improve matrix-matrix product on ARM 64, 20% speedup
Diffstat (limited to 'Eigen')
-rw-r--r--Eigen/src/Core/products/GeneralBlockPanelKernel.h53
1 files changed, 44 insertions, 9 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index e7cab4720..b8b83c320 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target>
class gebp_traits;
@@ -347,7 +347,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
* cplx*real : unpack rhs to constant packets, ...
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
*/
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch>
class gebp_traits
{
public:
@@ -461,8 +461,8 @@ public:
};
-template<typename RealScalar, bool _ConjLhs>
-class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+template<typename RealScalar, bool _ConjLhs, int Arch>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch>
{
public:
typedef std::complex<RealScalar> LhsScalar;
@@ -597,8 +597,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
// return res;
// }
-template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs,Arch>
{
public:
typedef std::complex<RealScalar> Scalar;
@@ -746,8 +746,8 @@ protected:
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
-template<typename RealScalar, bool _ConjRhs>
-class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+template<typename RealScalar, bool _ConjRhs, int Arch>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch>
{
public:
typedef std::complex<RealScalar> Scalar;
@@ -852,7 +852,42 @@ protected:
conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
-/* optimized GEneral packed Block * packed Panel product kernel
+
+// The specialization below uses NEON vector types and intrinsics directly, so it
+// must only be compiled when NEON vectorization is actually enabled (i.e. when
+// <arm_neon.h> has been included), not merely when targeting ARM64.
+#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON)
+
+// gebp_traits specialization for float * float (no conjugation) on NEON.
+// It inherits everything from the Generic-architecture traits and overrides
+// only how rhs coefficients are loaded and how the multiply-accumulate is done:
+// the rhs is kept in a 64-bit float32x2_t register and multiplied "by lane"
+// instead of being broadcast to a full packet first.
+template<>
+struct gebp_traits <float, float, false, false,Architecture::NEON>
+ : gebp_traits<float,float,false,false,Architecture::Generic>
+{
+ // Two-float NEON vector; only lane 0 is consumed by madd() below.
+ typedef float32x2_t RhsPacket;
+
+ // Loads the four rhs packets starting at b+0, b+1, b+2 and b+3 respectively.
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ loadRhs(b+0, b0);
+ loadRhs(b+1, b1);
+ loadRhs(b+2, b2);
+ loadRhs(b+3, b3);
+ }
+
+ // Loads b[0] and b[1] into dest. Since madd() reads lane 0 only, b[1] is
+ // loaded but unused.
+ // NOTE(review): this reads one float past the coefficient actually needed;
+ // confirm the packed rhs buffer always has that extra element readable.
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = vld1_f32(b);
+ }
+
+ // Same as loadRhs(): no distinct "quad" load is needed for this packet type.
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ // c += a * b[lane 0], as a single fused multiply-add by lane.
+ // The last parameter is unused here and only kept for signature compatibility.
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const
+ {
+ c = vfmaq_lane_f32(c, a, b, 0);
+ }
+};
+
+#endif // EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON)
+
+/* optimized General packed Block * packed Panel product kernel
*
* Mixing type logic: C += A * B
* | A | B | comments