From ef1fd341a895fda883f655102f371fa8b41f2088 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner
Date: Wed, 16 Jun 2021 08:49:22 -0500
Subject: EIGEN_STRONG_INLINE was NOT inlining in some critical areas (6.6X
 slowdown) when used with TensorFlow. Changing to EIGEN_ALWAYS_INLINE where
 appropriate.

---
 Eigen/src/Core/arch/AltiVec/MatrixProduct.h       | 74 +++++++++++------------
 Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h | 26 ++++----
 Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h    | 28 ++++-----
 3 files changed, 59 insertions(+), 69 deletions(-)

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index dbdb81ef1..4c5cf1762 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -113,7 +113,7 @@ const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15,
  * float32/64 and complex float32/64 version.
  **/
 template<typename Scalar, int StorageOrder>
-EIGEN_STRONG_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
+EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
 {
   std::complex<Scalar> v;
   if(i < j)
@@ -403,7 +403,7 @@ struct symm_pack_lhs
 
  **/
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
 {
   const Index size = 16 / sizeof(Scalar);
   pstore<Scalar>(to + (0 * size), block.packet[0]);
@@ -413,7 +413,7 @@ EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
 {
   const Index size = 16 / sizeof(Scalar);
   pstore<Scalar>(to + (0 * size), block.packet[0]);
@@ -992,7 +992,7 @@ struct dhs_cpack
 
 // 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm).
 template<typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& lhsV, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& lhsV, const Packet* rhsV)
 {
   if(NegativeAccumulate)
   {
@@ -1009,7 +1009,7 @@ EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& l
 }
 
 template<typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& lhsV, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& lhsV, const Packet* rhsV)
 {
   if(NegativeAccumulate)
   {
@@ -1020,7 +1020,7 @@ EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& l
 }
 
 template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
 {
   Packet lhsV = pload<Packet>(lhs);
 
@@ -1028,7 +1028,7 @@ EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, con
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
 {
 #ifdef _ARCH_PWR9
   lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
@@ -1041,7 +1041,7 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In
 }
 
 template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
 {
   Packet lhsV;
   loadPacketRemaining<Scalar, Packet>(lhs, lhsV, remaining_rows);
@@ -1051,7 +1051,7 @@ EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, con
 
 // 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real.
 template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
+EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
 {
   pger_common<Packet, false>(accReal, lhsV, rhsV);
   if(LhsIsReal)
@@ -1070,7 +1070,7 @@ EIGEN_STRONG_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBloc
 }
 
 template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
 {
   Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
   Packet lhsVi;
@@ -1081,7 +1081,7 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock
 }
 
 template<typename Scalar, typename Packet, bool LhsIsReal>
-EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
 {
 #ifdef _ARCH_PWR9
   lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar));
@@ -1098,7 +1098,7 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar
 }
 
 template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows)
 {
   Packet lhsV, lhsVi;
   loadPacketRemaining<Scalar, Packet, LhsIsReal>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);
@@ -1107,14 +1107,14 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs)
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs)
 {
   return *reinterpret_cast<Packet *>(const_cast<Scalar *>(lhs));
 }
 
 // Zero the accumulator on PacketBlock.
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
 {
   acc.packet[0] = pset1<Packet>((Scalar)0);
   acc.packet[1] = pset1<Packet>((Scalar)0);
@@ -1123,14 +1123,14 @@ EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,1>& acc)
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,1>& acc)
 {
   acc.packet[0] = pset1<Packet>((Scalar)0);
 }
 
 // Scale the PacketBlock vectors by alpha.
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
   acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
@@ -1139,13 +1139,13 @@ EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
   acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
@@ -1154,14 +1154,14 @@ EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
 }
 
 // Complex version of PacketBlock scaling.
 template<typename Packet, int N>
-EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
 {
   bscalec_common<Packet>(cReal, aReal, bReal);
 
@@ -1173,7 +1173,7 @@ EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
+EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
 {
   acc.packet[0] = pand(acc.packet[0], pMask);
   acc.packet[1] = pand(acc.packet[1], pMask);
@@ -1182,7 +1182,7 @@ EIGEN_STRONG_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag, const Packet& pMask)
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag, const Packet& pMask)
 {
   band<Packet>(aReal, pMask);
   band<Packet>(aImag, pMask);
@@ -1192,7 +1192,7 @@ EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock
 }
 
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col)
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col)
 {
   if (StorageOrder == RowMajor) {
     acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
@@ -1209,7 +1209,7 @@ EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res
 
 // An overload of bload when you have a PacketBLock with 8 vectors.
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col)
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col)
 {
   if (StorageOrder == RowMajor) {
     acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
@@ -1233,7 +1233,7 @@ EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res
 }
 
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,2>& acc, const DataMapper& res, Index row, Index col)
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,2>& acc, const DataMapper& res, Index row, Index col)
 {
   acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
   acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
@@ -1246,7 +1246,7 @@ const static Packet4i mask43 = { -1, -1, -1, 0 };
 const static Packet2l mask21 = { -1, 0 };
 
 template<typename Packet>
-EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows)
+EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
 {
   if (remaining_rows == 0) {
     return pset1<Packet>(float(0.0)); // Not used
@@ -1260,7 +1260,7 @@ EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows)
 }
 
 template<>
-EIGEN_STRONG_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
 {
   if (remaining_rows == 0) {
     return pset1<Packet2d>(double(0.0)); // Not used
@@ -1270,7 +1270,7 @@ EIGEN_STRONG_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
 {
   band<Packet>(accZ, pMask);
 
@@ -1278,13 +1278,13 @@ EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
 {
   pbroadcast4<Packet>(a, a0, a1, a2, a3);
 }
 
 template<>
-EIGEN_STRONG_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+EIGEN_ALWAYS_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
   a1 = pload<Packet2d>(a);
   a3 = pload<Packet2d>(a + 2);
@@ -1298,7 +1298,7 @@ EIGEN_STRONG_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0
 #define PEEL 7
 
 template<typename Scalar, typename Packet, typename Index>
-EIGEN_STRONG_INLINE void MICRO_EXTRA_COL(
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL(
   const Scalar* &lhs_ptr,
   const Scalar* &rhs_ptr,
   PacketBlock<Packet,1> &accZero,
@@ -1362,7 +1362,7 @@ EIGEN_STRONG_INLINE void gemm_extra_col(
 }
 
 template<typename Scalar, typename Packet, typename Index, const Index accRows>
-EIGEN_STRONG_INLINE void MICRO_EXTRA_ROW(
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
   const Scalar* &lhs_ptr,
   const Scalar* &rhs_ptr,
   PacketBlock<Packet,4> &accZero,
@@ -1565,7 +1565,6 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
   Index col,
   const Packet& pAlpha)
 {
-asm("#gemm begin");
   const Scalar* rhs_ptr = rhs_base;
   const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7;
   PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
@@ -1588,7 +1587,6 @@ asm("#gemm begin");
   MICRO_STORE
 
   row += unroll_factor*accCols;
-asm("#gemm end");
 }
 
 template
@@ -1789,7 +1787,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const
 #define PEEL_COMPLEX 3
 
 template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_COL(
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL(
   const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
   const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
   PacketBlock<Packet,1> &accReal, PacketBlock<Packet,1> &accImag,
@@ -1888,7 +1886,7 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_col(
 }
 
 template<typename Scalar, typename Packet, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW(
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
   const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
   const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
   PacketBlock<Packet,4> &accReal, PacketBlock<Packet,4> &accImag,
@@ -1924,7 +1922,6 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row(
   const Packet& pAlphaImag,
   const Packet& pMask)
 {
-asm("#gemm_complex begin");
   const Scalar* rhs_ptr_real = rhs_base;
   const Scalar* rhs_ptr_imag;
   if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
@@ -2001,7 +1998,6 @@ asm("#gemm_complex begin");
       }
     }
   }
-asm("#gemm_complex end");
 }
 
 #define MICRO_COMPLEX_UNROLL(func) \
@@ -2173,7 +2169,6 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
   const Packet& pAlphaReal,
   const Packet& pAlphaImag)
 {
-asm("#gemm_complex_unrolled begin");
   const Scalar* rhs_ptr_real = rhs_base;
   const Scalar* rhs_ptr_imag;
   if(!RhsIsReal) {
@@ -2211,7 +2206,6 @@ asm("#gemm_complex_unrolled begin");
   MICRO_COMPLEX_STORE
 
   row += unroll_factor*accCols;
-asm("#gemm_complex_unrolled end");
 }
 
 template

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index 6e74116b9..41b27bf3d 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -54,7 +54,7 @@ EIGEN_STRONG_INLINE void gemm_unrolled_col(
   const Packet& pAlpha);
 
 template<typename Packet>
-EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows);
+EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);
 
 template
 EIGEN_STRONG_INLINE void gemm_complex_extra_col(
@@ -107,19 +107,19 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
   const Packet& pAlphaImag);
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs);
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);
 
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col);
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col);
 
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col);
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col);
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha);
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha);
 
 template<typename Packet, int N>
-EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
 
 const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3,
                                                     16, 17, 18, 19,
@@ -141,7 +141,7 @@ const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14
 
 // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
 template<typename Packet, typename Packetc>
-EIGEN_STRONG_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,4>& acc1, PacketBlock<Packetc,4>& acc2)
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,4>& acc1, PacketBlock<Packetc,4>& acc2)
 {
   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
   acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST);
@@ -155,7 +155,7 @@ EIGEN_STRONG_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketB
 }
 
 template<typename Packet, typename Packetc>
-EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc,4>& acc1, PacketBlock<Packetc,4>& acc2)
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc,4>& acc1, PacketBlock<Packetc,4>& acc2)
 {
   bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
 
@@ -171,7 +171,7 @@ EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock
 }
 
 template<typename Packet, typename Packetc>
-EIGEN_STRONG_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,1>& acc1, PacketBlock<Packetc,1>& acc2)
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,1>& acc1, PacketBlock<Packetc,1>& acc2)
 {
   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
 
@@ -179,7 +179,7 @@ EIGEN_STRONG_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketB
 }
 
 template<typename Packet, typename Packetc>
-EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,2>& tRes, PacketBlock<Packetc,1>& acc1, PacketBlock<Packetc,1>& acc2)
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,2>& tRes, PacketBlock<Packetc,1>& acc1, PacketBlock<Packetc,1>& acc2)
 {
   bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
 
@@ -189,7 +189,7 @@ EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock
 }
 
 template<>
-EIGEN_STRONG_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd,4>& acc1, PacketBlock<Packet1cd,4>& acc2)
+EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd,4>& acc1, PacketBlock<Packet1cd,4>& acc2)
 {
   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
   acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST);
@@ -203,7 +203,7 @@ EIGEN_STRONG_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock
 }
 
 template<>
-EIGEN_STRONG_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,1>& taccReal, PacketBlock<Packet2d,1>& taccImag, PacketBlock<Packet1cd,1>& acc1, PacketBlock<Packet1cd,1>& acc2)
+EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,1>& taccReal, PacketBlock<Packet2d,1>& taccImag, PacketBlock<Packet1cd,1>& acc1, PacketBlock<Packet1cd,1>& acc2)
 {
   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
 
@@ -212,7 +212,7 @@ EIGEN_STRONG_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar* rhs)
+EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs)
 {
   return *reinterpret_cast<Packet *>(const_cast<Scalar *>(rhs));
 }

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 08855bd01..13d9517e4 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -24,13 +24,13 @@ namespace Eigen {
 
 namespace internal {
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void bsetzeroMMA(__vector_quad* acc)
+EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
 {
   __builtin_mma_xxsetaccz(acc);
 }
 
 template<typename DataMapper, typename Index, typename Packet, const Index accCols>
-EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
 {
   PacketBlock<Packet, 4> result;
   __builtin_mma_disassemble_acc(&result.packet, acc);
@@ -44,7 +44,7 @@ EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& da
 }
 
 template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC, int N>
-EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
+EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
 {
   PacketBlock<Packet, 4> resultReal, resultImag;
   __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
@@ -65,7 +65,7 @@ EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMap
 
 // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments
 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
 {
   if(NegativeAccumulate)
   {
@@ -76,7 +76,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const L
 }
 
 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
 {
   __vector_pair* a0 = (__vector_pair *)(&a.packet[0]);
   if(NegativeAccumulate)
@@ -88,7 +88,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock
 }
 
 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
 {
   if(NegativeAccumulate)
   {
@@ -99,13 +99,13 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, con
 }
 
 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&)
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&)
 {
   // Just for compilation
 }
 
 template<typename Scalar, typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi)
+EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi)
 {
   pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
   if(LhsIsReal) {
@@ -123,20 +123,20 @@ EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag
 
 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
 {
   rhsV = ploadRhs<Scalar, Packet>((const Scalar*)(rhs));
 }
 
 template<>
-EIGEN_STRONG_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs, PacketBlock<Packet2d, 2>& rhsV)
+EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs, PacketBlock<Packet2d, 2>& rhsV)
 {
   rhsV.packet[0] = ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ));
   rhsV.packet[1] = ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1));
 }
 
 template<>
-EIGEN_STRONG_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, __vector_pair& rhsV)
+EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, __vector_pair& rhsV)
 {
 #if EIGEN_COMP_LLVM
   __builtin_vsx_assemble_pair(&rhsV,
@@ -148,7 +148,7 @@ EIGEN_STRONG_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, _
 }
 
 template<>
-EIGEN_STRONG_INLINE void ploadRhsMMA(const float*, __vector_pair&)
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
 {
   // Just for compilation
 }
@@ -255,7 +255,6 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
   Index col,
   const Packet& pAlpha)
 {
-asm("#gemm_MMA begin");
   const Scalar* rhs_ptr = rhs_base;
   const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7;
   __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
@@ -277,7 +276,6 @@ asm("#gemm_MMA begin");
   MICRO_MMA_STORE
 
   row += unroll_factor*accCols;
-asm("#gemm_MMA end");
 }
 
 template
@@ -505,7 +503,6 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration(
   const Packet& pAlphaReal,
   const Packet& pAlphaImag)
 {
-asm("#gemm_complex_MMA begin");
   const Scalar* rhs_ptr_real = rhs_base;
   const Scalar* rhs_ptr_imag;
   if(!RhsIsReal) {
@@ -538,7 +535,6 @@ asm("#gemm_complex_MMA begin");
   MICRO_COMPLEX_MMA_STORE
 
   row += unroll_factor*accCols;
-asm("#gemm_complex_MMA end");
 }
 
 template
--
cgit v1.2.3
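For readers of this patch, the two macros differ in how strongly they push the compiler; a minimal sketch of their definitions, simplified from Eigen/src/Core/util/Macros.h (the exact forms vary by compiler and Eigen version):

// EIGEN_STRONG_INLINE is only a hint. On GCC and Clang it expands to plain
// `inline`, so the inliner's cost model may still reject a large function.
// Per the commit message above, that is what happened to these GEMM
// micro-kernels under TensorFlow (the 6.6X slowdown).
#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
  #define EIGEN_STRONG_INLINE __forceinline
#else
  #define EIGEN_STRONG_INLINE inline
#endif

// EIGEN_ALWAYS_INLINE is a demand. On GCC-compatible compilers the
// always_inline attribute forces inlining regardless of the cost model,
// or emits a diagnostic when inlining is impossible.
#if EIGEN_COMP_GNUC
  #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
#else
  #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
#endif

The deleted asm("#gemm ... begin/end") statements appear to be markers for locating the kernels in generated assembly; inline asm is opaque to the optimizer and can inhibit inlining and instruction scheduling around it, so removing them is consistent with the same goal.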