From ef1fd341a895fda883f655102f371fa8b41f2088 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner
Date: Wed, 16 Jun 2021 08:49:22 -0500
Subject: EIGEN_STRONG_INLINE was NOT inlining in some critically needed areas
 (6.6X slowdown) when used with TensorFlow. Changing to EIGEN_ALWAYS_INLINE
 where appropriate.

---
 Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 74 +++++++++++++---------------
 1 file changed, 34 insertions(+), 40 deletions(-)

(limited to 'Eigen/src/Core/arch/AltiVec/MatrixProduct.h')

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index dbdb81ef1..4c5cf1762 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -113,7 +113,7 @@ const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15,
  * float32/64 and complex float32/64 version.
  **/
 template<typename Scalar, int StorageOrder>
-EIGEN_STRONG_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
+EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
 {
   std::complex<Scalar> v;
   if(i < j)
@@ -403,7 +403,7 @@ struct symm_pack_lhs
  * and offset and behaves accordingly.
  **/
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
 {
   const Index size = 16 / sizeof(Scalar);
   pstore<Scalar>(to + (0 * size), block.packet[0]);
@@ -413,7 +413,7 @@ EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
 {
   const Index size = 16 / sizeof(Scalar);
   pstore<Scalar>(to + (0 * size), block.packet[0]);
@@ -992,7 +992,7 @@ struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conju
 
 // 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm).
 template<typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& lhsV, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& lhsV, const Packet* rhsV)
 {
   if(NegativeAccumulate)
   {
@@ -1009,7 +1009,7 @@ EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& l
 }
 
 template<typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& lhsV, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& lhsV, const Packet* rhsV)
 {
   if(NegativeAccumulate)
   {
@@ -1020,7 +1020,7 @@ EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& l
 }
 
 template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
 {
   Packet lhsV = pload<Packet>(lhs);
 
@@ -1028,7 +1028,7 @@ EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, con
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
 {
 #ifdef _ARCH_PWR9
   lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
@@ -1041,7 +1041,7 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In
 }
 
 template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
 {
   Packet lhsV;
   loadPacketRemaining<Scalar, Packet>(lhs, lhsV, remaining_rows);
@@ -1051,7 +1051,7 @@ EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, con
 // 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of
 // mixed types real * complex and complex * real.
 template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
+EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
 {
   pger_common<Packet, false>(accReal, lhsV, rhsV);
   if(LhsIsReal)
@@ -1070,7 +1070,7 @@ EIGEN_STRONG_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBloc
 }
 
 template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
 {
   Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
   Packet lhsVi;
@@ -1081,7 +1081,7 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packe
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
 {
 #ifdef _ARCH_PWR9
   lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar));
@@ -1098,7 +1098,7 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar
 }
 
 template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows)
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows)
 {
   Packet lhsV, lhsVi;
   loadPacketRemaining<Scalar, Packet>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);
@@ -1107,14 +1107,14 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packe
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs)
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs)
 {
   return *reinterpret_cast<Packet *>(const_cast<Scalar *>(lhs));
 }
 
 // Zero the accumulator on PacketBlock.
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
 {
   acc.packet[0] = pset1<Packet>((Scalar)0);
   acc.packet[1] = pset1<Packet>((Scalar)0);
@@ -1123,14 +1123,14 @@ EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
 }
 
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,1>& acc)
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,1>& acc)
 {
   acc.packet[0] = pset1<Packet>((Scalar)0);
 }
 
 // Scale the PacketBlock vectors by alpha.
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
   acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
@@ -1139,13 +1139,13 @@ EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmul(accZ.packet[0], pAlpha);
   acc.packet[1] = pmul(accZ.packet[1], pAlpha);
@@ -1154,14 +1154,14 @@ EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmul(accZ.packet[0], pAlpha);
 }
 
 // Complex version of PacketBlock scaling.
 template<typename Packet, int N>
-EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
 {
   bscalec_common<Packet, N>(cReal, aReal, bReal);
 
@@ -1173,7 +1173,7 @@ EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packe
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
+EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
 {
   acc.packet[0] = pand(acc.packet[0], pMask);
   acc.packet[1] = pand(acc.packet[1], pMask);
@@ -1182,7 +1182,7 @@ EIGEN_STRONG_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag, const Packet& pMask)
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag, const Packet& pMask)
 {
   band<Packet>(aReal, pMask);
   band<Packet>(aImag, pMask);
@@ -1192,7 +1192,7 @@ EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packe
 
 // Load a PacketBlock, the N parameters makes it highly dynamic.
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col)
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col)
 {
   if (StorageOrder == RowMajor) {
     acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
@@ -1209,7 +1209,7 @@ EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res
 
 // An overload of bload when you have a PacketBLock with 8 vectors.
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col)
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col)
 {
   if (StorageOrder == RowMajor) {
     acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
@@ -1233,7 +1233,7 @@ EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res
 }
 
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,2>& acc, const DataMapper& res, Index row, Index col)
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,2>& acc, const DataMapper& res, Index row, Index col)
 {
   acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
   acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
@@ -1246,7 +1246,7 @@ const static Packet4i mask43 = { -1, -1, -1, 0 };
 const static Packet2l mask21 = { -1, 0 };
 
 template<typename Packet>
-EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows)
+EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
 {
   if (remaining_rows == 0) {
     return pset1<Packet>(float(0.0)); // Not used
@@ -1260,7 +1260,7 @@ EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows)
 }
 
 template<>
-EIGEN_STRONG_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
 {
   if (remaining_rows == 0) {
     return pset1<Packet2d>(double(0.0)); // Not used
@@ -1270,7 +1270,7 @@ EIGEN_STRONG_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
 {
   band<Packet>(accZ, pMask);
 
@@ -1278,13 +1278,13 @@ EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
 {
   pbroadcast4(a, a0, a1, a2, a3);
 }
 
 template<>
-EIGEN_STRONG_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+EIGEN_ALWAYS_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
   a1 = pload<Packet2d>(a);
   a3 = pload<Packet2d>(a + 2);
@@ -1298,7 +1298,7 @@ EIGEN_STRONG_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0
 #define PEEL 7
 
 template<typename Scalar, typename Packet, typename Index>
-EIGEN_STRONG_INLINE void MICRO_EXTRA_COL(
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL(
   const Scalar* &lhs_ptr,
   const Scalar* &rhs_ptr,
   PacketBlock<Packet,1> &accZero,
@@ -1362,7 +1362,7 @@ EIGEN_STRONG_INLINE void gemm_extra_col(
 }
 
 template<typename Scalar, typename Packet, typename Index, const Index accRows>
-EIGEN_STRONG_INLINE void MICRO_EXTRA_ROW(
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
   const Scalar* &lhs_ptr,
   const Scalar* &rhs_ptr,
   PacketBlock<Packet,4> &accZero,
@@ -1565,7 +1565,6 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
   Index col,
   const Packet& pAlpha)
 {
-asm("#gemm begin");
   const Scalar* rhs_ptr = rhs_base;
   const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7;
   PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
@@ -1588,7 +1587,6 @@ asm("#gemm begin");
   MICRO_STORE
 
   row += unroll_factor*accCols;
-asm("#gemm end");
 }
 
 template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
@@ -1789,7 +1787,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const
 #define PEEL_COMPLEX 3
 
 template<typename Scalar, typename Packet, typename Index, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_COL(
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL(
   const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
   const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
   PacketBlock<Packet,1> &accReal, PacketBlock<Packet,1> &accImag,
@@ -1888,7 +1886,7 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_col(
 }
 
 template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW(
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
   const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
   const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
   PacketBlock<Packet,4> &accReal, PacketBlock<Packet,4> &accImag,
@@ -1924,7 +1922,6 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row(
   const Packet& pAlphaImag,
   const Packet& pMask)
 {
-asm("#gemm_complex begin");
   const Scalar* rhs_ptr_real = rhs_base;
   const Scalar* rhs_ptr_imag;
   if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
@@ -2001,7 +1998,6 @@ asm("#gemm_complex begin");
       }
     }
   }
-asm("#gemm_complex end");
 }
 
 #define MICRO_COMPLEX_UNROLL(func) \
@@ -2173,7 +2169,6 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
   const Packet& pAlphaReal,
   const Packet& pAlphaImag)
 {
-asm("#gemm_complex_unrolled begin");
   const Scalar* rhs_ptr_real = rhs_base;
   const Scalar* rhs_ptr_imag;
   if(!RhsIsReal) {
@@ -2211,7 +2206,6 @@ asm("#gemm_complex_unrolled begin");
   MICRO_COMPLEX_STORE
 
   row += unroll_factor*accCols;
-asm("#gemm_complex_unrolled end");
 }
 
 template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename RhsPacketc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
--
cgit v1.2.3
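
Why the one-word macro change matters: on MSVC and ICC, EIGEN_STRONG_INLINE expands to __forceinline, but on GCC and Clang — the compilers used for these PowerPC/AltiVec builds — it is plain `inline`, which is only a hint the optimizer may ignore inside the hot GEMM micro-kernels. EIGEN_ALWAYS_INLINE expands to `__attribute__((always_inline)) inline` on GCC/Clang and therefore forces inlining at every call site. Below is a minimal, self-contained sketch of that difference, using hypothetical MY_* macro names rather than Eigen's (the real, more elaborate definitions live in Eigen/src/Core/util/Macros.h):

  #include <cstdio>

  // Simplified stand-ins for Eigen's two inlining macros.
  #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
  #define MY_STRONG_INLINE __forceinline                          // forced on MSVC/ICC
  #else
  #define MY_STRONG_INLINE inline                                 // only a hint on GCC/Clang
  #endif

  #if defined(__GNUC__)
  #define MY_ALWAYS_INLINE __attribute__((always_inline)) inline  // forced on GCC/Clang
  #else
  #define MY_ALWAYS_INLINE MY_STRONG_INLINE
  #endif

  // With MY_STRONG_INLINE, GCC/Clang may leave this as an out-of-line call in a
  // hot loop; MY_ALWAYS_INLINE guarantees the body is expanded at each call site,
  // which is what small kernels like pger/bload/bscale above rely on for speed.
  MY_ALWAYS_INLINE double madd(double a, double b, double c) { return a * b + c; }

  int main() {
    double acc = 0.0;
    for (int i = 0; i < 8; ++i) acc = madd(1.5, (double)i, acc);
    std::printf("%f\n", acc);  // prints 42.000000
    return 0;
  }

Compiling a sketch like this with `g++ -O2 -S` and inspecting the assembly is a quick way to confirm whether a given function was actually inlined — the kind of check that exposed the 6.6X slowdown this patch addresses.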