about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
diff options
context:
space:
mode:
author: Pedro Caldeira <pedro.caldeira@ibm.com> 2020-09-09 12:16:44 -0500
committer: Pedro Henrique Moreira Caldeira <pedro.caldeira@ibm.com> 2020-11-12 11:31:15 -0300
commit: c29935b323ffb0b903f640111f0a0b0440e94a2e (patch)
tree: 4695715859d2900ffa834eeedcf05aaaf966556d /Eigen/src/Core/arch/AltiVec/MatrixProduct.h
parent: b714dd9701752f4c3961d577d20055ed105154df (diff)
Add support for dynamic dispatch of MMA instructions for POWER 10
Diffstat (limited to 'Eigen/src/Core/arch/AltiVec/MatrixProduct.h')
-rw-r--r-- Eigen/src/Core/arch/AltiVec/MatrixProduct.h 451
1 files changed, 149 insertions, 302 deletions
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 57227e23b..b86367571 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -10,6 +10,19 @@
#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+#include "MatrixProductCommon.h"
+
+#if __GNUC__ > 10 || \
+ (__GNUC__ == 10 && (__GNUC_MINOR__ > 2 || \
+ (__GNUC_MINOR__ == 2 && \
+ __GNUC_PATCHLEVEL__ >= 1)))
+ #define ALTIVEC_MMA_SUPPORT
+#endif
+
+#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ #include "MatrixProductMMA.h"
+#endif
+
/**************************************************************************************************
* TODO *
* - Check StorageOrder on lhs_pack (the innermost second loop seems unvectorized when it could). *
@@ -26,18 +39,6 @@ namespace internal {
**************************/
const int QuadRegisterCount = 8;
-#ifdef __MMA__
-
-template<typename Packet>
-union Packetx2u
-{
- __vector_pair vectorpair;
- PacketBlock<Packet, 2> pair;
-};
-
-#endif
-
-
template<typename Scalar>
struct quad_traits
{
@@ -82,17 +83,6 @@ const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7,
12, 13, 14, 15,
20, 21, 22, 23,
28, 29, 30, 31};
-
-const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3,
- 16, 17, 18, 19,
- 4, 5, 6, 7,
- 20, 21, 22, 23};
-
-const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11,
- 24, 25, 26, 27,
- 12, 13, 14, 15,
- 28, 29, 30, 31};
-//[a,ai],[b,bi] = [a,b]
const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7,
16, 17, 18, 19, 20, 21, 22, 23};
@@ -100,14 +90,6 @@ const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7,
const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15,
24, 25, 26, 27, 28, 29, 30, 31};
-//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
-const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7,
- 16, 17, 18, 19, 20, 21, 22, 23};
-
-//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
-const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15,
- 24, 25, 26, 27, 28, 29, 30, 31};
-
/*********************************************
* Single precision real and complex packing *
* *******************************************/
@@ -1316,154 +1298,6 @@ struct rhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conju
* GEMM utils *
**************/
-// Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
-template<typename Packet, typename Packetc>
-EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
-{
- acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
- acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST);
- acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST);
- acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST);
-
- acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
- acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND);
- acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND);
- acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND);
-
- acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
- acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
- acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
- acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
-
- acc2.packet[0] = padd<Packetc>(tRes.packet[4], acc2.packet[0]);
- acc2.packet[1] = padd<Packetc>(tRes.packet[5], acc2.packet[1]);
- acc2.packet[2] = padd<Packetc>(tRes.packet[6], acc2.packet[2]);
- acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
-}
-
-template<>
-EIGEN_STRONG_INLINE void bcouple<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd,8>& tRes, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
-{
- acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
- acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST);
- acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST);
- acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST);
-
- acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
- acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND);
- acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND);
- acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND);
-
- acc1.packet[0] = padd<Packet1cd>(tRes.packet[0], acc1.packet[0]);
- acc1.packet[1] = padd<Packet1cd>(tRes.packet[1], acc1.packet[1]);
- acc1.packet[2] = padd<Packet1cd>(tRes.packet[2], acc1.packet[2]);
- acc1.packet[3] = padd<Packet1cd>(tRes.packet[3], acc1.packet[3]);
-
- acc2.packet[0] = padd<Packet1cd>(tRes.packet[4], acc2.packet[0]);
- acc2.packet[1] = padd<Packet1cd>(tRes.packet[5], acc2.packet[1]);
- acc2.packet[2] = padd<Packet1cd>(tRes.packet[6], acc2.packet[2]);
- acc2.packet[3] = padd<Packet1cd>(tRes.packet[7], acc2.packet[3]);
-}
-
-#ifdef __MMA__
-template<typename Packet>
-EIGEN_STRONG_INLINE PacketBlock<Packet,2> pmul (const PacketBlock<Packet,2>& a, const Packet& b)
-{
- PacketBlock<Packet,2> pb;
- pb.packet[0] = a.packet[0]*b;
- pb.packet[1] = a.packet[1]*b;
- return pb;
-}
-template<typename DataMapper, typename Index, typename Packet>
-EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad *acc)
-{
- PacketBlock<Packet, 4> result;
- __builtin_mma_disassemble_acc(&result.packet, acc);
-
- PacketBlock<Packet, 4> block;
- block.packet[0] = data.template loadPacket<Packet>(i, j + 0) + pmul<Packet>(alpha, result.packet[0]);
- block.packet[1] = data.template loadPacket<Packet>(i, j + 1) + pmul<Packet>(alpha, result.packet[1]);
- block.packet[2] = data.template loadPacket<Packet>(i, j + 2) + pmul<Packet>(alpha, result.packet[2]);
- block.packet[3] = data.template loadPacket<Packet>(i, j + 3) + pmul<Packet>(alpha, result.packet[3]);
-
- data.template storePacketBlock<Packet, 4>(i, j, block);
-}
-
-template<typename DataMapper, typename Index, typename Packet, typename Packetc, int N>
-EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad *accReal, __vector_quad *accImag, const int accColsC)
-{
- PacketBlock<Packet, 4> resultReal, resultImag;
- __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
- __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
-
- PacketBlock<Packet,4> taccReal, taccImag;
- taccReal.packet[0] = pmul<Packet>(resultReal.packet[0], alphaReal);
- taccReal.packet[1] = pmul<Packet>(resultReal.packet[1], alphaReal);
- taccReal.packet[2] = pmul<Packet>(resultReal.packet[2], alphaReal);
- taccReal.packet[3] = pmul<Packet>(resultReal.packet[3], alphaReal);
-
- taccImag.packet[0] = pmul<Packet>(resultImag.packet[0], alphaReal);
- taccImag.packet[1] = pmul<Packet>(resultImag.packet[1], alphaReal);
- taccImag.packet[2] = pmul<Packet>(resultImag.packet[2], alphaReal);
- taccImag.packet[3] = pmul<Packet>(resultImag.packet[3], alphaReal);
-
- taccReal.packet[0] = psub<Packet>(taccReal.packet[0], pmul<Packet>(resultImag.packet[0], alphaImag));
- taccReal.packet[1] = psub<Packet>(taccReal.packet[1], pmul<Packet>(resultImag.packet[1], alphaImag));
- taccReal.packet[2] = psub<Packet>(taccReal.packet[2], pmul<Packet>(resultImag.packet[2], alphaImag));
- taccReal.packet[3] = psub<Packet>(taccReal.packet[3], pmul<Packet>(resultImag.packet[3], alphaImag));
-
- taccImag.packet[0] = pmadd<Packet>(resultReal.packet[0], alphaImag, taccImag.packet[0]);
- taccImag.packet[1] = pmadd<Packet>(resultReal.packet[1], alphaImag, taccImag.packet[1]);
- taccImag.packet[2] = pmadd<Packet>(resultReal.packet[2], alphaImag, taccImag.packet[2]);
- taccImag.packet[3] = pmadd<Packet>(resultReal.packet[3], alphaImag, taccImag.packet[3]);
-
- PacketBlock<Packetc, 8> tRes;
- tRes.packet[0] = data.template loadPacket<Packetc>(i + N*accColsC, j + 0);
- tRes.packet[1] = data.template loadPacket<Packetc>(i + N*accColsC, j + 1);
- tRes.packet[2] = data.template loadPacket<Packetc>(i + N*accColsC, j + 2);
- tRes.packet[3] = data.template loadPacket<Packetc>(i + N*accColsC, j + 3);
-
- tRes.packet[4] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 0);
- tRes.packet[5] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 1);
- tRes.packet[6] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 2);
- tRes.packet[7] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 3);
-
- PacketBlock<Packetc, 4> acc1, acc2;
- bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
-
- data.template storePacketBlock<Packetc, 4>(i + N*accColsC, j, acc1);
- data.template storePacketBlock<Packetc, 4>(i + (N+1)*accColsC, j, acc2);
-}
-
-// Defaults to float32, since Eigen still supports C++03 we can't use default template arguments
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(__vector_quad *acc, const RhsPacket& a, const LhsPacket& b)
-{
- if(NegativeAccumulate)
- {
- __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
- } else {
- __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
- }
-}
-
-template<>
-EIGEN_STRONG_INLINE void pger<Packet2d, PacketBlock<Packet2d, 2>, false>(__vector_quad *acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
-{
- Packetx2u<Packet2d> p;
- p.pair = a;
- __builtin_mma_xvf64gerpp(acc, p.vectorpair, (__vector unsigned char)b);
-}
-
-template<>
-EIGEN_STRONG_INLINE void pger<Packet2d, PacketBlock<Packet2d, 2>, true>(__vector_quad *acc, const PacketBlock<Packet2d, 2>& a, const Packet2d& b)
-{
- Packetx2u<Packet2d> p;
- p.pair = a;
- __builtin_mma_xvf64gernp(acc, p.vectorpair, (__vector unsigned char)b);
-}
-#else
-
// 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm).
template<typename Scalar, typename Packet, bool NegativeAccumulate>
EIGEN_STRONG_INLINE void pger(PacketBlock<Packet, 4> *acc, const Scalar* lhs, const Scalar* rhs)
@@ -1561,25 +1395,6 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet, 4>& accReal, PacketBlock<Pack
accImag.packet[3] = pmadd<Packet>(rhsV4, lhsVi, accImag.packet[3]);
}
}
-#endif
-
-// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
-template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar *rhs)
-{
- return *((Packet *)rhs);
-}
-
-#ifdef __MMA__
-template<>
-EIGEN_STRONG_INLINE PacketBlock<Packet2d, 2> ploadRhs<double, PacketBlock<Packet2d, 2> >(const double *rhs)
-{
- PacketBlock<Packet2d, 2> pair;
- pair.packet[0] = *((Packet2d *)rhs );
- pair.packet[1] = *(((Packet2d *)rhs) + 1);
- return pair;
-}
-#endif
template<typename Scalar, typename Packet>
EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar *lhs)
@@ -1587,7 +1402,6 @@ EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar *lhs)
return *((Packet *)lhs);
}
-#ifndef __MMA__
// Zero the accumulator on PacketBlock.
template<typename Scalar, typename Packet>
EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
@@ -1656,7 +1470,7 @@ EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res
acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
}
-#endif
+
// PEEL loop factor.
#define PEEL 10
@@ -1682,31 +1496,6 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const
const Scalar *lhs_base = blockA;
Index row = 0;
-#ifdef __MMA__
- for(; row + accCols <= rows; row += accCols)
- {
- const Scalar *rhs_ptr = rhs_base;
- const Scalar *lhs_ptr1 = lhs_base + (row/accCols)*strideA*accCols;
-
- __vector_quad acc;
- __builtin_mma_xxsetaccz(&acc);
-
- lhs_ptr1 += accCols*offsetA;
- rhs_ptr += accRows*offsetB;
- for(Index k = 0; k < depth; k++)
- {
- Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr1);
- RhsPacket rhsV = ploadRhs<Scalar, RhsPacket>(rhs_ptr);
-
- pger<Packet, RhsPacket, false>(&acc, rhsV, lhsV);
-
- lhs_ptr1 += accCols;
- rhs_ptr += accRows;
- }
-
- storeAccumulator<DataMapper, Index, Packet>(row, col, res, pAlpha, &acc);
- }
-#else
for(; row + 6*accCols <= rows; row += 6*accCols)
{
#define MICRO() \
@@ -2135,7 +1924,6 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const
res.template storePacketBlock<Packet, 4>(row, col, acc1);
#undef MICRO
}
-#endif
if(remaining_rows > 0)
{
const Scalar *rhs_ptr = rhs_base;
@@ -2239,60 +2027,6 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl
const Scalar *lhs_base = blockA;
Index row = 0;
-#ifdef __MMA__
- for(; row + accCols <= rows; row += accCols)
- {
- const Scalar *rhs_ptr = rhs_base;
- const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
- const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
- const Scalar *lhs_ptr_imag = lhs_ptr + accCols*strideA;
-
- __vector_quad accReal, accImag;
- __builtin_mma_xxsetaccz(&accReal);
- __builtin_mma_xxsetaccz(&accImag);
-
- lhs_ptr += accCols*offsetA;
- if(!LhsIsReal)
- lhs_ptr_imag += accCols*offsetA;
- rhs_ptr += accRows*offsetB;
- if(!RhsIsReal)
- rhs_ptr_imag += accRows*offsetB;
- for(Index k = 0; k < depth; k++)
- {
- Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
- RhsPacket rhsV = ploadRhs<Scalar, RhsPacket>(rhs_ptr);
-
- Packet lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
- RhsPacket rhsVi = ploadRhs<Scalar, RhsPacket>(rhs_ptr_imag);
-
- if(ConjugateLhs && !LhsIsReal) lhsVi = pmul<Packet>(lhsVi, conj);
- if(ConjugateRhs && !RhsIsReal) rhsVi = pmul<Packet>(rhsVi, conj);
-
- if(LhsIsReal)
- {
- pger<Packet, RhsPacket, false>(&accReal, rhsV, lhsV);
- pger<Packet, RhsPacket, false>(&accImag, rhsVi, lhsV);
- } else if(RhsIsReal) {
- pger<Packet, RhsPacket, false>(&accReal, rhsV, lhsV);
- pger<Packet, RhsPacket, false>(&accImag, rhsV, lhsVi);
- } else {
- pger<Packet, RhsPacket, false>(&accReal, rhsV, lhsV);
- pger<Packet, RhsPacket, true>(&accReal, rhsVi, lhsVi);
- pger<Packet, RhsPacket, false>(&accImag, rhsVi, lhsV);
- pger<Packet, RhsPacket, false>(&accImag, rhsV, lhsVi);
- }
-
- lhs_ptr += accCols;
- rhs_ptr += accRows;
- if(!LhsIsReal)
- lhs_ptr_imag += accCols;
- if(!RhsIsReal)
- rhs_ptr_imag += accRows;
- }
-
- storeComplexAccumulator<DataMapper, Index, Packet, Packetc, 0>(row, col, res, pAlphaReal, pAlphaImag, &accReal, &accImag, accColsC);
- }
-#else
for(; row + accCols <= rows; row += accCols)
{
#define MICRO() \
@@ -2302,7 +2036,7 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl
if(!LhsIsReal) \
lhs_ptr_imag1 += accCols; \
if(!RhsIsReal) \
- rhs_ptr_imag += accRows;
+ rhs_ptr_imag += accRows;
const Scalar *rhs_ptr = rhs_base;
const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
@@ -2356,7 +2090,6 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl
res.template storePacketBlock<Packetc, 4>(row + accColsC, col, acc2);
#undef MICRO
}
-#endif
if(remaining_rows > 0)
{
const Scalar *rhs_ptr = rhs_base;
@@ -2383,7 +2116,7 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl
lhsc.real(lhs_real);
if(!LhsIsReal)
{
- if(ConjugateLhs)
+ if(ConjugateLhs)
lhsc.imag(-lhs_imag);
else
lhsc.imag(lhs_imag);
@@ -2457,7 +2190,7 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl
{
Scalar lhs_real = lhs_ptr[arow];
Scalar lhs_imag;
- if(!LhsIsReal)
+ if(!LhsIsReal)
{
lhs_imag = lhs_ptr_imag[arow];
@@ -2534,7 +2267,7 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl
lhsc.real(lhs_real);
if(!LhsIsReal)
{
- if(ConjugateLhs)
+ if(ConjugateLhs)
lhsc.imag(-lhs_imag);
else
lhsc.imag(lhs_imag);
@@ -2819,8 +2552,22 @@ void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, Conjugat
{
const int accRows = quad_traits<float>::rows;
const int accCols = quad_traits<float>::size;
-
- gemm<float, Index, Packet, RhsPacket, DataMapper>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index, const int, const int);
+
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2843,8 +2590,23 @@ void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr
{
const int accRows = quad_traits<float>::rows;
const int accCols = quad_traits<float>::size;
-
- gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
+ Index, Index, Index, std::complex<float>, Index, Index , Index, Index, const int, const int);
+
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2867,8 +2629,22 @@ void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, Conjugat
{
const int accRows = quad_traits<float>::rows;
const int accCols = quad_traits<float>::size;
-
- gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
+ Index, Index, Index, std::complex<float>, Index, Index , Index, Index, const int, const int);
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2891,8 +2667,22 @@ void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, Conjugat
{
const int accRows = quad_traits<float>::rows;
const int accCols = quad_traits<float>::size;
-
- gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
+ Index, Index, Index, std::complex<float>, Index, Index , Index, Index, const int, const int);
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2914,8 +2704,22 @@ void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, Conjug
{
const int accRows = quad_traits<double>::rows;
const int accCols = quad_traits<double>::size;
-
- gemm<double, Index, Packet, RhsPacket, DataMapper>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index, const int, const int);
+
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2938,8 +2742,22 @@ void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper,
{
const int accRows = quad_traits<double>::rows;
const int accCols = quad_traits<double>::size;
-
- gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
+ Index, Index, Index, std::complex<double>, Index, Index , Index, Index, const int, const int);
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2962,8 +2780,22 @@ void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, Conjug
{
const int accRows = quad_traits<double>::rows;
const int accCols = quad_traits<double>::size;
-
- gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
+ Index, Index, Index, std::complex<double>, Index, Index , Index, Index, const int, const int);
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2986,10 +2818,25 @@ void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, Conjug
{
const int accRows = quad_traits<double>::rows;
const int accCols = quad_traits<double>::size;
-
- gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+ void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
+ Index, Index, Index, std::complex<double>, Index, Index , Index, Index, const int, const int);
+ #ifdef EIGEN_ALTIVEC_MMA_ONLY
+ //generate with MMA only
+ gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+ if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+ gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ }
+ else{
+ gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ }
+ #else
+ gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+ #endif
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
}
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
\ No newline at end of file
+
+#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H