path: root/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
author    Pedro Caldeira <pedro.caldeira@ibm.com>  2020-09-09 12:16:44 -0500
committer Pedro Henrique Moreira Caldeira <pedro.caldeira@ibm.com>  2020-11-12 11:31:15 -0300
commit    c29935b323ffb0b903f640111f0a0b0440e94a2e (patch)
tree      4695715859d2900ffa834eeedcf05aaaf966556d /Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
parent    b714dd9701752f4c3961d577d20055ed105154df (diff)
Add support for dynamic dispatch of MMA instructions for POWER 10
Diffstat (limited to 'Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h')
-rw-r--r--  Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h  638
1 file changed, 638 insertions, 0 deletions
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
new file mode 100644
index 000000000..1866a71bf
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -0,0 +1,638 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+
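+// Compile this file for POWER10 so the MMA kernels below are available for runtime dynamic dispatch.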
+#pragma GCC target("cpu=power10")
+
+namespace Eigen {
+
+namespace internal {
+
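+// Union used to reinterpret a PacketBlock of two packets as the __vector_pair operand expected by the 64-bit MMA builtins.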
+template<typename Packet>
+union Packetx2u
+{
+ __vector_pair vectorpair;
+ PacketBlock<Packet, 2> pair;
+};
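+// Permute masks that interleave separate real and imaginary vectors into complex (real, imag) pairs; FIRST covers the low halves of the inputs, SECOND the high halves.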
+const static Packet16uc MMA_p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3,
+ 16, 17, 18, 19,
+ 4, 5, 6, 7,
+ 20, 21, 22, 23};
+
+const static Packet16uc MMA_p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11,
+ 24, 25, 26, 27,
+ 12, 13, 14, 15,
+ 28, 29, 30, 31};
+//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
+const static Packet16uc MMA_p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 17, 18, 19, 20, 21, 22, 23};
+
+//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
+const static Packet16uc MMA_p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15,
+ 24, 25, 26, 27, 28, 29, 30, 31};
+
+// Take two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
+template<typename Packet, typename Packetc>
+EIGEN_STRONG_INLINE void bcoupleMMA(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
+{
+ acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX32_FIRST);
+ acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX32_FIRST);
+ acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX32_FIRST);
+ acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX32_FIRST);
+
+ acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX32_SECOND);
+ acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX32_SECOND);
+ acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX32_SECOND);
+ acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX32_SECOND);
+
+ acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
+ acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
+ acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
+ acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
+
+ acc2.packet[0] = padd<Packetc>(tRes.packet[4], acc2.packet[0]);
+ acc2.packet[1] = padd<Packetc>(tRes.packet[5], acc2.packet[1]);
+ acc2.packet[2] = padd<Packetc>(tRes.packet[6], acc2.packet[2]);
+ acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
+}
+
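+// Specialization for double/Packet1cd, using the 64-bit interleave masks.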
+template<>
+EIGEN_STRONG_INLINE void bcoupleMMA<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd,8>& tRes, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
+{
+ acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX64_FIRST);
+ acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX64_FIRST);
+ acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX64_FIRST);
+ acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX64_FIRST);
+
+ acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX64_SECOND);
+ acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX64_SECOND);
+ acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX64_SECOND);
+ acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX64_SECOND);
+
+ acc1.packet[0] = padd<Packet1cd>(tRes.packet[0], acc1.packet[0]);
+ acc1.packet[1] = padd<Packet1cd>(tRes.packet[1], acc1.packet[1]);
+ acc1.packet[2] = padd<Packet1cd>(tRes.packet[2], acc1.packet[2]);
+ acc1.packet[3] = padd<Packet1cd>(tRes.packet[3], acc1.packet[3]);
+
+ acc2.packet[0] = padd<Packet1cd>(tRes.packet[4], acc2.packet[0]);
+ acc2.packet[1] = padd<Packet1cd>(tRes.packet[5], acc2.packet[1]);
+ acc2.packet[2] = padd<Packet1cd>(tRes.packet[6], acc2.packet[2]);
+ acc2.packet[3] = padd<Packet1cd>(tRes.packet[7], acc2.packet[3]);
+}
+
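+// Load one packet of packed LHS data with a direct vector load; this assumes the packed block layout makes such a load valid.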
+template<typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE Packet ploadLhsMMA(const Scalar *lhs)
+{
+ return *((Packet *)lhs);
+}
+
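+// Element-wise multiply of a two-packet block by a single packet, so packet pairs can be scaled like ordinary packets.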
+template<typename Packet>
+EIGEN_STRONG_INLINE PacketBlock<Packet,2> pmul(const PacketBlock<Packet,2>& a, const Packet& b)
+{
+ PacketBlock<Packet,2> pb;
+ pb.packet[0] = a.packet[0]*b;
+ pb.packet[1] = a.packet[1]*b;
+ return pb;
+}
+
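+// Zero out a 512-bit MMA accumulator (four 128-bit rows).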
+template<typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void bsetzeroMMA(__vector_quad *acc)
+{
+ __builtin_mma_xxsetaccz(acc);
+}
+
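+// Disassemble the accumulator into four packets, scale them by alpha and add the existing result data before storing back.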
+template<typename DataMapper, typename Index, typename Packet>
+EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad *acc)
+{
+ PacketBlock<Packet, 4> result;
+ __builtin_mma_disassemble_acc(&result.packet, acc);
+
+ PacketBlock<Packet, 4> block;
+ block.packet[0] = data.template loadPacket<Packet>(i, j + 0) + pmul<Packet>(alpha, result.packet[0]);
+ block.packet[1] = data.template loadPacket<Packet>(i, j + 1) + pmul<Packet>(alpha, result.packet[1]);
+ block.packet[2] = data.template loadPacket<Packet>(i, j + 2) + pmul<Packet>(alpha, result.packet[2]);
+ block.packet[3] = data.template loadPacket<Packet>(i, j + 3) + pmul<Packet>(alpha, result.packet[3]);
+
+ data.template storePacketBlock<Packet, 4>(i, j, block);
+}
+
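+// Complex counterpart of storeAccumulator: forms alpha*acc for a complex alpha
+// (real = accR*alphaR - accI*alphaI, imag = accI*alphaR + accR*alphaI), adds the
+// existing results and recouples the planes into interleaved complex packets.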
+template<typename DataMapper, typename Index, typename Packet, typename Packetc, int N>
+EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad *accReal, __vector_quad *accImag, const int accColsC)
+{
+ PacketBlock<Packet, 4> resultReal, resultImag;
+ __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
+ __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
+
+ PacketBlock<Packet,4> taccReal, taccImag;
+ taccReal.packet[0] = pmul<Packet>(resultReal.packet[0], alphaReal);
+ taccReal.packet[1] = pmul<Packet>(resultReal.packet[1], alphaReal);
+ taccReal.packet[2] = pmul<Packet>(resultReal.packet[2], alphaReal);
+ taccReal.packet[3] = pmul<Packet>(resultReal.packet[3], alphaReal);
+
+ taccImag.packet[0] = pmul<Packet>(resultImag.packet[0], alphaReal);
+ taccImag.packet[1] = pmul<Packet>(resultImag.packet[1], alphaReal);
+ taccImag.packet[2] = pmul<Packet>(resultImag.packet[2], alphaReal);
+ taccImag.packet[3] = pmul<Packet>(resultImag.packet[3], alphaReal);
+
+ taccReal.packet[0] = psub<Packet>(taccReal.packet[0], pmul<Packet>(resultImag.packet[0], alphaImag));
+ taccReal.packet[1] = psub<Packet>(taccReal.packet[1], pmul<Packet>(resultImag.packet[1], alphaImag));
+ taccReal.packet[2] = psub<Packet>(taccReal.packet[2], pmul<Packet>(resultImag.packet[2], alphaImag));
+ taccReal.packet[3] = psub<Packet>(taccReal.packet[3], pmul<Packet>(resultImag.packet[3], alphaImag));
+
+ taccImag.packet[0] = pmadd<Packet>(resultReal.packet[0], alphaImag, taccImag.packet[0]);
+ taccImag.packet[1] = pmadd<Packet>(resultReal.packet[1], alphaImag, taccImag.packet[1]);
+ taccImag.packet[2] = pmadd<Packet>(resultReal.packet[2], alphaImag, taccImag.packet[2]);
+ taccImag.packet[3] = pmadd<Packet>(resultReal.packet[3], alphaImag, taccImag.packet[3]);
+
+ PacketBlock<Packetc, 8> tRes;
+ tRes.packet[0] = data.template loadPacket<Packetc>(i + N*accColsC, j + 0);
+ tRes.packet[1] = data.template loadPacket<Packetc>(i + N*accColsC, j + 1);
+ tRes.packet[2] = data.template loadPacket<Packetc>(i + N*accColsC, j + 2);
+ tRes.packet[3] = data.template loadPacket<Packetc>(i + N*accColsC, j + 3);
+
+ tRes.packet[4] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 0);
+ tRes.packet[5] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 1);
+ tRes.packet[6] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 2);
+ tRes.packet[7] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 3);
+
+ PacketBlock<Packetc, 4> acc1, acc2;
+ bcoupleMMA<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
+
+ data.template storePacketBlock<Packetc, 4>(i + N*accColsC, j, acc1);
+ data.template storePacketBlock<Packetc, 4>(i + (N+1)*accColsC, j, acc2);
+}
+
+// The generic case handles float32. Since Eigen still supports C++03 we cannot use default template arguments, so other types are handled by explicit specializations below.
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pgerMMA(__vector_quad *acc, const RhsPacket& a, const LhsPacket& b)
+{
+ if(NegativeAccumulate)
+ {
+ __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+ } else {
+ __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+ }
+}
+
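+// double specialization: the 64-bit ger builtins take a __vector_pair, so the two-packet block is reinterpreted through the Packetx2u union.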
+template<>
+EIGEN_STRONG_INLINE void pgerMMA<Packet2d, PacketBlock<Packet2d, 2>, false>(__vector_quad *acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
+{
+ Packetx2u<Packet2d> p;
+ p.pair = a;
+ __builtin_mma_xvf64gerpp(acc, p.vectorpair, (__vector unsigned char)b);
+}
+
+template<>
+EIGEN_STRONG_INLINE void pgerMMA<Packet2d, PacketBlock<Packet2d, 2>, true>(__vector_quad *acc, const PacketBlock<Packet2d, 2>& a, const Packet2d& b)
+{
+ Packetx2u<Packet2d> p;
+ p.pair = a;
+ __builtin_mma_xvf64gernp(acc, p.vectorpair, (__vector unsigned char)b);
+}
+
+// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
+template<typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE Packet ploadRhsMMA(const Scalar *rhs)
+{
+ return *((Packet *)rhs);
+}
+
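+// double specialization: load the two consecutive Packet2d vectors that form one RHS pair.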
+template<>
+EIGEN_STRONG_INLINE PacketBlock<Packet2d, 2> ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double *rhs)
+{
+ PacketBlock<Packet2d, 2> pair;
+ pair.packet[0] = *((Packet2d *)rhs );
+ pair.packet[1] = *(((Packet2d *)rhs) + 1);
+ return pair;
+}
+
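+// Real GEMM kernel using MMA: walks the packed blocks in accRows x accCols tiles,
+// accumulating each tile with one rank-1 update per depth step in a single
+// __vector_quad, and falls back to scalar loops for leftover rows and columns.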
+template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper>
+void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+ Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, const int accRows, const int accCols)
+{
+ const Index remaining_rows = rows % accCols;
+ const Index remaining_cols = cols % accRows;
+
+ if( strideA == -1 ) strideA = depth;
+ if( strideB == -1 ) strideB = depth;
+
+ const Packet pAlpha = pset1<Packet>(alpha);
+ Index col = 0;
+ for(; col + accRows <= cols; col += accRows)
+ {
+ const Scalar *rhs_base = blockB + ( col/accRows )*strideB*accRows;
+ const Scalar *lhs_base = blockA;
+
+ Index row = 0;
+ for(; row + accCols <= rows; row += accCols)
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *lhs_ptr1 = lhs_base + (row/accCols)*strideA*accCols;
+
+ __vector_quad acc;
+ bsetzeroMMA<Scalar, Packet>(&acc);
+
+ lhs_ptr1 += accCols*offsetA;
+ rhs_ptr += accRows*offsetB;
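+      // Accumulate one outer product of the RHS and LHS packets per depth step.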
+ for(Index k = 0; k < depth; k++)
+ {
+ Packet lhsV = ploadLhsMMA<Scalar, Packet>(lhs_ptr1);
+ RhsPacket rhsV = ploadRhsMMA<Scalar, RhsPacket>(rhs_ptr);
+
+ pgerMMA<Packet, RhsPacket, false>(&acc, rhsV, lhsV);
+
+ lhs_ptr1 += accCols;
+ rhs_ptr += accRows;
+ }
+
+ storeAccumulator<DataMapper, Index, Packet>(row, col, res, pAlpha, &acc);
+ }
+ if(remaining_rows > 0)
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
+
+ lhs_ptr += remaining_rows*offsetA;
+ rhs_ptr += accRows*offsetB;
+ for(Index k = 0; k < depth; k++)
+ {
+ for(Index arow = 0; arow < remaining_rows; arow++)
+ {
+ for(Index acol = 0; acol < accRows; acol++ )
+ {
+ res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
+ }
+ }
+ rhs_ptr += accRows;
+ lhs_ptr += remaining_rows;
+ }
+ }
+ }
+
+ if(remaining_cols > 0)
+ {
+ const Scalar *rhs_base = blockB + (col/accRows)*strideB*accRows;
+ const Scalar *lhs_base = blockA;
+
+ Index row = 0;
+ for(; row + accCols <= rows; row += accCols)
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
+
+ lhs_ptr += accCols*offsetA;
+ rhs_ptr += remaining_cols*offsetB;
+ for(Index k = 0; k < depth; k++)
+ {
+ for(Index arow = 0; arow < accCols; arow++)
+ {
+ for(Index acol = 0; acol < remaining_cols; acol++ )
+ {
+ res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
+ }
+ }
+ rhs_ptr += remaining_cols;
+ lhs_ptr += accCols;
+ }
+ }
+
+ if(remaining_rows > 0 )
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
+
+ lhs_ptr += remaining_rows*offsetA;
+ rhs_ptr += remaining_cols*offsetB;
+ for(Index k = 0; k < depth; k++)
+ {
+ for(Index arow = 0; arow < remaining_rows; arow++)
+ {
+ for(Index acol = 0; acol < remaining_cols; acol++ )
+ {
+ res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
+ }
+ }
+ rhs_ptr += remaining_cols;
+ lhs_ptr += remaining_rows;
+ }
+ }
+ }
+}
+
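+// Complex GEMM kernel: real and imaginary parts are packed in separate planes (the
+// imaginary plane sits one block stride further in), so each complex multiply is
+// built from four real outer products spread over two MMA accumulators.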
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
+ Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, const int accRows, const int accCols)
+{
+ const int remaining_rows = rows % accCols;
+ const int remaining_cols = cols % accRows;
+ const int accColsC = accCols / 2;
+ int advanceCols = 2;
+ int advanceRows = 2;
+
+ if(LhsIsReal) advanceRows = 1;
+ if(RhsIsReal) advanceCols = 1;
+
+ if( strideA == -1 ) strideA = depth;
+ if( strideB == -1 ) strideB = depth;
+
+ const Packet pAlphaReal = pset1<Packet>(alpha.real());
+ const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+
+ const Scalar *blockA = (Scalar *) blockAc;
+ const Scalar *blockB = (Scalar *) blockBc;
+
+ Packet conj = pset1<Packet>((Scalar)-1.0f);
+
+ Index col = 0;
+ for(; col + accRows <= cols; col += accRows)
+ {
+ const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows )*strideB*accRows;
+ const Scalar *lhs_base = blockA;
+
+ Index row = 0;
+
+ for(; row + accCols <= rows; row += accCols)
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
+ const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
+ const Scalar *lhs_ptr_imag = lhs_ptr + accCols*strideA;
+
+ __vector_quad accReal, accImag;
+ __builtin_mma_xxsetaccz(&accReal);
+ __builtin_mma_xxsetaccz(&accImag);
+
+ lhs_ptr += accCols*offsetA;
+ if(!LhsIsReal)
+ lhs_ptr_imag += accCols*offsetA;
+ rhs_ptr += accRows*offsetB;
+ if(!RhsIsReal)
+ rhs_ptr_imag += accRows*offsetB;
+ for(Index k = 0; k < depth; k++)
+ {
+ Packet lhsV = ploadLhsMMA<Scalar, Packet>(lhs_ptr);
+ RhsPacket rhsV = ploadRhs<Scalar, RhsPacket>(rhs_ptr);
+
+ Packet lhsVi = ploadLhsMMA<Scalar, Packet>(lhs_ptr_imag);
+ RhsPacket rhsVi = ploadRhs<Scalar, RhsPacket>(rhs_ptr_imag);
+
+ if(ConjugateLhs && !LhsIsReal) lhsVi = pmul<Packet>(lhsVi, conj);
+ if(ConjugateRhs && !RhsIsReal) rhsVi = pmul<Packet>(rhsVi, conj);
+
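+        // Complex multiply from real products: (a+bi)(c+di) has real part ac - bd
+        // and imaginary part bc + ad; the NegativeAccumulate ger does the subtraction.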
+ if(LhsIsReal)
+ {
+ pgerMMA<Packet, RhsPacket, false>(&accReal, rhsV, lhsV);
+ pgerMMA<Packet, RhsPacket, false>(&accImag, rhsVi, lhsV);
+ } else if(RhsIsReal) {
+ pgerMMA<Packet, RhsPacket, false>(&accReal, rhsV, lhsV);
+ pgerMMA<Packet, RhsPacket, false>(&accImag, rhsV, lhsVi);
+ } else {
+ pgerMMA<Packet, RhsPacket, false>(&accReal, rhsV, lhsV);
+ pgerMMA<Packet, RhsPacket, true>(&accReal, rhsVi, lhsVi);
+ pgerMMA<Packet, RhsPacket, false>(&accImag, rhsVi, lhsV);
+ pgerMMA<Packet, RhsPacket, false>(&accImag, rhsV, lhsVi);
+ }
+
+ lhs_ptr += accCols;
+ rhs_ptr += accRows;
+ if(!LhsIsReal)
+ lhs_ptr_imag += accCols;
+ if(!RhsIsReal)
+ rhs_ptr_imag += accRows;
+ }
+
+ storeComplexAccumulator<DataMapper, Index, Packet, Packetc, 0>(row, col, res, pAlphaReal, pAlphaImag, &accReal, &accImag, accColsC);
+ }
+
+ if(remaining_rows > 0)
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
+ const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
+ const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;
+
+ lhs_ptr += remaining_rows*offsetA;
+ if(!LhsIsReal)
+ lhs_ptr_imag += remaining_rows*offsetA;
+ rhs_ptr += accRows*offsetB;
+ if(!RhsIsReal)
+ rhs_ptr_imag += accRows*offsetB;
+ for(Index k = 0; k < depth; k++)
+ {
+ for(Index arow = 0; arow < remaining_rows; arow++)
+ {
+ Scalar lhs_real = lhs_ptr[arow];
+ Scalar lhs_imag;
+ if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];
+
+ Scalarc lhsc;
+
+ lhsc.real(lhs_real);
+ if(!LhsIsReal)
+ {
+ if(ConjugateLhs)
+ lhsc.imag(-lhs_imag);
+ else
+ lhsc.imag(lhs_imag);
+ } else {
+ //Lazy approach for now
+ lhsc.imag((Scalar)0);
+ }
+
+ for(int acol = 0; acol < accRows; acol++ )
+ {
+ Scalar rhs_real = rhs_ptr[acol];
+ Scalar rhs_imag;
+ if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];
+ Scalarc rhsc;
+
+ rhsc.real(rhs_real);
+ if(!RhsIsReal)
+ {
+ if(ConjugateRhs)
+ rhsc.imag(-rhs_imag);
+ else
+ rhsc.imag(rhs_imag);
+ } else {
+ //Lazy approach for now
+ rhsc.imag((Scalar)0);
+ }
+ res(row + arow, col + acol) += alpha*lhsc*rhsc;
+ }
+ }
+ rhs_ptr += accRows;
+ lhs_ptr += remaining_rows;
+ if(!LhsIsReal)
+ lhs_ptr_imag += remaining_rows;
+ if(!RhsIsReal)
+ rhs_ptr_imag += accRows;
+ }
+ }
+ }
+
+ if(remaining_cols > 0)
+ {
+ const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows )*strideB*accRows;
+ const Scalar *lhs_base = blockA;
+ Index row = 0;
+
+ for(; row + accCols <= rows; row += accCols)
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
+ const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
+ const Scalar *lhs_ptr_imag = lhs_ptr + accCols*strideA;
+
+ lhs_ptr += accCols*offsetA;
+ if(!LhsIsReal)
+ lhs_ptr_imag += accCols*offsetA;
+ rhs_ptr += remaining_cols*offsetB;
+ if(!RhsIsReal)
+ rhs_ptr_imag += remaining_cols*offsetB;
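+      // Scalar accumulator for the leftover columns; note this assumes accCols <= 4.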
+ Scalarc scalarAcc[4][4];
+ for(Index arow = 0; arow < 4; arow++ )
+ {
+ for(Index acol = 0; acol < 4; acol++ )
+ {
+ scalarAcc[arow][acol].real((Scalar)0.0f);
+ scalarAcc[arow][acol].imag((Scalar)0.0f);
+ }
+ }
+ for(Index k = 0; k < depth; k++)
+ {
+ for(Index arow = 0; arow < accCols; arow++)
+ {
+ Scalar lhs_real = lhs_ptr[arow];
+ Scalar lhs_imag;
+ if(!LhsIsReal)
+ {
+ lhs_imag = lhs_ptr_imag[arow];
+
+ if(ConjugateLhs)
+ lhs_imag *= -1;
+ } else {
+ lhs_imag = (Scalar)0;
+ }
+
+ for(int acol = 0; acol < remaining_cols; acol++ )
+ {
+ Scalar rhs_real = rhs_ptr[acol];
+ Scalar rhs_imag;
+ if(!RhsIsReal)
+ {
+ rhs_imag = rhs_ptr_imag[acol];
+
+ if(ConjugateRhs)
+ rhs_imag *= -1;
+ } else {
+ rhs_imag = (Scalar)0;
+ }
+
+ scalarAcc[arow][acol].real(scalarAcc[arow][acol].real() + lhs_real*rhs_real - lhs_imag*rhs_imag);
+ scalarAcc[arow][acol].imag(scalarAcc[arow][acol].imag() + lhs_imag*rhs_real + lhs_real*rhs_imag);
+ }
+ }
+ rhs_ptr += remaining_cols;
+ lhs_ptr += accCols;
+ if(!RhsIsReal)
+ rhs_ptr_imag += remaining_cols;
+ if(!LhsIsReal)
+ lhs_ptr_imag += accCols;
+ }
+ for(int arow = 0; arow < accCols; arow++ )
+ {
+ for(int acol = 0; acol < remaining_cols; acol++ )
+ {
+ Scalar accR = scalarAcc[arow][acol].real();
+ Scalar accI = scalarAcc[arow][acol].imag();
+ Scalar aR = alpha.real();
+ Scalar aI = alpha.imag();
+ Scalar resR = res(row + arow, col + acol).real();
+ Scalar resI = res(row + arow, col + acol).imag();
+
+ res(row + arow, col + acol).real(resR + accR*aR - accI*aI);
+ res(row + arow, col + acol).imag(resI + accR*aI + accI*aR);
+ }
+ }
+ }
+
+ if(remaining_rows > 0)
+ {
+ const Scalar *rhs_ptr = rhs_base;
+ const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
+ const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
+ const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;
+
+ lhs_ptr += remaining_rows*offsetA;
+ if(!LhsIsReal)
+ lhs_ptr_imag += remaining_rows*offsetA;
+ rhs_ptr += remaining_cols*offsetB;
+ if(!RhsIsReal)
+ rhs_ptr_imag += remaining_cols*offsetB;
+ for(Index k = 0; k < depth; k++)
+ {
+ for(Index arow = 0; arow < remaining_rows; arow++)
+ {
+ Scalar lhs_real = lhs_ptr[arow];
+ Scalar lhs_imag;
+ if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];
+ Scalarc lhsc;
+
+ lhsc.real(lhs_real);
+ if(!LhsIsReal)
+ {
+ if(ConjugateLhs)
+ lhsc.imag(-lhs_imag);
+ else
+ lhsc.imag(lhs_imag);
+ } else {
+ lhsc.imag((Scalar)0);
+ }
+
+ for(Index acol = 0; acol < remaining_cols; acol++ )
+ {
+ Scalar rhs_real = rhs_ptr[acol];
+ Scalar rhs_imag;
+ if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];
+ Scalarc rhsc;
+
+ rhsc.real(rhs_real);
+ if(!RhsIsReal)
+ {
+ if(ConjugateRhs)
+ rhsc.imag(-rhs_imag);
+ else
+ rhsc.imag(rhs_imag);
+ } else {
+ rhsc.imag((Scalar)0);
+ }
+ res(row + arow, col + acol) += alpha*lhsc*rhsc;
+ }
+ }
+ rhs_ptr += remaining_cols;
+ lhs_ptr += remaining_rows;
+ if(!LhsIsReal)
+ lhs_ptr_imag += remaining_rows;
+ if(!RhsIsReal)
+ rhs_ptr_imag += remaining_cols;
+ }
+ }
+ }
+}
+
+#pragma GCC reset_options
+} // end namespace internal
+
+} // end namespace Eigen
+#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+