// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H

#include "MatrixProductCommon.h"

#if EIGEN_COMP_LLVM
#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
#ifdef __MMA__
#define EIGEN_ALTIVEC_MMA_ONLY
#else
#define EIGEN_ALTIVEC_DISABLE_MMA
#endif
#endif
#endif

#ifdef __has_builtin
#if __has_builtin(__builtin_mma_assemble_acc)
#define ALTIVEC_MMA_SUPPORT
#endif
#endif

#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
#include "MatrixProductMMA.h"
#endif

/**************************************************************************************************
 * TODO                                                                                           *
 * - Check StorageOrder on lhs_pack (the innermost second loop seems unvectorized when it could). *
 * - Check the possibility of transposing as GETREAL and GETIMAG when needed.                     *
 * - Check if changing conjugation to xor instead of mul gains any performance.                   *
 * - Remove IsComplex template argument from complex packing.                                     *
 **************************************************************************************************/

namespace Eigen {

namespace internal {

/**************************
 * Constants and typedefs *
 **************************/

const int QuadRegisterCount = 8;

template<typename Scalar>
struct quad_traits
{
  typedef typename packet_traits<Scalar>::type    vectortype;
  typedef PacketBlock<vectortype,4>               type;
  typedef vectortype                              rhstype;
  enum
  {
    vectorsize = packet_traits<Scalar>::size,
    size = 4,
    rows = 4
  };
};

template<>
struct quad_traits<double>
{
  typedef Packet2d                        vectortype;
  typedef PacketBlock<vectortype,4>       type;
  typedef PacketBlock<Packet2d,2>         rhstype;
  enum
  {
    vectorsize = packet_traits<double>::size,
    size = 2,
    rows = 4
  };
};

// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector; this turned out
// to be faster than Eigen's usual approach of keeping real/imaginary pairs on a single vector. The constants
// below are responsible for extracting the real/imaginary parts and for converting between Eigen's layout
// and MatrixProduct's layout.
const static Packet4f p4f_CONJUGATE = {float(-1.0), float(-1.0), float(-1.0), float(-1.0)};

const static Packet2d p2d_CONJUGATE = {-1.0, -1.0};

const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};

const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};

const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};

//[a,ai],[b,bi] = [ai,bi]
const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};

/*********************************************
 * Single precision real and complex packing *
 * *******************************************/

/**
 * Symm packing is related to packing of symmetric adjoint blocks; as expected, the packing leaves
 * the diagonal real, and whatever is below it is copied from the respective upper-diagonal element and
 * conjugated. There's no PanelMode available for symm packing.
 *
 * Packing in general is supposed to leave the lhs and rhs blocks easy for gemm to read using
 * its respective rank-update instructions.
The float32/64 versions are different because at this moment * the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements. * * As mentioned earlier MatrixProduct breaks complex numbers into a real vector and a complex vector so packing has * to take that into account, at the moment, we run pack the real part and then the imaginary part, this is the main * reason why packing for complex is broken down into several different parts, also the reason why we endup having a * float32/64 and complex float32/64 version. **/ template EIGEN_STRONG_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) { std::complex v; if(i < j) { v.real(dt(j,i).real()); v.imag(-dt(j,i).imag()); } else if(i > j) { v.real(dt(i,j).real()); v.imag(dt(i,j).imag()); } else { v.real(dt(i,j).real()); v.imag((Scalar)0.0); } return v; } template EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex *blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { const Index depth = k2 + rows; const_blas_data_mapper, Index, StorageOrder> rhs(_rhs, rhsStride); const int vectorSize = N*quad_traits::vectorsize; Scalar* blockBf = reinterpret_cast(blockB); Index ri = 0, j = 0; for(; j + vectorSize <= cols; j+=vectorSize) { Index i = k2; for(; i < depth; i++) { for(Index k = 0; k < vectorSize; k++) { std::complex v = getAdjointVal(i, j + k, rhs); blockBf[ri + k] = v.real(); } ri += vectorSize; } i = k2; for(; i < depth; i++) { for(Index k = 0; k < vectorSize; k++) { std::complex v = getAdjointVal(i, j + k, rhs); blockBf[ri + k] = v.imag(); } ri += vectorSize; } } for(Index i = k2; i < depth; i++) { Index k = j; for(; k < cols; k++) { std::complex v = getAdjointVal(i, k, rhs); blockBf[ri] = v.real(); ri += 1; } } for(Index i = k2; i < depth; i++) { Index k = j; for(; k < cols; k++) { std::complex v = getAdjointVal(i, k, rhs); blockBf[ri] = v.imag(); ri += 1; } } } template EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex *blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { const Index depth = cols; const_blas_data_mapper, Index, StorageOrder> lhs(_lhs, lhsStride); const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; Scalar *blockAf = (Scalar *)(blockA); for(; j + vectorSize <= rows; j+=vectorSize) { Index i = 0; for(; i < depth; i++) { for(int k = 0; k < vectorSize; k++) { std::complex v = getAdjointVal(j+k, i, lhs); blockAf[ri + k] = v.real(); } ri += vectorSize; } i = 0; for(; i < depth; i++) { for(int k = 0; k < vectorSize; k++) { std::complex v = getAdjointVal(j+k, i, lhs); blockAf[ri + k] = v.imag(); } ri += vectorSize; } } for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { std::complex v = getAdjointVal(k, i, lhs); blockAf[ri] = v.real(); ri += 1; } } for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { std::complex v = getAdjointVal(k, i, lhs); blockAf[ri] = v.imag(); ri += 1; } } } template EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar *blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { const Index depth = k2 + rows; const_blas_data_mapper rhs(_rhs, rhsStride); const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; for(; j + N*vectorSize <= cols; j+=N*vectorSize) { Index i = k2; for(; i < depth; i++) { for(int k = 0; k < N*vectorSize; k++) { if(i <= j+k) blockB[ri + k] = rhs(j+k, i); else blockB[ri + k] = rhs(i, j+k); } ri += N*vectorSize; } } for(Index i 
= k2; i < depth; i++) { Index k = j; for(; k < cols; k++) { if(k <= i) blockB[ri] = rhs(i, k); else blockB[ri] = rhs(k, i); ri += 1; } } } template EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar *blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { const Index depth = cols; const_blas_data_mapper lhs(_lhs, lhsStride); const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; for(j = 0; j + vectorSize <= rows; j+=vectorSize) { Index i = 0; for(; i < depth; i++) { for(int k = 0; k < vectorSize; k++) { if(i <= j+k) blockA[ri + k] = lhs(j+k, i); else blockA[ri + k] = lhs(i, j+k); } ri += vectorSize; } } for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { if(i <= k) blockA[ri] = lhs(k, i); else blockA[ri] = lhs(i, k); ri += 1; } } } template struct symm_pack_rhs, Index, nr, StorageOrder> { void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; template struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrder> { void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; // *********** symm_pack std::complex *********** template struct symm_pack_rhs, Index, nr, StorageOrder> { void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; template struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrder> { void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; // *********** symm_pack float32 *********** template struct symm_pack_rhs { void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; template struct symm_pack_lhs { void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) { symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; // *********** symm_pack float64 *********** template struct symm_pack_rhs { void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; template struct symm_pack_lhs { void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows) { symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; /** * PanelMode * Packing might be called several times before being multiplied by gebp_kernel, this happens because * on special occasions it fills part of block with other parts of the matrix. Two variables control * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever * is going to be the real offset and stride in the future and this is what you should obey. The process * is to behave as you would with normal packing but leave the start of each part with the correct offset * and the end as well respecting the real stride the block will have. Gebp is aware of both blocks stride * and offset and behaves accordingly. **/ // General template for lhs complex packing. 
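// A minimal scalar sketch of the packed lhs layout the complex packer below produces, assuming a
// 4-row panel (vectorSize == 4) and ColMajor input: for each panel, the real parts of all depth
// columns are written first, followed by all the imaginary parts (negated when conjugating, which
// is omitted here). The helper name is hypothetical and purely illustrative; the real packer works
// on vector registers and also handles RowMajor input, PanelMode and leftover rows.
#if 0
#include <complex>
void pack_lhs_panel_sketch(float* blockA, const std::complex<float>* lhs,
                           int lhsStride, int panelRow, int depth)
{
  int ri = 0;
  for (int i = 0; i < depth; i++)        // real sub-panel: 4 rows x depth
    for (int k = 0; k < 4; k++)
      blockA[ri++] = lhs[(panelRow + k) + i*lhsStride].real();
  for (int i = 0; i < depth; i++)        // imaginary sub-panel follows
    for (int k = 0; k < 4; k++)
      blockA[ri++] = lhs[(panelRow + k) + i*lhsStride].imag();
}
#endif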
template struct lhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; Scalar *blockAt = reinterpret_cast(blockA); Packet conj = pset1((Scalar)-1.0); for(j = 0; j + vectorSize <= rows; j+=vectorSize) { Index i = 0; if(PanelMode) ri += vectorSize*offset; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock block; PacketBlock cblock; if(StorageOrder == ColMajor) { cblock.packet[0] = lhs.template loadPacket(j, i + 0); cblock.packet[1] = lhs.template loadPacket(j, i + 1); cblock.packet[2] = lhs.template loadPacket(j, i + 2); cblock.packet[3] = lhs.template loadPacket(j, i + 3); cblock.packet[4] = lhs.template loadPacket(j + 2, i + 0); cblock.packet[5] = lhs.template loadPacket(j + 2, i + 1); cblock.packet[6] = lhs.template loadPacket(j + 2, i + 2); cblock.packet[7] = lhs.template loadPacket(j + 2, i + 3); } else { cblock.packet[0] = lhs.template loadPacket(j + 0, i); cblock.packet[1] = lhs.template loadPacket(j + 1, i); cblock.packet[2] = lhs.template loadPacket(j + 2, i); cblock.packet[3] = lhs.template loadPacket(j + 3, i); cblock.packet[4] = lhs.template loadPacket(j + 0, i + 2); cblock.packet[5] = lhs.template loadPacket(j + 1, i + 2); cblock.packet[6] = lhs.template loadPacket(j + 2, i + 2); cblock.packet[7] = lhs.template loadPacket(j + 3, i + 2); } block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32); block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32); block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32); if(StorageOrder == RowMajor) ptranspose(block); pstore(blockAt + ri , block.packet[0]); pstore(blockAt + ri + 4, block.packet[1]); pstore(blockAt + ri + 8, block.packet[2]); pstore(blockAt + ri + 12, block.packet[3]); ri += 4*vectorSize; } for(; i < depth; i++) { blockAt[ri + 0] = lhs(j + 0, i).real(); blockAt[ri + 1] = lhs(j + 1, i).real(); blockAt[ri + 2] = lhs(j + 2, i).real(); blockAt[ri + 3] = lhs(j + 3, i).real(); ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); i = 0; if(PanelMode) ri += vectorSize*offset; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock cblock; if(StorageOrder == ColMajor) { cblock.packet[0] = lhs.template loadPacket(j, i + 0); cblock.packet[1] = lhs.template loadPacket(j, i + 1); cblock.packet[2] = lhs.template loadPacket(j, i + 2); cblock.packet[3] = lhs.template loadPacket(j, i + 3); cblock.packet[4] = lhs.template loadPacket(j + 2, i + 0); cblock.packet[5] = lhs.template loadPacket(j + 2, i + 1); cblock.packet[6] = lhs.template loadPacket(j + 2, i + 2); cblock.packet[7] = lhs.template loadPacket(j + 2, i + 3); } else { cblock.packet[0] = lhs.template loadPacket(j + 0, i); cblock.packet[1] = lhs.template loadPacket(j + 1, i); cblock.packet[2] = lhs.template loadPacket(j + 2, i); cblock.packet[3] = lhs.template loadPacket(j + 3, i); cblock.packet[4] = lhs.template loadPacket(j + 0, i + 2); cblock.packet[5] = lhs.template loadPacket(j + 1, i + 2); cblock.packet[6] = lhs.template loadPacket(j + 2, i + 2); cblock.packet[7] = lhs.template loadPacket(j + 3, i + 2); } PacketBlock block; block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32); block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32); block.packet[2] = 
vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32); block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32); if(Conjugate) { block.packet[0] *= conj; block.packet[1] *= conj; block.packet[2] *= conj; block.packet[3] *= conj; } if(StorageOrder == RowMajor) ptranspose(block); pstore(blockAt + ri , block.packet[0]); pstore(blockAt + ri + 4, block.packet[1]); pstore(blockAt + ri + 8, block.packet[2]); pstore(blockAt + ri + 12, block.packet[3]); ri += 4*vectorSize; } for(; i < depth; i++) { if(Conjugate) { blockAt[ri + 0] = -lhs(j + 0, i).imag(); blockAt[ri + 1] = -lhs(j + 1, i).imag(); blockAt[ri + 2] = -lhs(j + 2, i).imag(); blockAt[ri + 3] = -lhs(j + 3, i).imag(); } else { blockAt[ri + 0] = lhs(j + 0, i).imag(); blockAt[ri + 1] = lhs(j + 1, i).imag(); blockAt[ri + 2] = lhs(j + 2, i).imag(); blockAt[ri + 3] = lhs(j + 3, i).imag(); } ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); } if(PanelMode) ri += offset*(rows - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { blockAt[ri] = lhs(k, i).real(); ri += 1; } } if(PanelMode) ri += (rows - j)*(stride - offset - depth); if(PanelMode) ri += offset*(rows - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { if(Conjugate) blockAt[ri] = -lhs(k, i).imag(); else blockAt[ri] = lhs(k, i).imag(); ri += 1; } } if(PanelMode) ri += (rows - j)*(stride - offset - depth); } }; // General template for lhs packing. template struct lhs_pack{ EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; for(j = 0; j + vectorSize <= rows; j+=vectorSize) { Index i = 0; if(PanelMode) ri += vectorSize*offset; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock block; if(StorageOrder == RowMajor) { block.packet[0] = lhs.template loadPacket(j + 0, i); block.packet[1] = lhs.template loadPacket(j + 1, i); block.packet[2] = lhs.template loadPacket(j + 2, i); block.packet[3] = lhs.template loadPacket(j + 3, i); ptranspose(block); } else { block.packet[0] = lhs.template loadPacket(j, i + 0); block.packet[1] = lhs.template loadPacket(j, i + 1); block.packet[2] = lhs.template loadPacket(j, i + 2); block.packet[3] = lhs.template loadPacket(j, i + 3); } pstore(blockA + ri , block.packet[0]); pstore(blockA + ri + 4, block.packet[1]); pstore(blockA + ri + 8, block.packet[2]); pstore(blockA + ri + 12, block.packet[3]); ri += 4*vectorSize; } for(; i < depth; i++) { if(StorageOrder == RowMajor) { blockA[ri+0] = lhs(j+0, i); blockA[ri+1] = lhs(j+1, i); blockA[ri+2] = lhs(j+2, i); blockA[ri+3] = lhs(j+3, i); } else { Packet lhsV = lhs.template loadPacket(j, i); pstore(blockA + ri, lhsV); } ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); } if(PanelMode) ri += offset*(rows - j); if (j < rows) { for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { blockA[ri] = lhs(k, i); ri += 1; } } } if(PanelMode) ri += (rows - j)*(stride - offset - depth); } }; // General template for rhs complex packing. 
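// The packer below, like every PanelMode-aware packer in this file, reserves `offset` slots before
// each panel's payload (`ri += width*offset`) and pads the panel so that consecutive panels are
// exactly `stride` entries apart (`ri += width*(stride - offset - depth)`), which is what gebp
// expects. A rough sketch of that arithmetic, assuming all panels in the buffer have the same
// width; the helper name is hypothetical and illustrative only.
#if 0
int panel_start(int panelIndex, int width, int stride, int offset)
{
  // Each panel occupies width*stride entries; its payload begins width*offset entries in.
  return panelIndex*width*stride + width*offset;
}
#endif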
template struct rhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; Scalar *blockBt = reinterpret_cast(blockB); Packet conj = pset1((Scalar)-1.0); Index ri = 0, j = 0; for(; j + vectorSize <= cols; j+=vectorSize) { Index i = 0; if(PanelMode) ri += offset*vectorSize; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock cblock; if(StorageOrder == ColMajor) { cblock.packet[0] = rhs.template loadPacket(i, j + 0); cblock.packet[1] = rhs.template loadPacket(i, j + 1); cblock.packet[2] = rhs.template loadPacket(i, j + 2); cblock.packet[3] = rhs.template loadPacket(i, j + 3); cblock.packet[4] = rhs.template loadPacket(i + 2, j + 0); cblock.packet[5] = rhs.template loadPacket(i + 2, j + 1); cblock.packet[6] = rhs.template loadPacket(i + 2, j + 2); cblock.packet[7] = rhs.template loadPacket(i + 2, j + 3); } else { cblock.packet[0] = rhs.template loadPacket(i + 0, j); cblock.packet[1] = rhs.template loadPacket(i + 1, j); cblock.packet[2] = rhs.template loadPacket(i + 2, j); cblock.packet[3] = rhs.template loadPacket(i + 3, j); cblock.packet[4] = rhs.template loadPacket(i + 0, j + 2); cblock.packet[5] = rhs.template loadPacket(i + 1, j + 2); cblock.packet[6] = rhs.template loadPacket(i + 2, j + 2); cblock.packet[7] = rhs.template loadPacket(i + 3, j + 2); } PacketBlock block; block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32); block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32); block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32); if(StorageOrder == ColMajor) ptranspose(block); pstore(blockBt + ri , block.packet[0]); pstore(blockBt + ri + 4, block.packet[1]); pstore(blockBt + ri + 8, block.packet[2]); pstore(blockBt + ri + 12, block.packet[3]); ri += 4*vectorSize; } for(; i < depth; i++) { blockBt[ri+0] = rhs(i, j+0).real(); blockBt[ri+1] = rhs(i, j+1).real(); blockBt[ri+2] = rhs(i, j+2).real(); blockBt[ri+3] = rhs(i, j+3).real(); ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); i = 0; if(PanelMode) ri += offset*vectorSize; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock cblock; if(StorageOrder == ColMajor) { cblock.packet[0] = rhs.template loadPacket(i, j + 0); cblock.packet[1] = rhs.template loadPacket(i, j + 1); cblock.packet[2] = rhs.template loadPacket(i, j + 2); cblock.packet[3] = rhs.template loadPacket(i, j + 3); cblock.packet[4] = rhs.template loadPacket(i + 2, j + 0); cblock.packet[5] = rhs.template loadPacket(i + 2, j + 1); cblock.packet[6] = rhs.template loadPacket(i + 2, j + 2); cblock.packet[7] = rhs.template loadPacket(i + 2, j + 3); } else { cblock.packet[0] = rhs.template loadPacket(i + 0, j); cblock.packet[1] = rhs.template loadPacket(i + 1, j); cblock.packet[2] = rhs.template loadPacket(i + 2, j); cblock.packet[3] = rhs.template loadPacket(i + 3, j); cblock.packet[4] = rhs.template loadPacket(i + 0, j + 2); cblock.packet[5] = rhs.template loadPacket(i + 1, j + 2); cblock.packet[6] = rhs.template loadPacket(i + 2, j + 2); cblock.packet[7] = rhs.template loadPacket(i + 3, j + 2); } PacketBlock block; block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32); block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32); block.packet[2] = vec_perm(cblock.packet[2].v, 
cblock.packet[6].v, p16uc_GETIMAG32); block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32); if(Conjugate) { block.packet[0] *= conj; block.packet[1] *= conj; block.packet[2] *= conj; block.packet[3] *= conj; } if(StorageOrder == ColMajor) ptranspose(block); pstore(blockBt + ri , block.packet[0]); pstore(blockBt + ri + 4, block.packet[1]); pstore(blockBt + ri + 8, block.packet[2]); pstore(blockBt + ri + 12, block.packet[3]); ri += 4*vectorSize; } for(; i < depth; i++) { if(Conjugate) { blockBt[ri+0] = -rhs(i, j+0).imag(); blockBt[ri+1] = -rhs(i, j+1).imag(); blockBt[ri+2] = -rhs(i, j+2).imag(); blockBt[ri+3] = -rhs(i, j+3).imag(); } else { blockBt[ri+0] = rhs(i, j+0).imag(); blockBt[ri+1] = rhs(i, j+1).imag(); blockBt[ri+2] = rhs(i, j+2).imag(); blockBt[ri+3] = rhs(i, j+3).imag(); } ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); } if(PanelMode) ri += offset*(cols - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < cols; k++) { blockBt[ri] = rhs(i, k).real(); ri += 1; } } if(PanelMode) ri += (cols - j)*(stride - offset - depth); if(PanelMode) ri += offset*(cols - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < cols; k++) { if(Conjugate) blockBt[ri] = -rhs(i, k).imag(); else blockBt[ri] = rhs(i, k).imag(); ri += 1; } } if(PanelMode) ri += (cols - j)*(stride - offset - depth); } }; // General template for rhs packing. template struct rhs_pack { EIGEN_STRONG_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; for(; j + vectorSize <= cols; j+=vectorSize) { Index i = 0; if(PanelMode) ri += offset*vectorSize; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock block; if(StorageOrder == ColMajor) { block.packet[0] = rhs.template loadPacket(i, j + 0); block.packet[1] = rhs.template loadPacket(i, j + 1); block.packet[2] = rhs.template loadPacket(i, j + 2); block.packet[3] = rhs.template loadPacket(i, j + 3); ptranspose(block); } else { block.packet[0] = rhs.template loadPacket(i + 0, j); block.packet[1] = rhs.template loadPacket(i + 1, j); block.packet[2] = rhs.template loadPacket(i + 2, j); block.packet[3] = rhs.template loadPacket(i + 3, j); } pstore(blockB + ri , block.packet[0]); pstore(blockB + ri + 4, block.packet[1]); pstore(blockB + ri + 8, block.packet[2]); pstore(blockB + ri + 12, block.packet[3]); ri += 4*vectorSize; } for(; i < depth; i++) { if(StorageOrder == ColMajor) { blockB[ri+0] = rhs(i, j+0); blockB[ri+1] = rhs(i, j+1); blockB[ri+2] = rhs(i, j+2); blockB[ri+3] = rhs(i, j+3); } else { Packet rhsV = rhs.template loadPacket(i, j); pstore(blockB + ri, rhsV); } ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); } if(PanelMode) ri += offset*(cols - j); if (j < cols) { for(Index i = 0; i < depth; i++) { Index k = j; for(; k < cols; k++) { blockB[ri] = rhs(i, k); ri += 1; } } } if(PanelMode) ri += (cols - j)*(stride - offset - depth); } }; // General template for lhs packing, float64 specialization. 
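// For double, a vector register (Packet2d) holds 2 elements, so the lhs panels below are 2 rows
// wide instead of 4. A scalar sketch of the resulting layout, assuming ColMajor input; the helper
// name is hypothetical and illustrative only.
#if 0
void pack_lhs_panel_f64_sketch(double* blockA, const double* lhs,
                               int lhsStride, int panelRow, int depth)
{
  int ri = 0;
  for (int i = 0; i < depth; i++) {      // one 2-row column slice per depth step
    blockA[ri + 0] = lhs[(panelRow + 0) + i*lhsStride];
    blockA[ri + 1] = lhs[(panelRow + 1) + i*lhsStride];
    ri += 2;
  }
}
#endif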
template struct lhs_pack { EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; for(j = 0; j + vectorSize <= rows; j+=vectorSize) { Index i = 0; if(PanelMode) ri += vectorSize*offset; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock block; if(StorageOrder == RowMajor) { block.packet[0] = lhs.template loadPacket(j + 0, i); block.packet[1] = lhs.template loadPacket(j + 1, i); ptranspose(block); } else { block.packet[0] = lhs.template loadPacket(j, i + 0); block.packet[1] = lhs.template loadPacket(j, i + 1); } pstore(blockA + ri , block.packet[0]); pstore(blockA + ri + 2, block.packet[1]); ri += 2*vectorSize; } for(; i < depth; i++) { if(StorageOrder == RowMajor) { blockA[ri+0] = lhs(j+0, i); blockA[ri+1] = lhs(j+1, i); } else { Packet2d lhsV = lhs.template loadPacket(j, i); pstore(blockA + ri, lhsV); } ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); } if(PanelMode) ri += offset*(rows - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { blockA[ri] = lhs(k, i); ri += 1; } } if(PanelMode) ri += (rows - j)*(stride - offset - depth); } }; // General template for rhs packing, float64 specialization. template struct rhs_pack { EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; for(; j + 2*vectorSize <= cols; j+=2*vectorSize) { Index i = 0; if(PanelMode) ri += offset*(2*vectorSize); for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock block; if(StorageOrder == ColMajor) { PacketBlock block1, block2; block1.packet[0] = rhs.template loadPacket(i, j + 0); block1.packet[1] = rhs.template loadPacket(i, j + 1); block2.packet[0] = rhs.template loadPacket(i, j + 2); block2.packet[1] = rhs.template loadPacket(i, j + 3); ptranspose(block1); ptranspose(block2); pstore(blockB + ri , block1.packet[0]); pstore(blockB + ri + 2, block2.packet[0]); pstore(blockB + ri + 4, block1.packet[1]); pstore(blockB + ri + 6, block2.packet[1]); } else { block.packet[0] = rhs.template loadPacket(i + 0, j + 0); //[a1 a2] block.packet[1] = rhs.template loadPacket(i + 0, j + 2); //[a3 a4] block.packet[2] = rhs.template loadPacket(i + 1, j + 0); //[b1 b2] block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] pstore(blockB + ri , block.packet[0]); pstore(blockB + ri + 2, block.packet[1]); pstore(blockB + ri + 4, block.packet[2]); pstore(blockB + ri + 6, block.packet[3]); } ri += 4*vectorSize; } for(; i < depth; i++) { if(StorageOrder == ColMajor) { blockB[ri+0] = rhs(i, j+0); blockB[ri+1] = rhs(i, j+1); ri += vectorSize; blockB[ri+0] = rhs(i, j+2); blockB[ri+1] = rhs(i, j+3); } else { Packet2d rhsV = rhs.template loadPacket(i, j); pstore(blockB + ri, rhsV); ri += vectorSize; rhsV = rhs.template loadPacket(i, j + 2); pstore(blockB + ri, rhsV); } ri += vectorSize; } if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth); } if(PanelMode) ri += offset*(cols - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < cols; k++) { blockB[ri] = rhs(i, k); ri += 1; } } if(PanelMode) ri += (cols - j)*(stride - offset - depth); } }; // General template for lhs complex packing, float64 specialization. 
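// The double-precision complex packer below relies on the p16uc_GETREAL64/p16uc_GETIMAG64 permute
// masks defined at the top of the file: given two loaded pairs [a_re, a_im] and [b_re, b_im], the
// first permute yields [a_re, b_re] and the second [a_im, b_im]. A scalar sketch of that shuffle
// (illustrative only, hypothetical helper name):
#if 0
#include <complex>
void split_two_complex_doubles(const std::complex<double> a, const std::complex<double> b,
                               double real_out[2], double imag_out[2])
{
  real_out[0] = a.real(); real_out[1] = b.real();   // what vec_perm(..., p16uc_GETREAL64) produces
  imag_out[0] = a.imag(); imag_out[1] = b.imag();   // what vec_perm(..., p16uc_GETIMAG64) produces
}
#endif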
template struct lhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; Index ri = 0, j = 0; double *blockAt = reinterpret_cast(blockA); Packet conj = pset1(-1.0); for(j = 0; j + vectorSize <= rows; j+=vectorSize) { Index i = 0; if(PanelMode) ri += vectorSize*offset; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock block; PacketBlock cblock; if(StorageOrder == ColMajor) { cblock.packet[0] = lhs.template loadPacket(j, i + 0); //[a1 a1i] cblock.packet[1] = lhs.template loadPacket(j, i + 1); //[b1 b1i] cblock.packet[2] = lhs.template loadPacket(j + 1, i + 0); //[a2 a2i] cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i] block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2] block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] } else { cblock.packet[0] = lhs.template loadPacket(j + 0, i); //[a1 a1i] cblock.packet[1] = lhs.template loadPacket(j + 1, i); //[a2 a2i] cblock.packet[2] = lhs.template loadPacket(j + 0, i + 1); //[b1 b1i] cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i] block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2] block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] } pstore(blockAt + ri , block.packet[0]); pstore(blockAt + ri + 2, block.packet[1]); ri += 2*vectorSize; } for(; i < depth; i++) { blockAt[ri + 0] = lhs(j + 0, i).real(); blockAt[ri + 1] = lhs(j + 1, i).real(); ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); i = 0; if(PanelMode) ri += vectorSize*offset; for(; i + vectorSize <= depth; i+=vectorSize) { PacketBlock block; PacketBlock cblock; if(StorageOrder == ColMajor) { cblock.packet[0] = lhs.template loadPacket(j, i + 0); cblock.packet[1] = lhs.template loadPacket(j, i + 1); cblock.packet[2] = lhs.template loadPacket(j + 1, i + 0); cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64); block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64); } else { cblock.packet[0] = lhs.template loadPacket(j + 0, i); cblock.packet[1] = lhs.template loadPacket(j + 1, i); cblock.packet[2] = lhs.template loadPacket(j + 0, i + 1); cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); } if(Conjugate) { block.packet[0] *= conj; block.packet[1] *= conj; } pstore(blockAt + ri , block.packet[0]); pstore(blockAt + ri + 2, block.packet[1]); ri += 2*vectorSize; } for(; i < depth; i++) { if(Conjugate) { blockAt[ri + 0] = -lhs(j + 0, i).imag(); blockAt[ri + 1] = -lhs(j + 1, i).imag(); } else { blockAt[ri + 0] = lhs(j + 0, i).imag(); blockAt[ri + 1] = lhs(j + 1, i).imag(); } ri += vectorSize; } if(PanelMode) ri += vectorSize*(stride - offset - depth); } if(PanelMode) ri += offset*(rows - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { blockAt[ri] = lhs(k, i).real(); ri += 1; } } if(PanelMode) ri += (rows - j)*(stride - offset - depth); if(PanelMode) ri += offset*(rows - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < rows; k++) { if(Conjugate) blockAt[ri] = -lhs(k, i).imag(); else 
blockAt[ri] = lhs(k, i).imag(); ri += 1; } } if(PanelMode) ri += (rows - j)*(stride - offset - depth); } }; // General template for rhs complex packing, float64 specialization. template struct rhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { const int vectorSize = quad_traits::vectorsize; double *blockBt = reinterpret_cast(blockB); Packet conj = pset1(-1.0); Index ri = 0, j = 0; for(; j + 2*vectorSize <= cols; j+=2*vectorSize) { Index i = 0; if(PanelMode) ri += offset*(2*vectorSize); for(; i < depth; i++) { PacketBlock cblock; PacketBlock block; cblock.packet[0] = rhs.template loadPacket(i, j + 0); cblock.packet[1] = rhs.template loadPacket(i, j + 1); cblock.packet[2] = rhs.template loadPacket(i, j + 2); cblock.packet[3] = rhs.template loadPacket(i, j + 3); block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); pstore(blockBt + ri , block.packet[0]); pstore(blockBt + ri + 2, block.packet[1]); ri += 2*vectorSize; } if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth); i = 0; if(PanelMode) ri += offset*(2*vectorSize); for(; i < depth; i++) { PacketBlock cblock; PacketBlock block; cblock.packet[0] = rhs.template loadPacket(i, j + 0); //[a1 a1i] cblock.packet[1] = rhs.template loadPacket(i, j + 1); //[b1 b1i] cblock.packet[2] = rhs.template loadPacket(i, j + 2); //[c1 c1i] cblock.packet[3] = rhs.template loadPacket(i, j + 3); //[d1 d1i] block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); if(Conjugate) { block.packet[0] *= conj; block.packet[1] *= conj; } pstore(blockBt + ri , block.packet[0]); pstore(blockBt + ri + 2, block.packet[1]); ri += 2*vectorSize; } if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth); } if(PanelMode) ri += offset*(cols - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < cols; k++) { blockBt[ri] = rhs(i, k).real(); ri += 1; } } if(PanelMode) ri += (cols - j)*(stride - offset - depth); if(PanelMode) ri += offset*(cols - j); for(Index i = 0; i < depth; i++) { Index k = j; for(; k < cols; k++) { if(Conjugate) blockBt[ri] = -rhs(i, k).imag(); else blockBt[ri] = rhs(i, k).imag(); ri += 1; } } if(PanelMode) ri += (cols - j)*(stride - offset - depth); } }; /************** * GEMM utils * **************/ // 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm). 
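// What a single pger call below computes, written out with scalars: a rank-1 update of a 4x4
// (float) accumulator tile, acc[j] (+/-)= lhs * rhs[j], where each rhs[j] has already been
// broadcast to a full vector. Illustrative sketch only; the helper name is hypothetical.
#if 0
void pger_sketch(float acc[4][4], const float lhs[4], const float rhs[4], bool negate)
{
  for (int j = 0; j < 4; j++)            // one accumulator column per broadcast rhs value
    for (int i = 0; i < 4; i++)
      acc[j][i] += (negate ? -lhs[i] : lhs[i]) * rhs[j];
}
#endif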
template<typename Scalar, typename Packet, bool NegativeAccumulate>
EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,4>* acc, const Scalar* lhs, const Packet* rhsV)
{
  asm("#pger begin");
  Packet lhsV = pload<Packet>(lhs);

  if(NegativeAccumulate)
  {
    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
    acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
    acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
    acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
  } else {
    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
    acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
    acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
    acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
  }
  asm("#pger end");
}

template<typename Scalar, typename Packet, bool NegativeAccumulate>
EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,1>* acc, const Scalar* lhs, const Packet* rhsV)
{
  Packet lhsV = pload<Packet>(lhs);

  if(NegativeAccumulate)
  {
    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
  } else {
    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
  }
}

template<typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,4>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
{
#ifdef _ARCH_PWR9
  Packet lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
#else
  Packet lhsV;
  Index i = 0;
  do {
    lhsV[i] = lhs[i];
  } while (++i < remaining_rows);
#endif

  if(NegativeAccumulate)
  {
    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
    acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
    acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
    acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
  } else {
    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
    acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
    acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
    acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
  }
}

template<typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,1>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
{
#ifdef _ARCH_PWR9
  Packet lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
#else
  Packet lhsV;
  Index i = 0;
  do {
    lhsV[i] = lhs[i];
  } while (++i < remaining_rows);
#endif

  if(NegativeAccumulate)
  {
    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
  } else {
    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
  }
}

// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes care of
// mixed types real * complex and complex * real.
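// What pgerc below accumulates, in scalar form: with the real and imaginary parts kept in separate
// accumulators, each (lhs element, rhs element) pair contributes
//   accReal += lr*rr - li*ri   and   accImag += lr*ri + li*rr,
// where li or ri is taken as 0 when LhsIsReal/RhsIsReal, and is negated first when conjugating.
// Illustrative sketch only; the helper name is hypothetical.
#if 0
#include <complex>
void pgerc_sketch(float& accReal, float& accImag, std::complex<float> l, std::complex<float> r,
                  bool conjLhs, bool conjRhs)
{
  const float lr = l.real(), li = conjLhs ? -l.imag() : l.imag();
  const float rr = r.real(), ri = conjRhs ? -r.imag() : r.imag();
  accReal += lr*rr - li*ri;   // real accumulator
  accImag += lr*ri + li*rr;   // imaginary accumulator
}
#endif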
template EIGEN_STRONG_INLINE void pgerc(PacketBlock& accReal, PacketBlock& accImag, const Scalar *rhs_ptr, const Scalar *rhs_ptr_imag, const Scalar *lhs_ptr, const Scalar* lhs_ptr_imag, Packet& conj) { Packet lhsV = *((Packet *) lhs_ptr); Packet rhsV1 = pset1(rhs_ptr[0]); Packet rhsV2 = pset1(rhs_ptr[1]); Packet rhsV3 = pset1(rhs_ptr[2]); Packet rhsV4 = pset1(rhs_ptr[3]); Packet lhsVi; if(!LhsIsReal) lhsVi = *((Packet *) lhs_ptr_imag); Packet rhsV1i, rhsV2i, rhsV3i, rhsV4i; if(!RhsIsReal) { rhsV1i = pset1(rhs_ptr_imag[0]); rhsV2i = pset1(rhs_ptr_imag[1]); rhsV3i = pset1(rhs_ptr_imag[2]); rhsV4i = pset1(rhs_ptr_imag[3]); } if(ConjugateLhs && !LhsIsReal) lhsVi = pmul(lhsVi,conj); if(ConjugateRhs && !RhsIsReal) { rhsV1i = pmul(rhsV1i,conj); rhsV2i = pmul(rhsV2i,conj); rhsV3i = pmul(rhsV3i,conj); rhsV4i = pmul(rhsV4i,conj); } if(LhsIsReal) { accReal.packet[0] = pmadd(rhsV1, lhsV, accReal.packet[0]); accReal.packet[1] = pmadd(rhsV2, lhsV, accReal.packet[1]); accReal.packet[2] = pmadd(rhsV3, lhsV, accReal.packet[2]); accReal.packet[3] = pmadd(rhsV4, lhsV, accReal.packet[3]); accImag.packet[0] = pmadd(rhsV1i, lhsV, accImag.packet[0]); accImag.packet[1] = pmadd(rhsV2i, lhsV, accImag.packet[1]); accImag.packet[2] = pmadd(rhsV3i, lhsV, accImag.packet[2]); accImag.packet[3] = pmadd(rhsV4i, lhsV, accImag.packet[3]); } else if(RhsIsReal) { accReal.packet[0] = pmadd(rhsV1, lhsV, accReal.packet[0]); accReal.packet[1] = pmadd(rhsV2, lhsV, accReal.packet[1]); accReal.packet[2] = pmadd(rhsV3, lhsV, accReal.packet[2]); accReal.packet[3] = pmadd(rhsV4, lhsV, accReal.packet[3]); accImag.packet[0] = pmadd(rhsV1, lhsVi, accImag.packet[0]); accImag.packet[1] = pmadd(rhsV2, lhsVi, accImag.packet[1]); accImag.packet[2] = pmadd(rhsV3, lhsVi, accImag.packet[2]); accImag.packet[3] = pmadd(rhsV4, lhsVi, accImag.packet[3]); } else { accReal.packet[0] = pmadd(rhsV1, lhsV, accReal.packet[0]); accReal.packet[1] = pmadd(rhsV2, lhsV, accReal.packet[1]); accReal.packet[2] = pmadd(rhsV3, lhsV, accReal.packet[2]); accReal.packet[3] = pmadd(rhsV4, lhsV, accReal.packet[3]); accImag.packet[0] = pmadd(rhsV1i, lhsV, accImag.packet[0]); accImag.packet[1] = pmadd(rhsV2i, lhsV, accImag.packet[1]); accImag.packet[2] = pmadd(rhsV3i, lhsV, accImag.packet[2]); accImag.packet[3] = pmadd(rhsV4i, lhsV, accImag.packet[3]); accReal.packet[0] = psub(accReal.packet[0], pmul(rhsV1i, lhsVi)); accReal.packet[1] = psub(accReal.packet[1], pmul(rhsV2i, lhsVi)); accReal.packet[2] = psub(accReal.packet[2], pmul(rhsV3i, lhsVi)); accReal.packet[3] = psub(accReal.packet[3], pmul(rhsV4i, lhsVi)); accImag.packet[0] = pmadd(rhsV1, lhsVi, accImag.packet[0]); accImag.packet[1] = pmadd(rhsV2, lhsVi, accImag.packet[1]); accImag.packet[2] = pmadd(rhsV3, lhsVi, accImag.packet[2]); accImag.packet[3] = pmadd(rhsV4, lhsVi, accImag.packet[3]); } } template EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar *lhs) { return *((Packet *)lhs); } // Zero the accumulator on PacketBlock. template EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); acc.packet[1] = pset1((Scalar)0); acc.packet[2] = pset1((Scalar)0); acc.packet[3] = pset1((Scalar)0); } template EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); } // Scale the PacketBlock vectors by alpha. 
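// bscale below folds the accumulator into the previously loaded result tile as res += alpha * acc
// (one pmadd per packet). In scalar form, for a 4x4 float tile (illustrative sketch only,
// hypothetical helper name):
#if 0
void bscale_sketch(float res[4][4], const float acc[4][4], float alpha)
{
  for (int j = 0; j < 4; j++)
    for (int i = 0; i < 4; i++)
      res[j][i] += alpha * acc[j][i];
}
#endif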
template<typename Packet>
EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
  acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
  acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
  acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
}

template<typename Packet>
EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
}

// Complex version of PacketBlock scaling.
template<typename Packet>
EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag)
{
  cReal.packet[0] = pmul(aReal.packet[0], bReal);
  cReal.packet[1] = pmul(aReal.packet[1], bReal);
  cReal.packet[2] = pmul(aReal.packet[2], bReal);
  cReal.packet[3] = pmul(aReal.packet[3], bReal);

  cImag.packet[0] = pmul(aImag.packet[0], bReal);
  cImag.packet[1] = pmul(aImag.packet[1], bReal);
  cImag.packet[2] = pmul(aImag.packet[2], bReal);
  cImag.packet[3] = pmul(aImag.packet[3], bReal);

  cReal.packet[0] = psub(cReal.packet[0], pmul(aImag.packet[0], bImag));
  cReal.packet[1] = psub(cReal.packet[1], pmul(aImag.packet[1], bImag));
  cReal.packet[2] = psub(cReal.packet[2], pmul(aImag.packet[2], bImag));
  cReal.packet[3] = psub(cReal.packet[3], pmul(aImag.packet[3], bImag));

  cImag.packet[0] = pmadd(aReal.packet[0], bImag, cImag.packet[0]);
  cImag.packet[1] = pmadd(aReal.packet[1], bImag, cImag.packet[1]);
  cImag.packet[2] = pmadd(aReal.packet[2], bImag, cImag.packet[2]);
  cImag.packet[3] = pmadd(aReal.packet[3], bImag, cImag.packet[3]);
}

// Load a PacketBlock; the N parameter makes tuning gemm easier so we can add more accumulators as needed.
template<typename DataMapper, typename Packet, typename Index, int N>
EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col, Index accCols)
{
  acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
  acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
  acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
  acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
}

// An overload of bload for when you have a PacketBlock with 8 vectors.
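// The 8-packet overload below is used on the complex path: it loads two adjacent 4x4 row blocks of
// the result (at row + N*accCols and row + (N+1)*accCols) so that the decoupled real and imaginary
// accumulators can both be folded in. A scalar sketch of the addressing, assuming a column-major
// result buffer and 4-element packets; the helper name and raw-pointer indexing are assumptions,
// since the actual code goes through the DataMapper.
#if 0
void bload8_sketch(float out[8][4], const float* res, int resStride,
                   int row, int col, int accCols, int N)
{
  for (int j = 0; j < 4; j++)
    for (int i = 0; i < 4; i++) {
      out[j][i]     = res[(row + N*accCols)     + i + (col + j)*resStride];  // first 4 packets
      out[4 + j][i] = res[(row + (N+1)*accCols) + i + (col + j)*resStride];  // last 4 packets
    }
}
#endif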
template EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col, Index accCols) { acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); } const static Packet4i mask41 = { -1, 0, 0, 0 }; const static Packet4i mask42 = { -1, -1, 0, 0 }; const static Packet4i mask43 = { -1, -1, -1, 0 }; const static Packet2l mask21 = { -1, 0 }; template EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(float(0.0)); } else { switch (remaining_rows) { case 1: return Packet(mask41); case 2: return Packet(mask42); default: return Packet(mask43); } } } template<> EIGEN_STRONG_INLINE Packet2d bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(double(0.0)); } else { return Packet2d(mask21); } } template EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) { acc.packet[0] = pmadd(pAlpha, pand(accZ.packet[0], pMask), acc.packet[0]); acc.packet[1] = pmadd(pAlpha, pand(accZ.packet[1], pMask), acc.packet[1]); acc.packet[2] = pmadd(pAlpha, pand(accZ.packet[2], pMask), acc.packet[2]); acc.packet[3] = pmadd(pAlpha, pand(accZ.packet[3], pMask), acc.packet[3]); } // PEEL loop factor. #define PEEL 10 template EIGEN_STRONG_INLINE void MICRO_EXTRA_COL( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr, PacketBlock &accZero, Index remaining_rows, Index remaining_cols) { Packet rhsV[1]; rhsV[0] = pset1(rhs_ptr[0]); pger(&accZero, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += remaining_cols; } template EIGEN_STRONG_INLINE void gemm_extra_col( const DataMapper& res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index row, Index col, Index remaining_rows, Index remaining_cols, const Packet& pAlpha) { const Scalar *rhs_ptr = rhs_base; const Scalar *lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; PacketBlock accZero, acc; bsetzero(accZero); Index remaining_depth = (depth & -accRows); Index k = 0; for(; k + PEEL <= remaining_depth; k+= PEEL) { prefetch(rhs_ptr); prefetch(lhs_ptr); for (int l = 0; l < PEEL; l++) { MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); } } for(; k < remaining_depth; k++) { MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); } for(; k < depth; k++) { Packet rhsV[1]; rhsV[0] = pset1(rhs_ptr[0]); pger(&accZero, lhs_ptr, rhsV, remaining_rows); lhs_ptr += remaining_rows; rhs_ptr += remaining_cols; } acc.packet[0] = vec_mul(pAlpha, accZero.packet[0]); for(Index i = 0; i < remaining_rows; i++){ res(row + i, col) += acc.packet[0][i]; } } template EIGEN_STRONG_INLINE void MICRO_EXTRA_ROW( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr, PacketBlock &accZero, Index remaining_rows) { Packet rhsV[4]; pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); pger(&accZero, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += accRows; } template EIGEN_STRONG_INLINE void gemm_extra_row( const DataMapper& res, const Scalar *lhs_base, const Scalar *rhs_base, 
Index depth, Index strideA, Index offsetA, Index row, Index col, Index cols, Index remaining_rows, const Packet& pAlpha, const Packet& pMask) { const Scalar *rhs_ptr = rhs_base; const Scalar *lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; PacketBlock accZero, acc; bsetzero(accZero); Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); Index k = 0; for(; k + PEEL <= remaining_depth; k+= PEEL) { prefetch(rhs_ptr); prefetch(lhs_ptr); for (int l = 0; l < PEEL; l++) { MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); } } for(; k < remaining_depth; k++) { MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); } if (remaining_depth == depth) { for(Index j = 0; j < 4; j++){ acc.packet[j] = res.template loadPacket(row, col + j); } bscale(acc, accZero, pAlpha, pMask); res.template storePacketBlock(row, col, acc); } else { for(; k < depth; k++) { Packet rhsV[4]; pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); pger(&accZero, lhs_ptr, rhsV, remaining_rows); lhs_ptr += remaining_rows; rhs_ptr += accRows; } for(Index j = 0; j < 4; j++){ acc.packet[j] = vec_mul(pAlpha, accZero.packet[j]); } for(Index j = 0; j < 4; j++){ for(Index i = 0; i < remaining_rows; i++){ res(row + i, col + j) += acc.packet[j][i]; } } } } #define MICRO_DST \ PacketBlock *accZero0, PacketBlock *accZero1, PacketBlock *accZero2, \ PacketBlock *accZero3, PacketBlock *accZero4, PacketBlock *accZero5, \ PacketBlock *accZero6, PacketBlock *accZero7 #define MICRO_COL_DST \ PacketBlock *accZero0, PacketBlock *accZero1, PacketBlock *accZero2, \ PacketBlock *accZero3, PacketBlock *accZero4, PacketBlock *accZero5, \ PacketBlock *accZero6, PacketBlock *accZero7 #define MICRO_SRC \ const Scalar **lhs_ptr0, const Scalar **lhs_ptr1, const Scalar **lhs_ptr2, \ const Scalar **lhs_ptr3, const Scalar **lhs_ptr4, const Scalar **lhs_ptr5, \ const Scalar **lhs_ptr6, const Scalar **lhs_ptr7 #define MICRO_ONE \ MICRO(\ &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \ rhs_ptr, \ &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7); #define MICRO_COL_ONE \ MICRO_COL(\ &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \ rhs_ptr, \ &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7, \ remaining_cols); #define MICRO_WORK_ONE(iter) \ if (N > iter) { \ pger(accZero##iter, *lhs_ptr##iter, rhsV); \ *lhs_ptr##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(accZero##iter); \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } #define MICRO_UNROLL(func) \ func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) #define MICRO_WORK MICRO_UNROLL(MICRO_WORK_ONE) #define MICRO_DST_PTR_ONE(iter) \ if (unroll_factor > iter){ \ bsetzero(accZero##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accZero##iter); \ } #define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE) #define MICRO_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } #define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE) #define MICRO_PREFETCH_ONE(iter) \ if (unroll_factor > iter){ \ prefetch(lhs_ptr##iter); \ } #define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE) #define MICRO_STORE_ONE(iter) \ if (unroll_factor > iter){ \ acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ 
acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \ acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ bscale(acc, accZero##iter, pAlpha); \ res.template storePacketBlock(row + iter*accCols, col, acc); \ } #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) #define MICRO_COL_STORE_ONE(iter) \ if (unroll_factor > iter){ \ acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ bscale(acc, accZero##iter, pAlpha); \ res.template storePacketBlock(row + iter*accCols, col, acc); \ } #define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) template EIGEN_STRONG_INLINE void MICRO( MICRO_SRC, const Scalar* &rhs_ptr, MICRO_DST) { Packet rhsV[4]; pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); asm("#unrolled pger? begin"); MICRO_WORK asm("#unrolled pger? end"); rhs_ptr += accRows; } template EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const DataMapper& res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index& row, Index col, const Packet& pAlpha) { const Scalar *rhs_ptr = rhs_base; const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7; PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; PacketBlock acc; asm("#unrolled start"); MICRO_SRC_PTR asm("#unrolled zero?"); MICRO_DST_PTR Index k = 0; for(; k + PEEL <= depth; k+= PEEL) { prefetch(rhs_ptr); MICRO_PREFETCH asm("#unrolled inner loop?"); for (int l = 0; l < PEEL; l++) { MICRO_ONE } asm("#unrolled inner loop end?"); } for(; k < depth; k++) { MICRO_ONE } MICRO_STORE row += unroll_factor*accCols; } template EIGEN_STRONG_INLINE void MICRO_COL( MICRO_SRC, const Scalar* &rhs_ptr, MICRO_COL_DST, Index remaining_rows) { Packet rhsV[1]; rhsV[0] = pset1(rhs_ptr[0]); asm("#unrolled pger? begin"); MICRO_WORK asm("#unrolled pger? 
end"); rhs_ptr += remaining_rows; } template EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( const DataMapper& res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index& row, Index col, Index remaining_cols, const Packet& pAlpha) { const Scalar *rhs_ptr = rhs_base; const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7; PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; PacketBlock acc; MICRO_SRC_PTR MICRO_DST_PTR Index k = 0; for(; k + PEEL <= depth; k+= PEEL) { prefetch(rhs_ptr); MICRO_PREFETCH for (int l = 0; l < PEEL; l++) { MICRO_COL_ONE } } for(; k < depth; k++) { MICRO_COL_ONE } MICRO_COL_STORE row += unroll_factor*accCols; } template EIGEN_STRONG_INLINE void gemm_unrolled_col( const DataMapper& res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index& row, Index rows, Index col, Index remaining_cols, const Packet& pAlpha) { #define MAX_UNROLL 6 while(row + MAX_UNROLL*accCols <= rows){ gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); } switch( (rows-row)/accCols ){ #if MAX_UNROLL > 7 case 7: gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); break; #endif #if MAX_UNROLL > 6 case 6: gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); break; #endif #if MAX_UNROLL > 5 case 5: gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); break; #endif #if MAX_UNROLL > 4 case 4: gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); break; #endif #if MAX_UNROLL > 3 case 3: gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); break; #endif #if MAX_UNROLL > 2 case 2: gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); break; #endif #if MAX_UNROLL > 1 case 1: gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); break; #endif default: break; } #undef MAX_UNROLL } /**************** * GEMM kernels * * **************/ template EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; const Packet pAlpha = pset1(alpha); const Packet pMask = bmask((const int)(remaining_rows)); Index col = 0; for(; col + accRows <= cols; col += accRows) { const Scalar *rhs_base = blockB + col*strideB + accRows*offsetB; const Scalar *lhs_base = blockA; Index row = 0; asm("#jump table"); #define MAX_UNROLL 6 while(row + MAX_UNROLL*accCols <= rows){ gemm_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, 
pAlpha); } switch( (rows-row)/accCols ){ #if MAX_UNROLL > 7 case 7: gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); break; #endif #if MAX_UNROLL > 6 case 6: gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); break; #endif #if MAX_UNROLL > 5 case 5: gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); break; #endif #if MAX_UNROLL > 4 case 4: gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); break; #endif #if MAX_UNROLL > 3 case 3: gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); break; #endif #if MAX_UNROLL > 2 case 2: gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); break; #endif #if MAX_UNROLL > 1 case 1: gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); break; #endif default: break; } #undef MAX_UNROLL asm("#jump table end"); if(remaining_rows > 0) { gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, cols, remaining_rows, pAlpha, pMask); } } if(remaining_cols > 0) { const Scalar *rhs_base = blockB + col*strideB + remaining_cols*offsetB; const Scalar *lhs_base = blockA; for(; col < cols; col++) { Index row = 0; gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); if (remaining_rows > 0) { gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); } rhs_base++; } } } template EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const int remaining_rows = rows % accCols; const int remaining_cols = cols % accRows; const int accColsC = accCols / 2; int advanceCols = 2; int advanceRows = 2; if(LhsIsReal) advanceRows = 1; if(RhsIsReal) advanceCols = 1; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; const Packet pAlphaReal = pset1(alpha.real()); const Packet pAlphaImag = pset1(alpha.imag()); const Scalar *blockA = (Scalar *) blockAc; const Scalar *blockB = (Scalar *) blockBc; Packet conj = pset1((Scalar)-1.0); Index col = 0; for(; col + accRows <= cols; col += accRows) { const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows )*strideB*accRows; const Scalar *lhs_base = blockA; Index row = 0; for(; row + accCols <= rows; row += accCols) { #define MICRO() \ pgerc(accReal1, accImag1, rhs_ptr, rhs_ptr_imag, lhs_ptr1, lhs_ptr_imag1, conj); \ lhs_ptr1 += accCols; \ rhs_ptr += accRows; \ if(!LhsIsReal) \ lhs_ptr_imag1 += accCols; \ if(!RhsIsReal) \ rhs_ptr_imag += accRows; const Scalar *rhs_ptr = rhs_base; const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB; const Scalar *lhs_ptr1 = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols; const Scalar *lhs_ptr_imag1 = lhs_ptr1 + accCols*strideA; PacketBlock accReal1, accImag1; bsetzero(accReal1); bsetzero(accImag1); lhs_ptr1 += 

template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
          Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const int remaining_rows = rows % accCols;
  const int remaining_cols = cols % accRows;
  const int accColsC = accCols / 2;
  int advanceCols = 2;
  int advanceRows = 2;

  if(LhsIsReal) advanceRows = 1;
  if(RhsIsReal) advanceCols = 1;

  if( strideA == -1 ) strideA = depth;
  if( strideB == -1 ) strideB = depth;

  const Packet pAlphaReal = pset1<Packet>(alpha.real());
  const Packet pAlphaImag = pset1<Packet>(alpha.imag());

  const Scalar *blockA = (Scalar *) blockAc;
  const Scalar *blockB = (Scalar *) blockBc;

  Packet conj = pset1<Packet>((Scalar)-1.0);

  Index col = 0;
  for(; col + accRows <= cols; col += accRows)
  {
    const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows )*strideB*accRows;
    const Scalar *lhs_base = blockA;

    Index row = 0;
    for(; row + accCols <= rows; row += accCols)
    {
#define MICRO() \
      pgerc(accReal1, accImag1, rhs_ptr, rhs_ptr_imag, lhs_ptr1, lhs_ptr_imag1, conj); \
      lhs_ptr1 += accCols; \
      rhs_ptr += accRows; \
      if(!LhsIsReal) \
        lhs_ptr_imag1 += accCols; \
      if(!RhsIsReal) \
        rhs_ptr_imag += accRows;

      const Scalar *rhs_ptr = rhs_base;
      const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
      const Scalar *lhs_ptr1 = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
      const Scalar *lhs_ptr_imag1 = lhs_ptr1 + accCols*strideA;

      PacketBlock<Packet,4> accReal1, accImag1;
      bsetzero(accReal1);
      bsetzero(accImag1);

      lhs_ptr1 += accCols*offsetA;
      if(!LhsIsReal)
        lhs_ptr_imag1 += accCols*offsetA;
      rhs_ptr += accRows*offsetB;
      if(!RhsIsReal)
        rhs_ptr_imag += accRows*offsetB;

      Index k = 0;
      for(; k + PEEL <= depth; k+=PEEL)
      {
        prefetch(rhs_ptr);
        prefetch(rhs_ptr_imag);
        prefetch(lhs_ptr1);
        prefetch(lhs_ptr_imag1);
        MICRO();
        MICRO();
        MICRO();
        MICRO();
        MICRO();
        MICRO();
        MICRO();
        MICRO();
#if PEEL > 8
        MICRO();
        MICRO();
#endif
      }
      for(; k < depth; k++)
      {
        MICRO();
      }
      PacketBlock<Packet,4> taccReal, taccImag;
      bscalec(accReal1, accImag1, pAlphaReal, pAlphaImag, taccReal, taccImag);

      PacketBlock<Packetc,8> tRes;
      bload(tRes, res, row, col, accColsC);

      PacketBlock<Packetc,4> acc1, acc2;
      bcouple(taccReal, taccImag, tRes, acc1, acc2);

      res.template storePacketBlock<Packetc,4>(row + 0, col, acc1);
      res.template storePacketBlock<Packetc,4>(row + accColsC, col, acc2);
#undef MICRO
    }

    if(remaining_rows > 0)
    {
      const Scalar *rhs_ptr = rhs_base;
      const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
      const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
      const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;

      lhs_ptr += remaining_rows*offsetA;
      if(!LhsIsReal)
        lhs_ptr_imag += remaining_rows*offsetA;
      rhs_ptr += accRows*offsetB;
      if(!RhsIsReal)
        rhs_ptr_imag += accRows*offsetB;

      for(Index k = 0; k < depth; k++)
      {
        for(Index arow = 0; arow < remaining_rows; arow++)
        {
          Scalar lhs_real = lhs_ptr[arow];
          Scalar lhs_imag;
          if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];

          Scalarc lhsc;
          lhsc.real(lhs_real);
          if(!LhsIsReal)
          {
            if(ConjugateLhs)
              lhsc.imag(-lhs_imag);
            else
              lhsc.imag(lhs_imag);
          } else {
            //Lazy approach for now
            lhsc.imag((Scalar)0);
          }

          for(int acol = 0; acol < accRows; acol++)
          {
            Scalar rhs_real = rhs_ptr[acol];
            Scalar rhs_imag;
            if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];

            Scalarc rhsc;
            rhsc.real(rhs_real);
            if(!RhsIsReal)
            {
              if(ConjugateRhs)
                rhsc.imag(-rhs_imag);
              else
                rhsc.imag(rhs_imag);
            } else {
              //Lazy approach for now
              rhsc.imag((Scalar)0);
            }
            res(row + arow, col + acol) += alpha*lhsc*rhsc;
          }
        }
        rhs_ptr += accRows;
        lhs_ptr += remaining_rows;
        if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
        if(!RhsIsReal) rhs_ptr_imag += accRows;
      }
    }
  }

  if(remaining_cols > 0)
  {
    const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows )*strideB*accRows;
    const Scalar *lhs_base = blockA;

    Index row = 0;
    for(; row + accCols <= rows; row += accCols)
    {
      const Scalar *rhs_ptr = rhs_base;
      const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
      const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
      const Scalar *lhs_ptr_imag = lhs_ptr + accCols*strideA;

      lhs_ptr += accCols*offsetA;
      if(!LhsIsReal)
        lhs_ptr_imag += accCols*offsetA;
      rhs_ptr += remaining_cols*offsetB;
      if(!RhsIsReal)
        rhs_ptr_imag += remaining_cols*offsetB;

      Scalarc scalarAcc[4][4];
      for(Index arow = 0; arow < 4; arow++)
      {
        for(Index acol = 0; acol < 4; acol++)
        {
          scalarAcc[arow][acol].real((Scalar)0.0);
          scalarAcc[arow][acol].imag((Scalar)0.0);
        }
      }
      for(Index k = 0; k < depth; k++)
      {
        for(Index arow = 0; arow < accCols; arow++)
        {
          Scalar lhs_real = lhs_ptr[arow];
          Scalar lhs_imag;
          if(!LhsIsReal)
          {
            lhs_imag = lhs_ptr_imag[arow];
            if(ConjugateLhs)
              lhs_imag *= -1;
          } else {
            lhs_imag = (Scalar)0;
          }

          for(int acol = 0; acol < remaining_cols; acol++)
          {
            Scalar rhs_real = rhs_ptr[acol];
            Scalar rhs_imag;
            if(!RhsIsReal)
            {
              rhs_imag = rhs_ptr_imag[acol];
              if(ConjugateRhs)
                rhs_imag *= -1;
            } else {
              rhs_imag = (Scalar)0;
            }
            scalarAcc[arow][acol].real(scalarAcc[arow][acol].real() + lhs_real*rhs_real - lhs_imag*rhs_imag);
            scalarAcc[arow][acol].imag(scalarAcc[arow][acol].imag() + lhs_imag*rhs_real + lhs_real*rhs_imag);
          }
        }
        rhs_ptr += remaining_cols;
        lhs_ptr += accCols;
        if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
        if(!LhsIsReal) lhs_ptr_imag += accCols;
      }
      for(int arow = 0; arow < accCols; arow++)
      {
        for(int acol = 0; acol < remaining_cols; acol++)
        {
          Scalar accR = scalarAcc[arow][acol].real();
          Scalar accI = scalarAcc[arow][acol].imag();
          Scalar aR = alpha.real();
          Scalar aI = alpha.imag();
          Scalar resR = res(row + arow, col + acol).real();
          Scalar resI = res(row + arow, col + acol).imag();

          res(row + arow, col + acol).real(resR + accR*aR - accI*aI);
          res(row + arow, col + acol).imag(resI + accR*aI + accI*aR);
        }
      }
    }

    if(remaining_rows > 0)
    {
      const Scalar *rhs_ptr = rhs_base;
      const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
      const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
      const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;

      lhs_ptr += remaining_rows*offsetA;
      if(!LhsIsReal)
        lhs_ptr_imag += remaining_rows*offsetA;
      rhs_ptr += remaining_cols*offsetB;
      if(!RhsIsReal)
        rhs_ptr_imag += remaining_cols*offsetB;

      for(Index k = 0; k < depth; k++)
      {
        for(Index arow = 0; arow < remaining_rows; arow++)
        {
          Scalar lhs_real = lhs_ptr[arow];
          Scalar lhs_imag;
          if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];

          Scalarc lhsc;
          lhsc.real(lhs_real);
          if(!LhsIsReal)
          {
            if(ConjugateLhs)
              lhsc.imag(-lhs_imag);
            else
              lhsc.imag(lhs_imag);
          } else {
            lhsc.imag((Scalar)0);
          }

          for(Index acol = 0; acol < remaining_cols; acol++)
          {
            Scalar rhs_real = rhs_ptr[acol];
            Scalar rhs_imag;
            if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];

            Scalarc rhsc;
            rhsc.real(rhs_real);
            if(!RhsIsReal)
            {
              if(ConjugateRhs)
                rhsc.imag(-rhs_imag);
              else
                rhsc.imag(rhs_imag);
            } else {
              rhsc.imag((Scalar)0);
            }
            res(row + arow, col + acol) += alpha*lhsc*rhsc;
          }
        }
        rhs_ptr += remaining_cols;
        lhs_ptr += remaining_rows;
        if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
        if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
      }
    }
  }
}
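
// gemm_complex() keeps the packed real and imaginary parts in separate panels (see the packing
// commentary earlier in this file), so the imaginary counterpart of an accRows-wide rhs panel
// lives at rhs_ptr + accRows*strideB. The scalar fallback paths above spell out the accumulation
// that the vectorized pgerc path performs per lane:
//   acc.real += lhs.real*rhs.real - lhs.imag*rhs.imag;
//   acc.imag += lhs.imag*rhs.real + lhs.real*rhs.imag;
// with the sign of lhs.imag/rhs.imag flipped when ConjugateLhs/ConjugateRhs is set, and the
// imaginary input treated as zero when LhsIsReal/RhsIsReal is set.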

/************************************
 * ppc64le template specializations *
 * **********************************/
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}
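
// These gemm_pack_lhs/gemm_pack_rhs specializations forward Eigen's generic packing entry points
// to the AltiVec pack kernels defined earlier in this file: lhs_pack/rhs_pack for real data and
// lhs_cpack/rhs_cpack for complex data, the latter splitting real and imaginary parts into
// separate panels as described in the packing commentary above.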
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  lhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode> pack;
  pack(blockA, lhs, depth, rows, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  rhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode> pack;
  pack(blockB, rhs, depth, cols, stride, offset);
}

// ********* gebp specializations *********
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<float>::vectortype Packet;
  typedef typename quad_traits<float>::rhstype RhsPacket;

  void operator()(const DataMapper& res, const float* blockA, const float* blockB,
                  Index rows, Index depth, Index cols, float alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
               Index rows, Index depth, Index cols, float alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const Index accRows = quad_traits<float>::rows;
  const Index accCols = quad_traits<float>::size;
  void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
  }
  else{
    gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
  }
#else
  gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}
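
// Every gebp_kernel specialization below selects its GEMM implementation the same way as the
// float kernel above: when EIGEN_ALTIVEC_MMA_ONLY is defined, the Power10 MMA path
// (gemmMMA/gemm_complexMMA from MatrixProductMMA.h) is used unconditionally; otherwise, if MMA
// support was available at compile time, __builtin_cpu_supports("arch_3_1") and
// __builtin_cpu_supports("mma") choose between the MMA path and the plain VSX path at runtime,
// and without compile-time MMA support the VSX gemm/gemm_complex kernels are used directly.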

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f Packet;
  typedef Packet2cf Packetc;
  typedef Packet4f RhsPacket;

  void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const int accRows = quad_traits<float>::rows;
  const int accCols = quad_traits<float>::size;
  void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
                        Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
  }
  else{
    gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
  }
#else
  gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f Packet;
  typedef Packet2cf Packetc;
  typedef Packet4f RhsPacket;

  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const int accRows = quad_traits<float>::rows;
  const int accCols = quad_traits<float>::size;
  void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
                        Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
  }
  else{
    gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
  }
#else
  gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f Packet;
  typedef Packet2cf Packetc;
  typedef Packet4f RhsPacket;

  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const int accRows = quad_traits<float>::rows;
  const int accCols = quad_traits<float>::size;
  void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
                        Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
  }
  else{
    gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
  }
#else
  gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<double>::vectortype Packet;
  typedef typename quad_traits<double>::rhstype RhsPacket;

  void operator()(const DataMapper& res, const double* blockA, const double* blockB,
                  Index rows, Index depth, Index cols, double alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
               Index rows, Index depth, Index cols, double alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const Index accRows = quad_traits<double>::rows;
  const Index accCols = quad_traits<double>::size;
  void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
  }
  else{
    gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
  }
#else
  gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef quad_traits<double>::vectortype Packet;
  typedef Packet1cd Packetc;
  typedef quad_traits<double>::rhstype RhsPacket;

  void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const int accRows = quad_traits<double>::rows;
  const int accCols = quad_traits<double>::size;
  void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
                        Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
  }
  else{
    gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
  }
#else
  gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef quad_traits<double>::vectortype Packet;
  typedef Packet1cd Packetc;
  typedef quad_traits<double>::rhstype RhsPacket;

  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const int accRows = quad_traits<double>::rows;
  const int accCols = quad_traits<double>::size;
  void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
                        Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
  }
  else{
    gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
  }
#else
  gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef quad_traits<double>::vectortype Packet;
  typedef Packet1cd Packetc;
  typedef quad_traits<double>::rhstype RhsPacket;

  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const int accRows = quad_traits<double>::rows;
  const int accCols = quad_traits<double>::size;
  void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
                        Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

#ifdef EIGEN_ALTIVEC_MMA_ONLY
  //generate with MMA only
  gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
#elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
    gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
  }
  else{
    gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
  }
#else
  gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
#endif
  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H