//#define EIGEN_POWER_USE_PREFETCH // Use prefetching in gemm routines #ifdef EIGEN_POWER_USE_PREFETCH #define EIGEN_POWER_PREFETCH(p) prefetch(p) #else #define EIGEN_POWER_PREFETCH(p) #endif namespace Eigen { namespace internal { template EIGEN_STRONG_INLINE void gemm_extra_col( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, Index offsetA, Index row, Index col, Index remaining_rows, Index remaining_cols, const Packet& pAlpha); template EIGEN_STRONG_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, Index offsetA, Index row, Index col, Index rows, Index cols, Index remaining_rows, const Packet& pAlpha, const Packet& pMask); template EIGEN_STRONG_INLINE void gemm_unrolled_col( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, Index offsetA, Index& row, Index rows, Index col, Index remaining_cols, const Packet& pAlpha); template EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows); template EIGEN_STRONG_INLINE void gemm_complex_extra_col( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index col, Index remaining_rows, Index remaining_cols, const Packet& pAlphaReal, const Packet& pAlphaImag); template EIGEN_STRONG_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index col, Index rows, Index cols, Index remaining_rows, const Packet& pAlphaReal, const Packet& pAlphaImag, const Packet& pMask); template EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index& row, Index rows, Index col, Index remaining_cols, const Packet& pAlphaReal, const Packet& pAlphaImag); template EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs); template EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); template EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); template EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); template EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; //[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; //[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. template EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); } template EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { bcouple_common(taccReal, taccImag, acc1, acc2); acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); } template EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); } template EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { bcouple_common(taccReal, taccImag, acc1, acc2); acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); } template<> EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); } template<> EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); } // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. template EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar* rhs) { return *reinterpret_cast(const_cast(rhs)); } } // end namespace internal } // end namespace Eigen