Diffstat (limited to 'Eigen/src/Core/products'):

 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 80
 Eigen/src/Core/products/GeneralMatrixMatrix.h     |  4
 Eigen/src/Core/products/GeneralMatrixVector.h     | 15

 3 files changed, 86 insertions(+), 13 deletions(-)
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index cba76edfe..0f47f6de5 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -208,7 +208,16 @@ public:
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
   {
+    // It would be a lot cleaner to call pmadd all the time. Unfortunately, if we
+    // let gcc allocate the register in which to store the result of the pmul
+    // (in the case where there is no FMA), gcc fails to figure out how to avoid
+    // spilling registers.
+#ifdef EIGEN_VECTORIZE_FMA
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c = pmadd(a,b,c);
+#else
     tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+#endif
   }

   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
@@ -287,7 +296,12 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
+#ifdef EIGEN_VECTORIZE_FMA
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a.v,b,c.v);
+#else
     tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
   }

   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -983,9 +997,22 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder,
   }
   else
   {
-    for(Index k=0; k<depth; k++)
-    {
-      // TODO add a vectorized transpose here
+    const Index peeled_k = (depth/PacketSize)*PacketSize;
+    Index k=0;
+    for(; k<peeled_k; k+=PacketSize) {
+      for (Index m = 0; m < Pack1; m += PacketSize) {
+        Kernel<Packet> kernel;
+        for (int p = 0; p < PacketSize; ++p) {
+          kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
+        }
+        ptranspose(kernel);
+        for (int p = 0; p < PacketSize; ++p) {
+          pstore(blockA+count+m+Pack1*p, cj.pconj(kernel.packet[p]));
+        }
+      }
+      count += PacketSize*Pack1;
+    }
+    for(; k<depth; k++) {
       Index w=0;
       for(; w<Pack1-3; w+=4)
       {
@@ -1050,6 +1077,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
   Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
   Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   Index count = 0;
+  const Index peeled_k = (depth/PacketSize)*PacketSize;
   if(nr>=8)
   {
     for(Index j2=0; j2<packet_cols8; j2+=8)
@@ -1064,7 +1092,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
       const Scalar* b5 = &rhs[(j2+5)*rhsStride];
       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
-      for(Index k=0; k<depth; k++)
+      Index k=0;
+      if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+      {
+        for(; k<peeled_k; k+=PacketSize) {
+          Kernel<Packet> kernel;
+          for (int p = 0; p < PacketSize; ++p) {
+            kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+          }
+          ptranspose(kernel);
+          for (int p = 0; p < PacketSize; ++p) {
+            pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+            count+=PacketSize;
+          }
+        }
+      }
+      for(; k<depth; k++)
       {
         blockB[count+0] = cj(b0[k]);
         blockB[count+1] = cj(b1[k]);
@@ -1091,7 +1134,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
       const Scalar* b1 = &rhs[(j2+1)*rhsStride];
       const Scalar* b2 = &rhs[(j2+2)*rhsStride];
       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
-      for(Index k=0; k<depth; k++)
+      Index k=0;
+      if(PacketSize==4) // TODO enable vectorized transposition for PacketSize==2 ?
+      {
+        for(; k<peeled_k; k+=PacketSize) {
+          Kernel<Packet> kernel;
+          for (int p = 0; p < PacketSize; ++p) {
+            kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+          }
+          ptranspose(kernel);
+          for (int p = 0; p < PacketSize; ++p) {
+            pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+            count+=PacketSize;
+          }
+        }
+      }
+      for(; k<depth; k++)
       {
         blockB[count+0] = cj(b0[k]);
         blockB[count+1] = cj(b1[k]);
@@ -1148,10 +1206,14 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
       if(PanelMode) count += 8 * offset;
       for(Index k=0; k<depth; k++)
       {
-        if (8 == PacketSize) {
+        if (PacketSize==8) {
           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
           pstoreu(blockB+count, cj.pconj(A));
-          count += PacketSize;
+        } else if (PacketSize==4) {
+          Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+          Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+          pstoreu(blockB+count, cj.pconj(A));
+          pstoreu(blockB+count+PacketSize, cj.pconj(B));
         } else {
           const Scalar* b0 = &rhs[k*rhsStride + j2];
           blockB[count+0] = cj(b0[0]);
@@ -1162,8 +1224,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
           blockB[count+5] = cj(b0[5]);
           blockB[count+6] = cj(b0[6]);
           blockB[count+7] = cj(b0[7]);
-          count += 8;
         }
+        count += 8;
       }
       // skip what we have after
       if(PanelMode) count += 8 * (stride-offset-depth);
@@ -1177,7 +1239,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
       if(PanelMode) count += 4 * offset;
       for(Index k=0; k<depth; k++)
       {
-        if (4 == PacketSize) {
+        if (PacketSize==4) {
           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
           pstoreu(blockB+count, cj.pconj(A));
           count += PacketSize;
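
Editor's note: the madd()/madd_impl() hunks above select a single fused pmadd() when FMA is available, and otherwise keep the explicit tmp so that gcc does not spill the intermediate product. Below is a minimal standalone sketch of that pattern, not Eigen code: MyPacket and the scalar pmul/padd/pmadd helpers are hypothetical stand-ins for Eigen's packet primitives.

    #include <cstdio>

    struct MyPacket { float v; };  // stand-in for a SIMD packet type

    static inline MyPacket pmul (MyPacket a, MyPacket b) { MyPacket r = {a.v * b.v}; return r; }
    static inline MyPacket padd (MyPacket a, MyPacket b) { MyPacket r = {a.v + b.v}; return r; }
    static inline MyPacket pmadd(MyPacket a, MyPacket b, MyPacket c) { MyPacket r = {a.v * b.v + c.v}; return r; }

    static inline void madd(MyPacket a, MyPacket b, MyPacket& c, MyPacket& tmp)
    {
    #ifdef EIGEN_VECTORIZE_FMA
      (void)tmp;            // plays the role of EIGEN_UNUSED_VARIABLE(tmp)
      c = pmadd(a, b, c);   // one fused multiply-add, no intermediate register
    #else
      tmp = b;              // naming the intermediate keeps gcc from spilling it
      tmp = pmul(a, tmp);
      c   = padd(c, tmp);
    #endif
    }

    int main()
    {
      MyPacket a = {2.f}, b = {3.f}, c = {1.f}, tmp = {0.f};
      madd(a, b, c, tmp);
      std::printf("%g\n", c.v);  // 7 on either path; only the codegen differs
    }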
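Editor's note: the packing hunks above replace the scalar gather loops with a blocked transpose: PacketSize packets are loaded along rows, transposed in registers with ptranspose(), and stored as contiguous columns. Below is a standalone sketch of the idea (not Eigen code), assuming a row-major lhs whose dimensions are multiples of PacketSize, and modeling Kernel<Packet> plus its SIMD shuffles with plain float arrays and a scalar in-place transpose.

    #include <cstdio>

    const int PacketSize = 4;

    // Stands in for ptranspose(Kernel<Packet>&): transpose a PacketSize x
    // PacketSize block in place (a short shuffle sequence on real packets).
    static void ptranspose(float p[PacketSize][PacketSize])
    {
      for (int i = 0; i < PacketSize; ++i)
        for (int j = i + 1; j < PacketSize; ++j) {
          float t = p[i][j]; p[i][j] = p[j][i]; p[j][i] = t;
        }
    }

    // Pack a row-major rows x depth matrix into column-major blocks, assuming
    // rows and depth are multiples of PacketSize (the real code peels the
    // remainder with a scalar loop).
    static void pack_lhs(float* blockA, const float* lhs, int rows, int depth)
    {
      int count = 0;
      for (int k = 0; k < depth; k += PacketSize)
        for (int m = 0; m < rows; m += PacketSize) {
          float kernel[PacketSize][PacketSize];
          for (int p = 0; p < PacketSize; ++p)    // "ploadu": row p of the
            for (int q = 0; q < PacketSize; ++q)  // block, PacketSize k's
              kernel[p][q] = lhs[(m + p) * depth + (k + q)];
          ptranspose(kernel);                     // block transpose in registers
          for (int p = 0; p < PacketSize; ++p)    // "pstore": packet p is now
            for (int q = 0; q < PacketSize; ++q)  // column k+p of the block
              blockA[count++] = kernel[p][q];
        }
    }

    int main()
    {
      float lhs[16], blockA[16];
      for (int i = 0; i < 16; ++i) lhs[i] = float(i);  // 4x4 row-major, a(r,c)=4r+c
      pack_lhs(blockA, lhs, 4, 4);
      for (int i = 0; i < 16; ++i) std::printf("%g ", blockA[i]);
      std::printf("\n");  // 0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15
    }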
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 19991fa3f..dd9d79657 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -281,8 +281,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
       SizeB = ActualCols * MaxDepth
     };

-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
+    EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];

   public:
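
Editor's note: the EIGEN_ALIGN16 -> EIGEN_ALIGN_DEFAULT switch matters once 32-byte AVX packets are in play: aligned packet stores into the static blocking buffers would fault if the buffers were only 16-byte aligned. A minimal illustration with C++11 alignas; kDefaultAlign is a hypothetical stand-in for what EIGEN_ALIGN_DEFAULT resolves to.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    #ifdef __AVX__
    const std::size_t kDefaultAlign = 32;  // widest packet: 32-byte AVX registers
    #else
    const std::size_t kDefaultAlign = 16;  // widest packet: 16-byte SSE registers
    #endif

    struct Blocking {
      alignas(kDefaultAlign) float staticA[64];  // targets of aligned packet stores
      alignas(kDefaultAlign) float staticB[64];
    };

    int main()
    {
      Blocking b;
      // Both offsets print 0: the buffers satisfy the widest packet alignment.
      std::printf("%u %u\n",
                  unsigned(reinterpret_cast<std::uintptr_t>(b.staticA) % kDefaultAlign),
                  unsigned(reinterpret_cast<std::uintptr_t>(b.staticB) % kDefaultAlign));
    }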
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index a73ce5ff0..340c51394 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
     alignedSize = 0;
     alignedStart = 0;
   }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway.
+    alignmentPattern = NoneAligned;
+  }
   else if (LhsPacketSize>1)
   {
     eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
@@ -405,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
     alignedSize = 0;
     alignedStart = 0;
   }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    alignmentPattern = NoneAligned;
+  }
   else if (LhsPacketSize>1)
   {
     eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
@@ -442,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
-    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+    EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
     ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

     // this helps the compiler generating good binary code
@@ -551,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   {
     for (Index i=start; i<end; ++i)
     {
-      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+      EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
       ResPacket ptmp0 = pset1<ResPacket>(tmp0);
       const LhsScalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
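
Editor's note: both matrix-vector kernels above now short-circuit the alignment analysis when LhsPacketSize > 4, i.e. for 8-wide AVX packets: per the TODO comments, unaligned loads currently perform better than trying to reach an aligned phase. The sketch below is a hypothetical condensation of that decision, not Eigen's actual code; the real logic also inspects lhsStride and pointer offsets before picking EvenAligned or FirstAligned.

    #include <cstdio>

    enum AlignmentPattern { NoneAligned, EvenAligned, FirstAligned };

    // LhsPacketSize: scalars per SIMD register (8 for AVX floats, 4 for SSE).
    AlignmentPattern choose_pattern(int LhsPacketSize)
    {
      if (LhsPacketSize > 4)
        return NoneAligned;  // new fast-path: unaligned AVX loads are cheap
      if (LhsPacketSize > 1)
        return FirstAligned; // placeholder for the detailed SSE-era analysis
      return NoneAligned;    // scalar path: alignment is irrelevant
    }

    int main()
    {
      std::printf("%d\n", choose_pattern(8));  // 0 == NoneAligned under AVX
    }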