Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 80
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h     |  4
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector.h     | 15
3 files changed, 86 insertions, 13 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index cba76edfe..0f47f6de5 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -208,7 +208,16 @@ public:
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
{
+ // It would be a lot cleaner to call pmadd all the time. Unfortunately, if we
+ // let gcc allocate the register in which to store the result of the pmul
+ // (in the case where there is no FMA), gcc fails to figure out how to avoid
+ // spilling registers.
+#ifdef EIGEN_VECTORIZE_FMA
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c = pmadd(a,b,c);
+#else
tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+#endif
}
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
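The FMA/non-FMA split introduced in madd above can be illustrated outside of Eigen. The following standalone sketch (a hypothetical helper using raw AVX intrinsics, not the patch's actual code) shows the two strategies: with FMA a single fused instruction updates the accumulator, while without FMA the product is staged in an explicitly named temporary so the compiler is not left to pick a register that ends up spilled.

// Sketch only: assumes AVX (__m256) packets and the __FMA__ compile flag.
#include <immintrin.h>

static inline __m256 madd_sketch(__m256 a, __m256 b, __m256 c, __m256& tmp)
{
#ifdef __FMA__
  (void)tmp;                        // fused path: no intermediate register needed
  return _mm256_fmadd_ps(a, b, c);  // c + a*b in one instruction
#else
  tmp = _mm256_mul_ps(a, b);        // stage the product in the caller's temporary
  return _mm256_add_ps(c, tmp);     // then accumulate
#endif
}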
@@ -287,7 +296,12 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
+#ifdef EIGEN_VECTORIZE_FMA
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a.v,b,c.v);
+#else
tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
}
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -983,9 +997,22 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder,
}
else
{
- for(Index k=0; k<depth; k++)
- {
- // TODO add a vectorized transpose here
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
+ Index k=0;
+ for(; k<peeled_k; k+=PacketSize) {
+ for (Index m = 0; m < Pack1; m += PacketSize) {
+ Kernel<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) {
+ kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
+ }
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) {
+ pstore(blockA+count+m+Pack1*p, cj.pconj(kernel.packet[p]));
+ }
+ }
+ count += PacketSize*Pack1;
+ }
+ for(; k<depth; k++) {
Index w=0;
for(; w<Pack1-3; w+=4)
{
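The new gemm_pack_lhs loop above replaces the scalar TODO with an in-register transpose: it loads PacketSize packets from consecutive lhs rows, runs ptranspose on them, and stores one packet of contiguous rows per depth index, Pack1 apart. Below is a scalar sketch of the same tile layout, assuming a row-major lhs and a stand-in tile size of 4; the helper name pack_lhs_tile is hypothetical.

#include <cstddef>

// lhs is assumed row-major here: element (row, k) lives at lhs[row*lhsStride + k].
void pack_lhs_tile(const float* lhs, std::ptrdiff_t lhsStride,
                   std::ptrdiff_t row0, std::ptrdiff_t k0,
                   std::ptrdiff_t Pack1, float* blockA)
{
  const int P = 4;                   // stand-in for PacketSize
  float tile[P][P];
  for (int p = 0; p < P; ++p)        // p walks rows row0..row0+P-1 (one ploadu each)
    for (int q = 0; q < P; ++q)      // q walks depth indices k0..k0+P-1
      tile[q][p] = lhs[(row0 + p) * lhsStride + (k0 + q)];   // transpose on load
  for (int q = 0; q < P; ++q)        // one contiguous group of P rows per depth index,
    for (int p = 0; p < P; ++p)      // Pack1 apart, mirroring pstore(blockA+m+Pack1*p, ...)
      blockA[q * Pack1 + p] = tile[q][p];
}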
@@ -1050,6 +1077,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
Index count = 0;
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
if(nr>=8)
{
for(Index j2=0; j2<packet_cols8; j2+=8)
@@ -1064,7 +1092,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
const Scalar* b5 = &rhs[(j2+5)*rhsStride];
const Scalar* b6 = &rhs[(j2+6)*rhsStride];
const Scalar* b7 = &rhs[(j2+7)*rhsStride];
- for(Index k=0; k<depth; k++)
+ Index k=0;
+ if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+ {
+ for(; k<peeled_k; k+=PacketSize) {
+ Kernel<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) {
+ kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+ }
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) {
+ pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+ count+=PacketSize;
+ }
+ }
+ }
+ for(; k<depth; k++)
{
blockB[count+0] = cj(b0[k]);
blockB[count+1] = cj(b1[k]);
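Whichever path is taken above, the packed layout of blockB is unchanged: for every depth index k, the eight values rhs[(j2+0..7)*rhsStride + k] must end up contiguous. A small consistency sketch (hypothetical helper, conjugation via cj ignored for brevity):

#include <cassert>

void check_rhs_pack_layout(const float* rhs, long rhsStride, long j2,
                           long depth, const float* blockB)
{
  long count = 0;
  for (long k = 0; k < depth; ++k)       // same order the scalar loop writes in
    for (int p = 0; p < 8; ++p, ++count) // eight columns per k, contiguous in blockB
      assert(blockB[count] == rhs[(j2 + p) * rhsStride + k]);
}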
@@ -1091,7 +1134,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
- for(Index k=0; k<depth; k++)
+ Index k=0;
+ if(PacketSize==4) // TODO enable vectorized transposition for PacketSize==2 ??
+ {
+ for(; k<peeled_k; k+=PacketSize) {
+ Kernel<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) {
+ kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+ }
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) {
+ pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+ count+=PacketSize;
+ }
+ }
+ }
+ for(; k<depth; k++)
{
blockB[count+0] = cj(b0[k]);
blockB[count+1] = cj(b1[k]);
@@ -1148,10 +1206,14 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
if(PanelMode) count += 8 * offset;
for(Index k=0; k<depth; k++)
{
- if (8 == PacketSize) {
+ if (PacketSize==8) {
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
pstoreu(blockB+count, cj.pconj(A));
- count += PacketSize;
+ } else if (PacketSize==4) {
+ Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+ Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+ pstoreu(blockB+count, cj.pconj(A));
+ pstoreu(blockB+count+PacketSize, cj.pconj(B));
} else {
const Scalar* b0 = &rhs[k*rhsStride + j2];
blockB[count+0] = cj(b0[0]);
@@ -1162,8 +1224,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
blockB[count+5] = cj(b0[5]);
blockB[count+6] = cj(b0[6]);
blockB[count+7] = cj(b0[7]);
- count += 8;
}
+ count += 8;
}
// skip what we have after
if(PanelMode) count += 8 * (stride-offset-depth);
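The hunk above also moves count += 8 out of the scalar fallback so that the same increment applies after the 8-wide and the new two-times-4-wide store paths; in all three branches exactly eight scalars of row k are written per iteration. A condensed sketch of that per-row copy (hypothetical helper, conjugation omitted):

// src points at &rhs[k*rhsStride + j2]; dst at blockB+count. Exactly 8 values
// are copied whichever branch runs, so a single count += 8 per k is correct.
inline void copy_rhs_row8(const float* src, float* dst)
{
  for (int p = 0; p < 8; ++p)
    dst[p] = src[p];   // vectorizes as one 8-wide or two 4-wide unaligned loads/stores
}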
@@ -1177,7 +1239,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
if(PanelMode) count += 4 * offset;
for(Index k=0; k<depth; k++)
{
- if (4 == PacketSize) {
+ if (PacketSize==4) {
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
pstoreu(blockB+count, cj.pconj(A));
count += PacketSize;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 19991fa3f..dd9d79657 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -281,8 +281,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
SizeB = ActualCols * MaxDepth
};
- EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
- EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
+ EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+ EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
public:
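EIGEN_ALIGN_DEFAULT aligns the static blocking buffers to the default packet alignment instead of a hard-coded 16 bytes, which matters once packets are wider than SSE registers. A rough plain-C++11 equivalent of the intent (the 32-byte figure is an assumption, and this is not Eigen's actual macro definition):

#include <cstddef>

constexpr std::size_t kDefaultPacketAlignment = 32;  // e.g. AVX; assumption

struct blocking_buffers_sketch {
  alignas(kDefaultPacketAlignment) float staticA[1024];  // stands in for m_staticA
  alignas(kDefaultPacketAlignment) float staticB[1024];  // stands in for m_staticB
};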
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index a73ce5ff0..340c51394 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
alignedSize = 0;
alignedStart = 0;
}
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ // Currently, it seems to be better to perform unaligned loads anyway
+ alignmentPattern = NoneAligned;
+ }
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
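The new branch above makes the column-major matrix-vector kernel skip the aligned-load analysis entirely for packets wider than 4 scalars and fall back to unaligned loads, as the TODO notes. Roughly, the decision now looks like the sketch below; the enum values mirror the names used in this file, but choose_pattern and cannotAlign are simplified stand-ins for the surrounding logic.

enum SketchAlignmentPattern { NoneAligned, EvenAligned, FirstAligned };

inline SketchAlignmentPattern choose_pattern(int lhsPacketSize, bool cannotAlign)
{
  if (cannotAlign)        return NoneAligned;  // pre-existing fallback
  if (lhsPacketSize > 4)  return NoneAligned;  // new: wide packets use unaligned loads
  // ... existing Even/First-aligned analysis for 2- and 4-wide packets ...
  return FirstAligned;
}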
@@ -405,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
alignedSize = 0;
alignedStart = 0;
}
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ alignmentPattern = NoneAligned;
+ }
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
@@ -442,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
{
- EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generating good binary code
@@ -551,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
for (Index i=start; i<end; ++i)
{
- EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride;
// process first unaligned result's coeffs