aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
diff options
context:
space:
mode:
authorGravatar Gustavo Lima Chaves <gustavo.lima.chaves@intel.com>2018-12-21 11:03:18 -0800
committerGravatar Gustavo Lima Chaves <gustavo.lima.chaves@intel.com>2018-12-21 11:03:18 -0800
commit1024a70e82c0301d9f699fd344613e9cd417ab95 (patch)
tree8bade6795acd372e4a5e4d84e062640b52fb9a9a /Eigen/src/Core/products/SelfadjointMatrixMatrix.h
parente763fcd09e620300226ca22d152b94867123b603 (diff)
gebp: Add new ½ and ¼ packet rows per (peeling) round on the lhs
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch works by altering the gebp lhs packing routines to also consider ½ and ¼ packet lenght rows when packing, besides the original whole package and row-by-row attempts. Finally, gebp itself will try to fit a fraction of a packet at a time if: i) ½ and/or ¼ packets are available for the current context (e.g. AVX2 and SSE-sized SIMD register for x86) ii) The matrix's height is favorable to it (it may be it's too small in that dimension to take full advantage of the current/maximum packet width or it may be the case that last rows may take advantage of smaller packets before gebp goes row-by-row) This helps mitigate huge slowdowns one had on AVX512 builds when compared to AVX2 ones, for some dimensions. Gains top at an extra 1x in throughput. This patch is a complement to changeset 4ad359237aeb519dbd4b55eba43057b37988838c . Since packing is changed, Eigen users which would go for very low-level API usage, like TensorFlow, will have to be adapted to work fine with the changes.
Diffstat (limited to 'Eigen/src/Core/products/SelfadjointMatrixMatrix.h')
-rw-r--r--Eigen/src/Core/products/SelfadjointMatrixMatrix.h23
1 files changed, 20 insertions, 3 deletions
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index c84c71609..673073601 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -45,14 +45,23 @@ struct symm_pack_lhs
}
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
- enum { PacketSize = packet_traits<Scalar>::size };
+ typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
+ enum { PacketSize = packet_traits<Scalar>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
//Index peeled_mc3 = (rows/Pack1)*Pack1;
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
- const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
if(Pack1>=3*PacketSize)
for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
@@ -66,8 +75,16 @@ struct symm_pack_lhs
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
pack<1*PacketSize>(blockA, lhs, cols, i, count);
+ if(HasHalf && Pack1>=HalfPacketSize)
+ for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
+ pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+ if(HasQuarter && Pack1>=QuarterPacketSize)
+ for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
+ pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
+
// do the same with mr==1
- for(Index i=peeled_mc1; i<rows; i++)
+ for(Index i=peeled_mc_quarter; i<rows; i++)
{
for(Index k=0; k<i; k++)
blockA[count++] = lhs(i, k); // normal