diff options
author | 2018-12-21 11:03:18 -0800 | |
---|---|---|
committer | 2018-12-21 11:03:18 -0800 | |
commit | 1024a70e82c0301d9f699fd344613e9cd417ab95 (patch) | |
tree | 8bade6795acd372e4a5e4d84e062640b52fb9a9a /Eigen/src/Core/products/SelfadjointMatrixMatrix.h | |
parent | e763fcd09e620300226ca22d152b94867123b603 (diff) |
gebp: Add new ½ and ¼ packet rows per (peeling) round on the lhs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The patch works by altering the gebp lhs packing routines to also
consider ½ and ¼ packet lenght rows when packing, besides the original
whole package and row-by-row attempts. Finally, gebp itself will try
to fit a fraction of a packet at a time if:
i) ½ and/or ¼ packets are available for the current context (e.g. AVX2
and SSE-sized SIMD register for x86)
ii) The matrix's height is favorable to it (it may be it's too small
in that dimension to take full advantage of the current/maximum
packet width or it may be the case that last rows may take
advantage of smaller packets before gebp goes row-by-row)
This helps mitigate huge slowdowns one had on AVX512 builds when
compared to AVX2 ones, for some dimensions. Gains top at an extra 1x
in throughput. This patch is a complement to changeset 4ad359237aeb519dbd4b55eba43057b37988838c
.
Since packing is changed, Eigen users which would go for very
low-level API usage, like TensorFlow, will have to be adapted to work
fine with the changes.
Diffstat (limited to 'Eigen/src/Core/products/SelfadjointMatrixMatrix.h')
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 23 |
1 files changed, 20 insertions, 3 deletions
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index c84c71609..673073601 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -45,14 +45,23 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { - enum { PacketSize = packet_traits<Scalar>::size }; + typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits<Scalar>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i<peeled_mc3; i+=3*PacketSize) @@ -66,8 +75,16 @@ struct symm_pack_lhs for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize) pack<1*PacketSize>(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize) + pack<HalfPacketSize>(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize) + pack<QuarterPacketSize>(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i<rows; i++) + for(Index i=peeled_mc_quarter; i<rows; i++) { for(Index k=0; k<i; k++) blockA[count++] = lhs(i, k); // normal |