Artificially increase l1-blocking size for AVX512. +10% speedup with current kernels.

With a 6pX4 kernel (not committed yet), this provides a +20% speedup.
author: Gael Guennebaud <g.gael@free.fr> 2018-12-11 15:36:27 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2018-12-11 15:36:27 +0100
commit: f159cf3d750a7930a29abf172d9436550cc8369f (patch)
tree: 52297242c76952775adb1fe9c30d5d2be651abd2 /Eigen/src/Core/products
parent: 0a7e7af6fdd46aae6c56d1868f2cda5c9f4efa70 (diff)
1 files changed, 11 insertions, 1 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 3aaa68c4c..968cec78b 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -101,6 +101,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
   // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  #ifdef EIGEN_VECTORIZE_AVX512
+  // We need to find a rationale for that, but without this adjustment,
+  // performance with AVX512 is pretty bad, about 20% slower.
+  // One reason is that with increasing packet-size, the blocking size k
+  // has to become pretty small if we want one lhs panel to fit within L1.
+  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels is:
+  //   k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+  l1 *= 4;
+  #endif
 
   if (num_threads > 1) {
     typedef typename Traits::ResScalar ResScalar;
@@ -372,7 +382,7 @@ public:
     default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
     && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
-    // we assume 16 registers
+    // we assume 16 registers or more
     // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
     // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
     // Bug 1515: MSVC prior to v19.14 yields to register spilling.
author	Gael Guennebaud <g.gael@free.fr>	2018-12-11 15:36:27 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2018-12-11 15:36:27 +0100
commit	f159cf3d750a7930a29abf172d9436550cc8369f (patch)
tree	52297242c76952775adb1fe9c30d5d2be651abd2 /Eigen/src/Core/products
parent	0a7e7af6fdd46aae6c56d1868f2cda5c9f4efa70 (diff)