diff options
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 14 |
1 file changed, 9 insertions, 5 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 24623963b..320f96a39 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -112,14 +112,18 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       nr = Traits::nr,
       nr_mask = (0xffffffff/nr)*nr
     };
-    Index k_cache = (l1-ksub)/kdiv;
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
     if (k_cache < k) {
       k = k_cache & k_mask;
       eigen_internal_assert(k > 0);
     }
 
-    Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-    Index n_per_thread = numext::div_ceil(n, num_threads);
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
       eigen_internal_assert(n_cache >= static_cast<Index>(nr));
@@ -131,8 +135,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-      Index m_per_thread = numext::div_ceil(m, num_threads);
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
       if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache & mr_mask;
         eigen_internal_assert(m > 0);