author    Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-04-09 16:44:10 -0700
committer Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-04-09 16:44:10 -0700
commit    5401fbcc50747583b0d47e195f23f988f7dfac5e
tree      5279381d1f09215baee643987ee09e151d448aba
parent    0eb220c00d9773c29c7d169ad0e20745b0ef21bb
Improved the blocking strategy to speed up multithreaded tensor contractions.
Diffstat (limited to 'Eigen')
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 14
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 24623963b..320f96a39 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -112,14 +112,18 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
nr = Traits::nr,
nr_mask = (0xffffffff/nr)*nr
};
- Index k_cache = (l1-ksub)/kdiv;
+ // Increasing k gives us more time to prefetch the content of the "C"
+ // registers. However once the latency is hidden there is no point in
+ // increasing the value of k, so we'll cap it at 320 (value determined
+ // experimentally).
+ const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
if (k_cache < k) {
k = k_cache & k_mask;
eigen_internal_assert(k > 0);
}
- Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
- Index n_per_thread = numext::div_ceil(n, num_threads);
+ const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+ const Index n_per_thread = numext::div_ceil(n, num_threads);
if (n_cache <= n_per_thread) {
// Don't exceed the capacity of the l2 cache.
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
@@ -131,8 +135,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
if (l3 > l2) {
// l3 is shared between all cores, so we'll give each thread its own chunk of l3.
- Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
- Index m_per_thread = numext::div_ceil(m, num_threads);
+ const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+ const Index m_per_thread = numext::div_ceil(m, num_threads);
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
m = m_cache & mr_mask;
eigen_internal_assert(m > 0);
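
For context, the following is a minimal standalone sketch (not Eigen's actual implementation) of the capped blocking heuristic this patch introduces: k is limited to 320 once the prefetch latency for the "C" registers is hidden, n is kept within each thread's share of L2, and m within each thread's chunk of the shared L3. The cache sizes, the float scalar type, the mr/nr/ksub/kdiv constants, and the div_ceil/round_down helpers below are illustrative assumptions standing in for Eigen's Traits and numext utilities, not the library's tuned values.

#include <algorithm>
#include <cstdio>

using Index = long;

// Round-up division, analogous in spirit to numext::div_ceil.
static Index div_ceil(Index a, Index b) { return (a + b - 1) / b; }
// Round x down to a multiple of m (stands in for the k_mask/nr_mask/mr_mask tricks).
static Index round_down(Index x, Index m) { return (x / m) * m; }

int main() {
  // Hypothetical GEMM dimensions for one tensor contraction, and the thread count.
  Index m = 4096, n = 4096, k = 4096;
  const Index num_threads = 8;

  // Assumed cache sizes in bytes; the real code queries them at runtime.
  const Index l1 = 32 * 1024, l2 = 256 * 1024, l3 = 8 * 1024 * 1024;
  const Index lhs_size = sizeof(float), rhs_size = sizeof(float);

  // Illustrative register-blocking constants standing in for Traits::mr / Traits::nr
  // and the ksub/kdiv terms of the heuristic.
  const Index mr = 8, nr = 4, ksub = 64, kdiv = 16, k_multiple = 8;

  // Cap k at 320: beyond that the latency of prefetching the "C" registers is
  // already hidden, so a larger k only burns L1 capacity.
  const Index k_cache = (std::min<Index>)((l1 - ksub) / kdiv, 320);
  if (k_cache < k) k = round_down(k_cache, k_multiple);

  // Keep each thread's slice of the packed RHS inside what is left of L2.
  const Index n_cache = (l2 - l1) / (nr * rhs_size * k);
  const Index n_per_thread = div_ceil(n, num_threads);
  n = round_down((std::min<Index>)(n_cache, n_per_thread), nr);

  // L3 is shared between cores, so give each thread its own chunk for the packed LHS.
  if (l3 > l2) {
    const Index m_cache = (l3 - l2) / (lhs_size * k * num_threads);
    const Index m_per_thread = div_ceil(m, num_threads);
    if (m_cache < m_per_thread && m_cache >= mr) m = round_down(m_cache, mr);
    else m = (std::min<Index>)(m, round_down(m_per_thread + mr - 1, mr));
  }

  std::printf("block sizes: k=%ld m=%ld n=%ld\n", k, m, n);
  return 0;
}

The 320 cap mirrors the experimentally determined value from the patch; everything else in the sketch is deliberately simplified so the three cache-level decisions (L1 -> k, L2 -> n, L3 -> m) are easy to follow.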