From 59998117bb0e4e0dc4b37b062f02ea5e6aab711e Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Thu, 7 Feb 2019 09:21:25 -0800
Subject: Don't do parallel_pack if we can use thread_local memory in tensor
 contractions

---
 .../CXX11/src/Tensor/TensorContractionThreadPool.h | 55 ++++++++++++----------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 4af8d3b18..d7cd995fb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -208,6 +208,23 @@ struct TensorEvaluator<
+    // If there is enough concurrency in the sharding dimension, we choose not
+    // to paralellize by the other dimension, and execute all kernels in sync
+    // mode. This reduces parallelism from the nm x nn down to nn
+    // (shard_by_col==true) or nm (shard_by_col==false).
+    const Index sharding_dim_tasks = shard_by_col ? nn : nm;
+    const int num_worker_threads = this->m_device.numThreadsInPool();
+
+    // With small number of threads we want to make sure that we do not reduce
+    // parallelism too much.
+    const int oversharding_factor =
+        num_worker_threads <= 4 ? 8 :
+        num_worker_threads <= 8 ? 4 :
+        num_worker_threads <= 16 ? 2 : 1;
+
+    const bool parallelize_by_sharding_dim_only =
+        sharding_dim_tasks >= oversharding_factor * num_worker_threads;
+
     // Last by not least, decide whether we want to issue both lhs and rhs
     // packing in parallel; or issue lhs packing first, and then issue rhs
     // packing when lhs packing completes (for !shard_by_col lhs and rhs are
@@ -223,10 +240,13 @@ struct TensorEvaluator<
           device_(self->m_device),
           lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
               self->m_i_strides, self->m_left_contracting_strides,
@@ -275,6 +295,7 @@ struct TensorEvaluator<
-      if ((shard_by_col ? nn_ : nm_) >= oversharding_factor * num_worker_threads) {
-        parallelize_by_sharding_dim_only_ = true;
+      if (parallelize_by_sharding_dim_only_) {
+        const int num_worker_threads = device_.numThreadsInPool();
 
         if (shard_by_col) {
           can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
@@ -422,6 +432,7 @@ struct TensorEvaluator<
     std::vector<LhsBlock> packed_lhs_[P - 1];
     std::vector<RhsBlock> packed_rhs_[P - 1];
 
-    // If there is enough concurrency in the sharding dimension, we choose not
-    // to paralellize by the other dimension, and execute all kernels in sync
-    // mode. This reduces parallelism from the nm_ x nn_ down to nn_
-    // (shard_by_col==true) or nm_ (shard_by_col==false).
-    bool parallelize_by_sharding_dim_only_ = false;
-
     // If we choose to parallelize only by the sharding dimension, each thread
     // will have it's own "thead local" (not a c++ thread local storage) memory
     // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory
-- 
cgit v1.2.3
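
For readers skimming the patch, the core of the change is the heuristic that decides when to give up parallelizing over the non-sharding dimension. The snippet below is a standalone re-statement of that decision, not Eigen code: the function name, parameter types, and example values are illustrative only, while the thresholds mirror the oversharding_factor logic added in the first hunk.

#include <iostream>

// Hypothetical standalone version of the heuristic introduced by the patch:
// small thread pools demand a larger oversharding factor, so cross-dimension
// parallelism is only dropped when the sharding dimension alone already
// offers many times more tasks than worker threads.
static bool ParallelizeByShardingDimOnly(long sharding_dim_tasks,
                                         int num_worker_threads) {
  const int oversharding_factor =
      num_worker_threads <= 4 ? 8 :
      num_worker_threads <= 8 ? 4 :
      num_worker_threads <= 16 ? 2 : 1;
  return sharding_dim_tasks >= oversharding_factor * num_worker_threads;
}

int main() {
  std::cout << ParallelizeByShardingDimOnly(24, 4) << "\n";   // 0: 24 < 8 * 4
  std::cout << ParallelizeByShardingDimOnly(48, 4) << "\n";   // 1: 48 >= 8 * 4
  std::cout << ParallelizeByShardingDimOnly(48, 32) << "\n";  // 1: 48 >= 1 * 32
  return 0;
}

When the predicate is true, each worker thread packs into its own "thread local" buffer and the parallel_pack path is skipped, which is why the constructor now only reads the precomputed parallelize_by_sharding_dim_only_ flag instead of deriving it itself.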