From 4e2f6de1a8fd9a659dc40ed54fedff9abdef3b1f Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Mon, 1 Apr 2019 11:47:31 -0700
Subject: Add support for custom packed Lhs/Rhs blocks in tensor contractions

---
 .../CXX11/src/Tensor/TensorContractionThreadPool.h | 102 ++++++++++-----------
 1 file changed, 46 insertions(+), 56 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index adf57c892..caa8d1767 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -280,6 +280,10 @@ struct TensorEvaluator
           TensorContractionKernel;
 
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+
     Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
             Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
             Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col,
@@ -311,7 +315,8 @@ struct TensorEvaluator
-      size_t lhs_size =
-          divup(bm_ * bk_ * sizeof(LhsScalar), align) * align;
-      size_t rhs_size =
-          divup(bn_ * bk_ * sizeof(RhsScalar), align) * align;
-      packed_mem_ = static_cast<char*>(device_.allocate(
-          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1)));
-      char* mem = static_cast<char*>(packed_mem_);
-      for (Index x = 0; x < numext::mini(nk_, P - 1); x++) {
-        packed_lhs_[x].resize(nm0_);
-        for (Index m = 0; m < nm0_; m++) {
-          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
-          mem += lhs_size;
-        }
-        packed_rhs_[x].resize(nn0_);
-        for (Index n = 0; n < nn0_; n++) {
-          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
-          mem += rhs_size;
-        }
-      }
+      packed_mem_ = kernel_.allocateSlices(      //
+          device_,                               //
+          /*num_lhs=*/nm0_,                      //
+          /*num_rhs=*/nn0_,                      //
+          /*num_slices=*/std::min(nk_, P - 1),   //
+          packed_lhs_, packed_rhs_);
 
       if (parallelize_by_sharding_dim_only_) {
         const int num_worker_threads = device_.numThreadsInPool();
@@ -373,14 +364,13 @@ struct TensorEvaluator
-        char* mem = static_cast<char*>(thread_local_packed_mem_);
+        thread_local_packed_mem_ = kernel_.allocateSlices(  //
+            device_,                                        //
+            /*num_lhs=*/0,                                  //
+            /*num_rhs=*/num_blocks,                         //
+            /*num_slices=*/1,                               //
+            /*lhs_blocks=*/nullptr, &thread_local_packed_rhs_);
 
-        thread_local_packed_rhs_.resize(num_blocks, nullptr);
-        for (Index i = 0; i < num_blocks; ++i) {
-          thread_local_packed_rhs_[i] = reinterpret_cast<RhsScalar*>(mem);
-          mem += rhs_size;
-        }
       } else {
         can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
         for (int i = 0; i < nm_; ++i)
@@ -388,14 +378,12 @@ struct TensorEvaluator
-        char* mem = static_cast<char*>(thread_local_packed_mem_);
-
-        thread_local_packed_lhs_.resize(num_blocks, nullptr);
-        for (Index i = 0; i < num_blocks; ++i) {
-          thread_local_packed_lhs_[i] = reinterpret_cast<LhsScalar*>(mem);
-          mem += lhs_size;
-        }
+        thread_local_packed_mem_ = kernel_.allocateSlices(  //
+            device_,                                        //
+            /*num_lhs=*/num_blocks,                         //
+            /*num_rhs=*/0,                                  //
+            /*num_slices=*/1, &thread_local_packed_lhs_,    //
+            /*rhs_blocks=*/nullptr);
       }
     }
   }
@@ -405,9 +393,9 @@ struct TensorEvaluator
-    std::vector<LhsScalar*> packed_lhs_[P - 1];
-    std::vector<RhsScalar*> packed_rhs_[P - 1];
+
+    // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+    BlockMemHandle packed_mem_;
+    std::vector<LhsBlock> packed_lhs_[P - 1];
+    std::vector<RhsBlock> packed_rhs_[P - 1];
 
     // If we choose to parallelize only by the sharding dimension, each thread
     // will have it's own "thead local" (not a c++ thread local storage) memory
@@ -511,11 +503,11 @@ struct TensorEvaluator
-    std::vector<LhsScalar*> thread_local_packed_lhs_;
-    std::vector<RhsScalar*> thread_local_packed_rhs_;
+    // Only one of these will be initialized depending on shard_by_col value.
+    std::vector<LhsBlock> thread_local_packed_lhs_;
+    std::vector<RhsBlock> thread_local_packed_rhs_;
 
     // After a particular shard for Kth slice missed thread local execution
     // opportunity (K-1 slice didn't complete kernels execution), we can no
@@ -532,7 +524,7 @@ struct TensorEvaluator
     std::atomic state_packing_ready_[P];
     std::atomic state_switch_[P];
 
-    LhsScalar* packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
+    LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
       if (use_thread_local) {
         eigen_assert(!shard_by_col_);
@@ -546,7 +538,7 @@ struct TensorEvaluator
           eigen_assert(k > 0);
           can_use_thread_local_packed_[m].store(false, std::memory_order_relaxed);
@@ -589,9 +581,8 @@ struct TensorEvaluator
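
For readers who want to prototype a custom kernel against the new extension point, the sketch below shows the rough shape of the block-management interface that the Context code above calls into. Only the names LhsBlock, RhsBlock, BlockMemHandle and allocateSlices (and the calling convention of passing one std::vector of blocks per slice) are taken from this diff; the class name ExampleContractionKernel, the element types behind the block typedefs, the Device requirements (allocate/deallocate of raw memory) and every function body are illustrative assumptions, not the actual Eigen implementation.

    #include <cstddef>
    #include <vector>

    // Hypothetical sketch of the kernel-side block API assumed by the patch
    // above; the real Eigen TensorContractionKernel differs in detail.
    template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
    struct ExampleContractionKernel {
      // For a GEBP-style kernel a packed block can simply be a pointer into
      // one contiguous allocation; a custom kernel may use a richer handle.
      typedef LhsScalar* LhsBlock;
      typedef RhsScalar* RhsBlock;
      typedef void* BlockMemHandle;

      ExampleContractionKernel(StorageIndex bm, StorageIndex bk, StorageIndex bn)
          : bm_(bm), bk_(bk), bn_(bn) {}

      // Allocates `num_slices` slices, each with `num_lhs` Lhs blocks of
      // bm x bk scalars and `num_rhs` Rhs blocks of bk x bn scalars, fills the
      // caller-provided vectors (one vector per slice) with per-block handles,
      // and returns one opaque handle for later deallocation.  Alignment
      // handling is omitted for brevity.
      template <typename Device>
      BlockMemHandle allocateSlices(Device& d, StorageIndex num_lhs,
                                    StorageIndex num_rhs, StorageIndex num_slices,
                                    std::vector<LhsBlock>* lhs_blocks,
                                    std::vector<RhsBlock>* rhs_blocks) {
        const size_t lhs_size = sizeof(LhsScalar) * bm_ * bk_;
        const size_t rhs_size = sizeof(RhsScalar) * bk_ * bn_;
        const size_t slice_size = num_lhs * lhs_size + num_rhs * rhs_size;
        char* mem = static_cast<char*>(d.allocate(slice_size * num_slices));
        char* ptr = mem;
        for (StorageIndex s = 0; s < num_slices; ++s) {
          if (lhs_blocks) {
            lhs_blocks[s].resize(num_lhs);
            for (StorageIndex i = 0; i < num_lhs; ++i, ptr += lhs_size)
              lhs_blocks[s][i] = reinterpret_cast<LhsBlock>(ptr);
          }
          if (rhs_blocks) {
            rhs_blocks[s].resize(num_rhs);
            for (StorageIndex i = 0; i < num_rhs; ++i, ptr += rhs_size)
              rhs_blocks[s][i] = reinterpret_cast<RhsBlock>(ptr);
          }
        }
        return static_cast<BlockMemHandle>(mem);
      }

      // Releases everything handed out by allocateSlices().
      template <typename Device>
      void deallocate(Device& d, BlockMemHandle handle) {
        d.deallocate(handle);
      }

     private:
      StorageIndex bm_, bk_, bn_;
    };

With a kernel of this shape, the thread-pool Context only ever sees opaque LhsBlock/RhsBlock values: it asks the kernel to allocate all slices up front, hands individual blocks to the pack and compute calls by reference, and eventually returns the single BlockMemHandle to the kernel for cleanup, which is what allows a custom kernel to choose its own packed representation and memory layout.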