| author | Deven Desai <deven.desai.amd@gmail.com> | 2019-03-19 16:52:38 -0400 |
|---|---|---|
| committer | Deven Desai <deven.desai.amd@gmail.com> | 2019-03-19 16:52:38 -0400 |
| commit | 2dbea5510fe5cb64dbfdef9042c04a3a92b87f76 (patch) | |
| tree | c187e7ec5e90a191e19466ff6084dd8f053dba7e /unsupported/Eigen/CXX11/src/Tensor | |
| parent | e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a (diff) | |
| parent | 5c93b38c5fca514a08084e32feb8a8fb27bf3665 (diff) | |
Merged eigen/eigen into default
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
12 files changed, 671 insertions, 142 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 7d9afa685..dbacf494e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -244,9 +244,11 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex, + TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>, + Derived>::type conjugate() const { - return unaryExpr(internal::scalar_conjugate_op<Scalar>()); + return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived()); } EIGEN_DEVICE_FUNC @@ -339,10 +341,13 @@ class TensorBase<Derived, ReadOnlyAccessors> return cwiseMin(constant(threshold)); } - template <typename NewType> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived> + template<typename NewType> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value, + Derived, + TensorConversionOp<NewType, const Derived> >::type cast() const { - return TensorConversionOp<NewType, const Derived>(derived()); + return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived())); } EIGEN_DEVICE_FUNC @@ -628,26 +633,26 @@ class TensorBase<Derived, ReadOnlyAccessors> } template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> > + const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type > all(const Dims& dims) const { return cast<bool>().reduce(dims, internal::AndReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> > + const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type > all() const { DimensionList<Index, NumDimensions> in_dims; return cast<bool>().reduce(in_dims, internal::AndReducer()); } template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> > + const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type > any(const Dims& dims) const { return cast<bool>().reduce(dims, internal::OrReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> > + const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type > any() const { DimensionList<Index, NumDimensions> in_dims; 
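The TensorBase.h hunk above makes conjugate() and cast<NewType>() compile-time no-ops when the scalar is already real or already of the target type, by selecting the return type with internal::conditional and picking the value with a Cond<>/choose() tag dispatch. Below is a minimal standalone sketch of that pattern in plain C++; Cond, choose and ConjugateExpr are illustrative stand-ins built on std::conditional and std::is_floating_point, not Eigen's actual internals.

```cpp
#include <complex>
#include <iostream>
#include <type_traits>

// Tag carrying a compile-time boolean, mirroring Eigen's internal Cond<b>.
template <bool B> struct Cond {};

// choose() forwards its second or third argument depending on the tag;
// overload resolution on Cond<true>/Cond<false> picks exactly one of them.
template <typename T1, typename T2>
const T1& choose(Cond<true>, const T1& first, const T2& /*second*/) {
  return first;
}
template <typename T1, typename T2>
const T2& choose(Cond<false>, const T1& /*first*/, const T2& second) {
  return second;
}

// Toy stand-in for TensorCwiseUnaryOp<scalar_conjugate_op, Derived>.
template <typename T>
struct ConjugateExpr {
  T value;
  T eval() const { return std::conj(value); }
};

// For real scalars conjugation is the identity, so return the operand itself;
// only the return *type* changes at compile time, in the same spirit as the
// conjugate()/cast() overloads in the TensorBase.h hunk above.
template <typename T>
typename std::conditional<std::is_floating_point<T>::value,
                          T, ConjugateExpr<T> >::type
conjugate_expr(const T& x) {
  return choose(Cond<std::is_floating_point<T>::value>(), x,
                ConjugateExpr<T>{x});
}

int main() {
  std::complex<double> c(1.0, 2.0);
  std::cout << conjugate_expr(c).eval() << "\n";  // (1,-2): wrapped expression
  std::cout << conjugate_expr(3.0) << "\n";       // 3: plain double, no wrapper
}
```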
return cast<bool>().reduce(in_dims, internal::OrReducer()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 61a4e1a3a..6ca881f27 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -102,7 +102,7 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKern typedef typename remove_reference<RhsNested>::type _RhsNested; // From NumDims below. - static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; + static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; static const int Layout = traits<LhsXprType>::Layout; typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 71fd19774..c51f3f8dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -51,6 +51,10 @@ class TensorContractionBlocking { else { computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads); } + + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? + kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index 056665749..5d19652e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -1219,9 +1219,6 @@ template<typename Indices, typename LeftArgType, typename RightArgType, typename struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> : public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > { - static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value, - "GPU tensor contraction does not support output kernels."); - typedef GpuDevice Device; typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self; @@ -1274,7 +1271,11 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT typedef typename RightEvaluator::Dimensions RightDimensions; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} + Base(op, device) + { + EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } // We need to redefine this method to make nvcc happy EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 2d3b69128..142492603 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -120,6 +120,7 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? row : col; Index linidx = 0; for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { @@ -158,6 +159,7 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const { const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { @@ -239,8 +241,10 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } template <typename PacketT,int AlignmentType> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type + load(Index i, Index j) const + { // whole method makes column major assumption // don't need to add offsets for now (because operator handles that) @@ -282,6 +286,29 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, } template <typename PacketT,int AlignmentType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type + load(Index i, Index j) const + { + const Index requested_packet_size = internal::unpacket_traits<PacketT>::size; + EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; + + const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < requested_packet_size - 1; k += 2) { + const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload<PacketT>(data); + } + + template <typename PacketT,int AlignmentType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { return this->load<PacketT,AlignmentType>(i,j); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 24ba3e431..adf57c892 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -208,6 +208,26 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT Index nm = divup(nm0, gm); Index nn = divup(nn0, gn); + // If there is enough concurrency in the sharding dimension, we choose not + // to paralellize by the other dimension, and execute all kernels in sync + // mode. 
This reduces parallelism from the nm x nn down to nn + // (shard_by_col==true) or nm (shard_by_col==false). + const Index sharding_dim_tasks = shard_by_col ? nn : nm; + const int num_worker_threads = this->m_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = + num_worker_threads <= 4 ? 8.0 : + num_worker_threads <= 8 ? 4.0 : + num_worker_threads <= 16 ? 2.0 : + num_worker_threads <= 32 ? 1.0 : + num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6; + + const bool parallelize_by_sharding_dim_only = + sharding_dim_tasks >= oversharding_factor * num_worker_threads; + // Last by not least, decide whether we want to issue both lhs and rhs // packing in parallel; or issue lhs packing first, and then issue rhs // packing when lhs packing completes (for !shard_by_col lhs and rhs are @@ -223,10 +243,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // But don't do it if we will use each rhs only once. Locality seems to be // more important in this case. if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + // Also don't get in the way of parallelize_by_sharding_dim_only + // optimization. + if (parallelize_by_sharding_dim_only) parallel_pack = false; - #define CONTEXT_ARGS \ +#define CONTEXT_ARGS \ (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ - nn0, shard_by_col, parallel_pack) \ + nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only) \ .run() TENSOR_CONTRACTION_DISPATCH(Context, Alignment, CONTEXT_ARGS); @@ -260,7 +283,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack) + bool parallel_pack, bool parallelize_by_sharding_dim_only) : device_(self->m_device), lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides, @@ -275,6 +298,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT num_threads_(num_threads), shard_by_col_(shard_by_col), parallel_pack_(parallel_pack), + parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), m_(tm), n_(tn), k_(tk), @@ -289,6 +313,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT nm0_(nm0), nn0_(nn0) { + // These two options are mutually exclusive. + eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only)); + for (Index x = 0; x < P; x++) { // Normal number of notifications for k slice switch is // nm_ + nn_ + nm_ * nn_. 
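The thread-pool contraction hunk above introduces a heuristic that drops the second level of parallelism when the sharding dimension alone offers enough tasks. A standalone sketch of that decision follows; the oversharding factors are copied from the diff, while the function name and the example inputs are illustrative only.

```cpp
#include <iostream>

// Sketch of the "parallelize only by the sharding dimension" decision used
// above: with few worker threads we demand many tasks per thread before
// giving up the second level of parallelism; with many threads we accept
// fewer tasks per thread in exchange for better memory locality.
static bool parallelize_by_sharding_dim_only(int sharding_dim_tasks,
                                             int num_worker_threads) {
  const float oversharding_factor =
      num_worker_threads <= 4  ? 8.0f :
      num_worker_threads <= 8  ? 4.0f :
      num_worker_threads <= 16 ? 2.0f :
      num_worker_threads <= 32 ? 1.0f :
      num_worker_threads <= 64 ? 0.8f : /* > 64 */ 0.6f;
  return sharding_dim_tasks >= oversharding_factor * num_worker_threads;
}

int main() {
  // 48 column shards on 16 threads: 48 >= 2.0 * 16, so sync-mode kernels win.
  std::cout << parallelize_by_sharding_dim_only(/*tasks=*/48, /*threads=*/16)
            << "\n";  // 1
  // 8 shards on 16 threads: keep parallelizing over both dimensions.
  std::cout << parallelize_by_sharding_dim_only(/*tasks=*/8, /*threads=*/16)
            << "\n";  // 0
}
```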
However, first P - 1 slices will receive only @@ -335,6 +362,42 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT mem += rhs_size; } } + + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic<bool>[nn_]; + for (int i = 0; i < nn_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_packed_mem_ = device_.allocate(num_blocks * rhs_size); + mem = static_cast<char*>(thread_local_packed_mem_); + + thread_local_packed_rhs_.resize(num_blocks, nullptr); + for (Index i = 0; i < num_blocks; ++i) { + thread_local_packed_rhs_[i] = reinterpret_cast<RhsScalar*>(mem); + mem += rhs_size; + } + } else { + can_use_thread_local_packed_ = new std::atomic<bool>[nm_]; + for (int i = 0; i < nm_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_packed_mem_ = device_.allocate(num_blocks * lhs_size); + mem = static_cast<char*>(thread_local_packed_mem_); + + thread_local_packed_lhs_.resize(num_blocks, nullptr); + for (Index i = 0; i < num_blocks; ++i) { + thread_local_packed_lhs_[i] = reinterpret_cast<LhsScalar*>(mem); + mem += lhs_size; + } + } + } } ~Context() { @@ -343,6 +406,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT delete[] state_kernel_[x]; } device_.deallocate(packed_mem_); + if (parallelize_by_sharding_dim_only_) { + device_.deallocate(thread_local_packed_mem_); + delete[] can_use_thread_local_packed_; + } } void run() { @@ -368,6 +435,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT const int num_threads_; const bool shard_by_col_; const bool parallel_pack_; + const bool parallelize_by_sharding_dim_only_; // Matrix sizes. const Index m_; const Index n_; @@ -426,6 +494,36 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT void* packed_mem_; std::vector<LhsScalar*> packed_lhs_[P - 1]; std::vector<RhsScalar*> packed_rhs_[P - 1]; + + // If we choose to parallelize only by the sharding dimension, each thread + // will have it's own "thead local" (not a c++ thread local storage) memory + // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory + // can't be passed to a kernel that might execute on a different thread. + // + // In practice when we are ready to pack memory for the sharding dimension + // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice + // already computed (99% of the time), and we can pack data into the thread + // local storage, and guarantee that all the kernels will be executed + // immediately in the same thread. This significantly increases L1 cache hit + // ratio and reduces pressure on the memory bus. + // + // It's still possible that kernel for the K-th slice will be ready before + // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ + // and packed_rhs_ to allow kernels to be executed later on a thread + // different from the thread that was used for packing. + void* thread_local_packed_mem_; + + // Only one of these will beinitialized depending on shard_by_col value. 
+ std::vector<LhsScalar*> thread_local_packed_lhs_; + std::vector<RhsScalar*> thread_local_packed_rhs_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic<bool>* can_use_thread_local_packed_; + std::atomic<uint8_t>** state_kernel_[P]; // state_switch_ is frequently modified by worker threads, while other // fields are read-only after constructor. Let's move it to a separate cache @@ -434,22 +532,96 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT std::atomic<Index> state_packing_ready_[P]; std::atomic<Index> state_switch_[P]; + LhsScalar* packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + + Index base_idx = gm_ * device_.currentThreadId(); + Index grain_idx = m1 - m * gm_; + Index block_idx = base_idx + grain_idx; + + return thread_local_packed_lhs_[block_idx]; + } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsScalar* packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + + Index base_idx = gn_ * device_.currentThreadId(); + Index grain_idx = n1 - n * gn_; + Index block_idx = base_idx + grain_idx; + + return thread_local_packed_rhs_[block_idx]; + } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in followig slices along the k dimensions. 
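packed_lhs()/packed_rhs() above pick between the shared per-slice buffers and per-thread scratch blocks indexed by the current thread id plus a grain offset, guarded by an array of relaxed atomics. The following is a rough, self-contained sketch of that indexing scheme; the class, buffer sizes and thread-id handling are assumptions for illustration, not Eigen's code.

```cpp
#include <atomic>
#include <iostream>
#include <memory>
#include <vector>

// Sketch of the per-thread scratch-buffer indexing used by packed_lhs() /
// packed_rhs() above.  Each worker thread owns `grain` consecutive slots in
// one flat allocation, and a relaxed std::atomic<bool> per shard records
// whether the "pack into thread-local memory" fast path is still allowed.
// Buffer sizes, the grain, and the thread id below are illustrative only.
struct ThreadLocalPacked {
  int grain;                               // blocks packed per task (like gm_/gn_)
  std::vector<std::vector<float>> blocks;  // num_threads * grain scratch blocks
  std::unique_ptr<std::atomic<bool>[]> can_use_thread_local;

  ThreadLocalPacked(int num_threads, int grain_, int num_shards, int block_size)
      : grain(grain_),
        blocks(num_threads * grain_, std::vector<float>(block_size)),
        can_use_thread_local(new std::atomic<bool>[num_shards]) {
    for (int i = 0; i < num_shards; ++i)
      can_use_thread_local[i].store(true, std::memory_order_relaxed);
  }

  // Mirrors: base_idx = grain * currentThreadId();
  //          block_idx = base_idx + (sub_block - shard * grain).
  float* block(int thread_id, int shard, int sub_block) {
    return blocks[thread_id * grain + (sub_block - shard * grain)].data();
  }
};

int main() {
  ThreadLocalPacked scratch(/*num_threads=*/4, /*grain=*/2,
                            /*num_shards=*/8, /*block_size=*/16);
  // Thread 3 packing the second sub-block of shard 5 writes into slot 3*2+1.
  float* buf = scratch.block(/*thread_id=*/3, /*shard=*/5, /*sub_block=*/11);
  buf[0] = 1.0f;
  std::cout << scratch.blocks[7][0] << "\n";  // 1
}
```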
+ eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, + std::memory_order_relaxed); + } + } + const Index mend = m * gm_ + gm(m); for (Index m1 = m * gm_; m1 < mend; m1++) - TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1], + TensorContractionKernel::packLhs(packed_lhs(m, k, m1, use_thread_local), lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); if (!parallel_pack_ && shard_by_col_) { + assert(!use_thread_local); signal_packing(k); } else { signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } } void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in followig slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, + std::memory_order_relaxed); + } + } + const Index nend = n * gn_ + gn(n); for (Index n1 = n * gn_; n1 < nend; n1++) { if (k == 0) { @@ -462,20 +634,24 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // deadlocks. memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); } - TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1], + TensorContractionKernel::packRhs(packed_rhs(n, k, n1, use_thread_local), rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); } if (parallel_pack_ || shard_by_col_) { signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } else { + assert(!use_thread_local); signal_packing(k); } } - void kernel(Index m, Index n, Index k) { + void kernel(Index m, Index n, Index k, bool use_thread_local) { // Note: order of iteration matters here. Iteration over m is innermost // because we want to reuse the same packed rhs in consecutive tasks // (rhs fits into L2$ while lhs only into L3$). @@ -486,8 +662,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT for (Index m1 = m * gm_; m1 < mend; m1++) { const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); TensorContractionKernel::invoke( - output_mapper, packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1)); + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), Scalar(1)); // We are done with the last task for the [m1, n1] block. 
if (k + 1 == nk_) { @@ -501,8 +679,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT for (Index n1 = n * gn_; n1 < nend; n1++) { const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); TensorContractionKernel::invoke( - output_mapper, packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1)); + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), Scalar(1)); // We are done with the last task for the [m1, n1] block. if (k + 1 == nk_) { @@ -511,7 +691,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT } } } - signal_kernel(m, n, k + 1, false); + signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); signal_switch(k + 2); } @@ -524,16 +704,23 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT enqueue_packing(k, shard_by_col_); } - void signal_kernel(Index m, Index n, Index k, bool sync) { + void signal_kernel(Index m, Index n, Index k, bool sync, + bool use_thread_local) { std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n]; Index s = state->load(); eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) return; + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); - if (sync) - kernel(m, n, k); - else - device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification( + [=]() { kernel(m, n, k, use_thread_local); }); + } } void signal_switch(Index k, Index v = 1) { @@ -589,7 +776,26 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT [=]() { enqueue_packing_helper(mid, end, k, rhs); }); end = mid; } - enqueue_packing_helper(start, end, k, rhs); + + // Decide if we want to run first packing task (start == 0) in + // async mode if we parallelize only by sharding dim: + // (1) pack_lhs and pack_rhs call signal_switch before completing + // all calls to signal_kernel, which in sync mode might lead + // to the execution of the first kernel of the k+1 slice, before + // completing a call to the last kernel of the k slice. + // (2) all pack tasks for sharded dim must be executed in a thread + // pool. 
+ bool pack_async = + (start == 0) && + (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) && + (k > 0 || device_.currentThreadId() < 0); + + if (pack_async) { + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, end, k, rhs); }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } } } @@ -756,6 +962,36 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT } } + template <int Alignment> + EIGEN_STRONG_INLINE void addAllToBuffer(size_t n, const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) const { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits<PacketReturnType>::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload<PacketReturnType>(src_buf0 + i); + const auto src_val1 = pload<PacketReturnType>(src_buf1 + i); + const auto src_val2 = pload<PacketReturnType>(src_buf2 + i); + + const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i); + const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + // Decide whether we want to shard m x k x n contraction over the inner // (contraction) dimension (k). static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, @@ -788,48 +1024,147 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT const Index m = this->m_i_size; const Index n = this->m_j_size; const Index k = this->m_k_size; - // The underlying GEMM kernel assumes that k is a multiple of 8 and - // subtle breakage occurs if this is violated. - Index block_size = 8 * divup<Index>(k, 8 * num_threads); - Index num_blocks = divup<Index>(k, block_size); - // we use 'result' for the first block's partial result. - MaxSizeVector<Scalar*> block_buffers(num_blocks - 1); - Barrier barrier(internal::convert_index<int>(num_blocks)); - auto process_block = [=, &barrier](Scalar* buf, Index begin, Index end) { - ::memset(buf, 0, m * n * sizeof(Scalar)); + + // We will compute partial results into the buffers of this size. + const Index buffer_size_bytes = m * n * sizeof(Scalar); + + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. + const Index packet_size = internal::packet_traits<RhsScalar>::size; + + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + return divup<Index>(index, kmultiple) * kmultiple; + }; + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs and + // gemm_pack_rhs, so we specify minimum desired block size. + const Index target_block_size = round_up(divup<Index>(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + const Index block_size = numext::mini<Index>( + k, numext::maxi<Index>(desired_min_block_size, target_block_size)); + const Index num_blocks = divup<Index>(k, block_size); + + // Compute block size with accounting for potentially incomplete last block. 
+ const auto actual_block_size = [=](Index block_idx) -> Index { + return block_idx + 1 < num_blocks + ? block_size + : k + block_size - block_size * num_blocks; + }; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks finihes, we need to add only these pre-aggregated blocks. + + // Compute range size with accounting for potentially incomplete last range. + const auto actual_range_size = [=](Index num_ranges, Index range_size, + Index range_idx) -> Index { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + const Index l0_ranges = divup<Index>(num_blocks, l0_size); + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector<std::atomic<int>> l0_state(l0_ranges); + for (int i = 0; i < l0_ranges; ++i) { + const Index num_pending_tasks = actual_range_size(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks)); + } + + MaxSizeVector<Scalar*> block_buffers(num_blocks); + + auto process_block = [&, this](Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + ::memset(buf, 0, buffer_size_bytes); + TENSOR_CONTRACTION_DISPATCH( this->template evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, this->m_device.numThreads())); - barrier.Notify(); - }; - Index start = 0; - for (Index blocks_left = num_blocks; blocks_left > 0; --blocks_left) { - // The underlying GEMM kernel assumes that k is a multiple of packet size - // (currently largest packet size is 8) and subtle breakage occurs if - // this is violated. - block_size = 8 * divup<Index>(k - start, 8 * blocks_left); - Scalar* buf; - if (start == 0) { - buf = result; - } else { - buf = static_cast<Scalar*>( - this->m_device.allocate(m * n * sizeof(Scalar))); - block_buffers.push_back(buf); - } - Index end = start + block_size; - if (end > k) { - end = k; + (buf, begin, end, + /*num_threads=*/internal::convert_index<int>(num_blocks))); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. 
+ if (v == 1) { + const Index rng_size = actual_range_size(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer<Alignment>( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer<Alignment>(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } } - this->m_device.enqueueNoNotification( - [=, &process_block]() { process_block(buf, start, end); }); - start = end; + }; + + Barrier barrier(internal::convert_index<int>(num_blocks)); + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast<Scalar*>( + this->m_device.allocate(buffer_size_bytes)); + block_buffers.push_back(buf); + + Index block_start = block_idx * block_size; + Index block_end = block_start + actual_block_size(block_idx); + + this->m_device.enqueueNoNotification([=, &barrier, &process_block]() { + process_block(block_idx, block_start, block_end); + barrier.Notify(); + }); } barrier.Wait(); - // Add other partial results into first partial result. - for (const auto& buf : block_buffers) { - addToBuffer<Alignment>(m * n, buf, result); - this->m_device.deallocate(buf); + // Aggregate partial sums from l0 ranges. + Index l0_index = 1; + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer<Alignment>( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */block_buffers[0]); + } + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + + // Don't forget to deallocate ALL temporary buffers. + for (Index i = 1; i < num_blocks; ++i) { + this->m_device.deallocate(block_buffers[i]); } // Finally call output kernel with finalized output buffer. 
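evalShardedByInnerDim() above avoids a long sequential reduction by grouping the per-block partial results into ranges of l0_size, pre-aggregating each range into its first buffer, and then summing only the range leaders. Here is a small single-threaded sketch of that two-level aggregation; the data, sizes and loop structure are illustrative, and the atomic pending-task counters and the packet-based addAllToBuffer are omitted.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Standalone sketch of the two-level partial-sum aggregation: partial GEMM
// results land in per-block buffers; blocks are grouped into ranges of
// l0_size, each range is pre-aggregated into its first buffer, and a final
// pass adds only the range leaders into the output buffer (block 0).
int main() {
  const std::size_t n = 8;            // elements per partial result
  const std::size_t num_blocks = 10;  // number of k-slices
  const std::size_t l0_size = 4;      // blocks per pre-aggregation range
  const std::size_t l0_ranges = (num_blocks + l0_size - 1) / l0_size;

  // Fake partial results: block b contributes the constant (b + 1).
  std::vector<std::vector<double>> buf(num_blocks, std::vector<double>(n));
  for (std::size_t b = 0; b < num_blocks; ++b)
    std::fill(buf[b].begin(), buf[b].end(), double(b + 1));

  // Level-0: fold every range into its first block's buffer.
  for (std::size_t r = 0; r < l0_ranges; ++r) {
    const std::size_t first = r * l0_size;
    const std::size_t size =
        (r + 1 < l0_ranges) ? l0_size : num_blocks - first;
    for (std::size_t b = 1; b < size; ++b)
      for (std::size_t i = 0; i < n; ++i) buf[first][i] += buf[first + b][i];
  }

  // Final sequential pass: add the range leaders into block 0 (the output).
  for (std::size_t r = 1; r < l0_ranges; ++r)
    for (std::size_t i = 0; i < n; ++i) buf[0][i] += buf[r * l0_size][i];

  // All ten blocks should now be summed: 1 + 2 + ... + 10 = 55 per element.
  std::cout << buf[0][0] << "\n";
}
```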
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 1f613d3c7..938fd0f34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,7 +32,7 @@ struct traits<TensorConversionOp<TargetType, XprType> > static const int NumDimensions = traits<XprType>::NumDimensions; static const int Layout = traits<XprType>::Layout; enum { Flags = 0 }; - typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType; + typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType; }; template<typename TargetType, typename XprType> @@ -177,6 +177,81 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eva } }; +namespace internal { + +template <typename SrcType, typename TargetType, bool IsSameT> +struct CoeffConv { + template <typename ArgType, typename Device> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + internal::scalar_cast_op<SrcType, TargetType> converter; + return converter(impl.coeff(index)); + } +}; + +template <typename SrcType, typename TargetType> +struct CoeffConv<SrcType, TargetType, true> { + template <typename ArgType, typename Device> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + return impl.coeff(index); + } +}; + +template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT> +struct PacketConv { + typedef typename internal::unpacket_traits<SrcPacket>::type SrcType; + typedef typename internal::unpacket_traits<TargetPacket>::type TargetType; + + static const int PacketSize = internal::unpacket_traits<TargetPacket>::size; + + template <typename ArgType, typename Device> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + internal::scalar_cast_op<SrcType, TargetType> converter; + EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + TargetPacket rslt = internal::pload<TargetPacket>(values); + return rslt; + } +}; + +template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT> +struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> { + typedef typename internal::unpacket_traits<SrcPacket>::type SrcType; + typedef typename internal::unpacket_traits<TargetPacket>::type TargetType; + + template <typename ArgType, typename Device> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; + PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet<LoadMode>(index); + } +}; + +template <typename SrcPacket, typename TargetPacket, int LoadMode> +struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> { + typedef typename internal::unpacket_traits<TargetPacket>::type TargetType; + static const int PacketSize = 
internal::unpacket_traits<TargetPacket>::size; + + template <typename ArgType, typename Device> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i); + return internal::pload<TargetPacket>(values); + } +}; + +template <typename SrcPacket, typename TargetPacket, int LoadMode> +struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> { + template <typename ArgType, typename Device> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + return impl.template packet<LoadMode>(index); + } +}; + +} // namespace internal // Eval as rvalue template<typename TargetType, typename ArgType, typename Device> @@ -191,6 +266,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename PacketType<SrcType, Device>::type PacketSourceType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + static const bool IsSameType = internal::is_same<TargetType, SrcType>::value; enum { IsAligned = false, @@ -210,7 +286,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data); + return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() @@ -220,16 +296,23 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - internal::scalar_cast_op<SrcType, TargetType> converter; - return converter(m_impl.coeff(index)); + return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index); } template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess & - internal::type_casting_traits<SrcType, TargetType>::VectorizedCast; - return PacketConv<LoadMode, Vectorizable>::run(m_impl, index); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType + packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = + IsSameType + ? 
TensorEvaluator<ArgType, Device>::PacketAccess + : TensorEvaluator<ArgType, Device>::PacketAccess & + internal::type_casting_traits<SrcType, TargetType>::VectorizedCast; + + return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode, + Vectorizable, IsSameType>::run(m_impl, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -252,31 +335,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> /// required by sycl in order to extract the sycl accessor const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - protected: - template <int LoadMode, bool ActuallyVectorize> - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - internal::scalar_cast_op<SrcType, TargetType> converter; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - }; - - template <int LoadMode> - struct PacketConv<LoadMode, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; - PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet<LoadMode>(index); - } - }; - + protected: TensorEvaluator<ArgType, Device> m_impl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index bb330a77b..b43db40c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -87,13 +87,13 @@ struct ThreadPoolDevice { const size_t kMinBlockSize = 32768; typedef TensorCostModel<ThreadPoolDevice> CostModel; const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); - if (n <= kMinBlockSize || num_threads == 1) { + if (n <= kMinBlockSize || num_threads < 2) { ::memcpy(dst, src, n); } else { const char* src_ptr = static_cast<const char*>(src); char* dst_ptr = static_cast<char*>(dst); const size_t blocksize = (n + (num_threads - 1)) / num_threads; - Barrier barrier(num_threads - 1); + Barrier barrier(static_cast<int>(num_threads - 1)); // Launch the last 3 blocks on worker threads. for (size_t i = 1; i < num_threads; ++i) { enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { @@ -122,6 +122,12 @@ struct ThreadPoolDevice { return num_threads_; } + // Number of theads available in the underlying thread pool. This number can + // be different from the value returned by numThreads(). 
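Relating to the TensorConversion.h hunk further up: the new CoeffConv/PacketConv helpers are specialized on an IsSameT flag so that a cast to the same scalar type becomes a pure pass-through. The sketch below shows only that specialization pattern with made-up names; the stand-in evaluator and the explicit boolean argument are simplifications of what the real traits compute.

```cpp
#include <iostream>

// Sketch of the "skip the cast when source and target types match" pattern:
// a converter class template is specialized on a compile-time IsSame flag,
// so the same-type case returns the coefficient untouched instead of routing
// it through a cast functor.
template <typename SrcType, typename TargetType, bool IsSame>
struct CoeffConvSketch {
  template <typename Evaluator>
  static TargetType run(const Evaluator& impl, int index) {
    return static_cast<TargetType>(impl.coeff(index));  // perform the cast
  }
};

template <typename SrcType, typename TargetType>
struct CoeffConvSketch<SrcType, TargetType, true> {
  template <typename Evaluator>
  static TargetType run(const Evaluator& impl, int index) {
    return impl.coeff(index);  // same type: no conversion at all
  }
};

// A tiny stand-in evaluator producing float coefficients.
struct FloatEvaluator {
  float coeff(int i) const { return 0.5f * float(i); }
};

int main() {
  FloatEvaluator eval;
  // float -> int goes through the generic (casting) primary template.
  std::cout << CoeffConvSketch<float, int, false>::run(eval, 7) << "\n";   // 3
  // float -> float picks the specialization and is a pure pass-through.
  std::cout << CoeffConvSketch<float, float, true>::run(eval, 7) << "\n";  // 3.5
}
```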
+ EIGEN_STRONG_INLINE int numThreadsInPool() const { + return pool_->NumThreads(); + } + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 1c44541bd..e2ff11129 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -325,7 +325,6 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> { static void run(const Expression& expr, const GpuDevice& device); }; - #if defined(EIGEN_GPUCC) template <typename Evaluator, typename StorageIndex, bool Vectorizable> struct EigenMetaKernelEval { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 78068be35..74b905329 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -90,14 +90,21 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> static const int PacketSize = PacketType<CoeffReturnType, Device>::size; enum { - IsAligned = true, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, + IsAligned = true, + PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), + BlockAccess = internal::is_arithmetic<CoeffReturnType>::value, PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = true + Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = true }; + typedef typename internal::TensorBlock< + CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout> + TensorBlockReader; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) /// op_ is used for sycl : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) @@ -139,6 +146,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>*) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_buffer != NULL); + TensorBlockReader::Run(block, m_buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index ac66f9cf1..204a6fd33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -89,17 +89,22 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; enum { - IsAligned = false, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), + 
BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + typedef internal::TensorIntDivisor<Index> IndexDivisor; + + typedef internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout> + TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) + : m_device(device), m_generator(op.generator()) #ifdef EIGEN_USE_SYCL , m_argImpl(op.expression(), device) #endif @@ -111,11 +116,13 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> m_strides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } else { m_strides[NumDims - 1] = 1; for (int i = NumDims - 2; i >= 0; --i) { m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } } @@ -150,6 +157,75 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, block_total_size_max)); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) return; + + static const bool is_col_major = + static_cast<int>(Layout) == static_cast<int>(ColMajor); + + // Compute spatial coordinates for the first block element. + array<Index, NumDims> coords; + extract_coordinates(output_block->first_coeff_index(), coords); + array<Index, NumDims> initial_coords = coords; + + CoeffReturnType* data = output_block->data(); + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array<BlockIteratorState, NumDims> it; + for (Index i = 0; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = output_block->block_sizes()[dim]; + it[i].stride = output_block->block_strides()[dim]; + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Generate data for the inner-most dimension. + for (Index i = 0; i < it[0].size; ++i) { + *(data + offset + i) = m_generator(coords); + coords[is_col_major ? 0 : NumDims - 1]++; + } + coords[is_col_major ? 0 : NumDims - 1] = + initial_coords[is_col_major ? 0 : NumDims - 1]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (Index i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = + initial_coords[is_col_major ? 
i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. Define interface to make @@ -170,14 +246,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> void extract_coordinates(Index index, array<Index, NumDims>& coords) const { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } coords[0] = index; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } @@ -185,8 +261,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> } } + const Device& m_device; Dimensions m_dimensions; array<Index, NumDims> m_strides; + array<IndexDivisor, NumDims> m_fast_strides; Generator m_generator; #ifdef EIGEN_USE_SYCL TensorEvaluator<ArgType, Device> m_argImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 50fa0cb2e..bb63433fe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -402,25 +402,25 @@ struct OuterReducer { #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) -template <int B, int N, typename S, typename R, typename I> -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); +template <int B, int N, typename S, typename R, typename I_> +__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); #if defined(EIGEN_HAS_GPU_FP16) -template <typename S, typename R, typename I> -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); -template <int B, int N, typename S, typename R, typename I> -__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); -template <int NPT, typename S, typename R, typename I> -__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); +template <typename S, typename R, typename I_> +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*); +template <int B, int N, typename S, typename R, typename I_> +__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, half2*); +template <int NPT, typename S, typename R, typename I_> +__global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif -template <int NPT, typename S, typename R, typename I> -__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template <int NPT, typename S, typename R, typename I_> +__global__ void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); -template <int NPT, typename S, typename R, typename I> -__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template <int NPT, typename S, typename R, typename I_> +__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif template <typename Self, typename Op, @@ -1114,15 +1114,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) - template 
<int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); #if defined(EIGEN_HAS_GPU_FP16) - template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); + template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*); + template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, half2*); + template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif - template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); - template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif #if defined(EIGEN_USE_SYCL) |
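As background for the TensorGenerator changes above, which cache TensorIntDivisor fast strides for extract_coordinates, this is a plain-division sketch of how a linear index is decomposed into per-dimension coordinates for a column-major tensor; the dimensions and the function name are only an example, and ordinary integer division stands in for the fast-divisor objects.

```cpp
#include <array>
#include <iostream>

// Column-major coordinate extraction: the stride of dimension i is the
// product of all lower-dimensional sizes, and a linear index is decomposed
// by repeated division from the outermost dimension inwards.
template <int NumDims>
std::array<long, NumDims> extract_coords_col_major(
    long index, const std::array<long, NumDims>& dims) {
  std::array<long, NumDims> strides, coords;
  strides[0] = 1;
  for (int i = 1; i < NumDims; ++i) strides[i] = strides[i - 1] * dims[i - 1];
  for (int i = NumDims - 1; i > 0; --i) {
    coords[i] = index / strides[i];
    index -= coords[i] * strides[i];
  }
  coords[0] = index;
  return coords;
}

int main() {
  // A 4 x 3 x 5 tensor: linear index 37 = 1 + 4*0 + 12*3, i.e. coords (1, 0, 3).
  std::array<long, 3> dims = {4, 3, 5};
  std::array<long, 3> c = extract_coords_col_major<3>(37, dims);
  std::cout << c[0] << " " << c[1] << " " << c[2] << "\n";  // 1 0 3
}
```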