path: root/unsupported/Eigen/CXX11/src/Tensor
author    Deven Desai <deven.desai.amd@gmail.com>  2019-03-19 16:52:38 -0400
committer Deven Desai <deven.desai.amd@gmail.com>  2019-03-19 16:52:38 -0400
commit    2dbea5510fe5cb64dbfdef9042c04a3a92b87f76 (patch)
tree      c187e7ec5e90a191e19466ff6084dd8f053dba7e /unsupported/Eigen/CXX11/src/Tensor
parent    e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a (diff)
parent    5c93b38c5fca514a08084e32feb8a8fb27bf3665 (diff)
Merged eigen/eigen into default
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 9
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 31
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 447
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 127
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 25
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 98
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 36
12 files changed, 671 insertions, 142 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 7d9afa685..dbacf494e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -244,9 +244,11 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex,
+ TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+ Derived>::type
conjugate() const {
- return unaryExpr(internal::scalar_conjugate_op<Scalar>());
+ return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
}
EIGEN_DEVICE_FUNC
@@ -339,10 +341,13 @@ class TensorBase<Derived, ReadOnlyAccessors>
return cwiseMin(constant(threshold));
}
- template <typename NewType> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived>
+ template<typename NewType>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value,
+ Derived,
+ TensorConversionOp<NewType, const Derived> >::type
cast() const {
- return TensorConversionOp<NewType, const Derived>(derived());
+ return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
}
EIGEN_DEVICE_FUNC
@@ -628,26 +633,26 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+ const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
all(const Dims& dims) const {
return cast<bool>().reduce(dims, internal::AndReducer());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+ const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
all() const {
DimensionList<Index, NumDimensions> in_dims;
return cast<bool>().reduce(in_dims, internal::AndReducer());
}
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+ const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
any(const Dims& dims) const {
return cast<bool>().reduce(dims, internal::OrReducer());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+ const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
any() const {
DimensionList<Index, NumDimensions> in_dims;
return cast<bool>().reduce(in_dims, internal::OrReducer());
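
Note: the compile-time selection idiom used above (return Derived unchanged when the conjugate or cast is a no-op) can be sketched standalone as follows. All names here (Tensorish, CastExpr, the choose/Cond helpers) are illustrative stand-ins for this note, not the Eigen internals; the sketch only assumes C++11 <type_traits>.

    #include <type_traits>

    template <bool C> struct Cond {};

    // choose() picks one of its two arguments at compile time; for a given
    // Cond<> tag only one overload participates in overload resolution.
    template <typename A, typename B> A choose(Cond<true>, A a, B) { return a; }
    template <typename A, typename B> B choose(Cond<false>, A, B b) { return b; }

    template <typename To, typename From> struct CastExpr {};  // hypothetical cast node

    template <typename T>
    struct Tensorish {                                         // hypothetical expression
      // cast<NewType>() collapses to the expression itself when NewType == T,
      // so no conversion node is added to the expression tree.
      template <typename NewType>
      typename std::conditional<std::is_same<NewType, T>::value,
                                Tensorish, CastExpr<NewType, T> >::type
      cast() const {
        return choose(Cond<std::is_same<NewType, T>::value>(),
                      *this, CastExpr<NewType, T>());
      }
    };

    // Tensorish<float> t;
    // t.cast<float>();   // yields Tensorish<float>, a no-op
    // t.cast<double>();  // yields CastExpr<double, float>

This mirrors how the patch avoids creating spurious conversion or conjugate nodes for same-type casts and real-valued conjugates.
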
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 61a4e1a3a..6ca881f27 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -102,7 +102,7 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKern
typedef typename remove_reference<RhsNested>::type _RhsNested;
// From NumDims below.
- static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+ static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
static const int Layout = traits<LhsXprType>::Layout;
typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
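
Note: the one-character fix above corrects the output rank formula, which previously read the rhs rank twice. A quick worked example (ranks chosen for illustration):

    // lhs rank = 3, rhs rank = 2, one contracted index pair:
    //   fixed:  NumDimensions = 3 + 2 - 2*1 = 3   (Lhs + Rhs - 2 * contracted pairs)
    //   before: NumDimensions = 2 + 2 - 2*1 = 2   (Rhs rank counted twice)
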
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 71fd19774..c51f3f8dd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -51,6 +51,10 @@ class TensorContractionBlocking {
else {
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
}
+
+ const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+ kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
+ kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
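
Note: the new depth-blocking adjustment rounds kc_ down to a multiple of the rhs packet size, but only when that packet size exceeds 8 and kc_ is already larger than it. A standalone sketch with hypothetical values (function name is illustrative):

    int adjust_kc(int kc, int rhs_packet_size) {
      return (rhs_packet_size <= 8 || kc <= rhs_packet_size)
                 ? kc
                 : (kc / rhs_packet_size) * rhs_packet_size;
    }

    // adjust_kc(100, 4)  == 100  (packet size <= 8, left untouched)
    // adjust_kc(100, 16) == 96   (rounded down to 6 * 16)
    // adjust_kc(10, 16)  == 10   (kc <= packet size, left untouched)
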
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
index 056665749..5d19652e6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -1219,9 +1219,6 @@ template<typename Indices, typename LeftArgType, typename RightArgType, typename
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
- static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
- "GPU tensor contraction does not support output kernels.");
-
typedef GpuDevice Device;
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
@@ -1274,7 +1271,11 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef typename RightEvaluator::Dimensions RightDimensions;
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
+ Base(op, device)
+ {
+ EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
+ GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
+ }
// We need to redefine this method to make nvcc happy
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 2d3b69128..142492603 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -120,6 +120,7 @@ class SimpleTensorContractionMapper {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
const bool left = (side == Lhs);
+ EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
Index nocontract_val = left ? row : col;
Index linidx = 0;
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
@@ -158,6 +159,7 @@ class SimpleTensorContractionMapper {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
const bool left = (side == Lhs);
+ EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
@@ -239,8 +241,10 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
template <typename PacketT,int AlignmentType>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
+ load(Index i, Index j) const
+ {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
@@ -282,6 +286,29 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
}
template <typename PacketT,int AlignmentType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
+ load(Index i, Index j) const
+ {
+ const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
+ EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
+
+ const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+ const Index first = indexPair.first;
+ const Index lastIdx = indexPair.second;
+
+ data[0] = this->m_tensor.coeff(first);
+ for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+ const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+ data[k] = this->m_tensor.coeff(internal_pair.first);
+ data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+ }
+ data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+ return pload<PacketT>(data);
+ }
+
+ template <typename PacketT,int AlignmentType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
return this->load<PacketT,AlignmentType>(i,j);
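
Note: the second load() overload above is a fallback for packet types whose width differs from the mapper's native packet_size; it gathers coefficients into an aligned scratch buffer and then issues a single pload. A rough standalone sketch of the same gather-then-load pattern, reading from a plain strided pointer instead of the mapper's index-pair machinery (assumes Eigen headers; the gather logic here is deliberately simplified):

    #include <Eigen/Core>

    template <typename PacketT, typename Scalar>
    PacketT gather_load(const Scalar* src, Eigen::Index stride) {
      const int n = Eigen::internal::unpacket_traits<PacketT>::size;
      // Scalar gather into an aligned buffer, then one packet load.
      EIGEN_ALIGN_MAX Scalar data[Eigen::internal::unpacket_traits<PacketT>::size];
      for (int k = 0; k < n; ++k) data[k] = src[k * stride];
      return Eigen::internal::pload<PacketT>(data);
    }
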
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 24ba3e431..adf57c892 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -208,6 +208,26 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Index nm = divup(nm0, gm);
Index nn = divup(nn0, gn);
+ // If there is enough concurrency in the sharding dimension, we choose not
+ // to parallelize by the other dimension, and execute all kernels in sync
+ // mode. This reduces parallelism from nm x nn down to nn
+ // (shard_by_col==true) or nm (shard_by_col==false).
+ const Index sharding_dim_tasks = shard_by_col ? nn : nm;
+ const int num_worker_threads = this->m_device.numThreadsInPool();
+
+ // With a small number of threads we want to make sure that we do not reduce
+ // parallelism too much. With a large number of threads we trade maximum
+ // parallelism for better memory locality.
+ const float oversharding_factor =
+ num_worker_threads <= 4 ? 8.0 :
+ num_worker_threads <= 8 ? 4.0 :
+ num_worker_threads <= 16 ? 2.0 :
+ num_worker_threads <= 32 ? 1.0 :
+ num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
+
+ const bool parallelize_by_sharding_dim_only =
+ sharding_dim_tasks >= oversharding_factor * num_worker_threads;
+
// Last but not least, decide whether we want to issue both lhs and rhs
// packing in parallel; or issue lhs packing first, and then issue rhs
// packing when lhs packing completes (for !shard_by_col lhs and rhs are
@@ -223,10 +243,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// But don't do it if we will use each rhs only once. Locality seems to be
// more important in this case.
if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+ // Also don't get in the way of parallelize_by_sharding_dim_only
+ // optimization.
+ if (parallelize_by_sharding_dim_only) parallel_pack = false;
- #define CONTEXT_ARGS \
+#define CONTEXT_ARGS \
(this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
- nn0, shard_by_col, parallel_pack) \
+ nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only) \
.run()
TENSOR_CONTRACTION_DISPATCH(Context, Alignment, CONTEXT_ARGS);
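
Note: the parallelize_by_sharding_dim_only heuristic introduced in the hunk above can be summarized by this standalone sketch (thresholds copied from the diff; all inputs are hypothetical):

    bool parallelize_by_sharding_dim_only(long sharding_dim_tasks,
                                          int num_worker_threads) {
      // Small pools tolerate heavy oversharding; large pools trade parallelism
      // for memory locality.
      const float oversharding_factor =
          num_worker_threads <= 4  ? 8.0f :
          num_worker_threads <= 8  ? 4.0f :
          num_worker_threads <= 16 ? 2.0f :
          num_worker_threads <= 32 ? 1.0f :
          num_worker_threads <= 64 ? 0.8f : 0.6f;
      return sharding_dim_tasks >= oversharding_factor * num_worker_threads;
    }

    // With 8 worker threads the sharding dimension needs >= 32 tasks;
    // with 48 worker threads it needs >= 38.4, so 40 qualifies but 30 does not.
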
@@ -260,7 +283,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col,
- bool parallel_pack)
+ bool parallel_pack, bool parallelize_by_sharding_dim_only)
: device_(self->m_device),
lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
self->m_i_strides, self->m_left_contracting_strides,
@@ -275,6 +298,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
num_threads_(num_threads),
shard_by_col_(shard_by_col),
parallel_pack_(parallel_pack),
+ parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
m_(tm),
n_(tn),
k_(tk),
@@ -289,6 +313,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
nm0_(nm0),
nn0_(nn0)
{
+ // These two options are mutually exclusive.
+ eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
+
for (Index x = 0; x < P; x++) {
// Normal number of notifications for k slice switch is
// nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
@@ -335,6 +362,42 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
mem += rhs_size;
}
}
+
+ if (parallelize_by_sharding_dim_only_) {
+ const int num_worker_threads = device_.numThreadsInPool();
+
+ if (shard_by_col) {
+ can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
+ for (int i = 0; i < nn_; ++i)
+ can_use_thread_local_packed_[i].store(true,
+ std::memory_order_relaxed);
+
+ Index num_blocks = num_worker_threads * gn_;
+ thread_local_packed_mem_ = device_.allocate(num_blocks * rhs_size);
+ mem = static_cast<char*>(thread_local_packed_mem_);
+
+ thread_local_packed_rhs_.resize(num_blocks, nullptr);
+ for (Index i = 0; i < num_blocks; ++i) {
+ thread_local_packed_rhs_[i] = reinterpret_cast<RhsScalar*>(mem);
+ mem += rhs_size;
+ }
+ } else {
+ can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
+ for (int i = 0; i < nm_; ++i)
+ can_use_thread_local_packed_[i].store(true,
+ std::memory_order_relaxed);
+
+ Index num_blocks = num_worker_threads * gm_;
+ thread_local_packed_mem_ = device_.allocate(num_blocks * lhs_size);
+ mem = static_cast<char*>(thread_local_packed_mem_);
+
+ thread_local_packed_lhs_.resize(num_blocks, nullptr);
+ for (Index i = 0; i < num_blocks; ++i) {
+ thread_local_packed_lhs_[i] = reinterpret_cast<LhsScalar*>(mem);
+ mem += lhs_size;
+ }
+ }
+ }
}
~Context() {
@@ -343,6 +406,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
delete[] state_kernel_[x];
}
device_.deallocate(packed_mem_);
+ if (parallelize_by_sharding_dim_only_) {
+ device_.deallocate(thread_local_packed_mem_);
+ delete[] can_use_thread_local_packed_;
+ }
}
void run() {
@@ -368,6 +435,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const int num_threads_;
const bool shard_by_col_;
const bool parallel_pack_;
+ const bool parallelize_by_sharding_dim_only_;
// Matrix sizes.
const Index m_;
const Index n_;
@@ -426,6 +494,36 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void* packed_mem_;
std::vector<LhsScalar*> packed_lhs_[P - 1];
std::vector<RhsScalar*> packed_rhs_[P - 1];
+
+ // If we choose to parallelize only by the sharding dimension, each thread
+ // will have its own "thread local" (not a C++ thread_local storage) memory
+ // for packed_lhs or packed_rhs (shard_by_col == false or true). This memory
+ // can't be passed to a kernel that might execute on a different thread.
+ //
+ // In practice, when we are ready to pack memory for the sharding dimension
+ // (rhs if shard_by_col==true) of the K-th slice, all kernels for the K-1
+ // slice have already been computed (99% of the time), so we can pack data
+ // into the thread local storage and guarantee that all the kernels will be
+ // executed immediately in the same thread. This significantly increases the
+ // L1 cache hit ratio and reduces pressure on the memory bus.
+ //
+ // It's still possible that a kernel for the K-th slice will be ready before
+ // completion of the K-1 kernels, so we still have to allocate the "global"
+ // packed_lhs_ and packed_rhs_ to allow kernels to be executed later on a
+ // thread different from the thread that was used for packing.
+ void* thread_local_packed_mem_;
+
+ // Only one of these will be initialized depending on the shard_by_col value.
+ std::vector<LhsScalar*> thread_local_packed_lhs_;
+ std::vector<RhsScalar*> thread_local_packed_rhs_;
+
+ // After a particular shard of the K-th slice misses the thread local
+ // execution opportunity (the K-1 slice didn't complete its kernels), we
+ // can no longer schedule the K+1 and following slices in thread local
+ // mode, because there is no longer a guarantee that the previous kernels
+ // were executed sequentially in the same thread (size is nn_ or nm_).
+ std::atomic<bool>* can_use_thread_local_packed_;
+
std::atomic<uint8_t>** state_kernel_[P];
// state_switch_ is frequently modified by worker threads, while other
// fields are read-only after constructor. Let's move it to a separate cache
@@ -434,22 +532,96 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
std::atomic<Index> state_packing_ready_[P];
std::atomic<Index> state_switch_[P];
+ LhsScalar* packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
+ if (use_thread_local) {
+ eigen_assert(!shard_by_col_);
+
+ Index base_idx = gm_ * device_.currentThreadId();
+ Index grain_idx = m1 - m * gm_;
+ Index block_idx = base_idx + grain_idx;
+
+ return thread_local_packed_lhs_[block_idx];
+ } else {
+ return packed_lhs_[k % (P - 1)][m1];
+ }
+ }
+
+ RhsScalar* packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
+ if (use_thread_local) {
+ eigen_assert(shard_by_col_);
+
+ Index base_idx = gn_ * device_.currentThreadId();
+ Index grain_idx = n1 - n * gn_;
+ Index block_idx = base_idx + grain_idx;
+
+ return thread_local_packed_rhs_[block_idx];
+ } else {
+ return packed_rhs_[k % (P - 1)][n1];
+ }
+ }
+
+ // In the following two methods (pack_lhs and pack_rhs), if we know for sure
+ // that we'll be able to immediately call a kernel with packed data, and do
+ // not submit it to the thread pool, we can use thread local memory for
+ // packed data.
+ //
+ // We can only reliably check this if we are running all kernels in sync mode
+ // (parallelize only by the sharding dim). If the kernel for m==0 (n==0) is ready to
+ // run, it's guaranteed that all kernels with larger values of m (n) are
+ // also ready, because we execute them in the same order for all K slices.
+
void pack_lhs(Index m, Index k) {
+ bool use_thread_local = false;
+
+ if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
+ can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
+ if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
+ use_thread_local = true;
+ } else {
+ // If we can't guarantee that all kernels in the `k` slice will be
+ // executed sequentially in the current thread, it's no longer safe to
+ // use thread local memory in following slices along the k dimension.
+ eigen_assert(k > 0);
+ can_use_thread_local_packed_[m].store(false,
+ std::memory_order_relaxed);
+ }
+ }
+
const Index mend = m * gm_ + gm(m);
for (Index m1 = m * gm_; m1 < mend; m1++)
- TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+ TensorContractionKernel::packLhs(packed_lhs(m, k, m1, use_thread_local),
lhs_.getSubMapper(m1 * bm_, k * bk_),
bk(k), bm(m1));
if (!parallel_pack_ && shard_by_col_) {
+ assert(!use_thread_local);
signal_packing(k);
} else {
signal_switch(k + 1);
- for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+ for (Index n = nn_ - 1; n >= 0; n--) {
+ bool sync = parallelize_by_sharding_dim_only_ || n == 0;
+ signal_kernel(m, n, k, sync, use_thread_local);
+ }
}
}
void pack_rhs(Index n, Index k) {
+ bool use_thread_local = false;
+
+ if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
+ can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
+ if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
+ use_thread_local = true;
+ } else {
+ // If we can't guarantee that all kernels in the `k` slice will be
+ // executed sequentially in the current thread, it's no longer safe to
+ // use thread local memory in following slices along the k dimension.
+ eigen_assert(k > 0);
+ can_use_thread_local_packed_[n].store(false,
+ std::memory_order_relaxed);
+ }
+ }
+
const Index nend = n * gn_ + gn(n);
for (Index n1 = n * gn_; n1 < nend; n1++) {
if (k == 0) {
@@ -462,20 +634,24 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// deadlocks.
memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
}
- TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+ TensorContractionKernel::packRhs(packed_rhs(n, k, n1, use_thread_local),
rhs_.getSubMapper(k * bk_, n1 * bn_),
bk(k), bn(n1));
}
if (parallel_pack_ || shard_by_col_) {
signal_switch(k + 1);
- for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+ for (Index m = nm_ - 1; m >= 0; m--) {
+ bool sync = parallelize_by_sharding_dim_only_ || m == 0;
+ signal_kernel(m, n, k, sync, use_thread_local);
+ }
} else {
+ assert(!use_thread_local);
signal_packing(k);
}
}
- void kernel(Index m, Index n, Index k) {
+ void kernel(Index m, Index n, Index k, bool use_thread_local) {
// Note: order of iteration matters here. Iteration over m is innermost
// because we want to reuse the same packed rhs in consecutive tasks
// (rhs fits into L2$ while lhs only into L3$).
@@ -486,8 +662,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index m1 = m * gm_; m1 < mend; m1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
TensorContractionKernel::invoke(
- output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
+ output_mapper,
+ packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+ packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+ bk(k), bn(n1), Scalar(1));
// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
@@ -501,8 +679,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index n1 = n * gn_; n1 < nend; n1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
TensorContractionKernel::invoke(
- output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
+ output_mapper,
+ packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+ packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+ bk(k), bn(n1), Scalar(1));
// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
@@ -511,7 +691,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
}
}
- signal_kernel(m, n, k + 1, false);
+ signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
signal_switch(k + 2);
}
@@ -524,16 +704,23 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
enqueue_packing(k, shard_by_col_);
}
- void signal_kernel(Index m, Index n, Index k, bool sync) {
+ void signal_kernel(Index m, Index n, Index k, bool sync,
+ bool use_thread_local) {
std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
Index s = state->load();
eigen_assert(s > 0);
- if (s != 1 && state->fetch_sub(1) != 1) return;
+ if (s != 1 && state->fetch_sub(1) != 1) {
+ eigen_assert(!use_thread_local);
+ return;
+ }
state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
- if (sync)
- kernel(m, n, k);
- else
- device_.enqueueNoNotification([=]() { kernel(m, n, k); });
+ if (sync) {
+ kernel(m, n, k, use_thread_local);
+ } else {
+ eigen_assert(!use_thread_local);
+ device_.enqueueNoNotification(
+ [=]() { kernel(m, n, k, use_thread_local); });
+ }
}
void signal_switch(Index k, Index v = 1) {
@@ -589,7 +776,26 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
[=]() { enqueue_packing_helper(mid, end, k, rhs); });
end = mid;
}
- enqueue_packing_helper(start, end, k, rhs);
+
+ // Decide if we want to run the first packing task (start == 0) in
+ // async mode when we parallelize only by the sharding dim:
+ // (1) pack_lhs and pack_rhs call signal_switch before completing
+ //     all calls to signal_kernel, which in sync mode might lead
+ //     to the execution of the first kernel of the k+1 slice before
+ //     completing a call to the last kernel of the k slice.
+ // (2) all pack tasks for the sharded dim must be executed in a thread
+ //     pool.
+ bool pack_async =
+ (start == 0) &&
+ (parallelize_by_sharding_dim_only_ && shard_by_col_ == rhs) &&
+ (k > 0 || device_.currentThreadId() < 0);
+
+ if (pack_async) {
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(start, end, k, rhs); });
+ } else {
+ enqueue_packing_helper(start, end, k, rhs);
+ }
}
}
@@ -756,6 +962,36 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
}
+ template <int Alignment>
+ EIGEN_STRONG_INLINE void addAllToBuffer(size_t n, const Scalar* src_buf0,
+ const Scalar* src_buf1,
+ const Scalar* src_buf2,
+ Scalar* dst_buf) const {
+ using ::Eigen::internal::padd;
+ using ::Eigen::internal::pload;
+ using ::Eigen::internal::ploadt;
+ using ::Eigen::internal::pstoret;
+
+ const int output_packet_size =
+ internal::unpacket_traits<PacketReturnType>::size;
+
+ size_t i = 0;
+ const size_t num_packets = n / output_packet_size;
+ for (; i < output_packet_size * num_packets; i += output_packet_size) {
+ const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
+ const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
+ const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
+
+ const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
+ const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
+
+ pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
+ }
+ for (; i < n; ++i) {
+ dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
+ }
+ }
+
// Decide whether we want to shard m x k x n contraction over the inner
// (contraction) dimension (k).
static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
@@ -788,48 +1024,147 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index m = this->m_i_size;
const Index n = this->m_j_size;
const Index k = this->m_k_size;
- // The underlying GEMM kernel assumes that k is a multiple of 8 and
- // subtle breakage occurs if this is violated.
- Index block_size = 8 * divup<Index>(k, 8 * num_threads);
- Index num_blocks = divup<Index>(k, block_size);
- // we use 'result' for the first block's partial result.
- MaxSizeVector<Scalar*> block_buffers(num_blocks - 1);
- Barrier barrier(internal::convert_index<int>(num_blocks));
- auto process_block = [=, &barrier](Scalar* buf, Index begin, Index end) {
- ::memset(buf, 0, m * n * sizeof(Scalar));
+
+ // We will compute partial results into the buffers of this size.
+ const Index buffer_size_bytes = m * n * sizeof(Scalar);
+
+ // The underlying GEMM kernel assumes that k is a multiple of
+ // the packet size and subtle breakage occurs if this is violated.
+ const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+ const auto round_up = [=](Index index) -> Index {
+ const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+ return divup<Index>(index, kmultiple) * kmultiple;
+ };
+
+ // The cost model doesn't capture well the cost associated with constructing
+ // tensor contraction mappers and computing loop bounds in gemm_pack_lhs and
+ // gemm_pack_rhs, so we specify a minimum desired block size.
+ const Index target_block_size = round_up(divup<Index>(k, num_threads));
+ const Index desired_min_block_size = 12 * packet_size;
+
+ const Index block_size = numext::mini<Index>(
+ k, numext::maxi<Index>(desired_min_block_size, target_block_size));
+ const Index num_blocks = divup<Index>(k, block_size);
+
+ // Compute the block size, accounting for a potentially incomplete last block.
+ const auto actual_block_size = [=](Index block_idx) -> Index {
+ return block_idx + 1 < num_blocks
+ ? block_size
+ : k + block_size - block_size * num_blocks;
+ };
+
+ // We compute partial gemm results in parallel, and to get the final result
+ // we need to add them all together. For a large number of threads (>= 48)
+ // this adds a very expensive sequential step at the end.
+ //
+ // We split [0, num_blocks) into small ranges, and when a task for a
+ // block finishes its partial gemm computation, it checks whether it was the
+ // last gemm in its range, and if so, it adds up all blocks of the range.
+ //
+ // After all tasks finish, we only need to add these pre-aggregated blocks.
+
+ // Compute the range size, accounting for a potentially incomplete last range.
+ const auto actual_range_size = [=](Index num_ranges, Index range_size,
+ Index range_idx) -> Index {
+ eigen_assert(range_idx < num_ranges);
+ return range_idx + 1 < num_ranges
+ ? range_size
+ : num_blocks + range_size - range_size * num_ranges;
+ };
+
+ // For now we use just a single level of ranges to compute pre-aggregated
+ // partial sums, but in general we can use more layers to compute tree
+ // aggregation in parallel and reduce the size of the sequential step.
+ //
+ // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+ // sense only if number of threads >= ~128?
+ static const Index l0_size = 4;
+ const Index l0_ranges = divup<Index>(num_blocks, l0_size);
+
+ // Keep count of pending gemm tasks for each l0 range.
+ MaxSizeVector<std::atomic<int>> l0_state(l0_ranges);
+ for (int i = 0; i < l0_ranges; ++i) {
+ const Index num_pending_tasks = actual_range_size(l0_ranges, l0_size, i);
+ l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+ }
+
+ MaxSizeVector<Scalar*> block_buffers(num_blocks);
+
+ auto process_block = [&, this](Index block_idx, Index begin, Index end) {
+ Scalar* buf = block_buffers[block_idx];
+ ::memset(buf, 0, buffer_size_bytes);
+
TENSOR_CONTRACTION_DISPATCH(
this->template evalGemmPartialWithoutOutputKernel, Alignment,
- (buf, begin, end, this->m_device.numThreads()));
- barrier.Notify();
- };
- Index start = 0;
- for (Index blocks_left = num_blocks; blocks_left > 0; --blocks_left) {
- // The underlying GEMM kernel assumes that k is a multiple of packet size
- // (currently largest packet size is 8) and subtle breakage occurs if
- // this is violated.
- block_size = 8 * divup<Index>(k - start, 8 * blocks_left);
- Scalar* buf;
- if (start == 0) {
- buf = result;
- } else {
- buf = static_cast<Scalar*>(
- this->m_device.allocate(m * n * sizeof(Scalar)));
- block_buffers.push_back(buf);
- }
- Index end = start + block_size;
- if (end > k) {
- end = k;
+ (buf, begin, end,
+ /*num_threads=*/internal::convert_index<int>(num_blocks)));
+
+ // Check if it was the last task in l0 range.
+ const Index l0_index = block_idx / l0_size;
+ const int v = l0_state[l0_index].fetch_sub(1);
+ eigen_assert(v >= 1);
+
+ // If we processed the last block of the range, we can aggregate all
+ // partial results into the first block of the range.
+ if (v == 1) {
+ const Index rng_size = actual_range_size(l0_ranges, l0_size, l0_index);
+ const Index dst_block_idx = l0_index * l0_size;
+
+ if (rng_size == l0_size) {
+ addAllToBuffer<Alignment>(
+ m * n,
+ /*src_buf0=*/block_buffers[dst_block_idx + 1],
+ /*src_buf1=*/block_buffers[dst_block_idx + 2],
+ /*src_buf2=*/block_buffers[dst_block_idx + 3],
+ /*dst_buf= */ block_buffers[dst_block_idx]);
+ } else {
+ // Aggregate blocks of potentially incomplete last range.
+ for (int i = 1; i < rng_size; ++i) {
+ addToBuffer<Alignment>(m * n,
+ /*src_buf=*/block_buffers[dst_block_idx + i],
+ /*dst_buf=*/block_buffers[dst_block_idx]);
+ }
+ }
}
- this->m_device.enqueueNoNotification(
- [=, &process_block]() { process_block(buf, start, end); });
- start = end;
+ };
+
+ Barrier barrier(internal::convert_index<int>(num_blocks));
+ for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+ Scalar* buf = block_idx == 0
+ ? result
+ : static_cast<Scalar*>(
+ this->m_device.allocate(buffer_size_bytes));
+ block_buffers.push_back(buf);
+
+ Index block_start = block_idx * block_size;
+ Index block_end = block_start + actual_block_size(block_idx);
+
+ this->m_device.enqueueNoNotification([=, &barrier, &process_block]() {
+ process_block(block_idx, block_start, block_end);
+ barrier.Notify();
+ });
}
barrier.Wait();
- // Add other partial results into first partial result.
- for (const auto& buf : block_buffers) {
- addToBuffer<Alignment>(m * n, buf, result);
- this->m_device.deallocate(buf);
+ // Aggregate partial sums from l0 ranges.
+ Index l0_index = 1;
+ for (; l0_index + 2 < l0_ranges; l0_index += 3) {
+ addAllToBuffer<Alignment>(
+ m * n,
+ /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
+ /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
+ /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
+ /*dst_buf= */block_buffers[0]);
+ }
+ for (; l0_index < l0_ranges; ++l0_index) {
+ addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
+ block_buffers[0]);
+ }
+
+ // Don't forget to deallocate ALL temporary buffers.
+ for (Index i = 1; i < num_blocks; ++i) {
+ this->m_device.deallocate(block_buffers[i]);
}
// Finally call output kernel with finalized output buffer.
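
Note: the sharded-by-k aggregation introduced above groups the per-block partial results into l0 ranges of 4; the worker that finishes the last block of a range folds the whole range into its first block, and the main thread then only has to sum the range heads. A plain single-threaded, double-valued sketch of the same two-level reduction, independent of Eigen's packets and thread pool:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    static void add_to(std::vector<double>& dst, const std::vector<double>& src) {
      for (std::size_t i = 0; i < dst.size(); ++i) dst[i] += src[i];
    }

    void aggregate_blocks(std::vector<std::vector<double> >& blocks) {
      const long num_blocks = static_cast<long>(blocks.size());
      const long l0_size = 4;
      const long l0_ranges = (num_blocks + l0_size - 1) / l0_size;

      // Step 1: fold each l0 range into its first block (in the real code this
      // is done by whichever gemm task finishes last within the range).
      for (long r = 0; r < l0_ranges; ++r) {
        const long begin = r * l0_size;
        const long end = std::min(begin + l0_size, num_blocks);
        for (long b = begin + 1; b < end; ++b) add_to(blocks[begin], blocks[b]);
      }

      // Step 2: sequentially fold the range heads into block 0, which holds
      // the final result.
      for (long r = 1; r < l0_ranges; ++r) add_to(blocks[0], blocks[r * l0_size]);
    }
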
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 1f613d3c7..938fd0f34 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -32,7 +32,7 @@ struct traits<TensorConversionOp<TargetType, XprType> >
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = traits<XprType>::Layout;
enum { Flags = 0 };
- typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
+ typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
};
template<typename TargetType, typename XprType>
@@ -177,6 +177,81 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eva
}
};
+namespace internal {
+
+template <typename SrcType, typename TargetType, bool IsSameT>
+struct CoeffConv {
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ internal::scalar_cast_op<SrcType, TargetType> converter;
+ return converter(impl.coeff(index));
+ }
+};
+
+template <typename SrcType, typename TargetType>
+struct CoeffConv<SrcType, TargetType, true> {
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ return impl.coeff(index);
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
+struct PacketConv {
+ typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+ typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+ static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ internal::scalar_cast_op<SrcType, TargetType> converter;
+ EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = converter(impl.coeff(index+i));
+ }
+ TargetPacket rslt = internal::pload<TargetPacket>(values);
+ return rslt;
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
+ typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+ typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
+ SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+ return converter.template packet<LoadMode>(index);
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
+ typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+ static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
+ return internal::pload<TargetPacket>(values);
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ return impl.template packet<LoadMode>(index);
+ }
+};
+
+} // namespace internal
// Eval as rvalue
template<typename TargetType, typename ArgType, typename Device>
@@ -191,6 +266,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename PacketType<SrcType, Device>::type PacketSourceType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
enum {
IsAligned = false,
@@ -210,7 +286,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
{
- return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
+ return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
@@ -220,16 +296,23 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- internal::scalar_cast_op<SrcType, TargetType> converter;
- return converter(m_impl.coeff(index));
+ return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
}
template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
- internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
- return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
+ packet(Index index) const {
+ // If we are not going to do the cast, we just need to check that the base
+ // TensorEvaluator has packet access. Otherwise we also need to make sure
+ // that we have an implementation of the vectorized cast.
+ const bool Vectorizable =
+ IsSameType
+ ? TensorEvaluator<ArgType, Device>::PacketAccess
+ : TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
+
+ return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
+ Vectorizable, IsSameType>::run(m_impl, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -252,31 +335,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- protected:
- template <int LoadMode, bool ActuallyVectorize>
- struct PacketConv {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- internal::scalar_cast_op<SrcType, TargetType> converter;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = converter(impl.coeff(index+i));
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- };
-
- template <int LoadMode>
- struct PacketConv<LoadMode, true> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
- const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
- PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
- SrcCoeffRatio, TgtCoeffRatio> converter(impl);
- return converter.template packet<LoadMode>(index);
- }
- };
-
+ protected:
TensorEvaluator<ArgType, Device> m_impl;
};
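
Note: the refactoring above hoists the conversion helpers into internal::CoeffConv / internal::PacketConv and adds an IsSameT specialization so that a same-type "cast" forwards values untouched. A minimal sketch of that dispatch pattern (names here are illustrative, not the Eigen internals):

    #include <type_traits>

    template <typename Src, typename Tgt,
              bool IsSame = std::is_same<Src, Tgt>::value>
    struct CoeffConvSketch {
      static Tgt run(Src v) { return static_cast<Tgt>(v); }  // real conversion
    };

    template <typename Src, typename Tgt>
    struct CoeffConvSketch<Src, Tgt, true> {
      static Tgt run(Src v) { return v; }  // same type: no-op forwarding
    };

    // CoeffConvSketch<float, double>::run(1.0f);  // converts
    // CoeffConvSketch<float, float>::run(1.0f);   // hits the no-op specialization
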
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index bb330a77b..b43db40c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -87,13 +87,13 @@ struct ThreadPoolDevice {
const size_t kMinBlockSize = 32768;
typedef TensorCostModel<ThreadPoolDevice> CostModel;
const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
- if (n <= kMinBlockSize || num_threads == 1) {
+ if (n <= kMinBlockSize || num_threads < 2) {
::memcpy(dst, src, n);
} else {
const char* src_ptr = static_cast<const char*>(src);
char* dst_ptr = static_cast<char*>(dst);
const size_t blocksize = (n + (num_threads - 1)) / num_threads;
- Barrier barrier(num_threads - 1);
+ Barrier barrier(static_cast<int>(num_threads - 1));
// Launch the last 3 blocks on worker threads.
for (size_t i = 1; i < num_threads; ++i) {
enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
@@ -122,6 +122,12 @@ struct ThreadPoolDevice {
return num_threads_;
}
+ // Number of threads available in the underlying thread pool. This number can
+ // be different from the value returned by numThreads().
+ EIGEN_STRONG_INLINE int numThreadsInPool() const {
+ return pool_->NumThreads();
+ }
+
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
return l1CacheSize();
}
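
Note: the parallel memcpy above splits the copy into num_threads blocks, runs all but the first on the pool behind a Barrier, copies block 0 on the calling thread, then waits. A standalone sketch with std::thread standing in for Eigen's thread pool and Barrier (thresholds follow the diff; everything else is illustrative):

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <thread>
    #include <vector>

    void parallel_memcpy(void* dst, const void* src, std::size_t n,
                         std::size_t num_threads) {
      const std::size_t kMinBlockSize = 32768;
      if (n <= kMinBlockSize || num_threads < 2) {
        std::memcpy(dst, src, n);
        return;
      }
      const char* src_ptr = static_cast<const char*>(src);
      char* dst_ptr = static_cast<char*>(dst);
      const std::size_t blocksize = (n + (num_threads - 1)) / num_threads;

      std::vector<std::thread> workers;
      for (std::size_t i = 1; i < num_threads; ++i) {  // blocks 1..N-1 on workers
        const std::size_t begin = i * blocksize;
        if (begin >= n) break;                         // nothing left to copy
        const std::size_t size = std::min(blocksize, n - begin);
        workers.emplace_back([=] {
          std::memcpy(dst_ptr + begin, src_ptr + begin, size);
        });
      }
      std::memcpy(dst_ptr, src_ptr, blocksize);        // block 0 on the caller
      for (std::thread& t : workers) t.join();         // stands in for Barrier::Wait
    }
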
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 1c44541bd..e2ff11129 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -325,7 +325,6 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
static void run(const Expression& expr, const GpuDevice& device);
};
-
#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 78068be35..74b905329 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -90,14 +90,21 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = true,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
+ IsAligned = true,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = true
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = true
};
+ typedef typename internal::TensorBlock<
+ CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockReader<
+ CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+ TensorBlockReader;
+
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
/// op_ is used for sycl
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
@@ -139,6 +146,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>*) const {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+ assert(m_buffer != NULL);
+ TensorBlockReader::Run(block, m_buffer);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
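
Note: with the expression force-evaluated into m_buffer, the new block() path amounts to a strided copy out of a dense buffer. A 2D column-major illustration of what the internal TensorBlockReader does here (the real reader handles arbitrary rank and both layouts):

    // Copy a (rows x cols) block starting at (r0, c0) out of a column-major
    // buffer with leading dimension ld into a dense destination.
    void read_block(const float* buffer, long ld, long r0, long c0,
                    long rows, long cols, float* dst) {
      for (long c = 0; c < cols; ++c)
        for (long r = 0; r < rows; ++r)
          dst[c * rows + r] = buffer[(c0 + c) * ld + (r0 + r)];
    }
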
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index ac66f9cf1..204a6fd33 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -89,17 +89,22 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = true,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
+ typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+ typedef internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
+ TensorBlock;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_generator(op.generator())
+ : m_device(device), m_generator(op.generator())
#ifdef EIGEN_USE_SYCL
, m_argImpl(op.expression(), device)
#endif
@@ -111,11 +116,13 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
m_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+ if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
}
} else {
m_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+ if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
}
}
}
@@ -150,6 +157,75 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.firstLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+ }
+
+ struct BlockIteratorState {
+ Index stride;
+ Index span;
+ Index size;
+ Index count;
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ TensorBlock* output_block) const {
+ if (NumDims <= 0) return;
+
+ static const bool is_col_major =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ // Compute spatial coordinates for the first block element.
+ array<Index, NumDims> coords;
+ extract_coordinates(output_block->first_coeff_index(), coords);
+ array<Index, NumDims> initial_coords = coords;
+
+ CoeffReturnType* data = output_block->data();
+ Index offset = 0;
+
+ // Initialize the output block iterator state. Dimensions in this array are
+ // always in innermost -> outermost order (col major layout).
+ array<BlockIteratorState, NumDims> it;
+ for (Index i = 0; i < NumDims; ++i) {
+ const Index dim = is_col_major ? i : NumDims - 1 - i;
+ it[i].size = output_block->block_sizes()[dim];
+ it[i].stride = output_block->block_strides()[dim];
+ it[i].span = it[i].stride * (it[i].size - 1);
+ it[i].count = 0;
+ }
+ eigen_assert(it[0].stride == 1);
+
+ while (it[NumDims - 1].count < it[NumDims - 1].size) {
+ // Generate data for the inner-most dimension.
+ for (Index i = 0; i < it[0].size; ++i) {
+ *(data + offset + i) = m_generator(coords);
+ coords[is_col_major ? 0 : NumDims - 1]++;
+ }
+ coords[is_col_major ? 0 : NumDims - 1] =
+ initial_coords[is_col_major ? 0 : NumDims - 1];
+
+ // For a 1d tensor we only need to generate the single inner-most dimension.
+ if (NumDims == 1) break;
+
+ // Update offset.
+ for (Index i = 1; i < NumDims; ++i) {
+ if (++it[i].count < it[i].size) {
+ offset += it[i].stride;
+ coords[is_col_major ? i : NumDims - 1 - i]++;
+ break;
+ }
+ if (i != NumDims - 1) it[i].count = 0;
+ coords[is_col_major ? i : NumDims - 1 - i] =
+ initial_coords[is_col_major ? i : NumDims - 1 - i];
+ offset -= it[i].span;
+ }
+ }
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool) const {
// TODO(rmlarsen): This is just a placeholder. Define interface to make
@@ -170,14 +246,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_strides[i];
+ const Index idx = index / m_fast_strides[i];
index -= idx * m_strides[i];
coords[i] = idx;
}
coords[0] = index;
} else {
for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_strides[i];
+ const Index idx = index / m_fast_strides[i];
index -= idx * m_strides[i];
coords[i] = idx;
}
@@ -185,8 +261,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
}
}
+ const Device& m_device;
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
+ array<IndexDivisor, NumDims> m_fast_strides;
Generator m_generator;
#ifdef EIGEN_USE_SYCL
TensorEvaluator<ArgType, Device> m_argImpl;
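
Note: the generator evaluator now caches TensorIntDivisor objects (m_fast_strides) so the per-coefficient divisions in extract_coordinates become cheap multiply/shift sequences. A standalone sketch of the column-major coordinate extraction itself, with a plain division where the real code now uses the fast divisor:

    #include <array>

    template <int NumDims>
    std::array<long, NumDims> extract_coords_colmajor(
        long index, const std::array<long, NumDims>& strides) {
      std::array<long, NumDims> coords = {};
      for (int i = NumDims - 1; i > 0; --i) {
        const long idx = index / strides[i];  // TensorIntDivisor in the real code
        index -= idx * strides[i];
        coords[i] = idx;
      }
      coords[0] = index;
      return coords;
    }

    // For a 2x3x4 column-major tensor, strides = {1, 2, 6} and
    // extract_coords_colmajor<3>(17, {1, 2, 6}) yields {1, 2, 2},
    // since 1 + 2*2 + 2*6 == 17.
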
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 50fa0cb2e..bb63433fe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -402,25 +402,25 @@ struct OuterReducer {
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
-template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+template <int B, int N, typename S, typename R, typename I_>
+__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
-template <typename S, typename R, typename I>
-__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
-template <int NPT, typename S, typename R, typename I>
-__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+template <typename S, typename R, typename I_>
+__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*);
+template <int B, int N, typename S, typename R, typename I_>
+__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, half2*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
-template <int NPT, typename S, typename R, typename I>
-__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
-template <int NPT, typename S, typename R, typename I>
-__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
template <typename Self, typename Op,
@@ -1114,15 +1114,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+ template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
- template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+ template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*);
+ template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, half2*);
+ template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+ template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+ template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
#if defined(EIGEN_USE_SYCL)