Diffstat (limited to 'unsupported/Eigen/CXX11/src')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h            | 105
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h    |  44
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  |  47
3 files changed, 132 insertions, 64 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index ea17a897d..a4df45098 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -136,6 +136,81 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
};
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low-level libraries to
+// perform matrix multiplication, and still rely on the Eigen contraction
+// evaluator. This also includes full support in TensorContractionThreadPool,
+// assuming that the underlying gemm does not use its own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+// multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+//   almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always a column-major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two-dimensional
+//   view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has an optimized input
+//   mapper).
+template<typename ResScalar, typename LhsScalar, typename RhsScalar,
+ typename StorageIndex, typename OutputMapper, typename LhsMapper,
+ typename RhsMapper>
+struct TensorContractionKernel {
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+ typename LhsMapper::SubMapper,
+ Traits::mr, Traits::LhsProgress,
+ typename Traits::LhsPacket4Packing, ColMajor>
+ LhsPacker;
+
+ typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+ typename RhsMapper::SubMapper, Traits::nr,
+ ColMajor>
+ RhsPacker;
+
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+ OutputMapper, Traits::mr, Traits::nr,
+ /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+ GebpKernel;
+
+ EIGEN_DONT_INLINE
+ static void packLhs(LhsScalar* lhsBlock,
+ const typename LhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex rows) {
+ LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
+ }
+
+ EIGEN_DONT_INLINE
+ static void packRhs(RhsScalar* rhsBlock,
+ const typename RhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex cols) {
+ RhsPacker()(rhsBlock, data_mapper, depth, cols);
+ }
+
+ EIGEN_DONT_INLINE
+ static void invoke(const OutputMapper& output_mapper,
+ const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+ const StorageIndex rows, const StorageIndex depth,
+ const StorageIndex cols, const ResScalar alpha) {
+ GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+ /*strideA*/ -1, /*strideB*/ -1,
+ /*offsetA*/ 0, /*offsetB*/ 0);
+ }
+};
+
} // end namespace internal
// Tensor contraction params that should enable to get from output matrix
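The comment introduced above notes that specializing TensorContractionKernel lets another low-level library perform the block multiplication while the Eigen contraction evaluators (including the thread-pool one) stay unchanged. The following is a minimal, illustrative sketch (not part of this patch) of such a specialization for float that forwards the block product to an external CBLAS sgemm. It assumes a CBLAS implementation is linked, that the primary TensorContractionKernel template from this patch is in scope, and that the Lhs/Rhs sub-mappers and the output mapper expose operator()(row, col) element access; a production kernel would use the library's own packing instead of the naive copies shown here.

#include <cblas.h>   // assumption: some CBLAS implementation is available
#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>  // brings in the primary template

namespace Eigen {
namespace internal {

template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
          typename RhsMapper>
struct TensorContractionKernel<float, float, float, StorageIndex,
                               OutputMapper, LhsMapper, RhsMapper> {
  static void packLhs(float* lhsBlock,
                      const typename LhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex rows) {
    // Copy a column-major (rows x depth) block of the Lhs expression.
    for (StorageIndex k = 0; k < depth; ++k)
      for (StorageIndex i = 0; i < rows; ++i)
        lhsBlock[k * rows + i] = data_mapper(i, k);
  }

  static void packRhs(float* rhsBlock,
                      const typename RhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex cols) {
    // Copy a column-major (depth x cols) block of the Rhs expression.
    for (StorageIndex j = 0; j < cols; ++j)
      for (StorageIndex k = 0; k < depth; ++k)
        rhsBlock[j * depth + k] = data_mapper(k, j);
  }

  static void invoke(const OutputMapper& output_mapper, const float* lhsBlock,
                     const float* rhsBlock, const StorageIndex rows,
                     const StorageIndex depth, const StorageIndex cols,
                     const float alpha) {
    // tmp = alpha * A * B via the external gemm, then accumulate into the
    // output block to match the "C += alpha * A * B" semantics of gebp_kernel.
    std::vector<float> tmp(static_cast<size_t>(rows) * cols, 0.0f);
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                static_cast<int>(rows), static_cast<int>(cols),
                static_cast<int>(depth), alpha, lhsBlock,
                static_cast<int>(rows), rhsBlock, static_cast<int>(depth),
                0.0f, tmp.data(), static_cast<int>(rows));
    for (StorageIndex j = 0; j < cols; ++j)
      for (StorageIndex i = 0; i < rows; ++i)
        output_mapper(i, j) += tmp[static_cast<size_t>(j) * rows + i];
  }
};

}  // namespace internal
}  // namespace Eigen

With such a specialization in place, evalGemmPartial and the thread-pool contraction below pick up the custom kernel automatically, since both now route all packing and block multiplication through TensorContractionKernel.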
@@ -597,12 +672,11 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+ EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int /*num_threads*/) const {
// columns in left side, rows in right side
const Index k = this->m_k_size;
eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= k);
- const Index k_slice = k_end - k_start;
// rows in left side
const Index m = this->m_i_size;
@@ -610,13 +684,9 @@ struct TensorContractionEvaluatorBase
// columns in right side
const Index n = this->m_j_size;
- // define mr, nr, and all of my data mapper types
+ // define data mappers for Lhs and Rhs
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
- const Index nr = Traits::nr;
- const Index mr = Traits::mr;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@@ -638,11 +708,9 @@ struct TensorContractionEvaluatorBase
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- // Declare GEBP packing and kernel structs
- internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
- internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
- internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;
// initialize data mappers
LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@@ -654,7 +722,7 @@ struct TensorContractionEvaluatorBase
OutputMapper output(buffer, m);
// Sizes of the blocks to load in cache. See the Goto paper for details.
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k_slice, m, n, num_threads);
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());
@@ -670,19 +738,22 @@ struct TensorContractionEvaluatorBase
for (Index k2 = k_start; k2 < k_end; k2 += kc) {
// make sure we don't overshoot right edge of left matrix, then pack vertical panel
const Index actual_kc = numext::mini(k2 + kc, k) - k2;
- pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+ TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
+ actual_kc, actual_mc);
// series of horizontal blocks
for (Index j2 = 0; j2 < n; j2 += nc) {
// make sure we don't overshoot right edge of right matrix, then pack block
const Index actual_nc = numext::mini(j2 + nc, n) - j2;
- pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+ TensorContractionKernel::packRhs(blockB, rhs.getSubMapper(k2, j2),
+ actual_kc, actual_nc);
// call gebp (matrix kernel)
// The parameters here are copied from Eigen's GEMM implementation
const OutputMapper output_mapper = output.getSubMapper(i2, j2);
- gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc,
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(output_mapper, blockA, blockB,
+ actual_mc, actual_kc, actual_nc,
+ Scalar(1));
// We are done with this [i2, j2] output block.
if (k2 + kc >= k) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index cf281192c..71fd19774 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,7 +21,7 @@ enum {
// Default Blocking Strategy
-template <typename LhsScalar, typename RhsScalar, typename Index, int ShardingType=ShardByCol>
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
class TensorContractionBlocking {
public:
@@ -42,7 +42,7 @@ class TensorContractionBlocking {
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
- TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
@@ -53,23 +53,23 @@ class TensorContractionBlocking {
}
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
private:
- Index kc_;
- Index mc_;
- Index nc_;
+ StorageIndex kc_;
+ StorageIndex mc_;
+ StorageIndex nc_;
};
#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
+template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
class TensorXsmmContractionBlocking {
public:
- TensorXsmmContractionBlocking(Index k, Index m, Index n,
+ TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
size_t max_num_threads = 1, bool transposeA = false,
bool transposeB = false):
k_(k), m_(m), n_(n), transposeA_(transposeA),
@@ -164,28 +164,28 @@ class TensorXsmmContractionBlocking {
eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
}
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
- EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
- EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
- EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+ EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+ EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+ EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+ EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
+ EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
+ EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
- EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
- EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
- EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+ EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
+ EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
+ EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
return prefetch_;
}
private:
- Index k_, m_, n_;
- Index kc_, mc_, nc_;
- Index outer_k_, outer_m_, outer_n_;
+ StorageIndex k_, m_, n_;
+ StorageIndex kc_, mc_, nc_;
+ StorageIndex outer_k_, outer_m_, outer_n_;
bool copyA_, copyB_, transposeA_, transposeB_;
size_t num_threads_;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 70eec78fe..4553c3785 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -124,14 +124,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// Again, we don't know number of threads yet, so we use 2.
Index bm, bn, bk;
if (shard_by_col) {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByCol>
blocking(k, m, n, 2);
bm = blocking.mc();
bn = blocking.nc();
bk = blocking.kc();
} else {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByRow>
blocking(k, m, n, 2);
bm = blocking.mc();
@@ -169,14 +169,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// Now that we know number of threads, recalculate sharding and blocking.
shard_by_col = shardByCol(m, n, num_threads);
if (shard_by_col) {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByCol>
blocking(k, m, n, num_threads);
bm = blocking.mc();
bn = blocking.nc();
bk = blocking.kc();
} else {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByRow>
blocking(k, m, n, num_threads);
bm = blocking.mc();
@@ -250,17 +250,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
contract_t, internal::packet_traits<RhsScalar>::size,
rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
RhsMapper;
- typedef internal::gemm_pack_lhs<
- LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
- Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
- LhsPacker;
- typedef internal::gemm_pack_rhs<
- RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
- RhsPacker;
+
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
- Traits::mr, Traits::nr, false, false>
- GebpKernel;
+
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;
Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -442,8 +437,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void pack_lhs(Index m, Index k) {
const Index mend = m * gm_ + gm(m);
for (Index m1 = m * gm_; m1 < mend; m1++)
- LhsPacker()(packed_lhs_[k % (P - 1)][m1],
- lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+ TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+ lhs_.getSubMapper(m1 * bm_, k * bk_),
+ bk(k), bm(m1));
if (!parallel_pack_ && shard_by_col_) {
signal_packing(k);
@@ -466,8 +462,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// deadlocks.
memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
}
- RhsPacker()(packed_rhs_[k % (P - 1)][n1],
- rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+ TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+ rhs_.getSubMapper(k * bk_, n1 * bn_),
+ bk(k), bn(n1));
}
if (parallel_pack_ || shard_by_col_) {
@@ -488,9 +485,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index n1 = n * gn_; n1 < nend; n1++) {
for (Index m1 = m * gm_; m1 < mend; m1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
@@ -503,9 +500,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index m1 = m * gm_; m1 < mend; m1++)
for (Index n1 = n * gn_; n1 < nend; n1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
@@ -763,7 +760,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// (contraction) dimension (k).
static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
int num_threads_by_k) {
- size_t bufsize = m * n * sizeof(Scalar);
+ std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
bool shard_by_k = false;
if (n == 1 || // If mat*vec or...
num_threads_by_k < 2 || // running single threaded or...