Diffstat (limited to 'unsupported/Eigen/CXX11/src')
3 files changed, 132 insertions, 64 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index ea17a897d..a4df45098 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -136,6 +136,81 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
   static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
 };
 
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen contraction
+// evaluator. This also includes full support in TensorContractionThreadPool,
+// assuming that the underlying gemm does not use its own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+//   almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always a column major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
+//   dimensional view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has an optimized input
+//   mapper).
+template<typename ResScalar, typename LhsScalar, typename RhsScalar,
+         typename StorageIndex, typename OutputMapper, typename LhsMapper,
+         typename RhsMapper>
+struct TensorContractionKernel {
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+                                  typename LhsMapper::SubMapper,
+                                  Traits::mr, Traits::LhsProgress,
+                                  typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(LhsScalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(RhsScalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper,
+                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+                     const StorageIndex rows, const StorageIndex depth,
+                     const StorageIndex cols, const ResScalar alpha) {
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ -1, /*strideB*/ -1,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+};
+
 }  // end namespace internal
 
 // Tensor contraction params that should enable to get from output matrix
@@ -597,12 +672,11 @@ struct TensorContractionEvaluatorBase
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int /*num_threads*/) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
     eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= k);
-    const Index k_slice = k_end - k_start;
 
     // rows in left side
     const Index m = this->m_i_size;
@@ -610,13 +684,9 @@ struct TensorContractionEvaluatorBase
     // columns in right side
     const Index n = this->m_j_size;
 
-    // define mr, nr, and all of my data mapper types
+    // define data mappers for Lhs and Rhs
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-    const Index nr = Traits::nr;
-    const Index mr = Traits::mr;
 
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@@ -638,11 +708,9 @@ struct TensorContractionEvaluatorBase
 
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
-    // Declare GEBP packing and kernel structs
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
-    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
-    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     // initialize data mappers
     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@@ -654,7 +722,7 @@ struct TensorContractionEvaluatorBase
     OutputMapper output(buffer, m);
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k_slice, m, n, num_threads);
+    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
     const Index kc = blocking.kc();
     const Index mc = numext::mini(m, blocking.mc());
     const Index nc = numext::mini(n, blocking.nc());
@@ -670,19 +738,22 @@ struct TensorContractionEvaluatorBase
       for (Index k2 = k_start; k2 < k_end; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
         const Index actual_kc = numext::mini(k2 + kc, k) - k2;
-        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+        TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
+                                         actual_kc, actual_mc);
 
         // series of horizontal blocks
         for (Index j2 = 0; j2 < n; j2 += nc) {
           // make sure we don't overshoot right edge of right matrix, then pack block
           const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+          TensorContractionKernel::packRhs(blockB, rhs.getSubMapper(k2, j2),
+                                           actual_kc, actual_nc);
 
           // call gebp (matrix kernel)
           // The parameters here are copied from Eigen's GEMM implementation
           const OutputMapper output_mapper = output.getSubMapper(i2, j2);
-          gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc,
-               Scalar(1), -1, -1, 0, 0);
+          TensorContractionKernel::invoke(output_mapper, blockA, blockB,
+                                          actual_mc, actual_kc, actual_nc,
+                                          Scalar(1));
 
           // We are done with this [i2, j2] output block.
           if (k2 + kc >= k) {
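The comment block in TensorContraction.h above makes packLhs/packRhs/invoke the whole customization surface for plugging a different low level gemm into the contraction evaluators. As a rough, self-contained sketch of that pattern (not Eigen's implementation; ColMajorView and NaiveContractionKernel are hypothetical stand-ins for the mapper and kernel types named above), here packing is a verbatim block copy and invoke is a naive triple loop:

#include <cstdio>
#include <vector>

// Hypothetical stand-in for the Lhs/Rhs/Output mappers: a column-major view
// over contiguous memory.
struct ColMajorView {
  float* data;
  int stride;  // leading dimension
  float& operator()(int row, int col) const { return data[row + col * stride]; }
};

// Same three static entry points as TensorContractionKernel. "Packing" is a
// plain copy into a dense column-major block and invoke is a naive triple
// loop; a real specialization would route these to gebp_kernel or an external
// gemm library instead.
struct NaiveContractionKernel {
  static void packLhs(float* block, const ColMajorView& lhs,
                      int depth, int rows) {
    for (int k = 0; k < depth; ++k)
      for (int i = 0; i < rows; ++i) block[i + k * rows] = lhs(i, k);
  }
  static void packRhs(float* block, const ColMajorView& rhs,
                      int depth, int cols) {
    for (int j = 0; j < cols; ++j)
      for (int k = 0; k < depth; ++k) block[k + j * depth] = rhs(k, j);
  }
  static void invoke(const ColMajorView& out, const float* lhsBlock,
                     const float* rhsBlock, int rows, int depth, int cols,
                     float alpha) {
    for (int j = 0; j < cols; ++j)
      for (int i = 0; i < rows; ++i) {
        float sum = 0;
        for (int k = 0; k < depth; ++k)
          sum += lhsBlock[i + k * rows] * rhsBlock[k + j * depth];
        out(i, j) += alpha * sum;  // like gebp, accumulate into the output
      }
  }
};

int main() {
  // C(2x2) += A(2x3) * B(3x2), all column major.
  std::vector<float> a = {1, 4, 2, 5, 3, 6}, b = {7, 9, 11, 8, 10, 12}, c(4, 0);
  std::vector<float> packedA(6), packedB(6);
  NaiveContractionKernel::packLhs(packedA.data(), {a.data(), 2},
                                  /*depth=*/3, /*rows=*/2);
  NaiveContractionKernel::packRhs(packedB.data(), {b.data(), 3},
                                  /*depth=*/3, /*cols=*/2);
  NaiveContractionKernel::invoke({c.data(), 2}, packedA.data(), packedB.data(),
                                 2, 3, 2, /*alpha=*/1.0f);
  std::printf("%g %g\n%g %g\n", c[0], c[2], c[1], c[3]);  // 58 64 / 139 154
}

The thread pool evaluator in the third file drives the same three entry points, so a single specialization covers both the single-threaded and multi-threaded paths.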
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index cf281192c..71fd19774 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,7 +21,7 @@ enum {
 
 // Default Blocking Strategy
-template <typename LhsScalar, typename RhsScalar, typename Index, int ShardingType=ShardByCol>
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
 class TensorContractionBlocking {
  public:
@@ -42,7 +42,7 @@ class TensorContractionBlocking {
 #if !defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC
 #endif
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
       kc_(k), mc_(m), nc_(n)
   {
     if (ShardingType == ShardByCol) {
@@ -53,23 +53,23 @@ class TensorContractionBlocking {
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
 
  private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
 };
 
 #if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
+template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
 class TensorXsmmContractionBlocking {
  public:
-  TensorXsmmContractionBlocking(Index k, Index m, Index n,
+  TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
                                 size_t max_num_threads = 1, bool transposeA = false,
                                 bool transposeB = false):
     k_(k), m_(m), n_(n), transposeA_(transposeA),
@@ -164,28 +164,28 @@ class TensorXsmmContractionBlocking {
     eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
   }
 
-  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
-  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
-  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
   EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
   EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
   EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
   EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
   EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
-  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
   EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const { return prefetch_; }
 
  private:
-  Index k_, m_, n_;
-  Index kc_, mc_, nc_;
-  Index outer_k_, outer_m_, outer_n_;
+  StorageIndex k_, m_, n_;
+  StorageIndex kc_, mc_, nc_;
+  StorageIndex outer_k_, outer_m_, outer_n_;
   bool copyA_, copyB_, transposeA_, transposeB_;
   size_t num_threads_;
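The kc()/mc()/nc() accessors above feed the Goto-style loop nest of evalGemmPartial in the first file. A minimal sketch of that tiling structure follows, with hypothetical fixed block sizes in place of the heuristics Eigen computes, counting pack and kernel calls instead of doing arithmetic (the outer i2 loop over m is part of the real code but outside the hunk shown above):

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for TensorContractionBlocking: upstream derives
// kc/mc/nc from cache sizes; fixed values are used here for illustration.
struct Blocking {
  long kc() const { return 256; }  // depth of a packed panel
  long mc() const { return 64; }   // rows of a packed Lhs block
  long nc() const { return 128; }  // cols of a packed Rhs block
};

int main() {
  const long m = 100, n = 300, k = 500;
  Blocking blocking;
  const long kc = blocking.kc();
  const long mc = std::min(m, blocking.mc());
  const long nc = std::min(n, blocking.nc());

  // Same nesting as evalGemmPartial: pack an mc x kc panel of Lhs once per
  // (i2, k2), then sweep kc x nc blocks of Rhs across it.
  long lhs_packs = 0, rhs_packs = 0, kernel_calls = 0;
  for (long i2 = 0; i2 < m; i2 += mc)
    for (long k2 = 0; k2 < k; k2 += kc) {
      ++lhs_packs;  // packLhs(blockA, lhs.getSubMapper(i2, k2), ...)
      for (long j2 = 0; j2 < n; j2 += nc) {
        ++rhs_packs;     // packRhs(blockB, rhs.getSubMapper(k2, j2), ...)
        ++kernel_calls;  // invoke(output.getSubMapper(i2, j2), ...)
      }
    }
  // Prints: lhs packs: 4, rhs packs: 12, kernel calls: 12
  std::printf("lhs packs: %ld, rhs packs: %ld, kernel calls: %ld\n",
              lhs_packs, rhs_packs, kernel_calls);
}

Each packed Lhs panel is reused across every j2 step; the ShardByCol/ShardByRow parameter lets the thread pool evaluator pick block shapes suited to whichever dimension it splits across threads.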
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 70eec78fe..4553c3785 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -124,14 +124,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Again, we don't know number of threads yet, so we use 2.
     Index bm, bn, bk;
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, 2);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, 2);
       bm = blocking.mc();
@@ -169,14 +169,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Now that we know number of threads, recalculate sharding and blocking.
     shard_by_col = shardByCol(m, n, num_threads);
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
@@ -250,17 +250,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                           contract_t, internal::packet_traits<RhsScalar>::size,
                           rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
         RhsMapper;
-    typedef internal::gemm_pack_lhs<
-        LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
-        Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
+
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
             Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -442,8 +437,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     void pack_lhs(Index m, Index k) {
       const Index mend = m * gm_ + gm(m);
       for (Index m1 = m * gm_; m1 < mend; m1++)
-        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
-                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+                                         lhs_.getSubMapper(m1 * bm_, k * bk_),
+                                         bk(k), bm(m1));
 
       if (!parallel_pack_ && shard_by_col_) {
         signal_packing(k);
@@ -466,8 +462,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           // deadlocks.
           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
         }
-        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
-                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+                                         rhs_.getSubMapper(k * bk_, n1 * bn_),
+                                         bk(k), bn(n1));
       }
 
       if (parallel_pack_ || shard_by_col_) {
@@ -488,9 +485,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       for (Index n1 = n * gn_; n1 < nend; n1++) {
        for (Index m1 = m * gm_; m1 < mend; m1++) {
          const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-          GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                       packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                       Scalar(1), -1, -1, 0, 0);
+          TensorContractionKernel::invoke(
+              output_mapper, packed_lhs_[k % (P - 1)][m1],
+              packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
 
           // We are done with the last task for the [m1, n1] block.
           if (k + 1 == nk_) {
@@ -503,9 +500,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       for (Index m1 = m * gm_; m1 < mend; m1++)
         for (Index n1 = n * gn_; n1 < nend; n1++) {
           const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-          GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                       packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                       Scalar(1), -1, -1, 0, 0);
+          TensorContractionKernel::invoke(
+              output_mapper, packed_lhs_[k % (P - 1)][m1],
+              packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
 
           // We are done with the last task for the [m1, n1] block.
           if (k + 1 == nk_) {
@@ -763,7 +760,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   // (contraction) dimension (k).
   static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
                               int num_threads_by_k) {
-    size_t bufsize = m * n * sizeof(Scalar);
+    std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
     bool shard_by_k = false;
     if (n == 1 ||                // If mat*vec or...
         num_threads_by_k < 2 ||  // running single threaded or...
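One hunk that is easy to miss: shardByInnerDim now computes bufsize as std::ptrdiff_t rather than size_t. The diff does not spell out the motivation, but a plausible reading is the usual signed/unsigned hazard, since Index is signed and Eigen's cache-size helpers such as l2CacheSize() return std::ptrdiff_t. A small, self-contained illustration of the wrap (the negative dimension is made up for the example):

#include <cstddef>
#include <cstdio>

int main() {
  // A miscomputed (negative) dimension, purely for illustration.
  std::ptrdiff_t m = 3, n = -1;

  // sizeof() is size_t, so the whole product converts to unsigned and a
  // negative value wraps to a huge number that would pass "fits in buffer"
  // style checks.
  size_t wrapped = m * n * sizeof(float);

  // Keeping the arithmetic signed preserves the nonsensical value so that
  // later signed comparisons can reject it.
  std::ptrdiff_t kept = m * n * static_cast<std::ptrdiff_t>(sizeof(float));

  std::printf("size_t bufsize:    %zu\n", wrapped);  // huge on 64-bit
  std::printf("ptrdiff_t bufsize: %td\n", kept);     // -12
}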