From 71cd3fbd6a03991977fd3faf82109bf27b701d2d Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Wed, 26 Sep 2018 11:08:47 -0700
Subject: Support multiple contraction kernel types in
 TensorContractionThreadPool

---
 .../CXX11/src/Tensor/TensorContractionThreadPool.h | 212 +++++++++++++++++++--
 1 file changed, 192 insertions(+), 20 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 0980854b4..0c4d2f0bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,6 +15,177 @@ namespace Eigen {
 
+namespace internal {
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen thread pool
+// evaluator for scaling. The assumption is that custom gemm kernels do not
+// use their own threading for parallelisation.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+//   almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always a column major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
+//   dimensional view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has an optimized input
+//   mapper).
+//
+// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction
+// evaluator.
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel {
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+                                  typename LhsMapper::SubMapper, Traits::mr,
+                                  Traits::LhsProgress, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+                                false, false>
+      GebpKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(LhsScalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, depth, rows);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(RhsScalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper,
+                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+                     const StorageIndex rows, const StorageIndex depth,
+                     const StorageIndex cols, const ResScalar alpha) {
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ -1, /*strideB*/ -1,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+};
+
+// Some tensor contraction kernels might rely on gemm libraries that are
+// optimized for specific dimension sizes. By default Eigen picks block sizes
+// that fit the working set in the L1/L2 caches; by specializing this struct
+// we can refine that choice and round the sizes up to work well with the
+// underlying gemm library.
+// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex>
+struct TensorContractionKernelBlocking {
+  static void refine(const StorageIndex /*m*/,
+                     const StorageIndex /*n*/,
+                     const StorageIndex /*k*/,
+                     StorageIndex* /*bm*/,
+                     StorageIndex* /*bn*/,
+                     StorageIndex* /*bk*/) {
+    // By default we do nothing and stick to the block sizes picked by Eigen.
+  }
+};
+
+#if defined(EIGEN_USE_MKLDNN)
+// If all scalar types in tensor contraction are floats, we can use mkldnn gemm
+// as our low level kernel.
+template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel<float, float, float, StorageIndex,
+                               OutputMapper, LhsMapper, RhsMapper> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  typedef float Scalar;
+
+  typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename LhsMapper::SubMapper>
+      LhsPacker;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename RhsMapper::SubMapper>
+      RhsPacker;
+
+  typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
+      GemmKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(Scalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      StorageIndex depth, StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(Scalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
+                     const Scalar* rhsBlock, const StorageIndex rows,
+                     const StorageIndex depth, const StorageIndex cols,
+                     const Scalar alpha) {
+    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+  }
+};
+
+// For mkldnn_sgemm, having the right dimensions (especially for small
+// matrices) is more important than fitting the whole working set in the
+// L1/L2 caches.
+template <typename StorageIndex>
+struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
+  static const StorageIndex kUnrollM = 48;
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the smallest
+  // number that is divisible by both of them.
+  static const StorageIndex kUnrollN = 24;
+
+  static void refine(const StorageIndex m,
+                     const StorageIndex n,
+                     const StorageIndex /*k*/,
+                     StorageIndex* bm,
+                     StorageIndex* bn,
+                     StorageIndex* /*bk*/) {
+    // TODO(ezhulenev): There is probably a better way to pick block sizes.
+    *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
+    *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
+    // Stick with default bk.
+  }
+};
+
+#endif // EIGEN_USE_MKLDNN
+}  // namespace internal
+
 template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
 struct TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@ struct TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType,
 
+    internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
+                                              Index>::refine(m, n, k, &bm,
+                                                             &bn, &bk);
+
     // Number of kernels for each dimension.
     Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@ struct TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType,
         contract_t, internal::packet_traits<RhsScalar>::size,
         rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
         RhsMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
+
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
             Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@ struct TensorEvaluator
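
A note on the pack/invoke interface documented in the first hunk: the contract is easiest to see in a plain blocked GEMM loop. The sketch below is a minimal, single-threaded model of the per-block work the thread pool evaluator schedules; it uses raw column-major float buffers instead of Eigen's LhsMapper/RhsMapper/OutputMapper, copy-based packing instead of gemm_pack_lhs/gemm_pack_rhs, and the names NaiveKernel and run_blocked_gemm are illustrative, not part of the patch.

// Minimal single-threaded model of the packLhs/packRhs/invoke contract used by
// TensorContractionKernel. Everything here is a simplified stand-in: raw
// column-major float buffers instead of Eigen mappers, and copy-based packing.
#include <algorithm>
#include <vector>

struct NaiveKernel {
  // Pack a (rows x depth) block of a column-major lhs matrix with the given
  // column stride into a contiguous column-major buffer.
  static void packLhs(float* block, const float* lhs, int lhs_stride,
                      int depth, int rows) {
    for (int k = 0; k < depth; ++k)
      for (int i = 0; i < rows; ++i)
        block[k * rows + i] = lhs[k * lhs_stride + i];
  }

  // Pack a (depth x cols) block of a column-major rhs matrix.
  static void packRhs(float* block, const float* rhs, int rhs_stride,
                      int depth, int cols) {
    for (int j = 0; j < cols; ++j)
      for (int k = 0; k < depth; ++k)
        block[j * depth + k] = rhs[j * rhs_stride + k];
  }

  // out(rows x cols) += alpha * lhsBlock(rows x depth) * rhsBlock(depth x cols).
  static void invoke(float* out, int out_stride, const float* lhsBlock,
                     const float* rhsBlock, int rows, int depth, int cols,
                     float alpha) {
    for (int j = 0; j < cols; ++j)
      for (int k = 0; k < depth; ++k)
        for (int i = 0; i < rows; ++i)
          out[j * out_stride + i] +=
              alpha * lhsBlock[k * rows + i] * rhsBlock[j * depth + k];
  }
};

// Blocked driver: for every (bm x bk) lhs block and (bk x bn) rhs block, pack
// both buffers and call invoke. `out` must be zero-initialized (or hold the
// values to accumulate into) by the caller.
void run_blocked_gemm(const float* lhs, const float* rhs, float* out,
                      int m, int n, int k, int bm, int bn, int bk) {
  std::vector<float> lhs_block(bm * bk), rhs_block(bk * bn);
  for (int j0 = 0; j0 < n; j0 += bn)
    for (int k0 = 0; k0 < k; k0 += bk)
      for (int i0 = 0; i0 < m; i0 += bm) {
        const int rows = std::min(bm, m - i0);
        const int cols = std::min(bn, n - j0);
        const int depth = std::min(bk, k - k0);
        NaiveKernel::packLhs(lhs_block.data(), lhs + k0 * m + i0, m, depth, rows);
        NaiveKernel::packRhs(rhs_block.data(), rhs + j0 * k + k0, k, depth, cols);
        NaiveKernel::invoke(out + j0 * m + i0, m, lhs_block.data(),
                            rhs_block.data(), rows, depth, cols, /*alpha=*/1.0f);
      }
}

The real evaluator distributes these block loops as tasks on the thread pool and reuses packed blocks between tasks, but the per-block packLhs/packRhs/invoke sequence is the same, which is why swapping in a different kernel specialization (such as the MKL-DNN one above) does not change the scheduling logic.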
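A note on TensorContractionKernelBlocking::refine in the MKL-DNN case: it rounds the cache-driven block sizes up to multiples of the unroll factors (kUnrollM = 48, kUnrollN = 24) and caps them at the problem dimensions. A worked example with hypothetical cache-driven sizes bm = 80, bn = 30 for an m = n = 512 problem; divup here is a local stand-in for Eigen::divup:

#include <algorithm>
#include <cstdio>

// Local stand-in for Eigen::divup: ceiling division for positive integers.
int divup(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int kUnrollM = 48, kUnrollN = 24;  // unroll factors from the patch
  const int m = 512, n = 512;              // hypothetical problem sizes
  int bm = 80, bn = 30;                    // hypothetical cache-driven blocks
  bm = std::min(m, divup(bm, kUnrollM) * kUnrollM);  // divup(80, 48) = 2 -> 96
  bn = std::min(n, divup(bn, kUnrollN) * kUnrollN);  // divup(30, 24) = 2 -> 48
  std::printf("refined bm=%d bn=%d\n", bm, bn);      // prints "refined bm=96 bn=48"
}

For a small dimension, e.g. m = 40, the std::min cap brings bm back down to 40, so the refined block size never exceeds the problem size.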