diff options
author | Eugene Zhulenev <ezhulenev@google.com> | 2018-09-27 11:49:19 -0700 |
---|---|---|
committer | Eugene Zhulenev <ezhulenev@google.com> | 2018-09-27 11:49:19 -0700 |
commit | 9f4988959f1b0394ee027f474f49916543ad2f3c (patch) | |
tree | 3f8921c64e345014475df7bfa828f636c16bdda7 /unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | |
parent | b314376f9c6d69208b437ae59b412aa57aefd2ec (diff) |
Remove explicit mkldnn support and redundant TensorContractionKernelBlocking
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 183 |
1 file changed, 4 insertions, 179 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 0c4d2f0bf..8a464b073 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -15,177 +15,6 @@ namespace Eigen { -namespace internal { - -// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in -// ColMajor storage order. This property is guaranteed by the -// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack -// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix -// multiplication for these blocks. Default tensor contraction uses -// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see -// GeneralBlocPanelKernel.h for details). -// -// By specializing contraction kernels we can use other low level libraries to -// perform matrix multiplication, and still rely on Eigen thread pool evaluator -// for scaling. Assumption is that custom gemm do not use it's own threading for -// parallelisation. -// -// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of -// multiplication, lhs tensor and rhs tensor respectively. -// -// - StorageIndex - index type for the tensor expressions. In practice almost -// always is Eigen::Index. -// -// - OutputMapper provides access to the memory of the output matrix. In -// practice it's always column major blas_data_mapper (it must be of ResScalar -// type). -// -// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional -// view into the Lhs/Rhs tensor expressions. In practice it's -// TensorContractionInputMapper, or some specialization of it based on the -// type of tensor expression (e.g. TensorImagePatchOp has optimized input -// mapper). -// -// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction -// evaluator. 
-template<typename ResScalar, typename LhsScalar, typename RhsScalar, - typename StorageIndex, typename OutputMapper, typename LhsMapper, - typename RhsMapper> -struct TensorContractionKernel { - typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; - - typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex, - typename LhsMapper::SubMapper, - Traits::mr, Traits::LhsProgress, - typename Traits::LhsPacket4Packing, ColMajor> - LhsPacker; - - typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex, - typename RhsMapper::SubMapper, Traits::nr, - ColMajor> - RhsPacker; - - typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex, - OutputMapper, Traits::mr, Traits::nr, - /*ConjugateLhs*/ false, /*ConjugateRhs*/ false> - GebpKernel; - - EIGEN_DONT_INLINE - static void packLhs(LhsScalar* lhsBlock, - const typename LhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex rows) { - LhsPacker()(lhsBlock, data_mapper, depth, rows); - } - - EIGEN_DONT_INLINE - static void packRhs(RhsScalar* rhsBlock, - const typename RhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex cols) { - RhsPacker()(rhsBlock, data_mapper, depth, cols); - } - - EIGEN_DONT_INLINE - static void invoke(const OutputMapper& output_mapper, - const LhsScalar* lhsBlock, const RhsScalar* rhsBlock, - const StorageIndex rows, const StorageIndex depth, - const StorageIndex cols, const ResScalar alpha) { - GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, - /*strideA*/ -1, /*strideB*/ -1, - /*offsetA*/ 0, /*offsetB*/ 0); - } -}; - -// Some tensor contraction kernels might rely on the gemm libraries that are -// optimized for a specific dimension sizes. By default Eigen picks block -// sizes to fit the working set in the L1/L2 caches, by specializing we can -// refine this choice and round up these sizes to work well with underlying gemm -// library. 
-// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate? -template<typename ResScalar, typename LhsScalar, typename RhsScalar, - typename StorageIndex> -struct TensorContractionKernelBlocking { - static void refine(const StorageIndex /*m*/, - const StorageIndex /*n*/, - const StorageIndex /*k*/, - StorageIndex* /*bm*/, - StorageIndex* /*bn*/, - StorageIndex* /*bk*/) { - // By default we do nothing and stick to the block sizes picked by Eigen. - } -}; - -#if defined(EIGEN_USE_MKLDNN) -// If all scalar types in tensor contraction are floats, we can use mkldnn gemm -// as our low level kernel. -template<typename StorageIndex, typename OutputMapper, typename LhsMapper, - typename RhsMapper> -struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper, - LhsMapper, RhsMapper> { - // For now mkldnn has only mkldnn_sgemm (gemm for floats). - typedef float Scalar; - - typedef typename internal::gebp_traits<Scalar, Scalar> Traits; - - typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex, - typename LhsMapper::SubMapper, ColMajor> - LhsPacker; - - typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex, - typename RhsMapper::SubMapper, ColMajor> - RhsPacker; - - typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper> - GemmKernel; - - EIGEN_DONT_INLINE - static void packLhs(Scalar* lhsBlock, - const typename LhsMapper::SubMapper& data_mapper, - StorageIndex depth, StorageIndex rows) { - LhsPacker()(lhsBlock, data_mapper, rows, depth); - } - - EIGEN_DONT_INLINE - static void packRhs(Scalar* rhsBlock, - const typename RhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex cols) { - RhsPacker()(rhsBlock, data_mapper, depth, cols); - } - - EIGEN_DONT_INLINE - static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock, - const Scalar* rhsBlock, const StorageIndex rows, - const StorageIndex depth, const StorageIndex cols, - const Scalar alpha) { - GemmKernel()(output_mapper, 
lhsBlock, rhsBlock, rows, depth, cols, alpha); - } -}; - -// For mkldnn_sgemm having the right dimensions (especially for small matrices) -// is more important than fitting all the working set in L1/L2 caches. -template<typename StorageIndex> -struct TensorContractionKernelBlocking<float, float, float, StorageIndex> { - // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest. - static const StorageIndex kUnrollM = 48; - // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the closest - // number that divides to both of them. - static const StorageIndex kUnrollN = 24; - - static void refine(const StorageIndex m, - const StorageIndex n, - const StorageIndex /*k*/, - StorageIndex* bm, - StorageIndex* bn, - StorageIndex* /*bk*/) { - // TODO(ezhulenev): There is probably a better way to pick block sizes. - *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM); - *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN); - // Stick with default bk. - } -}; - -#endif // EIGEN_USE_MKLDNN -} // namespace internal - template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType> struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> : public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > { @@ -295,14 +124,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // Again, we don't know number of threads yet, so we use 2. 
Index bm, bn, bk; if (shard_by_col) { - internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, + internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 2); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, + internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByRow> blocking(k, m, n, 2); bm = blocking.mc(); @@ -332,24 +161,20 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // Now that we know number of threads, recalculate sharding and blocking. shard_by_col = shardByCol(m, n, num_threads); if (shard_by_col) { - internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, + internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, num_threads); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, + internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByRow> blocking(k, m, n, num_threads); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } - // Refine blocking choice to work well with contraction kernel. - internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar, - Index>::refine(m, n, k, &bm, - &bn, &bk); // Number of kernels for each dimension. Index nm0 = divup(m, bm); |