Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r--   unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h      116
-rw-r--r--   unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  212
2 files changed, 308 insertions, 20 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
new file mode 100644
index 000000000..a97f043c1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+
+#if defined(EIGEN_USE_MKLDNN)
+// Support for the MklDnn sgemm kernel in Tensor contractions:
+//
+// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using a
+//    DataMapper (see TensorContractionInputMapper).
+// 2. Invoke the gemm kernel with the packed blocks (replacement for the
+//    default gebp_kernel).
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, typename StorageIndex, typename DataMapper,
+          int StorageOrder>
+struct mkldnn_gemm_pack;
+
+// mkldnn_gemm_pack for ColMajor storage order.
+template <typename Scalar, typename StorageIndex, typename DataMapper>
+struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
+                        /*StorageOrder*/ ColMajor> {
+  typedef typename internal::packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
+  enum { PacketSize = internal::packet_traits<Scalar>::size };
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar *block, const DataMapper &data_mapper,
+                  StorageIndex rows, StorageIndex cols) {
+    const StorageIndex unrolled_rows =
+        (rows / (4 * PacketSize)) * (4 * PacketSize);
+    const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
+
+    for (StorageIndex col = 0; col < cols; ++col) {
+      LinearMapper lm = data_mapper.getLinearMapper(0, col);
+
+      // Give the compiler a good opportunity to unroll the loop.
+      for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; ++j) {
+          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
+          internal::pstoreu(block + j * PacketSize, p);
+        }
+        block += 4 * PacketSize;
+      }
+
+      // Process the remaining rows with packet loads.
+      for (StorageIndex i = unrolled_rows; i < vectorized_rows;
+           i += PacketSize) {
+        const Packet p = lm.template loadPacket<Packet>(i);
+        internal::pstoreu(block, p);
+        block += PacketSize;
+      }
+
+      // Finalize with scalar coefficients.
+      for (StorageIndex i = vectorized_rows; i < rows; ++i) {
+        *block = lm(i);
+        ++block;
+      }
+    }
+  }
+};
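For reference, the vectorized packing above is equivalent to the following scalar-only sketch (illustration only, not part of the patch; pack_colmajor_sketch, src, and src_stride are hypothetical names, and the DataMapper is replaced by a plain strided column-major pointer):

// Scalar-only sketch of the packing loop above: copy a rows-by-cols
// column-major block from a strided source into a contiguous buffer,
// column by column.
template <typename Scalar, typename Index>
void pack_colmajor_sketch(Scalar* block, const Scalar* src, Index src_stride,
                          Index rows, Index cols) {
  for (Index col = 0; col < cols; ++col) {
    const Scalar* column = src + col * src_stride;
    for (Index i = 0; i < rows; ++i) *block++ = column[i];
  }
}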
+
+template <typename Scalar, typename StorageIndex, typename OutputMapper,
+          bool ConjugateLhs = false, bool ConjugateRhs = false>
+struct mkldnn_gemm_kernel;
+
+// mkldnn_gemm_kernel for floats, defined as a thin layer on top of
+// mkldnn_sgemm.
+template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
+          bool ConjugateRhs>
+struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
+                          ConjugateLhs, ConjugateRhs> {
+  EIGEN_DONT_INLINE
+  void operator()(const OutputMapper &output, const float *blockA,
+                  const float *blockB, const StorageIndex rows,
+                  const StorageIndex depth, const StorageIndex cols,
+                  float alpha) {
+    static const int max_index = (std::numeric_limits<int>::max)();
+
+    eigen_assert(max_index > rows);
+    eigen_assert(max_index > cols);
+    eigen_assert(max_index > depth);
+    eigen_assert(max_index > output.stride());
+
+    const int m = static_cast<int>(rows);
+    const int n = static_cast<int>(cols);
+    const int k = static_cast<int>(depth);
+
+    const char transposeA = ConjugateLhs ? 'Y' : 'N';
+    const char transposeB = ConjugateRhs ? 'Y' : 'N';
+
+    const int ldA = ConjugateLhs ? k : m;
+    const int ldB = ConjugateRhs ? n : k;
+    const int ldC = static_cast<int>(output.stride());
+
+    // beta == 1 accumulates the product into the output block.
+    const float beta = 1.0;
+
+    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
+                                      &alpha, blockA, &ldA, blockB, &ldB,
+                                      &beta, const_cast<float *>(output.data()),
+                                      &ldC);
+    eigen_assert(st == 0);
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+#endif  // EIGEN_USE_MKLDNN
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
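For context, here is a minimal standalone call following the same calling convention as the kernel above. This is a hedged sketch assuming the MKL-DNN v0.x C API shown in the patch, where all scalar parameters are passed by pointer and matrices are column-major; the values are illustrative:

// Computes C = alpha * A * B + beta * C, with beta = 1 accumulating into C.
#include <mkldnn.h>

void sgemm_example() {
  const int m = 2, n = 2, k = 3;
  const float A[m * k] = {1, 4, 2, 5, 3, 6};     // 2x3, column-major
  const float B[k * n] = {7, 9, 11, 8, 10, 12};  // 3x2, column-major
  float C[m * n] = {0};                          // 2x2 accumulator
  const char no_trans = 'N';
  const float alpha = 1.0f, beta = 1.0f;
  mkldnn_status_t st = mkldnn_sgemm(&no_trans, &no_trans, &m, &n, &k, &alpha,
                                    A, &m, B, &k, &beta, C, &m);
  // On success st == mkldnn_success and C holds {58, 139, 64, 154}
  // (i.e. [[58, 64], [139, 154]] in column-major layout).
  (void)st;
}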
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 0980854b4..0c4d2f0bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,6 +15,177 @@
 
 namespace Eigen {
 
+namespace internal {
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of the Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. The default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low-level libraries to
+// perform matrix multiplication, and still rely on the Eigen thread pool
+// evaluator for scaling. The assumption is that a custom gemm kernel does not
+// use its own threading for parallelization.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar types for the result of the
+//   multiplication, the lhs tensor and the rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+//   almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it is always a column-major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a
+//   two-dimensional view into the Lhs/Rhs tensor expressions. In practice it
+//   is TensorContractionInputMapper, or some specialization of it based on
+//   the type of the tensor expression (e.g. TensorImagePatchOp has an
+//   optimized input mapper).
+//
+// TODO(ezhulenev): Use TensorContractionKernel in the default tensor
+// contraction evaluator.
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel {
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+                                  typename LhsMapper::SubMapper,
+                                  Traits::mr, Traits::LhsProgress,
+                                  typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(LhsScalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, depth, rows);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(RhsScalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper,
+                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+                     const StorageIndex rows, const StorageIndex depth,
+                     const StorageIndex cols, const ResScalar alpha) {
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ -1, /*strideB*/ -1,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+};
+
+// Some tensor contraction kernels might rely on gemm libraries that are
+// optimized for specific dimension sizes. By default Eigen picks block sizes
+// to fit the working set in the L1/L2 caches; by specializing this class we
+// can refine that choice and round the sizes up to work well with the
+// underlying gemm library.
+// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex>
+struct TensorContractionKernelBlocking {
+  static void refine(const StorageIndex /*m*/,
+                     const StorageIndex /*n*/,
+                     const StorageIndex /*k*/,
+                     StorageIndex* /*bm*/,
+                     StorageIndex* /*bn*/,
+                     StorageIndex* /*bk*/) {
+    // By default we do nothing and stick to the block sizes picked by Eigen.
+  }
+};
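To make the contract concrete, here is a sketch of how an evaluator might drive any TensorContractionKernel for a single block (illustration only; contract_one_block is a hypothetical helper, and sub-mapper offsets and buffer allocation are elided):

// The three-step protocol every TensorContractionKernel specialization
// must support: pack lhs, pack rhs, multiply-accumulate the packed blocks.
template <typename Kernel, typename LhsMapper, typename RhsMapper,
          typename OutputMapper, typename Scalar, typename Index>
void contract_one_block(const LhsMapper& lhs, const RhsMapper& rhs,
                        const OutputMapper& output, Scalar* lhs_block,
                        Scalar* rhs_block, Index bm, Index bn, Index bk) {
  // 1. Pack a bm x bk block of the lhs expression into contiguous memory.
  Kernel::packLhs(lhs_block, lhs.getSubMapper(0, 0), bk, bm);
  // 2. Pack a bk x bn block of the rhs expression.
  Kernel::packRhs(rhs_block, rhs.getSubMapper(0, 0), bk, bn);
  // 3. Multiply the packed blocks, accumulating into the output.
  Kernel::invoke(output, lhs_block, rhs_block, bm, bk, bn, Scalar(1));
}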
+
+#if defined(EIGEN_USE_MKLDNN)
+// If all scalar types in the tensor contraction are floats, we can use the
+// mkldnn gemm as our low-level kernel.
+template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel<float, float, float, StorageIndex,
+                               OutputMapper, LhsMapper, RhsMapper> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  typedef float Scalar;
+
+  typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename LhsMapper::SubMapper, ColMajor>
+      LhsPacker;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename RhsMapper::SubMapper, ColMajor>
+      RhsPacker;
+
+  typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
+      GemmKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(Scalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      StorageIndex depth, StorageIndex rows) {
+    // mkldnn_gemm_pack copies a (rows x cols) block, so the Lhs block of
+    // shape (rows x depth) passes `rows` first and `depth` as the columns.
+    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(Scalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
+                     const Scalar* rhsBlock, const StorageIndex rows,
+                     const StorageIndex depth, const StorageIndex cols,
+                     const Scalar alpha) {
+    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+  }
+};
+
+// For mkldnn_sgemm, having the right dimensions (especially for small
+// matrices) is more important than fitting the whole working set in the
+// L1/L2 caches.
+template <typename StorageIndex>
+struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
+  static const StorageIndex kUnrollM = 48;
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick 24, the least
+  // common multiple of both.
+  static const StorageIndex kUnrollN = 24;
+
+  static void refine(const StorageIndex m,
+                     const StorageIndex n,
+                     const StorageIndex /*k*/,
+                     StorageIndex* bm,
+                     StorageIndex* bn,
+                     StorageIndex* /*bk*/) {
+    // TODO(ezhulenev): There is probably a better way to pick block sizes.
+    *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
+    *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
+    // Stick with the default bk.
+  }
+};
+
+#endif  // EIGEN_USE_MKLDNN
+
+}  // namespace internal
+
 template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       bn = blocking.nc();
       bk = blocking.kc();
     }
+    // Refine the blocking choice to work well with the contraction kernel.
+    internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
+                                              Index>::refine(m, n, k, &bm,
+                                                             &bn, &bk);
 
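A worked example of the rounding performed by refine() (the values are hypothetical): with kUnrollM = 48 and an Eigen-chosen bm = 100 for a problem with m = 512, divup(100, 48) = 3, so bm is rounded up to 3 * 48 = 144; taking the min with m keeps the block within the problem size.

// Standalone sketch of the refine() arithmetic above (illustration only).
#include <algorithm>

int refine_block_dim(int dim, int block, int unroll) {
  const int rounded = ((block + unroll - 1) / unroll) * unroll;  // divup * unroll
  return std::min(dim, rounded);
}
// refine_block_dim(512, 100, 48) == 144; refine_block_dim(32, 100, 48) == 32.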
     // Number of kernels for each dimension.
     Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         contract_t, internal::packet_traits<RhsScalar>::size,
         rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
         RhsMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
+
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
             Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     void pack_lhs(Index m, Index k) {
       const Index mend = m * gm_ + gm(m);
       for (Index m1 = m * gm_; m1 < mend; m1++)
-        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
-                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+                                         lhs_.getSubMapper(m1 * bm_, k * bk_),
+                                         bk(k), bm(m1));
 
       if (!parallel_pack_ && shard_by_col_) {
         signal_packing(k);
@@ -458,8 +629,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           // deadlocks.
           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
         }
-        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
-                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+                                         rhs_.getSubMapper(k * bk_, n1 * bn_),
+                                         bk(k), bn(n1));
       }
 
       if (parallel_pack_ || shard_by_col_) {
@@ -480,9 +652,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         for (Index n1 = n * gn_; n1 < nend; n1++) {
           for (Index m1 = m * gm_; m1 < mend; m1++) {
             const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            TensorContractionKernel::invoke(
+                output_mapper, packed_lhs_[k % (P - 1)][m1],
+                packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
 
             // We are done with the last task for the [m1, n1] block.
             if (k + 1 == nk_) {
@@ -495,9 +667,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         for (Index m1 = m * gm_; m1 < mend; m1++)
           for (Index n1 = n * gn_; n1 < nend; n1++) {
             const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            TensorContractionKernel::invoke(
+                output_mapper, packed_lhs_[k % (P - 1)][m1],
+                packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
 
             // We are done with the last task for the [m1, n1] block.
             if (k + 1 == nk_) {
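For completeness, here is a hypothetical end-to-end usage that exercises the thread-pool contraction path touched by this patch (standard Tensor module API; names and sizes are illustrative):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

void contract_on_thread_pool() {
  Eigen::Tensor<float, 2> a(128, 64);
  Eigen::Tensor<float, 2> b(64, 256);
  a.setRandom();
  b.setRandom();

  // Contract the second dimension of `a` with the first dimension of `b`.
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};

  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> c(128, 256);
  c.device(device) = a.contract(b, dims);  // dispatches to the evaluator above
}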