diff options
author | Mark D Ryan <mark.d.ryan@intel.com> | 2018-10-12 15:20:21 +0200 |
---|---|---|
committer | Mark D Ryan <mark.d.ryan@intel.com> | 2018-10-12 15:20:21 +0200 |
commit | aa110e681b8b2237757a652ba47da49e1fbd2cd6 (patch) | |
tree | 3e5e0d1481045c9bdc64d6c4d700ea13a0c7b2f4 /Eigen/src/Core/AssignEvaluator.h | |
parent | d9392f9e557950e0160b6d4ba1c44035649d8e41 (diff) |
PR 526: Speed up multiplication of small, dynamically sized matrices
The Packet16f, Packet8f and Packet8d types are too large to use with dynamically
sized matrices typically processed by the SliceVectorizedTraversal specialization of
the dense_assignment_loop. Using these types is likely to lead to little or no
vectorization. Significant slowdown in the multiplication of these small matrices can
be observed when building with AVX and AVX512 enabled.
This patch introduces a new dense_assignment_kernel that is used when
computing small products whose operands have dynamic dimensions. It ensures that the
PacketSize used is no larger than 4, thereby increasing the chance that vectorized
instructions will be used when computing the product.
I tested all 969 possible combinations of M, K, and N that are handled by the
dense_assignment_loop on x86 builds. Although a few combinations are slowed down
by this patch they are far outnumbered by the cases that are sped up, as the
following results demonstrate.
Disabling Packed8d on AVX512 builds:
Total Cases: 969
Better: 511
Worse: 85
Same: 373
Max Improvement: 169.00% (4 8 6)
Max Degradation: 36.50% (8 5 3)
Median Improvement: 35.46%
Median Degradation: 17.41%
Total FLOPs Improvement: 19.42%
Disabling Packet16f and Packed8f on AVX512 builds:
Total Cases: 969
Better: 658
Worse: 5
Same: 306
Max Improvement: 214.05% (8 6 5)
Max Degradation: 22.26% (16 2 1)
Median Improvement: 60.05%
Median Degradation: 13.32%
Total FLOPs Improvement: 59.58%
Disabling Packed8f on AVX builds:
Total Cases: 969
Better: 663
Worse: 96
Same: 210
Max Improvement: 155.29% (4 10 5)
Max Degradation: 35.12% (8 3 2)
Median Improvement: 34.28%
Median Degradation: 15.05%
Total FLOPs Improvement: 26.02%
Diffstat (limited to 'Eigen/src/Core/AssignEvaluator.h')
-rw-r--r-- | Eigen/src/Core/AssignEvaluator.h | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 83cec500f..6e6452099 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -699,6 +699,26 @@ protected: DstXprType& m_dstExpr; }; +// Special kernel used when computing small products whose operands have dynamic dimensions. It ensures that the +// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used +// when computing the product. + +template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor> +class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> +{ +protected: + typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base; + public: + typedef typename Base::Scalar Scalar; + typedef typename Base::DstXprType DstXprType; + typedef typename find_best_packet<Scalar, 4>::type PacketType; + + EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + : Base(dst, src, func, dstExpr) + { + } + }; + /*************************************************************************** * Part 5 : Entry point for dense rectangular assignment ***************************************************************************/ @@ -837,6 +857,27 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func); } + +template<typename Dst, typename Src, typename Func> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +{ + typedef evaluator<Dst> DstEvaluatorType; + typedef evaluator<Src> SrcEvaluatorType; + typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel; + + EIGEN_STATIC_ASSERT_LVALUE(Dst) + EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar); + + SrcEvaluatorType srcEvaluator(src); + resize_if_allowed(dst, src, func); + + DstEvaluatorType dstEvaluator(dst); + Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); + + dense_assignment_loop<Kernel>::run(kernel); +} + template<typename Dst, typename Src> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment_no_alias(Dst& dst, const Src& src) |