diff options
author | Gael Guennebaud <g.gael@free.fr> | 2016-07-28 13:47:33 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2016-07-28 13:47:33 +0200 |
commit | 4057f9b1fce05fef4730fa6b4bfe1b84a1f44973 (patch) | |
tree | 7e7d554fd1ebb12b7c7fad1c8fefc4ea18065784 /Eigen | |
parent | 5fbe7aa60453f31e24f9c3fce9e3f788f385d38c (diff) |
Enable slice-vectorization+inner-unrolling when unaligned vectorization is allowed. For instance, this permits to vectorize 5x5 matrices (including product)
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/AssignEvaluator.h | 54 |
1 files changed, 41 insertions, 13 deletions
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 5b15b4ee9..b7cc7c0e9 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -88,10 +88,11 @@ private: /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess) - && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize) + && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize))) /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block - in a fixed-size matrix */ + in a fixed-size matrix + However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */ }; public: @@ -136,6 +137,11 @@ public: : int(Traversal) == int(LinearTraversal) ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) +#if EIGEN_UNALIGNED_VECTORIZE + : int(Traversal) == int(SliceVectorizedTraversal) + ? ( bool(MayUnrollInner) ? int(InnerUnrolling) + : int(NoUnrolling) ) +#endif : int(NoUnrolling) }; @@ -277,24 +283,20 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } }; -template<typename Kernel, int Index_, int Stop> +template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment> struct copy_using_evaluator_innervec_InnerUnrolling { typedef typename Kernel::PacketType PacketType; - enum { - SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, - DstAlignment = Kernel::AssignmentTraits::DstAlignment - }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer) { kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_); enum { NextIndex = Index_ + unpacket_traits<PacketType>::size }; - copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer); + copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer); } }; -template<typename Kernel, int Stop> -struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop> +template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment> +struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { } }; @@ -423,9 +425,10 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; enum { size = DstXprType::SizeAtCompileTime, - packetSize = packet_traits<typename Kernel::Scalar>::size, + packetSize =unpacket_traits<PacketType>::size, alignedSize = (size/packetSize)*packetSize }; copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel); @@ -472,9 +475,11 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling> EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::AssignmentTraits Traits; const Index outerSize = kernel.outerSize(); for(Index outer = 0; outer < outerSize; ++outer) - copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer); + copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime, + Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer); } }; @@ -554,6 +559,29 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> } }; +#if EIGEN_UNALIGNED_VECTORIZE +template<typename Kernel> +struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> +{ + EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; + + enum { size = DstXprType::InnerSizeAtCompileTime, + packetSize =unpacket_traits<PacketType>::size, + vectorizableSize = (size/packetSize)*packetSize }; + + for(Index outer = 0; outer < kernel.outerSize(); ++outer) + { + copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer); + copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer); + } + } +}; +#endif + + /*************************************************************************** * Part 4 : Generic dense assignment kernel ***************************************************************************/ @@ -681,7 +709,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstX typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); - + dense_assignment_loop<Kernel>::run(kernel); } |