diff options
Diffstat (limited to 'Eigen/src/Core/AssignEvaluator.h')
-rw-r--r-- | Eigen/src/Core/AssignEvaluator.h | 119 |
1 files changed, 78 insertions, 41 deletions
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index b1193e421..b7cc7c0e9 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -75,23 +75,24 @@ private: DstIsRowMajor = DstFlags&RowMajorBit, SrcIsRowMajor = SrcFlags&RowMajorBit, StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)), - MightVectorize = StorageOrdersAgree + MightVectorize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) - && (functor_traits<AssignFunc>::PacketAccess), + && bool(functor_traits<AssignFunc>::PacketAccess), MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0 && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 - && int(JointAlignment)>=int(InnerRequiredAlignment), - MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), - MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess - && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), + && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), + MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), + MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess + && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ - MaySliceVectorize = MightVectorize && DstHasDirectAccess - && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize) + MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess) + && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize))) /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block - in a fixed-size matrix */ + in a fixed-size matrix + However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */ }; public: @@ -116,9 +117,9 @@ private: : 1, UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize, MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic - && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit), + && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit), MayUnrollInner = int(InnerSize) != Dynamic - && int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit) + && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit) }; public: @@ -130,11 +131,17 @@ public: : int(NoUnrolling) ) : int(Traversal) == int(LinearVectorizedTraversal) - ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling) - : int(NoUnrolling) ) + ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment))) + ? int(CompleteUnrolling) + : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) +#if EIGEN_UNALIGNED_VECTORIZE + : int(Traversal) == int(SliceVectorizedTraversal) + ? ( bool(MayUnrollInner) ? int(InnerUnrolling) + : int(NoUnrolling) ) +#endif : int(NoUnrolling) }; @@ -156,6 +163,7 @@ public: EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(LinearPacketSize) EIGEN_DEBUG_VAR(InnerPacketSize) + EIGEN_DEBUG_VAR(ActualPacketSize) EIGEN_DEBUG_VAR(StorageOrdersAgree) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearize) @@ -163,6 +171,7 @@ public: EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; + EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost) EIGEN_DEBUG_VAR(UnrollingLimit) EIGEN_DEBUG_VAR(MayUnrollCompletely) EIGEN_DEBUG_VAR(MayUnrollInner) @@ -256,13 +265,13 @@ struct copy_using_evaluator_innervec_CompleteUnrolling enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime, - JointAlignment = Kernel::AssignmentTraits::JointAlignment, - DefaultAlignment = unpacket_traits<PacketType>::alignment + SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, + DstAlignment = Kernel::AssignmentTraits::DstAlignment }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { - kernel.template assignPacketByOuterInner<DefaultAlignment, JointAlignment, PacketType>(outer, inner); + kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner); enum { NextIndex = Index + unpacket_traits<PacketType>::size }; copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel); } @@ -274,23 +283,20 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } }; -template<typename Kernel, int Index_, int Stop> +template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment> struct copy_using_evaluator_innervec_InnerUnrolling { typedef typename Kernel::PacketType PacketType; - enum { - DefaultAlignment = unpacket_traits<PacketType>::alignment - }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer) { - kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, Index_); + kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_); enum { NextIndex = Index_ + unpacket_traits<PacketType>::size }; - copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer); + copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer); } }; -template<typename Kernel, int Stop> -struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop> +template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment> +struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { } }; @@ -419,9 +425,10 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; enum { size = DstXprType::SizeAtCompileTime, - packetSize = packet_traits<typename Kernel::Scalar>::size, + packetSize =unpacket_traits<PacketType>::size, alignedSize = (size/packetSize)*packetSize }; copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel); @@ -438,7 +445,8 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> { typedef typename Kernel::PacketType PacketType; enum { - DefaultAlignment = unpacket_traits<PacketType>::alignment + SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, + DstAlignment = Kernel::AssignmentTraits::DstAlignment }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { @@ -447,7 +455,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> const Index packetSize = unpacket_traits<PacketType>::size; for(Index outer = 0; outer < outerSize; ++outer) for(Index inner = 0; inner < innerSize; inner+=packetSize) - kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, inner); + kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner); } }; @@ -467,9 +475,11 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling> EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::AssignmentTraits Traits; const Index outerSize = kernel.outerSize(); for(Index outer = 0; outer < outerSize; ++outer) - copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer); + copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime, + Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer); } }; @@ -518,7 +528,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> : int(Kernel::AssignmentTraits::DstAlignment) }; const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0); - if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0) + if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0) { // the pointer is not aligend-on scalar, so alignment is not possible return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel); @@ -549,6 +559,29 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> } }; +#if EIGEN_UNALIGNED_VECTORIZE +template<typename Kernel> +struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> +{ + EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; + + enum { size = DstXprType::InnerSizeAtCompileTime, + packetSize =unpacket_traits<PacketType>::size, + vectorizableSize = (size/packetSize)*packetSize }; + + for(Index outer = 0; outer < kernel.outerSize(); ++outer) + { + copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer); + copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer); + } + } +}; +#endif + + /*************************************************************************** * Part 4 : Generic dense assignment kernel ***************************************************************************/ @@ -676,14 +709,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstX typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); - + dense_assignment_loop<Kernel>::run(kernel); } template<typename DstXprType, typename SrcXprType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src) { - call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>()); + call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>()); } /*************************************************************************** @@ -705,7 +738,7 @@ template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Ki // This is the main assignment class template< typename DstXprType, typename SrcXprType, typename Functor, typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind, - typename Scalar = typename DstXprType::Scalar> + typename EnableIf = void> struct Assignment; @@ -718,13 +751,13 @@ template<typename Dst, typename Src> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(Dst& dst, const Src& src) { - call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>()); + call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>()); } template<typename Dst, typename Src> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(const Dst& dst, const Src& src) { - call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>()); + call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>()); } // Deal with "assume-aliasing" @@ -783,7 +816,7 @@ template<typename Dst, typename Src> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment_no_alias(Dst& dst, const Src& src) { - call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>()); + call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>()); } template<typename Dst, typename Src, typename Func> @@ -805,15 +838,17 @@ template<typename Dst, typename Src> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) { - call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>()); + call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>()); } // forward declaration template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src); // Generic Dense to Dense assignment -template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> -struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar> +// Note that the last template argument "Weak" is needed to make it possible to perform +// both partial specialization+SFINAE without ambiguous specialization +template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak> +struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func) @@ -830,11 +865,13 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar> // Generic assignment through evalTo. // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism. -template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> -struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar> +// Note that the last template argument "Weak" is needed to make it possible to perform +// both partial specialization+SFINAE without ambiguous specialization +template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak> +struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak> { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/) + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); src.evalTo(dst); |