diff options
author | Gael Guennebaud <g.gael@free.fr> | 2015-03-13 21:15:50 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2015-03-13 21:15:50 +0100 |
commit | 1330f8bbd12306cc4955d943f27e5281d413bed4 (patch) | |
tree | 0296bc727247353a3032375cfca1af8499e91c36 | |
parent | d99ab35f9e886a014be6d47606d232af1e668f76 (diff) |
bug #973, improve AVX support by enabling vectorization of Vector4i-like types, and enforcing alignment of Vector4f/Vector2d-like types to preserve compatibility with SSE and future Eigen versions that will vectorize them with AVX enabled.
-rw-r--r-- | Eigen/src/Core/CoreEvaluators.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/DenseStorage.h | 88 | ||||
-rw-r--r-- | Eigen/src/Core/util/Macros.h | 3 | ||||
-rw-r--r-- | Eigen/src/Core/util/XprHelper.h | 5 | ||||
-rw-r--r-- | test/unalignedassert.cpp | 7 | ||||
-rw-r--r-- | test/vectorization_logic.cpp | 2 |
6 files changed, 108 insertions, 13 deletions
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 9485080d3..85f46cb8d 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -647,11 +647,15 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> > HasNoStride = HasNoInnerStride && HasNoOuterStride, IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned), IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, + + // TODO: should check for smaller packet types once we can handle multi-sized packet types + AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar), + KeepsPacketAccess = bool(HasNoInnerStride) && ( bool(IsDynamicSize) || HasNoOuterStride || ( OuterStrideAtCompileTime!=Dynamic - && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ), + && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime) % AlignBytes)==0 ) ), Flags0 = evaluator<PlainObjectType>::Flags, Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime)) @@ -717,7 +721,10 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> > && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0, + // TODO: should check for smaller packet types once we can handle multi-sized packet types + AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar), + + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % AlignBytes) == 0)) ? AlignedBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? 
LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) | @@ -825,12 +832,15 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject> { typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType; + typedef typename XprType::Scalar Scalar; EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) { + // TODO: should check for smaller packet types once we can handle multi-sized packet types + const int AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar); // FIXME this should be an internal assertion - eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned"); } }; diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 9186f59a7..522aaa299 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -34,14 +34,35 @@ void check_static_allocation_size() #endif } +template<typename T, int Size, typename Packet = typename packet_traits<T>::type, + bool Match = bool((Size%unpacket_traits<Packet>::size)==0), + bool TryHalf = bool(unpacket_traits<Packet>::size > Size) + && bool(unpacket_traits<Packet>::size > unpacket_traits<typename unpacket_traits<Packet>::half>::size) > +struct compute_default_alignment +{ + enum { value = 0 }; +}; + +template<typename T, int Size, typename Packet> +struct compute_default_alignment<T, Size, Packet, true, false> // Match +{ + enum { value = sizeof(T) * unpacket_traits<Packet>::size }; +}; + +template<typename T, int Size, typename Packet> +struct compute_default_alignment<T, Size, Packet, false, true> +{ + // 
current packet too large, try with an half-packet + enum { value = compute_default_alignment<T, Size, typename unpacket_traits<Packet>::half>::value }; +}; + /** \internal * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned: * to 16 bytes boundary if the total size is a multiple of 16 bytes. */ template <typename T, int Size, int MatrixOrArrayOptions, int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0 - : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES - : 0 > + : compute_default_alignment<T,Size>::value > struct plain_array { T array[Size]; @@ -81,14 +102,71 @@ struct plain_array #endif template <typename T, int Size, int MatrixOrArrayOptions> -struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES> +struct plain_array<T, Size, MatrixOrArrayOptions, 8> +{ + EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); + check_static_allocation_size<T,Size>(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size<T,Size>(); + } +}; + +template <typename T, int Size, int MatrixOrArrayOptions> +struct plain_array<T, Size, MatrixOrArrayOptions, 16> +{ + EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); + check_static_allocation_size<T,Size>(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size<T,Size>(); + } +}; + +template <typename T, int Size, int MatrixOrArrayOptions> +struct plain_array<T, Size, MatrixOrArrayOptions, 32> +{ + EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); + check_static_allocation_size<T,Size>(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + 
check_static_allocation_size<T,Size>(); + } +}; + +template <typename T, int Size, int MatrixOrArrayOptions> +struct plain_array<T, Size, MatrixOrArrayOptions, 64> { - EIGEN_USER_ALIGN_DEFAULT T array[Size]; + EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; EIGEN_DEVICE_FUNC plain_array() { - EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1); + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size<T,Size>(); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index aaea9f035..6b294e77f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -318,6 +318,9 @@ // Defined the boundary (in bytes) on which the data needs to be aligned. Note // that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be // aligned at all regardless of the value of this #define. +// TODO should be renamed EIGEN_MAXIMAL_ALIGN_BYTES, +// for instance with AVX 1 EIGEN_MAXIMAL_ALIGN_BYTES=32 while for 'int' 16 bytes alignment is always enough, +// and 16 bytes alignment is also enough for Vector4f. #define EIGEN_ALIGN_BYTES 16 #ifdef EIGEN_DONT_ALIGN diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 528ebe297..562f425bd 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -159,13 +159,16 @@ class compute_matrix_evaluator_flags enum { row_major_bit = Options&RowMajor ? 
RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, + + // TODO: should check for smaller packet types once we can handle multi-sized packet types + align_bytes = int(packet_traits<Scalar>::size) * sizeof(Scalar), aligned_bit = ( ((Options&DontAlign)==0) && ( #if EIGEN_ALIGN_STATICALLY - ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % align_bytes) == 0)) #else 0 #endif diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index d8815263a..6f7b72167 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -81,7 +81,7 @@ void construct_at_boundary(int boundary) void unalignedassert() { - #if EIGEN_ALIGN_STATICALLY +#if EIGEN_ALIGN_STATICALLY construct_at_boundary<Vector2f>(4); construct_at_boundary<Vector3f>(4); construct_at_boundary<Vector4f>(16); @@ -100,7 +100,7 @@ void unalignedassert() construct_at_boundary<Vector3cf>(4); construct_at_boundary<Vector2cd>(EIGEN_ALIGN_BYTES); construct_at_boundary<Vector3cd>(16); - #endif +#endif check_unalignedassert_good<TestNew1>(); check_unalignedassert_good<TestNew2>(); @@ -112,11 +112,12 @@ void unalignedassert() check_unalignedassert_good<Depends<true> >(); #if EIGEN_ALIGN_STATICALLY - if(EIGEN_ALIGN_BYTES==16) + if(EIGEN_ALIGN_BYTES>=16) { VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8)); VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8)); VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4i>(8)); } for(int b=8; b<EIGEN_ALIGN_BYTES; b+=8) { diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 2f839cf51..97477072a 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -214,7 +214,7 @@ template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectori >(DefaultTraversal,CompleteUnrolling))); 
VERIFY((test_assign(Matrix11(), Matrix<Scalar,PacketSize,EIGEN_PLAIN_ENUM_MIN(2,PacketSize)>()*Matrix<Scalar,EIGEN_PLAIN_ENUM_MIN(2,PacketSize),PacketSize>(), - PacketSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD?DefaultTraversal:InnerVectorizedTraversal, CompleteUnrolling))); + InnerVectorizedTraversal, CompleteUnrolling))); #endif VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3), |