diff options
-rw-r--r-- | Eigen/src/Core/Assign.h | 12 | ||||
-rw-r--r-- | Eigen/src/Core/Coeffs.h | 32 | ||||
-rw-r--r-- | Eigen/src/Core/Redux.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/StableNorm.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 8 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixVector.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/util/Memory.h | 50 | ||||
-rw-r--r-- | Eigen/src/Jacobi/Jacobi.h | 4 | ||||
-rw-r--r-- | bench/btl/libs/eigen2/eigen2_interface.hh | 2 | ||||
-rwxr-xr-x | bench/btl/libs/hand_vec/hand_vec_interface.hh | 2 | ||||
-rw-r--r-- | doc/I03_InsideEigenExample.dox | 2 | ||||
-rw-r--r-- | test/CMakeLists.txt | 1 | ||||
-rw-r--r-- | test/first_aligned.cpp | 64 |
13 files changed, 140 insertions, 43 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index e3af5f61f..d519b47be 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -386,14 +386,22 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling const int size = dst.size(); const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size; const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0 - : ei_alignmentOffset(&dst.coeffRef(0), size); + : ei_first_aligned(&dst.coeffRef(0), size); const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize; + EIGEN_DEBUG_VAR(&dst.coeffRef(0)); + EIGEN_DEBUG_VAR(size); + EIGEN_DEBUG_VAR(packetSize); + EIGEN_DEBUG_VAR(alignedStart); + EIGEN_DEBUG_VAR(alignedEnd); + for(int index = 0; index < alignedStart; ++index) dst.copyCoeff(index, src); for(int index = alignedStart; index < alignedEnd; index += packetSize) { + EIGEN_DEBUG_VAR(index); + EIGEN_DEBUG_VAR(&dst.coeffRef(index)); dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::SrcAlignment>(index, src); } @@ -431,7 +439,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling> const int outerSize = dst.outerSize(); const int alignedStep = (packetSize - dst.stride() % packetSize) & packetAlignedMask; int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 
0 - : ei_alignmentOffset(&dst.coeffRef(0,0), innerSize); + : ei_first_aligned(&dst.coeffRef(0,0), innerSize); for(int i = 0; i < outerSize; ++i) { diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h index eda36b7bb..d2b3237c5 100644 --- a/Eigen/src/Core/Coeffs.h +++ b/Eigen/src/Core/Coeffs.h @@ -380,35 +380,33 @@ EIGEN_STRONG_INLINE void MatrixBase<Derived>::copyPacket(int index, const Matrix } -template<typename Derived, typename Integer, bool JustReturnZero> -struct ei_alignmentOffset_impl +template<typename Derived, bool JustReturnZero> +struct ei_first_aligned_impl { - inline static Integer run(const MatrixBase<Derived>&, Integer) + inline static int run(const MatrixBase<Derived>&) { return 0; } }; -template<typename Derived, typename Integer> -struct ei_alignmentOffset_impl<Derived, Integer, false> +template<typename Derived> +struct ei_first_aligned_impl<Derived, false> { - inline static Integer run(const MatrixBase<Derived>& m, Integer maxOffset) + inline static int run(const MatrixBase<Derived>& m) { - return ei_alignmentOffset(&m.const_cast_derived().coeffRef(0,0), maxOffset); + return ei_first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size()); } }; -/** \internal \returns the number of elements which have to be skipped, starting - * from the address of coeffRef(0,0), to find the first 16-byte aligned element. - * - * \note If the expression doesn't have the DirectAccessBit, this function returns 0. +/** \internal \returns the index of the first element of the array that is well aligned for vectorization. * - * There is also the variant ei_alignmentOffset(const Scalar*, Integer) defined in Memory.h. + * There is also the variant ei_first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more + * documentation. 
*/ -template<typename Derived, typename Integer> -inline static Integer ei_alignmentOffset(const MatrixBase<Derived>& m, Integer maxOffset) +template<typename Derived> +inline static int ei_first_aligned(const MatrixBase<Derived>& m) { - return ei_alignmentOffset_impl<Derived, Integer, - (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)> - ::run(m, maxOffset); + return ei_first_aligned_impl + <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)> + ::run(m); } #endif diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index ae6b01f57..399e2cd90 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -209,7 +209,7 @@ struct ei_redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling> { const int size = mat.size(); const int packetSize = ei_packet_traits<Scalar>::size; - const int alignedStart = ei_alignmentOffset(mat,size); + const int alignedStart = ei_first_aligned(mat); enum { alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit) ? Aligned : Unaligned diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index f2d1e7240..fa1e095ca 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -65,7 +65,7 @@ MatrixBase<Derived>::stableNorm() const int bi=0; if ((int(Flags)&DirectAccessBit) && !(int(Flags)&AlignedBit)) { - bi = ei_alignmentOffset(&const_cast_derived().coeffRef(0), n); + bi = ei_first_aligned(&const_cast_derived().coeffRef(0), n); if (bi>0) ei_stable_norm_kernel(start(bi), ssq, scale, invScale); } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index a18e5ef1d..3296f32ff 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -69,7 +69,7 @@ void ei_cache_friendly_product_colmajor_times_vector( // How many coeffs of the result do we have to skip to be aligned. 
// Here we assume data are at least aligned on the base scalar type. - int alignedStart = ei_alignmentOffset(res,size); + int alignedStart = ei_first_aligned(res,size); int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; @@ -79,7 +79,7 @@ void ei_cache_friendly_product_colmajor_times_vector( : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size); + const int lhsAlignmentOffset = ei_first_aligned(lhs,size); // find how many columns do we have to skip to be aligned with the result (if possible) int skipColumns = 0; @@ -282,7 +282,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - int alignedStart = ei_alignmentOffset(rhs, size); + int alignedStart = ei_first_aligned(rhs, size); int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; const int peeledSize = peels>1 ? 
alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; @@ -292,7 +292,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size); + const int lhsAlignmentOffset = ei_first_aligned(lhs,size); // find how many rows do we have to skip to be aligned with rhs (if possible) int skipRows = 0; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index c27454bee..32b7f220e 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -86,7 +86,7 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( size_t starti = FirstTriangular ? 0 : j+2; size_t endi = FirstTriangular ? j : size; size_t alignedEnd = starti; - size_t alignedStart = (starti) + ei_alignmentOffset(&res[starti], endi-starti); + size_t alignedStart = (starti) + ei_first_aligned(&res[starti], endi-starti); alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); res[j] += cj0.pmul(A0[j], t0); diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 524bec2fc..ecc13dab4 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -209,27 +209,53 @@ template<typename T, bool Align> inline void ei_conditional_aligned_delete(T *pt ei_conditional_aligned_free<Align>(ptr); } -/** \internal \returns the number of elements which have to be skipped to - * find the first 16-byte aligned element +/** \internal \returns the index of the first element of the array that is well aligned for vectorization. * - * There is also the variant ei_alignmentOffset(const MatrixBase&, Integer) defined in Coeffs.h. 
+ * \param array the address of the start of the array + * \param size the size of the array + * + * \note If no element of the array is well aligned, the size of the array is returned. Typically, + * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the + * packet size for the given scalar type is 1, then everything is considered well-aligned. + * + * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a + * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the + * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for + * example with Scalar=double on certain 32-bit platforms, see bug #79. + * + * There is also the variant ei_first_aligned(const MatrixBase&) defined in Coeffs.h. */ template<typename Scalar, typename Integer> -inline static Integer ei_alignmentOffset(const Scalar* ptr, Integer maxOffset) +inline static Integer ei_first_aligned(const Scalar* array, Integer size) { typedef typename ei_packet_traits<Scalar>::type Packet; - const Integer PacketSize = ei_packet_traits<Scalar>::size; - const Integer PacketAlignedMask = PacketSize-1; - const bool Vectorized = PacketSize>1; - return Vectorized - ? std::min<Integer>( (PacketSize - (Integer((size_t(ptr)/sizeof(Scalar))) & PacketAlignedMask)) - & PacketAlignedMask, maxOffset) - : 0; + enum { PacketSize = ei_packet_traits<Scalar>::size, + PacketAlignedMask = PacketSize-1 + }; + + if(PacketSize==1) + { + // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements + // of the array have the same alignment. + return 0; + } + else if(size_t(array) & (sizeof(Scalar)-1)) + { + // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar. + // Consequently, no element of the array is well aligned. 
+ return size; + } + else + { + return std::min<Integer>( (PacketSize - (Integer((size_t(array)/sizeof(Scalar))) & PacketAlignedMask)) + & PacketAlignedMask, size); + } } /** \internal * ei_aligned_stack_alloc(SIZE) allocates an aligned buffer of SIZE bytes - * on the stack if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT. + * on the stack if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and + * if stack allocation is supported by the platform (currently, this is linux only). * Otherwise the memory is allocated on the heap. * Data allocated with ei_aligned_stack_alloc \b must be freed by calling ei_aligned_stack_free(PTR,SIZE). * \code diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index eeb81c178..9d6d1d171 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -318,7 +318,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& typedef typename ei_packet_traits<Scalar>::type Packet; enum { PacketSize = ei_packet_traits<Scalar>::size, Peeling = 2 }; - int alignedStart = ei_alignmentOffset(y, size); + int alignedStart = ei_first_aligned(y, size); int alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; const Packet pc = ei_pset1(Scalar(j.c())); @@ -336,7 +336,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& Scalar* px = x + alignedStart; Scalar* py = y + alignedStart; - if(ei_alignmentOffset(x, size)==alignedStart) + if(ei_first_aligned(x, size)==alignedStart) { for(int i=alignedStart; i<alignedEnd; i+=PacketSize) { diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh index f93ccad58..f14c8c383 100644 --- a/bench/btl/libs/eigen2/eigen2_interface.hh +++ b/bench/btl/libs/eigen2/eigen2_interface.hh @@ -124,7 +124,7 @@ public : Scalar* A0 = dst.data() + j*dst.stride(); int starti = j; int alignedEnd = starti; - int alignedStart = (starti) + ei_alignmentOffset(&A0[starti], size-starti); + int 
alignedStart = (starti) + ei_first_aligned(&A0[starti], size-starti); alignedEnd = alignedStart + ((size-alignedStart)/(2*PacketSize))*(PacketSize*2); // do the non-vectorizable part of the assignment diff --git a/bench/btl/libs/hand_vec/hand_vec_interface.hh b/bench/btl/libs/hand_vec/hand_vec_interface.hh index 6080b2460..4b54c03a3 100755 --- a/bench/btl/libs/hand_vec/hand_vec_interface.hh +++ b/bench/btl/libs/hand_vec/hand_vec_interface.hh @@ -265,7 +265,7 @@ public : int starti = j+2; int alignedEnd = starti; - int alignedStart = (starti) + ei_alignmentOffset(&X[starti], N-starti); + int alignedStart = (starti) + ei_first_aligned(&X[starti], N-starti); alignedEnd = alignedStart + ((N-alignedStart)/(PacketSize))*(PacketSize); X[j] += t0 * A0[j]; diff --git a/doc/I03_InsideEigenExample.dox b/doc/I03_InsideEigenExample.dox index d4960e79d..95cbe6800 100644 --- a/doc/I03_InsideEigenExample.dox +++ b/doc/I03_InsideEigenExample.dox @@ -343,7 +343,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling> const int size = dst.size(); const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size; const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 
0 - : ei_alignmentOffset(&dst.coeffRef(0), size); + : ei_first_aligned(&dst.coeffRef(0), size); const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize; for(int index = 0; index < alignedStart; index++) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ebb8a59d2..a64e09126 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -88,6 +88,7 @@ ei_add_test(meta) ei_add_test(sizeof) ei_add_test(dynalloc) ei_add_test(nomalloc) +ei_add_test(first_aligned) ei_add_test(mixingtypes) ei_add_test(packetmath) ei_add_test(unalignedassert) diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp new file mode 100644 index 000000000..3cf1a7eef --- /dev/null +++ b/test/first_aligned.cpp @@ -0,0 +1,64 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. 
+
+#include "main.h"
+
+/** Verifies that the element index returned by ei_first_aligned(array, size) designates an address
+  * that is a multiple of the packet size in bytes, i.e. sizeof(Scalar) times the number of scalars
+  * held by one packet.
+  *
+  * \param array the address of the start of the array under test
+  * \param size the number of elements passed on to ei_first_aligned
+  */
+template<typename Scalar>
+void test_first_aligned_helper(Scalar *array, int size)
+{
+  const int packet_size = sizeof(Scalar) * ei_packet_traits<Scalar>::size;
+  VERIFY(((size_t(array) + sizeof(Scalar) * ei_first_aligned(array, size)) % packet_size) == 0);
+}
+
+/** Verifies that ei_first_aligned(array, size) returns \a size, unless a packet holds exactly one
+  * scalar, in which case the check passes unconditionally.
+  *
+  * \param array the address of the start of the array under test
+  * \param size the number of elements passed on to ei_first_aligned
+  */
+template<typename Scalar>
+void test_none_aligned_helper(Scalar *array, int size)
+{
+  VERIFY(ei_packet_traits<Scalar>::size == 1 || ei_first_aligned(array, size) == size);
+}
+
+// User-defined scalar used to exercise the non-vectorized path.
+// NOTE(review): presumably ei_packet_traits reports a packet size of 1 for this type -- confirm.
+struct some_non_vectorizable_type { float x; };
+
+/** Entry point of the test: runs the helpers above on float, double and non-vectorizable arrays. */
+void test_first_aligned()
+{
+  // float array declared 16-byte aligned, probed at element offsets 0 through 5
+  EIGEN_ALIGN16 float array_float[100];
+  test_first_aligned_helper(array_float, 50);
+  test_first_aligned_helper(array_float+1, 50);
+  test_first_aligned_helper(array_float+2, 50);
+  test_first_aligned_helper(array_float+3, 50);
+  test_first_aligned_helper(array_float+4, 50);
+  test_first_aligned_helper(array_float+5, 50);
+
+  // double array declared 16-byte aligned; these checks must run on array_double itself
+  // (they previously re-tested array_float, leaving the aligned double case uncovered)
+  EIGEN_ALIGN16 double array_double[100];
+  test_first_aligned_helper(array_double, 50);
+  test_first_aligned_helper(array_double+1, 50);
+  test_first_aligned_helper(array_double+2, 50);
+
+  // shift the start address by 4 bytes so that it is no longer a multiple of sizeof(double)
+  double *array_double_plus_4_bytes = (double*)(size_t(array_double)+4);
+  test_none_aligned_helper(array_double_plus_4_bytes, 50);
+  test_none_aligned_helper(array_double_plus_4_bytes+1, 50);
+
+  // user-defined scalar type: both helpers are expected to pass on the same array
+  some_non_vectorizable_type array_nonvec[100];
+  test_first_aligned_helper(array_nonvec, 100);
+  test_none_aligned_helper(array_nonvec, 100);
+}