diff options
author | Gael Guennebaud <g.gael@free.fr> | 2015-08-06 17:52:01 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2015-08-06 17:52:01 +0200 |
commit | 2afdef6a54e7fd09a4a6d5e933cf15ffa129beb6 (patch) | |
tree | 796b23d51bcba9c17ade33b51b59d681979a8ba4 /Eigen | |
parent | 1f5024332e47f295c991c3781d57d0466d41a9c8 (diff) |
Generalize first_aligned to take the requested alignment as a template parameter, and add a first_default_aligned variant that calls first_aligned with the alignment required by the largest packet for the given scalar type.
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/AssignEvaluator.h | 14 | ||||
-rw-r--r-- | Eigen/src/Core/DenseCoeffsBase.h | 26 | ||||
-rw-r--r-- | Eigen/src/Core/Redux.h | 7 | ||||
-rw-r--r-- | Eigen/src/Core/StableNorm.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixVector.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/products/TriangularMatrixMatrix.h | 3 | ||||
-rwxr-xr-x | Eigen/src/Core/util/BlasUtil.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/util/Memory.h | 41 | ||||
-rw-r--r-- | Eigen/src/Jacobi/Jacobi.h | 4 | ||||
-rw-r--r-- | Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 4 | ||||
-rw-r--r-- | Eigen/src/SparseLU/SparseLU_kernel_bmod.h | 4 | ||||
-rw-r--r-- | Eigen/src/SparseLU/SparseLU_panel_bmod.h | 2 |
13 files changed, 67 insertions, 46 deletions
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 39efb1d5a..f589555cc 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -365,13 +365,14 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> typedef typename Kernel::Scalar Scalar; typedef packet_traits<Scalar> PacketTraits; enum { + requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment, packetSize = PacketTraits::size, - dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment), - dstAlignment = PacketTraits::AlignedOnScalar ? int(Kernel::AssignmentTraits::RequiredAlignment) + dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), + dstAlignment = PacketTraits::AlignedOnScalar ? int(requestedAlignment) : int(Kernel::AssignmentTraits::DstAlignment), srcAlignment = Kernel::AssignmentTraits::JointAlignment }; - const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size); + const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size); const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize; unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart); @@ -479,9 +480,10 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> typedef packet_traits<Scalar> PacketTraits; enum { packetSize = PacketTraits::size, + requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment), alignable = PacketTraits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar), - dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment), - dstAlignment = alignable ? 
int(Kernel::AssignmentTraits::RequiredAlignment) + dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), + dstAlignment = alignable ? int(requestedAlignment) : int(Kernel::AssignmentTraits::DstAlignment) }; const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0); @@ -494,7 +496,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> const Index innerSize = kernel.innerSize(); const Index outerSize = kernel.outerSize(); const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0; - Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize); + Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize); for(Index outer = 0; outer < outerSize; ++outer) { diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 11e2a1809..d053911e3 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h @@ -580,33 +580,41 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors> namespace internal { -template<typename Derived, bool JustReturnZero> +template<int Alignment, typename Derived, bool JustReturnZero> struct first_aligned_impl { static inline Index run(const Derived&) { return 0; } }; -template<typename Derived> -struct first_aligned_impl<Derived, false> +template<int Alignment, typename Derived> +struct first_aligned_impl<Alignment, Derived, false> { static inline Index run(const Derived& m) { - return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size()); + return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size()); } }; -/** \internal \returns the index of the first element of the array that is well aligned for vectorization. 
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization. + * + * \tparam Alignment requested alignment in Bytes. * * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more * documentation. */ -template<typename Derived> +template<int Alignment, typename Derived> static inline Index first_aligned(const DenseBase<Derived>& m) { - return first_aligned_impl - <Derived, (evaluator<Derived>::Alignment > 0 ) || !(Derived::Flags & DirectAccessBit)> // FIXME Alignment! - ::run(m.derived()); + enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) }; + return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived()); +} + +template<typename Derived> +static inline Index first_default_aligned(const DenseBase<Derived>& m) +{ + typedef typename Derived::Scalar Scalar; + return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(m); } template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret> diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 0c25223aa..fa308b53e 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -221,12 +221,13 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling> { const Index size = mat.size(); - const Index packetSize = packet_traits<Scalar>::size; - const Index alignedStart = internal::first_aligned(mat.nestedExpression()); + const Index packetSize = packet_traits<Scalar>::size; + const int packetBytes = int(packetSize*sizeof(Scalar)); enum { - alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(sizeof(Scalar)*packetSize) : int(Unaligned), // FIXME take into account alignment requirement + alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? 
int(packetBytes) : int(Unaligned), // FIXME take into account alignment requirement alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) }; + const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedEnd2 = alignedStart + alignedSize2; diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index aca81f463..7fe39808b 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -178,7 +178,7 @@ MatrixBase<Derived>::stableNorm() const if(n==1) return abs(this->coeff(0)); - Index bi = internal::first_aligned(copy); + Index bi = internal::first_default_aligned(copy); if (bi>0) internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); for (; bi<n; bi+=blockSize) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 439f14456..8b7dca45f 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. - Index alignedStart = internal::first_aligned(res,size); + Index alignedStart = internal::first_default_aligned(res,size); Index alignedSize = ResPacketSize>1 ? 
alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 5d6ef9913..f3443bd10 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -94,7 +94,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd size_t starti = FirstTriangular ? 0 : j+2; size_t endi = FirstTriangular ? j : size; - size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti); + size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti); size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 3d2345b66..39ab87df8 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -257,6 +257,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { + const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar); // strip zeros Index diagSize = (std::min)(_cols,_depth); Index rows = _rows; @@ -311,7 +312,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; - geb = geb + internal::first_aligned(geb,EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar)); + geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar)); pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 
0 : k2), actual_kc, rs); diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 934948ebd..d00fa9707 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -230,7 +230,7 @@ class blas_data_mapper { if (size_t(m_data)%sizeof(Scalar)) { return -1; } - return internal::first_aligned(m_data, size); + return internal::first_default_aligned(m_data, size); } protected: diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index c9517acfc..957c36bcf 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -506,47 +506,56 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align /****************************************************************************/ -/** \internal Returns the index of the first element of the array that is well aligned for vectorization. +/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment. * + * \tparam Alignment requested alignment in Bytes. * \param array the address of the start of the array * \param size the size of the array * - * \note If no element of the array is well aligned, the size of the array is returned. Typically, - * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the + * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar, + * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If * packet size for the given scalar type is 1, then everything is considered well-aligned. * - * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a - * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). 
On the - * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for + * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a + * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for * example with Scalar=double on certain 32-bit platforms, see bug #79. * * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h. + * \sa first_default_aligned() */ -template<typename Scalar, typename Index> +template<int Alignment, typename Scalar, typename Index> inline Index first_aligned(const Scalar* array, Index size) { - static const Index PacketSize = packet_traits<Scalar>::size; - static const Index PacketAlignedMask = PacketSize-1; + static const Index ScalarSize = sizeof(Scalar); + static const Index AlignmentSize = Alignment / ScalarSize; + static const Index AlignmentMask = AlignmentSize-1; - if(PacketSize==1) + if(AlignmentSize<=1) { - // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements - // of the array have the same alignment. + // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar + // so that all elements of the array have the same alignment. return 0; } - else if(size_t(array) & (sizeof(Scalar)-1)) + else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0) { - // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar. + // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size. // Consequently, no element of the array is well aligned. 
return size; } else { - return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask)) - & PacketAlignedMask, size); + return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size); } } +/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement. + * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */ +template<typename Scalar, typename Index> +inline Index first_default_aligned(const Scalar* array, Index size) +{ + return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(array, size); +} + /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size */ template<typename Index> diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index b7b83dcd2..55de15e87 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -325,7 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x // both vectors are sequentially stored in memory => vectorization enum { Peeling = 2 }; - Index alignedStart = internal::first_aligned(y, size); + Index alignedStart = internal::first_default_aligned(y, size); Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; const Packet pc = pset1<Packet>(c); @@ -343,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x Scalar* EIGEN_RESTRICT px = x + alignedStart; Scalar* EIGEN_RESTRICT py = y + alignedStart; - if(internal::first_aligned(x, size)==alignedStart) + if(internal::first_default_aligned(x, size)==alignedStart) { for(Index i=alignedStart; i<alignedEnd; i+=PacketSize) { diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index 1d456ee0c..ae3685ac8 100644 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ 
b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -39,9 +39,9 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const }; Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once - Index i0 = internal::first_aligned(A,m); + Index i0 = internal::first_default_aligned(A,m); - eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m))); + eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m))); // handle the non aligned rows of A and C without any optimization: for(Index i=0; i<i0; ++i) diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index 9513f8369..b8ea5f50f 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -66,8 +66,8 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index seg const Index PacketSize = internal::packet_traits<Scalar>::size; Index ldl = internal::first_multiple(nrow, PacketSize); Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) ); - Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize); - Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize; + Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize); + Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize; Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); l.setZero(); diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h index 
bd3cf87b9..ca78406e0 100644 --- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -145,7 +145,7 @@ void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w, eigen_assert(tempv.size()>w*ldu + nrow*w + 1); Index ldl = internal::first_multiple<Index>(nrow, PacketSize); - Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize; + Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize; Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl)); L.setZero(); |