aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2015-08-06 17:52:01 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2015-08-06 17:52:01 +0200
commit2afdef6a54e7fd09a4a6d5e933cf15ffa129beb6 (patch)
tree796b23d51bcba9c17ade33b51b59d681979a8ba4 /Eigen
parent1f5024332e47f295c991c3781d57d0466d41a9c8 (diff)
Generalize first_aligned to take the requested alignment as a template parameter, and add a first_default_aligned variante calling first_aligned with the requirement of the largest packet for the given scalar type.
Diffstat (limited to 'Eigen')
-rw-r--r--Eigen/src/Core/AssignEvaluator.h14
-rw-r--r--Eigen/src/Core/DenseCoeffsBase.h26
-rw-r--r--Eigen/src/Core/Redux.h7
-rw-r--r--Eigen/src/Core/StableNorm.h2
-rw-r--r--Eigen/src/Core/products/GeneralMatrixVector.h2
-rw-r--r--Eigen/src/Core/products/SelfadjointMatrixVector.h2
-rw-r--r--Eigen/src/Core/products/TriangularMatrixMatrix.h3
-rwxr-xr-xEigen/src/Core/util/BlasUtil.h2
-rw-r--r--Eigen/src/Core/util/Memory.h41
-rw-r--r--Eigen/src/Jacobi/Jacobi.h4
-rw-r--r--Eigen/src/SparseLU/SparseLU_gemm_kernel.h4
-rw-r--r--Eigen/src/SparseLU/SparseLU_kernel_bmod.h4
-rw-r--r--Eigen/src/SparseLU/SparseLU_panel_bmod.h2
13 files changed, 67 insertions, 46 deletions
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 39efb1d5a..f589555cc 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -365,13 +365,14 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
typedef typename Kernel::Scalar Scalar;
typedef packet_traits<Scalar> PacketTraits;
enum {
+ requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
packetSize = PacketTraits::size,
- dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
- dstAlignment = PacketTraits::AlignedOnScalar ? int(Kernel::AssignmentTraits::RequiredAlignment)
+ dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+ dstAlignment = PacketTraits::AlignedOnScalar ? int(requestedAlignment)
: int(Kernel::AssignmentTraits::DstAlignment),
srcAlignment = Kernel::AssignmentTraits::JointAlignment
};
- const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
+ const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
@@ -479,9 +480,10 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
typedef packet_traits<Scalar> PacketTraits;
enum {
packetSize = PacketTraits::size,
+ requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
alignable = PacketTraits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
- dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
- dstAlignment = alignable ? int(Kernel::AssignmentTraits::RequiredAlignment)
+ dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+ dstAlignment = alignable ? int(requestedAlignment)
: int(Kernel::AssignmentTraits::DstAlignment)
};
const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
@@ -494,7 +496,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize();
const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
- Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize);
+ Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
for(Index outer = 0; outer < outerSize; ++outer)
{
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 11e2a1809..d053911e3 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -580,33 +580,41 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
namespace internal {
-template<typename Derived, bool JustReturnZero>
+template<int Alignment, typename Derived, bool JustReturnZero>
struct first_aligned_impl
{
static inline Index run(const Derived&)
{ return 0; }
};
-template<typename Derived>
-struct first_aligned_impl<Derived, false>
+template<int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false>
{
static inline Index run(const Derived& m)
{
- return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
+ return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size());
}
};
-/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
+ *
+ * \tparam Alignment requested alignment in Bytes.
*
* There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
* documentation.
*/
-template<typename Derived>
+template<int Alignment, typename Derived>
static inline Index first_aligned(const DenseBase<Derived>& m)
{
- return first_aligned_impl
- <Derived, (evaluator<Derived>::Alignment > 0 ) || !(Derived::Flags & DirectAccessBit)> // FIXME Alignment!
- ::run(m.derived());
+ enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+ return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+
+template<typename Derived>
+static inline Index first_default_aligned(const DenseBase<Derived>& m)
+{
+ typedef typename Derived::Scalar Scalar;
+ return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(m);
}
template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0c25223aa..fa308b53e 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -221,12 +221,13 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
{
const Index size = mat.size();
- const Index packetSize = packet_traits<Scalar>::size;
- const Index alignedStart = internal::first_aligned(mat.nestedExpression());
+ const Index packetSize = packet_traits<Scalar>::size;
+ const int packetBytes = int(packetSize*sizeof(Scalar));
enum {
- alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(sizeof(Scalar)*packetSize) : int(Unaligned), // FIXME take into account alignment requirement
+ alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetBytes) : int(Unaligned), // FIXME take into account alignment requirement
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
};
+ const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
const Index alignedEnd2 = alignedStart + alignedSize2;
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index aca81f463..7fe39808b 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -178,7 +178,7 @@ MatrixBase<Derived>::stableNorm() const
if(n==1)
return abs(this->coeff(0));
- Index bi = internal::first_aligned(copy);
+ Index bi = internal::first_default_aligned(copy);
if (bi>0)
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
for (; bi<n; bi+=blockSize)
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 439f14456..8b7dca45f 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type.
- Index alignedStart = internal::first_aligned(res,size);
+ Index alignedStart = internal::first_default_aligned(res,size);
Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 5d6ef9913..f3443bd10 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -94,7 +94,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
size_t starti = FirstTriangular ? 0 : j+2;
size_t endi = FirstTriangular ? j : size;
- size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
+ size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
// TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 3d2345b66..39ab87df8 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -257,6 +257,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
+ const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
// strip zeros
Index diagSize = (std::min)(_cols,_depth);
Index rows = _rows;
@@ -311,7 +312,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
Scalar* geb = blockB+ts*ts;
- geb = geb + internal::first_aligned(geb,EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar));
+ geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 934948ebd..d00fa9707 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -230,7 +230,7 @@ class blas_data_mapper {
if (size_t(m_data)%sizeof(Scalar)) {
return -1;
}
- return internal::first_aligned(m_data, size);
+ return internal::first_default_aligned(m_data, size);
}
protected:
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index c9517acfc..957c36bcf 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -506,47 +506,56 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align
/****************************************************************************/
-/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
*
+ * \tparam Alignment requested alignment in Bytes.
* \param array the address of the start of the array
* \param size the size of the array
*
- * \note If no element of the array is well aligned, the size of the array is returned. Typically,
- * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+ * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
+ * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
* packet size for the given scalar type is 1, then everything is considered well-aligned.
*
- * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
- * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
- * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+ * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
+ * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
* example with Scalar=double on certain 32-bit platforms, see bug #79.
*
* There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+ * \sa first_default_aligned()
*/
-template<typename Scalar, typename Index>
+template<int Alignment, typename Scalar, typename Index>
inline Index first_aligned(const Scalar* array, Index size)
{
- static const Index PacketSize = packet_traits<Scalar>::size;
- static const Index PacketAlignedMask = PacketSize-1;
+ static const Index ScalarSize = sizeof(Scalar);
+ static const Index AlignmentSize = Alignment / ScalarSize;
+ static const Index AlignmentMask = AlignmentSize-1;
- if(PacketSize==1)
+ if(AlignmentSize<=1)
{
- // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
- // of the array have the same alignment.
+ // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
+ // so that all elements of the array have the same alignment.
return 0;
}
- else if(size_t(array) & (sizeof(Scalar)-1))
+ else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
{
- // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+ // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
// Consequently, no element of the array is well aligned.
return size;
}
else
{
- return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
- & PacketAlignedMask, size);
+ return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size);
}
}
+/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement.
+ * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template<typename Scalar, typename Index>
+inline Index first_default_aligned(const Scalar* array, Index size)
+{
+ return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(array, size);
+}
+
/** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
*/
template<typename Index>
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index b7b83dcd2..55de15e87 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -325,7 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
// both vectors are sequentially stored in memory => vectorization
enum { Peeling = 2 };
- Index alignedStart = internal::first_aligned(y, size);
+ Index alignedStart = internal::first_default_aligned(y, size);
Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
const Packet pc = pset1<Packet>(c);
@@ -343,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
Scalar* EIGEN_RESTRICT px = x + alignedStart;
Scalar* EIGEN_RESTRICT py = y + alignedStart;
- if(internal::first_aligned(x, size)==alignedStart)
+ if(internal::first_default_aligned(x, size)==alignedStart)
{
for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
{
diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index 1d456ee0c..ae3685ac8 100644
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -39,9 +39,9 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
};
Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking
Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once
- Index i0 = internal::first_aligned(A,m);
+ Index i0 = internal::first_default_aligned(A,m);
- eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
+ eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
// handle the non aligned rows of A and C without any optimization:
for(Index i=0; i<i0; ++i)
diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
index 9513f8369..b8ea5f50f 100644
--- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
@@ -66,8 +66,8 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index seg
const Index PacketSize = internal::packet_traits<Scalar>::size;
Index ldl = internal::first_multiple(nrow, PacketSize);
Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
- Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
- Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
+ Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize);
+ Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
l.setZero();
diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index bd3cf87b9..ca78406e0 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -145,7 +145,7 @@ void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w,
eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
- Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
+ Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
L.setZero();