-rw-r--r--  Eigen/src/Core/Assign.h             |  63
-rw-r--r--  Eigen/src/Core/Block.h              |  99
-rw-r--r--  Eigen/src/Core/Coeffs.h             | 136
-rw-r--r--  Eigen/src/Core/CwiseBinaryOp.h      |  13
-rw-r--r--  Eigen/src/Core/CwiseNullaryOp.h     |  15
-rw-r--r--  Eigen/src/Core/CwiseUnaryOp.h       |  11
-rw-r--r--  Eigen/src/Core/DiagonalCoeffs.h     |  13
-rw-r--r--  Eigen/src/Core/DiagonalProduct.h    |   3
-rw-r--r--  Eigen/src/Core/Dot.h                |  25
-rw-r--r--  Eigen/src/Core/Flagged.h            |  22
-rw-r--r--  Eigen/src/Core/Functors.h           |  13
-rw-r--r--  Eigen/src/Core/IO.h                 |  12
-rwxr-xr-x  Eigen/src/Core/InverseProduct.h     |  30
-rw-r--r--  Eigen/src/Core/Map.h                |  10
-rw-r--r--  Eigen/src/Core/Matrix.h             |  45
-rw-r--r--  Eigen/src/Core/MatrixBase.h         |  13
-rw-r--r--  Eigen/src/Core/NestByValue.h        |  22
-rw-r--r--  Eigen/src/Core/Product.h            |  18
-rw-r--r--  Eigen/src/Core/Sum.h                |  45
-rw-r--r--  Eigen/src/Core/util/Constants.h     |  73
-rw-r--r--  Eigen/src/Core/util/Meta.h          |   3
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h  |   4
-rw-r--r--  bench/benchVecAdd.cpp               | 134
23 files changed, 558 insertions(+), 264 deletions(-)
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index c28a0371b..85694b35f 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -301,45 +301,14 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = dst.size();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowMajor = Derived1::Flags&RowMajorBit;
- const int innerSize = rowMajor ? dst.cols() : dst.rows();
- const int outerSize = rowMajor ? dst.rows() : dst.cols();
- int index = 0;
-
- // do the vectorizable part of the assignment
- int row = 0;
- int col = 0;
- while (index<alignedSize)
- {
- int start = rowMajor ? col : row;
- int end = std::min(innerSize, start + alignedSize-index);
- for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
- dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
- index += (rowMajor ? col : row) - start;
- row = rowMajor ? index/innerSize : index%innerSize;
- col = rowMajor ? index%innerSize : index/innerSize;
- }
-
- // now we must do the rest without vectorization.
- if(alignedSize == size) return;
- const int k = alignedSize/innerSize;
- // do the remainder of the current row or col
- for(int i = alignedSize%innerSize; i < innerSize; i++)
+ for(int index = 0; index < alignedSize; index += packetSize)
{
- const int row = rowMajor ? k : i;
- const int col = rowMajor ? i : k;
- dst.coeffRef(row, col) = src.coeff(row, col);
+ dst.template writePacket<Aligned>(index, src.template packet<Aligned>(index));
}
- // do the remaining rows or cols
- for(int j = k+1; j < outerSize; j++)
- for(int i = 0; i < innerSize; i++)
- {
- const int row = rowMajor ? i : j;
- const int col = rowMajor ? j : i;
- dst.coeffRef(row, col) = src.coeff(row, col);
- }
+ for(int index = alignedSize; index < size; index++)
+ dst.coeffRef(index) = src.coeff(index);
}
};
@@ -351,23 +320,9 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling
const int size = Derived1::SizeAtCompileTime;
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
- const int innerSize = rowMajor ? int(Derived1::ColsAtCompileTime) : int(Derived1::RowsAtCompileTime);
- const int outerSize = rowMajor ? int(Derived1::RowsAtCompileTime) : int(Derived1::ColsAtCompileTime);
- // do the vectorizable part of the assignment
ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
-
- // now we must do the rest without vectorization.
- const int k = alignedSize/innerSize;
- const int i = alignedSize%innerSize;
-
- // do the remainder of the current row or col
- ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, k<outerSize ? innerSize : 0>::run(dst, src, k);
-
- // do the remaining rows or cols
- for(int j = k+1; j < outerSize; j++)
- ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>::run(dst, src, j);
+ ei_assign_novec_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
}
};
@@ -432,8 +387,8 @@ template<typename Derived, typename OtherDerived,
struct ei_assign_selector;
template<typename Derived, typename OtherDerived>
-struct ei_assign_selector<Derived,OtherDerived,true,true> {
- static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
+struct ei_assign_selector<Derived,OtherDerived,false,false> {
+ static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,false> {
@@ -444,8 +399,8 @@ struct ei_assign_selector<Derived,OtherDerived,false,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
};
template<typename Derived, typename OtherDerived>
-struct ei_assign_selector<Derived,OtherDerived,false,false> {
- static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
+struct ei_assign_selector<Derived,OtherDerived,true,true> {
+ static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
};
template<typename Derived>
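
With index-based (linear) access available, the assignment kernel above collapses to a flat loop: an aligned packet pass over the largest multiple of the packet size, then a scalar loop for the remainder. A minimal standalone sketch of that pattern, written against raw SSE so it compiles on its own (copy_linear and the 16-byte alignment of dst/src are assumptions of this sketch, not part of the patch):

    #include <xmmintrin.h>

    // Sketch of the linear vectorized assignment pattern: packet pass,
    // then scalar remainder. Both pointers assumed 16-byte aligned.
    void copy_linear(float* dst, const float* src, int size)
    {
      const int packetSize = 4;                                 // floats per SSE packet
      const int alignedSize = (size / packetSize) * packetSize; // vectorizable prefix
      for (int i = 0; i < alignedSize; i += packetSize)
        _mm_store_ps(dst + i, _mm_load_ps(src + i));            // like writePacket<Aligned>
      for (int i = alignedSize; i < size; ++i)                  // leftover coefficients
        dst[i] = src[i];
    }
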
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 5c9ad69d5..e9f5bab29 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -71,13 +71,12 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
|| (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
? ~LargeBit
: ~(unsigned int)0,
- MaskPacketAccessBit = ei_corrected_matrix_flags<
- Scalar, RowsAtCompileTime, ColsAtCompileTime,
- MaxRowsAtCompileTime, MaxColsAtCompileTime, MatrixType::Flags
- >::ret & PacketAccessBit,
- FlagsLinearAccessBit = MatrixType::Flags & RowMajorBit
- ? (RowsAtCompileTime == 1 ? LinearAccessBit : 0)
- : (ColsAtCompileTime == 1 ? LinearAccessBit : 0),
+ RowMajor = int(MatrixType::Flags)&RowMajorBit,
+ InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime,
+ InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime,
+ MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0)
+ ? PacketAccessBit : 0,
+ FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit)
| FlagsLinearAccessBit,
CoeffReadCost = MatrixType::CoeffReadCost
@@ -153,6 +152,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
}
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_matrix.const_cast_derived()
+ .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ }
+
+ inline const Scalar _coeff(int index) const
+ {
+ return m_matrix
+ .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ }
+
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
@@ -165,6 +179,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
m_matrix.const_cast_derived().template writePacket<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
}
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ return m_matrix.template packet<UnAligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ }
+
+ template<int LoadMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ m_matrix.const_cast_derived().template writePacket<UnAligned>
+ (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), x);
+ }
+
protected:
const typename MatrixType::Nested m_matrix;
@@ -260,22 +289,30 @@ inline const Block<Derived> MatrixBase<Derived>
* \sa class Block, block(int,int)
*/
template<typename Derived>
-inline Block<Derived> MatrixBase<Derived>::start(int size)
+inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::start(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(), 0, 0,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(), 0, 0,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of start(int).*/
template<typename Derived>
-inline const Block<Derived> MatrixBase<Derived>::start(int size) const
+inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::start(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(), 0, 0,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(), 0, 0,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a dynamic-size expression of the last coefficients of *this.
@@ -294,26 +331,34 @@ inline const Block<Derived> MatrixBase<Derived>::start(int size) const
* \sa class Block, block(int,int)
*/
template<typename Derived>
-inline Block<Derived> MatrixBase<Derived>::end(int size)
+inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::end(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(),
- RowsAtCompileTime == 1 ? 0 : rows() - size,
- ColsAtCompileTime == 1 ? 0 : cols() - size,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(),
+ RowsAtCompileTime == 1 ? 0 : rows() - size,
+ ColsAtCompileTime == 1 ? 0 : cols() - size,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of end(int).*/
template<typename Derived>
-inline const Block<Derived> MatrixBase<Derived>::end(int size) const
+inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::end(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(),
- RowsAtCompileTime == 1 ? 0 : rows() - size,
- ColsAtCompileTime == 1 ? 0 : cols() - size,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(),
+ RowsAtCompileTime == 1 ? 0 : rows() - size,
+ ColsAtCompileTime == 1 ? 0 : cols() - size,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a fixed-size expression of the first coefficients of *this.
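
The new one-argument accessors in Block assume a vector block: when RowsAtCompileTime == 1 the index advances along the columns, otherwise along the rows. A standalone sketch of that index-to-(row, col) translation (VectorBlockIndex is a hypothetical name, not part of the patch):

    // Mirrors the index -> (row, col) translation used by Block's
    // _coeff(int) / _packet(int) above.
    struct VectorBlockIndex
    {
      int startRow, startCol;
      bool isRowVector; // plays the role of RowsAtCompileTime == 1
      void map(int index, int& row, int& col) const
      {
        row = startRow + (isRowVector ? 0 : index); // column vector: walk down
        col = startCol + (isRowVector ? index : 0); // row vector: walk right
      }
    };
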
diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h
index deb015136..53b551b19 100644
--- a/Eigen/src/Core/Coeffs.h
+++ b/Eigen/src/Core/Coeffs.h
@@ -104,7 +104,7 @@ inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
* \link operator[](int) const \endlink, but without the assertion.
* Use this for limiting the performance cost of debugging code when doing
* repeated coefficient access. Only use this when it is guaranteed that the
- * parameters \a row and \a col are in range.
+ * parameter \a index is in range.
*
* If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
* function equivalent to \link operator[](int) const \endlink.
@@ -115,22 +115,13 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::coeff(int index) const
{
- EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- if(RowsAtCompileTime == 1)
- {
- ei_internal_assert(index >= 0 && index < cols());
- return coeff(0, index);
- }
- else
- {
- ei_internal_assert(index >= 0 && index < rows());
- return coeff(index, 0);
- }
+ ei_internal_assert(index >= 0 && index < size());
+ return derived()._coeff(index);
}
/** \returns the coefficient at given index.
*
- * \only_for_vectors
+ * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int), operator()(int,int) const, x() const, y() const,
* z() const, w() const
@@ -139,17 +130,8 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::operator[](int index) const
{
- EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- if(RowsAtCompileTime == 1)
- {
- ei_assert(index >= 0 && index < cols());
- return coeff(0, index);
- }
- else
- {
- ei_assert(index >= 0 && index < rows());
- return coeff(index, 0);
- }
+ ei_assert(index >= 0 && index < size());
+ return derived()._coeff(index);
}
/** Short version: don't use this function, use
@@ -170,22 +152,13 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::coeffRef(int index)
{
- EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- if(RowsAtCompileTime == 1)
- {
- ei_internal_assert(index >= 0 && index < cols());
- return coeffRef(0, index);
- }
- else
- {
- ei_internal_assert(index >= 0 && index < rows());
- return coeffRef(index, 0);
- }
+ ei_internal_assert(index >= 0 && index < size());
+ return derived()._coeffRef(index);
}
/** \returns a reference to the coefficient at given index.
*
- * \only_for_vectors
+ * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int) const, operator()(int,int), x(), y(), z(), w()
*/
@@ -193,70 +166,119 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::operator[](int index)
{
- ei_assert(IsVectorAtCompileTime);
- if(RowsAtCompileTime == 1)
- {
- ei_assert(index >= 0 && index < cols());
- return coeffRef(0, index);
- }
- else
- {
- ei_assert(index >= 0 && index < rows());
- return coeffRef(index, 0);
- }
+ ei_assert(index >= 0 && index < size());
+ return derived()._coeffRef(index);
}
-/** equivalent to operator[](0). \only_for_vectors */
+/** equivalent to operator[](0). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::x() const { return (*this)[0]; }
-/** equivalent to operator[](1). \only_for_vectors */
+/** equivalent to operator[](1). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::y() const { return (*this)[1]; }
-/** equivalent to operator[](2). \only_for_vectors */
+/** equivalent to operator[](2). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::z() const { return (*this)[2]; }
-/** equivalent to operator[](3). \only_for_vectors */
+/** equivalent to operator[](3). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::w() const { return (*this)[3]; }
-/** equivalent to operator[](0). \only_for_vectors */
+/** equivalent to operator[](0). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::x() { return (*this)[0]; }
-/** equivalent to operator[](1). \only_for_vectors */
+/** equivalent to operator[](1). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::y() { return (*this)[1]; }
-/** equivalent to operator[](2). \only_for_vectors */
+/** equivalent to operator[](2). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::z() { return (*this)[2]; }
-/** equivalent to operator[](3). \only_for_vectors */
+/** equivalent to operator[](3). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::w() { return (*this)[3]; }
+/** \returns the packet of coefficients starting at the given row and column. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit.
+ *
+ * The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int row, int col) const
-{ return derived().template _packet<LoadMode>(row,col); }
+{
+ ei_internal_assert(row >= 0 && row < rows()
+ && col >= 0 && col < cols());
+ return derived().template _packet<LoadMode>(row,col);
+}
+/** Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit.
+ *
+ * The \a StoreMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int row, int col, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
-{ derived().template _writePacket<StoreMode>(row,col,x); }
+{
+ ei_internal_assert(row >= 0 && row < rows()
+ && col >= 0 && col < cols());
+ derived().template _writePacket<StoreMode>(row,col,x);
+}
+
+/** \returns the packet of coefficients starting at the given index. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit and the LinearAccessBit.
+ *
+ * The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
+template<typename Derived>
+template<int LoadMode>
+inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
+MatrixBase<Derived>::packet(int index) const
+{
+ ei_internal_assert(index >= 0 && index < size());
+ return derived().template _packet<LoadMode>(index);
+}
+
+/** Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit and the LinearAccessBit.
+ *
+ * The \a StoreMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
+template<typename Derived>
+template<int StoreMode>
+inline void MatrixBase<Derived>::writePacket
+(int index, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
+{
+ ei_internal_assert(index >= 0 && index < size());
+ derived().template _writePacket<StoreMode>(index,x);
+}
#endif // EIGEN_COEFFS_H
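
The Aligned/UnAligned distinction documented above maps directly to the CPU's two flavors of vector load. A self-contained SSE illustration of what packet<Aligned> versus packet<UnAligned> selects for floats (load_packet is a hypothetical name):

    #include <xmmintrin.h>

    // Aligned loads require the address to be a multiple of 16 bytes and
    // are faster; unaligned loads work at any address.
    __m128 load_packet(const float* p, bool aligned)
    {
      return aligned ? _mm_load_ps(p)   // like packet<Aligned>
                     : _mm_loadu_ps(p); // like packet<UnAligned>
    }
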
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index ec4619781..6672edcbe 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -69,7 +69,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
HereditaryBits
| (int(LhsFlags) & int(RhsFlags) & LinearAccessBit)
| (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit))
- ? int(LhsFlags) & int(RhsFlags) & PacketAccessBit : 0)),
+ ? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)),
CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost
};
};
@@ -108,6 +108,17 @@ class CwiseBinaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_lhs.template packet<LoadMode>(row, col), m_rhs.template packet<LoadMode>(row, col));
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_functor(m_lhs.coeff(index), m_rhs.coeff(index));
+ }
+
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ return m_functor.packetOp(m_lhs.template packet<LoadMode>(index), m_rhs.template packet<LoadMode>(index));
+ }
+
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 069fad8a2..998b7ce56 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -50,7 +50,9 @@ struct ei_traits<CwiseNullaryOp<NullaryOp, MatrixType> >
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
Flags = (MatrixType::Flags
- & (HereditaryBits | LinearAccessBit | (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
+ & ( HereditaryBits
+ | (ei_functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
+ | (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
| (ei_functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
CoeffReadCost = ei_functor_traits<NullaryOp>::Cost
};
@@ -89,6 +91,17 @@ class CwiseNullaryOp : ei_no_assignment_operator,
return m_functor.packetOp();
}
+ const Scalar _coeff(int index) const
+ {
+ return m_functor(index);
+ }
+
+ template<int LoadMode>
+ PacketScalar _packet(int) const
+ {
+ return m_functor.packetOp();
+ }
+
protected:
const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 7c466d8c2..881853d28 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -90,6 +90,17 @@ class CwiseUnaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_matrix.template packet<LoadMode>(row, col));
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_functor(m_matrix.coeff(index));
+ }
+
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ return m_functor.packetOp(m_matrix.template packet<LoadMode>(index));
+ }
+
protected:
const typename MatrixType::Nested m_matrix;
const UnaryOp m_functor;
diff --git a/Eigen/src/Core/DiagonalCoeffs.h b/Eigen/src/Core/DiagonalCoeffs.h
index b7d3ef475..516d52526 100644
--- a/Eigen/src/Core/DiagonalCoeffs.h
+++ b/Eigen/src/Core/DiagonalCoeffs.h
@@ -56,7 +56,8 @@ struct ei_traits<DiagonalCoeffs<MatrixType> >
MaxColsAtCompileTime = 1,
Flags = (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic
? (unsigned int)_MatrixTypeNested::Flags
- : (unsigned int)_MatrixTypeNested::Flags &~ LargeBit) & HereditaryBits,
+ : (unsigned int)_MatrixTypeNested::Flags &~ LargeBit)
+ & (HereditaryBits | LinearAccessBit),
CoeffReadCost = _MatrixTypeNested::CoeffReadCost
};
};
@@ -87,6 +88,16 @@ template<typename MatrixType> class DiagonalCoeffs
return m_matrix.coeff(row, row);
}
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_matrix.const_cast_derived().coeffRef(index, index);
+ }
+
+ inline const Scalar _coeff(int index) const
+ {
+ return m_matrix.coeff(index, index);
+ }
+
protected:
const typename MatrixType::Nested m_matrix;
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index f902d7bbe..38d6ad46b 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -52,8 +52,7 @@ struct ei_traits<Product<LhsNested, RhsNested, DiagonalProduct> >
&& (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
RemovedBits = ~(((RhsFlags & RowMajorBit) && (!CanVectorizeLhs) ? 0 : RowMajorBit)
- | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit))
- | LinearAccessBit,
+ | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 8f38ee946..275a40ff2 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -175,27 +175,20 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = v1.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowVector1 = Derived1::RowsAtCompileTime == 1;
- const bool rowVector2 = Derived2::RowsAtCompileTime == 1;
Scalar res;
// do the vectorizable part of the sum
if(size >= packetSize)
{
- PacketScalar packet_res;
- packet_res = ei_pmul(
- v1.template packet<Aligned>(0, 0),
- v2.template packet<Aligned>(0, 0)
- );
+ PacketScalar packet_res = ei_pmul(
+ v1.template packet<Aligned>(0),
+ v2.template packet<Aligned>(0)
+ );
for(int index = packetSize; index<alignedSize; index += packetSize)
{
- const int row1 = rowVector1 ? 0 : index;
- const int col1 = rowVector1 ? index : 0;
- const int row2 = rowVector2 ? 0 : index;
- const int col2 = rowVector2 ? index : 0;
packet_res = ei_pmadd(
- v1.template packet<Aligned>(row1, col1),
- v2.template packet<Aligned>(row2, col2),
+ v1.template packet<Aligned>(index),
+ v2.template packet<Aligned>(index),
packet_res
);
}
@@ -213,11 +206,7 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
// do the remainder of the vector
for(int index = alignedSize; index < size; index++)
{
- const int row1 = rowVector1 ? 0 : index;
- const int col1 = rowVector1 ? index : 0;
- const int row2 = rowVector2 ? 0 : index;
- const int col2 = rowVector2 ? index : 0;
- res += v1.coeff(row1, col1) * v2.coeff(row2, col2);
+ res += v1.coeff(index) * v2.coeff(index);
}
return res;
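
The simplified dot kernel accumulates a single packet register with multiply-adds and finishes with a scalar remainder, exactly as for assignment. A standalone SSE sketch of the same structure (dot_linear is a hypothetical name; v1/v2 assumed 16-byte aligned; ei_pmadd becomes a multiply plus an add on plain SSE):

    #include <xmmintrin.h>

    float dot_linear(const float* v1, const float* v2, int size)
    {
      const int packetSize = 4;
      const int alignedSize = (size / packetSize) * packetSize;
      float res = 0.f;
      if (size >= packetSize)
      {
        __m128 acc = _mm_mul_ps(_mm_load_ps(v1), _mm_load_ps(v2));
        for (int i = packetSize; i < alignedSize; i += packetSize) // ei_pmadd step
          acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(v1 + i), _mm_load_ps(v2 + i)));
        float tmp[4];
        _mm_storeu_ps(tmp, acc);                                   // ei_predux step
        res = tmp[0] + tmp[1] + tmp[2] + tmp[3];
      }
      for (int i = alignedSize; i < size; ++i)                     // scalar remainder
        res += v1[i] * v2[i];
      return res;
    }
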
diff --git a/Eigen/src/Core/Flagged.h b/Eigen/src/Core/Flagged.h
index edf1b3fd1..8afd068c4 100644
--- a/Eigen/src/Core/Flagged.h
+++ b/Eigen/src/Core/Flagged.h
@@ -84,6 +84,16 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
return m_matrix.const_cast_derived().coeffRef(row, col);
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_matrix.coeff(index);
+ }
+
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_matrix.const_cast_derived().coeffRef(index);
+ }
+
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@@ -96,6 +106,18 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
+ template<int LoadMode>
+ inline const PacketScalar _packet(int index) const
+ {
+ return m_matrix.template packet<LoadMode>(index);
+ }
+
+ template<int LoadMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
+ }
+
protected:
ExpressionTypeNested m_matrix;
};
diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h
index ab841bae7..2f00fdd05 100644
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -302,13 +302,13 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl<Scalar, NumTraits<Scala
// nullary functors
-template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_constant_op;
+template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_constant_op;
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,true> {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
inline ei_scalar_constant_op(const Scalar& other) : m_other(ei_pset1(other)) { }
- inline const Scalar operator() (int, int) const { return ei_pfirst(m_other); }
+ inline const Scalar operator() (int, int = 0) const { return ei_pfirst(m_other); }
inline const PacketScalar packetOp() const
{ return m_other; }
const PacketScalar m_other;
@@ -316,7 +316,7 @@ struct ei_scalar_constant_op<Scalar,true> {
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,false> {
inline ei_scalar_constant_op(const Scalar& other) : m_other(other) { }
- inline const Scalar operator() (int, int) const { return m_other; }
+ inline const Scalar operator() (int, int = 0) const { return m_other; }
const Scalar m_other;
};
template<typename Scalar>
@@ -331,4 +331,11 @@ template<typename Scalar>
struct ei_functor_traits<ei_scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
+// NOTE quick hack:
+// all functors allow linear access, except ei_scalar_identity_op. So we define here a quick meta
+// to indicate whether a given functor allows linear access, always answering 'yes' except for
+// ei_scalar_identity_op.
+template<typename Functor> struct ei_functor_has_linear_access { enum { ret = 1 }; };
+template<typename Scalar> struct ei_functor_has_linear_access<ei_scalar_identity_op<Scalar> > { enum { ret = 0 }; };
+
#endif // EIGEN_FUNCTORS_H
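
The "quick meta" above is the usual primary-template-plus-specialization idiom: the primary template answers yes for every functor, and a single specialization opts ei_scalar_identity_op out. A generic standalone sketch of the idiom (has_linear_access and identity_like are hypothetical names):

    // Primary template: the default compile-time answer for all functors.
    template<typename Functor> struct has_linear_access { enum { ret = 1 }; };

    // One functor opts out, mirroring ei_scalar_identity_op above.
    struct identity_like {};
    template<> struct has_linear_access<identity_like> { enum { ret = 0 }; };

    // Used at compile time, e.g. in ei_traits<CwiseNullaryOp>:
    //   Flags = ... | (has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
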
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index f7094a4a1..762e1fb31 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -31,14 +31,14 @@
*/
template<typename Derived>
std::ostream & operator <<
-( std::ostream & s,
- const MatrixBase<Derived> & m )
+(std::ostream & s,
+ const MatrixBase<Derived> & m)
{
- for( int i = 0; i < m.rows(); i++ )
+ for(int i = 0; i < m.rows(); i++)
{
- s << m( i, 0 );
- for (int j = 1; j < m.cols(); j++ )
- s << " " << m( i, j );
+ s << m.coeff(i, 0);
+ for(int j = 1; j < m.cols(); j++)
+ s << " " << m.coeff(i, j);
if( i < m.rows() - 1)
s << "\n";
}
diff --git a/Eigen/src/Core/InverseProduct.h b/Eigen/src/Core/InverseProduct.h
index 4a3579bbf..40496e01d 100755
--- a/Eigen/src/Core/InverseProduct.h
+++ b/Eigen/src/Core/InverseProduct.h
@@ -55,33 +55,33 @@ typename OtherDerived::Eval MatrixBase<Derived>::inverseProduct(const MatrixBase
{
// forward substitution
if(Flags & UnitDiagBit)
- res.coeffRef(0,c) = other.coeff(0,c);
+ res.coeffRef(0,c) = other.coeff(0,c);
else
- res.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
+ res.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
for(int i=1; i<rows(); ++i)
{
- Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * res.col(c).start(i)).coeff(0,0);
- if (Flags & UnitDiagBit)
- res.coeffRef(i,c) = tmp;
- else
- res.coeffRef(i,c) = tmp/coeff(i,i);
+ Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * res.col(c).start(i)).coeff(0,0);
+ if (Flags & UnitDiagBit)
+ res.coeffRef(i,c) = tmp;
+ else
+ res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
else
{
// backward substitution
if(Flags & UnitDiagBit)
- res.coeffRef(cols()-1,c) = other.coeff(cols()-1,c);
+ res.coeffRef(cols()-1,c) = other.coeff(cols()-1,c);
else
- res.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
+ res.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
for(int i=rows()-2 ; i>=0 ; --i)
{
- Scalar tmp = other.coeff(i,c)
- - ((this->row(i).end(cols()-i-1)) * res.col(c).end(cols()-i-1)).coeff(0,0);
- if (Flags & UnitDiagBit)
- res.coeffRef(i,c) = tmp;
- else
- res.coeffRef(i,c) = tmp/coeff(i,i);
+ Scalar tmp = other.coeff(i,c)
+ - ((this->row(i).end(cols()-i-1)) * res.col(c).end(cols()-i-1)).coeff(0,0);
+ if (Flags & UnitDiagBit)
+ res.coeffRef(i,c) = tmp;
+ else
+ res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
}
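
For reference, the re-indented loops implement column-by-column forward and backward substitution; each unknown is computed from the already-solved prefix (the row(i).start(i) * res.col(c).start(i) product). A minimal standalone version of the forward case for a dense lower triangular matrix (forward_subst is a hypothetical name, not part of the patch):

    // Solve L * x = b, with L lower triangular (row-major, nonzero diagonal).
    void forward_subst(const float* L, const float* b, float* x, int n)
    {
      for (int i = 0; i < n; ++i)
      {
        float tmp = b[i];
        for (int j = 0; j < i; ++j)   // dot of row prefix with solved prefix
          tmp -= L[i * n + j] * x[j];
        x[i] = tmp / L[i * n + i];    // division skipped when UnitDiagBit is set
      }
    }
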
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index f4c660af9..bcd72bdb8 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -80,6 +80,16 @@ template<typename MatrixType> class Map
return const_cast<Scalar*>(m_data)[row + col * m_rows];
}
+ inline const Scalar& _coeff(int index) const
+ {
+ return m_data[index];
+ }
+
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_data[index];
+ }
+
public:
inline Map(const Scalar* data, int rows, int cols) : m_data(data), m_rows(rows), m_cols(cols)
{
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 11fadf49b..3d037cbee 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -128,6 +128,11 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
+ inline const Scalar& _coeff(int index) const
+ {
+ return m_storage.data()[index];
+ }
+
inline Scalar& _coeffRef(int row, int col)
{
if(Flags & RowMajorBit)
@@ -136,20 +141,33 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_storage.data()[index];
+ }
+
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
- ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (LoadMode==Aligned)
- return ei_pload(&m_storage.data()[col + row * m_storage.cols()]);
+ return ei_pload(m_storage.data() + col + row * m_storage.cols());
else
- return ei_ploadu(&m_storage.data()[col + row * m_storage.cols()]);
+ return ei_ploadu(m_storage.data() + col + row * m_storage.cols());
else
if (LoadMode==Aligned)
- return ei_pload(&m_storage.data()[row + col * m_storage.rows()]);
+ return ei_pload(m_storage.data() + row + col * m_storage.rows());
else
- return ei_ploadu(&m_storage.data()[row + col * m_storage.rows()]);
+ return ei_ploadu(m_storage.data() + row + col * m_storage.rows());
+ }
+
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ if (LoadMode==Aligned)
+ return ei_pload(m_storage.data() + index);
+ else
+ return ei_ploadu(m_storage.data() + index);
}
template<int StoreMode>
@@ -158,14 +176,23 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (StoreMode==Aligned)
- ei_pstore(&m_storage.data()[col + row * m_storage.cols()], x);
+ ei_pstore(m_storage.data() + col + row * m_storage.cols(), x);
else
- ei_pstoreu(&m_storage.data()[col + row * m_storage.cols()], x);
+ ei_pstoreu(m_storage.data() + col + row * m_storage.cols(), x);
else
if (StoreMode==Aligned)
- ei_pstore(&m_storage.data()[row + col * m_storage.rows()], x);
+ ei_pstore(m_storage.data() + row + col * m_storage.rows(), x);
+ else
+ ei_pstoreu(m_storage.data() + row + col * m_storage.rows(), x);
+ }
+
+ template<int StoreMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ if (StoreMode==Aligned)
+ ei_pstore(m_storage.data() + index, x);
else
- ei_pstoreu(&m_storage.data()[row + col * m_storage.rows()], x);
+ ei_pstoreu(m_storage.data() + index, x);
}
public:
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 5698b7602..f503ebcbf 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -228,6 +228,11 @@ template<typename Derived> class MatrixBase
template<int StoreMode>
void writePacket(int row, int col, const PacketScalar& x);
+ template<int LoadMode>
+ PacketScalar packet(int index) const;
+ template<int StoreMode>
+ void writePacket(int index, const PacketScalar& x);
+
const Scalar x() const;
const Scalar y() const;
const Scalar z() const;
@@ -307,11 +312,11 @@ template<typename Derived> class MatrixBase
Block<Derived> block(int start, int size);
const Block<Derived> block(int start, int size) const;
- Block<Derived> start(int size);
- const Block<Derived> start(int size) const;
+ typename SubVectorReturnType<Dynamic>::Type start(int size);
+ const typename SubVectorReturnType<Dynamic>::Type start(int size) const;
- Block<Derived> end(int size);
- const Block<Derived> end(int size) const;
+ typename SubVectorReturnType<Dynamic>::Type end(int size);
+ const typename SubVectorReturnType<Dynamic>::Type end(int size) const;
Block<Derived> corner(CornerType type, int cRows, int cCols);
const Block<Derived> corner(CornerType type, int cRows, int cCols) const;
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 0c6cdb114..a63202dfd 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -76,6 +76,16 @@ template<typename ExpressionType> class NestByValue
return m_expression.const_cast_derived().coeffRef(row, col);
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_expression.coeff(index);
+ }
+
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_expression.const_cast_derived().coeffRef(index);
+ }
+
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@@ -88,6 +98,18 @@ template<typename ExpressionType> class NestByValue
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
+ template<int LoadMode>
+ inline const PacketScalar _packet(int index) const
+ {
+ return m_expression.template packet<LoadMode>(index);
+ }
+
+ template<int LoadMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
+ }
+
protected:
const ExpressionType m_expression;
};
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 1e90d2ef9..df35ffc4f 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -151,8 +151,7 @@ struct ei_traits<Product<LhsNested, RhsNested, ProductMode> >
&& (ProductMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!CanVectorizeLhs)),
RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
- | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)
- | LinearAccessBit),
+ | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| EvalBeforeAssigningBit
@@ -224,6 +223,18 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
+  /* Allow index-based non-packet access. It is however impossible to allow index-based packet access,
+ * which is why we don't set the LinearAccessBit.
+ */
+ const Scalar _coeff(int index) const
+ {
+ Scalar res;
+ const int row = RowsAtCompileTime == 1 ? 0 : index;
+ const int col = RowsAtCompileTime == 1 ? index : 0;
+ ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
template<int LoadMode>
const PacketScalar _packet(int row, int col) const
{
@@ -235,9 +246,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
- template<typename Lhs_, typename Rhs_, int ProductMode_, typename DestDerived_, bool DirectAccess_>
- friend struct ei_cache_friendly_selector;
-
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;
diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h
index ccdb6f4a3..af35d20f9 100644
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -186,34 +186,15 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
const int size = mat.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowMajor = Derived::Flags&RowMajorBit;
- const int innerSize = rowMajor ? mat.cols() : mat.rows();
- const int outerSize = rowMajor ? mat.rows() : mat.cols();
Scalar res;
- // do the vectorizable part of the sum
if(size >= packetSize)
{
- PacketScalar packet_res;
- packet_res = mat.template packet<Aligned>(0, 0);
- int row = 0;
- int col = 0;
- int index = packetSize;
- while (index<alignedSize)
- {
- row = rowMajor ? index/innerSize : index%innerSize;
- col = rowMajor ? index%innerSize : index/innerSize;
- int start = rowMajor ? col : row;
- int end = std::min(innerSize, start+alignedSize-index);
- if (end<start) getchar();
- for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
- packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
- index += (rowMajor ? col : row) - start;
- }
- res = ei_predux(packet_res);
+ PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
+ for(int index = packetSize; index < alignedSize; index += packetSize)
+ packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
- // now we must do the rest without vectorization.
- if(alignedSize == size) return res;
+ res = ei_predux(packet_res);
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
@@ -221,25 +202,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
res = Scalar(0);
}
- const int k = alignedSize/innerSize;
-
- // do the remainder of the current row or col
- for(int i = alignedSize%innerSize; i < innerSize; i++)
+ for(int index = alignedSize; index < size; index++)
{
- const int row = rowMajor ? k : i;
- const int col = rowMajor ? i : k;
- res += mat.coeff(row, col);
+ res += mat.coeff(index);
}
- // do the remaining rows or cols
- for(int j = k+1; j < outerSize; j++)
- for(int i = 0; i < innerSize; i++)
- {
- const int row = rowMajor ? i : j;
- const int col = rowMajor ? j : i;
- res += mat.coeff(row, col);
- }
-
return res;
}
};
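
After the packet loop, ei_predux performs the horizontal add that folds the accumulator register into one scalar. A self-contained SSE sketch of that reduction for four floats (predux4 is a hypothetical name):

    #include <xmmintrin.h>

    // Horizontal sum of one SSE register: (v0+v2) + (v1+v3).
    float predux4(__m128 v)
    {
      __m128 t = _mm_add_ps(v, _mm_movehl_ps(v, v)); // lanes: v0+v2, v1+v3, ...
      t = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));    // add the two partial sums
      float r;
      _mm_store_ss(&r, t);
      return r;
    }
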
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index ac1b583fa..716d86243 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -56,26 +56,60 @@ const unsigned int EvalBeforeNestingBit = 0x2;
 * means the expression should be evaluated before any assignment */
const unsigned int EvalBeforeAssigningBit = 0x4;
-/** \ingroup flags
- *
- * currently unused. Means the matrix probably has a very big size.
- * Could eventually be used as a hint to determine which algorithms
- * to use. */
-const unsigned int LargeBit = 0x8;
-
#ifdef EIGEN_VECTORIZE
/** \ingroup flags
*
- * means the expression might be vectorized */
-const unsigned int PacketAccessBit = 0x10;
+ * Short version: means the expression might be vectorized
+ *
+ * Long version: means that the coefficients can be handled by packets
+ * and start at a memory location whose alignment meets the requirements
+ * of the present CPU architecture for optimized packet access. In the fixed-size
+ * case, there is the additional condition that the total size of the coefficients
+ * array is a multiple of the packet size, so that it is possible to access all the
+ * coefficients by packets. In the dynamic-size case, there is no such condition
+ * on the total size, so it might not be possible to access the last few coefficients
+ * by packets.
+ *
+ * \note If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
+ * is set to the value 0.
+ */
+const unsigned int PacketAccessBit = 0x8;
#else
const unsigned int PacketAccessBit = 0x0;
#endif
/** \ingroup flags
*
- * means the expression can be seen as 1D vector (used for explicit vectorization) */
-const unsigned int LinearAccessBit = 0x20;
+ * Short version: means the expression can be seen as a 1D vector.
+ *
+ * Long version: means that one can access the coefficients
+ * of this expression by coeff(int), and coeffRef(int) in the case of an lvalue expression. These
+ * index-based access methods are guaranteed
+ * never to require any runtime computation of a (row, col)-pair from the index, so that
+ * whenever it is available, index-based access is at least as fast as
+ * (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
+ *
+ * If both PacketAccessBit and LinearAccessBit are set, then the
+ * packets of this expression can be accessed by packet(int), and writePacket(int) in the case of an
+ * lvalue expression.
+ *
+ * Typically, all vector expressions have the LinearAccessBit, but there is one exception:
+ * Product expressions don't have it, because it would be troublesome for vectorization, even when the
+ * Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
+ * not index-based packet access, so they don't have the LinearAccessBit.
+ */
+const unsigned int LinearAccessBit = 0x10;
+
+/** \ingroup flags
+ *
+ * Means that the underlying array of coefficients can be directly accessed. This means two things.
+ * First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only
+ * expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the
+ * array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit.
+ * This rules out expressions such as DiagonalCoeffs, whose coefficients, though referenceable, do not have
+ * such a regular memory layout.
+ */
+const unsigned int DirectAccessBit = 0x20;
/** \ingroup flags
*
@@ -104,17 +138,17 @@ const unsigned int LowerTriangularBit = 0x400;
/** \ingroup flags
*
- * means the underlying matrix data can be direclty accessed (contrary to certain
- * expressions where the matrix coefficients need to be computed rather than just read from
- * memory) */
-const unsigned int DirectAccessBit = 0x800;
-
-/** \ingroup flags
- *
* means the object is just an array of scalars, and operations on it are regarded as operations
 * on each of these scalars taken separately.
*/
-const unsigned int ArrayBit = 0x1000;
+const unsigned int ArrayBit = 0x800;
+
+/** \ingroup flags
+ *
+ * currently unused. Means the matrix probably has a very big size.
+ * Could eventually be used as a hint to determine which algorithms
+ * to use. */
+const unsigned int LargeBit = 0x1000;
// list of flags that are inherited by default
const unsigned int HereditaryBits = RowMajorBit
@@ -135,7 +169,6 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
const unsigned int Diagonal = Upper | Lower;
-
enum { Aligned=0, UnAligned=1 };
enum { ConditionalJumpCost = 5 };
enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
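
An expression's Flags word is simply the bitwise OR of these constants, and capability tests are bitwise ANDs that can be evaluated inside compile-time enums. A short standalone sketch (the two constants are redefined here so the snippet compiles on its own; the flags value is a made-up example):

    // Mirrors of the constants defined above, for a standalone snippet.
    const unsigned int PacketAccessBit = 0x8;
    const unsigned int LinearAccessBit = 0x10;

    // A hypothetical expression advertising both capabilities:
    const unsigned int flags = PacketAccessBit | LinearAccessBit;

    // packet(int)/writePacket(int) are only legal when both bits are set:
    const bool canUseIndexBasedPackets =
        (flags & PacketAccessBit) && (flags & LinearAccessBit);
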
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 078beb681..5d809f619 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -155,7 +155,8 @@ class ei_corrected_matrix_flags
// so let us strictly honor the user's choice.
? SuggestedFlags&RowMajorBit
: Cols > 1 ? RowMajorBit : 0,
- is_big = MaxRows == Dynamic || MaxCols == Dynamic,
+ inner_max_size = row_major_bit ? MaxCols : MaxRows,
+ is_big = inner_max_size == Dynamic,
linear_size = Cols * Rows,
packet_access_bit
= ei_packet_traits<Scalar>::size > 1
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 37ae1ed82..f74bc7775 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -75,7 +75,9 @@
// static assertion failing if the type \a TYPE is not a vector type
-#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime,you_tried_calling_a_vector_method_on_a_matrix)
+#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
+ EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
+ you_tried_calling_a_vector_method_on_a_matrix)
// static assertion failing if the two vector expression types are not compatible (same fixed-size or dynamic size)
#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \
diff --git a/bench/benchVecAdd.cpp b/bench/benchVecAdd.cpp
new file mode 100644
index 000000000..aa211dce0
--- /dev/null
+++ b/bench/benchVecAdd.cpp
@@ -0,0 +1,134 @@
+
+#include <Eigen/Core>
+#include <bench/BenchTimer.h>
+using namespace Eigen;
+
+#ifndef SIZE
+#define SIZE 50
+#endif
+
+#ifndef REPEAT
+#define REPEAT 10000
+#endif
+
+typedef float Scalar;
+
+__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
+__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
+__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
+
+int main(int argc, char* argv[])
+{
+ int size = SIZE * 8;
+ int size2 = size * size;
+ Scalar* a = ei_aligned_malloc<Scalar>(size2);
+ Scalar* b = ei_aligned_malloc<Scalar>(size2);
+ Scalar* c = ei_aligned_malloc<Scalar>(size2);
+
+  // initialize the full buffers (they hold size2 scalars each)
+  for (int i=0; i<size2; ++i)
+ {
+ a[i] = b[i] = c[i] = 0;
+ }
+
+ BenchTimer timer;
+
+ timer.reset();
+ for (int k=0; k<3; ++k)
+ {
+ timer.start();
+ benchVec(a, b, c, size2);
+ timer.stop();
+ }
+ std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
+
+ for (int innersize = size; innersize>2 ; --innersize)
+ {
+ if (size2%innersize==0)
+ {
+ int outersize = size2/innersize;
+ MatrixXf ma = MatrixXf::map(a, innersize, outersize );
+ MatrixXf mb = MatrixXf::map(b, innersize, outersize );
+ MatrixXf mc = MatrixXf::map(c, innersize, outersize );
+ timer.reset();
+ for (int k=0; k<3; ++k)
+ {
+ timer.start();
+ benchVec(ma, mb, mc);
+ timer.stop();
+ }
+ std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
+ }
+ }
+
+ VectorXf va = VectorXf::map(a, size2);
+ VectorXf vb = VectorXf::map(b, size2);
+ VectorXf vc = VectorXf::map(c, size2);
+ timer.reset();
+ for (int k=0; k<3; ++k)
+ {
+ timer.start();
+ benchVec(va, vb, vc);
+ timer.stop();
+ }
+ std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
+
+ return 0;
+}
+
+void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
+{
+ for (int k=0; k<REPEAT; ++k)
+ a = a + b;
+}
+
+void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
+{
+ for (int k=0; k<REPEAT; ++k)
+ a = a + b;
+}
+
+void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
+{
+ typedef ei_packet_traits<Scalar>::type PacketScalar;
+ const int PacketSize = ei_packet_traits<Scalar>::size;
+ PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
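+    // The loads and stores below are interleaved by hand (software
+    // pipelining): each iteration stores packets loaded earlier while
+    // the next loads are already issued, hiding memory latency.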
+ for (int k=0; k<REPEAT; ++k)
+ for (int i=0; i<size; i+=PacketSize*8)
+ {
+ a0 = ei_pload(&a[i]);
+ b0 = ei_pload(&b[i]);
+ a1 = ei_pload(&a[i+1*PacketSize]);
+ b1 = ei_pload(&b[i+1*PacketSize]);
+ a2 = ei_pload(&a[i+2*PacketSize]);
+ b2 = ei_pload(&b[i+2*PacketSize]);
+ a3 = ei_pload(&a[i+3*PacketSize]);
+ b3 = ei_pload(&b[i+3*PacketSize]);
+ ei_pstore(&a[i], ei_padd(a0, b0));
+ a0 = ei_pload(&a[i+4*PacketSize]);
+ b0 = ei_pload(&b[i+4*PacketSize]);
+
+ ei_pstore(&a[i+1*PacketSize], ei_padd(a1, b1));
+ a1 = ei_pload(&a[i+5*PacketSize]);
+ b1 = ei_pload(&b[i+5*PacketSize]);
+
+ ei_pstore(&a[i+2*PacketSize], ei_padd(a2, b2));
+ a2 = ei_pload(&a[i+6*PacketSize]);
+ b2 = ei_pload(&b[i+6*PacketSize]);
+
+ ei_pstore(&a[i+3*PacketSize], ei_padd(a3, b3));
+ a3 = ei_pload(&a[i+7*PacketSize]);
+ b3 = ei_pload(&b[i+7*PacketSize]);
+
+ ei_pstore(&a[i+4*PacketSize], ei_padd(a0, b0));
+ ei_pstore(&a[i+5*PacketSize], ei_padd(a1, b1));
+ ei_pstore(&a[i+6*PacketSize], ei_padd(a2, b2));
+ ei_pstore(&a[i+7*PacketSize], ei_padd(a3, b3));
+
+// ei_pstore(&a[i+2*PacketSize], ei_padd(ei_pload(&a[i+2*PacketSize]), ei_pload(&b[i+2*PacketSize])));
+// ei_pstore(&a[i+3*PacketSize], ei_padd(ei_pload(&a[i+3*PacketSize]), ei_pload(&b[i+3*PacketSize])));
+// ei_pstore(&a[i+4*PacketSize], ei_padd(ei_pload(&a[i+4*PacketSize]), ei_pload(&b[i+4*PacketSize])));
+// ei_pstore(&a[i+5*PacketSize], ei_padd(ei_pload(&a[i+5*PacketSize]), ei_pload(&b[i+5*PacketSize])));
+// ei_pstore(&a[i+6*PacketSize], ei_padd(ei_pload(&a[i+6*PacketSize]), ei_pload(&b[i+6*PacketSize])));
+// ei_pstore(&a[i+7*PacketSize], ei_padd(ei_pload(&a[i+7*PacketSize]), ei_pload(&b[i+7*PacketSize])));
+ }
+}