about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- Eigen/src/Core/Assign.h 35
-rw-r--r-- Eigen/src/Core/Product.h 55
-rw-r--r-- Eigen/src/Core/arch/SSE/PacketMath.h 2
3 files changed, 59 insertions, 33 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index d5604824f..1c292d104 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -52,6 +52,9 @@ private:
InnerSize = int(Derived::Flags)&RowMajorBit
? Derived::ColsAtCompileTime
: Derived::RowsAtCompileTime,
+ InnerMaxSize = int(Derived::Flags)&RowMajorBit
+ ? Derived::MaxColsAtCompileTime
+ : Derived::MaxRowsAtCompileTime,
PacketSize = ei_packet_traits<typename Derived::Scalar>::size
};
@@ -60,7 +63,9 @@ private:
&& ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
- MaySliceVectorize = MightVectorize && InnerSize==Dynamic
+ MaySliceVectorize = MightVectorize && InnerMaxSize==Dynamic /* slice vectorization can be slow, so we only
+ want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
+ of a dynamic block in a fixed-size matrix */
};
public:
@@ -349,7 +354,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
template<typename Derived1, typename Derived2>
struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling>
{
- inline static void run(Derived1 &dst, const Derived2 &src)
+ static void run(Derived1 &dst, const Derived2 &src)
{
const int size = Derived1::SizeAtCompileTime;
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
@@ -383,8 +388,30 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
{
static void run(Derived1 &dst, const Derived2 &src)
{
- //FIXME unimplemented, so for now we fall back to non-vectorized path
- ei_assign_impl<Derived1, Derived2, NoVectorization, NoUnrolling>::run(dst, src);
+ const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+ const bool rowMajor = Derived1::Flags&RowMajorBit;
+ const int innerSize = rowMajor ? dst.cols() : dst.rows();
+ const int outerSize = rowMajor ? dst.rows() : dst.cols();
+ const int alignedInnerSize = (innerSize/packetSize)*packetSize;
+
+ for(int i = 0; i < outerSize; i++)
+ {
+ // do the vectorizable part of the assignment
+ for (int index = 0; index<alignedInnerSize ; index+=packetSize)
+ {
+ const int row = rowMajor ? i : index;
+ const int col = rowMajor ? index : i;
+ dst.template writePacket<UnAligned>(row, col, src.template packet<UnAligned>(row, col));
+ }
+
+ // do the non-vectorizable part of the assignment
+ for (int index = alignedInnerSize; index<innerSize ; index++)
+ {
+ const int row = rowMajor ? i : index;
+ const int col = rowMajor ? index : i;
+ dst.coeffRef(row, col) = src.coeff(row, col);
+ }
+ }
}
};
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 1f387af32..1e90d2ef9 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -38,7 +38,7 @@ enum {
template<int VectorizationMode, int Index, typename Lhs, typename Rhs>
struct ei_product_coeff_impl;
-template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar>
+template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl;
template<typename T> class ei_product_eval_to_column_major;
@@ -188,10 +188,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested> ScalarCoeffImpl;
- typedef ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
- Unroll ? InnerSize-1 : Dynamic,
- _LhsNested, _RhsNested, PacketScalar> PacketCoeffImpl;
-
public:
template<typename Lhs, typename Rhs>
@@ -232,7 +228,10 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
const PacketScalar _packet(int row, int col) const
{
PacketScalar res;
- PacketCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+ ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
+ Unroll ? InnerSize-1 : Dynamic,
+ _LhsNested, _RhsNested, PacketScalar, LoadMode>
+ ::run(row, col, m_lhs, m_rhs, res);
return res;
}
@@ -356,63 +355,63 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
*** Packet path ***
*******************/
-template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar>
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
- ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
- res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<Aligned>(Index, col), res);
+ ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
+ res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
}
};
-template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar>
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
- ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
- res = ei_pmadd(lhs.template packet<Aligned>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
+ ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
+ res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar>
+template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
- res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
+ res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar>
+template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
- res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
+ res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
}
};
-template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar>
+template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
{
- res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
+ res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(int i = 1; i < lhs.cols(); i++)
- res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<Aligned>(i, col), res);
+ res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar>
+template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
{
- res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
+ res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
for(int i = 1; i < lhs.cols(); i++)
- res = ei_pmadd(lhs.template packet<Aligned>(row, i), ei_pset1(rhs.coeff(i, col)), res);
+ res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
}
};
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index ffd6aebeb..03fa6bce5 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -94,7 +94,7 @@ inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinter
inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
-inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }