path: root/Eigen/src
Diffstat (limited to 'Eigen/src')
-rw-r--r--  Eigen/src/Core/ArrayBase.h | 8
-rw-r--r--  Eigen/src/Core/Assign.h | 40
-rw-r--r--  Eigen/src/Core/CwiseBinaryOp.h | 32
-rw-r--r--  Eigen/src/Core/DenseStorageBase.h | 4
-rw-r--r--  Eigen/src/Core/DiagonalProduct.h | 2
-rw-r--r--  Eigen/src/Core/Functors.h | 154
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 27
-rw-r--r--  Eigen/src/Core/MapBase.h | 4
-rw-r--r--  Eigen/src/Core/Product.h | 88
-rw-r--r--  Eigen/src/Core/SelfCwiseBinaryOp.h | 34
-rw-r--r--  Eigen/src/Core/SolveTriangular.h | 23
-rw-r--r--  Eigen/src/Core/Transpose.h | 6
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h | 2
-rw-r--r--  Eigen/src/Core/arch/AltiVec/PacketMath.h | 24
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h | 7
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 43
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h | 98
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h | 16
-rw-r--r--  Eigen/src/Core/arch/SSE/PacketMath.h | 65
-rw-r--r--  Eigen/src/Core/products/CoeffBasedProduct.h | 80
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 1357
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h | 132
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector.h | 361
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 111
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixVector.h | 16
-rw-r--r--  Eigen/src/Core/products/SelfadjointProduct.h | 42
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix.h | 57
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixVector.h | 11
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix.h | 49
-rw-r--r--  Eigen/src/Core/util/BlasUtil.h | 87
-rw-r--r--  Eigen/src/Core/util/Constants.h | 4
-rw-r--r--  Eigen/src/Core/util/ForwardDeclarations.h | 8
-rw-r--r--  Eigen/src/Core/util/Macros.h | 8
-rw-r--r--  Eigen/src/Core/util/Meta.h | 8
-rw-r--r--  Eigen/src/Jacobi/Jacobi.h | 20
35 files changed, 1792 insertions(+), 1236 deletions(-)
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index 1a54b2335..941e39938 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -178,7 +178,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
{
- SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived> tmp(derived());
+ SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other;
return derived();
}
@@ -192,7 +192,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
{
- SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived> tmp(derived());
+ SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
return derived();
}
@@ -206,7 +206,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
{
- SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived> tmp(derived());
+ SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
return derived();
}
@@ -220,7 +220,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
{
- SelfCwiseBinaryOp<ei_scalar_quotient_op<Scalar>, Derived> tmp(derived());
+ SelfCwiseBinaryOp<ei_scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
return derived();
}
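
At the call site nothing changes for the user; the extra template argument only records the right-hand side type so the wrapper itself can check scalar compatibility in its lazyAssign. A minimal sketch of what the compound assignment expands to (ArrayXf/ArrayXd are the usual Eigen typedefs; the expansion in the comment mirrors the lines above):

    Eigen::ArrayXf a = Eigen::ArrayXf::Ones(4);
    Eigen::ArrayXf b = Eigen::ArrayXf::Constant(4, 2.f);

    a += b;   // roughly: SelfCwiseBinaryOp<ei_scalar_sum_op<float>, ArrayXf, ArrayXf> tmp(a);
              //          tmp = b;   // evaluated in place, no temporary array

    Eigen::ArrayXd d(4);
    // a += d;  // now rejected inside SelfCwiseBinaryOp::lazyAssign (scalar types differ)
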
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 2a7ca4786..335a888f6 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -256,6 +256,12 @@ struct ei_assign_impl;
*** Default traversal ***
************************/
+template<typename Derived1, typename Derived2, int Unrolling>
+struct ei_assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling>
+{
+ inline static void run(Derived1 &, const Derived2 &) { }
+};
+
template<typename Derived1, typename Derived2>
struct ei_assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling>
{
@@ -397,7 +403,12 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src)
{
const Index size = dst.size();
- const Index packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+ typedef ei_packet_traits<typename Derived1::Scalar> PacketTraits;
+ enum {
+ packetSize = PacketTraits::size,
+ dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(ei_assign_traits<Derived1,Derived2>::DstIsAligned) ,
+ srcAlignment = ei_assign_traits<Derived1,Derived2>::JointAlignment
+ };
const Index alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
: ei_first_aligned(&dst.coeffRef(0), size);
const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
@@ -406,7 +417,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling
for(Index index = alignedStart; index < alignedEnd; index += packetSize)
{
- dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::JointAlignment>(index, src);
+ dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
}
ei_unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
@@ -438,12 +449,18 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling>
typedef typename Derived1::Index Index;
inline static void run(Derived1 &dst, const Derived2 &src)
{
- const Index packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+ typedef ei_packet_traits<typename Derived1::Scalar> PacketTraits;
+ enum {
+ packetSize = PacketTraits::size,
+ alignable = PacketTraits::AlignedOnScalar,
+ dstAlignment = alignable ? Aligned : int(ei_assign_traits<Derived1,Derived2>::DstIsAligned) ,
+ srcAlignment = ei_assign_traits<Derived1,Derived2>::JointAlignment
+ };
const Index packetAlignedMask = packetSize - 1;
const Index innerSize = dst.innerSize();
const Index outerSize = dst.outerSize();
- const Index alignedStep = (packetSize - dst.outerStride() % packetSize) & packetAlignedMask;
- Index alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+ const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
+ Index alignedStart = ((!alignable) || ei_assign_traits<Derived1,Derived2>::DstIsAligned) ? 0
: ei_first_aligned(&dst.coeffRef(0,0), innerSize);
for(Index outer = 0; outer < outerSize; ++outer)
@@ -475,14 +492,21 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
::lazyAssign(const DenseBase<OtherDerived>& other)
{
+ enum{
+ SameType = ei_is_same_type<typename Derived::Scalar,typename OtherDerived::Scalar>::ret
+ };
+
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
- EIGEN_STATIC_ASSERT((ei_is_same_type<typename Derived::Scalar, typename OtherDerived::Scalar>::ret),
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+
+
#ifdef EIGEN_DEBUG_ASSIGN
ei_assign_traits<Derived, OtherDerived>::debug();
#endif
ei_assert(rows() == other.rows() && cols() == other.cols());
- ei_assign_impl<Derived, OtherDerived>::run(derived(),other.derived());
+ ei_assign_impl<Derived, OtherDerived, int(SameType) ? int(ei_assign_traits<Derived, OtherDerived>::Traversal)
+ : int(InvalidTraversal)>::run(derived(),other.derived());
#ifndef EIGEN_NO_DEBUG
checkTransposeAliasing(other.derived());
#endif
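
The empty InvalidTraversal specialization added above exists only so a mixed-scalar assignment never instantiates the real assignment loop: when SameType is false, lazyAssign dispatches to the no-op run() and the only diagnostic the user sees is the static assertion. A user-level sketch of the two outcomes:

    Eigen::VectorXd d = Eigen::VectorXd::Random(3);
    Eigen::VectorXf f(3);

    // f = d;              // fails to compile with only
    //                     // YOU_MIXED_DIFFERENT_NUMERIC_TYPES__..._CAST_NUMERIC_TYPES_EXPLICITLY
    f = d.cast<float>();   // the explicit cast the message asks for
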
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 7f4587b53..171140c27 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -80,13 +80,14 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
RhsCoeffReadCost = _RhsNested::CoeffReadCost,
LhsFlags = _LhsNested::Flags,
RhsFlags = _RhsNested::Flags,
+ SameType = ei_is_same_type<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::ret,
StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
HereditaryBits
| (int(LhsFlags) & int(RhsFlags) &
( AlignedBit
| (StorageOrdersAgree ? LinearAccessBit : 0)
- | (ei_functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree ? PacketAccessBit : 0)
+ | (ei_functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
)
)
),
@@ -95,6 +96,19 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
};
};
+// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
+// that would take two operands of different types. If there were such an example, then this check should be
+// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
+// currently they take only one typename Scalar template parameter.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
+// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
+// add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
+ EIGEN_STATIC_ASSERT((ei_functor_allows_mixing_real_and_complex<BINOP>::ret \
+ ? int(ei_is_same_type<typename NumTraits<LHS>::Real, typename NumTraits<RHS>::Real>::ret) \
+ : int(ei_is_same_type<LHS, RHS>::ret)), \
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
class CwiseBinaryOpImpl;
@@ -121,17 +135,7 @@ class CwiseBinaryOp : ei_no_assignment_operator,
EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& lhs, const Rhs& rhs, const BinaryOp& func = BinaryOp())
: m_lhs(lhs), m_rhs(rhs), m_functor(func)
{
- // we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
- // that would take two operands of different types. If there were such an example, then this check should be
- // moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
- // currently they take only one typename Scalar template parameter.
- // It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
- // So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
- // add together a float matrix and a double matrix.
- EIGEN_STATIC_ASSERT((ei_functor_allows_mixing_real_and_complex<BinaryOp>::ret
- ? int(ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret)
- : int(ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret)),
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
// require the sizes to match
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs)
ei_assert(lhs.rows() == rhs.rows() && lhs.cols() == rhs.cols());
@@ -211,7 +215,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
{
- SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived> tmp(derived());
+ SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other;
return derived();
}
@@ -225,7 +229,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
{
- SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived> tmp(derived());
+ SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
return derived();
}
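
Hoisting the check into EIGEN_CHECK_BINARY_COMPATIBILIY lets SelfCwiseBinaryOp reuse it (see SelfCwiseBinaryOp.h below); the rule itself is unchanged: identical scalar types are required, except for functors that set ei_functor_allows_mixing_real_and_complex, which only need matching real scalar types. A sketch of what the check accepts and rejects (whether a given mixed expression then compiles end to end still depends on the rest of this patch):

    Eigen::VectorXf  vf(3);
    Eigen::VectorXd  vd(3);
    Eigen::VectorXcf vcf(3);

    // vf + vd;                       // rejected: a sum requires identical scalar types
    vf = vf + vd.cast<float>();       // fine after an explicit cast
    // For ei_scalar_product_op the macro only compares NumTraits<>::Real, so a
    // float / std::complex<float> product passes this particular check, while a
    // double / std::complex<float> product does not.
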
diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h
index 9b4a645e6..e818d40a1 100644
--- a/Eigen/src/Core/DenseStorageBase.h
+++ b/Eigen/src/Core/DenseStorageBase.h
@@ -108,7 +108,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
{
- return ei_ploadt<Scalar, LoadMode>
+ return ei_ploadt<PacketScalar, LoadMode>
(m_storage.data() + (Flags & RowMajorBit
? col + row * m_storage.cols()
: row + col * m_storage.rows()));
@@ -117,7 +117,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
{
- return ei_ploadt<Scalar, LoadMode>(m_storage.data() + index);
+ return ei_ploadt<PacketScalar, LoadMode>(m_storage.data() + index);
}
template<int StoreMode>
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index 610d13dc8..14c49e828 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -91,7 +91,7 @@ class DiagonalProduct : ei_no_assignment_operator,
EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_true) const
{
return ei_pmul(m_matrix.template packet<LoadMode>(row, col),
- ei_pset1(m_diagonal.diagonal().coeff(id)));
+ ei_pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
}
template<int LoadMode>
diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h
index c3a3a92c4..41ae4af42 100644
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
@@ -35,11 +35,11 @@
template<typename Scalar> struct ei_scalar_sum_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_sum_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return ei_padd(a,b); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
{ return ei_predux(a); }
};
template<typename Scalar>
@@ -55,21 +55,25 @@ struct ei_functor_traits<ei_scalar_sum_op<Scalar> > {
*
* \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
*/
-template<typename Scalar> struct ei_scalar_product_op {
+template<typename LhsScalar,typename RhsScalar> struct ei_scalar_product_op {
+ enum {
+ Vectorizable = ei_is_same_type<LhsScalar,RhsScalar>::ret && ei_packet_traits<LhsScalar>::HasMul && ei_packet_traits<RhsScalar>::HasMul
+ };
+ typedef typename ei_scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_product_op)
- EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a * b; }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+ EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return ei_pmul(a,b); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return ei_predux_mul(a); }
};
-template<typename Scalar>
-struct ei_functor_traits<ei_scalar_product_op<Scalar> > {
+template<typename LhsScalar,typename RhsScalar>
+struct ei_functor_traits<ei_scalar_product_op<LhsScalar,RhsScalar> > {
enum {
- Cost = NumTraits<Scalar>::MulCost,
- PacketAccess = ei_packet_traits<Scalar>::HasMul
+ Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
+ PacketAccess = ei_scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
};
};
@@ -83,9 +87,9 @@ template<typename Scalar> struct ei_scalar_conj_product_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conj_product_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const
{ return ei_conj_helper<Scalar,Scalar,Conj,false>().pmul(a,b); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
- { return ei_conj_helper<PacketScalar,PacketScalar,Conj,false>().pmul(a,b); }
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+ { return ei_conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
};
template<typename Scalar>
struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > {
@@ -103,11 +107,11 @@ struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > {
template<typename Scalar> struct ei_scalar_min_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_min_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::min(a, b); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return ei_pmin(a,b); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
{ return ei_predux_min(a); }
};
template<typename Scalar>
@@ -126,11 +130,11 @@ struct ei_functor_traits<ei_scalar_min_op<Scalar> > {
template<typename Scalar> struct ei_scalar_max_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_max_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::max(a, b); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return ei_pmax(a,b); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
{ return ei_predux_max(a); }
};
template<typename Scalar>
@@ -172,8 +176,8 @@ struct ei_functor_traits<ei_scalar_hypot_op<Scalar> > {
template<typename Scalar> struct ei_scalar_difference_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_difference_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return ei_psub(a,b); }
};
template<typename Scalar>
@@ -192,8 +196,8 @@ struct ei_functor_traits<ei_scalar_difference_op<Scalar> > {
template<typename Scalar> struct ei_scalar_quotient_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_quotient_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a / b; }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return ei_pdiv(a,b); }
};
template<typename Scalar>
@@ -214,8 +218,8 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
template<typename Scalar> struct ei_scalar_opposite_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_opposite_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return ei_pnegate(a); }
};
template<typename Scalar>
@@ -234,8 +238,8 @@ template<typename Scalar> struct ei_scalar_abs_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs(a); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return ei_pabs(a); }
};
template<typename Scalar>
@@ -256,8 +260,8 @@ template<typename Scalar> struct ei_scalar_abs2_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs2_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs2(a); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return ei_pmul(a,a); }
};
template<typename Scalar>
@@ -272,8 +276,8 @@ struct ei_functor_traits<ei_scalar_abs2_op<Scalar> >
template<typename Scalar> struct ei_scalar_conjugate_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conjugate_op)
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return ei_conj(a); }
- template<typename PacketScalar>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return ei_pconj(a); }
+ template<typename Packet>
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pconj(a); }
};
template<typename Scalar>
struct ei_functor_traits<ei_scalar_conjugate_op<Scalar> >
@@ -397,22 +401,22 @@ struct ei_functor_traits<ei_scalar_log_op<Scalar> >
* \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
*/
/* NOTE why doing the ei_pset1() in packetOp *is* an optimization ?
- * indeed it seems better to declare m_other as a PacketScalar and do the ei_pset1() once
+ * indeed it seems better to declare m_other as a Packet and do the ei_pset1() once
* in the constructor. However, in practice:
- * - GCC does not like m_other as a PacketScalar and generate a load every time it needs it
+ * - GCC does not like m_other as a Packet and generates a load every time it needs it
* - on the other hand GCC is able to move the ei_pset1() away from the loop :)
* - simpler code ;)
* (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
*/
template<typename Scalar>
struct ei_scalar_multiple_op {
- typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+ typedef typename ei_packet_traits<Scalar>::type Packet;
// FIXME default copy constructors seems bugged with std::complex<>
EIGEN_STRONG_INLINE ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { }
EIGEN_STRONG_INLINE ei_scalar_multiple_op(const Scalar& other) : m_other(other) { }
EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
- { return ei_pmul(a, ei_pset1(m_other)); }
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+ { return ei_pmul(a, ei_pset1<Packet>(m_other)); }
typename ei_makeconst<typename NumTraits<Scalar>::Nested>::type m_other;
};
template<typename Scalar>
@@ -433,13 +437,13 @@ struct ei_functor_traits<ei_scalar_multiple2_op<Scalar1,Scalar2> >
template<typename Scalar, bool IsInteger>
struct ei_scalar_quotient1_impl {
- typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+ typedef typename ei_packet_traits<Scalar>::type Packet;
// FIXME default copy constructors seems bugged with std::complex<>
EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {}
EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
- EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
- { return ei_pmul(a, ei_pset1(m_other)); }
+ EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+ { return ei_pmul(a, ei_pset1<Packet>(m_other)); }
const Scalar m_other;
};
template<typename Scalar>
@@ -480,13 +484,13 @@ struct ei_functor_traits<ei_scalar_quotient1_op<Scalar> >
template<typename Scalar>
struct ei_scalar_constant_op {
- typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+ typedef typename ei_packet_traits<Scalar>::type Packet;
EIGEN_STRONG_INLINE ei_scalar_constant_op(const ei_scalar_constant_op& other) : m_other(other.m_other) { }
EIGEN_STRONG_INLINE ei_scalar_constant_op(const Scalar& other) : m_other(other) { }
template<typename Index>
EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
template<typename Index>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(Index, Index = 0) const { return ei_pset1(m_other); }
+ EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return ei_pset1<Packet>(m_other); }
const Scalar m_other;
};
template<typename Scalar>
@@ -513,22 +517,22 @@ template <typename Scalar, bool RandomAccess> struct ei_linspaced_op_impl;
template <typename Scalar>
struct ei_linspaced_op_impl<Scalar,false>
{
- typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+ typedef typename ei_packet_traits<Scalar>::type Packet;
ei_linspaced_op_impl(Scalar low, Scalar step) :
m_low(low), m_step(step),
- m_packetStep(ei_pset1(ei_packet_traits<Scalar>::size*step)),
- m_base(ei_padd(ei_pset1(low),ei_pmul(ei_pset1(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {}
+ m_packetStep(ei_pset1<Packet>(ei_packet_traits<Scalar>::size*step)),
+ m_base(ei_padd(ei_pset1<Packet>(low),ei_pmul(ei_pset1<Packet>(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {}
template<typename Index>
EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
template<typename Index>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); }
+ EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); }
const Scalar m_low;
const Scalar m_step;
- const PacketScalar m_packetStep;
- mutable PacketScalar m_base;
+ const Packet m_packetStep;
+ mutable Packet m_base;
};
// random access for packet ops:
@@ -537,23 +541,23 @@ struct ei_linspaced_op_impl<Scalar,false>
template <typename Scalar>
struct ei_linspaced_op_impl<Scalar,true>
{
- typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+ typedef typename ei_packet_traits<Scalar>::type Packet;
ei_linspaced_op_impl(Scalar low, Scalar step) :
m_low(low), m_step(step),
- m_lowPacket(ei_pset1(m_low)), m_stepPacket(ei_pset1(m_step)), m_interPacket(ei_plset<Scalar>(0)) {}
+ m_lowPacket(ei_pset1<Packet>(m_low)), m_stepPacket(ei_pset1<Packet>(m_step)), m_interPacket(ei_plset<Scalar>(0)) {}
template<typename Index>
EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
template<typename Index>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i) const
- { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Scalar>(i),m_interPacket))); }
+ EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
+ { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Packet>(i),m_interPacket))); }
const Scalar m_low;
const Scalar m_step;
- const PacketScalar m_lowPacket;
- const PacketScalar m_stepPacket;
- const PacketScalar m_interPacket;
+ const Packet m_lowPacket;
+ const Packet m_stepPacket;
+ const Packet m_interPacket;
};
// ----- Linspace functor ----------------------------------------------------------------
@@ -566,12 +570,12 @@ template <typename Scalar, bool RandomAccess> struct ei_functor_traits< ei_linsp
{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
template <typename Scalar, bool RandomAccess> struct ei_linspaced_op
{
- typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+ typedef typename ei_packet_traits<Scalar>::type Packet;
ei_linspaced_op(Scalar low, Scalar high, int num_steps) : impl(low, (high-low)/(num_steps-1)) {}
template<typename Index>
EIGEN_STRONG_INLINE const Scalar operator() (Index i, Index = 0) const { return impl(i); }
template<typename Index>
- EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i, Index = 0) const { return impl.packetOp(i); }
+ EIGEN_STRONG_INLINE const Packet packetOp(Index i, Index = 0) const { return impl.packetOp(i); }
// This proxy object handles the actual required temporaries, the different
// implementations (random vs. sequential access) as well as the
// correct piping to size 2/4 packet operations.
@@ -581,13 +585,15 @@ template <typename Scalar, bool RandomAccess> struct ei_linspaced_op
// all functors allow linear access, except ei_scalar_identity_op. So we fix here a quick meta
// to indicate whether a functor allows linear access, just always answering 'yes' except for
// ei_scalar_identity_op.
+// FIXME move this to ei_functor_traits adding a ei_functor_default
template<typename Functor> struct ei_functor_has_linear_access { enum { ret = 1 }; };
template<typename Scalar> struct ei_functor_has_linear_access<ei_scalar_identity_op<Scalar> > { enum { ret = 0 }; };
// in CwiseBinaryOp, we require the Lhs and Rhs to have the same scalar type, except for multiplication
// where we only require them to have the same _real_ scalar type so one may multiply, say, float by complex<float>.
+// FIXME move this to ei_functor_traits adding a ei_functor_default
template<typename Functor> struct ei_functor_allows_mixing_real_and_complex { enum { ret = 0 }; };
-template<typename Scalar> struct ei_functor_allows_mixing_real_and_complex<ei_scalar_product_op<Scalar> > { enum { ret = 1 }; };
+template<typename LhsScalar,typename RhsScalar> struct ei_functor_allows_mixing_real_and_complex<ei_scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
/** \internal
@@ -597,13 +603,13 @@ template<typename Scalar> struct ei_functor_allows_mixing_real_and_complex<ei_sc
/* If you wonder why doing the ei_pset1() in packetOp() is an optimization check ei_scalar_multiple_op */
template<typename Scalar>
struct ei_scalar_add_op {
- typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+ typedef typename ei_packet_traits<Scalar>::type Packet;
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { }
inline ei_scalar_add_op(const Scalar& other) : m_other(other) { }
inline Scalar operator() (const Scalar& a) const { return a + m_other; }
- inline const PacketScalar packetOp(const PacketScalar& a) const
- { return ei_padd(a, ei_pset1(m_other)); }
+ inline const Packet packetOp(const Packet& a) const
+ { return ei_padd(a, ei_pset1<Packet>(m_other)); }
const Scalar m_other;
};
template<typename Scalar>
@@ -690,9 +696,9 @@ template<typename Scalar>
struct ei_scalar_inverse_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_inverse_op)
inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
- template<typename PacketScalar>
- inline const PacketScalar packetOp(const PacketScalar& a) const
- { return ei_pdiv(ei_pset1(Scalar(1)),a); }
+ template<typename Packet>
+ inline const Packet packetOp(const Packet& a) const
+ { return ei_pdiv(ei_pset1<Packet>(Scalar(1)),a); }
};
template<typename Scalar>
struct ei_functor_traits<ei_scalar_inverse_op<Scalar> >
@@ -706,8 +712,8 @@ template<typename Scalar>
struct ei_scalar_square_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_square_op)
inline Scalar operator() (const Scalar& a) const { return a*a; }
- template<typename PacketScalar>
- inline const PacketScalar packetOp(const PacketScalar& a) const
+ template<typename Packet>
+ inline const Packet packetOp(const Packet& a) const
{ return ei_pmul(a,a); }
};
template<typename Scalar>
@@ -722,8 +728,8 @@ template<typename Scalar>
struct ei_scalar_cube_op {
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_cube_op)
inline Scalar operator() (const Scalar& a) const { return a*a*a; }
- template<typename PacketScalar>
- inline const PacketScalar packetOp(const PacketScalar& a) const
+ template<typename Packet>
+ inline const Packet packetOp(const Packet& a) const
{ return ei_pmul(a,ei_pmul(a,a)); }
};
template<typename Scalar>
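
ei_scalar_product_op now carries both operand scalar types, gets its result type from ei_scalar_product_traits, and only declares itself vectorizable when the two types coincide. A small sketch of the consequences (the ReturnType values in the comments are what ei_scalar_product_traits is assumed to provide; its specializations are not part of this file):

    typedef ei_scalar_product_op<float, float>                FxF;
    typedef ei_scalar_product_op<float, std::complex<float> > FxC;

    FxF ff;
    float r1 = ff(2.f, 3.f);                                          // result_type == float

    FxC fc;
    std::complex<float> r2 = fc(2.f, std::complex<float>(0.f, 1.f));  // assumed result_type: complex<float>

    // ei_functor_traits<FxF>::PacketAccess -> 1 when ei_packet_traits<float>::HasMul
    // ei_functor_traits<FxC>::PacketAccess -> 0 (mixed scalar types are never vectorized here)
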
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 77b1d748e..8ace18174 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -84,7 +84,8 @@ template<typename T> struct ei_packet_traits : ei_default_packet_traits
typedef T type;
enum {
Vectorizable = 0,
- size = 1
+ size = 1,
+ AlignedOnScalar = 0
};
enum {
HasAdd = 0,
@@ -159,16 +160,20 @@ template<typename Packet> inline Packet
ei_pandnot(const Packet& a, const Packet& b) { return a & (!b); }
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
-template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
-ei_pload(const Scalar* from) { return *from; }
+template<typename Packet> inline Packet
+ei_pload(const typename ei_unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet version of \a *from, (un-aligned load) */
-template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
-ei_ploadu(const Scalar* from) { return *from; }
+template<typename Packet> inline Packet
+ei_ploadu(const typename ei_unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet with elements of \a *from duplicated, e.g.: (from[0],from[0],from[1],from[1]) */
+template<typename Packet> inline Packet
+ei_ploaddup(const typename ei_unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
-template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
-ei_pset1(const Scalar& a) { return a; }
+template<typename Packet> inline Packet
+ei_pset1(const typename ei_unpacket_traits<Packet>::type& a) { return a; }
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
@@ -255,13 +260,13 @@ ei_pmadd(const Packet& a,
/** \internal \returns a packet version of \a *from.
* If LoadMode equals Aligned, \a from must be 16 bytes aligned */
-template<typename Scalar, int LoadMode>
-inline typename ei_packet_traits<Scalar>::type ei_ploadt(const Scalar* from)
+template<typename Packet, int LoadMode>
+inline Packet ei_ploadt(const typename ei_unpacket_traits<Packet>::type* from)
{
if(LoadMode == Aligned)
- return ei_pload(from);
+ return ei_pload<Packet>(from);
else
- return ei_ploadu(from);
+ return ei_ploadu<Packet>(from);
}
/** \internal copy the packet \a from to \a *to.
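
All load/broadcast primitives are now templated on the packet type instead of the scalar type, which is what the ei_ploadt<PacketScalar, LoadMode> and ei_pset1<Packet> call sites elsewhere in this patch rely on. A usage sketch, assuming the SSE Packet4f specializations (SSE/PacketMath.h is changed by this patch but not shown here):

    EIGEN_ALIGN16 float data[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };

    Packet4f a = ei_pload<Packet4f>(data);            // aligned load
    Packet4f u = ei_ploadu<Packet4f>(data + 1);       // unaligned load
    Packet4f s = ei_pset1<Packet4f>(0.5f);            // broadcast (0.5, 0.5, 0.5, 0.5)
    Packet4f t = ei_ploadt<Packet4f, Aligned>(data);  // dispatches to ei_pload
    Packet4f d = ei_ploaddup<Packet4f>(data);         // new primitive: (1, 1, 2, 2)
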
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 6af7bb793..740f2b247 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -124,14 +124,14 @@ template<typename Derived> class MapBase
template<int LoadMode>
inline PacketScalar packet(Index row, Index col) const
{
- return ei_ploadt<Scalar, LoadMode>
+ return ei_ploadt<PacketScalar, LoadMode>
(m_data + (col * colStride() + row * rowStride()));
}
template<int LoadMode>
inline PacketScalar packet(Index index) const
{
- return ei_ploadt<Scalar, LoadMode>(m_data + index * innerStride());
+ return ei_ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
}
template<int StoreMode>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 139132c6b..25b48517d 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -282,10 +282,13 @@ class GeneralProduct<Lhs, Rhs, GemvProduct>
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+
GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
{
- EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret),
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+// EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret),
+// YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
}
enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
@@ -319,43 +322,66 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
template<typename ProductType, typename Dest>
static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha)
{
- typedef typename ProductType::Scalar Scalar;
+ typedef typename ProductType::Index Index;
+ typedef typename ProductType::LhsScalar LhsScalar;
+ typedef typename ProductType::RhsScalar RhsScalar;
+ typedef typename ProductType::Scalar ResScalar;
+ typedef typename ProductType::RealScalar RealScalar;
typedef typename ProductType::ActualLhsType ActualLhsType;
typedef typename ProductType::ActualRhsType ActualRhsType;
typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
- typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest;
+ typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
- Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
- * RhsBlasTraits::extractScalarFactor(prod.rhs());
+ ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+ * RhsBlasTraits::extractScalarFactor(prod.rhs());
enum {
// FIXME find a way to allow an inner stride on the result if ei_packet_traits<Scalar>::size==1
- EvalToDest = Dest::InnerStrideAtCompileTime==1
+ EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+ ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex)
};
- Scalar* EIGEN_RESTRICT actualDest;
- if (EvalToDest)
+ bool alphaIsCompatible = (!ComplexByReal) || (ei_imag(actualAlpha)==RealScalar(0));
+ bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+ RhsScalar compatibleAlpha = ei_get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+ ResScalar* actualDest;
+ if (evalToDest)
+ {
actualDest = &dest.coeffRef(0);
+ }
else
{
- actualDest = ei_aligned_stack_new(Scalar,dest.size());
- MappedDest(actualDest, dest.size()) = dest;
+ actualDest = ei_aligned_stack_new(ResScalar,dest.size());
+ if(!alphaIsCompatible)
+ {
+ MappedDest(actualDest, dest.size()).setZero();
+ compatibleAlpha = RhsScalar(1);
+ }
+ else
+ MappedDest(actualDest, dest.size()) = dest;
}
- ei_cache_friendly_product_colmajor_times_vector
- <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>(
- dest.size(),
- &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
- actualRhs, actualDest, actualAlpha);
+ ei_general_matrix_vector_product
+ <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+ actualLhs.rows(), actualLhs.cols(),
+ &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
+ actualRhs.data(), actualRhs.innerStride(),
+ actualDest, 1,
+ compatibleAlpha);
- if (!EvalToDest)
+ if (!evalToDest)
{
- dest = MappedDest(actualDest, dest.size());
- ei_aligned_stack_delete(Scalar, actualDest, dest.size());
+ if(!alphaIsCompatible)
+ dest += actualAlpha * MappedDest(actualDest, dest.size());
+ else
+ dest = MappedDest(actualDest, dest.size());
+ ei_aligned_stack_delete(ResScalar, actualDest, dest.size());
}
}
};
@@ -365,7 +391,9 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
template<typename ProductType, typename Dest>
static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha)
{
- typedef typename ProductType::Scalar Scalar;
+ typedef typename ProductType::LhsScalar LhsScalar;
+ typedef typename ProductType::RhsScalar RhsScalar;
+ typedef typename ProductType::Scalar ResScalar;
typedef typename ProductType::Index Index;
typedef typename ProductType::ActualLhsType ActualLhsType;
typedef typename ProductType::ActualRhsType ActualRhsType;
@@ -376,34 +404,34 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
- Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
- * RhsBlasTraits::extractScalarFactor(prod.rhs());
+ ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+ * RhsBlasTraits::extractScalarFactor(prod.rhs());
enum {
// FIXME I think here we really have to check for ei_packet_traits<Scalar>::size==1
// because in this case it is fine to have an inner stride
- DirectlyUseRhs = ((ei_packet_traits<Scalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit))
+ DirectlyUseRhs = ((ei_packet_traits<RhsScalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit))
&& (!(_ActualRhsType::Flags & RowMajorBit))
};
- Scalar* EIGEN_RESTRICT rhs_data;
+ RhsScalar* rhs_data;
if (DirectlyUseRhs)
- rhs_data = reinterpret_cast<Scalar* EIGEN_RESTRICT>(&actualRhs.const_cast_derived().coeffRef(0));
+ rhs_data = &actualRhs.const_cast_derived().coeffRef(0);
else
{
- rhs_data = ei_aligned_stack_new(Scalar, actualRhs.size());
- Map<typename _ActualRhsType::PlainObject>(reinterpret_cast<Scalar*>(rhs_data), actualRhs.size()) = actualRhs;
+ rhs_data = ei_aligned_stack_new(RhsScalar, actualRhs.size());
+ Map<typename _ActualRhsType::PlainObject>(rhs_data, actualRhs.size()) = actualRhs;
}
- ei_cache_friendly_product_rowmajor_times_vector
- <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>(
+ ei_general_matrix_vector_product
+ <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
&actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
rhs_data, 1,
&dest.coeffRef(0,0), dest.innerStride(),
actualAlpha);
- if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size());
+ if (!DirectlyUseRhs) ei_aligned_stack_delete(RhsScalar, rhs_data, prod.rhs().size());
}
};
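
The new alphaIsCompatible logic covers the complex-lhs-times-real-rhs GEMV when alpha itself is complex: such an alpha cannot be folded into the real-typed factor the kernel accepts, so the kernel runs with alpha = 1 into a zeroed temporary and the complex scaling is applied to the result afterwards. A standalone sketch of that fallback pattern (plain C++, not the Eigen internals themselves):

    #include <complex>
    #include <cstddef>
    #include <vector>
    typedef std::complex<float> cf;

    // stand-in for a kernel that can only scale by a *real* alpha
    void kernel(float alpha, const std::vector<cf>& in, std::vector<cf>& out)
    {
        for (std::size_t i = 0; i < out.size(); ++i) out[i] += alpha * in[i];
    }

    void accumulate(cf actualAlpha, const std::vector<cf>& in, std::vector<cf>& dest)
    {
        if (actualAlpha.imag() == 0.f) {                 // alphaIsCompatible
            kernel(actualAlpha.real(), in, dest);
        } else {                                         // run with alpha = 1, scale afterwards
            std::vector<cf> tmp(dest.size(), cf(0.f, 0.f));
            kernel(1.f, in, tmp);
            for (std::size_t i = 0; i < dest.size(); ++i)
                dest[i] += actualAlpha * tmp[i];         // dest += actualAlpha * MappedDest(...)
        }
    }
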
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index acdbb8658..de8c4e740 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -39,14 +39,19 @@
*
* \sa class SwapWrapper for a similar trick.
*/
-template<typename BinaryOp, typename MatrixType>
-struct ei_traits<SelfCwiseBinaryOp<BinaryOp,MatrixType> > : ei_traits<MatrixType>
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct ei_traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
+ : ei_traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
{
-
+ enum {
+ Flags = ei_traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit),
+ OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
+ InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
+ };
};
-template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
- : public ei_dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, MatrixType> >::type
+template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
+ : public ei_dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
{
public:
@@ -57,7 +62,7 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
using Base::operator=;
- inline SelfCwiseBinaryOp(MatrixType& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
+ inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
@@ -122,12 +127,8 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
template<typename RhsDerived>
EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
{
- EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(MatrixType,RhsDerived)
-
- EIGEN_STATIC_ASSERT((ei_functor_allows_mixing_real_and_complex<BinaryOp>::ret
- ? int(ei_is_same_type<typename MatrixType::RealScalar, typename RhsDerived::RealScalar>::ret)
- : int(ei_is_same_type<typename MatrixType::Scalar, typename RhsDerived::Scalar>::ret)),
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
+ EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
#ifdef EIGEN_DEBUG_ASSIGN
ei_assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
@@ -141,7 +142,7 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
}
protected:
- MatrixType& m_matrix;
+ Lhs& m_matrix;
const BinaryOp& m_functor;
private:
@@ -151,8 +152,8 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
template<typename Derived>
inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{
- SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived> tmp(derived());
typedef typename Derived::PlainObject PlainObject;
+ SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(),other);
return derived();
}
@@ -160,10 +161,11 @@ inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
template<typename Derived>
inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{
- SelfCwiseBinaryOp<typename ei_meta_if<NumTraits<Scalar>::IsInteger,
+ typedef typename ei_meta_if<NumTraits<Scalar>::IsInteger,
ei_scalar_quotient_op<Scalar>,
- ei_scalar_product_op<Scalar> >::ret, Derived> tmp(derived());
+ ei_scalar_product_op<Scalar> >::ret BinOp;
typedef typename Derived::PlainObject PlainObject;
+ SelfCwiseBinaryOp<BinOp, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(), NumTraits<Scalar>::IsInteger ? other : Scalar(1)/other);
return derived();
}
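
The operator/= rewrite keeps the long-standing trick visible in the last line of the hunk: for non-integer scalars the division by a constant is turned into a multiplication by its reciprocal, while integer scalars keep a true quotient. A user-level sketch:

    Eigen::VectorXf vf = Eigen::VectorXf::Constant(4, 10.f);
    vf /= 4.f;    // non-integer Scalar: internally vf *= (1.f / 4.f)  -> 2.5

    Eigen::VectorXi vi = Eigen::VectorXi::Constant(4, 10);
    vi /= 4;      // integer Scalar: true quotient op                  -> 2
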
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 90ce2a802..960da31f3 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -53,7 +53,8 @@ struct ei_triangular_solver_selector;
template<typename Lhs, typename Rhs, int Mode>
struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor,1>
{
- typedef typename Rhs::Scalar Scalar;
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
typedef ei_blas_traits<Lhs> LhsProductTraits;
typedef typename LhsProductTraits::ExtractType ActualLhsType;
typedef typename Lhs::Index Index;
@@ -81,12 +82,12 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor
Index startRow = IsLower ? pi : pi-actualPanelWidth;
Index startCol = IsLower ? 0 : pi;
- ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>(
+ ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,LhsProductTraits::NeedToConjugate,RhsScalar,false>::run(
actualPanelWidth, r,
&(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(),
&(other.coeffRef(startCol)), other.innerStride(),
&other.coeffRef(startRow), other.innerStride(),
- Scalar(-1));
+ RhsScalar(-1));
}
for(Index k=0; k<actualPanelWidth; ++k)
@@ -107,13 +108,12 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor
template<typename Lhs, typename Rhs, int Mode>
struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,ColMajor,1>
{
- typedef typename Rhs::Scalar Scalar;
- typedef typename ei_packet_traits<Scalar>::type Packet;
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
typedef ei_blas_traits<Lhs> LhsProductTraits;
typedef typename LhsProductTraits::ExtractType ActualLhsType;
typedef typename Lhs::Index Index;
enum {
- PacketSize = ei_packet_traits<Scalar>::size,
IsLower = ((Mode&Lower)==Lower)
};
@@ -148,12 +148,11 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,ColMajor
// let's directly call the low level product function because:
// 1 - it is faster to compile
// 2 - it is slightly faster at runtime
- ei_cache_friendly_product_colmajor_times_vector<LhsProductTraits::NeedToConjugate,false>(
- r,
- &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(),
- other.segment(startBlock, actualPanelWidth),
- &(other.coeffRef(endBlock, 0)),
- Scalar(-1));
+ ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,LhsProductTraits::NeedToConjugate,RhsScalar,false>::run(
+ r, actualPanelWidth,
+ &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(),
+ &other.coeff(startBlock), other.innerStride(),
+ &(other.coeffRef(endBlock, 0)), other.innerStride(), RhsScalar(-1));
}
}
}
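
Both branches now call the same ei_general_matrix_vector_product kernel that the GEMV selectors in Product.h use; its interface (defined in GeneralMatrixVector.h, which is part of this patch but not shown here) takes raw pointers plus strides. A small sketch, only to make the argument order used at these call sites concrete:

    typedef int Index;
    float lhs[6] = { 1.f, 2.f, 3.f,     // column 0 of a 3x2 column-major block
                     4.f, 5.f, 6.f };   // column 1, outer stride 3
    float rhs[2] = { 1.f, 1.f };
    float res[3] = { 0.f, 0.f, 0.f };

    ei_general_matrix_vector_product<Index, float, ColMajor, false, float, false>::run(
        3, 2,        // rows, cols
        lhs, 3,      // lhs pointer, outer (column) stride
        rhs, 1,      // rhs pointer, inner stride
        res, 1,      // result pointer, inner stride
        1.f);        // alpha;  res becomes { 5, 7, 9 }
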
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 3a84b84ff..de8ae0a5b 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -303,11 +303,11 @@ inline void MatrixBase<Derived>::adjointInPlace()
// The following is to detect aliasing problems in most common cases.
-template<typename BinOp,typename NestedXpr>
-struct ei_blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr> >
+template<typename BinOp,typename NestedXpr,typename Rhs>
+struct ei_blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
: ei_blas_traits<NestedXpr>
{
- typedef SelfCwiseBinaryOp<BinOp,NestedXpr> XprType;
+ typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
static inline const XprType extract(const XprType& x) { return x; }
};
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 2dba95a2f..ecada02f4 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -63,7 +63,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr
template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from)
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
/* On AltiVec we cannot load 64-bit registers, so we have to take care of alignment */
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index ad694150a..8205beae5 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -59,13 +59,13 @@ typedef __vector unsigned char Packet16uc;
Packet4i ei_p4i_##NAME = vec_splat_s32(X)
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+ Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+ Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X)
#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
@@ -89,6 +89,7 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits
typedef Packet4f type;
enum {
Vectorizable = 1,
+ AlignedOnScalar = 1,
size=4,
// FIXME check the Has*
@@ -105,6 +106,7 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits
enum {
// FIXME check the Has*
Vectorizable = 1,
+ AlignedOnScalar = 1,
size=4
};
};
@@ -156,7 +158,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
return s;
}
*/
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) {
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) {
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
float EIGEN_ALIGN16 af[4];
af[0] = from;
@@ -165,7 +167,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) {
return vc;
}
-template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) {
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) {
int EIGEN_ALIGN16 ai[4];
ai[0] = from;
Packet4i vc = vec_ld(0, ai);
@@ -173,8 +175,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) {
return vc;
}
-template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1<Packet4f>(a), ei_p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1<Packet4i>(a), ei_p4i_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
@@ -239,7 +241,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con
template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ ei_assert(false && "packet integer division are not supported by AltiVec");
- return ei_pset1<int>(0);
+ return ei_pset1<Packet4i>(0);
}
// for some weird reasons, it has to be overloaded for packets of integers
@@ -265,10 +267,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, con
template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
@@ -280,7 +282,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
}
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index bf68a2bbb..9678040e7 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -58,7 +58,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr
template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from)
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from)
{
float32x2_t r64;
r64 = vld1_f32((float *)&from);
@@ -141,6 +141,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a)
return Packet2cf(a_r128);
}
+EIGEN_STRONG_INLINE Packet2cf ei_pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  return Packet2cf(vrev64q_f32(x.v));
+}
+
template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a)
{
float32x2_t a1, a2;
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 66a6a30ee..8220ed07c 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -45,13 +45,13 @@ typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+ const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+ const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X)
#ifndef __pld
#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
@@ -62,6 +62,7 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits
typedef Packet4f type;
enum {
Vectorizable = 1,
+ AlignedOnScalar = 1,
size = 4,
HasDiv = 1,
@@ -78,6 +79,7 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits
typedef Packet4i type;
enum {
Vectorizable = 1,
+ AlignedOnScalar = 1,
size=4
// FIXME check the Has*
};
@@ -86,18 +88,18 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits
template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a)
{
Packet4f countdown = { 3, 2, 1, 0 };
- return vaddq_f32(ei_pset1(a), countdown);
+ return vaddq_f32(ei_pset1<Packet4f>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a)
{
Packet4i countdown = { 3, 2, 1, 0 };
- return vaddq_s32(ei_pset1(a), countdown);
+ return vaddq_s32(ei_pset1<Packet4i>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
@@ -135,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con
}
template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ ei_assert(false && "packet integer division is not supported by NEON");
- return ei_pset1<int>(0);
+ return ei_pset1<Packet4i>(0);
}
// for some weird reasons, it has to be overloaded for packets of integers
@@ -178,6 +180,21 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIG
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploaddup<Packet4f>(const float* from)
+{
+  // load two floats and duplicate each: [from[0], from[0], from[1], from[1]]
+  float32x2_t lo, hi;
+  lo = vdup_n_f32(*from);
+  hi = vdup_n_f32(*(from+1));
+  return vcombine_f32(lo, hi);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploaddup<Packet4i>(const int* from)
+{
+  // load two ints and duplicate each: [from[0], from[0], from[1], from[1]]
+  int32x2_t lo, hi;
+  lo = vdup_n_s32(*from);
+  hi = vdup_n_s32(*(from+1));
+  return vcombine_s32(lo, hi);
+}
+
template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
@@ -193,25 +210,21 @@ template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { i
template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
- Packet4f a_r64, a_r128;
+ Packet4f a_r64;
a_r64 = vrev64q_f32(a);
a_lo = vget_low_f32(a_r64);
a_hi = vget_high_f32(a_r64);
- a_r128 = vcombine_f32(a_hi, a_lo);
-
- return a_r128;
+ return vcombine_f32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) {
int32x2_t a_lo, a_hi;
- Packet4i a_r64, a_r128;
+ Packet4i a_r64;
a_r64 = vrev64q_s32(a);
a_lo = vget_low_s32(a_r64);
a_hi = vget_high_s32(a_r64);
- a_r128 = vcombine_s32(a_hi, a_lo);
-
- return a_r128;
+ return vcombine_s32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vabsq_s32(a); }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 6c91386c6..585630563 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -38,6 +38,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr
typedef Packet2cf type;
enum {
Vectorizable = 1,
+ AlignedOnScalar = 1,
size = 2,
HasAdd = 1,
@@ -55,13 +56,6 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr
template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from)
-{
- Packet2cf res;
- res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
- return Packet2cf(_mm_movelh_ps(res.v,res.v));
-}
-
template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a)
@@ -79,9 +73,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a,
{
// TODO optimize it for SSE3 and 4
#ifdef EIGEN_VECTORIZE_SSE3
- return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
- _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
+ return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
+ _mm_mul_ps(_mm_movehdup_ps(a.v),
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+// _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
+// ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
#else
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
@@ -95,14 +92,21 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_por <Packet2cf>(const Packet2cf&
template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload<Packet4f>(&ei_real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu<Packet4f>(&ei_real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore(&ei_real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu(&ei_real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from)
+{
+ Packet2cf res;
+ res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+ return Packet2cf(_mm_movelh_ps(res.v,res.v));
+}
+
template<> EIGEN_STRONG_INLINE std::complex<float> ei_pfirst<Packet2cf>(const Packet2cf& a)
{
std::complex<float> res;
@@ -194,6 +198,24 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true>
}
};
+template<> struct ei_conj_helper<Packet4f, Packet2cf, false,false>
+{
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
+ { return ei_padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
+ { return Packet2cf(ei_pmul(x, y.v)); }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet4f, false,false>
+{
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
+ { return ei_padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
+ { return Packet2cf(ei_pmul(x.v, y)); }
+};
+
template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for SSE3 and 4
@@ -202,6 +224,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a,
return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
}
+EIGEN_STRONG_INLINE Packet2cf ei_pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+ return Packet2cf(ei_vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
+
+
//---------- double ----------
struct Packet1cd
{
@@ -215,6 +243,7 @@ template<> struct ei_packet_traits<std::complex<double> > : ei_default_packet_t
typedef Packet1cd type;
enum {
Vectorizable = 1,
+ AlignedOnScalar = 0,
size = 1,
HasAdd = 1,
@@ -261,23 +290,25 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_por <Packet1cd>(const Packet1cd&
template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
-// FIXME force unaligned load, this is a temporary fix
-template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); }
+// FIXME force unaligned load, this is a temporary fix
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from)
+{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); }
// FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> ei_pfirst<Packet1cd>(const Packet1cd& a)
{
- EIGEN_ALIGN16 std::complex<double> res;
- ei_pstore(&res, a);
- return res;
+ EIGEN_ALIGN16 double res[2];
+ _mm_store_pd(res, a.v);
+ return *(std::complex<double>*)res;
}
template<> EIGEN_STRONG_INLINE Packet1cd ei_preverse(const Packet1cd& a) { return a; }
@@ -361,6 +392,24 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,true>
}
};
+template<> struct ei_conj_helper<Packet2d, Packet1cd, false,false>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
+ { return ei_padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
+ { return Packet1cd(ei_pmul(x, y.v)); }
+};
+
+template<> struct ei_conj_helper<Packet1cd, Packet2d, false,false>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
+ { return ei_padd(c, pmul(x,y)); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
+ { return Packet1cd(ei_pmul(x.v, y)); }
+};
+
template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for SSE3 and 4
@@ -369,4 +418,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a,
return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
}
+EIGEN_STRONG_INLINE Packet1cd ei_pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+ return Packet1cd(ei_preverse(x.v));
+}
+
#endif // EIGEN_COMPLEX_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 742bfa92f..e4ca82985 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -378,14 +378,14 @@ Packet4f ei_pcos<Packet4f>(const Packet4f& _x)
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f ei_psqrt<Packet4f>(const Packet4f& _x)
{
- Packet4f half = ei_pmul(_x, ei_pset1(.5f));
-
- /* select only the inverse sqrt of non-zero inputs */
- Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1(std::numeric_limits<float>::epsilon()));
- Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
-
- x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x))));
- return ei_pmul(_x,x);
+ Packet4f half = ei_pmul(_x, ei_pset1<Packet4f>(.5f));
+
+ /* select only the inverse sqrt of non-zero inputs */
+ Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1<Packet4f>(std::numeric_limits<float>::epsilon()));
+ Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
+
+ x = ei_pmul(x, ei_psub(ei_pset1<Packet4f>(1.5f), ei_pmul(half, ei_pmul(x,x))));
+ return ei_pmul(_x,x);
}
#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index dc28bdb56..8c441810c 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -45,7 +45,7 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; };
#define ei_vec2d_swizzle1(v,p,q) \
(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
-
+
#define ei_vec4f_swizzle2(a,b,p,q,r,s) \
(_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
@@ -53,13 +53,13 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; };
(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+ const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X))
+ const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<Packet4i>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+ const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X)
template<> struct ei_packet_traits<float> : ei_default_packet_traits
@@ -67,6 +67,7 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits
typedef Packet4f type;
enum {
Vectorizable = 1,
+ AlignedOnScalar = 1,
size=4,
HasDiv = 1,
@@ -82,6 +83,7 @@ template<> struct ei_packet_traits<double> : ei_default_packet_traits
typedef Packet2d type;
enum {
Vectorizable = 1,
+ AlignedOnScalar = 1,
size=2,
HasDiv = 1
@@ -93,6 +95,7 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits
enum {
// FIXME check the Has*
Vectorizable = 1,
+ AlignedOnScalar = 1,
size=4
};
};
@@ -104,27 +107,24 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size
#ifdef __GNUC__
// Sometimes GCC implements _mm_set1_p* using multiple moves,
// that is inefficient :( (e.g., see ei_gemm_pack_rhs)
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) {
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) {
Packet4f res = _mm_set_ss(from);
return ei_vec4f_swizzle1(res,0,0,0,0);
}
-template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) {
-#ifdef EIGEN_VECTORIZE_SSE3
- return _mm_loaddup_pd(&from);
-#else
+template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) {
+ // NOTE the SSE3 intrinsic _mm_loaddup_pd is never faster but sometimes much slower
Packet2d res = _mm_set_sd(from);
return ei_vec2d_swizzle1(res, 0, 0);
-#endif
}
#else
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
#endif
-template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1(a),_mm_set_epi32(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
@@ -171,7 +171,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con
template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ ei_assert(false && "packet integer division is not supported by SSE");
- return ei_pset1<int>(0);
+ return ei_pset1<Packet4i>(0);
}
// for some weird reasons, it has to be overloaded for packets of integers
@@ -211,14 +211,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a,
template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
#if defined(_MSC_VER)
- template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
- template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
- template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
+ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
+ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
#else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and leads to invalid code
@@ -226,7 +226,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_D
// a correct instruction dependency.
// TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
__m128d res;
@@ -234,7 +234,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
return _mm_castpd_ps(res);
}
-template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
__m128d res;
@@ -242,7 +242,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
res = _mm_loadh_pd(res,from+1);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
__m128d res;
@@ -252,6 +252,19 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
}
#endif
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploaddup<Packet4f>(const float* from)
+{
+ return ei_vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd((const double*)from)), 0, 0, 1, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploaddup<Packet2d>(const double* from)
+{ return ei_pset1<Packet2d>(from[0]); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploaddup<Packet4i>(const int* from)
+{
+ Packet4i tmp;
+ tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+ return ei_vec4i_swizzle1(tmp, 0, 0, 1, 1);
+}
+
template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h
index 1474bc1bb..c79a34de0 100644
--- a/Eigen/src/Core/products/CoeffBasedProduct.h
+++ b/Eigen/src/Core/products/CoeffBasedProduct.h
@@ -42,7 +42,7 @@
template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl;
-template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct ei_product_packet_impl;
template<typename LhsNested, typename RhsNested, int NestingFlags>
@@ -73,6 +73,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
LhsRowMajor = LhsFlags & RowMajorBit,
RhsRowMajor = RhsFlags & RowMajorBit,
+ SameType = ei_is_same_type<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::ret,
+
CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
&& (ColsAtCompileTime == Dynamic
|| ( (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0
@@ -94,7 +96,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
| (EvalToRowMajor ? RowMajorBit : 0)
| NestingFlags
- | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),
+ // TODO enable vectorization for mixed types
+ | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
CoeffReadCost = InnerSize == Dynamic ? Dynamic
: InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
@@ -105,7 +108,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
* loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
* the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
*/
- CanVectorizeInner = LhsRowMajor
+ CanVectorizeInner = SameType
+ && LhsRowMajor
&& (!RhsRowMajor)
&& (LhsFlags & RhsFlags & ActualPacketAccessBit)
&& (LhsFlags & RhsFlags & AlignedBit)
@@ -275,20 +279,20 @@ struct ei_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
*** Scalar path with inner vectorization ***
*******************************************/
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar>
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
struct ei_product_coeff_vectorized_unroller
{
typedef typename Lhs::Index Index;
enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
{
- ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
+ ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar>
+template<typename Lhs, typename Rhs, typename Packet>
+struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
{
typedef typename Lhs::Index Index;
EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
@@ -300,13 +304,13 @@ struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar>
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
- typedef typename Lhs::PacketScalar PacketScalar;
+ typedef typename Lhs::PacketScalar Packet;
typedef typename Lhs::Index Index;
enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
- PacketScalar pres;
- ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
+ Packet pres;
+ ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
ei_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
res = ei_predux(pres);
}
@@ -368,71 +372,71 @@ struct ei_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetSca
*** Packet path ***
*******************/
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode>
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
- EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+ EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
- ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
- res = ei_pmadd(ei_pset1(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+ ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+ res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
}
};
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode>
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
- EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+ EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
- ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
- res = ei_pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), ei_pset1(rhs.coeff(UnrollingIndex, col)), res);
+ ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+ res = ei_pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), ei_pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
- EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+ EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
- res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+ res = ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
- EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+ EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
- res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
+ res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col)));
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
- EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
+ EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
    ei_assert(lhs.cols()>0 && "you are using a non-initialized matrix");
- res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+ res = ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(Index i = 1; i < lhs.cols(); ++i)
- res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+ res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
-template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
- EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
+ EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
    ei_assert(lhs.cols()>0 && "you are using a non-initialized matrix");
- res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
+ res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col)));
for(Index i = 1; i < lhs.cols(); ++i)
- res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
+ res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1<Packet>(rhs.coeff(i, col)), res);
}
};
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c8eeeff4d..7e2d496fe 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -25,6 +25,9 @@
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+class ei_gebp_traits;
+
/** \internal */
inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
@@ -97,7 +100,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
* for matrix products and related algorithms. The blocking sizes depends on various
* parameters:
* - the L1 and L2 cache sizes,
- * - the register level blocking sizes defined by ei_product_blocking_traits,
+ * - the register level blocking sizes defined by ei_gebp_traits,
* - the number of scalars that fit into a packet (when vectorization is enabled).
*
* \sa setCpuCacheSizes */
@@ -109,14 +112,15 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
// mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
// per kc x nr vertical small panels where nr is the blocking size along the n dimension
// at the register level. For vectorization purpose, these small vertical panels are unpacked,
- // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to
+ // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
// stay in L1 cache.
std::ptrdiff_t l1, l2;
+ typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
enum {
- kdiv = KcFactor * 2 * ei_product_blocking_traits<RhsScalar>::nr
- * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
- mr = ei_product_blocking_traits<LhsScalar>::mr,
+ kdiv = KcFactor * 2 * Traits::nr
+ * Traits::RhsProgress * sizeof(RhsScalar),
+ mr = ei_gebp_traits<LhsScalar,RhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr
};
@@ -136,112 +140,475 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st
#ifdef EIGEN_HAS_FUSE_CJMADD
#define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#else
- #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
+
+ // FIXME (a bit overkill maybe ?)
+
+ template<typename CJ, typename A, typename B, typename C, typename T> struct ei_gebp_madd_selector {
+ EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
+ {
+ c = cj.pmadd(a,b,c);
+ }
+ };
+
+ template<typename CJ, typename T> struct ei_gebp_madd_selector<CJ,T,T,T,T> {
+ EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t)
+ {
+ t = b; t = cj.pmul(a,t); c = ei_padd(c,t);
+ }
+ };
+
+ template<typename CJ, typename A, typename B, typename C, typename T>
+ EIGEN_STRONG_INLINE void ei_gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
+ {
+ ei_gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+ }
+
+ #define MADD(CJ,A,B,C,T) ei_gebp_madd(CJ,A,B,C,T);
+// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
#endif
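
Editorial note (not part of the patch): the ei_gebp_madd_selector above uses a standard partial-specialization dispatch, where the case in which all operand types coincide gets the explicit three-step form and every mixed-type combination falls back to pmadd, presumably so that the temporary lets the compiler pair the multiply and the add when no fused madd is available. A minimal standalone sketch of that dispatch pattern, with purely illustrative names:

#include <iostream>

template<typename A, typename B, typename C> struct madd_selector {
  // mixed-type path: a single fused expression
  static void run(const A& a, const B& b, C& c) { c += a * b; }
};

template<typename T> struct madd_selector<T,T,T> {
  // uniform-type path: explicit temporary, mirroring the three-step MADD form
  static void run(const T& a, const T& b, T& c) { T t = a * b; c = c + t; }
};

int main()
{
  float  a = 2; double b = 3, c = 0;
  madd_selector<float,double,double>::run(a, b, c);   // hits the generic template
  double x = 2, y = 3, z = 0;
  madd_selector<double,double,double>::run(x, y, z);  // hits the specialization
  std::cout << c << " " << z << "\n";                 // prints: 6 6
  return 0;
}
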
-// optimized GEneral packed Block * packed Panel product kernel
-template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+/* Vectorization logic
+ * real*real: unpack rhs to constant packets, ...
+ *
+ * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ * storing each res packet into two packets (2x2),
+ * at the end combine them: swap the second and addsub them
+ * cf*cf : same but with 2x4 blocks
+ * cplx*real : unpack rhs to constant packets, ...
+ * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
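
Editorial illustration (not part of the patch): the cd*cd rule above can be checked at the scalar level. The sketch below mirrors the two DoublePacket accumulators (one fed by the duplicated real part of b, one by the duplicated imaginary part) and the final "swap the second and addsub" combine; names and values are purely illustrative.

#include <complex>
#include <cassert>

int main()
{
  std::complex<float> a(1.f, 2.f), b(3.f, 4.f);

  // rhs coefficient unpacked into a "real" packet and an "imag" packet (scalars here)
  float b_r = b.real(), b_i = b.imag();

  // the two accumulators of DoublePacket, one lane per real/imag part of a
  float first[2]  = { a.real() * b_r, a.imag() * b_r };   // (a_r b_r, a_i b_r)
  float second[2] = { a.real() * b_i, a.imag() * b_i };   // (a_r b_i, a_i b_i)

  // final combine: swap 'second' (the pcplxflip step) then add/sub against 'first'
  float res_r = first[0] - second[1];   // a_r b_r - a_i b_i
  float res_i = first[1] + second[0];   // a_i b_r + a_r b_i

  assert(std::complex<float>(res_r, res_i) == a * b);
  return 0;
}
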
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+class ei_gebp_traits
+{
+public:
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+ typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+ // register block size along the N direction (must be either 2 or 4)
+ nr = NumberOfRegisters/4,
+
+ // register block size along the M direction (currently, this one cannot be modified)
+ mr = 2 * LhsPacketSize,
+
+ WorkSpaceFactor = nr * RhsPacketSize,
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = RhsPacketSize
+ };
+
+ typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename ei_packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = ei_pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+ {
+ for(DenseIndex k=0; k<n; k++)
+ ei_pstore(&b[k*RhsPacketSize], ei_pset1<RhsPacket>(rhs[k]));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ei_pload<RhsPacket>(b);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ei_pload<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
+ {
+ tmp = b; tmp = ei_pmul(a,tmp); c = ei_padd(c,tmp);
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = ei_pmadd(c,alpha,r);
+ }
+
+protected:
+// ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+// ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
+};
+
+template<typename RealScalar, bool _ConjLhs>
+class ei_gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+{
+public:
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef RealScalar RhsScalar;
+ typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = false,
+ Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ nr = NumberOfRegisters/4,
+ mr = 2 * LhsPacketSize,
+ WorkSpaceFactor = nr*RhsPacketSize,
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = RhsPacketSize
+ };
+
+ typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename ei_packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = ei_pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+ {
+ for(DenseIndex k=0; k<n; k++)
+ ei_pstore(&b[k*RhsPacketSize], ei_pset1<RhsPacket>(rhs[k]));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ei_pload<RhsPacket>(b);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ei_pload<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ {
+ madd_impl(a, b, c, tmp, typename ei_meta_if<Vectorizable,ei_meta_true,ei_meta_false>::ret());
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const ei_meta_true&) const
+ {
+ tmp = b; tmp = ei_pmul(a.v,tmp); c.v = ei_padd(c.v,tmp);
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const ei_meta_false&) const
+ {
+ c += a * b;
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = cj.pmadd(c,alpha,r);
+ }
+
+protected:
+ ei_conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
+};
+
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
+class ei_gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef std::complex<RealScalar> RhsScalar;
+ typedef std::complex<RealScalar> ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = ei_packet_traits<RealScalar>::Vectorizable
+ && ei_packet_traits<Scalar>::Vectorizable,
+ RealPacketSize = Vectorizable ? ei_packet_traits<RealScalar>::size : 1,
+ ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
+
+ nr = 2,
+ mr = 2 * ResPacketSize,
+ WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = Vectorizable ? 2*ResPacketSize : 1
+ };
+
+ typedef typename ei_packet_traits<RealScalar>::type RealPacket;
+ typedef typename ei_packet_traits<Scalar>::type ScalarPacket;
+ struct DoublePacket
+ {
+ RealPacket first;
+ RealPacket second;
+ };
+
+ typedef typename ei_meta_if<Vectorizable,RealPacket, Scalar>::ret LhsPacket;
+ typedef typename ei_meta_if<Vectorizable,DoublePacket,Scalar>::ret RhsPacket;
+ typedef typename ei_meta_if<Vectorizable,ScalarPacket,Scalar>::ret ResPacket;
+ typedef typename ei_meta_if<Vectorizable,DoublePacket,Scalar>::ret AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+
+ EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
+ {
+ p.first = ei_pset1<RealPacket>(RealScalar(0));
+ p.second = ei_pset1<RealPacket>(RealScalar(0));
+ }
+
+ /* Unpack the rhs coeff such that each complex coefficient is spread into
+   * two packets containing respectively its real and imaginary parts,
+   * duplicated as many times as needed: (x+iy) => [x, ..., x] [y, ..., y]
+ */
+ EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
+ {
+ for(DenseIndex k=0; k<n; k++)
+ {
+ if(Vectorizable)
+ {
+ ei_pstore((RealScalar*)&b[k*ResPacketSize*2+0], ei_pset1<RealPacket>(ei_real(rhs[k])));
+ ei_pstore((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], ei_pset1<RealPacket>(ei_imag(rhs[k])));
+ }
+ else
+ b[k] = rhs[k];
+ }
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
+ {
+ dest.first = ei_pload<RealPacket>((const RealScalar*)b);
+ dest.second = ei_pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
+ }
+
+ // nothing special here
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ei_pload<LhsPacket>((const typename ei_unpacket_traits<LhsPacket>::type*)(a));
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
+ {
+ c.first = ei_padd(ei_pmul(a,b.first), c.first);
+ c.second = ei_padd(ei_pmul(a,b.second),c.second);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+ {
+ c = cj.pmadd(a,b,c);
+ }
+
+ EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
+
+ EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ // assemble c
+ ResPacket tmp;
+ if((!ConjLhs)&&(!ConjRhs))
+ {
+ tmp = ei_pcplxflip(ei_pconj(ResPacket(c.second)));
+ tmp = ei_padd(ResPacket(c.first),tmp);
+ }
+ else if((!ConjLhs)&&(ConjRhs))
+ {
+ tmp = ei_pconj(ei_pcplxflip(ResPacket(c.second)));
+ tmp = ei_padd(ResPacket(c.first),tmp);
+ }
+ else if((ConjLhs)&&(!ConjRhs))
+ {
+ tmp = ei_pcplxflip(ResPacket(c.second));
+ tmp = ei_padd(ei_pconj(ResPacket(c.first)),tmp);
+ }
+ else if((ConjLhs)&&(ConjRhs))
+ {
+ tmp = ei_pcplxflip(ResPacket(c.second));
+ tmp = ei_psub(ei_pconj(ResPacket(c.first)),tmp);
+ }
+
+ r = ei_pmadd(tmp,alpha,r);
+ }
+
+protected:
+ ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+};
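
Editorial illustration (not part of the patch): for the float case (RealPacket = Packet4f, ResPacketSize = 2), unpackRhs above turns each complex rhs coefficient x+iy into one packet of x's followed by one packet of y's. The standalone sketch below reproduces that layout with plain floats; all names and values are illustrative.

#include <complex>
#include <cstdio>

int main()
{
  const int RealPacketSize = 4, ResPacketSize = 2;
  std::complex<float> rhs[2] = { {1.f, 2.f}, {3.f, 4.f} };
  float b[16];

  for (int k = 0; k < 2; ++k)
  {
    // same offset as &b[k*ResPacketSize*2] in unpackRhs, expressed in float units
    float* dst = b + k * ResPacketSize * 2 * 2;
    for (int i = 0; i < RealPacketSize; ++i) dst[i]                  = rhs[k].real();
    for (int i = 0; i < RealPacketSize; ++i) dst[RealPacketSize + i] = rhs[k].imag();
  }

  // prints: 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4
  for (int i = 0; i < 16; ++i) std::printf("%g ", b[i]);
  std::printf("\n");
  return 0;
}
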
+
+template<typename RealScalar, bool _ConjRhs>
+class ei_gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef RealScalar LhsScalar;
+ typedef Scalar RhsScalar;
+ typedef Scalar ResScalar;
+
+ enum {
+ ConjLhs = false,
+ ConjRhs = _ConjRhs,
+ Vectorizable = ei_packet_traits<RealScalar>::Vectorizable
+ && ei_packet_traits<Scalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ nr = 4,
+ mr = 2*ResPacketSize,
+ WorkSpaceFactor = nr*RhsPacketSize,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = ResPacketSize
+ };
+
+ typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename ei_packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = ei_pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+ {
+ for(DenseIndex k=0; k<n; k++)
+ ei_pstore(&b[k*RhsPacketSize], ei_pset1<RhsPacket>(rhs[k]));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ei_pload<RhsPacket>(b);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ei_ploaddup<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ {
+ madd_impl(a, b, c, tmp, typename ei_meta_if<Vectorizable,ei_meta_true,ei_meta_false>::ret());
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const ei_meta_true&) const
+ {
+ tmp = b; tmp.v = ei_pmul(a,tmp.v); c = ei_padd(c,tmp);
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const ei_meta_false&) const
+ {
+ c += a * b;
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = cj.pmadd(alpha,c,r);
+ }
+
+protected:
+ ei_conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+};
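
Editorial illustration (not part of the patch): the traits above (real*cplx) vectorize by loading each real lhs coefficient twice (loadLhs uses ei_ploaddup) so that it lines up with the interleaved (re,im) storage of the complex rhs packet; a plain component-wise multiply then yields the correct complex products, as the standalone sketch below checks with illustrative values.

#include <complex>
#include <cassert>

int main()
{
  // two complex rhs values stored interleaved, as in a Packet2cf: [r0, i0, r1, i1]
  std::complex<float> b[2] = { {1.f, 2.f}, {3.f, 4.f} };
  const float* bf = reinterpret_cast<const float*>(b);

  // two real lhs values; the "ploaddup" load duplicates each one: [a0, a0, a1, a1]
  float a[2] = { 5.f, 6.f };
  float adup[4] = { a[0], a[0], a[1], a[1] };

  // a component-wise product now gives a0*b0 and a1*b1 in interleaved complex form
  float prod[4];
  for (int i = 0; i < 4; ++i) prod[i] = adup[i] * bf[i];

  assert(std::complex<float>(prod[0], prod[1]) == a[0] * b[0]);
  assert(std::complex<float>(prod[2], prod[3]) == a[1] * b[1]);
  return 0;
}
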
+
+/* optimized GEneral packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ * | A | B | comments
+ * |real |cplx | no vectorization yet, would require packing A with duplication
+ * |cplx |real | easy vectorization
+ */
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct ei_gebp_kernel
{
- void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, Scalar* unpackedB = 0)
+ typedef ei_gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::AccPacket AccPacket;
+
+ enum {
+ Vectorizable = Traits::Vectorizable,
+ LhsProgress = Traits::LhsProgress,
+ RhsProgress = Traits::RhsProgress,
+ ResPacketSize = Traits::ResPacketSize
+ };
+
+ EIGEN_FLATTEN_ATTRIB
+ void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
{
- typedef typename ei_packet_traits<Scalar>::type PacketType;
- enum { PacketSize = ei_packet_traits<Scalar>::size };
+ Traits traits;
+
if(strideA==-1) strideA = depth;
if(strideB==-1) strideB = depth;
- ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj;
- ei_conj_helper<PacketType,PacketType,ConjugateLhs,ConjugateRhs> pcj;
+ ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+// ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
Index packet_cols = (cols/nr) * nr;
const Index peeled_mc = (rows/mr)*mr;
- const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
+ // FIXME:
+ const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
const Index peeled_kc = (depth/4)*4;
if(unpackedB==0)
- unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
+ unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
// loops on each micro vertical panel of rhs (depth x nr)
for(Index j2=0; j2<packet_cols; j2+=nr)
{
- // unpack B
- {
- const Scalar* blB = &blockB[j2*strideB+offsetB*nr];
- Index n = depth*nr;
- for(Index k=0; k<n; k++)
- ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
- /*Scalar* dest = unpackedB;
- for(Index k=0; k<n; k+=4*PacketSize)
- {
- #ifdef EIGEN_VECTORIZE_SSE
- const Index S = 128;
- const Index G = 16;
- _mm_prefetch((const char*)(&blB[S/2+0]), _MM_HINT_T0);
- _mm_prefetch((const char*)(&dest[S+0*G]), _MM_HINT_T0);
- _mm_prefetch((const char*)(&dest[S+1*G]), _MM_HINT_T0);
- _mm_prefetch((const char*)(&dest[S+2*G]), _MM_HINT_T0);
- _mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0);
- #endif
-
- PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize];
- C0[0] = ei_pload(blB+0*PacketSize);
- C1[0] = ei_pload(blB+1*PacketSize);
- C2[0] = ei_pload(blB+2*PacketSize);
- C3[0] = ei_pload(blB+3*PacketSize);
-
- ei_punpackp(C0);
- ei_punpackp(C1);
- ei_punpackp(C2);
- ei_punpackp(C3);
-
- ei_pstore(dest+ 0*PacketSize, C0[0]);
- ei_pstore(dest+ 1*PacketSize, C0[1]);
- ei_pstore(dest+ 2*PacketSize, C0[2]);
- ei_pstore(dest+ 3*PacketSize, C0[3]);
-
- ei_pstore(dest+ 4*PacketSize, C1[0]);
- ei_pstore(dest+ 5*PacketSize, C1[1]);
- ei_pstore(dest+ 6*PacketSize, C1[2]);
- ei_pstore(dest+ 7*PacketSize, C1[3]);
-
- ei_pstore(dest+ 8*PacketSize, C2[0]);
- ei_pstore(dest+ 9*PacketSize, C2[1]);
- ei_pstore(dest+10*PacketSize, C2[2]);
- ei_pstore(dest+11*PacketSize, C2[3]);
-
- ei_pstore(dest+12*PacketSize, C3[0]);
- ei_pstore(dest+13*PacketSize, C3[1]);
- ei_pstore(dest+14*PacketSize, C3[2]);
- ei_pstore(dest+15*PacketSize, C3[3]);
-
- blB += 4*PacketSize;
- dest += 16*PacketSize;
- }*/
- }
- // loops on each micro horizontal panel of lhs (mr x depth)
+ traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);
+
+ // loops on each largest micro horizontal panel of lhs (mr x depth)
// => we select a mr x nr micro block of res which is entirely
// stored into mr/packet_size x nr registers.
for(Index i=0; i<peeled_mc; i+=mr)
{
- const Scalar* blA = &blockA[i*strideA+offsetA*mr];
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
ei_prefetch(&blA[0]);
- // TODO move the res loads to the stores
-
// gets res block as register
- PacketType C0, C1, C2, C3, C4, C5, C6, C7;
- C0 = ei_pset1(Scalar(0));
- C1 = ei_pset1(Scalar(0));
- if(nr==4) C2 = ei_pset1(Scalar(0));
- if(nr==4) C3 = ei_pset1(Scalar(0));
- C4 = ei_pset1(Scalar(0));
- C5 = ei_pset1(Scalar(0));
- if(nr==4) C6 = ei_pset1(Scalar(0));
- if(nr==4) C7 = ei_pset1(Scalar(0));
-
- Scalar* r0 = &res[(j2+0)*resStride + i];
- Scalar* r1 = r0 + resStride;
- Scalar* r2 = r1 + resStride;
- Scalar* r3 = r2 + resStride;
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ if(nr==4) traits.initAcc(C2);
+ if(nr==4) traits.initAcc(C3);
+ traits.initAcc(C4);
+ traits.initAcc(C5);
+ if(nr==4) traits.initAcc(C6);
+ if(nr==4) traits.initAcc(C7);
+
+ ResScalar* r0 = &res[(j2+0)*resStride + i];
+ ResScalar* r1 = r0 + resStride;
+ ResScalar* r2 = r1 + resStride;
+ ResScalar* r3 = r2 + resStride;
ei_prefetch(r0+16);
ei_prefetch(r1+16);
@@ -251,122 +618,121 @@ struct ei_gebp_kernel
// performs "inner" product
        // TODO let's check whether the following peeled loop could not be
// optimized via optimal prefetching from one loop to the other
- const Scalar* blB = unpackedB;
+ const RhsScalar* blB = unpackedB;
for(Index k=0; k<peeled_kc; k+=4)
{
if(nr==2)
{
- PacketType B0, A0, A1;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- PacketType T0;
- #endif
-EIGEN_ASM_COMMENT("mybegin");
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[1*PacketSize]);
- MADD(pcj,A0,B0,C1,T0);
- MADD(pcj,A1,B0,C5,B0);
-
- A0 = ei_pload(&blA[2*PacketSize]);
- A1 = ei_pload(&blA[3*PacketSize]);
- B0 = ei_pload(&blB[2*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[3*PacketSize]);
- MADD(pcj,A0,B0,C1,T0);
- MADD(pcj,A1,B0,C5,B0);
-
- A0 = ei_pload(&blA[4*PacketSize]);
- A1 = ei_pload(&blA[5*PacketSize]);
- B0 = ei_pload(&blB[4*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[5*PacketSize]);
- MADD(pcj,A0,B0,C1,T0);
- MADD(pcj,A1,B0,C5,B0);
-
- A0 = ei_pload(&blA[6*PacketSize]);
- A1 = ei_pload(&blA[7*PacketSize]);
- B0 = ei_pload(&blB[6*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[7*PacketSize]);
- MADD(pcj,A0,B0,C1,T0);
- MADD(pcj,A1,B0,C5,B0);
+ LhsPacket A0, A1;
+ RhsPacket B0;
+ RhsPacket T0;
+
+EIGEN_ASM_COMMENT("mybegin2");
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadLhs(&blA[1*LhsProgress], A1);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[1*RhsProgress], B0);
+ traits.madd(A0,B0,C1,T0);
+ traits.madd(A1,B0,C5,B0);
+
+ traits.loadLhs(&blA[2*LhsProgress], A0);
+ traits.loadLhs(&blA[3*LhsProgress], A1);
+ traits.loadRhs(&blB[2*RhsProgress], B0);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[3*RhsProgress], B0);
+ traits.madd(A0,B0,C1,T0);
+ traits.madd(A1,B0,C5,B0);
+
+ traits.loadLhs(&blA[4*LhsProgress], A0);
+ traits.loadLhs(&blA[5*LhsProgress], A1);
+ traits.loadRhs(&blB[4*RhsProgress], B0);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[5*RhsProgress], B0);
+ traits.madd(A0,B0,C1,T0);
+ traits.madd(A1,B0,C5,B0);
+
+ traits.loadLhs(&blA[6*LhsProgress], A0);
+ traits.loadLhs(&blA[7*LhsProgress], A1);
+ traits.loadRhs(&blB[6*RhsProgress], B0);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[7*RhsProgress], B0);
+ traits.madd(A0,B0,C1,T0);
+ traits.madd(A1,B0,C5,B0);
EIGEN_ASM_COMMENT("myend");
}
else
{
- PacketType B0, B1, B2, B3, A0, A1;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- PacketType T0;
- #endif
-EIGEN_ASM_COMMENT("mybegin");
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
-
- MADD(pcj,A0,B0,C0,T0);
- B2 = ei_pload(&blB[2*PacketSize]);
- MADD(pcj,A1,B0,C4,B0);
- B3 = ei_pload(&blB[3*PacketSize]);
- B0 = ei_pload(&blB[4*PacketSize]);
- MADD(pcj,A0,B1,C1,T0);
- MADD(pcj,A1,B1,C5,B1);
- B1 = ei_pload(&blB[5*PacketSize]);
- MADD(pcj,A0,B2,C2,T0);
- MADD(pcj,A1,B2,C6,B2);
- B2 = ei_pload(&blB[6*PacketSize]);
- MADD(pcj,A0,B3,C3,T0);
- A0 = ei_pload(&blA[2*PacketSize]);
- MADD(pcj,A1,B3,C7,B3);
- A1 = ei_pload(&blA[3*PacketSize]);
- B3 = ei_pload(&blB[7*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[8*PacketSize]);
- MADD(pcj,A0,B1,C1,T0);
- MADD(pcj,A1,B1,C5,B1);
- B1 = ei_pload(&blB[9*PacketSize]);
- MADD(pcj,A0,B2,C2,T0);
- MADD(pcj,A1,B2,C6,B2);
- B2 = ei_pload(&blB[10*PacketSize]);
- MADD(pcj,A0,B3,C3,T0);
- A0 = ei_pload(&blA[4*PacketSize]);
- MADD(pcj,A1,B3,C7,B3);
- A1 = ei_pload(&blA[5*PacketSize]);
- B3 = ei_pload(&blB[11*PacketSize]);
-
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[12*PacketSize]);
- MADD(pcj,A0,B1,C1,T0);
- MADD(pcj,A1,B1,C5,B1);
- B1 = ei_pload(&blB[13*PacketSize]);
- MADD(pcj,A0,B2,C2,T0);
- MADD(pcj,A1,B2,C6,B2);
- B2 = ei_pload(&blB[14*PacketSize]);
- MADD(pcj,A0,B3,C3,T0);
- A0 = ei_pload(&blA[6*PacketSize]);
- MADD(pcj,A1,B3,C7,B3);
- A1 = ei_pload(&blA[7*PacketSize]);
- B3 = ei_pload(&blB[15*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- MADD(pcj,A0,B1,C1,T0);
- MADD(pcj,A1,B1,C5,B1);
- MADD(pcj,A0,B2,C2,T0);
- MADD(pcj,A1,B2,C6,B2);
- MADD(pcj,A0,B3,C3,T0);
- MADD(pcj,A1,B3,C7,B3);
-EIGEN_ASM_COMMENT("myend");
+EIGEN_ASM_COMMENT("mybegin4");
+ LhsPacket A0, A1;
+ RhsPacket B0, B1, B2, B3;
+ RhsPacket T0;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadLhs(&blA[1*LhsProgress], A1);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.loadRhs(&blB[1*RhsProgress], B1);
+
+ traits.madd(A0,B0,C0,T0);
+ traits.loadRhs(&blB[2*RhsProgress], B2);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[3*RhsProgress], B3);
+ traits.loadRhs(&blB[4*RhsProgress], B0);
+ traits.madd(A0,B1,C1,T0);
+ traits.madd(A1,B1,C5,B1);
+ traits.loadRhs(&blB[5*RhsProgress], B1);
+ traits.madd(A0,B2,C2,T0);
+ traits.madd(A1,B2,C6,B2);
+ traits.loadRhs(&blB[6*RhsProgress], B2);
+ traits.madd(A0,B3,C3,T0);
+ traits.loadLhs(&blA[2*LhsProgress], A0);
+ traits.madd(A1,B3,C7,B3);
+ traits.loadLhs(&blA[3*LhsProgress], A1);
+ traits.loadRhs(&blB[7*RhsProgress], B3);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[8*RhsProgress], B0);
+ traits.madd(A0,B1,C1,T0);
+ traits.madd(A1,B1,C5,B1);
+ traits.loadRhs(&blB[9*RhsProgress], B1);
+ traits.madd(A0,B2,C2,T0);
+ traits.madd(A1,B2,C6,B2);
+ traits.loadRhs(&blB[10*RhsProgress], B2);
+ traits.madd(A0,B3,C3,T0);
+ traits.loadLhs(&blA[4*LhsProgress], A0);
+ traits.madd(A1,B3,C7,B3);
+ traits.loadLhs(&blA[5*LhsProgress], A1);
+ traits.loadRhs(&blB[11*RhsProgress], B3);
+
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[12*RhsProgress], B0);
+ traits.madd(A0,B1,C1,T0);
+ traits.madd(A1,B1,C5,B1);
+ traits.loadRhs(&blB[13*RhsProgress], B1);
+ traits.madd(A0,B2,C2,T0);
+ traits.madd(A1,B2,C6,B2);
+ traits.loadRhs(&blB[14*RhsProgress], B2);
+ traits.madd(A0,B3,C3,T0);
+ traits.loadLhs(&blA[6*LhsProgress], A0);
+ traits.madd(A1,B3,C7,B3);
+ traits.loadLhs(&blA[7*LhsProgress], A1);
+ traits.loadRhs(&blB[15*RhsProgress], B3);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.madd(A0,B1,C1,T0);
+ traits.madd(A1,B1,C5,B1);
+ traits.madd(A0,B2,C2,T0);
+ traits.madd(A1,B2,C6,B2);
+ traits.madd(A0,B3,C3,T0);
+ traits.madd(A1,B3,C7,B3);
}
- blB += 4*nr*PacketSize;
+ blB += 4*nr*RhsProgress;
blA += 4*mr;
}
// process remaining peeled loop
@@ -374,343 +740,370 @@ EIGEN_ASM_COMMENT("myend");
{
if(nr==2)
{
- PacketType B0, A0, A1;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- PacketType T0;
- #endif
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[1*PacketSize]);
- MADD(pcj,A0,B0,C1,T0);
- MADD(pcj,A1,B0,C5,B0);
+ LhsPacket A0, A1;
+ RhsPacket B0;
+ RhsPacket T0;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadLhs(&blA[1*LhsProgress], A1);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[1*RhsProgress], B0);
+ traits.madd(A0,B0,C1,T0);
+ traits.madd(A1,B0,C5,B0);
}
else
{
- PacketType B0, B1, B2, B3, A0, A1;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- PacketType T0;
- #endif
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
-
- MADD(pcj,A0,B0,C0,T0);
- B2 = ei_pload(&blB[2*PacketSize]);
- MADD(pcj,A1,B0,C4,B0);
- B3 = ei_pload(&blB[3*PacketSize]);
- MADD(pcj,A0,B1,C1,T0);
- MADD(pcj,A1,B1,C5,B1);
- MADD(pcj,A0,B2,C2,T0);
- MADD(pcj,A1,B2,C6,B2);
- MADD(pcj,A0,B3,C3,T0);
- MADD(pcj,A1,B3,C7,B3);
+ LhsPacket A0, A1;
+ RhsPacket B0, B1, B2, B3;
+ RhsPacket T0;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadLhs(&blA[1*LhsProgress], A1);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.loadRhs(&blB[1*RhsProgress], B1);
+
+ traits.madd(A0,B0,C0,T0);
+ traits.loadRhs(&blB[2*RhsProgress], B2);
+ traits.madd(A1,B0,C4,B0);
+ traits.loadRhs(&blB[3*RhsProgress], B3);
+ traits.madd(A0,B1,C1,T0);
+ traits.madd(A1,B1,C5,B1);
+ traits.madd(A0,B2,C2,T0);
+ traits.madd(A1,B2,C6,B2);
+ traits.madd(A0,B3,C3,T0);
+ traits.madd(A1,B3,C7,B3);
}
- blB += nr*PacketSize;
+ blB += nr*RhsProgress;
blA += mr;
}
- PacketType R0, R1, R2, R3, R4, R5, R6, R7;
-
- R0 = ei_ploadu(r0);
- R1 = ei_ploadu(r1);
- if(nr==4) R2 = ei_ploadu(r2);
- if(nr==4) R3 = ei_ploadu(r3);
- R4 = ei_ploadu(r0 + PacketSize);
- R5 = ei_ploadu(r1 + PacketSize);
- if(nr==4) R6 = ei_ploadu(r2 + PacketSize);
- if(nr==4) R7 = ei_ploadu(r3 + PacketSize);
-
- C0 = ei_padd(R0, C0);
- C1 = ei_padd(R1, C1);
- if(nr==4) C2 = ei_padd(R2, C2);
- if(nr==4) C3 = ei_padd(R3, C3);
- C4 = ei_padd(R4, C4);
- C5 = ei_padd(R5, C5);
- if(nr==4) C6 = ei_padd(R6, C6);
- if(nr==4) C7 = ei_padd(R7, C7);
-
- ei_pstoreu(r0, C0);
- ei_pstoreu(r1, C1);
- if(nr==4) ei_pstoreu(r2, C2);
- if(nr==4) ei_pstoreu(r3, C3);
- ei_pstoreu(r0 + PacketSize, C4);
- ei_pstoreu(r1 + PacketSize, C5);
- if(nr==4) ei_pstoreu(r2 + PacketSize, C6);
- if(nr==4) ei_pstoreu(r3 + PacketSize, C7);
+ ResPacket R0, R1, R2, R3, R4, R5, R6, R7;
+ ResPacket alphav = ei_pset1<ResPacket>(alpha);
+
+ R0 = ei_ploadu<ResPacket>(r0);
+ R1 = ei_ploadu<ResPacket>(r1);
+ if(nr==4) R2 = ei_ploadu<ResPacket>(r2);
+ if(nr==4) R3 = ei_ploadu<ResPacket>(r3);
+ R4 = ei_ploadu<ResPacket>(r0 + ResPacketSize);
+ R5 = ei_ploadu<ResPacket>(r1 + ResPacketSize);
+ if(nr==4) R6 = ei_ploadu<ResPacket>(r2 + ResPacketSize);
+ if(nr==4) R7 = ei_ploadu<ResPacket>(r3 + ResPacketSize);
+
+ traits.acc(C0, alphav, R0);
+ traits.acc(C1, alphav, R1);
+ if(nr==4) traits.acc(C2, alphav, R2);
+ if(nr==4) traits.acc(C3, alphav, R3);
+ traits.acc(C4, alphav, R4);
+ traits.acc(C5, alphav, R5);
+ if(nr==4) traits.acc(C6, alphav, R6);
+ if(nr==4) traits.acc(C7, alphav, R7);
+
+ ei_pstoreu(r0, R0);
+ ei_pstoreu(r1, R1);
+ if(nr==4) ei_pstoreu(r2, R2);
+ if(nr==4) ei_pstoreu(r3, R3);
+ ei_pstoreu(r0 + ResPacketSize, R4);
+ ei_pstoreu(r1 + ResPacketSize, R5);
+ if(nr==4) ei_pstoreu(r2 + ResPacketSize, R6);
+ if(nr==4) ei_pstoreu(r3 + ResPacketSize, R7);
}
- if(rows-peeled_mc>=PacketSize)
+
+ if(rows-peeled_mc>=LhsProgress)
{
Index i = peeled_mc;
- const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
ei_prefetch(&blA[0]);
// gets res block as register
- PacketType C0, C1, C2, C3;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
- C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
- if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
- if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
+ AccPacket C0, C1, C2, C3;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ if(nr==4) traits.initAcc(C2);
+ if(nr==4) traits.initAcc(C3);
// performs "inner" product
- const Scalar* blB = unpackedB;
+ const RhsScalar* blB = unpackedB;
for(Index k=0; k<peeled_kc; k+=4)
{
if(nr==2)
{
- PacketType B0, B1, A0;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[2*PacketSize]);
- MADD(pcj,A0,B1,C1,B1);
- A0 = ei_pload(&blA[1*PacketSize]);
- B1 = ei_pload(&blB[3*PacketSize]);
- MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[4*PacketSize]);
- MADD(pcj,A0,B1,C1,B1);
- A0 = ei_pload(&blA[2*PacketSize]);
- B1 = ei_pload(&blB[5*PacketSize]);
- MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[6*PacketSize]);
- MADD(pcj,A0,B1,C1,B1);
- A0 = ei_pload(&blA[3*PacketSize]);
- B1 = ei_pload(&blB[7*PacketSize]);
- MADD(pcj,A0,B0,C0,B0);
- MADD(pcj,A0,B1,C1,B1);
+ LhsPacket A0;
+ RhsPacket B0, B1;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.loadRhs(&blB[1*RhsProgress], B1);
+ traits.madd(A0,B0,C0,B0);
+ traits.loadRhs(&blB[2*RhsProgress], B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.loadLhs(&blA[1*LhsProgress], A0);
+ traits.loadRhs(&blB[3*RhsProgress], B1);
+ traits.madd(A0,B0,C0,B0);
+ traits.loadRhs(&blB[4*RhsProgress], B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.loadLhs(&blA[2*LhsProgress], A0);
+ traits.loadRhs(&blB[5*RhsProgress], B1);
+ traits.madd(A0,B0,C0,B0);
+ traits.loadRhs(&blB[6*RhsProgress], B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.loadLhs(&blA[3*LhsProgress], A0);
+ traits.loadRhs(&blB[7*RhsProgress], B1);
+ traits.madd(A0,B0,C0,B0);
+ traits.madd(A0,B1,C1,B1);
}
else
{
- PacketType B0, B1, B2, B3, A0;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
-
- MADD(pcj,A0,B0,C0,B0);
- B2 = ei_pload(&blB[2*PacketSize]);
- B3 = ei_pload(&blB[3*PacketSize]);
- B0 = ei_pload(&blB[4*PacketSize]);
- MADD(pcj,A0,B1,C1,B1);
- B1 = ei_pload(&blB[5*PacketSize]);
- MADD(pcj,A0,B2,C2,B2);
- B2 = ei_pload(&blB[6*PacketSize]);
- MADD(pcj,A0,B3,C3,B3);
- A0 = ei_pload(&blA[1*PacketSize]);
- B3 = ei_pload(&blB[7*PacketSize]);
- MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[8*PacketSize]);
- MADD(pcj,A0,B1,C1,B1);
- B1 = ei_pload(&blB[9*PacketSize]);
- MADD(pcj,A0,B2,C2,B2);
- B2 = ei_pload(&blB[10*PacketSize]);
- MADD(pcj,A0,B3,C3,B3);
- A0 = ei_pload(&blA[2*PacketSize]);
- B3 = ei_pload(&blB[11*PacketSize]);
-
- MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[12*PacketSize]);
- MADD(pcj,A0,B1,C1,B1);
- B1 = ei_pload(&blB[13*PacketSize]);
- MADD(pcj,A0,B2,C2,B2);
- B2 = ei_pload(&blB[14*PacketSize]);
- MADD(pcj,A0,B3,C3,B3);
-
- A0 = ei_pload(&blA[3*PacketSize]);
- B3 = ei_pload(&blB[15*PacketSize]);
- MADD(pcj,A0,B0,C0,B0);
- MADD(pcj,A0,B1,C1,B1);
- MADD(pcj,A0,B2,C2,B2);
- MADD(pcj,A0,B3,C3,B3);
+ LhsPacket A0;
+ RhsPacket B0, B1, B2, B3;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.loadRhs(&blB[1*RhsProgress], B1);
+
+ traits.madd(A0,B0,C0,B0);
+ traits.loadRhs(&blB[2*RhsProgress], B2);
+ traits.loadRhs(&blB[3*RhsProgress], B3);
+ traits.loadRhs(&blB[4*RhsProgress], B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.loadRhs(&blB[5*RhsProgress], B1);
+ traits.madd(A0,B2,C2,B2);
+ traits.loadRhs(&blB[6*RhsProgress], B2);
+ traits.madd(A0,B3,C3,B3);
+ traits.loadLhs(&blA[1*LhsProgress], A0);
+ traits.loadRhs(&blB[7*RhsProgress], B3);
+ traits.madd(A0,B0,C0,B0);
+ traits.loadRhs(&blB[8*RhsProgress], B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.loadRhs(&blB[9*RhsProgress], B1);
+ traits.madd(A0,B2,C2,B2);
+ traits.loadRhs(&blB[10*RhsProgress], B2);
+ traits.madd(A0,B3,C3,B3);
+ traits.loadLhs(&blA[2*LhsProgress], A0);
+ traits.loadRhs(&blB[11*RhsProgress], B3);
+
+ traits.madd(A0,B0,C0,B0);
+ traits.loadRhs(&blB[12*RhsProgress], B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.loadRhs(&blB[13*RhsProgress], B1);
+ traits.madd(A0,B2,C2,B2);
+ traits.loadRhs(&blB[14*RhsProgress], B2);
+ traits.madd(A0,B3,C3,B3);
+
+ traits.loadLhs(&blA[3*LhsProgress], A0);
+ traits.loadRhs(&blB[15*RhsProgress], B3);
+ traits.madd(A0,B0,C0,B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.madd(A0,B2,C2,B2);
+ traits.madd(A0,B3,C3,B3);
}
- blB += 4*nr*PacketSize;
- blA += 4*PacketSize;
+ blB += nr*4*RhsProgress;
+ blA += 4*LhsProgress;
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
if(nr==2)
{
- PacketType B0, A0;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- PacketType T0;
- #endif
-
- A0 = ei_pload(&blA[0*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- B0 = ei_pload(&blB[1*PacketSize]);
- MADD(pcj,A0,B0,C1,T0);
+ LhsPacket A0;
+ RhsPacket B0, B1;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.loadRhs(&blB[1*RhsProgress], B1);
+ traits.madd(A0,B0,C0,B0);
+ traits.madd(A0,B1,C1,B1);
}
else
{
- PacketType B0, B1, B2, B3, A0;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- PacketType T0, T1;
- #endif
-
- A0 = ei_pload(&blA[0*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- B2 = ei_pload(&blB[2*PacketSize]);
- B3 = ei_pload(&blB[3*PacketSize]);
-
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A0,B1,C1,T1);
- MADD(pcj,A0,B2,C2,T0);
- MADD(pcj,A0,B3,C3,T1);
+ LhsPacket A0;
+ RhsPacket B0, B1, B2, B3;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.loadRhs(&blB[1*RhsProgress], B1);
+ traits.loadRhs(&blB[2*RhsProgress], B2);
+ traits.loadRhs(&blB[3*RhsProgress], B3);
+
+ traits.madd(A0,B0,C0,B0);
+ traits.madd(A0,B1,C1,B1);
+ traits.madd(A0,B2,C2,B2);
+ traits.madd(A0,B3,C3,B3);
}
- blB += nr*PacketSize;
- blA += PacketSize;
+ blB += nr*RhsProgress;
+ blA += LhsProgress;
}
- ei_pstoreu(&res[(j2+0)*resStride + i], C0);
- ei_pstoreu(&res[(j2+1)*resStride + i], C1);
- if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
- if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
+ ResPacket R0, R1, R2, R3;
+ ResPacket alphav = ei_pset1<ResPacket>(alpha);
+
+ ResScalar* r0 = &res[(j2+0)*resStride + i];
+ ResScalar* r1 = r0 + resStride;
+ ResScalar* r2 = r1 + resStride;
+ ResScalar* r3 = r2 + resStride;
+
+ R0 = ei_ploadu<ResPacket>(r0);
+ R1 = ei_ploadu<ResPacket>(r1);
+ if(nr==4) R2 = ei_ploadu<ResPacket>(r2);
+ if(nr==4) R3 = ei_ploadu<ResPacket>(r3);
+
+ traits.acc(C0, alphav, R0);
+ traits.acc(C1, alphav, R1);
+ if(nr==4) traits.acc(C2, alphav, R2);
+ if(nr==4) traits.acc(C3, alphav, R3);
+
+ ei_pstoreu(r0, R0);
+ ei_pstoreu(r1, R1);
+ if(nr==4) ei_pstoreu(r2, R2);
+ if(nr==4) ei_pstoreu(r3, R3);
}
for(Index i=peeled_mc2; i<rows; i++)
{
- const Scalar* blA = &blockA[i*strideA+offsetA];
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
ei_prefetch(&blA[0]);
// gets a 1 x nr res block as registers
- Scalar C0(0), C1(0), C2(0), C3(0);
+ ResScalar C0(0), C1(0), C2(0), C3(0);
// TODO directly use blockB ???
- const Scalar* blB = unpackedB;//&blockB[j2*strideB+offsetB*nr];
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
for(Index k=0; k<depth; k++)
{
if(nr==2)
{
- Scalar B0, A0;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- Scalar T0;
- #endif
+ LhsScalar A0;
+ RhsScalar B0, B1;
A0 = blA[k];
- B0 = blB[0*PacketSize];
- MADD(cj,A0,B0,C0,T0);
- B0 = blB[1*PacketSize];
- MADD(cj,A0,B0,C1,T0);
+ B0 = blB[0];
+ B1 = blB[1];
+ MADD(cj,A0,B0,C0,B0);
+ MADD(cj,A0,B1,C1,B1);
}
else
{
- Scalar B0, B1, B2, B3, A0;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- Scalar T0, T1;
- #endif
+ LhsScalar A0;
+ RhsScalar B0, B1, B2, B3;
A0 = blA[k];
- B0 = blB[0*PacketSize];
- B1 = blB[1*PacketSize];
- B2 = blB[2*PacketSize];
- B3 = blB[3*PacketSize];
-
- MADD(cj,A0,B0,C0,T0);
- MADD(cj,A0,B1,C1,T1);
- MADD(cj,A0,B2,C2,T0);
- MADD(cj,A0,B3,C3,T1);
+ B0 = blB[0];
+ B1 = blB[1];
+ B2 = blB[2];
+ B3 = blB[3];
+
+ MADD(cj,A0,B0,C0,B0);
+ MADD(cj,A0,B1,C1,B1);
+ MADD(cj,A0,B2,C2,B2);
+ MADD(cj,A0,B3,C3,B3);
}
- blB += nr*PacketSize;
+ blB += nr;
}
- res[(j2+0)*resStride + i] += C0;
- res[(j2+1)*resStride + i] += C1;
- if(nr==4) res[(j2+2)*resStride + i] += C2;
- if(nr==4) res[(j2+3)*resStride + i] += C3;
+ res[(j2+0)*resStride + i] += alpha*C0;
+ res[(j2+1)*resStride + i] += alpha*C1;
+ if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
+ if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
}
}
-
// process remaining rhs/res columns one at a time
// => do the same but with nr==1
for(Index j2=packet_cols; j2<cols; j2++)
{
// unpack B
{
- const Scalar* blB = &blockB[j2*strideB+offsetB];
- for(Index k=0; k<depth; k++)
- ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
+ traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
+// const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+// for(Index k=0; k<depth; k++)
+// ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k]));
}
for(Index i=0; i<peeled_mc; i+=mr)
{
- const Scalar* blA = &blockA[i*strideA+offsetA*mr];
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
ei_prefetch(&blA[0]);
// TODO move the res loads to the stores
// get res block as registers
- PacketType C0, C4;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
- C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
+ AccPacket C0, C4;
+ traits.initAcc(C0);
+ traits.initAcc(C4);
- const Scalar* blB = unpackedB;
+ const RhsScalar* blB = unpackedB;
for(Index k=0; k<depth; k++)
{
- PacketType B0, A0, A1;
- #ifndef EIGEN_HAS_FUSE_CJMADD
- PacketType T0, T1;
- #endif
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- MADD(pcj,A0,B0,C0,T0);
- MADD(pcj,A1,B0,C4,T1);
-
- blB += PacketSize;
- blA += mr;
+ LhsPacket A0, A1;
+ RhsPacket B0;
+ RhsPacket T0;
+
+ traits.loadLhs(&blA[0*LhsProgress], A0);
+ traits.loadLhs(&blA[1*LhsProgress], A1);
+ traits.loadRhs(&blB[0*RhsProgress], B0);
+ traits.madd(A0,B0,C0,T0);
+ traits.madd(A1,B0,C4,B0);
+
+ blB += RhsProgress;
+ blA += 2*LhsProgress;
}
+ ResPacket R0, R4;
+ ResPacket alphav = ei_pset1<ResPacket>(alpha);
+
+ ResScalar* r0 = &res[(j2+0)*resStride + i];
+
+ R0 = ei_ploadu<ResPacket>(r0);
+ R4 = ei_ploadu<ResPacket>(r0+ResPacketSize);
+
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R4);
- ei_pstoreu(&res[(j2+0)*resStride + i], C0);
- ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
+ ei_pstoreu(r0, R0);
+ ei_pstoreu(r0+ResPacketSize, R4);
}
- if(rows-peeled_mc>=PacketSize)
+ if(rows-peeled_mc>=LhsProgress)
{
Index i = peeled_mc;
- const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
ei_prefetch(&blA[0]);
- PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
+ AccPacket C0;
+ traits.initAcc(C0);
- const Scalar* blB = unpackedB;
+ const RhsScalar* blB = unpackedB;
for(Index k=0; k<depth; k++)
{
- PacketType T0;
- MADD(pcj,ei_pload(blA), ei_pload(blB), C0, T0);
- blB += PacketSize;
- blA += PacketSize;
+ LhsPacket A0;
+ RhsPacket B0;
+ traits.loadLhs(blA, A0);
+ traits.loadRhs(blB, B0);
+ traits.madd(A0, B0, C0, B0);
+ blB += RhsProgress;
+ blA += LhsProgress;
}
- ei_pstoreu(&res[(j2+0)*resStride + i], C0);
+ ResPacket alphav = ei_pset1<ResPacket>(alpha);
+ ResPacket R0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
+ traits.acc(C0, alphav, R0);
+ ei_pstoreu(&res[(j2+0)*resStride + i], R0);
}
for(Index i=peeled_mc2; i<rows; i++)
{
- const Scalar* blA = &blockA[i*strideA+offsetA];
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
ei_prefetch(&blA[0]);
// gets a 1 x 1 res block as registers
- Scalar C0(0);
+ ResScalar C0(0);
// FIXME directly use blockB ??
- const Scalar* blB = unpackedB;
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++)
{
- #ifndef EIGEN_HAS_FUSE_CJMADD
- Scalar T0;
- #endif
- MADD(cj,blA[k], blB[k*PacketSize], C0, T0);
+ LhsScalar A0 = blA[k];
+ RhsScalar B0 = blB[k];
+ MADD(cj, A0, B0, C0, B0);
}
- res[(j2+0)*resStride + i] += C0;
+ res[(j2+0)*resStride + i] += alpha*C0;
}
}
}
@@ -732,34 +1125,34 @@ EIGEN_ASM_COMMENT("myend");
//
// 32 33 34 35 ...
 // 36 37 38 39 ...
-template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjugate, bool PanelMode>
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_lhs
{
void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
Index stride=0, Index offset=0)
{
- enum { PacketSize = ei_packet_traits<Scalar>::size };
+// enum { PacketSize = ei_packet_traits<Scalar>::size };
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
ei_const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
- Index peeled_mc = (rows/mr)*mr;
- for(Index i=0; i<peeled_mc; i+=mr)
+ Index peeled_mc = (rows/Pack1)*Pack1;
+ for(Index i=0; i<peeled_mc; i+=Pack1)
{
- if(PanelMode) count += mr * offset;
+ if(PanelMode) count += Pack1 * offset;
for(Index k=0; k<depth; k++)
- for(Index w=0; w<mr; w++)
+ for(Index w=0; w<Pack1; w++)
blockA[count++] = cj(lhs(i+w, k));
- if(PanelMode) count += mr * (stride-offset-depth);
+ if(PanelMode) count += Pack1 * (stride-offset-depth);
}
- if(rows-peeled_mc>=PacketSize)
+ if(rows-peeled_mc>=Pack2)
{
- if(PanelMode) count += PacketSize*offset;
+ if(PanelMode) count += Pack2*offset;
for(Index k=0; k<depth; k++)
- for(Index w=0; w<PacketSize; w++)
+ for(Index w=0; w<Pack2; w++)
blockA[count++] = cj(lhs(peeled_mc+w, k));
- if(PanelMode) count += PacketSize * (stride-offset-depth);
- peeled_mc += PacketSize;
+ if(PanelMode) count += Pack2 * (stride-offset-depth);
+ peeled_mc += Pack2;
}
for(Index i=peeled_mc; i<rows; i++)
{
@@ -783,12 +1176,11 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
typedef typename ei_packet_traits<Scalar>::type Packet;
enum { PacketSize = ei_packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols,
+ void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
Index stride=0, Index offset=0)
{
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
- bool hasAlpha = alpha != Scalar(1);
Index packet_cols = (cols/nr) * nr;
Index count = 0;
for(Index j2=0; j2<packet_cols; j2+=nr)
@@ -799,24 +1191,14 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
- if (hasAlpha)
- for(Index k=0; k<depth; k++)
- {
- blockB[count+0] = alpha*cj(b0[k]);
- blockB[count+1] = alpha*cj(b1[k]);
- if(nr==4) blockB[count+2] = alpha*cj(b2[k]);
- if(nr==4) blockB[count+3] = alpha*cj(b3[k]);
- count += nr;
- }
- else
- for(Index k=0; k<depth; k++)
- {
- blockB[count+0] = cj(b0[k]);
- blockB[count+1] = cj(b1[k]);
- if(nr==4) blockB[count+2] = cj(b2[k]);
- if(nr==4) blockB[count+3] = cj(b3[k]);
- count += nr;
- }
+ for(Index k=0; k<depth; k++)
+ {
+ blockB[count+0] = cj(b0[k]);
+ blockB[count+1] = cj(b1[k]);
+ if(nr==4) blockB[count+2] = cj(b2[k]);
+ if(nr==4) blockB[count+3] = cj(b3[k]);
+ count += nr;
+ }
// skip what we have after
if(PanelMode) count += nr * (stride-offset-depth);
}
@@ -826,18 +1208,11 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
if(PanelMode) count += offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
- if (hasAlpha)
- for(Index k=0; k<depth; k++)
- {
- blockB[count] = alpha*cj(b0[k]);
- count += 1;
- }
- else
- for(Index k=0; k<depth; k++)
- {
- blockB[count] = cj(b0[k]);
- count += 1;
- }
+ for(Index k=0; k<depth; k++)
+ {
+ blockB[count] = cj(b0[k]);
+ count += 1;
+ }
if(PanelMode) count += (stride-offset-depth);
}
}
@@ -848,41 +1223,25 @@ template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode
struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols,
+ void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
Index stride=0, Index offset=0)
{
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
- bool hasAlpha = alpha != Scalar(1);
Index packet_cols = (cols/nr) * nr;
Index count = 0;
for(Index j2=0; j2<packet_cols; j2+=nr)
{
// skip what we have before
if(PanelMode) count += nr * offset;
- if (hasAlpha)
- {
- for(Index k=0; k<depth; k++)
- {
- const Scalar* b0 = &rhs[k*rhsStride + j2];
- blockB[count+0] = alpha*cj(b0[0]);
- blockB[count+1] = alpha*cj(b0[1]);
- if(nr==4) blockB[count+2] = alpha*cj(b0[2]);
- if(nr==4) blockB[count+3] = alpha*cj(b0[3]);
- count += nr;
- }
- }
- else
+ for(Index k=0; k<depth; k++)
{
- for(Index k=0; k<depth; k++)
- {
- const Scalar* b0 = &rhs[k*rhsStride + j2];
- blockB[count+0] = cj(b0[0]);
- blockB[count+1] = cj(b0[1]);
- if(nr==4) blockB[count+2] = cj(b0[2]);
- if(nr==4) blockB[count+3] = cj(b0[3]);
- count += nr;
- }
+ const Scalar* b0 = &rhs[k*rhsStride + j2];
+ blockB[count+0] = cj(b0[0]);
+ blockB[count+1] = cj(b0[1]);
+ if(nr==4) blockB[count+2] = cj(b0[2]);
+ if(nr==4) blockB[count+3] = cj(b0[3]);
+ count += nr;
}
// skip what we have after
if(PanelMode) count += nr * (stride-offset-depth);
@@ -894,7 +1253,7 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
const Scalar* b0 = &rhs[j2];
for(Index k=0; k<depth; k++)
{
- blockB[count] = alpha*cj(b0[k*rhsStride]);
+ blockB[count] = cj(b0[k*rhsStride]);
count += 1;
}
if(PanelMode) count += stride-offset-depth;
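
For reference, the register-blocking scheme implemented by ei_gebp_kernel in the hunks above — an mr x nr block of res held in the accumulators C0..C7, updated along the depth dimension via traits.madd(), and scaled by alpha only at write-back via traits.acc() — can be summarized by the stand-alone sketch below. It uses plain scalars and a hypothetical fixed mr=2, nr=4 in place of packets and the traits helpers; it is an illustration of the technique, not the Eigen code.

#include <cstddef>

// Accumulate res(0..mr-1, 0..nr-1) += alpha * A_panel * B_panel, where the
// panels are packed as in ei_gemm_pack_lhs / ei_gemm_pack_rhs: mr (resp. nr)
// consecutive coefficients per depth step.
template<typename Scalar>
void gebp_micro_block_sketch(Scalar* res, std::ptrdiff_t resStride,
                             const Scalar* blockA, const Scalar* blockB,
                             std::ptrdiff_t depth, Scalar alpha)
{
  const int mr = 2, nr = 4;                  // hypothetical register block size
  Scalar C[mr][nr] = {};                     // accumulators, meant to live in registers
  for(std::ptrdiff_t k = 0; k < depth; ++k)  // traverse both packed panels once
    for(int i = 0; i < mr; ++i)
      for(int j = 0; j < nr; ++j)
        C[i][j] += blockA[k*mr + i] * blockB[k*nr + j];  // one madd per accumulator
  // alpha is applied only when the accumulators are flushed to memory, which is
  // the role played by traits.acc(C, alphav, R) in the vectorized kernel above.
  for(int j = 0; j < nr; ++j)
    for(int i = 0; i < mr; ++i)
      res[j*resStride + i] += alpha * C[i][j];
}

Replacing the scalar multiplies by packet loads plus traits.madd(), and the final write-back by ei_ploadu / traits.acc / ei_pstoreu, gives the structure of the kernel above.
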
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 2ae78c1e7..1cdfb84d1 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -29,26 +29,25 @@ template<typename _LhsScalar, typename _RhsScalar> class ei_level3_blocking;
/* Specialization for a row-major destination matrix => simple transposition of the product */
template<
- typename Scalar, typename Index,
- int LhsStorageOrder, bool ConjugateLhs,
- int RhsStorageOrder, bool ConjugateRhs>
-struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,RowMajor>
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct ei_general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
{
+ typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth,
- const Scalar* lhs, Index lhsStride,
- const Scalar* rhs, Index rhsStride,
- Scalar* res, Index resStride,
- Scalar alpha,
- ei_level3_blocking<Scalar,Scalar>& blocking,
+ const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsStride,
+ ResScalar* res, Index resStride,
+ ResScalar alpha,
+ ei_level3_blocking<RhsScalar,LhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
// transpose the product such that the result is column major
- ei_general_matrix_matrix_product<Scalar, Index,
- RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
- ConjugateRhs,
- LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
- ConjugateLhs,
+ ei_general_matrix_matrix_product<Index,
+ RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+ LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
}
@@ -57,41 +56,32 @@ struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLh
/* Specialization for a col-major destination matrix
* => Blocking algorithm following Goto's paper */
template<
- typename Scalar, typename Index,
- int LhsStorageOrder, bool ConjugateLhs,
- int RhsStorageOrder, bool ConjugateRhs>
-struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor>
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct ei_general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
{
+typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static void run(Index rows, Index cols, Index depth,
- const Scalar* _lhs, Index lhsStride,
- const Scalar* _rhs, Index rhsStride,
- Scalar* res, Index resStride,
- Scalar alpha,
- ei_level3_blocking<Scalar,Scalar>& blocking,
+ const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride,
+ ResScalar* res, Index resStride,
+ ResScalar alpha,
+ ei_level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
- ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
- ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+ ei_const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
+ ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
- typedef typename ei_packet_traits<Scalar>::type PacketType;
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction
//Index nc = blocking.nc(); // cache block size along the N direction
- // FIXME starting from SSE3, normal complex product cannot be optimized as well as
- // conjugate product, therefore it is better to conjugate during the copies.
- // With SSE2, this is the other way round.
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr> gebp;
-
-// if (ConjugateRhs)
-// alpha = ei_conj(alpha);
-// ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
-// ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
-// ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
+ ei_gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ ei_gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
+ ei_gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
@@ -100,10 +90,10 @@ static void run(Index rows, Index cols, Index depth,
Index tid = omp_get_thread_num();
Index threads = omp_get_num_threads();
- Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
- Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
- Scalar* blockB = blocking.blockB();
+ LhsScalar* blockA = ei_aligned_stack_new(LhsScalar, kc*mc);
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ RhsScalar* w = ei_aligned_stack_new(RhsScalar, sizeW);
+ RhsScalar* blockB = blocking.blockB();
ei_internal_assert(blockB!=0);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
@@ -124,7 +114,7 @@ static void run(Index rows, Index cols, Index depth,
while(info[tid].users!=0) {}
info[tid].users += threads;
- pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
+ pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
// Notify the other threads that the part B'_j is ready to go.
info[tid].sync = k;
@@ -140,7 +130,7 @@ static void run(Index rows, Index cols, Index depth,
if(shift>0)
while(info[j].sync!=k) {}
- gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w);
+ gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
}
// Then keep going as usual with the remaining A'
@@ -152,7 +142,7 @@ static void run(Index rows, Index cols, Index depth,
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
// C_i += A' * B'
- gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w);
+ gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
}
// Release all the sub blocks B'_j of B' for the current thread,
@@ -162,8 +152,8 @@ static void run(Index rows, Index cols, Index depth,
--(info[j].users);
}
- ei_aligned_stack_delete(Scalar, blockA, kc*mc);
- ei_aligned_stack_delete(Scalar, w, sizeW);
+ ei_aligned_stack_delete(LhsScalar, blockA, kc*mc);
+ ei_aligned_stack_delete(RhsScalar, w, sizeW);
}
else
#endif // EIGEN_HAS_OPENMP
@@ -173,10 +163,10 @@ static void run(Index rows, Index cols, Index depth,
// this is the sequential version!
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
- std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr;
- Scalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(Scalar, sizeA) : blocking.blockA();
- Scalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(Scalar, sizeB) : blocking.blockB();
- Scalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(Scalar, sizeW) : blocking.blockW();
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA();
+ RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
+ RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW();
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1)
@@ -188,7 +178,7 @@ static void run(Index rows, Index cols, Index depth,
// => Pack rhs's panel into a sequential chunk of memory (L2 caching)
// Note that this panel will be read as many times as the number of blocks in the lhs's
// vertical panel which is, in practice, a very low number.
- pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+ pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
// For each mc x kc block of the lhs's vertical panel...
@@ -203,14 +193,14 @@ static void run(Index rows, Index cols, Index depth,
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
// Everything is packed, we can now call the block * panel kernel:
- gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW);
+ gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
}
}
- if(blocking.blockA()==0) ei_aligned_stack_delete(Scalar, blockA, kc*mc);
- if(blocking.blockB()==0) ei_aligned_stack_delete(Scalar, blockB, sizeB);
- if(blocking.blockW()==0) ei_aligned_stack_delete(Scalar, blockW, sizeW);
+ if(blocking.blockA()==0) ei_aligned_stack_delete(LhsScalar, blockA, kc*mc);
+ if(blocking.blockB()==0) ei_aligned_stack_delete(RhsScalar, blockB, sizeB);
+ if(blocking.blockW()==0) ei_aligned_stack_delete(RhsScalar, blockW, sizeW);
}
}
@@ -245,8 +235,8 @@ struct ei_gemm_functor
cols = m_rhs.cols();
Gemm::run(rows, cols, m_lhs.cols(),
- (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(),
- (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(),
+ /*(const Scalar*)*/&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(),
+ /*(const Scalar*)*/&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
m_actualAlpha, m_blocking, info);
}
@@ -305,11 +295,11 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
};
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
- typedef ei_product_blocking_traits<RhsScalar> Blocking;
+ typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
enum {
SizeA = ActualRows * MaxDepth,
SizeB = ActualCols * MaxDepth,
- SizeW = MaxDepth * Blocking::nr * ei_packet_traits<RhsScalar>::size
+ SizeW = MaxDepth * Traits::WorkSpaceFactor
};
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
@@ -345,7 +335,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
};
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
- typedef ei_product_blocking_traits<RhsScalar> Blocking;
+ typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
DenseIndex m_sizeA;
DenseIndex m_sizeB;
@@ -362,7 +352,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
computeProductBlockingSizes<LhsScalar,RhsScalar>(this->m_kc, this->m_mc, this->m_nc);
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
- m_sizeW = this->m_kc*ei_packet_traits<RhsScalar>::size*Blocking::nr;
+ m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
}
void allocateA()
@@ -407,11 +397,15 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
};
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef Scalar ResScalar;
GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
{
- EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret),
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ typedef ei_scalar_product_op<LhsScalar,RhsScalar> BinOp;
+ EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
}
template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const
@@ -424,15 +418,15 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
- typedef ei_gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+ typedef ei_gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
typedef ei_gemm_functor<
Scalar, Index,
ei_general_matrix_matrix_product<
- Scalar, Index,
- (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
- (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+ Index,
+ LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+ RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
_ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
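
The run() functions above follow the blocking scheme of Goto's paper: the depth dimension is sliced into kc-sized panels, each horizontal panel of the rhs is packed once into blockB, and each mc x kc block of the lhs is packed into blockA before gebp updates the corresponding rows of the result; with this changeset alpha is forwarded to the kernel instead of being folded into pack_rhs. A minimal, unvectorized sketch of that loop nest (column-major everywhere, naive inner kernel, not the Eigen implementation) is:

#include <algorithm>
#include <cstddef>
#include <vector>

// Column-major C(rows x cols) += alpha * A(rows x depth) * B(depth x cols).
template<typename Scalar>
void gemm_blocking_sketch(std::ptrdiff_t rows, std::ptrdiff_t cols, std::ptrdiff_t depth,
                          const Scalar* A, std::ptrdiff_t lda,
                          const Scalar* B, std::ptrdiff_t ldb,
                          Scalar* C, std::ptrdiff_t ldc,
                          Scalar alpha,
                          std::ptrdiff_t kc = 256, std::ptrdiff_t mc = 128)
{
  std::vector<Scalar> blockA(mc*kc), blockB(kc*cols);
  for(std::ptrdiff_t k2 = 0; k2 < depth; k2 += kc)            // slice the depth
  {
    const std::ptrdiff_t akc = std::min(kc, depth-k2);
    for(std::ptrdiff_t j = 0; j < cols; ++j)                  // pack the rhs panel B'_k once
      for(std::ptrdiff_t k = 0; k < akc; ++k)
        blockB[j*akc + k] = B[(k2+k) + j*ldb];
    for(std::ptrdiff_t i2 = 0; i2 < rows; i2 += mc)           // slice the rows
    {
      const std::ptrdiff_t amc = std::min(mc, rows-i2);
      for(std::ptrdiff_t k = 0; k < akc; ++k)                 // pack the lhs block A'_ik
        for(std::ptrdiff_t i = 0; i < amc; ++i)
          blockA[k*amc + i] = A[(i2+i) + (k2+k)*lda];
      for(std::ptrdiff_t j = 0; j < cols; ++j)                // "gebp": A'_ik * B'_k, alpha applied here
        for(std::ptrdiff_t k = 0; k < akc; ++k)
          for(std::ptrdiff_t i = 0; i < amc; ++i)
            C[(i2+i) + j*ldc] += alpha * blockA[k*amc + i] * blockB[j*akc + k];
    }
  }
}

The kc/mc defaults above are placeholders; in Eigen they come from computeProductBlockingSizes() via the blocking object, and the packed blockB panel is what the sequential and OpenMP paths above share.
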
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 9713b4849..44986dc16 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -30,56 +30,77 @@
* the number of load/stores of the result by a factor 4 and to reduce
* the instruction dependency. Moreover, we know that all bands have the
* same alignment pattern.
- * TODO: since rhs gets evaluated only once, no need to evaluate it
+ *
+ * Mixing type logic: C += alpha * A * B
+ * | A | B |alpha| comments
+ * |real |cplx |cplx | no vectorization
+ * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
+ * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
*/
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType>
-static EIGEN_DONT_INLINE
-void ei_cache_friendly_product_colmajor_times_vector(
- Index size,
- const Scalar* lhs, Index lhsStride,
- const RhsType& rhs,
- Scalar* res,
- Scalar alpha)
+template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs>
{
+typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+ Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable
+ && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size),
+ LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1
+};
+
+typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket;
+typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket;
+typedef typename ei_packet_traits<ResScalar>::type _ResPacket;
+
+typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket;
+typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket;
+typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket;
+
+EIGEN_DONT_INLINE static void run(
+ Index rows, Index cols,
+ const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsIncr,
+ ResScalar* res, Index resIncr,
+ RhsScalar alpha)
+{
+ ei_internal_assert(resIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
ei_pstore(&res[j], \
- ei_padd(ei_pload(&res[j]), \
+ ei_padd(ei_pload<ResPacket>(&res[j]), \
ei_padd( \
- ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \
- pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \
- ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \
- pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) )))
-
- typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef typename ei_packet_traits<Scalar>::type Packet;
- enum {
- PacketSize = sizeof(Packet)/sizeof(Scalar),
- Vectorizable = ei_packet_traits<Scalar>::Vectorizable
- };
-
- ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj;
- ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj;
+ ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
+ pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
+ ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
+ pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
+
+ ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
if(ConjugateRhs)
alpha = ei_conj(alpha);
enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
const Index columnsAtOnce = 4;
const Index peels = 2;
- const Index PacketAlignedMask = PacketSize-1;
- const Index PeelAlignedMask = PacketSize*peels-1;
-
+ const Index LhsPacketAlignedMask = LhsPacketSize-1;
+ const Index ResPacketAlignedMask = ResPacketSize-1;
+ const Index PeelAlignedMask = ResPacketSize*peels-1;
+ const Index size = rows;
+
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type.
Index alignedStart = ei_first_aligned(res,size);
- Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+ Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
- const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
+ const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
Index alignmentPattern = alignmentStep==0 ? AllAligned
- : alignmentStep==(PacketSize/2) ? EvenAligned
+ : alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
@@ -88,19 +109,19 @@ void ei_cache_friendly_product_colmajor_times_vector(
  // find how many columns we have to skip to be aligned with the result (if possible)
Index skipColumns = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
- if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) )
+ if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
{
alignedSize = 0;
alignedStart = 0;
}
- else if (PacketSize>1)
+ else if (LhsPacketSize>1)
{
- ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
+ ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
- while (skipColumns<PacketSize &&
- alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize))
+ while (skipColumns<LhsPacketSize &&
+ alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
++skipColumns;
- if (skipColumns==PacketSize)
+ if (skipColumns==LhsPacketSize)
{
// nothing can be aligned, no need to skip any column
alignmentPattern = NoneAligned;
@@ -108,14 +129,14 @@ void ei_cache_friendly_product_colmajor_times_vector(
}
else
{
- skipColumns = std::min(skipColumns,rhs.size());
+ skipColumns = std::min(skipColumns,cols);
 // note that the skipped columns are processed later.
}
ei_internal_assert( (alignmentPattern==NoneAligned)
- || (skipColumns + columnsAtOnce >= rhs.size())
- || PacketSize > size
- || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
+ || (skipColumns + columnsAtOnce >= cols)
+ || LhsPacketSize > size
+ || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
}
else if(Vectorizable)
{
@@ -127,15 +148,17 @@ void ei_cache_friendly_product_colmajor_times_vector(
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
- Index columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
+ Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
{
- Packet ptmp0 = ei_pset1(alpha*rhs[i]), ptmp1 = ei_pset1(alpha*rhs[i+offset1]),
- ptmp2 = ei_pset1(alpha*rhs[i+2]), ptmp3 = ei_pset1(alpha*rhs[i+offset3]);
+ RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
+ ptmp1 = ei_pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
+ ptmp2 = ei_pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
+ ptmp3 = ei_pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
// this helps a lot generating better binary code
- const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
- *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+ const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
+ *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
if (Vectorizable)
{
@@ -154,51 +177,52 @@ void ei_cache_friendly_product_colmajor_times_vector(
switch(alignmentPattern)
{
case AllAligned:
- for (Index j = alignedStart; j<alignedSize; j+=PacketSize)
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
break;
case EvenAligned:
- for (Index j = alignedStart; j<alignedSize; j+=PacketSize)
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
break;
case FirstAligned:
if(peels>1)
{
- Packet A00, A01, A02, A03, A10, A11, A12, A13;
+ LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
+ ResPacket T0, T1;
- A01 = ei_pload(&lhs1[alignedStart-1]);
- A02 = ei_pload(&lhs2[alignedStart-2]);
- A03 = ei_pload(&lhs3[alignedStart-3]);
+ A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]);
+ A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]);
+ A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]);
- for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+ for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize)
{
- A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
- A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
- A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
-
- A00 = ei_pload (&lhs0[j]);
- A10 = ei_pload (&lhs0[j+PacketSize]);
- A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j]));
- A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize]));
-
- A00 = pcj.pmadd(A01, ptmp1, A00);
- A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
- A00 = pcj.pmadd(A02, ptmp2, A00);
- A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
- A00 = pcj.pmadd(A03, ptmp3, A00);
- ei_pstore(&res[j],A00);
- A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
- A10 = pcj.pmadd(A11, ptmp1, A10);
- A10 = pcj.pmadd(A12, ptmp2, A10);
- A10 = pcj.pmadd(A13, ptmp3, A10);
- ei_pstore(&res[j+PacketSize],A10);
+ A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11);
+ A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12);
+ A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13);
+
+ A00 = ei_pload<LhsPacket>(&lhs0[j]);
+ A10 = ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
+ T0 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j]));
+ T1 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize]));
+
+ T0 = pcj.pmadd(A01, ptmp1, T0);
+ A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01);
+ T0 = pcj.pmadd(A02, ptmp2, T0);
+ A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02);
+ T0 = pcj.pmadd(A03, ptmp3, T0);
+ ei_pstore(&res[j],T0);
+ A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03);
+ T1 = pcj.pmadd(A11, ptmp1, T1);
+ T1 = pcj.pmadd(A12, ptmp2, T1);
+ T1 = pcj.pmadd(A13, ptmp3, T1);
+ ei_pstore(&res[j+ResPacketSize],T1);
}
}
- for (Index j = peeledSize; j<alignedSize; j+=PacketSize)
+ for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
break;
default:
- for (Index j = alignedStart; j<alignedSize; j+=PacketSize)
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
break;
}
@@ -216,14 +240,14 @@ void ei_cache_friendly_product_colmajor_times_vector(
}
// process remaining first and last columns (at most columnsAtOnce-1)
- Index end = rhs.size();
+ Index end = cols;
Index start = columnBound;
do
{
- for (Index i=start; i<end; ++i)
+ for (Index k=start; k<end; ++k)
{
- Packet ptmp0 = ei_pset1(alpha*rhs[i]);
- const Scalar* lhs0 = lhs + i*lhsStride;
+ RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
+ const LhsScalar* lhs0 = lhs + k*lhsStride;
if (Vectorizable)
{
@@ -231,19 +255,18 @@ void ei_cache_friendly_product_colmajor_times_vector(
// process first unaligned result's coeffs
for (Index j=0; j<alignedStart; ++j)
res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0));
-
// process aligned result's coeffs
- if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
- for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
- ei_pstore(&res[j], pcj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j])));
+ if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+ for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+ ei_pstore(&res[i], pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[i]), ptmp0, ei_pload<ResPacket>(&res[i])));
else
- for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
- ei_pstore(&res[j], pcj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j])));
+ for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+ ei_pstore(&res[i], pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[i]), ptmp0, ei_pload<ResPacket>(&res[i])));
}
// process remaining scalars (or all if no explicit vectorization)
- for (Index j=alignedSize; j<size; ++j)
- res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0));
+ for (Index i=alignedSize; i<size; ++i)
+ res[i] += cj.pmul(lhs0[i], ei_pfirst(ptmp0));
}
if (skipColumns)
{
@@ -256,15 +279,45 @@ void ei_cache_friendly_product_colmajor_times_vector(
} while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}
+};
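
Stripped of the alignment bookkeeping (AllAligned/EvenAligned/FirstAligned/NoneAligned) and of the packet types, the column-major kernel above boils down to the following sketch: four columns of the lhs are combined per sweep over the result, which is what cuts the number of res loads/stores by a factor of four, and alpha is folded into the four rhs coefficients up front. This is an illustration only (plain scalars, rhsIncr assumed to be 1, a single scalar type), not the Eigen code:

#include <cstddef>

// Column-major lhs(rows x cols): res += alpha * lhs * rhs.
template<typename Scalar>
void gemv_colmajor_sketch(std::ptrdiff_t rows, std::ptrdiff_t cols,
                          const Scalar* lhs, std::ptrdiff_t lhsStride,
                          const Scalar* rhs, Scalar* res, Scalar alpha)
{
  const std::ptrdiff_t columnsAtOnce = 4;
  const std::ptrdiff_t columnBound = (cols/columnsAtOnce)*columnsAtOnce;
  for(std::ptrdiff_t i = 0; i < columnBound; i += columnsAtOnce)
  {
    // fold alpha into the four rhs coefficients once per group of columns
    const Scalar c0 = alpha*rhs[i+0], c1 = alpha*rhs[i+1],
                 c2 = alpha*rhs[i+2], c3 = alpha*rhs[i+3];
    const Scalar *l0 = lhs + (i+0)*lhsStride, *l1 = lhs + (i+1)*lhsStride,
                 *l2 = lhs + (i+2)*lhsStride, *l3 = lhs + (i+3)*lhsStride;
    // one sweep over res accumulates four columns at once
    for(std::ptrdiff_t j = 0; j < rows; ++j)
      res[j] += c0*l0[j] + c1*l1[j] + c2*l2[j] + c3*l3[j];
  }
  for(std::ptrdiff_t i = columnBound; i < cols; ++i)          // leftover columns
  {
    const Scalar c0 = alpha*rhs[i];
    const Scalar* l0 = lhs + i*lhsStride;
    for(std::ptrdiff_t j = 0; j < rows; ++j)
      res[j] += c0*l0[j];
  }
}

The mixed LhsScalar/RhsScalar cases of the table above only change the types involved (ResScalar being ei_scalar_product_traits<LhsScalar,RhsScalar>::ReturnType) and whether the loop can be vectorized; the loop structure itself stays the same.
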
-// TODO add peeling to mask unaligned load/stores
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
-static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
+/* Optimized row-major matrix * vector product:
+ * This algorithm processes 4 rows at once, which allows both to reduce
+ * the number of load/stores of the result by a factor 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic:
+ * - alpha is always a complex (or converted to a complex)
+ * - no vectorization
+ */
+template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs>
+{
+typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+ Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable
+ && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size),
+ LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1
+};
+
+typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket;
+typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket;
+typedef typename ei_packet_traits<ResScalar>::type _ResPacket;
+
+typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket;
+typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket;
+typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket;
+
+EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
- const Scalar* lhs, Index lhsStride,
- const Scalar* rhs, Index rhsIncr,
- Scalar* res, Index resIncr,
- Scalar alpha)
+ const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsIncr,
+ ResScalar* res, Index resIncr,
+ ResScalar alpha)
{
EIGEN_UNUSED_VARIABLE(rhsIncr);
ei_internal_assert(rhsIncr==1);
@@ -273,39 +326,33 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
- Packet b = ei_pload(&rhs[j]); \
- ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \
- ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \
- ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \
- ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); }
-
- typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef typename ei_packet_traits<Scalar>::type Packet;
- enum {
- PacketSize = sizeof(Packet)/sizeof(Scalar),
- Vectorizable = ei_packet_traits<Scalar>::Vectorizable
- };
-
- ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj;
- ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj;
+ RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); \
+ ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
+ ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
+ ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
+ ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
+
+ ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
const Index rowsAtOnce = 4;
const Index peels = 2;
- const Index PacketAlignedMask = PacketSize-1;
- const Index PeelAlignedMask = PacketSize*peels-1;
+ const Index RhsPacketAlignedMask = RhsPacketSize-1;
+ const Index LhsPacketAlignedMask = LhsPacketSize-1;
+ const Index PeelAlignedMask = RhsPacketSize*peels-1;
const Index depth = cols;
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type
// if that's not the case then vectorization is discarded, see below.
Index alignedStart = ei_first_aligned(rhs, depth);
- Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0;
+ Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
- const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
+ const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
Index alignmentPattern = alignmentStep==0 ? AllAligned
- : alignmentStep==(PacketSize/2) ? EvenAligned
+ : alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
@@ -314,19 +361,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
// find how many rows do we have to skip to be aligned with rhs (if possible)
Index skipRows = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
- if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) )
+ if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
{
alignedSize = 0;
alignedStart = 0;
}
- else if (PacketSize>1)
+ else if (LhsPacketSize>1)
{
- ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || depth<PacketSize);
+ ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
- while (skipRows<PacketSize &&
- alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
+ while (skipRows<LhsPacketSize &&
+ alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
++skipRows;
- if (skipRows==PacketSize)
+ if (skipRows==LhsPacketSize)
{
 // nothing can be aligned, no need to skip any row
alignmentPattern = NoneAligned;
@@ -338,10 +385,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 // note that the skipped rows are processed later.
}
ei_internal_assert( alignmentPattern==NoneAligned
- || PacketSize==1
+ || LhsPacketSize==1
|| (skipRows + rowsAtOnce >= rows)
- || PacketSize > depth
- || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
+ || LhsPacketSize > depth
+ || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
}
else if(Vectorizable)
{
@@ -356,23 +403,24 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
{
- EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
- Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
+ EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+ ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generating good binary code
- const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
- *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+ const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
+ *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
if (Vectorizable)
{
/* explicit vectorization */
- Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
+ ResPacket ptmp0 = ei_pset1<ResPacket>(ResScalar(0)), ptmp1 = ei_pset1<ResPacket>(ResScalar(0)),
+ ptmp2 = ei_pset1<ResPacket>(ResScalar(0)), ptmp3 = ei_pset1<ResPacket>(ResScalar(0));
// process initial unaligned coeffs
 // FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
{
- Scalar b = rhs[j];
+ RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
}
@@ -382,11 +430,11 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
switch(alignmentPattern)
{
case AllAligned:
- for (Index j = alignedStart; j<alignedSize; j+=PacketSize)
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
break;
case EvenAligned:
- for (Index j = alignedStart; j<alignedSize; j+=PacketSize)
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
break;
case FirstAligned:
@@ -398,38 +446,38 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 * overlapping the desired unaligned packet. This is *much* more efficient
* than basic unaligned loads.
*/
- Packet A01, A02, A03, b, A11, A12, A13;
- A01 = ei_pload(&lhs1[alignedStart-1]);
- A02 = ei_pload(&lhs2[alignedStart-2]);
- A03 = ei_pload(&lhs3[alignedStart-3]);
+ LhsPacket A01, A02, A03, A11, A12, A13;
+ A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]);
+ A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]);
+ A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]);
- for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+ for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize)
{
- b = ei_pload(&rhs[j]);
- A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
- A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
- A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
+ RhsPacket b = ei_pload<RhsPacket>(&rhs[j]);
+ A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11);
+ A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12);
+ A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13);
- ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0);
+ ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), b, ptmp0);
ptmp1 = pcj.pmadd(A01, b, ptmp1);
- A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
+ A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01);
ptmp2 = pcj.pmadd(A02, b, ptmp2);
- A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
+ A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02);
ptmp3 = pcj.pmadd(A03, b, ptmp3);
- A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
+ A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03);
- b = ei_pload(&rhs[j+PacketSize]);
- ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0);
+ b = ei_pload<RhsPacket>(&rhs[j+RhsPacketSize]);
+ ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
ptmp1 = pcj.pmadd(A11, b, ptmp1);
ptmp2 = pcj.pmadd(A12, b, ptmp2);
ptmp3 = pcj.pmadd(A13, b, ptmp3);
}
}
- for (Index j = peeledSize; j<alignedSize; j+=PacketSize)
+ for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
break;
default:
- for (Index j = alignedStart; j<alignedSize; j+=PacketSize)
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
break;
}
@@ -444,7 +492,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 // FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
{
- Scalar b = rhs[j];
+ RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
}
@@ -461,9 +509,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
{
for (Index i=start; i<end; ++i)
{
- EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
- Packet ptmp0 = ei_pset1(tmp0);
- const Scalar* lhs0 = lhs + i*lhsStride;
+ EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+ ResPacket ptmp0 = ei_pset1<ResPacket>(tmp0);
+ const LhsScalar* lhs0 = lhs + i*lhsStride;
// process first unaligned result's coeffs
 // FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
@@ -472,12 +520,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
if (alignedSize>alignedStart)
{
// process aligned rhs coeffs
- if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
- for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
- ptmp0 = pcj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
+ if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+ for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+ ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0);
else
- for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
- ptmp0 = pcj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
+ for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+ ptmp0 = pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0);
tmp0 += ei_predux(ptmp0);
}
@@ -499,5 +547,6 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
#undef _EIGEN_ACCUMULATE_PACKETS
}
+};
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
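A minimal scalar sketch of the strategy described in the header comment of the new row-major kernel above: four rows are accumulated into independent temporaries so the result is read and written only once per row. Illustrative names only; the actual kernel uses packets and the alignment/peeling cases shown in the hunks.

#include <cstddef>

// Simplified row-major matrix * vector accumulation, 4 rows at once.
// res[i] += alpha * dot(row i of lhs, rhs); lhsStride is the distance
// between consecutive rows of lhs.
template<typename Scalar>
void rowmajor_times_vector_sketch(std::size_t rows, std::size_t cols,
                                  const Scalar* lhs, std::size_t lhsStride,
                                  const Scalar* rhs, Scalar* res, Scalar alpha)
{
  std::size_t i = 0;
  for (; i + 4 <= rows; i += 4)
  {
    Scalar t0 = Scalar(0), t1 = Scalar(0), t2 = Scalar(0), t3 = Scalar(0);
    const Scalar *l0 = lhs + (i+0)*lhsStride, *l1 = lhs + (i+1)*lhsStride,
                 *l2 = lhs + (i+2)*lhsStride, *l3 = lhs + (i+3)*lhsStride;
    for (std::size_t j = 0; j < cols; ++j)
    {
      Scalar b = rhs[j];          // rhs coefficient is reused by all 4 rows
      t0 += l0[j]*b; t1 += l1[j]*b;
      t2 += l2[j]*b; t3 += l3[j]*b;
    }
    res[i+0] += alpha*t0; res[i+1] += alpha*t1;  // one store per row
    res[i+2] += alpha*t2; res[i+3] += alpha*t3;
  }
  for (; i < rows; ++i)           // remaining rows, one at a time
  {
    Scalar t = Scalar(0);
    const Scalar* l = lhs + i*lhsStride;
    for (std::size_t j = 0; j < cols; ++j) t += l[j]*rhs[j];
    res[i] += alpha*t;
  }
}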
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 4a95ad5e1..ede8b77bf 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -26,10 +26,9 @@
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
// pack a selfadjoint block diagonal for use with the gebp_kernel
-template<typename Scalar, typename Index, int mr, int StorageOrder>
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
struct ei_symm_pack_lhs
{
- enum { PacketSize = ei_packet_traits<Scalar>::size };
template<int BlockRows> inline
void pack(Scalar* blockA, const ei_const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
{
@@ -59,16 +58,16 @@ struct ei_symm_pack_lhs
{
ei_const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
- Index peeled_mc = (rows/mr)*mr;
- for(Index i=0; i<peeled_mc; i+=mr)
+ Index peeled_mc = (rows/Pack1)*Pack1;
+ for(Index i=0; i<peeled_mc; i+=Pack1)
{
- pack<mr>(blockA, lhs, cols, i, count);
+ pack<Pack1>(blockA, lhs, cols, i, count);
}
- if(rows-peeled_mc>=PacketSize)
+ if(rows-peeled_mc>=Pack2)
{
- pack<PacketSize>(blockA, lhs, cols, peeled_mc, count);
- peeled_mc += PacketSize;
+ pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
+ peeled_mc += Pack2;
}
// do the same with mr==1
@@ -89,7 +88,7 @@ template<typename Scalar, typename Index, int nr, int StorageOrder>
struct ei_symm_pack_rhs
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Scalar alpha, Index rows, Index cols, Index k2)
+ void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
{
Index end_k = k2 + rows;
Index count = 0;
@@ -101,12 +100,12 @@ struct ei_symm_pack_rhs
{
for(Index k=k2; k<end_k; k++)
{
- blockB[count+0] = alpha*rhs(k,j2+0);
- blockB[count+1] = alpha*rhs(k,j2+1);
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
if (nr==4)
{
- blockB[count+2] = alpha*rhs(k,j2+2);
- blockB[count+3] = alpha*rhs(k,j2+3);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
}
count += nr;
}
@@ -119,12 +118,12 @@ struct ei_symm_pack_rhs
// transpose
for(Index k=k2; k<j2; k++)
{
- blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
- blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
+ blockB[count+0] = ei_conj(rhs(j2+0,k));
+ blockB[count+1] = ei_conj(rhs(j2+1,k));
if (nr==4)
{
- blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
- blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
+ blockB[count+2] = ei_conj(rhs(j2+2,k));
+ blockB[count+3] = ei_conj(rhs(j2+3,k));
}
count += nr;
}
@@ -134,25 +133,25 @@ struct ei_symm_pack_rhs
{
// normal
for (Index w=0 ; w<h; ++w)
- blockB[count+w] = alpha*rhs(k,j2+w);
+ blockB[count+w] = rhs(k,j2+w);
- blockB[count+h] = alpha*ei_real(rhs(k,k));
+ blockB[count+h] = ei_real(rhs(k,k));
// transpose
for (Index w=h+1 ; w<nr; ++w)
- blockB[count+w] = alpha*ei_conj(rhs(j2+w,k));
+ blockB[count+w] = ei_conj(rhs(j2+w,k));
count += nr;
++h;
}
// normal
for(Index k=j2+nr; k<end_k; k++)
{
- blockB[count+0] = alpha*rhs(k,j2+0);
- blockB[count+1] = alpha*rhs(k,j2+1);
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
if (nr==4)
{
- blockB[count+2] = alpha*rhs(k,j2+2);
- blockB[count+3] = alpha*rhs(k,j2+3);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
}
count += nr;
}
@@ -163,12 +162,12 @@ struct ei_symm_pack_rhs
{
for(Index k=k2; k<end_k; k++)
{
- blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
- blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
+ blockB[count+0] = ei_conj(rhs(j2+0,k));
+ blockB[count+1] = ei_conj(rhs(j2+1,k));
if (nr==4)
{
- blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
- blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
+ blockB[count+2] = ei_conj(rhs(j2+2,k));
+ blockB[count+3] = ei_conj(rhs(j2+3,k));
}
count += nr;
}
@@ -181,13 +180,13 @@ struct ei_symm_pack_rhs
Index half = std::min(end_k,j2);
for(Index k=k2; k<half; k++)
{
- blockB[count] = alpha*ei_conj(rhs(j2,k));
+ blockB[count] = ei_conj(rhs(j2,k));
count += 1;
}
if(half==j2 && half<k2+rows)
{
- blockB[count] = alpha*ei_real(rhs(j2,j2));
+ blockB[count] = ei_real(rhs(j2,j2));
count += 1;
}
else
@@ -196,7 +195,7 @@ struct ei_symm_pack_rhs
// normal
for(Index k=half+1; k<k2+rows; k++)
{
- blockB[count] = alpha*rhs(k,j2);
+ blockB[count] = rhs(k,j2);
count += 1;
}
}
@@ -253,12 +252,9 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
- if (ConjugateRhs)
- alpha = ei_conj(alpha);
-
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<Scalar,Scalar> Traits;
- Index kc = size; // cache block size along the K direction
+ Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
@@ -266,14 +262,15 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
kc = std::min(kc,mc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ Scalar* blockB = allocatedBlockB + sizeW;
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- ei_symm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+ ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ ei_symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@@ -282,7 +279,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
- pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+ pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
 // the selected lhs's panel has to be split in three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
@@ -294,7 +291,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
// transposed packed copy
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
// the block diagonal
{
@@ -302,16 +299,16 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
// symmetric packed copy
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+ gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = std::min(i2+mc,size)-i2;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>()
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
@@ -338,10 +335,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
- if (ConjugateRhs)
- alpha = ei_conj(alpha);
-
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
@@ -349,19 +343,20 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ Scalar* blockB = allocatedBlockB + sizeW;
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
- ei_symm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
+ ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ ei_symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
{
const Index actual_kc = std::min(k2+kc,size)-k2;
- pack_rhs(blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2);
+ pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
// => GEPP
for(Index i2=0; i2<rows; i2+=mc)
@@ -369,7 +364,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
const Index actual_mc = std::min(i2+mc,rows)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
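One change that recurs in this and the following product files: pack_rhs no longer pre-multiplies the packed panel by alpha; instead alpha is passed to gebp_kernel and applied once when the accumulators are written back to the result. A minimal sketch of that write-back convention, with hypothetical names (not the actual kernel):

#include <cstddef>

// Write an mr x nr block of accumulators back into the result,
// scaling by alpha at store time instead of scaling the packed rhs.
template<typename Scalar>
void kernel_writeback_sketch(Scalar* res, std::size_t resStride,
                             const Scalar* acc,   // column-major mr x nr accumulators
                             std::size_t mr, std::size_t nr, Scalar alpha)
{
  for (std::size_t j = 0; j < nr; ++j)
    for (std::size_t i = 0; i < mr; ++i)
      res[i + j*resStride] += alpha * acc[i + j*mr];
}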
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 4514c7692..8a10075f0 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -77,14 +77,14 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
Scalar t0 = cjAlpha * rhs[j];
- Packet ptmp0 = ei_pset1(t0);
+ Packet ptmp0 = ei_pset1<Packet>(t0);
Scalar t1 = cjAlpha * rhs[j+1];
- Packet ptmp1 = ei_pset1(t1);
+ Packet ptmp1 = ei_pset1<Packet>(t1);
Scalar t2 = 0;
- Packet ptmp2 = ei_pset1(t2);
+ Packet ptmp2 = ei_pset1<Packet>(t2);
Scalar t3 = 0;
- Packet ptmp3 = ei_pset1(t3);
+ Packet ptmp3 = ei_pset1<Packet>(t3);
size_t starti = FirstTriangular ? 0 : j+2;
size_t endi = FirstTriangular ? j : size;
@@ -119,10 +119,10 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
{
- Packet A0i = ei_ploadu(a0It); a0It += PacketSize;
- Packet A1i = ei_ploadu(a1It); a1It += PacketSize;
- Packet Bi = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
- Packet Xi = ei_pload (resIt);
+ Packet A0i = ei_ploadu<Packet>(a0It); a0It += PacketSize;
+ Packet A1i = ei_ploadu<Packet>(a1It); a1It += PacketSize;
+ Packet Bi = ei_ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
+ Packet Xi = ei_pload <Packet>(resIt);
Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index 40c0c9aac..8f431c2e4 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -65,23 +65,24 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
{
ei_const_blas_data_mapper<Scalar, Index, MatStorageOrder> mat(_mat,matStride);
- if(AAT)
- alpha = ei_conj(alpha);
+// if(AAT)
+// alpha = ei_conj(alpha);
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
// !!! mc must be a multiple of nr:
- if(mc>Blocking::nr)
- mc = (mc/Blocking::nr)*Blocking::nr;
+ if(mc>Traits::nr)
+ mc = (mc/Traits::nr)*Traits::nr;
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ std::size_t sizeB = sizeW + kc*size;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ Scalar* blockB = allocatedBlockB + sizeW;
// note that the actual rhs is the transpose/adjoint of mat
enum {
@@ -89,17 +90,17 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
ConjRhs = NumTraits<Scalar>::IsComplex && AAT
};
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs> gebp_kernel;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,MatStorageOrder, false> pack_lhs;
- ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs, UpLo> sybb;
+ ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, MatStorageOrder, false> pack_lhs;
+ ei_sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_kc = std::min(k2+kc,depth)-k2;
// note that the actual rhs is the transpose/adjoint of mat
- pack_rhs(blockB, &mat(0,k2), matStride, alpha, actual_kc, size);
+ pack_rhs(blockB, &mat(0,k2), matStride, actual_kc, size);
for(Index i2=0; i2<size; i2+=mc)
{
@@ -112,15 +113,15 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower)
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2),
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), alpha,
-1, -1, 0, 0, allocatedBlockB);
- sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, allocatedBlockB);
+ sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
if (UpLo==Upper)
{
Index j2 = i2+actual_mc;
- gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0),size-j2),
+ gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0), size-j2), alpha,
-1, -1, 0, 0, allocatedBlockB);
}
}
@@ -173,9 +174,9 @@ struct ei_sybb_kernel
PacketSize = ei_packet_traits<Scalar>::size,
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
};
- void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar* workspace)
+ void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar alpha, Scalar* workspace)
{
- ei_gebp_kernel<Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+ ei_gebp_kernel<Scalar, Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer;
// let's process the block per panel of actual_mc x BlockSize,
@@ -186,14 +187,15 @@ struct ei_sybb_kernel
const Scalar* actual_b = blockB+j*depth;
if(UpLo==Upper)
- gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, -1, -1, 0, 0, workspace);
+ gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
+ -1, -1, 0, 0, workspace);
// selfadjoint micro block
{
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
- gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize,
+ gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
@@ -208,7 +210,7 @@ struct ei_sybb_kernel
if(UpLo==Lower)
{
Index i = j+actualBlockSize;
- gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize,
+ gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace);
}
}
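The sizeW/sizeB pattern above (repeated in the triangular product and solver files below) reserves a single allocation holding kc*WorkSpaceFactor scalars of kernel scratch followed by the packed rhs panel; blockB simply points past the scratch. A small self-contained sketch of that layout, with illustrative names:

#include <cstddef>
#include <utility>
#include <vector>

// One buffer: [ kernel scratch (sizeW) | packed rhs panel (kc*cols) ].
// Returns the buffer and the offset at which the packed rhs starts.
template<typename Scalar>
std::pair<std::vector<Scalar>, std::size_t>
allocate_blockB(std::size_t kc, std::size_t cols, std::size_t workSpaceFactor)
{
  std::size_t sizeW = kc * workSpaceFactor;        // scratch used by the gebp kernel
  std::vector<Scalar> buffer(sizeW + kc * cols);   // scratch + packed rhs
  return std::make_pair(std::move(buffer), sizeW); // blockB = buffer.data() + sizeW
}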
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 98305f993..cef5eeba1 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -105,12 +105,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
- if (ConjugateRhs)
- alpha = ei_conj(alpha);
-
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
- SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower,
SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
};
@@ -121,10 +118,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
-// Scalar* allocatedBlockB = new Scalar[sizeB];
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ Scalar* blockB = allocatedBlockB + sizeW;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
@@ -133,9 +130,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
else
triangularBuffer.diagonal().setOnes();
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
+ ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
IsLower ? k2>0 : k2<depth;
@@ -151,7 +148,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
k2 = k2+actual_kc-kc;
}
- pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, alpha, actual_kc, cols);
+ pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
// the selected lhs's panel has to be split in three different parts:
// 1 - the part which is above the diagonal block => skip it
@@ -180,7 +177,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
}
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
- gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols,
+ gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel
@@ -190,7 +187,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
- gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
+ gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@@ -202,10 +199,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = std::min(i2+mc,end)-i2;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>()
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
@@ -235,12 +232,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
- if (ConjugateRhs)
- alpha = ei_conj(alpha);
-
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
- SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower,
SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
};
@@ -251,9 +245,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ Scalar* blockB = allocatedBlockB + sizeW;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
@@ -262,10 +257,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
else
triangularBuffer.diagonal().setOnes();
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+ ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
for(Index k2=IsLower ? 0 : depth;
IsLower ? k2<depth : k2>0;
@@ -288,7 +283,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Scalar* geb = blockB+ts*ts;
- pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs);
+ pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
// pack the triangular part of the rhs padding the unrolled blocks with zeros
if(ts>0)
@@ -301,7 +296,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
// general part
pack_rhs_panel(blockB+j2*actual_kc,
- &rhs(actual_k2+panelOffset, actual_j2), rhsStride, alpha,
+ &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
panelLength, actualPanelWidth,
actual_kc, panelOffset);
@@ -315,7 +310,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
}
pack_rhs_panel(blockB+j2*actual_kc,
- triangularBuffer.data(), triangularBuffer.outerStride(), alpha,
+ triangularBuffer.data(), triangularBuffer.outerStride(),
actualPanelWidth, actualPanelWidth,
actual_kc, j2);
}
@@ -338,6 +333,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
+ alpha,
actual_kc, actual_kc, // strides
blockOffset, blockOffset,// offsets
allocatedBlockB); // workspace
@@ -345,6 +341,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
}
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
blockA, geb, actual_mc, actual_kc, rs,
+ alpha,
-1, -1, 0, 0, allocatedBlockB);
}
}
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index de4f0b7bb..67c131ab2 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -76,12 +76,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co
if (r>0)
{
Index s = IsLower ? pi+actualPanelWidth : 0;
- ei_cache_friendly_product_colmajor_times_vector<ConjLhs,ConjRhs>(
- r,
+ ei_general_matrix_vector_product<Index,Scalar,ColMajor,ConjLhs,Scalar,ConjRhs>::run(
+ r, actualPanelWidth,
&(lhs.const_cast_derived().coeffRef(s,pi)), lhs.outerStride(),
- rhs.segment(pi, actualPanelWidth),
- &(res.coeffRef(s)),
- alpha);
+ &rhs.coeff(pi), rhs.innerStride(),
+ &res.coeffRef(s), res.innerStride(), alpha);
}
}
}
@@ -119,7 +118,7 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co
if (r>0)
{
Index s = IsLower ? 0 : pi + actualPanelWidth;
- ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>(
+ ei_general_matrix_vector_product<Index,Scalar,RowMajor,ConjLhs,Scalar,ConjRhs>::run(
actualPanelWidth, r,
&(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(),
&(rhs.const_cast_derived().coeffRef(s)), 1,
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 0fce7159e..7163a800a 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -57,9 +57,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
ei_blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
- SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
};
@@ -69,14 +69,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ Scalar* blockB = allocatedBlockB + sizeW;
ei_conj_if<Conjugate> conj;
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conjugate, false> gebp_kernel;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,TriStorageOrder> pack_lhs;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, false, true> pack_rhs;
+ ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
for(Index k2=IsLower ? 0 : size;
IsLower ? k2<size : k2>0;
@@ -140,7 +141,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
Index blockBOffset = IsLower ? k1 : lengthTarget;
// update the respective rows of B from other
- pack_rhs(blockB, _other+startBlock, otherStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
+ pack_rhs(blockB, _other+startBlock, otherStride, actualPanelWidth, cols, actual_kc, blockBOffset);
// GEBP
if (lengthTarget>0)
@@ -149,7 +150,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
- gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
+ gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1),
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@@ -166,7 +167,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
{
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
- gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols);
+ gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1));
}
}
}
@@ -191,15 +192,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
ei_blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
- typedef ei_product_blocking_traits<Scalar> Blocking;
+ typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
RhsStorageOrder = TriStorageOrder,
- SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
};
-// Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction
-// Index mc = std::min<Index>(Blocking::Max_mc,size); // cache block size along the M direction
+// Index kc = std::min<Index>(Traits::Max_kc/4,size); // cache block size along the K direction
+// Index mc = std::min<Index>(Traits::Max_mc,size); // cache block size along the M direction
// check that !!!!
Index kc = size; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
@@ -207,15 +208,16 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
+ std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+ std::size_t sizeB = sizeW + kc*size;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ Scalar* blockB = allocatedBlockB + sizeW;
ei_conj_if<Conjugate> conj;
- ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, false, Conjugate> gebp_kernel;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
- ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
- ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, ColMajor, false, true> pack_lhs_panel;
+ ei_gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+ ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+ ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;
@@ -228,7 +230,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
Scalar* geb = blockB+actual_kc*actual_kc;
- if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, -1, actual_kc, rs);
+ if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
// triangular packing (we only pack the panels off the diagonal,
 // neglecting the blocks overlapping the diagonal)
@@ -242,7 +244,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
if (panelLength>0)
pack_rhs_panel(blockB+j2*actual_kc,
- &rhs(actual_k2+panelOffset, actual_j2), triStride, -1,
+ &rhs(actual_k2+panelOffset, actual_j2), triStride,
panelLength, actualPanelWidth,
actual_kc, panelOffset);
}
@@ -273,6 +275,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
gebp_kernel(&lhs(i2,absolute_j2), otherStride,
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
+ Scalar(-1),
actual_kc, actual_kc, // strides
panelOffset, panelOffset, // offsets
allocatedBlockB); // workspace
@@ -305,7 +308,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
if (rs>0)
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
- actual_mc, actual_kc, rs,
+ actual_mc, actual_kc, rs, Scalar(-1),
-1, -1, 0, 0, allocatedBlockB);
}
}
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index e53311100..972814dc9 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -29,30 +29,37 @@
// implement and control fast level 2 and level 3 BLAS-like routines.
// forward declarations
-template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
struct ei_gebp_kernel;
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct ei_gemm_pack_rhs;
-template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct ei_gemm_pack_lhs;
template<
- typename Scalar, typename Index,
- int LhsStorageOrder, bool ConjugateLhs,
- int RhsStorageOrder, bool ConjugateRhs,
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder>
struct ei_general_matrix_matrix_product;
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType>
-static void ei_cache_friendly_product_colmajor_times_vector(
- Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha);
+template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct ei_general_matrix_vector_product;
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
-static void ei_cache_friendly_product_rowmajor_times_vector(
- Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr,
- Scalar* res, Index resIncr, Scalar alpha);
+
+template<bool Conjugate> struct ei_conj_if;
+
+template<> struct ei_conj_if<true> {
+ template<typename T>
+ inline T operator()(const T& x) { return ei_conj(x); }
+};
+
+template<> struct ei_conj_if<false> {
+ template<typename T>
+ inline const T& operator()(const T& x) { return x; }
+};
template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false>
{
@@ -90,16 +97,30 @@ template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, st
{ return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
};
-template<bool Conjugate> struct ei_conj_if;
+template<typename RealScalar,bool Conj> struct ei_conj_helper<std::complex<RealScalar>, RealScalar, Conj,false>
+{
+ typedef std::complex<RealScalar> Scalar;
+ EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const
+ { return ei_padd(c, pmul(x,y)); }
+ EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const
+ { return ei_conj_if<Conj>()(x)*y; }
+};
-template<> struct ei_conj_if<true> {
- template<typename T>
- inline T operator()(const T& x) { return ei_conj(x); }
+template<typename RealScalar,bool Conj> struct ei_conj_helper<RealScalar, std::complex<RealScalar>, false,Conj>
+{
+ typedef std::complex<RealScalar> Scalar;
+ EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const
+ { return ei_padd(c, pmul(x,y)); }
+ EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const
+ { return x*ei_conj_if<Conj>()(y); }
};
-template<> struct ei_conj_if<false> {
- template<typename T>
- inline const T& operator()(const T& x) { return x; }
+template<typename From,typename To> struct ei_get_factor {
+ EIGEN_STRONG_INLINE static To run(const From& x) { return x; }
+};
+
+template<typename Scalar> struct ei_get_factor<Scalar,typename NumTraits<Scalar>::Real> {
+ EIGEN_STRONG_INLINE static typename NumTraits<Scalar>::Real run(const Scalar& x) { return ei_real(x); }
};
// Lightweight helper class to access matrix coefficients.
@@ -130,34 +151,6 @@ class ei_const_blas_data_mapper
Index m_stride;
};
-// Defines various constant controlling register blocking for matrix-matrix algorithms.
-template<typename Scalar>
-struct ei_product_blocking_traits
-{
- typedef typename ei_packet_traits<Scalar>::type PacketType;
- enum {
- PacketSize = sizeof(PacketType)/sizeof(Scalar),
- NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
-
- // register block size along the N direction (must be either 2 or 4)
- nr = NumberOfRegisters/4,
-
- // register block size along the M direction (currently, this one cannot be modified)
- mr = 2 * PacketSize
- };
-};
-
-template<typename Real>
-struct ei_product_blocking_traits<std::complex<Real> >
-{
- typedef std::complex<Real> Scalar;
- typedef typename ei_packet_traits<Scalar>::type PacketType;
- enum {
- PacketSize = sizeof(PacketType)/sizeof(Scalar),
- nr = 2,
- mr = 2 * PacketSize
- };
-};
/* Helper class to analyze the factors of a Product expression.
* In particular it allows to pop out operator-, scalar multiples,
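The relocated ei_conj_if specializations and the new mixed real/complex ei_conj_helper overloads above give the kernels a compile-time "conjugate if requested" switch, so one code path serves all four conjugation combinations. A standalone analogue of the pattern (generic C++, not Eigen's own types):

#include <complex>

// Select at compile time between conjugating a value and passing it through.
template<bool Conjugate> struct conj_if;

template<> struct conj_if<true> {
  template<typename T> T operator()(const T& x) const { return std::conj(x); }
};
template<> struct conj_if<false> {
  template<typename T> const T& operator()(const T& x) const { return x; }
};

// Usage sketch: multiply-accumulate with optional conjugation of the lhs factor,
// resolved at compile time so no branch appears in the inner loop.
template<bool ConjLhs, typename T>
std::complex<T> madd(const std::complex<T>& a, const std::complex<T>& b,
                     std::complex<T> c)
{
  return c + conj_if<ConjLhs>()(a) * b;
}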
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 2c100c809..60fdcf2e2 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -184,7 +184,9 @@ enum {
LinearVectorizedTraversal,
/** \internal Generic vectorization path using one vectorized loop per row/column with some
* scalar loops to handle the unaligned boundaries */
- SliceVectorizedTraversal
+ SliceVectorizedTraversal,
+ /** \internal Special case to properly handle incompatible scalar types or other defective cases */
+ InvalidTraversal
};
enum {
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index c3ff9d7c4..7f626c62b 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -70,7 +70,7 @@ template<typename NullaryOp, typename MatrixType> class CwiseNullaryOp;
template<typename UnaryOp, typename MatrixType> class CwiseUnaryOp;
template<typename ViewOp, typename MatrixType> class CwiseUnaryView;
template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp;
-template<typename BinOp, typename MatrixType> class SelfCwiseBinaryOp;
+template<typename BinOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp;
template<typename Derived, typename Lhs, typename Rhs> class ProductBase;
template<typename Lhs, typename Rhs, int Mode> class GeneralProduct;
template<typename Lhs, typename Rhs, int NestingFlags> class CoeffBasedProduct;
@@ -111,11 +111,10 @@ struct ProductReturnType;
// Provides scalar/packet-wise product and product with accumulation
// with optional conjugation of the arguments.
-template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs> struct ei_conj_helper;
+template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct ei_conj_helper;
template<typename Scalar> struct ei_scalar_sum_op;
template<typename Scalar> struct ei_scalar_difference_op;
-template<typename Scalar> struct ei_scalar_product_op;
template<typename Scalar> struct ei_scalar_conj_product_op;
template<typename Scalar> struct ei_scalar_quotient_op;
template<typename Scalar> struct ei_scalar_opposite_op;
@@ -143,7 +142,8 @@ template<typename Scalar> struct ei_scalar_add_op;
template<typename Scalar> struct ei_scalar_constant_op;
template<typename Scalar> struct ei_scalar_identity_op;
-template<typename Scalar1,typename Scalar2> struct ei_scalar_multiple2_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct ei_scalar_product_op;
+template<typename LhsScalar,typename RhsScalar> struct ei_scalar_multiple2_op;
struct IOFormat;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index f52e8b57c..7600cc3e7 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -147,6 +147,12 @@
#define EIGEN_ALWAYS_INLINE_ATTRIB
#endif
+#if EIGEN_GNUC_AT_LEAST(4,0)
+#define EIGEN_FLATTEN_ATTRIB __attribute__((flatten))
+#else
+#define EIGEN_FLATTEN_ATTRIB
+#endif
+
// EIGEN_FORCE_INLINE means "inline as much as possible"
#if (defined _MSC_VER) || (defined __intel_compiler)
#define EIGEN_STRONG_INLINE __forceinline
@@ -353,10 +359,8 @@
#define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \
CwiseBinaryOp< \
ei_scalar_product_op< \
- typename ei_scalar_product_traits< \
typename ei_traits<LHS>::Scalar, \
typename ei_traits<RHS>::Scalar \
- >::ReturnType \
>, \
LHS, \
RHS \
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index b01ceafb2..3d28680b6 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -205,10 +205,10 @@ template<typename T> struct ei_scalar_product_traits<std::complex<T>, T>
};
// FIXME quick workaround around current limitation of ei_result_of
-template<typename Scalar, typename ArgType0, typename ArgType1>
-struct ei_result_of<ei_scalar_product_op<Scalar>(ArgType0,ArgType1)> {
-typedef typename ei_scalar_product_traits<typename ei_cleantype<ArgType0>::type, typename ei_cleantype<ArgType1>::type>::ReturnType type;
-};
+// template<typename Scalar, typename ArgType0, typename ArgType1>
+// struct ei_result_of<ei_scalar_product_op<Scalar>(ArgType0,ArgType1)> {
+// typedef typename ei_scalar_product_traits<typename ei_cleantype<ArgType0>::type, typename ei_cleantype<ArgType1>::type>::ReturnType type;
+// };
template<typename T> struct ei_is_diagonal
{ enum { ret = false }; };
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 94bb5569e..f62d0d8a4 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -322,8 +322,8 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
Index alignedStart = ei_first_aligned(y, size);
Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
- const Packet pc = ei_pset1(Scalar(j.c()));
- const Packet ps = ei_pset1(Scalar(j.s()));
+ const Packet pc = ei_pset1<Packet>(j.c());
+ const Packet ps = ei_pset1<Packet>(j.s());
ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
for(Index i=0; i<alignedStart; ++i)
@@ -341,8 +341,8 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
{
for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
{
- Packet xi = ei_pload(px);
- Packet yi = ei_pload(py);
+ Packet xi = ei_pload<Packet>(px);
+ Packet yi = ei_pload<Packet>(py);
ei_pstore(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi)));
ei_pstore(py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi)));
px += PacketSize;
@@ -354,10 +354,10 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
{
- Packet xi = ei_ploadu(px);
- Packet xi1 = ei_ploadu(px+PacketSize);
- Packet yi = ei_pload (py);
- Packet yi1 = ei_pload (py+PacketSize);
+ Packet xi = ei_ploadu<Packet>(px);
+ Packet xi1 = ei_ploadu<Packet>(px+PacketSize);
+ Packet yi = ei_pload <Packet>(py);
+ Packet yi1 = ei_pload <Packet>(py+PacketSize);
ei_pstoreu(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi)));
ei_pstoreu(px+PacketSize, ei_padd(ei_pmul(pc,xi1),pcj.pmul(ps,yi1)));
ei_pstore (py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi)));
@@ -367,8 +367,8 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
}
if(alignedEnd!=peelingEnd)
{
- Packet xi = ei_ploadu(x+peelingEnd);
- Packet yi = ei_pload (y+peelingEnd);
+ Packet xi = ei_ploadu<Packet>(x+peelingEnd);
+ Packet yi = ei_pload <Packet>(y+peelingEnd);
ei_pstoreu(x+peelingEnd, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi)));
ei_pstore (y+peelingEnd, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi)));
}