author    Gael Guennebaud <g.gael@free.fr>  2015-02-13 10:03:53 +0100
committer Gael Guennebaud <g.gael@free.fr>  2015-02-13 10:03:53 +0100
commit    fe513199808654bfa5080fe16bda7dcdafbd57c6 (patch)
tree      71c207f44df25ebd76d19531e65cb6e22efd5c89 /Eigen/src/Core
parent    e8cdbedefb1913b5a0e2f2b7d38470f081cb8d29 (diff)
parent    0918c51e600bed36a53448fa276b01387119a3c2 (diff)
Merge Index-refactoring branch with default, fix PastixSupport, remove some useless typedefs
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r--  Eigen/src/Core/Block.h                                   |    2
-rw-r--r--  Eigen/src/Core/CommaInitializer.h                        |    1
-rw-r--r--  Eigen/src/Core/CoreEvaluators.h                          |   19
-rw-r--r--  Eigen/src/Core/DiagonalMatrix.h                          |    6
-rw-r--r--  Eigen/src/Core/GeneralProduct.h                          |   24
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h                       |   31
-rw-r--r--  Eigen/src/Core/MapBase.h                                 |    7
-rw-r--r--  Eigen/src/Core/MathFunctions.h                           |   47
-rw-r--r--  Eigen/src/Core/ProductEvaluators.h                       |   13
-rw-r--r--  Eigen/src/Core/SolveTriangular.h                         |    2
-rw-r--r--  Eigen/src/Core/Transpose.h                               |   27
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h                     |   27
-rwxr-xr-x  Eigen/src/Core/arch/AltiVec/PacketMath.h                 |    8
-rw-r--r--  Eigen/src/Core/arch/CUDA/MathFunctions.h                 |   75
-rw-r--r--  Eigen/src/Core/arch/CUDA/PacketMath.h                    |  296
-rw-r--r--  Eigen/src/Core/arch/NEON/MathFunctions.h                 |   91
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h                    |   57
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h                        |    8
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h                     |   49
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h        |  729
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h            |   97
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h  |   56
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector.h            |  250
-rw-r--r--  Eigen/src/Core/products/Parallelizer.h                   |    4
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h        |   57
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix.h         |   67
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixVector.h         |   42
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix.h         |   53
-rw-r--r--  Eigen/src/Core/products/TriangularSolverVector.h         |   24
-rw-r--r--  Eigen/src/Core/util/BlasUtil.h                           |  147
-rw-r--r--  Eigen/src/Core/util/Constants.h                          |   16
-rw-r--r--  Eigen/src/Core/util/Macros.h                             |   29
-rw-r--r--  Eigen/src/Core/util/Memory.h                             |   53
-rw-r--r--  Eigen/src/Core/util/XprHelper.h                          |   15
34 files changed, 1662 insertions(+), 767 deletions(-)
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 6ea383695..2ef37ca1c 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -87,7 +87,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
// FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
- Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit
+ Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit
// FIXME DirectAccessBit should not be handled by expressions
};
};
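
In practice, the extra flag bit matters for sparse expressions: a block that spans whole inner vectors (an inner panel) can now keep advertising compressed storage when the nested expression has it. A minimal sketch of such a block, assuming Eigen's sparse module:

    #include <Eigen/Sparse>

    Eigen::SparseMatrix<double> A(100, 100);  // column-major, compressed storage
    // Whole columns of a column-major matrix form an inner panel, so this block
    // inherits CompressedAccessBit and can be traversed like the matrix itself.
    auto B = A.middleCols(10, 5);
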
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index 98ebe3bf6..808f3977c 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -28,7 +28,6 @@ template<typename XprType>
struct CommaInitializer
{
typedef typename XprType::Scalar Scalar;
- typedef typename XprType::StorageIndex StorageIndex;
EIGEN_DEVICE_FUNC
inline CommaInitializer(XprType& xpr, const Scalar& s)
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index eb35b44cb..9485080d3 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -111,6 +111,7 @@ struct evaluator_base
typedef evaluator<ExpressionType> type;
typedef evaluator<ExpressionType> nestedType;
+ // FIXME is it really useful?
typedef typename traits<ExpressionType>::StorageIndex StorageIndex;
// TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
typedef traits<ExpressionType> ExpressionTraits;
@@ -128,7 +129,6 @@ struct evaluator<PlainObjectBase<Derived> >
: evaluator_base<Derived>
{
typedef PlainObjectBase<Derived> PlainObjectType;
- typedef typename PlainObjectType::StorageIndex StorageIndex;
typedef typename PlainObjectType::Scalar Scalar;
typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
typedef typename PlainObjectType::PacketScalar PacketScalar;
@@ -264,7 +264,6 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@@ -343,7 +342,6 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
: m_functor(n.functor())
{ }
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@@ -394,7 +392,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
m_argImpl(op.nestedExpression())
{ }
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@@ -469,7 +466,6 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
m_rhsImpl(xpr.rhs())
{ }
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@@ -522,7 +518,6 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
m_argImpl(op.nestedExpression())
{ }
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -563,7 +558,6 @@ struct mapbase_evaluator : evaluator_base<Derived>
{
typedef Derived XprType;
typedef typename XprType::PointerType PointerType;
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@@ -760,7 +754,6 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
m_startCol(block.startCol())
{ }
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@@ -865,7 +858,6 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
m_elseImpl(select.elseMatrix())
{ }
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
@@ -898,7 +890,6 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
: evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
{
typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
@@ -981,7 +972,6 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
: m_expr(expr)
{}
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
@@ -1016,7 +1006,6 @@ struct evaluator_wrapper_base
EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
- typedef typename ArgType::StorageIndex StorageIndex;
typedef typename ArgType::Scalar Scalar;
typedef typename ArgType::CoeffReturnType CoeffReturnType;
typedef typename ArgType::PacketScalar PacketScalar;
@@ -1103,7 +1092,6 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
: evaluator_base<Reverse<ArgType, Direction> >
{
typedef Reverse<ArgType, Direction> XprType;
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@@ -1219,9 +1207,10 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
m_index(diagonal.index())
{ }
- typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
+ // FIXME having to check whether ArgType is sparse here is not very nice.
+ typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
+ typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
{
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index f37091000..56beaf3bc 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -326,6 +326,12 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
dst.setZero();
dst.diagonal() = src.diagonal();
}
+
+ static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+ { dst.diagonal() += src.diagonal(); }
+
+ static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+ { dst.diagonal() -= src.diagonal(); }
};
} // namespace internal
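
With the two overloads added above, compound assignment from a diagonal expression only touches the destination's diagonal instead of going through a generic dense path. A usage sketch:

    Eigen::MatrixXd M = Eigen::MatrixXd::Zero(3, 3);
    Eigen::Vector3d d(1.0, 2.0, 3.0);
    M += d.asDiagonal();  // dispatched to dst.diagonal() += src.diagonal()
    M -= d.asDiagonal();  // likewise for subtraction
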
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 4eb01b1b3..81750722c 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -11,7 +11,7 @@
#ifndef EIGEN_GENERAL_PRODUCT_H
#define EIGEN_GENERAL_PRODUCT_H
-namespace Eigen {
+namespace Eigen {
enum {
Large = 2,
@@ -252,12 +252,12 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-
+
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
evalToDest ? dest.data() : static_dest.data());
-
+
if(!evalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
@@ -273,11 +273,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
MappedDest(actualDestPtr, dest.size()) = dest;
}
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
general_matrix_vector_product
- <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+ <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
- actualLhs.data(), actualLhs.outerStride(),
- actualRhs.data(), actualRhs.innerStride(),
+ LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+ RhsMapper(actualRhs.data(), actualRhs.innerStride()),
actualDestPtr, 1,
compatibleAlpha);
@@ -333,11 +335,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
}
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
general_matrix_vector_product
- <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+ <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
- actualLhs.data(), actualLhs.outerStride(),
- actualRhsPtr, 1,
+ LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+ RhsMapper(actualRhsPtr, 1),
dest.data(), dest.innerStride(),
actualAlpha);
}
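
The mapper refactoring replaces the raw (pointer, stride) pairs previously passed to the kernels with small mapper objects that encapsulate the memory layout. A hypothetical, simplified illustration of the idea (not the actual const_blas_data_mapper definition):

    // Sketch: a column-major mapper bundles a base pointer and the outer
    // stride, and maps (row, col) to a linear offset.
    template<typename Scalar, typename Index>
    struct simple_col_major_mapper {
      const Scalar* data;
      Index stride;  // distance between the starts of consecutive columns
      const Scalar& operator()(Index i, Index j) const { return data[i + j*stride]; }
    };
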
@@ -410,7 +414,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
#ifdef EIGEN_DEBUG_PRODUCT
internal::product_type<Derived,OtherDerived>::debug();
#endif
-
+
return Product<Derived, OtherDerived>(derived(), other.derived());
}
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 81f4eef40..8759cd06c 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -54,6 +54,7 @@ struct default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 1,
+ HasBlend = 0,
HasDiv = 0,
HasSqrt = 0,
@@ -94,6 +95,8 @@ template<typename T> struct packet_traits : default_packet_traits
};
};
+template<typename T> struct packet_traits<const T> : packet_traits<T> { };
+
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
padd(const Packet& a,
@@ -356,7 +359,7 @@ pmadd(const Packet& a,
/** \internal \returns a packet version of \a *from.
* If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
template<typename Packet, int LoadMode>
-inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
{
if(LoadMode == Aligned)
return pload<Packet>(from);
@@ -367,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
/** \internal copy the packet \a from to \a *to.
* If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
template<typename Scalar, typename Packet, int LoadMode>
-inline void pstoret(Scalar* to, const Packet& from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
{
if(LoadMode == Aligned)
pstore(to, from);
@@ -375,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from)
pstoreu(to, from);
}
+/** \internal \returns a packet version of \a *from.
+ * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
+ * hardware if available to speed up the loading of data that won't be modified
+ * by the current computation.
+ */
+template<typename Packet, int LoadMode>
+inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+ return ploadt<Packet, LoadMode>(from);
+}
+
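
By default this is a plain forwarding to ploadt; the indirection exists so that targets with a dedicated read-only path can specialize it (the CUDA backend added by this commit routes it through __ldg). A call-site sketch, assuming SSE's Packet4f:

    // LoadMode selects the aligned or unaligned variant, as with ploadt.
    Packet4f p = internal::ploadt_ro<Packet4f, Aligned>(from);
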
/** \internal default implementation of palign() allowing partial specialization */
template<int Offset,typename PacketType>
struct palign_impl
@@ -433,6 +447,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
// Nothing to do in the scalar case, i.e. a 1x1 matrix.
}
+/***************************************************************************
+ * Selector, i.e. vector of N boolean values used to select (i.e. blend)
+ * words from 2 packets.
+***************************************************************************/
+template <size_t N> struct Selector {
+ bool select[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+ return ifPacket.select[0] ? thenPacket : elsePacket;
+}
+
} // end namespace internal
} // end namespace Eigen
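
A usage sketch for the new Selector/pblend pair, assuming a 4-lane float packet: lanes whose selector entry is true take their value from thenPacket, the others from elsePacket. (The generic fallback above degenerates to testing select[0] for the whole packet; the SIMD specializations later in this commit blend per lane.)

    internal::Selector<4> mask;
    mask.select[0] = true;  mask.select[1] = false;
    mask.select[2] = true;  mask.select[3] = false;
    Packet4f a = internal::pset1<Packet4f>(1.0f);
    Packet4f b = internal::pset1<Packet4f>(2.0f);
    Packet4f r = internal::pblend(mask, a, b);  // lanes: 1, 2, 1, 2 on SSE
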
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 3dafee9d7..8dca9796d 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -171,6 +171,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
template<typename Derived> class MapBase<Derived, WriteAccessors>
: public MapBase<Derived, ReadOnlyAccessors>
{
+ typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
public:
typedef MapBase<Derived, ReadOnlyAccessors> Base;
@@ -238,11 +239,13 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
EIGEN_DEVICE_FUNC
Derived& operator=(const MapBase& other)
{
- Base::Base::operator=(other);
+ ReadOnlyMapBase::Base::operator=(other);
return derived();
}
- using Base::Base::operator=;
+ // In theory we could simply refer to Base::Base::operator=, but MSVC does not like Base::Base,
+ // see bugs 821 and 920.
+ using ReadOnlyMapBase::Base::operator=;
};
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 4c5fc1cae..16ad2dc7e 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -14,7 +14,7 @@ namespace Eigen {
// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
-#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500
+#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
long abs(long x) { return (labs(x)); }
double abs(double x) { return (fabs(x)); }
float abs(float x) { return (fabsf(x)); }
@@ -360,50 +360,31 @@ inline NewType cast(const OldType& x)
}
/****************************************************************************
-* Implementation of atanh2 *
+* Implementation of log1p *
****************************************************************************/
template<typename Scalar>
-struct atanh2_impl
+struct log1p_impl
{
- static inline Scalar run(const Scalar& x, const Scalar& r)
+ static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
- #if (__cplusplus >= 201103L) && !defined(__CYGWIN__)
+ // Let's be conservative and enable the default C++11 implementation only if we are sure it exists
+ #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \
+ && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
using std::log1p;
- return log1p(2 * x / (r - x)) / 2;
+ return log1p(x);
#else
- using std::abs;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
using std::log;
- using std::sqrt;
- Scalar z = x / r;
- if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon()))
- return log((r + x) / (r - x)) / 2;
- else
- return z + z*z*z / 3;
+ Scalar x1p = RealScalar(1) + x;
+ return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
#endif
}
};
-template<typename RealScalar>
-struct atanh2_impl<std::complex<RealScalar> >
-{
- typedef std::complex<RealScalar> Scalar;
- static inline Scalar run(const Scalar& x, const Scalar& r)
- {
- using std::log;
- using std::norm;
- using std::sqrt;
- Scalar z = x / r;
- if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon())
- return RealScalar(0.5) * log((r + x) / (r - x));
- else
- return z + z*z*z / RealScalar(3);
- }
-};
-
template<typename Scalar>
-struct atanh2_retval
+struct log1p_retval
{
typedef Scalar type;
};
@@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
template<typename Scalar>
EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
+inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
{
- return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
+ return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
}
template<typename Scalar>
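
The non-C++11 branch above is Kahan's classic correction: with u = 1 + x computed in floating point, x * (log(u) / (u - 1)) cancels the rounding error committed when forming u. A small self-contained demonstration:

    #include <cmath>
    #include <cstdio>

    int main() {
      double x = 1e-17;            // so small that 1.0 + x rounds to exactly 1.0
      double u = 1.0 + x;
      double naive = std::log(u);  // 0: all information about x is lost
      double kahan = (u == 1.0) ? x : x * (std::log(u) / (u - 1.0));
      std::printf("naive=%g kahan=%g\n", naive, kahan);  // prints 0 and 1e-17
      return 0;
    }
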
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index b2c9b56ed..7f9d135f7 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -210,23 +210,26 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
template<typename Dst, typename Lhs, typename Rhs, typename Func>
EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
+ typename evaluator<Rhs>::type rhsEval(rhs);
// FIXME make sure lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
- // FIXME we should probably build an evaluator for dst and rhs
+ // FIXME we should probably build an evaluator for dst
const Index cols = dst.cols();
for (Index j=0; j<cols; ++j)
- func(dst.col(j), rhs.coeff(0,j) * lhs);
+ func(dst.col(j), rhsEval.coeff(0,j) * lhs);
}
// Row major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) {
+EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+{
+ typename evaluator<Lhs>::type lhsEval(lhs);
// FIXME make sure rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
- // FIXME we should probably build an evaluator for dst and lhs
+ // FIXME we should probably build an evaluator for dst
const Index rows = dst.rows();
for (Index i=0; i<rows; ++i)
- func(dst.row(i), lhs.coeff(i,0) * rhs);
+ func(dst.row(i), lhsEval.coeff(i,0) * rhs);
}
template<typename Lhs, typename Rhs>
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 0f17e3a89..f97048bda 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
- BlockingType blocking(rhs.rows(), rhs.cols(), size);
+ BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index e1316a73d..5d60ba149 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -213,18 +213,39 @@ MatrixBase<Derived>::adjoint() const
namespace internal {
template<typename MatrixType,
- bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
+ bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
+ bool MatchPacketSize =
+ (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
+ && (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
struct inplace_transpose_selector;
template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,true> { // square matrix
+struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
static void run(MatrixType& m) {
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
}
};
+// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,false> { // non square matrix
+struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
+ static void run(MatrixType& m) {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+ typedef typename MatrixType::Index Index;
+ const Index PacketSize = internal::packet_traits<Scalar>::size;
+ const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned;
+ PacketBlock<Packet> A;
+ for (Index i=0; i<PacketSize; ++i)
+ A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
+ internal::ptranspose(A);
+ for (Index i=0; i<PacketSize; ++i)
+ m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
+ }
+};
+
+template<typename MatrixType,bool MatchPacketSize>
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
static void run(MatrixType& m) {
if (m.rows()==m.cols())
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
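
With the new specialization, transposing a PacketSize x PacketSize matrix in place is done entirely in registers through ptranspose. For example, assuming SSE where packet_traits<float>::size == 4:

    Eigen::Matrix4f M = Eigen::Matrix4f::Random();
    M.transposeInPlace();  // 4x4 float: 4 packet loads, one ptranspose, 4 stores
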
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index e2376bd1f..be66a502a 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -22,9 +22,9 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
-#ifdef EIGEN_VECTORIZE_FMA
-#ifndef EIGEN_HAS_FUSED_MADD
-#define EIGEN_HAS_FUSED_MADD 1
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
@@ -58,7 +58,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasCos = 0,
HasLog = 0,
HasExp = 0,
- HasSqrt = 0
+ HasSqrt = 0,
+ HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@@ -72,7 +73,8 @@ template<> struct packet_traits<double> : default_packet_traits
HasHalfPacket = 1,
HasDiv = 1,
- HasExp = 0
+ HasExp = 0,
+ HasBlend = 1
};
};
@@ -133,7 +135,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
return pset1<Packet8i>(0);
}
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
// clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
@@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) {
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
+template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
+ const __m256 zero = _mm256_setzero_ps();
+ const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+ __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
+ return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
+ const __m256d zero = _mm256_setzero_pd();
+ const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+ __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
+ return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index fa02f57a1..6b68fc7a5 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -18,12 +18,12 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif
-#ifndef EIGEN_HAS_FUSED_MADD
-#define EIGEN_HAS_FUSED_MADD 1
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
-#ifndef EIGEN_HAS_FUSE_CJMADD
-#define EIGEN_HAS_FUSE_CJMADD 1
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
new file mode 100644
index 000000000..e7305c01e
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
+#define EIGEN_MATH_FUNCTIONS_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> EIGEN_STRONG_INLINE
+float4 plog<float4>(const float4& a)
+{
+ return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
+}
+
+template<> EIGEN_STRONG_INLINE
+double2 plog<double2>(const double2& a)
+{
+ return make_double2(log(a.x), log(a.y));
+}
+
+template<> EIGEN_STRONG_INLINE
+float4 pexp<float4>(const float4& a)
+{
+ return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
+}
+
+template<> EIGEN_STRONG_INLINE
+double2 pexp<double2>(const double2& a)
+{
+ return make_double2(exp(a.x), exp(a.y));
+}
+
+template<> EIGEN_STRONG_INLINE
+float4 psqrt<float4>(const float4& a)
+{
+ return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+}
+
+template<> EIGEN_STRONG_INLINE
+double2 psqrt<double2>(const double2& a)
+{
+ return make_double2(sqrt(a.x), sqrt(a.y));
+}
+
+template<> EIGEN_STRONG_INLINE
+float4 prsqrt<float4>(const float4& a)
+{
+ return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
+}
+
+template<> EIGEN_STRONG_INLINE
+double2 prsqrt<double2>(const double2& a)
+{
+ return make_double2(rsqrt(a.x), rsqrt(a.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
new file mode 100644
index 000000000..19749c832
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -0,0 +1,296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_CUDA_H
+#define EIGEN_PACKET_MATH_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> struct is_arithmetic<float4> { enum { value = true }; };
+template<> struct is_arithmetic<double2> { enum { value = true }; };
+
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+ typedef float4 type;
+ typedef float4 half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size=4,
+ HasHalfPacket = 0,
+
+ HasDiv = 1,
+ HasSin = 0,
+ HasCos = 0,
+ HasLog = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+
+ HasBlend = 0,
+ };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+ typedef double2 type;
+ typedef double2 half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size=2,
+ HasHalfPacket = 0,
+
+ HasDiv = 1,
+ HasLog = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+
+ HasBlend = 0,
+ };
+};
+
+
+template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
+ return make_float4(from, from, from, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+ return make_double2(from, from);
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) {
+ return make_float4(a, a+1, a+2, a+3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) {
+ return make_double2(a, a+1);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x+b.x, a.y+b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x-b.x, a.y-b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+ return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+ return make_double2(-a.x, -a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x*b.x, a.y*b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x/b.x, a.y/b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+ return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+ return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+ return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+ return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+ return *reinterpret_cast<const float4*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+ return *reinterpret_cast<const double2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+ return make_float4(from[0], from[1], from[2], from[3]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+ return make_double2(from[0], from[1]);
+}
+
+template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
+ return make_float4(from[0], from[0], from[1], from[1]);
+}
+template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
+ return make_double2(from[0], from[0]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
+ *reinterpret_cast<float4*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+ *reinterpret_cast<double2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
+ to[0] = from.x;
+ to[1] = from.y;
+ to[2] = from.z;
+ to[3] = from.w;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+ to[0] = from.x;
+ to[1] = from.y;
+}
+
+#ifdef __CUDA_ARCH__
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+ return __ldg((const float4*)from);
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+ return __ldg((const double2*)from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+ return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+ return make_double2(__ldg(from+0), __ldg(from+1));
+}
+#endif
+
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
+ return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
+ return make_double2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
+ to[stride*0] = from.x;
+ to[stride*1] = from.y;
+ to[stride*2] = from.z;
+ to[stride*3] = from.w;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
+ to[stride*0] = from.x;
+ to[stride*1] = from.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+ return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+ return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
+ return a.x + a.y + a.z + a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+ return a.x + a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
+ return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+ return fmax(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
+ return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+ return fmin(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+ return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+ return make_double2(abs(a.x), abs(a.y));
+}
+
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<float4,4>& kernel) {
+ double tmp = kernel.packet[0].y;
+ kernel.packet[0].y = kernel.packet[1].x;
+ kernel.packet[1].x = tmp;
+
+ tmp = kernel.packet[0].z;
+ kernel.packet[0].z = kernel.packet[2].x;
+ kernel.packet[2].x = tmp;
+
+ tmp = kernel.packet[0].w;
+ kernel.packet[0].w = kernel.packet[3].x;
+ kernel.packet[3].x = tmp;
+
+ tmp = kernel.packet[1].z;
+ kernel.packet[1].z = kernel.packet[2].y;
+ kernel.packet[2].y = tmp;
+
+ tmp = kernel.packet[1].w;
+ kernel.packet[1].w = kernel.packet[3].y;
+ kernel.packet[3].y = tmp;
+
+ tmp = kernel.packet[2].w;
+ kernel.packet[2].w = kernel.packet[3].z;
+ kernel.packet[3].z = tmp;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<double2,2>& kernel) {
+ double tmp = kernel.packet[0].y;
+ kernel.packet[0].y = kernel.packet[1].x;
+ kernel.packet[1].x = tmp;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_CUDA_H
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
new file mode 100644
index 000000000..6bb05bb92
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
+#define EIGEN_MATH_FUNCTIONS_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+ Packet4f x = _x;
+ Packet4f tmp, fx;
+
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+ _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
+ _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+ x = vminq_f32(x, p4f_exp_hi);
+ x = vmaxq_f32(x, p4f_exp_lo);
+
+ /* express exp(x) as exp(g + n*log(2)) */
+ fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
+
+ /* perform a floorf */
+ tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+ /* if greater, subtract 1 */
+ Packet4ui mask = vcgtq_f32(tmp, fx);
+ mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
+
+ fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+ tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
+ Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
+ x = vsubq_f32(x, tmp);
+ x = vsubq_f32(x, z);
+
+ Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
+ z = vmulq_f32(x, x);
+ y = vaddq_f32(y, p4f_cephes_exp_p1);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_exp_p2);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_exp_p3);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_exp_p4);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_exp_p5);
+
+ y = vmulq_f32(y, z);
+ y = vaddq_f32(y, x);
+ y = vaddq_f32(y, p4f_1);
+
+ /* build 2^n */
+ int32x4_t mm;
+ mm = vcvtq_s32_f32(fx);
+ mm = vaddq_s32(mm, p4i_0x7f);
+ mm = vshlq_n_s32(mm, 23);
+ Packet4f pow2n = vreinterpretq_f32_s32(mm);
+
+ y = vmulq_f32(y, pow2n);
+ return y;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_NEON_H
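
For reference, the same Cephes-style algorithm written out in scalar form; this is an illustrative sketch, not Eigen API. It splits exp(x) = 2^n * exp(g) with n = round(x/ln 2), evaluates exp(g) as 1 + g + g^2*P(g) with a degree-5 polynomial P, and rebuilds the 2^n scaling at the end (ldexp here, exponent-bit shifting in the NEON code):

    #include <cmath>

    float cephes_exp_sketch(float x) {
      const float LOG2EF = 1.44269504088896341f;
      float n = std::floor(LOG2EF * x + 0.5f);  // round x/ln2 to nearest integer
      // Subtract n*ln2 in two pieces (C1 + C2 = ln 2) to keep g accurate.
      float g = x - n * 0.693359375f - n * (-2.12194440e-4f);
      float p = 1.9875691500e-4f;
      p = p * g + 1.3981999507e-3f;
      p = p * g + 8.3334519073e-3f;
      p = p * g + 4.1665795894e-2f;
      p = p * g + 1.6666665459e-1f;
      p = p * g + 5.0000001201e-1f;
      float eg = 1.0f + g + g * g * p;             // exp(g) approximation
      return std::ldexp(eg, static_cast<int>(n));  // scale by 2^n
    }
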
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 6c5c669a1..559682cf7 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -20,12 +20,12 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
-#ifndef EIGEN_HAS_FUSED_MADD
-#define EIGEN_HAS_FUSED_MADD 1
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
-#ifndef EIGEN_HAS_FUSE_CJMADD
-#define EIGEN_HAS_FUSE_CJMADD 1
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// FIXME NEON has 16 quad registers, but since the current register allocator
@@ -88,7 +88,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasSin = 0,
HasCos = 0,
HasLog = 0,
- HasExp = 0,
+ HasExp = 1,
HasSqrt = 0
};
};
@@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
return pset1<Packet4i>(0);
}
-// for some weird raisons, it has to be overloaded for packet of integers
+#ifdef __ARM_FEATURE_FMA
+// See bug 936.
+// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
+// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
+// MLA is not fused i.e. does 2 roundings.
+// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
+// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
+#else
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
+#endif
+
+// No FMA instruction for int, so use MLA unconditionally.
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
@@ -492,6 +503,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
//---------- double ----------
#if EIGEN_ARCH_ARM64
+#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
+// Bug 907: workaround missing declarations of the following two functions in the ADK
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f64 (float64x2_t __a)
+{
+ return (uint64x2_t) __a;
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_f64_u64 (uint64x2_t __a)
+{
+ return (float64x2_t) __a;
+}
+#endif
+
typedef float64x2_t Packet2d;
typedef float64x1_t Packet1d;
@@ -536,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
-// for some weird raisons, it has to be overloaded for packet of integers
+#ifdef __ARM_FEATURE_FMA
+// See bug 936. See above comment about FMA for float.
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
+#else
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
+#endif
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
@@ -597,7 +627,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); }
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+// workaround ICE, see bug 907
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+#endif
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
@@ -613,7 +648,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
}
// Other reduction functions:
// mul
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); }
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+#endif
// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 414c4cb6a..565e448fe 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
- HasSetLinear = 0
+ HasSetLinear = 0,
+ HasBlend = 1
};
};
#endif
@@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
kernel.packet[1].v = tmp;
}
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+ __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+ return Packet2cf(_mm_castpd_ps(result));
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 28427c308..898cb9ab0 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -22,9 +22,9 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
-#ifdef EIGEN_VECTORIZE_FMA
-#ifndef EIGEN_HAS_FUSED_MADD
-#define EIGEN_HAS_FUSED_MADD 1
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
#endif
#endif
@@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
- HasSqrt = 1
+ HasSqrt = 1,
+ HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@@ -123,7 +124,8 @@ template<> struct packet_traits<double> : default_packet_traits
HasDiv = 1,
HasExp = 1,
- HasSqrt = 1
+ HasSqrt = 1,
+ HasBlend = 1
};
};
#endif
@@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits
// FIXME check the Has*
Vectorizable = 1,
AlignedOnScalar = 1,
- size=4
+ size=4,
+
+ HasBlend = 1
};
};
@@ -227,7 +231,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
// for some weird raisons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
@@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+ __m128i false_mask = _mm_cmpeq_epi32(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+ return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
+#else
+ return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+ const __m128 zero = _mm_setzero_ps();
+ const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+ __m128 false_mask = _mm_cmpeq_ps(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+ return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
+#else
+ return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+ const __m128d zero = _mm_setzero_pd();
+ const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
+ __m128d false_mask = _mm_cmpeq_pd(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+ return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
+#else
+ return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
+#endif
+}
+
} // end namespace internal
} // end namespace Eigen
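
Without SSE4.1's blendv instructions, the fallbacks above assemble the result with the classic andnot/and/or mask idiom. The per-lane logic in scalar form (a sketch, not Eigen code):

    #include <cstdint>

    // mask is all-ones where the selector was zero, i.e. where 'els' wins.
    uint32_t blend_lane(uint32_t mask, uint32_t thn, uint32_t els) {
      return (~mask & thn) | (mask & els);
    }
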
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 7b2ed6728..11e5f591d 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
}
/** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
- static std::ptrdiff_t m_l1CacheSize = 0;
- static std::ptrdiff_t m_l2CacheSize = 0;
- if(m_l2CacheSize==0)
+ static bool m_cache_sizes_initialized = false;
+ static std::ptrdiff_t m_l1CacheSize = 32*1024;
+ static std::ptrdiff_t m_l2CacheSize = 256*1024;
+ static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
+
+ if(!m_cache_sizes_initialized)
{
- m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
- m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+ int l1CacheSize, l2CacheSize, l3CacheSize;
+ queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+ m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
+ m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
+ m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
+ m_cache_sizes_initialized = true;
}
-
+
if(action==SetAction)
{
// set the cpu cache size and cache all block sizes from a global cache size in byte
eigen_internal_assert(l1!=0 && l2!=0);
m_l1CacheSize = *l1;
m_l2CacheSize = *l2;
+ m_l3CacheSize = *l3;
}
else if(action==GetAction)
{
eigen_internal_assert(l1!=0 && l2!=0);
*l1 = m_l1CacheSize;
*l2 = m_l2CacheSize;
+ *l3 = m_l3CacheSize;
}
else
{
@@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
* - the number of scalars that fit into a packet (when vectorization is enabled).
*
* \sa setCpuCacheSizes */
+#define CEIL(a, b) (((a)+(b)-1)/(b))
+
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
{
- EIGEN_UNUSED_VARIABLE(n);
// Explanations:
// Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
// mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
@@ -81,47 +91,75 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
// at the register level. For vectorization purpose, these small vertical panels are unpacked,
// e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
// stay in L1 cache.
- std::ptrdiff_t l1, l2;
-
- typedef gebp_traits<LhsScalar,RhsScalar> Traits;
- enum {
- kdiv = KcFactor * 2 * Traits::nr
- * Traits::RhsProgress * sizeof(RhsScalar),
- mr = gebp_traits<LhsScalar,RhsScalar>::mr,
- mr_mask = (0xffffffff/mr)*mr
- };
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+ if (num_threads > 1) {
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+ enum {
+ kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+ ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+ k_mask = (0xffffffff/8)*8,
+
+ mr = Traits::mr,
+ mr_mask = (0xffffffff/mr)*mr,
+
+ nr = Traits::nr,
+ nr_mask = (0xffffffff/nr)*nr
+ };
+ SizeType k_cache = (l1-ksub)/kdiv;
+ if (k_cache < k) {
+ k = k_cache & k_mask;
+ eigen_assert(k > 0);
+ }
- manage_caching_sizes(GetAction, &l1, &l2);
+ SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+ SizeType n_per_thread = CEIL(n, num_threads);
+ if (n_cache <= n_per_thread) {
+ // Don't exceed the capacity of the l2 cache.
+ eigen_assert(n_cache >= static_cast<SizeType>(nr));
+ n = n_cache & nr_mask;
+ eigen_assert(n > 0);
+ } else {
+ n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
+ }
-// k = std::min<SizeType>(k, l1/kdiv);
-// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-// if(_m<m) m = _m & mr_mask;
-
- // In unit tests we do not want to use extra large matrices,
- // so we reduce the block size to check the blocking strategy is not flawed
+ if (l3 > l2) {
+ // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+ SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+ SizeType m_per_thread = CEIL(m, num_threads);
+ if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
+ m = m_cache & mr_mask;
+ eigen_assert(m > 0);
+ } else {
+ m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
+ }
+ }
+ }
+ else {
+ // In unit tests we do not want to use extra large matrices,
+ // so we reduce the block size to check the blocking strategy is not flawed
#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-// k = std::min<SizeType>(k,240);
-// n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-// m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
-
- k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
- n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
- m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
+ k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
+ n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
+ m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
#else
- k = std::min<SizeType>(k,24);
- n = std::min<SizeType>(n,384/sizeof(RhsScalar));
- m = std::min<SizeType>(m,384/sizeof(RhsScalar));
+ k = std::min<SizeType>(k,24);
+ n = std::min<SizeType>(n,384/sizeof(RhsScalar));
+ m = std::min<SizeType>(m,384/sizeof(RhsScalar));
#endif
+ }
}
template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
{
- computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+ computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
}
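The multi-threaded branch above sizes k, n and m so that one mr x k lhs sliver plus one k x nr rhs sliver fit in L1, the k x n rhs block fits in what L2 offers beyond L1, and each thread's m x k lhs block gets a private share of L3. A minimal standalone sketch of that arithmetic, assuming mr = 12, nr = 4 and 4-byte scalars; the names ceil_div and sketch_blocking are illustrative, and plain division replaces the mask-based rounding used above:

    #include <algorithm>
    #include <cstddef>

    inline std::ptrdiff_t ceil_div(std::ptrdiff_t a, std::ptrdiff_t b) { return (a + b - 1) / b; }

    void sketch_blocking(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n, int num_threads,
                         std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
    {
      const std::ptrdiff_t mr = 12, nr = 4, sizeofScalar = 4;
      // k: an mr x k lhs sliver plus a k x nr rhs sliver must fit in L1,
      // leaving room for the mr x nr block of register accumulators.
      std::ptrdiff_t k_cache = (l1 - mr*nr*sizeofScalar) / ((mr + nr)*sizeofScalar);
      if (k_cache < k) k = (k_cache/8)*8;                  // round down to a multiple of 8
      // n: the k x n rhs block must fit in what L2 offers beyond L1.
      std::ptrdiff_t n_cache = (l2 - l1) / (nr*sizeofScalar*k);
      std::ptrdiff_t n_per_thread = ceil_div(n, num_threads);
      n = (std::min)(n, ((std::min)(n_cache, n_per_thread)/nr)*nr);
      // m: each thread keeps its own m x k lhs block in its share of L3.
      if (l3 > l2) {
        std::ptrdiff_t m_cache = (l3 - l2) / (sizeofScalar*k*num_threads);
        std::ptrdiff_t m_per_thread = ceil_div(m, num_threads);
        m = (std::min)(m, ((std::min)(m_cache, m_per_thread)/mr)*mr);
      }
    }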
-#ifdef EIGEN_HAS_FUSE_CJMADD
- #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+ #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#else
// FIXME (a bit overkill maybe ?)
@@ -146,8 +184,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
}
- #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
-// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
+ #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
+// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif
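Both branches of the renamed macro compute the same fused multiply-accumulate with optional conjugation; only the instruction count differs. A scalar stand-in for the two code paths (conj_helper_sketch is hypothetical, not Eigen's internal conj_helper):

    // Scalar stand-in for the two CJMADD code paths:
    template<typename T> struct conj_helper_sketch {
      T pmul(T a, T b) const { return a * b; }            // conjugation would be applied here
      T pmadd(T a, T b, T c) const { return a * b + c; }  // the single-instruction path
    };
    // With single-instruction FMA:  CJMADD(cj,a,b,c,t) is  c = cj.pmadd(a,b,c);
    // Without it, roughly:          t = b; t = cj.pmul(a,t); c = c + t;
    // where the temporary t helps the compiler's register allocation.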
/* Vectorization logic
@@ -182,7 +220,7 @@ public:
nr = 4,
// register block size along the M direction (currently, this one cannot be modified)
-#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
// we assume 16 registers
mr = 3*LhsPacketSize,
#else
@@ -248,7 +286,7 @@ public:
// let gcc allocate the register in which to store the result of the pmul;
// otherwise (in the case where there is no FMA) gcc fails to figure out
// how to avoid spilling registers.
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
c = pmadd(a,b,c);
#else
@@ -290,7 +328,7 @@ public:
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = 4,
-#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
// we assume 16 registers
mr = 3*LhsPacketSize,
#else
@@ -353,7 +391,7 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
c.v = pmadd(a.v,b,c.v);
#else
@@ -637,7 +675,7 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
c.v = pmadd(a,b.v,c.v);
#else
@@ -667,7 +705,7 @@ protected:
* |real |cplx | no vectorization yet, would require to pack A with duplication
* |cplx |real | easy vectorization
*/
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
@@ -676,14 +714,15 @@ struct gebp_kernel
typedef typename Traits::RhsPacket RhsPacket;
typedef typename Traits::ResPacket ResPacket;
typedef typename Traits::AccPacket AccPacket;
-
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
typedef typename SwappedTraits::ResScalar SResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
typedef typename SwappedTraits::RhsPacket SRhsPacket;
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;
-
+
+ typedef typename DataMapper::LinearMapper LinearMapper;
enum {
Vectorizable = Traits::Vectorizable,
@@ -693,14 +732,16 @@ struct gebp_kernel
};
EIGEN_DONT_INLINE
- void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+ void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
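The kernel's result argument changes here from a raw pointer plus resStride to a DataMapper, and per-column access goes through its LinearMapper. A minimal column-major sketch of just the interface used in this file (a hypothetical class for illustration; the actual blas_data_mapper lives in Eigen/src/Core/util/BlasUtil.h):

    template<typename Scalar, typename Index>
    struct ColMajorMapperSketch {
      Scalar* data; Index stride;                      // pointer + leading dimension
      struct LinearMapper {
        Scalar* p;
        Scalar& operator()(Index i) const { return p[i]; }
        void prefetch(Index) const {}                  // would issue a prefetch hint
        // loadPacket(i)/storePacket(i,P) would wrap ploadu/pstoreu on p + i
      };
      Scalar& operator()(Index i, Index j) const { return data[j*stride + i]; }
      LinearMapper getLinearMapper(Index i, Index j) const { return LinearMapper{data + j*stride + i}; }
      ColMajorMapperSketch getSubMapper(Index i, Index j) const { return {data + j*stride + i, stride}; }
      // gatherPacket/scatterPacket would read/write one row, i.e. lanes `stride` apart.
    };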
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
- ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
+ ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
Traits traits;
@@ -743,15 +784,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
- ResScalar* r0 = &res[(j2+0)*resStride + i];
- ResScalar* r1 = &res[(j2+1)*resStride + i];
- ResScalar* r2 = &res[(j2+2)*resStride + i];
- ResScalar* r3 = &res[(j2+3)*resStride + i];
-
- internal::prefetch(r0);
- internal::prefetch(r1);
- internal::prefetch(r2);
- internal::prefetch(r3);
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(0);
+ r1.prefetch(0);
+ r2.prefetch(0);
+ r3.prefetch(0);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -760,31 +801,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index k=0; k<peeled_kc; k+=pk)
{
- EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 4");
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
RhsPacket B_0, T0;
LhsPacket A2;
#define EIGEN_GEBGP_ONESTEP(K) \
- internal::prefetch(blA+(3*K+16)*LhsProgress); \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, T0); \
- traits.madd(A2, B_0, C8, B_0); \
- traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
- traits.madd(A0, B_0, C1, T0); \
- traits.madd(A1, B_0, C5, T0); \
- traits.madd(A2, B_0, C9, B_0); \
- traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
- traits.madd(A0, B_0, C2, T0); \
- traits.madd(A1, B_0, C6, T0); \
- traits.madd(A2, B_0, C10, B_0); \
- traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
- traits.madd(A0, B_0, C3 , T0); \
- traits.madd(A1, B_0, C7, T0); \
- traits.madd(A2, B_0, C11, B_0)
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ internal::prefetch(blA+(3*K+16)*LhsProgress); \
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+ traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, T0); \
+ traits.madd(A1, B_0, C4, T0); \
+ traits.madd(A2, B_0, C8, B_0); \
+ traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C1, T0); \
+ traits.madd(A1, B_0, C5, T0); \
+ traits.madd(A2, B_0, C9, B_0); \
+ traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C2, T0); \
+ traits.madd(A1, B_0, C6, T0); \
+ traits.madd(A2, B_0, C10, B_0); \
+ traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C3 , T0); \
+ traits.madd(A1, B_0, C7, T0); \
+ traits.madd(A2, B_0, C11, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+ } while(false)
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
@@ -799,6 +845,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += pk*4*RhsProgress;
blA += pk*3*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
}
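The do { ... } while(false) wrapper turns each multi-statement macro into a single statement, so the new begin/end asm-comment markers stay glued to the body and the macro composes safely with unbraced if/else. The idiom in isolation:

    // Multi-statement macro made into exactly one statement:
    #define TWO_STEPS(x) do { ++(x); (x) *= 2; } while(false)

    void idiom_demo(int& v, bool cond) {
      if (cond)
        TWO_STEPS(v);   // expands to one statement, so the else still binds correctly
      else
        v = 0;
    }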
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
@@ -813,48 +861,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
-
- R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
- R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
- pstoreu(r0+0*Traits::ResPacketSize, R0);
- pstoreu(r0+1*Traits::ResPacketSize, R1);
- pstoreu(r0+2*Traits::ResPacketSize, R2);
-
- R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
- R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r1.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C1, alphav, R0);
traits.acc(C5, alphav, R1);
traits.acc(C9, alphav, R2);
- pstoreu(r1+0*Traits::ResPacketSize, R0);
- pstoreu(r1+1*Traits::ResPacketSize, R1);
- pstoreu(r1+2*Traits::ResPacketSize, R2);
-
- R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
- R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize);
+ r1.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r2.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C10, alphav, R2);
- pstoreu(r2+0*Traits::ResPacketSize, R0);
- pstoreu(r2+1*Traits::ResPacketSize, R1);
- pstoreu(r2+2*Traits::ResPacketSize, R2);
-
- R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
- R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r3.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C3, alphav, R0);
traits.acc(C7, alphav, R1);
traits.acc(C11, alphav, R2);
- pstoreu(r3+0*Traits::ResPacketSize, R0);
- pstoreu(r3+1*Traits::ResPacketSize, R1);
- pstoreu(r3+2*Traits::ResPacketSize, R2);
+ r3.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
}
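Every load/acc/store triple above performs the same update; element-wise it is res += alpha * acc over one packet's worth of a result column. A scalar sketch of one triple, with loadPacket/storePacket reduced to plain array access:

    // What r.loadPacket(o); traits.acc(C, alphav, R); r.storePacket(o, R) does per lane:
    void acc_sketch(float* res, const float* C, float alpha, int packet_size) {
      for (int p = 0; p < packet_size; ++p)
        res[p] += alpha * C[p];
    }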
-
+
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
@@ -868,7 +916,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C4);
traits.initAcc(C8);
- ResScalar* r0 = &res[(j2+0)*resStride + i];
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(0);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -876,16 +925,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index k=0; k<peeled_kc; k+=pk)
{
- EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 1");
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B_0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A2, B_0, C8, B_0)
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0); \
+ traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A2, B_0, C8, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+ } while(false)
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
@@ -898,6 +952,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += pk*RhsProgress;
blA += pk*3*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
}
// process remaining peeled loop
@@ -912,19 +968,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
- R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
- traits.acc(C8 , alphav, R2);
- pstoreu(r0+0*Traits::ResPacketSize, R0);
- pstoreu(r0+1*Traits::ResPacketSize, R1);
- pstoreu(r0+2*Traits::ResPacketSize, R2);
+ traits.acc(C8, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
}
}
}
-
+
//---------- Process 2 * LhsProgress rows at once ----------
if(mr>=2*Traits::LhsProgress)
{
@@ -946,15 +1002,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
- ResScalar* r0 = &res[(j2+0)*resStride + i];
- ResScalar* r1 = &res[(j2+1)*resStride + i];
- ResScalar* r2 = &res[(j2+2)*resStride + i];
- ResScalar* r3 = &res[(j2+3)*resStride + i];
-
- internal::prefetch(r0+prefetch_res_offset);
- internal::prefetch(r1+prefetch_res_offset);
- internal::prefetch(r2+prefetch_res_offset);
- internal::prefetch(r3+prefetch_res_offset);
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -963,21 +1019,26 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index k=0; k<peeled_kc; k+=pk)
{
- EIGEN_ASM_COMMENT("begin gegp micro kernel 2pX4");
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
RhsPacket B_0, B1, B2, B3, T0;
#define EIGEN_GEBGP_ONESTEP(K) \
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A0, B1, C1, T0); \
- traits.madd(A1, B1, C5, B1); \
- traits.madd(A0, B2, C2, T0); \
- traits.madd(A1, B2, C6, B2); \
- traits.madd(A0, B3, C3, T0); \
- traits.madd(A1, B3, C7, B3)
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+ traits.madd(A0, B_0, C0, T0); \
+ traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A0, B1, C1, T0); \
+ traits.madd(A1, B1, C5, B1); \
+ traits.madd(A0, B2, C2, T0); \
+ traits.madd(A1, B2, C6, B2); \
+ traits.madd(A0, B3, C3, T0); \
+ traits.madd(A1, B3, C7, B3); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+ } while(false)
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
@@ -992,6 +1053,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += pk*4*RhsProgress;
blA += pk*(2*Traits::LhsProgress);
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
@@ -1002,37 +1065,37 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blA += 2*Traits::LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
-
+
ResPacket R0, R1, R2, R3;
ResPacket alphav = pset1<ResPacket>(alpha);
-
- R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
- R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
- R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+ R3 = r1.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C1, alphav, R2);
traits.acc(C5, alphav, R3);
- pstoreu(r0+0*Traits::ResPacketSize, R0);
- pstoreu(r0+1*Traits::ResPacketSize, R1);
- pstoreu(r1+0*Traits::ResPacketSize, R2);
- pstoreu(r1+1*Traits::ResPacketSize, R3);
-
- R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
- R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
- R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(0 * Traits::ResPacketSize, R2);
+ r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+ R3 = r3.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C3, alphav, R2);
traits.acc(C7, alphav, R3);
- pstoreu(r2+0*Traits::ResPacketSize, R0);
- pstoreu(r2+1*Traits::ResPacketSize, R1);
- pstoreu(r3+0*Traits::ResPacketSize, R2);
- pstoreu(r3+1*Traits::ResPacketSize, R3);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(0 * Traits::ResPacketSize, R2);
+ r3.storePacket(1 * Traits::ResPacketSize, R3);
}
-
+
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
@@ -1045,8 +1108,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C0);
traits.initAcc(C4);
- ResScalar* r0 = &res[(j2+0)*resStride + i];
- internal::prefetch(r0+prefetch_res_offset);
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -1054,15 +1117,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index k=0; k<peeled_kc; k+=pk)
{
- EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
RhsPacket B_0, B1;
#define EIGEN_GEBGP_ONESTEP(K) \
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B1); \
- traits.madd(A1, B_0, C4, B_0)
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B1); \
+ traits.madd(A1, B_0, C4, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+ } while(false)
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
@@ -1075,6 +1143,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += pk*RhsProgress;
blA += pk*2*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
}
// process remaining peeled loop
@@ -1089,12 +1159,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
- pstoreu(r0+0*Traits::ResPacketSize, R0);
- pstoreu(r0+1*Traits::ResPacketSize, R1);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
}
}
}
@@ -1120,15 +1190,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C2);
traits.initAcc(C3);
- ResScalar* r0 = &res[(j2+0)*resStride + i];
- ResScalar* r1 = &res[(j2+1)*resStride + i];
- ResScalar* r2 = &res[(j2+2)*resStride + i];
- ResScalar* r3 = &res[(j2+3)*resStride + i];
-
- internal::prefetch(r0+prefetch_res_offset);
- internal::prefetch(r1+prefetch_res_offset);
- internal::prefetch(r2+prefetch_res_offset);
- internal::prefetch(r3+prefetch_res_offset);
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -1137,16 +1207,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index k=0; k<peeled_kc; k+=pk)
{
- EIGEN_ASM_COMMENT("begin gegp micro kernel 1pX4");
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
RhsPacket B_0, B1, B2, B3;
#define EIGEN_GEBGP_ONESTEP(K) \
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
- traits.madd(A0, B_0, C0, B_0); \
- traits.madd(A0, B1, C1, B1); \
- traits.madd(A0, B2, C2, B2); \
- traits.madd(A0, B3, C3, B3);
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+ traits.madd(A0, B_0, C0, B_0); \
+ traits.madd(A0, B1, C1, B1); \
+ traits.madd(A0, B2, C2, B2); \
+ traits.madd(A0, B3, C3, B3); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
+ } while(false)
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
@@ -1161,6 +1236,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += pk*4*RhsProgress;
blA += pk*1*LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
@@ -1171,25 +1248,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blA += 1*LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
-
+
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
-
- R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r1.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
- pstoreu(r0+0*Traits::ResPacketSize, R0);
- pstoreu(r1+0*Traits::ResPacketSize, R1);
-
- R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
- R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r3.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
- pstoreu(r2+0*Traits::ResPacketSize, R0);
- pstoreu(r3+0*Traits::ResPacketSize, R1);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(0 * Traits::ResPacketSize, R1);
}
-
+
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
@@ -1201,7 +1278,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
AccPacket C0;
traits.initAcc(C0);
- ResScalar* r0 = &res[(j2+0)*resStride + i];
+ LinearMapper r0 = res.getLinearMapper(i, j2);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -1209,14 +1286,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index k=0; k<peeled_kc; k+=pk)
{
- EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B_0); \
-
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+ } while(false);
+
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
@@ -1228,6 +1310,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += pk*RhsProgress;
blA += pk*1*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
}
// process remaining peeled loop
@@ -1241,9 +1325,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
- pstoreu(r0+0*Traits::ResPacketSize, R0);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
}
}
}
@@ -1259,7 +1343,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-
+
if( (SwappedTraits::LhsProgress % 4)==0 )
{
// NOTE: The following piece of code won't work for 512-bit registers
@@ -1268,32 +1352,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
straits.initAcc(C1);
straits.initAcc(C2);
straits.initAcc(C3);
-
+
const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
const Index endk = (depth/spk)*spk;
const Index endk4 = (depth/(spk*4))*(spk*4);
-
+
Index k=0;
for(; k<endk4; k+=4*spk)
{
SLhsPacket A0,A1;
SRhsPacket B_0,B_1;
-
+
straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
-
+
straits.loadRhsQuad(blA+0*spk, B_0);
straits.loadRhsQuad(blA+1*spk, B_1);
straits.madd(A0,B_0,C0,B_0);
straits.madd(A1,B_1,C1,B_1);
-
+
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
straits.loadRhsQuad(blA+2*spk, B_0);
straits.loadRhsQuad(blA+3*spk, B_1);
straits.madd(A0,B_0,C2,B_0);
straits.madd(A1,B_1,C3,B_1);
-
+
blB += 4*SwappedTraits::LhsProgress;
blA += 4*spk;
}
@@ -1302,11 +1386,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
SLhsPacket A0;
SRhsPacket B_0;
-
+
straits.loadLhsUnaligned(blB, A0);
straits.loadRhsQuad(blA, B_0);
straits.madd(A0,B_0,C0,B_0);
-
+
blB += SwappedTraits::LhsProgress;
blA += spk;
}
@@ -1317,10 +1401,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
-
- SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride);
+
+ SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
-
+
if(depth-endk>0)
{
// We have to handle the last row of the rhs which corresponds to a half-packet
@@ -1336,14 +1420,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
straits.acc(predux4(C0), alphav, R);
}
- pscatter(&res[j2*resStride + i], R, resStride);
+ res.scatterPacket(i, j2, R);
}
else
{
- SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
+ SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
SResPacket alphav = pset1<SResPacket>(alpha);
straits.acc(C0, alphav, R);
- pscatter(&res[j2*resStride + i], R, resStride);
+ res.scatterPacket(i, j2, R);
}
}
else // scalar path
@@ -1355,25 +1439,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
LhsScalar A0;
RhsScalar B_0, B_1;
-
+
A0 = blA[k];
-
+
B_0 = blB[0];
B_1 = blB[1];
- MADD(cj,A0,B_0,C0, B_0);
- MADD(cj,A0,B_1,C1, B_1);
+ CJMADD(cj,A0,B_0,C0, B_0);
+ CJMADD(cj,A0,B_1,C1, B_1);
B_0 = blB[2];
B_1 = blB[3];
- MADD(cj,A0,B_0,C2, B_0);
- MADD(cj,A0,B_1,C3, B_1);
+ CJMADD(cj,A0,B_0,C2, B_0);
+ CJMADD(cj,A0,B_1,C3, B_1);
blB += 4;
}
- res[(j2+0)*resStride + i] += alpha*C0;
- res[(j2+1)*resStride + i] += alpha*C1;
- res[(j2+2)*resStride + i] += alpha*C2;
- res[(j2+3)*resStride + i] += alpha*C3;
+ res(i, j2 + 0) += alpha * C0;
+ res(i, j2 + 1) += alpha * C1;
+ res(i, j2 + 2) += alpha * C2;
+ res(i, j2 + 3) += alpha * C3;
}
}
}
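In this remainder path the result packet spans one row across several columns, so consecutive lanes sit resStride elements apart; gatherPacket/scatterPacket on the mapper replace the old pgather/pscatter calls on raw pointers. A scalar sketch of the strided access pattern (function names are illustrative):

    // Strided gather/scatter over a column-major result block:
    void gather_sketch(const float* res, float* packet, int lanes, long stride) {
      for (int p = 0; p < lanes; ++p) packet[p] = res[p * stride];   // read one row segment
    }
    void scatter_sketch(float* res, const float* packet, int lanes, long stride) {
      for (int p = 0; p < lanes; ++p) res[p * stride] = packet[p];   // write it back
    }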
@@ -1392,9 +1476,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
LhsScalar A0 = blA[k];
RhsScalar B_0 = blB[k];
- MADD(cj, A0, B_0, C0, B_0);
+ CJMADD(cj, A0, B_0, C0, B_0);
}
- res[(j2+0)*resStride + i] += alpha*C0;
+ res(i, j2) += alpha * C0;
}
}
}
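The scalar tail above is the whole kernel stripped of vectorization: one dot product per remaining (i, j2) entry, folded into the result with the usual alpha scaling. As a sketch:

    // One remaining result entry: res(i,j2) += alpha * dot(blA, blB).
    void gebp_scalar_tail(float& res_ij, const float* blA, const float* blB,
                          long depth, float alpha) {
      float C0 = 0;
      for (long k = 0; k < depth; ++k)
        C0 += blA[k] * blB[k];       // CJMADD with no conjugation
      res_ij += alpha * C0;
    }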
@@ -1417,15 +1501,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
//
// 32 33 34 35 ...
// 36 37 38 39 ...
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
- EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
- ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
@@ -1436,30 +1521,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
- const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride);
Index count = 0;
-
+
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
: Pack2>1 ? (rows/Pack2)*Pack2 : 0;
-
+
Index i=0;
-
+
// Pack 3 packets
if(Pack1>=3*PacketSize)
{
for(; i<peeled_mc3; i+=3*PacketSize)
{
if(PanelMode) count += (3*PacketSize) * offset;
-
+
for(Index k=0; k<depth; k++)
{
Packet A, B, C;
- A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
- B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
- C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ B = lhs.loadPacket(i+1*PacketSize, k);
+ C = lhs.loadPacket(i+2*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1473,12 +1557,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(; i<peeled_mc2; i+=2*PacketSize)
{
if(PanelMode) count += (2*PacketSize) * offset;
-
+
for(Index k=0; k<depth; k++)
{
Packet A, B;
- A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
- B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ B = lhs.loadPacket(i+1*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
}
@@ -1491,11 +1575,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(; i<peeled_mc1; i+=1*PacketSize)
{
if(PanelMode) count += (1*PacketSize) * offset;
-
+
for(Index k=0; k<depth; k++)
{
Packet A;
- A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
+ A = lhs.loadPacket(i+0*PacketSize, k);
pstore(blockA+count, cj.pconj(A));
count+=PacketSize;
}
@@ -1508,11 +1592,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(; i<peeled_mc0; i+=Pack2)
{
if(PanelMode) count += Pack2 * offset;
-
+
for(Index k=0; k<depth; k++)
for(Index w=0; w<Pack2; w++)
blockA[count++] = cj(lhs(i+w, k));
-
+
if(PanelMode) count += Pack2 * (stride-offset-depth);
}
}
@@ -1525,15 +1609,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
}
}
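The ColMajor packing above linearizes the lhs into slivers 3, 2 and 1 packets wide, then Pack2 rows wide, writing all depth columns of a sliver contiguously so the kernel can stream them. A scalar reference sketch of one sliver width, with panel mode and conjugation omitted and names chosen for illustration:

    // Pack a rows x depth column-major lhs into w-row slivers.
    void pack_lhs_sketch(float* blockA, const float* lhs, long lhsStride,
                         long depth, long rows, long w) {
      long count = 0;
      for (long i = 0; i + w <= rows; i += w)          // full slivers only
        for (long k = 0; k < depth; ++k)
          for (long r = 0; r < w; ++r)
            blockA[count++] = lhs[k*lhsStride + i + r];
    }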
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
{
- EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
- ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
@@ -1543,13 +1628,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
EIGEN_UNUSED_VARIABLE(offset);
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
- const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride);
Index count = 0;
-
+
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
+
int pack = Pack1;
Index i = 0;
while(pack>0)
@@ -1569,7 +1653,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
for (Index m = 0; m < pack; m += PacketSize)
{
PacketBlock<Packet> kernel;
- for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
+ for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
ptranspose(kernel);
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
}
@@ -1594,15 +1678,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
for(;w<pack;++w)
blockA[count++] = cj(lhs(i+w, k));
}
-
+
if(PanelMode) count += pack * (stride-offset-depth);
}
-
+
pack -= PacketSize;
if(pack<Pack2 && (pack+PacketSize)!=Pack2)
pack = Pack2;
}
-
+
for(; i<rows; i++)
{
if(PanelMode) count += offset;
@@ -1619,17 +1703,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
// 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . .
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = packet_traits<Scalar>::size };
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
- ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
EIGEN_UNUSED_VARIABLE(stride);
@@ -1685,27 +1770,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
// if(PanelMode) count += 8 * (stride-offset-depth);
// }
// }
-
+
if(nr>=4)
{
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
{
// skip what we have before
if(PanelMode) count += 4 * offset;
- const Scalar* b0 = &rhs[(j2+0)*rhsStride];
- const Scalar* b1 = &rhs[(j2+1)*rhsStride];
- const Scalar* b2 = &rhs[(j2+2)*rhsStride];
- const Scalar* b3 = &rhs[(j2+3)*rhsStride];
-
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
Index k=0;
if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
{
for(; k<peeled_k; k+=PacketSize) {
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
- kernel.packet[0] = ploadu<Packet>(&b0[k]);
- kernel.packet[1] = ploadu<Packet>(&b1[k]);
- kernel.packet[2] = ploadu<Packet>(&b2[k]);
- kernel.packet[3] = ploadu<Packet>(&b3[k]);
+ kernel.packet[0] = dm0.loadPacket(k);
+ kernel.packet[1] = dm1.loadPacket(k);
+ kernel.packet[2] = dm2.loadPacket(k);
+ kernel.packet[3] = dm3.loadPacket(k);
ptranspose(kernel);
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
@@ -1716,10 +1801,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
}
for(; k<depth; k++)
{
- blockB[count+0] = cj(b0[k]);
- blockB[count+1] = cj(b1[k]);
- blockB[count+2] = cj(b2[k]);
- blockB[count+3] = cj(b3[k]);
+ blockB[count+0] = cj(dm0(k));
+ blockB[count+1] = cj(dm1(k));
+ blockB[count+2] = cj(dm2(k));
+ blockB[count+3] = cj(dm3(k));
count += 4;
}
// skip what we have after
@@ -1731,10 +1816,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
for(Index j2=packet_cols4; j2<cols; ++j2)
{
if(PanelMode) count += offset;
- const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
for(Index k=0; k<depth; k++)
{
- blockB[count] = cj(b0[k]);
+ blockB[count] = cj(dm0(k));
count += 1;
}
if(PanelMode) count += (stride-offset-depth);
@@ -1742,17 +1827,18 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
}
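For the nr==4 path, the ColMajor rhs packing interleaves four columns so that every group of four packed scalars is one row of the panel; the vectorized branch does this with a PacketSize x 4 in-register transpose (ptranspose). The interleaving in scalar form, with panel mode and conjugation omitted:

    // Interleave 4 columns of a depth x cols column-major rhs,
    // as the dm0..dm3 tail loop above does scalar-by-scalar.
    void pack_rhs4_sketch(float* blockB, const float* rhs, long rhsStride,
                          long depth, long j2) {
      const float* b0 = rhs + (j2+0)*rhsStride;
      const float* b1 = rhs + (j2+1)*rhsStride;
      const float* b2 = rhs + (j2+2)*rhsStride;
      const float* b3 = rhs + (j2+3)*rhsStride;
      long count = 0;
      for (long k = 0; k < depth; ++k) {
        blockB[count++] = b0[k]; blockB[count++] = b1[k];
        blockB[count++] = b2[k]; blockB[count++] = b3[k];
      }
    }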
// this version is optimized for row major matrices
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = packet_traits<Scalar>::size };
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
- ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
EIGEN_UNUSED_VARIABLE(stride);
@@ -1762,7 +1848,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
Index count = 0;
-
+
// if(nr>=8)
// {
// for(Index j2=0; j2<packet_cols8; j2+=8)
@@ -1805,15 +1891,15 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
for(Index k=0; k<depth; k++)
{
if (PacketSize==4) {
- Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+ Packet A = rhs.loadPacket(k, j2);
pstoreu(blockB+count, cj.pconj(A));
count += PacketSize;
} else {
- const Scalar* b0 = &rhs[k*rhsStride + j2];
- blockB[count+0] = cj(b0[0]);
- blockB[count+1] = cj(b0[1]);
- blockB[count+2] = cj(b0[2]);
- blockB[count+3] = cj(b0[3]);
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+ blockB[count+0] = cj(dm0(0));
+ blockB[count+1] = cj(dm0(1));
+ blockB[count+2] = cj(dm0(2));
+ blockB[count+3] = cj(dm0(3));
count += 4;
}
}
@@ -1825,10 +1911,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
for(Index j2=packet_cols4; j2<cols; ++j2)
{
if(PanelMode) count += offset;
- const Scalar* b0 = &rhs[j2];
for(Index k=0; k<depth; k++)
{
- blockB[count] = cj(b0[k*rhsStride]);
+ blockB[count] = cj(rhs(k, j2));
count += 1;
}
if(PanelMode) count += stride-offset-depth;
@@ -1841,8 +1926,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
* \sa setCpuCacheSize */
inline std::ptrdiff_t l1CacheSize()
{
- std::ptrdiff_t l1, l2;
- internal::manage_caching_sizes(GetAction, &l1, &l2);
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
return l1;
}
@@ -1850,8 +1935,8 @@ inline std::ptrdiff_t l1CacheSize()
* \sa setCpuCacheSize */
inline std::ptrdiff_t l2CacheSize()
{
- std::ptrdiff_t l1, l2;
- internal::manage_caching_sizes(GetAction, &l1, &l2);
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
return l2;
}
@@ -1860,9 +1945,9 @@ inline std::ptrdiff_t l2CacheSize()
* for the algorithms working per blocks.
*
* \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
- internal::manage_caching_sizes(SetAction, &l1, &l2);
+ internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}
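With the L3 parameter added, the user-facing API carries all three cache levels. A short usage sketch; the byte values are arbitrary examples:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      // Override the detected cache sizes: L1, L2, L3, in bytes.
      Eigen::setCpuCacheSizes(32*1024, 256*1024, 8*1024*1024);
      std::cout << "L1 = " << Eigen::l1CacheSize()
                << ", L2 = " << Eigen::l2CacheSize() << "\n";
      return 0;
    }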
} // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index b7e1867f0..fd9443cd2 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride,
- ResScalar* res, Index resStride,
+ ResScalar* _res, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
- const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
- const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
- gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
- gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
- gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
@@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing B'.
- pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
+ pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
// Pack A_k to A' in a parallel fashion:
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
@@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth,
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
info[tid].users += threads;
-
- pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
+
+ pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
// Notify the other threads that the part A'_i is ready to go.
info[tid].sync = k;
@@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
// At this point we have to make sure that A'_i has been updated by thread i;
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
- if(shift>0)
- while(info[i].sync!=k) {}
- gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+ if (shift>0) {
+ while(info[i].sync!=k) {
+ }
+ }
+
+ gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
}
// Then keep going as usual with the remaining B'
@@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
const Index actual_nc = (std::min)(j+nc,cols)-j;
// pack B_k,j to B'
- pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
+ pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
// C_j += A' * B'
- gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
+ gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
}
// Release all the sub blocks A'_i of A' for the current thread,
@@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth,
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
- for(Index k2=0; k2<depth; k2+=kc)
+ for(Index i2=0; i2<rows; i2+=mc)
{
- const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+ const Index actual_mc = (std::min)(i2+mc,rows)-i2;
- // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
- // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
- // Note that this panel will be read as many times as the number of blocks in the rhs's
- // horizontal panel which is, in practice, a very low number.
- pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
-
- // For each kc x nc block of the rhs's horizontal panel...
- for(Index j2=0; j2<cols; j2+=nc)
+ for(Index k2=0; k2<depth; k2+=kc)
{
- const Index actual_nc = (std::min)(j2+nc,cols)-j2;
-
- // We pack the rhs's block into a sequential chunk of memory (L2 caching)
- // Note that this block will be read a very high number of times, which is equal to the number of
- // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
- pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
-
- // Everything is packed, we can now call the panel * block kernel:
- gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
+ const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+ // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+ // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+ // Note that this panel will be read as many times as the number of blocks in the rhs's
+ // horizontal panel which is, in practice, a very low number.
+ pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+ // For each kc x nc block of the rhs's horizontal panel...
+ for(Index j2=0; j2<cols; j2+=nc)
+ {
+ const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+ // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+ // Note that this block will be read a very high number of times, which is equal to the number of
+ // micro horizontal panels of the large rhs's panel (e.g., rows/12 times).
+ pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+
+ // Everything is packed, we can now call the panel * block kernel:
+ gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+ }
}
}
}
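The OpenMP path earlier in this function relies on a small handshake: each thread packs its slice A'_i, publishes progress through info[tid].sync, and spin-waits on the other slices before consuming them, while info[tid].users counts outstanding readers so a slice is not repacked while still in use. A minimal sketch of that protocol; the field names follow the code above, but the std::atomic types are an assumption of this sketch, the code above uses plain spin loops:

    #include <atomic>

    struct ParallelInfoSketch {
      std::atomic<long> sync{-1};   // last k-step whose packed slice A'_i is ready
      std::atomic<int>  users{0};   // threads still reading this slice
    };

    void pack_and_publish(ParallelInfoSketch& me, long k, int threads) {
      while (me.users.load() != 0) {}   // wait until previous readers are done
      me.users += threads;              // register every upcoming reader
      /* ... pack this thread's slice A'_i for step k ... */
      me.sync.store(k);                 // announce that the slice is ready
    }

    void wait_for_slice(const ParallelInfoSketch& other, long k) {
      while (other.sync.load() != k) {} // spin until the slice for step k is ready
    }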
@@ -287,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
public:
- gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
+ gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
{
this->m_mc = ActualRows;
this->m_nc = ActualCols;
@@ -319,23 +331,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
public:
- gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
+ gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking)
{
this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols;
this->m_kc = depth;
- if(full_rows)
+ if(l3_blocking)
{
- DenseIndex m = this->m_mc;
- computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
}
- else // full columns
+ else // no l3 blocking
{
+ DenseIndex m = this->m_mc;
DenseIndex n = this->m_nc;
- computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
}
-
+
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
}
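Downstream code constructs the blocking with the thread count and the l3_blocking flag, then reads kc/mc/nc. A sketch of the underlying call (computeProductBlockingSizes is the internal routine updated above; the float scalars and the wrapper function are arbitrary choices for illustration):

    #include <Eigen/Core>

    // Choose cache-friendly block sizes for a single-threaded float GEMM.
    void blocking_usage_sketch(std::ptrdiff_t rows, std::ptrdiff_t cols, std::ptrdiff_t depth) {
      std::ptrdiff_t kc = depth, mc = rows, nc = cols;   // start from the problem sizes
      Eigen::internal::computeProductBlockingSizes<float, float>(kc, mc, nc, /*num_threads=*/1);
      // kc, mc and nc now hold the block sizes the product kernels will use.
    }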
@@ -445,8 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
- BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
-
+ BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
}
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 7db3e3d38..e55994900 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -58,27 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
- const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+ const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
{
- const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
- const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction
- computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
+ computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
// !!! mc must be a multiple of nr:
if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
-
- gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
- gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
- gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
@@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// note that the actual rhs is the transpose/adjoint of mat
- pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
for(Index i2=0; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
- pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
// the selected actual_mc * size panel of res is split into three different parts:
// 1 - before the diagonal => processed with gebp or skipped
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower)
- gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
- -1, -1, 0, 0);
+ gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
+ (std::min)(size,i2), alpha, -1, -1, 0, 0);
+
- sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+ sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
{
Index j2 = i2+actual_mc;
- gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
- -1, -1, 0, 0);
+ gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
+ actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
}
}
}
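
The three-part split described in the comments above can be pictured with a small scalar sketch (Lower case; update is any per-coefficient accumulation standing in for the packed gebp/sybb kernels, and all names are illustrative):

#include <algorithm>

// For the row panel [i2, i2+actual_mc) of a size x size lower-triangular
// result: full update left of the diagonal block, triangular update on the
// diagonal square, nothing to the right of it.
template <class Update>
void lower_panel_update(int i2, int actual_mc, int size, Update update)
{
  for (int j = 0; j < std::min(size, i2); ++j)   // 1 - before the diagonal
    for (int i = 0; i < actual_mc; ++i)
      update(i2 + i, j);
  for (int j = i2; j < i2 + actual_mc; ++j)      // 2 - diagonal block (sybb)
    for (int i = j; i < i2 + actual_mc; ++i)
      update(i, j);
  // 3 - columns >= i2 + actual_mc: skipped for UpLo == Lower
}
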
@@ -129,13 +134,16 @@ struct tribb_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
typedef typename Traits::ResScalar ResScalar;
-
+
enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
};
- void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+ void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
- gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+ typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
+ ResMapper res(_res, resStride);
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
// let's process the block per panel of actual_mc x BlockSize,
@@ -146,7 +154,7 @@ struct tribb_kernel
const RhsScalar* actual_b = blockB+j*depth;
if(UpLo==Upper)
- gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
+ gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// selfadjoint micro block
@@ -154,12 +162,12 @@ struct tribb_kernel
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
- gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+ gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
- ResScalar* r = res + (j+j1)*resStride + i;
+ ResScalar* r = &res(i, j + j1);
for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1);
@@ -169,8 +177,8 @@ struct tribb_kernel
if(UpLo==Lower)
{
Index i = j+actualBlockSize;
- gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
- -1, -1, 0, 0);
+ gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
+ depth, actualBlockSize, alpha, -1, -1, 0, 0);
}
}
}
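
The buffer-then-accumulate step above is the heart of the tribb kernel; a scalar sketch of the Lower accumulation, mirroring the j1/i1 loop (buffer is the blockSize x blockSize column-major temporary filled by the dense kernel):

// Accumulate only the lower half of the dense temporary into res at (i, j).
void accumulate_lower(double* res, int resStride,
                      const double* buffer, int blockSize,
                      int actualBlockSize, int i, int j)
{
  for (int j1 = 0; j1 < actualBlockSize; ++j1) {
    double* r = res + (j + j1) * resStride + i;   // &res(i, j + j1)
    for (int i1 = j1; i1 < actualBlockSize; ++i1) // Lower: i1 >= j1
      r[i1] += buffer[j1 * blockSize + i1];       // buffer(i1, j1)
  }
}
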
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 340c51394..7df6a6b1a 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H
-namespace Eigen {
+namespace Eigen {
namespace internal {
@@ -48,17 +48,17 @@ namespace internal {
* // we currently fall back to the NoneAligned case
*
* The same reasoning applies for the transposed case.
- *
+ *
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow
* compared to unaligned loads on a 4 byte boundary.
*
*/
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
enum {
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
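
The alignment cases discussed in the comment block above follow from simple modular arithmetic on the column stride; a sketch of the classification (packetSize in scalars, mirroring the alignmentStep expression used later in this file):

#include <cstddef>

enum AlignmentPattern { AllAligned, EvenAligned, FirstAligned, NoneAligned };

AlignmentPattern classify(std::ptrdiff_t lhsStride, std::ptrdiff_t packetSize)
{
  const std::ptrdiff_t mask = packetSize - 1;
  const std::ptrdiff_t alignmentStep =
      packetSize > 1 ? (packetSize - lhsStride % packetSize) & mask : 0;
  if (alignmentStep == 0)              return AllAligned;  // every column aligned
  if (alignmentStep == packetSize / 2) return EvenAligned; // half-packet boundary
  return FirstAligned;                 // realigned in the kernel via palign<>
}
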
@@ -78,17 +78,17 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
- const LhsScalar* lhs, Index lhsStride,
- const RhsScalar* rhs, Index rhsIncr,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
ResScalar* res, Index resIncr,
RhsScalar alpha);
};
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
- const LhsScalar* lhs, Index lhsStride,
- const RhsScalar* rhs, Index rhsIncr,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
ResScalar* res, Index resIncr,
RhsScalar alpha)
{
@@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
- #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
+ #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
pstore(&res[j], \
padd(pload<ResPacket>(&res[j]), \
padd( \
- padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
- pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
- padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
- pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
+ padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
+ pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
+ padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
+ pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
+
+ typedef typename LhsMapper::VectorMapper LhsScalars;
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
@@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
const Index ResPacketAlignedMask = ResPacketSize-1;
// const Index PeelAlignedMask = ResPacketSize*peels-1;
const Index size = rows;
-
+
+ const Index lhsStride = lhs.stride();
+
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type.
Index alignedStart = internal::first_aligned(res,size);
@@ -131,15 +135,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
- const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
+ const Index lhsAlignmentOffset = lhs.firstAligned(size);
// find how many columns we have to skip to be aligned with the result (if possible)
Index skipColumns = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
- if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
+ if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
{
alignedSize = 0;
alignedStart = 0;
+ alignmentPattern = NoneAligned;
}
else if(LhsPacketSize > 4)
{
@@ -149,7 +154,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
}
else if (LhsPacketSize>1)
{
- eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+ // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
while (skipColumns<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
@@ -166,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
// note that the skipped columns are processed later.
}
- eigen_internal_assert( (alignmentPattern==NoneAligned)
+ /* eigen_internal_assert( (alignmentPattern==NoneAligned)
|| (skipColumns + columnsAtOnce >= cols)
|| LhsPacketSize > size
- || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
+ || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
}
else if(Vectorizable)
{
@@ -178,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
alignmentPattern = AllAligned;
}
- Index offset1 = (FirstAligned && alignmentStep==1?3:1);
- Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+ const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+ const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
{
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
- ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
- ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
- ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
+ RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
+ ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
+ ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
+ ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
// this helps a lot in generating better binary code
- const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
- *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+ const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
+ lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
if (Vectorizable)
{
@@ -199,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
// process initial unaligned coeffs
for (Index j=0; j<alignedStart; ++j)
{
- res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
- res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
- res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
- res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+ res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+ res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+ res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+ res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
}
if (alignedSize>alignedStart)
@@ -211,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
break;
case FirstAligned:
{
@@ -225,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
ResPacket T0, T1;
- A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
- A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
- A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+ A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+ A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+ A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
for (; j<peeledSize; j+=peels*ResPacketSize)
{
- A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
- A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
- A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
+ A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
+ A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
+ A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
- A00 = pload<LhsPacket>(&lhs0[j]);
- A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
+ A00 = lhs0.template load<LhsPacket, Aligned>(j);
+ A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
T0 = pcj.pmadd(A01, ptmp1, T0);
- A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
+ A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
T0 = pcj.pmadd(A02, ptmp2, T0);
- A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
+ A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
T0 = pcj.pmadd(A03, ptmp3, T0);
pstore(&res[j],T0);
- A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
+ A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
T1 = pcj.pmadd(A11, ptmp1, T1);
T1 = pcj.pmadd(A12, ptmp2, T1);
T1 = pcj.pmadd(A13, ptmp3, T1);
@@ -254,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
}
}
for (; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+ _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
break;
}
}
@@ -268,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
/* process remaining coeffs (or all if there is no explicit vectorization) */
for (Index j=alignedSize; j<size; ++j)
{
- res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
- res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
- res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
- res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+ res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+ res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+ res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+ res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
}
}
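
Stripped of vectorization, peeling and alignment handling, the column-major kernel above computes the following (a scalar sketch; columnsAtOnce fixed to 4 as in the code, the offset reordering ignored):

// res += alpha * lhs * rhs, lhs column-major rows x cols with column stride
// lhsStride; four columns are processed per pass, each broadcast coefficient
// of rhs scaling one lhs column into the sequentially-traversed res.
void gemv_col_major(int rows, int cols, const double* lhs, int lhsStride,
                    const double* rhs, double* res, double alpha)
{
  int i = 0;
  for (; i + 4 <= cols; i += 4) {
    const double c0 = alpha * rhs[i],     c1 = alpha * rhs[i + 1];
    const double c2 = alpha * rhs[i + 2], c3 = alpha * rhs[i + 3];
    const double *l0 = lhs + (i + 0) * lhsStride, *l1 = lhs + (i + 1) * lhsStride;
    const double *l2 = lhs + (i + 2) * lhsStride, *l3 = lhs + (i + 3) * lhsStride;
    for (int j = 0; j < rows; ++j)
      res[j] += c0 * l0[j] + c1 * l1[j] + c2 * l2[j] + c3 * l3[j];
  }
  for (; i < cols; ++i) {                // remaining columns, one at a time
    const double c = alpha * rhs[i];
    const double* l = lhs + i * lhsStride;
    for (int j = 0; j < rows; ++j)
      res[j] += c * l[j];
  }
}
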
@@ -282,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
{
for (Index k=start; k<end; ++k)
{
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
- const LhsScalar* lhs0 = lhs + k*lhsStride;
+ RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
+ const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
if (Vectorizable)
{
/* explicit vectorization */
// process first unaligned result's coeffs
for (Index j=0; j<alignedStart; ++j)
- res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
+ res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
// process aligned result's coeffs
- if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+ if (lhs0.template aligned<LhsPacket>(alignedStart))
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
- pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+ pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
else
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
- pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+ pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
}
// process remaining scalars (or all if no explicit vectorization)
for (Index i=alignedSize; i<size; ++i)
- res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
+ res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
}
if (skipColumns)
{
@@ -326,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
* - alpha is always a complex (or converted to a complex)
* - no vectorization
*/
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
@@ -346,70 +351,75 @@ typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-
+
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
- const LhsScalar* lhs, Index lhsStride,
- const RhsScalar* rhs, Index rhsIncr,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
ResScalar* res, Index resIncr,
ResScalar alpha);
};
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
- const LhsScalar* lhs, Index lhsStride,
- const RhsScalar* rhs, Index rhsIncr,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
ResScalar* res, Index resIncr,
ResScalar alpha)
{
- EIGEN_UNUSED_VARIABLE(rhsIncr);
- eigen_internal_assert(rhsIncr==1);
-
+ eigen_internal_assert(rhs.stride()==1);
+
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
- #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
- RhsPacket b = pload<RhsPacket>(&rhs[j]); \
- ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
- ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
- ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
- ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
+ #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
+ RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
+ ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
+ ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
+ ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+ typedef typename LhsMapper::VectorMapper LhsScalars;
+
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
const Index rowsAtOnce = 4;
const Index peels = 2;
const Index RhsPacketAlignedMask = RhsPacketSize-1;
const Index LhsPacketAlignedMask = LhsPacketSize-1;
-// const Index PeelAlignedMask = RhsPacketSize*peels-1;
const Index depth = cols;
+ const Index lhsStride = lhs.stride();
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type
// if that's not the case then vectorization is discarded, see below.
- Index alignedStart = internal::first_aligned(rhs, depth);
+ Index alignedStart = rhs.firstAligned(depth);
Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
Index alignmentPattern = alignmentStep==0 ? AllAligned
- : alignmentStep==(LhsPacketSize/2) ? EvenAligned
- : FirstAligned;
+ : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+ : FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
- const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
+ const Index lhsAlignmentOffset = lhs.firstAligned(depth);
+ const Index rhsAlignmentOffset = rhs.firstAligned(rows);
// find how many rows we have to skip to be aligned with rhs (if possible)
Index skipRows = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
- if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
+ if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
+ (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
+ (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
{
alignedSize = 0;
alignedStart = 0;
+ alignmentPattern = NoneAligned;
}
else if(LhsPacketSize > 4)
{
@@ -418,7 +428,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
}
else if (LhsPacketSize>1)
{
- eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
+ // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
while (skipRows<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
@@ -434,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
skipRows = (std::min)(skipRows,Index(rows));
// note that the skipped rows are processed later.
}
- eigen_internal_assert( alignmentPattern==NoneAligned
+ /* eigen_internal_assert( alignmentPattern==NoneAligned
|| LhsPacketSize==1
|| (skipRows + rowsAtOnce >= rows)
|| LhsPacketSize > depth
- || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
+ || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
}
else if(Vectorizable)
{
@@ -447,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
alignmentPattern = AllAligned;
}
- Index offset1 = (FirstAligned && alignmentStep==1?3:1);
- Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+ const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+ const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
@@ -457,8 +467,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generate good binary code
- const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
- *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+ const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
+ lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
if (Vectorizable)
{
@@ -470,9 +480,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
{
- RhsScalar b = rhs[j];
- tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
- tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+ RhsScalar b = rhs(j, 0);
+ tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+ tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
}
if (alignedSize>alignedStart)
@@ -481,11 +491,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
break;
case FirstAligned:
{
@@ -499,39 +509,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
* than basic unaligned loads.
*/
LhsPacket A01, A02, A03, A11, A12, A13;
- A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
- A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
- A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+ A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+ A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+ A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
for (; j<peeledSize; j+=peels*RhsPacketSize)
{
- RhsPacket b = pload<RhsPacket>(&rhs[j]);
- A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
- A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
- A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
+ RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
+ A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
+ A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
+ A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
- ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
ptmp1 = pcj.pmadd(A01, b, ptmp1);
- A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
+ A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
ptmp2 = pcj.pmadd(A02, b, ptmp2);
- A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
+ A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
ptmp3 = pcj.pmadd(A03, b, ptmp3);
- A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
+ A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
- b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
- ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
+ b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
ptmp1 = pcj.pmadd(A11, b, ptmp1);
ptmp2 = pcj.pmadd(A12, b, ptmp2);
ptmp3 = pcj.pmadd(A13, b, ptmp3);
}
}
for (; j<alignedSize; j+=RhsPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+ _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
break;
}
tmp0 += predux(ptmp0);
@@ -545,9 +555,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
{
- RhsScalar b = rhs[j];
- tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
- tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+ RhsScalar b = rhs(j, 0);
+ tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+ tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
}
res[i*resIncr] += alpha*tmp0;
res[(i+offset1)*resIncr] += alpha*tmp1;
@@ -564,28 +574,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
- const LhsScalar* lhs0 = lhs + i*lhsStride;
+ const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
// process first unaligned result's coeffs
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
- tmp0 += cj.pmul(lhs0[j], rhs[j]);
+ tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
if (alignedSize>alignedStart)
{
// process aligned rhs coeffs
- if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+ if (lhs0.template aligned<LhsPacket>(alignedStart))
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
- ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
else
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
- ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
tmp0 += predux(ptmp0);
}
// process remaining scalars
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
- tmp0 += cj.pmul(lhs0[j], rhs[j]);
+ tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
res[i*resIncr] += alpha*tmp0;
}
if (skipRows)
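
The row-major variant is the dual picture: dot products along rows instead of scaled column accumulations. A scalar sketch with the same caveats (rowsAtOnce fixed to 4, alignment and peeling omitted):

// res[i*resIncr] += alpha * dot(lhs row i, rhs); lhs is row-major with row
// stride lhsStride. Four row accumulators per pass mimic ptmp0..ptmp3, whose
// packet versions are reduced with predux in the real kernel.
void gemv_row_major(int rows, int cols, const double* lhs, int lhsStride,
                    const double* rhs, double* res, int resIncr, double alpha)
{
  int i = 0;
  for (; i + 4 <= rows; i += 4) {
    double t0 = 0, t1 = 0, t2 = 0, t3 = 0;
    const double *l0 = lhs + (i + 0) * lhsStride, *l1 = lhs + (i + 1) * lhsStride;
    const double *l2 = lhs + (i + 2) * lhsStride, *l3 = lhs + (i + 3) * lhsStride;
    for (int j = 0; j < cols; ++j) {
      const double b = rhs[j];
      t0 += l0[j] * b; t1 += l1[j] * b; t2 += l2[j] * b; t3 += l3[j] * b;
    }
    res[(i + 0) * resIncr] += alpha * t0; res[(i + 1) * resIncr] += alpha * t1;
    res[(i + 2) * resIncr] += alpha * t2; res[(i + 3) * resIncr] += alpha * t3;
  }
  for (; i < rows; ++i) {                // remaining rows, one at a time
    double t = 0;
    const double* l = lhs + i * lhsStride;
    for (int j = 0; j < cols; ++j) t += l[j] * rhs[j];
    res[i * resIncr] += alpha * t;
  }
}
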
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 126cfbbff..2b90abf8f 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -49,8 +49,8 @@ inline void initParallel()
{
int nbt;
internal::manage_multi_threading(GetAction, &nbt);
- std::ptrdiff_t l1, l2;
- internal::manage_caching_sizes(GetAction, &l1, &l2);
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
}
/** \returns the max number of threads reserved for Eigen
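
All cache queries now carry the extra l3 out-parameter; a hedged sketch of the calling pattern (manage_caching_sizes is Eigen-internal, so the values below are placeholders, not its actual behaviour):

#include <cstddef>

void query_caches(std::ptrdiff_t& l1, std::ptrdiff_t& l2, std::ptrdiff_t& l3)
{
  // in Eigen: internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  l1 = 32 * 1024;           // illustrative defaults only
  l2 = 256 * 1024;
  l3 = 8 * 1024 * 1024;
}
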
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 4e507b6cf..4b6316d63 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -324,20 +324,26 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
- Scalar* res, Index resStride,
+ Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = rows;
- const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
- const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
typedef gebp_traits<Scalar,Scalar> Traits;
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+ computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
// kc must be smaller than mc
kc = (std::min)(kc,mc);
@@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
- gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
- gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
- gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
- pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
// the selected lhs panel has to be split into three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
@@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
{
const Index actual_mc = (std::min)(i2+mc,k2)-i2;
// transposed packed copy
- pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
+ pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
// the block diagonal
{
@@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// symmetric packed copy
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
- gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
- (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
@@ -414,26 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
- Scalar* res, Index resStride,
+ Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = cols;
- const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-
typedef gebp_traits<Scalar,Scalar> Traits;
- Index kc = size; // cache block size along the K direction
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ ResMapper res(_res,resStride);
+
+ Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+ computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
std::size_t sizeB = kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
- gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
@@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
- pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
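
The lhs handling above relies on the selfadjoint storage convention: only one triangle is stored, so coefficients are mirrored across the diagonal on access. A scalar sketch (lower storage, column-major, names illustrative):

// Read mat(i, k) from a size x size selfadjoint matrix whose lower triangle
// is stored: above-diagonal reads are served by the transposed entry, which
// is what the transposed/symmetric packing routines above implement in bulk.
double selfadjoint_coeff(const double* mat, int size, int i, int k)
{
  return (i >= k) ? mat[k * size + i]   // stored lower triangle
                  : mat[i * size + k];  // mirrored from the transpose
}
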
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index c2d0817ea..60c99dcd2 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
- Scalar* res, Index resStride,
+ Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index depth = IsLower ? diagSize : _depth;
Index cols = _cols;
- const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
- const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
else
triangularBuffer.diagonal().setOnes();
- gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
- gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
IsLower ? k2>0 : k2<depth;
@@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
k2 = k2+actual_kc-kc;
}
- pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
+ pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
// the selected lhs panel has to be split into three different parts:
// 1 - the part which is zero => skip it
@@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
}
- pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
+ pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
- gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
+ gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
+ actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel
@@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
{
Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
- pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
+ pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
- gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
+ gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
+ lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,end)-i2;
- gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
- (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
- gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
+ actual_kc, cols, alpha, -1, -1, 0, 0);
}
}
}
@@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
- Scalar* res, Index resStride,
+ Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index depth = IsLower ? _depth : diagSize;
Index cols = IsLower ? diagSize : _cols;
- const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
- const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
else
triangularBuffer.diagonal().setOnes();
- gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
- gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
- gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
for(Index k2=IsLower ? 0 : depth;
IsLower ? k2<depth : k2>0;
@@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Scalar* geb = blockB+ts*ts;
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
- pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
+ pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
// pack the triangular part of the rhs padding the unrolled blocks with zeros
if(ts>0)
@@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
// general part
pack_rhs_panel(blockB+j2*actual_kc,
- &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
@@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
}
pack_rhs_panel(blockB+j2*actual_kc,
- triangularBuffer.data(), triangularBuffer.outerStride(),
+ RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
actualPanelWidth, actualPanelWidth,
actual_kc, j2);
}
@@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
for (Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(mc,rows-i2);
- pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+ pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
// triangular kernel
if(ts>0)
@@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
Index blockOffset = IsLower ? j2 : 0;
- gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
+ gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
alpha,
@@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
blockOffset, blockOffset);// offsets
}
}
- gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
+ gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
blockA, geb, actual_mc, actual_kc, rs,
alpha,
-1, -1, 0, 0);
@@ -402,7 +413,7 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
: ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
- BlockingType blocking(stripedRows, stripedCols, stripedDepth);
+ BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
internal::product_triangular_matrix_matrix<Scalar, Index,
Mode, LhsIsTriangular,
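
The triangularBuffer seen throughout this file lets the generic packed kernels run on the diagonal block; a sketch of how such a buffer could be filled (Lower, column-major; the unit-diagonal case corresponds to triangularBuffer.diagonal().setOnes() above):

// Copy the k x k diagonal block of a lower-triangular lhs into a dense
// buffer, zeroing the strictly upper part so generic kernels can use it.
void fill_triangular_buffer(double* buffer, const double* lhs, int lhsStride,
                            int k, bool unitDiag)
{
  for (int j = 0; j < k; ++j)
    for (int i = 0; i < k; ++i) {
      if (i < j)       buffer[j * k + i] = 0.0;
      else if (i == j) buffer[j * k + i] = unitDiag ? 1.0
                                                    : lhs[j * lhsStride + i];
      else             buffer[j * k + i] = lhs[j * lhsStride + i];
    }
}
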
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index 92d64e384..4d88a710b 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
#define EIGEN_TRIANGULARMATRIXVECTOR_H
-namespace Eigen {
+namespace Eigen {
namespace internal {
@@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
-
+
typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
@@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
ResMap res(_res,rows);
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
for (Index pi=0; pi<size; pi+=PanelWidth)
{
Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
@@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
if (r>0)
{
Index s = IsLower ? pi+actualPanelWidth : 0;
- general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
r, actualPanelWidth,
- &lhs.coeffRef(s,pi), lhsStride,
- &rhs.coeffRef(pi), rhsIncr,
+ LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
+ RhsMapper(&rhs.coeffRef(pi), rhsIncr),
&res.coeffRef(s), resIncr, alpha);
}
}
if((!IsLower) && cols>size)
{
- general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
rows, cols-size,
- &lhs.coeffRef(0,size), lhsStride,
- &rhs.coeffRef(size), rhsIncr,
+ LhsMapper(&lhs.coeffRef(0,size), lhsStride),
+ RhsMapper(&rhs.coeffRef(size), rhsIncr),
_res, resIncr, alpha);
}
}
@@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
ResMap res(_res,rows,InnerStride<>(resIncr));
-
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
for (Index pi=0; pi<diagSize; pi+=PanelWidth)
{
Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
@@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
if (r>0)
{
Index s = IsLower ? 0 : pi + actualPanelWidth;
- general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
actualPanelWidth, r,
- &lhs.coeffRef(pi,s), lhsStride,
- &rhs.coeffRef(s), rhsIncr,
+ LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
+ RhsMapper(&rhs.coeffRef(s), rhsIncr),
&res.coeffRef(pi), resIncr, alpha);
}
}
if(IsLower && rows>diagSize)
{
- general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
rows-diagSize, cols,
- &lhs.coeffRef(diagSize,0), lhsStride,
- &rhs.coeffRef(0), rhsIncr,
+ LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
+ RhsMapper(&rhs.coeffRef(0), rhsIncr),
&res.coeffRef(diagSize), resIncr, alpha);
}
}
@@ -231,7 +237,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-
+
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
@@ -251,7 +257,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
else
MappedDest(actualDestPtr, dest.size()) = dest;
}
-
+
internal::triangular_matrix_vector_product
<Index,Mode,
LhsScalar, LhsBlasTraits::NeedToConjugate,
@@ -311,7 +317,7 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
#endif
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
}
-
+
internal::triangular_matrix_vector_product
<Index,Mode,
LhsScalar, LhsBlasTraits::NeedToConjugate,
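
Throughout this file the raw (pointer, stride) pairs are now wrapped into mapper objects before calling the kernels; a minimal sketch of what such a mapper provides (modeled loosely on const_blas_data_mapper, column-major case only):

#include <cstddef>

template <typename Scalar>
struct SimpleMapper {
  const Scalar*  data;
  std::ptrdiff_t stride;
  const Scalar& operator()(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return data[i + j * stride];          // column-major addressing
  }
  SimpleMapper getSubMapper(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return SimpleMapper{data + i + j * stride, stride};
  }
};

A kernel written against such a mapper needs no separate pointer and stride arguments, which is exactly the simplification the rewritten run() signatures above exploit.
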
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 1f7afd187..f5de67c59 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
level3_blocking<Scalar,Scalar>& blocking)
{
Index cols = otherSize;
- const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
- blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
+
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+ typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+ TriMapper tri(_tri, triStride);
+ OtherMapper other(_other, otherStride);
typedef gebp_traits<Scalar,Scalar> Traits;
+
enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
@@ -71,14 +75,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
- gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
- gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
+ gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
// the goal here is to subdivide the Rhs panels such that we keep some cache
// coherence when accessing the rhs elements
- std::ptrdiff_t l1, l2;
- manage_caching_sizes(GetAction, &l1, &l2);
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
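
The subcols formula above is a small cache-budget computation; restated as a standalone sketch (same arithmetic, hypothetical helper name):

#include <algorithm>
#include <cstddef>

// Pick the rhs panel width so that roughly a quarter of L2 covers the touched
// columns, rounded down to a multiple of the register block width nr, with nr
// as a floor.
std::ptrdiff_t choose_subcols(std::ptrdiff_t cols, std::ptrdiff_t l2,
                              std::ptrdiff_t otherStride, std::ptrdiff_t nr,
                              std::size_t scalarSize)
{
  std::ptrdiff_t subcols =
      cols > 0 ? l2 / (4 * (std::ptrdiff_t)scalarSize * otherStride) : 0;
  return std::max<std::ptrdiff_t>((subcols / nr) * nr, nr);
}
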
@@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
Index blockBOffset = IsLower ? k1 : lengthTarget;
// update the respective rows of B from other
- pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+ pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
// GEBP
if (lengthTarget>0)
{
Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
- pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
+ pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
- gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+ gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
const Index actual_mc = (std::min)(mc,end-i2);
if (actual_mc>0)
{
- pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
+ pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
- gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
+ gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
}
}
}
@@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
level3_blocking<Scalar,Scalar>& blocking)
{
Index rows = otherSize;
- const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
- blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
+
+ typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+ LhsMapper lhs(_other, otherStride);
+ RhsMapper rhs(_tri, triStride);
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
@@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
- gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
- gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
- gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
- gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+ gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;
@@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
Scalar* geb = blockB+actual_kc*actual_kc;
- if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
+ if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
// triangular packing (we only pack the panels off the diagonal,
 // neglecting the blocks overlapping the diagonal)
@@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
if (panelLength>0)
pack_rhs_panel(blockB+j2*actual_kc,
- &rhs(actual_k2+panelOffset, actual_j2), triStride,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
}
@@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
// GEBP
if(panelLength>0)
{
- gebp_kernel(&lhs(i2,absolute_j2), otherStride,
+ gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
Scalar(-1),
@@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
}
// pack the just computed part of lhs to A
- pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
+ pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
actualPanelWidth, actual_mc,
actual_kc, j2);
}
}
if (rs>0)
- gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
+ gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
actual_mc, actual_kc, rs, Scalar(-1),
-1, -1, 0, 0);
}
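
The net effect of the changes in this file is a calling-convention switch:
packing routines and the GEBP kernel now receive a data-mapper object instead
of a raw (pointer, stride) pair. A minimal sketch of the new convention; the
function and its arguments (pack_demo, src, dst) are hypothetical:

    #include <Eigen/Core>
    using namespace Eigen;

    // The mapper carries base pointer and stride together; getSubMapper(i,j)
    // re-bases the view, replacing the old "&mat(i,j), stride" argument pair.
    template <typename Scalar>
    void pack_demo(Scalar* dst, const Scalar* src, int stride,
                   int i, int j, int rows) {
      internal::const_blas_data_mapper<Scalar, int, ColMajor> m(src, stride);
      internal::const_blas_data_mapper<Scalar, int, ColMajor> sub = m.getSubMapper(i, j);
      for (int r = 0; r < rows; ++r)
        dst[r] = sub(r, 0);   // old style: src[(i + r) + j*stride]
    }
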
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index ce4d10088..b994759b2 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
-namespace Eigen {
+namespace Eigen {
namespace internal {
@@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co
>::run(size, _lhs, lhsStride, rhs);
}
};
-
+
// forward and backward substitution, row-major, rhs is a vector
template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
@@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
typename internal::conditional<
Conjugate,
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
@@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
Index startRow = IsLower ? pi : pi-actualPanelWidth;
Index startCol = IsLower ? 0 : pi;
- general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
actualPanelWidth, r,
- &lhs.coeffRef(startRow,startCol), lhsStride,
- rhs + startCol, 1,
+ LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+ RhsMapper(rhs + startCol, 1),
rhs + startRow, 1,
RhsScalar(-1));
}
@@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
Index s = IsLower ? pi : i+1;
if (k>0)
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
-
+
if(!(Mode & UnitDiag))
rhs[i] /= cjLhs(i,i);
}
@@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
typename internal::conditional<Conjugate,
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
const LhsMap&
@@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
// let's directly call the low level product function because:
// 1 - it is faster to compile
 // 2 - it is slightly faster at runtime
- general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
r, actualPanelWidth,
- &lhs.coeffRef(endBlock,startBlock), lhsStride,
- rhs+startBlock, 1,
+ LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+ RhsMapper(rhs+startBlock, 1),
rhs+endBlock, 1, RhsScalar(-1));
}
}
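
The same convention reaches the vector path: general_matrix_vector_product now
takes mapper wrappers for both operands. A sketch of the new call shape,
mirroring the calls above (gemv_demo and its arguments are hypothetical):

    #include <Eigen/Core>
    using namespace Eigen;

    // Computes y -= L * x for a column-major lhs and unit-stride vectors.
    void gemv_demo(const float* lhs, int lhsStride, int rows, int cols,
                   const float* x, float* y) {
      typedef internal::const_blas_data_mapper<float, int, ColMajor> LhsMapper;
      typedef internal::const_blas_data_mapper<float, int, ColMajor> RhsMapper;
      internal::general_matrix_vector_product<int, float, LhsMapper, ColMajor, false,
                                              float, RhsMapper, false>
        ::run(rows, cols, LhsMapper(lhs, lhsStride), RhsMapper(x, 1),
              y, 1, -1.0f);
    }
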
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 9f9115c2a..3ec55fad2 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -18,13 +18,13 @@ namespace Eigen {
namespace internal {
// forward declarations
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
struct gebp_kernel;
-template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct gemm_pack_rhs;
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct gemm_pack_lhs;
template<
@@ -34,7 +34,9 @@ template<
int ResStorageOrder>
struct general_matrix_matrix_product;
-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
+template<typename Index,
+ typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
struct general_matrix_vector_product;
@@ -117,32 +119,133 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R
static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
};
+
+template<typename Scalar, typename Index>
+class BlasVectorMapper {
+ public:
+ EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
+
+ EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+ return m_data[i];
+ }
+ template <typename Packet, int AlignmentType>
+ EIGEN_ALWAYS_INLINE Packet load(Index i) const {
+ return ploadt<Packet, AlignmentType>(m_data + i);
+ }
+
+ template <typename Packet>
+ bool aligned(Index i) const {
+ return (size_t(m_data+i)%sizeof(Packet))==0;
+ }
+
+ protected:
+ Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int AlignmentType>
+class BlasLinearMapper {
+ public:
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+
+ EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
+
+ EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+ internal::prefetch(&operator()(i));
+ }
+
+ EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+ return m_data[i];
+ }
+
+ EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+ return ploadt<Packet, AlignmentType>(m_data + i);
+ }
+
+ EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+ return ploadt<HalfPacket, AlignmentType>(m_data + i);
+ }
+
+ EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+ pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
+ }
+
+ protected:
+ Scalar *m_data;
+};
+
// Lightweight helper class to access matrix coefficients.
-// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
-// and so I hope better compilation performance (time and code quality).
-template<typename Scalar, typename Index, int StorageOrder>
-class blas_data_mapper
-{
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
+class blas_data_mapper {
public:
- blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
- EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
- { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+
+ typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
+ typedef BlasVectorMapper<Scalar, Index> VectorMapper;
+
+ EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
+
+ EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
+ getSubMapper(Index i, Index j) const {
+ return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
+ }
+
+ EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+ return LinearMapper(&operator()(i, j));
+ }
+
+ EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+ return VectorMapper(&operator()(i, j));
+ }
+
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
+ }
+
+ EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+ return ploadt<Packet, AlignmentType>(&operator()(i, j));
+ }
+
+ EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+ return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
+ }
+
+ template<typename SubPacket>
+ EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
+ pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+ }
+
+ template<typename SubPacket>
+ EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+ return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+ }
+
+  Index stride() const { return m_stride; }
+
+ Index firstAligned(Index size) const {
+ if (size_t(m_data)%sizeof(Scalar)) {
+ return -1;
+ }
+ return internal::first_aligned(m_data, size);
+ }
+
protected:
- Scalar* EIGEN_RESTRICT m_data;
- Index m_stride;
+ Scalar* EIGEN_RESTRICT m_data;
+ const Index m_stride;
};
// lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, typename Index, int StorageOrder>
-class const_blas_data_mapper
-{
+class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
public:
- const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
- { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
- protected:
- const Scalar* EIGEN_RESTRICT m_data;
- Index m_stride;
+ EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
+
+ EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
+ return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
+ }
};
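
A minimal, self-contained sketch of the mapper API defined above, applied to a
plain column-major array:

    #include <Eigen/Core>
    using namespace Eigen;

    int main() {
      float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};  // a 4x2 column-major matrix
      internal::blas_data_mapper<float, int, ColMajor> map(data, 4);  // stride 4
      float a = map(2, 1);  // data[2 + 1*4] == 6
      // getSubMapper re-bases the view without copying: sub(i,j) == map(1+i, 1+j)
      internal::blas_data_mapper<float, int, ColMajor> sub = map.getSubMapper(1, 1);
      float b = sub(1, 0);  // also 6
      return (a == b) ? 0 : 1;
    }
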
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 5c7d70af6..d1855b50b 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100;
* \sa \ref RowMajorBit, \ref TopicStorageOrders */
const unsigned int NoPreferredStorageOrderBit = 0x200;
+/** \ingroup flags
+ *
+ * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
+ * that is, the expression provides:
+ * \code
+ inline const Scalar* valuePtr() const;
+ inline const Index* innerIndexPtr() const;
+ inline const Index* outerIndexPtr() const;
+ inline const Index* innerNonZeroPtr() const;
+ \endcode
+ */
+const unsigned int CompressedAccessBit = 0x400;
+
// list of flags that are inherited by default
const unsigned int HereditaryBits = RowMajorBit
@@ -449,6 +462,9 @@ enum Action {GetAction, SetAction};
/** The type used to identify a dense storage. */
struct Dense {};
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
/** The type used to identify a permutation storage. */
struct PermutationStorage {};
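
A sketch of how the new flag is meant to be queried; the trait test below is
illustrative, not an API introduced by this patch:

    // Does an expression expose compressed-storage pointers (valuePtr() etc.)?
    template <typename Xpr>
    struct provides_compressed_access {
      enum { value = (int(Eigen::internal::traits<Xpr>::Flags)
                      & Eigen::CompressedAccessBit) ? 1 : 0 };
    };
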
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 11b7e2887..07923848a 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -73,7 +73,7 @@
 /// \internal EIGEN_COMP_MSVC_STRICT set to _MSC_VER if the compiler is really Microsoft Visual C++ and not, e.g., ICC
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
- #define EIGEN_COMP_MSVC_STRICT 1
+ #define EIGEN_COMP_MSVC_STRICT _MSC_VER
#else
#define EIGEN_COMP_MSVC_STRICT 0
#endif
@@ -160,6 +160,12 @@
#define EIGEN_ARCH_ARM64 0
#endif
+#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
+ #define EIGEN_ARCH_ARM_OR_ARM64 1
+#else
+ #define EIGEN_ARCH_ARM_OR_ARM64 0
+#endif
+
/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
#if defined(__mips__) || defined(__mips)
#define EIGEN_ARCH_MIPS 1
@@ -376,10 +382,21 @@
#define EIGEN_HAVE_RVALUE_REFERENCES
#endif
+// Does the compiler support variadic templates?
+#if __cplusplus > 199711L
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#endif
+
+// Does the compiler support constant expressions (constexpr)?
+#if (defined(__cplusplus) && __cplusplus >= 201402L) || \
+ EIGEN_GNUC_AT_LEAST(4,9)
+#define EIGEN_HAS_CONSTEXPR 1
+#endif
+
/** Allows disabling some optimizations which might affect the accuracy of the result.
* Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them.
* They currently include:
- * - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
+ * - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled.
*/
#ifndef EIGEN_FAST_MATH
#define EIGEN_FAST_MATH 1
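
Both feature macros are defined rather than set to 0/1 unconditionally, so
#ifdef is the appropriate guard. A sketch, with hypothetical helpers:

    #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
    template <typename... Dims>
    int rank_of(Dims...) { return sizeof...(Dims); }  // hypothetical helper
    #endif

    #ifdef EIGEN_HAS_CONSTEXPR
    constexpr int twice(int n) { return 2 * n; }  // usable in array bounds
    #endif
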
@@ -526,7 +543,7 @@ namespace Eigen {
#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
#if !defined(EIGEN_ASM_COMMENT)
- #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
+ #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
#define EIGEN_ASM_COMMENT(X) __asm__("#" X)
#else
#define EIGEN_ASM_COMMENT(X)
@@ -540,7 +557,9 @@ namespace Eigen {
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
* vectorized and non-vectorized code.
*/
-#if EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
+#if (defined __CUDACC__)
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
#elif EIGEN_COMP_MSVC
#define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
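
With the new nvcc branch, a single aligned declaration now compiles for host
and device alike. A minimal sketch:

    // Expands to __align__(16) under nvcc, __attribute__((aligned(16))) under
    // GCC-compatible compilers, and __declspec(align(16)) under MSVC.
    struct EIGEN_ALIGN_TO_BOUNDARY(16) AlignedQuad {
      float v[4];
    };
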
@@ -592,7 +611,7 @@ namespace Eigen {
// just an empty macro !
#define EIGEN_EMPTY
-#if EIGEN_COMP_MSVC_STRICT
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index a54ccaedc..16f8cc1b0 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
*** Implementation of generic aligned realloc (when no realloc can be used)***
*****************************************************************************/
-void* aligned_malloc(std::size_t size);
-void aligned_free(void *ptr);
+EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size);
+EIGEN_DEVICE_FUNC void aligned_free(void *ptr);
/** \internal
* \brief Reallocates aligned memory.
@@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
*****************************************************************************/
#ifdef EIGEN_NO_MALLOC
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{
eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
}
#elif defined EIGEN_RUNTIME_NO_MALLOC
-inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
{
static bool value = true;
if (update == 1)
value = new_value;
return value;
}
-inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
-inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{
eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
}
#else
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{}
#endif
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16- or 32-byte alignment depending on the requirements.
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
*/
-inline void* aligned_malloc(size_t size)
+EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
{
check_that_malloc_is_allowed();
@@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size)
}
/** \internal Frees memory allocated with aligned_malloc. */
-inline void aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{
#if !EIGEN_ALIGN
std::free(ptr);
@@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
* On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
*/
-template<bool Align> inline void* conditional_aligned_malloc(size_t size)
+template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
{
return aligned_malloc(size);
}
-template<> inline void* conditional_aligned_malloc<false>(size_t size)
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
{
check_that_malloc_is_allowed();
@@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size)
}
/** \internal Frees memory allocated with conditional_aligned_malloc */
-template<bool Align> inline void conditional_aligned_free(void *ptr)
+template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
{
aligned_free(ptr);
}
-template<> inline void conditional_aligned_free<false>(void *ptr)
+template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{
std::free(ptr);
}
@@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
/** \internal Destructs the elements of an array.
* The \a size parameter tells on how many objects to call the destructor of T.
*/
-template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
{
// always destruct an array starting from the end.
if(ptr)
@@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
/** \internal Constructs the elements of an array.
* The \a size parameter tells on how many objects to call the constructor of T.
*/
-template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
{
size_t i;
EIGEN_TRY
@@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
*****************************************************************************/
template<typename T>
-EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
{
if(size > size_t(-1) / sizeof(T))
throw_std_bad_alloc();
@@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
* On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
* The default constructor of T is called.
*/
-template<typename T> inline T* aligned_new(size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
{
check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
@@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size)
}
}
-template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
{
check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
@@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
/** \internal Deletes objects constructed with aligned_new
* The \a size parameter tells on how many objects to call the destructor of T.
*/
-template<typename T> inline void aligned_delete(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size)
{
destruct_elements_of_array<T>(ptr, size);
aligned_free(ptr);
@@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size)
/** \internal Deletes objects constructed with conditional_aligned_new
* The \a size parameter tells on how many objects to call the destructor of T.
*/
-template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
{
destruct_elements_of_array<T>(ptr, size);
conditional_aligned_free<Align>(ptr);
}
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
{
check_size_for_overflow<T>(new_size);
check_size_for_overflow<T>(old_size);
@@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt
}
-template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
{
if(size==0)
return 0; // short-cut. Also fixes Bug 884
@@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
return result;
}
-template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
{
if(NumTraits<T>::RequireInitialization)
destruct_elements_of_array<T>(ptr, size);
@@ -523,9 +523,8 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *
template<typename Scalar, typename Index>
inline Index first_aligned(const Scalar* array, Index size)
{
- enum { PacketSize = packet_traits<Scalar>::size,
- PacketAlignedMask = PacketSize-1
- };
+ static const Index PacketSize = packet_traits<Scalar>::size;
+ static const Index PacketAlignedMask = PacketSize-1;
if(PacketSize==1)
{
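
A host-side sketch exercising two of the functions touched above; the
EIGEN_DEVICE_FUNC annotations do not change host behaviour:

    #include <Eigen/Core>

    int main() {
      // 16- or 32-byte aligned depending on the enabled packet width
      float* p = static_cast<float*>(
          Eigen::internal::aligned_malloc(64 * sizeof(float)));
      // index of the first element aligned for packet access; 0 here
      int k = Eigen::internal::first_aligned(p, 64);
      Eigen::internal::aligned_free(p);
      return k;
    }
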
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 299e5cbc2..528ebe297 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -463,6 +463,21 @@ template<typename XprType, typename CastType> struct cast_return_type
const XprType&,CastType>::type type;
};
+template <typename A, typename B> struct promote_storage_type;
+
+template <typename A> struct promote_storage_type<A,A>
+{
+ typedef A ret;
+};
+template <typename A> struct promote_storage_type<A, const A>
+{
+ typedef A ret;
+};
+template <typename A> struct promote_storage_type<const A, A>
+{
+ typedef A ret;
+};
+
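+// Usage sketch (illustration only): mixing const and non-const storage kinds
+// resolves to the plain kind, e.g.
+//   promote_storage_type<Dense, const Dense>::ret   is   Dense
+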
/** \internal Specifies the "storage kind" resulting from applying a coefficient-wise
* binary operation between two expressions of kinds A and B respectively.
* The template parameter Functor permits specializing the resulting storage kind with respect to