aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--Eigen/src/Core/ProductEvaluators.h4
-rw-r--r--Eigen/src/Core/util/Memory.h80
-rw-r--r--Eigen/src/Core/util/XprHelper.h2
-rw-r--r--test/product_notemporary.cpp10
4 files changed, 89 insertions, 7 deletions
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 76e5083c1..22ad32ae3 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -272,7 +272,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
evaluator<Rhs> rhsEval(rhs);
- typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+ ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols();
@@ -285,7 +285,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{
evaluator<Lhs> lhsEval(lhs);
- typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+ ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows();
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 22d7679c5..43920bdc8 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -582,6 +582,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
bool m_deallocate;
};
+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+ bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+ >
+struct local_nested_eval_wrapper
+{
+ static const bool NeedExternalBuffer = false;
+ typedef typename Xpr::Scalar Scalar;
+ typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+ ObjectType object;
+
+ EIGEN_DEVICE_FUNC
+ local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+ {
+ EIGEN_UNUSED_VARIABLE(ptr);
+ eigen_internal_assert(ptr==0);
+ }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+ static const bool NeedExternalBuffer = true;
+ typedef typename Xpr::Scalar Scalar;
+ typedef typename plain_object_eval<Xpr>::type PlainObject;
+ typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+ ObjectType object;
+
+ EIGEN_DEVICE_FUNC
+ local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+ : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+ m_deallocate(ptr==0)
+ {
+ if(NumTraits<Scalar>::RequireInitialization && object.data())
+ Eigen::internal::construct_elements_of_array(object.data(), object.size());
+ object = xpr;
+ }
+
+ EIGEN_DEVICE_FUNC
+ ~local_nested_eval_wrapper()
+ {
+ if(NumTraits<Scalar>::RequireInitialization && object.data())
+ Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+ if(m_deallocate)
+ Eigen::internal::aligned_free(object.data());
+ }
+
+private:
+ bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
template<typename T> class scoped_array : noncopyable
{
T* m_ptr;
@@ -609,9 +663,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
} // end namespace internal
/** \internal
- * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
- * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
- * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+ *
+ * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+ * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+ * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+ * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
* The allocated buffer is automatically deleted when exiting the scope of this declaration.
* If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
* Here is an example:
@@ -622,6 +678,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
* }
* \endcode
* The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+ *
+ * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to
+ * \code
+ * typename internal::nested_eval<XPRT_T,N>::type NAME(XPR);
+ * \endcode
+ * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+ * This is accomplished through alloca if this later is supported and if the required number of bytes
+ * is below EIGEN_STACK_ALLOCATION_LIMIT.
*/
#ifdef EIGEN_ALLOCA
@@ -641,6 +705,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
: Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
+
+ #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+ Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+ ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+ ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+ typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
#else
#define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -648,6 +719,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR);
+
#endif
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 7311adec7..3926d3a6c 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -460,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
{
enum {
ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
- CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory?
+ CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary?
// Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
// This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
// for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp
index 30592b79e..062180f42 100644
--- a/test/product_notemporary.cpp
+++ b/test/product_notemporary.cpp
@@ -128,11 +128,19 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );
// Check outer products
+ #ifdef EIGEN_ALLOCA
+ bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT;
+ #else
+ bool temp_via_alloca = false;
+ #endif
m3 = cv1 * rv1;
VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
- VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 );
+ VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
+ rm3 = cv1 * rv1;
+ VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 );
+ VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );