author     Deven Desai <deven.desai.amd@gmail.com>   2018-07-11 09:17:33 -0400
committer  Deven Desai <deven.desai.amd@gmail.com>   2018-07-11 09:17:33 -0400
commit     38807a257500cd0746b819c994efab805b8a02e4 (patch)
tree       0be837e16ad1dc2b09d8f2be2f752f074b169717
parent     e2b2c61533cb923ddba41ba7bd64b87f30a25e29 (diff)
parent     f00d08cc0a987fa624209b920608b56638404f13 (diff)
merging updates from upstream
55 files changed, 978 insertions, 335 deletions
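The bulk of this merge extends Eigen's GPU support: host-only constructs (std::swap, std::max) are replaced by device-safe numext equivalents, relaxed-constexpr detection is added, and EIGEN_DEVICE_FUNC annotations spread across Core. As a hedged sketch (simplified; the real Eigen/Core hunk below also handles SYCL, the constexpr detection, and the runtime-header includes), the compiler dispatch boils down to:

```cpp
// Simplified restatement of the Eigen/Core hunk below -- illustration only,
// not the verbatim header.
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
  // Device compilers cannot emit host SIMD intrinsics, so turn off explicit
  // vectorization and make every function callable from host and device.
  #ifndef EIGEN_DONT_VECTORIZE
  #define EIGEN_DONT_VECTORIZE
  #endif
  #define EIGEN_DEVICE_FUNC __host__ __device__
#else
  #define EIGEN_DEVICE_FUNC
#endif
```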
diff --git a/Eigen/Core b/Eigen/Core
index f67bffd12..4336de91d 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -56,47 +56,59 @@
   #ifdef EIGEN_EXCEPTIONS
   #undef EIGEN_EXCEPTIONS
   #endif
+#endif
 
-// All functions callable from CUDA code must be qualified with __device__
-#ifdef EIGEN_CUDACC
-  // Do not try to vectorize on CUDA and SYCL!
-  #ifndef EIGEN_DONT_VECTORIZE
-  #define EIGEN_DONT_VECTORIZE
-  #endif
-
-  #define EIGEN_DEVICE_FUNC __host__ __device__
-  // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
-  // works properly on the device side
-  #include <cuda_runtime.h>
+// All functions callable from CUDA code must be qualified with __device__
+#ifdef EIGEN_CUDACC
+  // Do not try to vectorize on CUDA and SYCL!
+  #ifndef EIGEN_DONT_VECTORIZE
+  #define EIGEN_DONT_VECTORIZE
+  #endif
 
-#elif defined(EIGEN_HIPCC)
-  // Do not try to vectorize on HIP
-  #ifndef EIGEN_DONT_VECTORIZE
-  #define EIGEN_DONT_VECTORIZE
+  #define EIGEN_DEVICE_FUNC __host__ __device__
+  // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
+  // works properly on the device side
+  #include <cuda_runtime.h>
+
+  #if EIGEN_HAS_CONSTEXPR
+    // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+    #if defined(__NVCC__)
+      // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+      #ifdef __CUDACC_RELAXED_CONSTEXPR__
+        #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+      #endif
+    #elif defined(__clang__) && defined(__CUDA__)
+      // clang++ always considers constexpr functions as implicitly __host__ __device__
+      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+    #endif
   #endif
-  #define EIGEN_DEVICE_FUNC __host__ __device__
-  // We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
-  // works properly on the device side
-  #include <hip/hip_runtime.h>
+#elif defined(EIGEN_HIPCC)
+  // Do not try to vectorize on HIP
+  #ifndef EIGEN_DONT_VECTORIZE
+  #define EIGEN_DONT_VECTORIZE
+  #endif
+
+  #define EIGEN_DEVICE_FUNC __host__ __device__
+  // We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
+  // works properly on the device side
+  #include <hip/hip_runtime.h>
 
-  #if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP)
-  // analogous to EIGEN_CUDA_ARCH, but for HIP
-  #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
-  // Note this check needs to come after we include hip_runtime.h since
-  // hip_runtime.h includes hip_common.h which in turn has the define
-  // for __HIP_DEVICE_COMPILE__
-  #endif
-
-  #else
-  #define EIGEN_DEVICE_FUNC
+  #if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP)
+  // analogous to EIGEN_CUDA_ARCH, but for HIP
+  #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+  // Note this check needs to come after we include hip_runtime.h since
+  // hip_runtime.h includes hip_common.h which in turn has the define
+  // for __HIP_DEVICE_COMPILE__
+  #endif
+#else
+  #define EIGEN_DEVICE_FUNC
 #endif
 
 #ifdef __NVCC__
-#define EIGEN_DONT_VECTORIZE
+  #ifndef EIGEN_DONT_VECTORIZE
+  #define EIGEN_DONT_VECTORIZE
+  #endif
 #endif
diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
index 340edf51f..340edf51f 100755..100644
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 688aadd62..757b31825 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
-    const typename internal::remove_all<NestedExpressionType>::type&
     EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<NestedExpressionType>::type&
     nestedExpression() const
     {
       return m_expression;
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 6231f2edd..b65ec4ffb 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -1080,7 +1080,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
     : m_argImpl(block.nestedExpression()),
       m_startRow(block.startRow()),
       m_startCol(block.startCol()),
-      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
+      m_linear_offset((InnerPanel|| XprType::IsVectorAtCompileTime)?(XprType::IsRowMajor ? block.startRow()*block.cols() + block.startCol() : block.startCol()*block.rows() + block.startRow()):0)
   { }
 
   typedef typename XprType::Scalar Scalar;
@@ -1088,7 +1088,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
-    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+    ForwardLinearAccess = (InnerPanel || XprType::IsVectorAtCompileTime) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1162,7 +1162,7 @@ protected:
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
-  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
+  const variable_if_dynamic<Index, (InnerPanel || XprType::IsVectorAtCompileTime) ? Dynamic : 0> m_linear_offset;
 };
 
 // TODO: This evaluator does not actually use the child evaluator;
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 9e58fbf88..3c02a1025 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
       EIGEN_UNUSED_VARIABLE(rows);
       EIGEN_UNUSED_VARIABLE(cols);
     }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data, other.m_data);
+    }
     EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
     EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-    void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
#endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
     void conservativeResize(Index size, Index rows, Index cols)
@@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
#endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
       return *this;
     }
#endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
     void conservativeResize(Index size, Index rows, Index)
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 261f77b99..43f3b84c8 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -163,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };
 
 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };
 
 template<typename Scalar,int Size,int MaxSize>
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 6415a7696..72aa68d45 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -869,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
 
 namespace numext {
 
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -886,19 +886,16 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
   return max EIGEN_NOT_A_MACRO (x,y);
 }
 
-
 #elif defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
-
   return y < x ? y : x;
 }
 
 template<typename T>
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
-
   return x < y ? y : x;
 }
 
@@ -942,7 +939,6 @@ EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned lo
   return cl::sycl::max(x,y);
 }
 
-
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return cl::sycl::fmin(x,y);
@@ -976,6 +972,19 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return fminf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+  return fmin(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+  return fminl(x, y);
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -988,6 +997,18 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
   return fmaxf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+  return fmax(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+  return fmaxl(x, y);
+}
 #endif
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 28737c15e..a23e93ccb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -67,7 +67,7 @@ T generic_fast_tanh_float(const T& a_x)
 }
 
 template<typename RealScalar>
-EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
   EIGEN_USING_STD_MATH(sqrt);
@@ -82,7 +82,8 @@ template<typename Scalar>
 struct hypot_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
   {
     EIGEN_USING_STD_MATH(abs);
     return positive_real_hypot<RealScalar>(abs(x), abs(y));
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 11435903b..6046c8bae 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -328,6 +328,7 @@ template<typename Derived> class MatrixBase
 
     inline const PartialPivLU<PlainObject> lu() const;
 
+    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
@@ -337,12 +338,15 @@ template<typename Derived> class MatrixBase
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
     template<typename ResultType>
     inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
+    EIGEN_DEVICE_FUNC
     Scalar determinant() const;
 
     /////////// Cholesky module ///////////
@@ -414,15 +418,19 @@ template<typename Derived> class MatrixBase
 
     ////////// Householder module ///////////
 
+    EIGEN_DEVICE_FUNC
     void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void makeHouseholder(EssentialPart& essential,
                          Scalar& tau, RealScalar& beta) const;
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheRight(const EssentialPart& essential,
                                     const Scalar& tau,
                                     Scalar* workspace);
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 5567d4c90..b053cff07 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -21,12 +21,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits10; }
 };
 
 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log10;
     using std::ceil;
@@ -38,6 +40,7 @@ struct default_digits10_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits10_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
 
@@ -49,12 +52,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits; }
 };
 
 template<typename T>
 struct default_digits_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log;
     using std::ceil;
@@ -66,6 +71,7 @@ struct default_digits_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index b1fb455b9..acd085301 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
     #endif
 
     /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
 
     /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
 
     /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 3d67d9489..70790dbd4 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -127,7 +127,7 @@ public:
   using Base::derived;
   typedef typename Base::Scalar Scalar;
 
-  EIGEN_STRONG_INLINE operator const Scalar() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
   {
     return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
   }
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 2bb42f74b..0330b5741 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -272,7 +272,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
 void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
   evaluator<Rhs> rhsEval(rhs);
-  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
   // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
@@ -285,7 +285,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
 void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
   evaluator<Lhs> lhsEval(lhs);
-  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
   // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
@@ -396,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // but easier on the compiler side
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
   }
-  
+
   template<typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
@@ -410,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // dst.noalias() -= lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
+
+  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+  //    dst {,+,-}= s * (A.lazyProduct(B))
+  // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
+  // For them, this strategy is also faster than simply by-passing the heap allocation through
+  // stack allocation.
+  // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
+  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                    const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+  }
+
+  // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
+  // overload more specialized.
+  template<typename Dst, typename LhsT, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
 
 //  template<typename Dst>
 //  static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@@ -741,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
 
   template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
   }
@@ -785,7 +812,11 @@ public:
     _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
     _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
     Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
-    Alignment = evaluator<MatrixType>::Alignment
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
+                   || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                   || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
   };
 
   diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -797,7 +828,10 @@ public:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
   {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+    if(AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
   }
 
 protected:
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0a99acb21..ddce65468 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -23,22 +23,22 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/
 
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
-    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
   enum {
     PacketSize = unpacket_traits<PacketType>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime
   };
 
   enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
     MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
   };
 
@@ -51,8 +51,8 @@ public:
 public:
   enum {
-    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
-         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
@@ -64,9 +64,9 @@ public:
#ifdef EIGEN_DEBUG_ASSIGN
   static void debug()
   {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
     std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
     std::cerr.unsetf(std::ios::hex);
     EIGEN_DEBUG_VAR(InnerMaxSize)
     EIGEN_DEBUG_VAR(PacketSize)
@@ -87,88 +87,88 @@ public:
 
 /*** no vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
   {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
   {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
   }
 };
 
 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
-struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 
 /*** vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_vec_unroller
 {
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     HalfLength = Length/2
   };
 
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
 
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
   {
     return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    index = Start * redux_traits<Func, Derived>::PacketSize,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = Derived::Alignment
+    index = Start * redux_traits<Func, Evaluator>::PacketSize,
+    outer = index / int(Evaluator::InnerSizeAtCompileTime),
+    inner = index % int(Evaluator::InnerSizeAtCompileTime),
+    alignment = Evaluator::Alignment
   };
 
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
 
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
   {
-    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+    return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
   }
 };
 
@@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
 * Part 3 : implementation of all cases
 ***************************************************************************/
 
-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+    res = eval.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < xpr.innerSize(); ++i)
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < xpr.outerSize(); ++i)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+  {
+    return Base::run(eval,func);
+  }
+};
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
 
-  static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    const Index size = mat.size();
+    const Index size = xpr.size();
 
-    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(xpr);
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
         }
 
         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
       for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
 
       for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
     else // too small to vectorize anything.
         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
    {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
      for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
    }
 
    return res;
@@ -265,130 +277,105 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 };
 
 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
 
-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
    enum {
-      packetSize = redux_traits<Func, Derived>::PacketSize
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
    };
    const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
    Scalar res;
    if(packetedInnerSize)
    {
-      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
 
      res = func.predux(packet_res);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
    }
    else // too small to vectorize anything.
         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
    {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
    }
 
    return res;
  }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
-    Size = Derived::SizeAtCompileTime,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
  };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
    if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
      if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
      return res;
    }
    else {
-      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
    }
  }
 };
 
 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
 {
+  typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
 
   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };
 
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index index) const
-  { return m_evaluator.coeff(index); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-
   EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
 
   template<int LoadMode, typename PacketType>
   PacketType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-
-  const XprType & nestedExpression() const { return m_xpr; }
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
 
-protected:
-  internal::evaluator<XprType> m_evaluator;
-  const XprType &m_xpr;
 };
 
 } // end namespace internal
@@ -415,7 +402,9 @@ DenseBase<Derived>::redux(const Func& func) const
 
   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }
 
 /** \returns the minimum of all coefficients of \c *this.
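The Redux.h refactor above changes the reducer protocol: redux_evaluator now simply derives from internal::evaluator, and the original expression travels alongside it, so sizes are queried on the expression while coefficients come from the evaluator. A minimal sketch of the new calling convention (simplified from the DefaultTraversal/NoUnrolling hunk above; Index and eigen_assert are Eigen's, the function name is hypothetical):

```cpp
// Sketch: sizes come from the expression (xpr), data from the evaluator (eval).
template<typename Func, typename Evaluator, typename XprType>
typename Evaluator::Scalar redux_default_sketch(const Evaluator& eval,
                                                const Func& func,
                                                const XprType& xpr)
{
  eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
  typename Evaluator::Scalar res = eval.coeffByOuterInner(0, 0);
  for(Index i = 1; i < xpr.innerSize(); ++i)           // first outer slice
    res = func(res, eval.coeffByOuterInner(0, i));
  for(Index i = 1; i < xpr.outerSize(); ++i)           // remaining slices
    for(Index j = 0; j < xpr.innerSize(); ++j)
      res = func(res, eval.coeffByOuterInner(i, j));
  return res;
}
```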
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index ba7d6e629..d7c204579 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
     nestedExpression() { return m_matrix; }
 
     /** \internal */
+    EIGEN_DEVICE_FUNC
     void resize(Index nrows, Index ncols) {
       m_matrix.resize(ncols,nrows);
     }
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index ab73fcf21..521de6160 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     inline Index innerStride() const { return derived().innerStride(); }
 
     // dummy resize function
+    EIGEN_DEVICE_FUNC
     void resize(Index rows, Index cols)
     {
       EIGEN_UNUSED_VARIABLE(rows);
@@ -716,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
   typedef TriangularView<MatrixType,Mode> XprType;
   typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  EIGEN_DEVICE_FUNC
   unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 6d7190a56..7f4e90f75 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -784,7 +784,7 @@ typedef __vector __bool long Packet2bl;
 
 static Packet2l p2l_ONE = { 1, 1 };
 static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d p2d_ONE = { 1.0, 1.0 }; 
+static Packet2d p2d_ONE = { 1.0, 1.0 };
 static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
 static Packet2d p2d_MZERO = { -0.0, -0.0 };
 
@@ -1001,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
   Packet2d v[2], sum;
   v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
   v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
- 
+
#ifdef _BIG_ENDIAN
   sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else
@@ -1054,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
   Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
   return vec_sel(elsePacket, thenPacket, mask);
 }
#endif // __VSX__
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index e1ecac1ab..b0a72e1f9 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -497,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 16,
     HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -550,6 +550,21 @@ template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet1
 }
 
 template<> EIGEN_STRONG_INLINE Packet16h
+ploaddup<Packet16h>(const Eigen::half* from) {
+  Packet16h result;
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  unsigned short e = from[4].x;
+  unsigned short f = from[5].x;
+  unsigned short g = from[6].x;
+  unsigned short h = from[7].x;
+  result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
 ploadquad(const Eigen::half* from) {
   Packet16h result;
   unsigned short a = from[0].x;
@@ -621,6 +636,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+  // FIXME we could do that with bit manipulation
+  Packet16f af = half2float(a);
+  Packet16f rf = pnegate(af);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
@@ -628,6 +650,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
   return float2half(rf);
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = psub(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
@@ -640,6 +669,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
   return half(predux(from_float));
 }
 
+template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux_mul(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
+  Packet16f pf[16];
+  pf[0] = half2float(p[0]);
+  pf[1] = half2float(p[1]);
+  pf[2] = half2float(p[2]);
+  pf[3] = half2float(p[3]);
+  pf[4] = half2float(p[4]);
+  pf[5] = half2float(p[5]);
+  pf[6] = half2float(p[6]);
+  pf[7] = half2float(p[7]);
+  pf[8] = half2float(p[8]);
+  pf[9] = half2float(p[9]);
+  pf[10] = half2float(p[10]);
+  pf[11] = half2float(p[11]);
+  pf[12] = half2float(p[12]);
+  pf[13] = half2float(p[13]);
+  pf[14] = half2float(p[14]);
+  pf[15] = half2float(p[15]);
+  Packet16f reduced = preduxp<Packet16f>(pf);
+  return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  Packet16h res;
+  res.x = _mm256_insertf128_si256(
+            _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
+            _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
+{
+  Packet16h res;
+  res.x = _mm256_insert_epi16(a.x,b.x,0);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
+{
+  Packet16h res;
+  res.x = _mm256_insert_epi16(a.x,b.x,15);
+  return res;
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
 {
   Packet16h result;
@@ -747,20 +827,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
   // NOTE: no unpacklo/hi instr in this case, so using permute instr.
   __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
-  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
-  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
-  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
-  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
-  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
-  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
-  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
-  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
-  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
-  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
-  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
-  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
-  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
-  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
   __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
 
   kernel.packet[0].x = a_p_0;
@@ -865,10 +945,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -918,6 +998,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
 }
 
 template<> EIGEN_STRONG_INLINE Packet8h
+ploaddup<Packet8h>(const Eigen::half* from) {
+  Packet8h result;
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
 ploadquad<Packet8h>(const Eigen::half* from) {
   Packet8h result;
   unsigned short a = from[0].x;
@@ -970,6 +1061,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 
 template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
 
+template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+  // FIXME we could do that with bit manipulation
+  Packet8f af = half2float(a);
+  Packet8f rf = pnegate(af);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
@@ -977,6 +1075,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
   return float2half(rf);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = psub(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
@@ -1029,6 +1134,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
   return Eigen::half(reduced);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
+  Packet8f pf[8];
+  pf[0] = half2float(p[0]);
+  pf[1] = half2float(p[1]);
+  pf[2] = half2float(p[2]);
+  pf[3] = half2float(p[3]);
+  pf[4] = half2float(p[4]);
+  pf[5] = half2float(p[5]);
+  pf[6] = half2float(p[6]);
+  pf[7] = half2float(p[7]);
+  Packet8f reduced = preduxp<Packet8f>(pf);
+  return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  Packet8h res;
+  res.x = _mm_shuffle_epi8(a.x,m);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
+{
+  Packet8h res;
+  res.x = _mm_insert_epi16(a.x,int(b.x),0);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
+{
+  Packet8h res;
+  res.x = _mm_insert_epi16(a.x,int(b.x),7);
+  return res;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet8h>
+{
+  static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
+  {
+    if (Offset!=0)
+      first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
+  }
+};
+
 EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<Packet8h,8>& kernel) {
   __m128i a = kernel.packet[0].x;
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index dd4f366ee..1890efd4d 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1526,10 +1526,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
         // The following piece of code won't work for 512 bit registers
         // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
         // as nr (which is currently 4) for the return type.
-        typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
+        const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
         if ((SwappedTraits::LhsProgress % 4) == 0 &&
             (SwappedTraits::LhsProgress <= 8) &&
-            (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
+            (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
         {
           SAccPacket C0, C1, C2, C3;
           straits.initAcc(C0);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ed4d3182b..dc47cf7cf 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -431,10 +431,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
     // to determine the following heuristic.
     // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
     // unless it has been specialized by the user or for a given architecture.
-    // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
     // I'm not sure it is still required.
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::evalTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
     else
     {
       dst.setZero();
@@ -446,7 +446,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::addTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst,lhs, rhs, Scalar(1));
   }
@@ -455,7 +455,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::subTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
   }
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 67390f1d7..d38fd72b2 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 struct selfadjoint_matrix_vector_product
 
 {
-static EIGEN_DONT_INLINE void run(
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
   Index size,
   const Scalar* lhs, Index lhsStride,
   const Scalar* rhs,
@@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run(
 };
 
 template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar* lhs, Index lhsStride,
   const Scalar* rhs,
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-
-  Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+  Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
   if (FirstTriangular)
     bound = size - bound;
 
@@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   enum { LhsUpLo = LhsMode&(Upper|Lower) };
 
   template<typename Dest>
-  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index d395888e5..09209f733 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
 template<typename Scalar, typename Index, typename UType, typename VType>
 struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
 {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
   {
     const Index size = u.size();
     for (Index i=0; i<size; ++i)
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index b994759b2..647317016 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
 
-        if(!(Mode & UnitDiag))
+        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
           rhs[i] /= cjLhs(i,i);
       }
     }
@@ -114,20 +114,23 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       for(Index k=0; k<actualPanelWidth; ++k)
       {
         Index i = IsLower ? pi+k : pi-k-1;
-        if(!(Mode & UnitDiag))
-          rhs[i] /= cjLhs.coeff(i,i);
-
-        Index r = actualPanelWidth - k - 1; // remaining size
-        Index s = IsLower ? i+1 : i-r;
-        if (r>0)
-          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        {
+          if(!(Mode & UnitDiag))
+            rhs[i] /= cjLhs.coeff(i,i);
+
+          Index r = actualPanelWidth - k - 1; // remaining size
+          Index s = IsLower ? i+1 : i-r;
+          if (r>0)
+            Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        }
       }
       Index r = IsLower ?
size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slighlty faster at runtime + // 2 - it is slightly faster at runtime general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index b1791fb3a..705925984 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -289,8 +289,8 @@ template<typename XprType> struct blas_traits ExtractType, typename _ExtractType::PlainObject >::type DirectLinearAccessType; - static inline ExtractType extract(const XprType& x) { return x; } - static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } + static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -318,8 +318,8 @@ struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp typedef blas_traits<NestedXpr> Base; typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template<typename Scalar, typename NestedXpr, typename Plain> diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index 78a4705cd..bf99cd8ab 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -192,7 +192,7 @@ inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); } // The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload. // This way a code like fix<N> can only refer to the previous function. template<int N,typename T> -inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(val); } +inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); } #endif #else // EIGEN_PARSED_BY_DOXYGEN diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 059d06874..f2cac01ac 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -580,7 +580,7 @@ template<typename T> struct smart_memmove_helper<T,false> { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#ifndef EIGEN_ALLOCA +#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -599,12 +599,14 @@ template<typename T> class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. 
* Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ + EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits<T>::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } + EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits<T>::RequireInitialization && m_ptr) @@ -618,6 +620,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; +#ifdef EIGEN_ALLOCA + +template<typename Xpr, int NbEvaluations, + bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic + > +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = false; + typedef typename Xpr::Scalar Scalar; + typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) + { + EIGEN_UNUSED_VARIABLE(ptr); + eigen_internal_assert(ptr==0); + } +}; + +template<typename Xpr, int NbEvaluations> +struct local_nested_eval_wrapper<Xpr,NbEvaluations,true> +{ + static const bool NeedExternalBuffer = true; + typedef typename Xpr::Scalar Scalar; + typedef typename plain_object_eval<Xpr>::type PlainObject; + typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) + : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), + m_deallocate(ptr==0) + { + if(NumTraits<Scalar>::RequireInitialization && object.data()) + Eigen::internal::construct_elements_of_array(object.data(), object.size()); + object = xpr; + } + + EIGEN_DEVICE_FUNC + ~local_nested_eval_wrapper() + { + if(NumTraits<Scalar>::RequireInitialization && object.data()) + Eigen::internal::destruct_elements_of_array(object.data(), object.size()); + if(m_deallocate) + Eigen::internal::aligned_free(object.data()); + } + +private: + bool m_deallocate; +}; + +#endif // EIGEN_ALLOCA + template<typename T> class scoped_array : noncopyable { T* m_ptr; @@ -645,9 +701,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b) } // end namespace internal /** \internal - * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. + * + * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, + * and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
* Here is an example: @@ -658,6 +716,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b) * } * \endcode * The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token. + * + * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogous to + * \code + * typename internal::nested_eval<XPR_T,N>::type NAME(XPR); + * \endcode + * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. + * This is accomplished through alloca if the latter is supported and if the required number of bytes + * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA @@ -677,6 +743,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) + + #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ + Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \ + ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ + ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ + typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) + #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ @@ -684,6 +757,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b) TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true) + +#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR); + #endif diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 5a358bc12..f37aac56b 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -583,6 +583,7 @@ using std::numeric_limits; // Integer division with rounding up.
// T is assumed to be an integer type with a>=0, and b>0 template<typename T> +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; @@ -593,7 +594,7 @@ T div_ceil(const T &a, const T &b) template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_GPU_COMPILE_PHASE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); } @@ -604,7 +605,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to<double template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_GPU_COMPILE_PHASE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); } diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 16714fa5c..e3231c712 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -460,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T> { enum { ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost, - CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory? + CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluates itself into a temporary? // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. // This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON // for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
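// A standalone sketch of the div_ceil helper that the hunk above marks
// EIGEN_DEVICE_FUNC: integer division rounding up, assuming a >= 0 and
// b > 0 as its comment states (illustration; Eigen's lives in internal::).
#include <cassert>

template<typename T>
T div_ceil(const T &a, const T &b) { return (a+b-1) / b; }

int main() {
  assert(div_ceil(7, 3) == 3);  // 7/3 rounds up to 3
  assert(div_ceil(6, 3) == 2);  // exact divisions are unchanged
  assert(div_ceil(0, 5) == 0);  // a == 0 boundary case
  return 0;
}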
@@ -676,17 +676,32 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> > template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> > { enum { ret = true }; }; + +template<typename T> struct is_identity +{ enum { value = false }; }; + +template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> > +{ enum { value = true }; }; + + template<typename S1, typename S2> struct glue_shapes; template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; }; template<typename T1, typename T2> -bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0) +struct possibly_same_dense { + enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value }; +}; + +template<typename T1, typename T2> +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template<typename T1, typename T2> -bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0) +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0) { return false; } diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 4c53344bb..aca8a8279 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -270,8 +270,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType> // Step 1. Reduce to Hessenberg form m_hess.compute(matrix.derived()/scale); - // Step 2. Reduce to real Schur form - computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU); + // Step 2. Reduce to real Schur form + // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg + // to be able to pass our working-space buffer for the Householder to Dense evaluation. 
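// A sketch of what the possibly_same_dense guard above changes: is_same_dense
// now refuses to compare expressions with different scalar types instead of
// comparing unrelated data pointers. This mirrors the new checks in
// test/is_same_dense.cpp further down; is_same_dense is an internal helper,
// shown here for illustration only.
#include <Eigen/Dense>
#include <cassert>

int main() {
  using Eigen::internal::is_same_dense;
  Eigen::MatrixXd  m1(10,10);
  Eigen::MatrixXcd m2(10,10);
  Eigen::Ref<Eigen::MatrixXd> ref_m1(m1);
  assert(is_same_dense(m1, ref_m1));  // same data and strides
  Eigen::Ref<Eigen::MatrixXd, 0, Eigen::Stride<Eigen::Dynamic,Eigen::Dynamic> > ref_m2_real(m2.real());
  assert(!is_same_dense(m2, ref_m2_real));  // double vs complex<double>: scalar types differ
  return 0;
}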
+ m_workspaceVector.resize(matrix.cols()); + if(computeU) + m_hess.matrixQ().evalTo(m_matU, m_workspaceVector); + computeFromHessenberg(m_hess.matrixH(), m_matU, computeU); m_matT *= scale; @@ -285,8 +290,8 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa m_matT = matrixH; m_workspaceVector.resize(m_matT.cols()); - if(computeU) - matrixQ.evalTo(m_matU, m_workspaceVector); + if(computeU && !internal::is_same_dense(m_matU,matrixQ)) + m_matU = matrixQ; Index maxIters = m_maxIters; if (maxIters == -1) diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index bf28edc0e..fbc1ee2f6 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues; + template<typename MatrixType, typename DiagType, typename SubDiagType> +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -354,8 +356,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - EIGEN_DEVICE_FUNC - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -404,7 +406,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType> const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD_MATH(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -480,9 +482,10 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template<typename MatrixType, typename DiagType, typename SubDiagType> +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; + EIGEN_USING_STD_MATH(abs); ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -536,7 +539,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -606,7 +609,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3 EIGEN_DEVICE_FUNC static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative) { - using std::abs; + EIGEN_USING_STD_MATH(abs); Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -808,7 +811,7 @@ template<int StorageOrder,typename RealScalar, typename Scalar, typename Index> EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + EIGEN_USING_STD_MATH(abs); RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 
1d102c17b..c5c1acf46 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -25,6 +25,7 @@ struct traits<TridiagonalizationMatrixTReturnType<MatrixType> > }; template<typename MatrixType, typename CoeffVectorType> +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template<typename MatrixType, typename CoeffVectorType> +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector; * \sa class Tridiagonalization */ template<typename MatrixType, typename DiagonalType, typename SubDiagonalType> +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); @@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType; template<typename DiagonalType, typename SubDiagonalType> - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { CoeffVectorType hCoeffs(mat.cols()-1); tridiagonalization_inplace(mat,hCoeffs); @@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex> typedef typename MatrixType::Scalar Scalar; template<typename DiagonalType, typename SubDiagonalType> - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h index f68cab583..d4346aa1c 100644 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -26,17 +26,17 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float> static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b) { Quaternion<float> res; - const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - __m128 a = _a.coeffs().template packet<AAlignment>(0); - __m128 b = _b.coeffs().template packet<BAlignment>(0); - __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); - __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); + const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); + Packet4f a = _a.coeffs().template packet<AAlignment>(0); + Packet4f b = _b.coeffs().template packet<BAlignment>(0); + Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); pstoret<float,Packet4f,ResAlignment>( &res.x(), - _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), - _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), + padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)), + pmul(vec4f_swizzle1(a,2,0,1,0), vec4f_swizzle1(b,1,2,0,0))), - _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); + pxor(mask,padd(s1,s2)))); return res; } diff --git a/Eigen/src/Householder/BlockHouseholder.h 
b/Eigen/src/Householder/BlockHouseholder.h index 01a7ed188..39ce1c2a0 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>(); - // FIXME add .noalias() once the triangular product can work inplace - triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>(); + // FIXME use the following line with .noalias() once the triangular product can work inplace + // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>(); + for(Index j=nbVecs-1; j>i; --j) + { + typename TriangularFactorType::Scalar z = triFactor(i,j); + triFactor(i,j) = z * triFactor(j,j); + if(nbVecs-j-1>0) + triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1); + } } triFactor(i,i) = hCoeffs(i); diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index a5f336d18..5bc037f00 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template<int n> struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template<typename Derived> +EIGEN_DEVICE_FUNC void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 +63,7 @@ void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template<typename Derived> template<typename EssentialPart> +EIGEN_DEVICE_FUNC void MatrixBase<Derived>::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -110,6 +112,7 @@ void MatrixBase<Derived>::makeHouseholder( */ template<typename Derived> template<typename EssentialPart> +EIGEN_DEVICE_FUNC void MatrixBase<Derived>::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -147,6 +150,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft( */ template<typename Derived> template<typename EssentialPart> +EIGEN_DEVICE_FUNC void MatrixBase<Derived>::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index fad1d5ab6..a4f40b75c 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType; typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1); @@ -173,6 +173,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) @@ -180,6 +181,7 @@ 
template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS } /** \brief Copy constructor. */ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), @@ -193,12 +195,14 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index cols() const { return rows(); } /** \brief Essential part of a Householder vector. @@ -215,6 +219,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -252,7 +257,9 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template<typename DestType> inline void evalTo(DestType& dst) const + template<typename DestType> + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix<Scalar, DestType::RowsAtCompileTime, 1, AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows()); @@ -261,6 +268,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS /** \internal */ template<typename Dest, typename Workspace> + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -394,6 +402,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -411,13 +420,17 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. 
*/ /* Necessary for .adjoint() and .conjugate() */ diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index af1228cb8..bba75fc4f 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -90,6 +90,7 @@ template<typename Scalar> class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template<typename Scalar> +EIGEN_DEVICE_FUNC bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; @@ -134,6 +135,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co */ template<typename Scalar> template<typename Derived> +EIGEN_DEVICE_FUNC inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -156,6 +158,7 @@ inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template<typename Scalar> +EIGEN_DEVICE_FUNC void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) { makeGivens(p, q, z, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type()); @@ -164,6 +167,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for complexes template<typename Scalar> +EIGEN_DEVICE_FUNC void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; @@ -223,6 +227,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template<typename Scalar> +EIGEN_DEVICE_FUNC void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -286,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& */ template<typename Derived> template<typename OtherScalar> +EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j) { RowXpr x(this->row(p)); @@ -301,6 +307,7 @@ inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRo */ template<typename Derived> template<typename OtherScalar> +EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j) { ColXpr x(this->col(p)); @@ -314,7 +321,8 @@ template<typename Scalar, typename OtherScalar, int SizeAtCompileTime, int MinAlignment, bool Vectorizable> struct apply_rotation_in_the_plane_selector { - static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { for(Index i=0; i<size; ++i) { @@ -441,6 +449,7 @@ struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime }; template<typename VectorX, typename VectorY, typename OtherScalar> +EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j) { typedef typename VectorX::Scalar Scalar; diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index d6a3c1e5a..6af63a6e7 100644 --- 
a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template<typename Derived> +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase<Derived>& matrix, int a, int b, int c) { @@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper } template<typename Derived> +EIGEN_DEVICE_FUNC const typename Derived::Scalar bruteforce_det4_helper (const MatrixBase<Derived>& matrix, int j, int k, int m, int n) { @@ -44,7 +46,8 @@ template<typename Derived, template<typename Derived> struct determinant_impl<Derived, 1> { - static inline typename traits<Derived>::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits<Derived>::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 +55,8 @@ template<typename Derived> struct determinant_impl<Derived, 1> template<typename Derived> struct determinant_impl<Derived, 2> { - static inline typename traits<Derived>::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits<Derived>::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +64,8 @@ template<typename Derived> struct determinant_impl<Derived, 2> template<typename Derived> struct determinant_impl<Derived, 3> { - static inline typename traits<Derived>::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits<Derived>::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,7 +75,8 @@ template<typename Derived> struct determinant_impl<Derived, 3> template<typename Derived> struct determinant_impl<Derived, 4> { - static typename traits<Derived>::Scalar run(const Derived& m) + static EIGEN_DEVICE_FUNC + typename traits<Derived>::Scalar run(const Derived& m) { // trick by Martin Costabel to compute 4x4 det with only 30 muls return bruteforce_det4_helper(m,0,1,2,3) @@ -89,6 +95,7 @@ template<typename Derived> struct determinant_impl<Derived, 4> * \returns the determinant of this matrix */ template<typename Derived> +EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const { eigen_assert(rows() == cols()); diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index f49f23360..1bab00c01 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -290,6 +290,7 @@ template<typename DstXprType, typename XprType> struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense> { typedef Inverse<XprType> SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &) { Index dstRows = src.rows(); @@ -332,6 +333,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst * \sa computeInverseAndDetWithCheck() */ template<typename Derived> +EIGEN_DEVICE_FUNC inline const Inverse<Derived> MatrixBase<Derived>::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 13c02685d..33cb9c8ff 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -291,7 +291,7 @@ template<typename MatrixQR, typename HCoeffs, bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 
&& HCoeffs::InnerStrideAtCompileTime == 1)> struct householder_qr_inplace_blocked { - // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h + // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32, typename MatrixQR::Scalar* tempData = 0) { diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 7409fcae9..1a28389e8 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -640,7 +640,8 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived // Compute res = Q * other column by column for(Index j = 0; j < res.cols(); j++) { - for (Index k = diagSize-1; k >=0; k--) + Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1; + for (Index k = start_k; k >=0; k--) { Scalar tau = Scalar(0); tau = m_qr.m_Q.col(k).dot(res.col(j)); diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index a5748525b..67fdebc6f 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -1061,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template<int NRows, int NCols> +EIGEN_DEVICE_FUNC inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { diff --git a/test/block.cpp b/test/block.cpp index 8c4dd87be..9c2424662 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -162,9 +162,11 @@ template<typename MatrixType> void block(const MatrixType& m) // expressions without direct access VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) ); VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) ); + VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) ); VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) ); VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() ); VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() ); + VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) ); VERIFY_IS_APPROX( (m1*1).topRows(r1), m1.topRows(r1) ); VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) ); diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index ce66c2c78..33e5fd119 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -121,7 +121,7 @@ struct diagonal { }; template<typename T> -struct eigenvalues { +struct eigenvalues_direct { EIGEN_DEVICE_FUNC void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const { @@ -136,6 +136,34 @@ struct eigenvalues { } }; +template<typename T> +struct eigenvalues { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec; + T M(in+i); + Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime); + T A = M*M.adjoint(); + SelfAdjointEigenSolver<T> eig; + eig.compute(A); + res = eig.eigenvalues(); + } +}; + +template<typename T> +struct matrix_inverse { +
EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T M(in+i); + Map<T> res(out+i*T::MaxSizeAtCompileTime); + res = M.inverse(); + } +}; + void test_cuda_basic() { ei_test_init_cuda(); @@ -163,8 +191,13 @@ void test_cuda_basic() CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix2f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix3f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix4f>(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix3f>(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix2f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix3f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix2f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix4f>(), nthreads, in, out) ); } diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index a2092c43e..a4ff10239 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -30,6 +30,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m) v2 = VectorType::Random(rows); RowVectorType rv1 = RowVectorType::Random(cols), rv2 = RowVectorType::Random(cols); + LeftDiagonalMatrix ldm1(v1), ldm2(v2); RightDiagonalMatrix rdm1(rv1), rdm2(rv2); @@ -107,6 +108,32 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) ); } +template<typename MatrixType> void as_scalar_product(const MatrixType& m) +{ + typedef typename MatrixType::Scalar Scalar; + typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType; + typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType; + typedef Matrix<Scalar, Dynamic, 1> DynVectorType; + typedef Matrix<Scalar, 1, Dynamic> DynRowVectorType; + + Index rows = m.rows(); + Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE); + + VectorType v1 = VectorType::Random(rows); + DynVectorType dv1 = DynVectorType::Random(depth); + DynRowVectorType drv1 = DynRowVectorType::Random(depth); + DynMatrixType dm1 = dv1; + DynMatrixType drm1 = drv1; + + Scalar s = v1(0); + + VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 ); + VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s ); + + VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 ); + VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s ); +} + template<int> void bug987() { @@ -122,14 +149,19 @@ void test_diagonalmatrices() { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( diagonalmatrices(Matrix<float, 1, 1>()) ); + CALL_SUBTEST_1( as_scalar_product(Matrix<float, 1, 1>()) ); + CALL_SUBTEST_2( diagonalmatrices(Matrix3f()) ); CALL_SUBTEST_3( diagonalmatrices(Matrix<double,3,3,RowMajor>()) ); CALL_SUBTEST_4( diagonalmatrices(Matrix4d()) ); CALL_SUBTEST_5( diagonalmatrices(Matrix<float,4,4,RowMajor>()) ); CALL_SUBTEST_6( diagonalmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( as_scalar_product(MatrixXcf(1,1)) ); CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); 
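// What the new as_scalar_product test above verifies, in standalone form:
// a 1x1 diagonal matrix applied to a dynamic-size operand behaves exactly
// like multiplication by a scalar. Public-API sketch, not the test itself.
#include <Eigen/Dense>
#include <cassert>

int main() {
  Eigen::VectorXf v(1);
  v << 2.0f;  // 1x1 vector, viewed as a 1x1 diagonal matrix
  Eigen::RowVectorXf r = Eigen::RowVectorXf::Random(5);
  Eigen::RowVectorXf a = v.asDiagonal() * r;  // (1x1 diagonal) * (1x5)
  assert(a.isApprox(v(0) * r));  // same as scaling by the single coefficient
  return 0;
}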
CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) ); + CALL_SUBTEST_9( as_scalar_product(MatrixXf(1,1)) ); } CALL_SUBTEST_10( bug987<0>() ); } diff --git a/test/half_float.cpp b/test/half_float.cpp index 1b0ea9482..5a881680a 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -257,13 +257,31 @@ void test_array() ss << a1; } +void test_product() +{ + typedef Matrix<half,Dynamic,Dynamic> MatrixXh; + Index rows = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE); + Index cols = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE); + Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE); + MatrixXh Ah = MatrixXh::Random(rows,depth); + MatrixXh Bh = MatrixXh::Random(depth,cols); + MatrixXh Ch = MatrixXh::Random(rows,cols); + MatrixXf Af = Ah.cast<float>(); + MatrixXf Bf = Bh.cast<float>(); + MatrixXf Cf = Ch.cast<float>(); + VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast<half>()); +} + void test_half_float() { - CALL_SUBTEST(test_conversion()); CALL_SUBTEST(test_numtraits()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_basic_functions()); - CALL_SUBTEST(test_trigonometric_functions()); - CALL_SUBTEST(test_array()); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_basic_functions()); + CALL_SUBTEST(test_trigonometric_functions()); + CALL_SUBTEST(test_array()); + CALL_SUBTEST(test_product()); + } } diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp index 2c7838ce9..c4e2fbc92 100644 --- a/test/is_same_dense.cpp +++ b/test/is_same_dense.cpp @@ -14,9 +14,13 @@ using internal::is_same_dense; void test_is_same_dense() { typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd; + typedef Matrix<std::complex<double>,Dynamic,Dynamic,ColMajor> ColMatrixXcd; ColMatrixXd m1(10,10); + ColMatrixXcd m2(10,10); Ref<ColMatrixXd> ref_m1(m1); + Ref<ColMatrixXd,0, Stride<Dynamic,Dynamic> > ref_m2_real(m2.real()); Ref<const ColMatrixXd> const_ref_m1(m1); + VERIFY(is_same_dense(m1,m1)); VERIFY(is_same_dense(m1,ref_m1)); VERIFY(is_same_dense(const_ref_m1,m1)); @@ -30,4 +34,8 @@ void test_is_same_dense() Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1)); VERIFY(is_same_dense(m1.col(1),const_ref_m1_col)); + + + VERIFY(!is_same_dense(m1, ref_m2_real)); + VERIFY(!is_same_dense(m2, ref_m2_real)); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 4b19be92d..56e017383 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -123,7 +123,7 @@ template<typename Scalar> void packetmath() EIGEN_ALIGN_MAX Scalar data2[size]; EIGEN_ALIGN_MAX Packet packets[PacketSize*2]; EIGEN_ALIGN_MAX Scalar ref[size]; - RealScalar refvalue = 0; + RealScalar refvalue = RealScalar(0); for (int i=0; i<size; ++i) { data1[i] = internal::random<Scalar>()/RealScalar(PacketSize); @@ -171,14 +171,18 @@ template<typename Scalar> void packetmath() for (int i=0; i<PacketSize; ++i) ref[i] = data1[i+offset]; + // palign is not used anymore, so let's just put a warning if it fails + ++g_test_level; VERIFY(areApprox(ref, data2, PacketSize) && "internal::palign"); + --g_test_level; } VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd); 
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub); VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul); VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate); - VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); + // Disabled as it is not clear why it would be mandatory to support division. + //VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd); CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub); @@ -242,29 +246,30 @@ template<typename Scalar> void packetmath() } } - ref[0] = 0; + ref[0] = Scalar(0); for (int i=0; i<PacketSize; ++i) ref[0] += data1[i]; VERIFY(isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux"); + if(PacketSize==8 && internal::unpacket_traits<typename internal::unpacket_traits<Packet>::half>::size ==4) // so far, predux_half_dowto4 is only required in such a case { int HalfPacketSize = PacketSize>4 ? PacketSize/2 : PacketSize; for (int i=0; i<HalfPacketSize; ++i) - ref[i] = 0; + ref[i] = Scalar(0); for (int i=0; i<PacketSize; ++i) ref[i%HalfPacketSize] += data1[i]; internal::pstore(data2, internal::predux_half_dowto4(internal::pload<Packet>(data1))); VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4"); } - ref[0] = 1; + ref[0] = Scalar(1); for (int i=0; i<PacketSize; ++i) ref[0] *= data1[i]; VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul"); for (int j=0; j<PacketSize; ++j) { - ref[j] = 0; + ref[j] = Scalar(0); for (int i=0; i<PacketSize; ++i) ref[j] += data1[i+j*PacketSize]; packets[j] = internal::pload<Packet>(data1+j*PacketSize); @@ -630,6 +635,7 @@ void test_packetmath() CALL_SUBTEST_3( packetmath<int>() ); CALL_SUBTEST_4( packetmath<std::complex<float> >() ); CALL_SUBTEST_5( packetmath<std::complex<double> >() ); + CALL_SUBTEST_6( packetmath<half>() ); CALL_SUBTEST_1( packetmath_notcomplex<float>() ); CALL_SUBTEST_2( packetmath_notcomplex<double>() ); diff --git a/test/product.h b/test/product.h index 0425a929e..d26e8063d 100644 --- a/test/product.h +++ b/test/product.h @@ -111,6 +111,17 @@ template<typename MatrixType> void product(const MatrixType& m) vcres.noalias() -= m1.transpose() * v1; VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1); + // test scaled products + res = square; + res.noalias() = s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose())); + res = square; + res.noalias() += s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose())); + res = square; + res.noalias() -= s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose())); + // test d ?= a+b*c rules res.noalias() = square + m1 * m2.transpose(); VERIFY_IS_APPROX(res, square + m1 * m2.transpose()); diff --git a/test/product_large.cpp b/test/product_large.cpp index 845cd40ca..14a4f739d 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -35,6 +35,8 @@ void test_product_large() for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), 
internal::random<int>(1,10))) ); + CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index 30592b79e..062180f42 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -128,11 +128,19 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 ); // Check outer products + #ifdef EIGEN_ALLOCA + bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT; + #else + bool temp_via_alloca = false; + #endif m3 = cv1 * rv1; VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 ); - VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 ); + VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 ); VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 ); VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 ); + rm3 = cv1 * rv1; + VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 ); + VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 ); VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 ); VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 ); VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 ); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9ab6b3565..b35b36475 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -161,6 +161,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } } } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims-1; ++i) { + if (broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -256,18 +272,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - if (oneByN) { + if (oneByN && !nByOne) { return packetNByOne<LoadMode>(index); - } else if (nByOne) { + } else if (!oneByN && nByOne) { return packetOneByN<LoadMode>(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne<LoadMode>(index); } else { return packetColMajor<LoadMode>(index); } } else { - if (oneByN) { + if (oneByN && !nByOne) { return packetOneByN<LoadMode>(index); - } else if (nByOne) { + } else if (!oneByN && nByOne) { return packetNByOne<LoadMode>(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne<LoadMode>(index); } else { return packetRowMajor<LoadMode>(index); } @@ -275,6 +295,48 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index ca9ba402e..90fd99027 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -189,9 +189,11 @@ struct ThreadPoolDevice {
    // of blocks to be evenly dividable across threads.
    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
-    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
-    const Index max_block_size =
-        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
+                               block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
    if (block_align) {
      Index new_block_size = block_align(block_size);
      eigen_assert(new_block_size >= block_size);
@@ -205,7 +207,8 @@ struct ThreadPoolDevice {
        (divup<int>(block_count, numThreads()) * numThreads());
    // Now try to increase block size up to max_block_size as long as it
    // doesn't decrease parallel efficiency.
-    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+    for (Index prev_block_count = block_count;
+         max_efficiency < 1.0 && prev_block_count > 1;) {
      // This is the next block size that divides size into a smaller number
      // of blocks than the current block_size.
      Index coarser_block_size = divup(n, prev_block_count - 1);
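The TensorDeviceThreadPool.h change above clamps how finely parallelFor may shard the range: even when the cost model asks for very small blocks, the block size is kept at or above divup(n, max_oversharding_factor * numThreads()), so at most 4 blocks are created per thread. A standalone sketch of that arithmetic (plain C++, not the Eigen API; the numbers are illustrative):

    // block_size_sketch.cpp (hypothetical example of the oversharding clamp)
    #include <algorithm>
    #include <cstdio>

    // Rounding-up integer division, as Eigen's divup computes.
    long divup(long x, long y) { return (x + y - 1) / y; }

    int main()
    {
      const long n = 1000;                    // total iterations to shard
      const long num_threads = 16;            // pool size (illustrative)
      const long max_oversharding_factor = 4;
      const long cost_model_block = 10;       // tiny block suggested by the cost model

      // Lower-bound the block size so no more than 4*num_threads blocks exist,
      // and never exceed n iterations in a single block.
      const long block_size =
          std::min(n, std::max(divup(n, max_oversharding_factor * num_threads),
                               cost_model_block));
      std::printf("block_size = %ld\n", block_size);  // prints 16: divup(1000, 64)
      return 0;
    }

With the old code the cost model alone could produce 100 blocks here (1000/10); the clamp caps the block count at 64, trading a little load-balancing granularity for less scheduling overhead.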
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index be51b4e6f..079e88602 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -316,8 +316,8 @@ struct kissfft_impl
        // use optimized mode for even real
        fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-        Complex dc = dst[0].real() + dst[0].imag();
-        Complex nyquist = dst[0].real() - dst[0].imag();
+        Complex dc(dst[0].real() + dst[0].imag());
+        Complex nyquist(dst[0].real() - dst[0].imag());
        int k;
        for ( k=1;k <= ncfft2 ; ++k )
        {
          Complex fpk = dst[k];
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 97e0669a6..76d6f5e5b 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -124,6 +124,7 @@ ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
 ei_add_test(splines)
 ei_add_test(gmres)
+ei_add_test(dgmres)
 ei_add_test(minres)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index a9d268ea6..f0ff03184 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -238,6 +238,59 @@ static void test_simple_broadcasting_n_by_one()
  }
}

+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_1d()
+{
+  Tensor<float, 3, DataLayout> tensor(1,7,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 3> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 13;
+  Tensor<float, 3, DataLayout> broadcasted;
+  broadcasted = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcasted.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcasted.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcasted.dimension(2), 13);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_2d()
+{
+  Tensor<float, 4, DataLayout> tensor(1,7,13,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}

void test_cxx11_tensor_broadcasting()
{
@@ -253,4 +306,8 @@ void test_cxx11_tensor_broadcasting()
  CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
  CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
  CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
}
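A closing note on the ei_kissfft_impl.h hunk above: besides replacing copy-initialization with direct construction, those two lines encode the standard trick for an even-length real FFT computed through a half-length complex FFT. If the 2*ncfft real samples are packed as z[n] = x[2n] + i*x[2n+1], then bin 0 of the half-size transform yields both edge bins of the full spectrum: X[0] = Re(Z[0]) + Im(Z[0]) and X[ncfft] = Re(Z[0]) - Im(Z[0]), both purely real. A self-contained check of that identity (not the library's API; the signal values are arbitrary):

    // fft_edge_bins_sketch.cpp (hypothetical verification of the dc/nyquist identity)
    #include <complex>
    #include <cstdio>
    #include <vector>

    int main()
    {
      const std::vector<double> x = {1, 2, 3, 4, 5, 6};  // even length 2*ncfft
      const std::size_t ncfft = x.size() / 2;

      // Z[0] of the packed half-size FFT is the plain sum of the packed samples.
      std::complex<double> z0(0, 0);
      for (std::size_t n = 0; n < ncfft; ++n)
        z0 += std::complex<double>(x[2*n], x[2*n+1]);

      const double dc = z0.real() + z0.imag();       // X[0]: sum of all samples
      const double nyquist = z0.real() - z0.imag();  // X[ncfft]: alternating sum

      // Check against the direct definitions of the DC and Nyquist bins.
      double sum = 0, alt = 0;
      for (std::size_t n = 0; n < x.size(); ++n) {
        sum += x[n];
        alt += (n % 2 == 0) ? x[n] : -x[n];
      }
      std::printf("dc %g == %g, nyquist %g == %g\n", dc, sum, nyquist, alt);
      return 0;
    }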