Diffstat (limited to 'Eigen')
-rw-r--r--  Eigen/Cholesky | 5
-rw-r--r--  Eigen/Core | 34
-rw-r--r--  Eigen/Eigenvalues | 4
-rw-r--r--  Eigen/LU | 4
-rw-r--r--  Eigen/QR | 4
-rw-r--r--  Eigen/QtAlignedMalloc | 2
-rw-r--r--  Eigen/SVD | 4
-rw-r--r--  Eigen/src/Cholesky/LDLT.h | 2
-rw-r--r--  Eigen/src/Cholesky/LLT.h | 26
-rw-r--r--  Eigen/src/CholmodSupport/CholmodSupport.h | 20
-rw-r--r--  Eigen/src/Core/Array.h | 8
-rw-r--r--  Eigen/src/Core/ArrayWrapper.h | 6
-rw-r--r--  Eigen/src/Core/AssignEvaluator.h | 4
-rw-r--r--  Eigen/src/Core/CwiseNullaryOp.h | 36
-rw-r--r--  Eigen/src/Core/DenseBase.h | 2
-rw-r--r--  Eigen/src/Core/EigenBase.h | 1
-rw-r--r--  Eigen/src/Core/GeneralProduct.h | 32
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 4
-rw-r--r--  Eigen/src/Core/GlobalFunctions.h | 49
-rw-r--r--  Eigen/src/Core/Map.h | 15
-rw-r--r--  Eigen/src/Core/MathFunctions.h | 98
-rw-r--r--  Eigen/src/Core/MatrixBase.h | 15
-rw-r--r--  Eigen/src/Core/NoAlias.h | 3
-rw-r--r--  Eigen/src/Core/NumTraits.h | 2
-rw-r--r--  Eigen/src/Core/PlainObjectBase.h | 4
-rw-r--r--  Eigen/src/Core/ProductEvaluators.h | 10
-rw-r--r--  Eigen/src/Core/Solve.h | 4
-rw-r--r--  Eigen/src/Core/StableNorm.h | 3
-rw-r--r--  Eigen/src/Core/arch/AVX/Complex.h | 36
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h | 35
-rw-r--r--  Eigen/src/Core/arch/CUDA/Complex.h | 6
-rw-r--r--  Eigen/src/Core/arch/CUDA/Half.h | 170
-rw-r--r--  Eigen/src/Core/arch/CUDA/MathFunctions.h | 2
-rw-r--r--  Eigen/src/Core/arch/CUDA/PacketMath.h | 10
-rw-r--r--  Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 33
-rw-r--r--  Eigen/src/Core/arch/CUDA/TypeCasting.h | 8
-rw-r--r--  Eigen/src/Core/arch/Default/ConjHelper.h | 29
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h | 4
-rw-r--r--  Eigen/src/Core/arch/NEON/MathFunctions.h | 92
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 17
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h | 36
-rw-r--r--  Eigen/src/Core/arch/ZVector/Complex.h | 407
-rw-r--r--  Eigen/src/Core/arch/ZVector/MathFunctions.h | 105
-rwxr-xr-x  Eigen/src/Core/arch/ZVector/PacketMath.h | 840
-rw-r--r--  Eigen/src/Core/functors/AssignmentFunctors.h | 2
-rw-r--r--  Eigen/src/Core/functors/NullaryFunctors.h | 11
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h | 12
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 13
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h | 10
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h | 19
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector_BLAS.h | 19
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h | 48
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h | 9
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix.h | 11
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h | 39
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixVector_BLAS.h | 46
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h | 40
-rwxr-xr-x  Eigen/src/Core/util/DisableStupidWarnings.h | 2
-rwxr-xr-x  Eigen/src/Core/util/MKL_support.h | 19
-rw-r--r--  Eigen/src/Core/util/Macros.h | 29
-rw-r--r--  Eigen/src/Core/util/Memory.h | 12
-rwxr-xr-x  Eigen/src/Core/util/Meta.h | 14
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h | 116
-rw-r--r--  Eigen/src/Core/util/XprHelper.h | 12
-rw-r--r--  Eigen/src/Geometry/AngleAxis.h | 2
-rw-r--r--  Eigen/src/Geometry/Quaternion.h | 39
-rw-r--r--  Eigen/src/Geometry/arch/Geometry_SSE.h | 60
-rw-r--r--  Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h | 27
-rw-r--r--  Eigen/src/Jacobi/Jacobi.h | 255
-rw-r--r--  Eigen/src/OrderingMethods/Eigen_Colamd.h | 2
-rw-r--r--  Eigen/src/QR/ColPivHouseholderQR.h | 12
-rw-r--r--  Eigen/src/SVD/BDCSVD.h | 245
-rw-r--r--  Eigen/src/SVD/UpperBidiagonalization.h | 4
-rw-r--r--  Eigen/src/SparseCore/AmbiVector.h | 2
-rw-r--r--  Eigen/src/SparseCore/ConservativeSparseSparseProduct.h | 67
-rw-r--r--  Eigen/src/SparseCore/SparseAssign.h | 4
-rw-r--r--  Eigen/src/SparseCore/SparseSelfAdjointView.h | 3
-rw-r--r--  Eigen/src/SparseCore/SparseSparseProductWithPruning.h | 22
-rw-r--r--  Eigen/src/SparseQR/SparseQR.h | 2
-rwxr-xr-x  Eigen/src/misc/lapacke.h | 9
-rw-r--r--  Eigen/src/plugins/IndexedViewMethods.h | 12
81 files changed, 2283 insertions, 1198 deletions
diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index 369d1f5ec..1332b540d 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -9,6 +9,7 @@
#define EIGEN_CHOLESKY_MODULE_H
#include "Core"
+#include "Jacobi"
#include "src/Core/util/DisableStupidWarnings.h"
@@ -31,7 +32,11 @@
#include "src/Cholesky/LLT.h"
#include "src/Cholesky/LDLT.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/Cholesky/LLT_LAPACKE.h"
#endif
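Editor's note: the hunk above (and the analogous ones in Eigenvalues, LU, QR and SVD below) makes the LAPACKE-backed decompositions take their prototypes from MKL's own mkl_lapacke.h when MKL is in use. A minimal usage sketch, assuming MKL is installed and linked; the matrix sizes and values are purely illustrative:

// EIGEN_USE_MKL_ALL enables the MKL backends (it implies EIGEN_USE_LAPACKE and
// EIGEN_USE_MKL), so the #ifdef above selects mkl_lapacke.h. It is usually
// passed as a compiler flag, e.g. -DEIGEN_USE_MKL_ALL, together with the MKL
// include and link flags.
#define EIGEN_USE_MKL_ALL
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
  A = A * A.transpose() + 100.0 * Eigen::MatrixXd::Identity(100, 100);  // make A positive definite
  Eigen::LLT<Eigen::MatrixXd> llt(A);  // LAPACKE-backed factorization when enabled
  std::cout << (llt.info() == Eigen::Success ? "ok" : "failed") << "\n";
  return 0;
}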
diff --git a/Eigen/Core b/Eigen/Core
index d18835613..c66359b79 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,8 +14,25 @@
// first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h"
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+ #define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+ #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_CUDACC_VER 0
+#endif
+
// Handle NVCC/CUDA/SYCL
-#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
// Do not try asserts on CUDA and SYCL!
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
@@ -30,7 +47,7 @@
#endif
// All functions callable from CUDA code must be qualified with __device__
- #ifdef __CUDACC__
+ #ifdef EIGEN_CUDACC
// Do not try to vectorize on CUDA and SYCL!
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
@@ -47,16 +64,20 @@
#define EIGEN_DEVICE_FUNC
#endif
+#ifdef __NVCC__
+#define EIGEN_DONT_VECTORIZE
+#endif
+
// When compiling CUDA device code with NVCC, pull in math functions from the
// global namespace. In host mode, and when device code is compiled with clang, use the
// std versions.
-#if defined(__CUDA_ARCH__) && defined(__NVCC__)
+#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
#define EIGEN_EXCEPTIONS
#endif
@@ -233,10 +254,10 @@
#define EIGEN_HAS_FP16_C
#endif
-#if defined __CUDACC__
+#if defined EIGEN_CUDACC
#define EIGEN_VECTORIZE_CUDA
#include <vector_types.h>
- #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+ #if EIGEN_CUDACC_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif
@@ -371,6 +392,7 @@ using std::ptrdiff_t;
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
#include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/SSE/PacketMath.h"
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 009e529e1..f3f661b07 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -45,7 +45,11 @@
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
diff --git a/Eigen/LU b/Eigen/LU
index 6f6c55629..6418a86e1 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -28,7 +28,11 @@
#include "src/LU/FullPivLU.h"
#include "src/LU/PartialPivLU.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/LU/PartialPivLU_LAPACKE.h"
#endif
#include "src/LU/Determinant.h"
diff --git a/Eigen/QR b/Eigen/QR
index 80838e3bd..c7e914469 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -36,7 +36,11 @@
#include "src/QR/ColPivHouseholderQR.h"
#include "src/QR/CompleteOrthogonalDecomposition.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/QR/HouseholderQR_LAPACKE.h"
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
#endif
diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index c6571f129..4f07df02a 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -27,7 +27,7 @@ void qFree(void *ptr)
void *qRealloc(void *ptr, std::size_t size)
{
void* newPtr = Eigen::internal::aligned_malloc(size);
- memcpy(newPtr, ptr, size);
+ std::memcpy(newPtr, ptr, size);
Eigen::internal::aligned_free(ptr);
return newPtr;
}
diff --git a/Eigen/SVD b/Eigen/SVD
index 86143c23d..5d0e75f7f 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -37,7 +37,11 @@
#include "src/SVD/JacobiSVD.h"
#include "src/SVD/BDCSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/SVD/JacobiSVD_LAPACKE.h"
#endif
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 9b4fdb414..968427b3a 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,
- * \c NumericalIssue if the matrix.appears to be negative.
+ * \c NumericalIssue if the factorization failed because of a zero pivot.
*/
ComputationInfo info() const
{
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index e6c02d803..814174d47 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decomposition: Lower (default) or Upper.
- * The other triangular part won't be read.
+ * The other triangular part won't be read.
*
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
* matrix A such that A = LL^* = U^*U, where L is lower triangular.
@@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
*
+ * \b Performance: for best performance, it is recommended to use a column-major storage format
+ * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+ * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+ * step, and rank-updates can be up to 3 times slower.
+ *
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
*
+ * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
+ * Therefore, the strict lower part does not have to store correct values.
+ *
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
- /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
- * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
- * the strict lower part does not have to store correct values.
- */
template<typename _MatrixType, int _UpLo> class LLT
{
public:
@@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
}
template<typename Derived>
- void solveInPlace(MatrixBase<Derived> &bAndX) const;
+ void solveInPlace(const MatrixBase<Derived> &bAndX) const;
template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix);
@@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,
- * \c NumericalIssue if the matrix.appears to be negative.
+ * \c NumericalIssue if the matrix appears not to be positive definite.
*/
ComputationInfo info() const
{
@@ -424,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
- m_matrix = a.derived();
+ if (!internal::is_same_dense(m_matrix, a.derived()))
+ m_matrix = a.derived();
// Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0);
@@ -484,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
*
* This version avoids a copy when the right hand side matrix b is not needed anymore.
*
+ * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+ * This function will const_cast it, so constness isn't honored here.
+ *
* \sa LLT::solve(), MatrixBase::llt()
*/
template<typename MatrixType, int _UpLo>
template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_matrix.rows()==bAndX.rows());
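Editor's note: a short sketch of what the const-reference signature above enables (function and variable names are illustrative, and rhs is assumed to have as many rows as A_spd). solveInPlace() can now be called directly on a temporary expression such as a block, which is then overwritten with the solution:

#include <Eigen/Dense>

void solve_first_columns(const Eigen::MatrixXd& A_spd, Eigen::MatrixXd& rhs) {
  Eigen::LLT<Eigen::MatrixXd> llt(A_spd);
  // rhs.leftCols(2) is a temporary Block expression; it binds to the const
  // reference and is still written through (the const is cast away internally).
  llt.solveInPlace(rhs.leftCols(2));
}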
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index 61faf43ba..dc199ece6 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -167,12 +167,12 @@ namespace internal {
// template specializations for int and long that call the correct cholmod method
#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
- template<typename _StorageIndex> ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
- template<> ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
+ template<typename _StorageIndex> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
+ template<> inline ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
- template<typename _StorageIndex> ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
- template<> ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
+ template<typename _StorageIndex> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
+ template<> inline ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
EIGEN_CHOLMOD_SPECIALIZE0(int, start)
EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
@@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
-template<typename _StorageIndex> cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
-template<> cholmod_dense* cm_solve<long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
+template<typename _StorageIndex> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
+template<> inline cholmod_dense* cm_solve<long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
-template<typename _StorageIndex> cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
-template<> cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
+template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
+template<> inline cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
template<typename _StorageIndex>
-int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
+inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
template<>
-int cm_factorize_p<long> (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
+inline int cm_factorize_p<long> (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
#undef EIGEN_CHOLMOD_SPECIALIZE0
#undef EIGEN_CHOLMOD_SPECIALIZE1
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 0d34269fd..e10020d4f 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -231,10 +231,16 @@ class Array
: Base(other)
{ }
+ private:
+ struct PrivateType {};
+ public:
+
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+ EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+ typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+ PrivateType>::type = PrivateType())
: Base(other.derived())
{ }
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index a04521a16..688aadd62 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
- Flags = Flags0 & ~NestByRefBit
+ LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+ Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}
@@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
- Flags = Flags0 & ~NestByRefBit
+ LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+ Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index b0ec7b7ca..dbe435d86 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -39,7 +39,7 @@ public:
enum {
DstAlignment = DstEvaluator::Alignment,
SrcAlignment = SrcEvaluator::Alignment,
- DstHasDirectAccess = DstFlags & DirectAccessBit,
+ DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
};
@@ -83,7 +83,7 @@ private:
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
&& (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
- MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+ MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
&& (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 144608ec2..b1923da0f 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -861,6 +861,42 @@ template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
{ return Derived::Unit(3); }
+/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
+ *
+ * \param i index of the unique coefficient to be set to 1
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+ */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
+{
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+ eigen_assert(i<size());
+ derived().setZero();
+ derived().coeffRef(i) = Scalar(1);
+ return derived();
+}
+
+/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
+ *
+ * \param newSize the new size of the vector
+ * \param i index of the unique coefficient to be set to 1
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+ */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
+{
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+ eigen_assert(i<newSize);
+ derived().resize(newSize);
+ return setUnit(i);
+}
+
} // end namespace Eigen
#endif // EIGEN_CWISE_NULLARY_OP_H
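Editor's note: a minimal usage sketch for the two setUnit() overloads added above (vector sizes and indices are illustrative):

#include <Eigen/Dense>

int main() {
  Eigen::VectorXd v(5);
  v.setUnit(2);     // v becomes (0, 0, 1, 0, 0)
  v.setUnit(8, 3);  // resizes v to 8, sets coefficient 3 to 1 and the rest to 0
  return 0;
}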
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 91a8511be..fd933eed4 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -296,7 +296,7 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func);
- /** \ínternal
+ /** \internal
* Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */
template<typename OtherDerived>
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index ccc122cfc..b195506a9 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -14,6 +14,7 @@
namespace Eigen {
/** \class EigenBase
+ * \ingroup Core_Module
*
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
*
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index b206b0a7a..694f7cbde 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -18,18 +18,33 @@ enum {
Small = 3
};
+// Define the threshold value to fallback from the generic matrix-matrix product
+// implementation (heavy) to the lightweight coeff-based product one.
+// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
+
namespace internal {
template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category
{
- enum { is_large = MaxSize == Dynamic ||
- Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
- (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
- value = is_large ? Large
- : Size == 1 ? 1
- : Small
+ enum {
+ #ifndef EIGEN_CUDA_ARCH
+ is_large = MaxSize == Dynamic ||
+ Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+ (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+ #else
+ is_large = 0,
+ #endif
+ value = is_large ? Large
+ : Size == 1 ? 1
+ : Small
};
};
@@ -379,10 +394,9 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
*
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/
-#ifndef __CUDACC__
-
template<typename Derived>
template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
inline const Product<Derived, OtherDerived>
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
{
@@ -412,8 +426,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived());
}
-#endif // __CUDACC__
-
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be
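Editor's note: the EIGEN_GEMM_TO_COEFFBASED_THRESHOLD macro introduced at the top of this file can be overridden by the user before any Eigen header is included; a sketch (the value 32 is purely illustrative, not a tuned recommendation):

// Products with small dimensions, on the order of this threshold, fall back
// from the blocked GEMM kernel to the lightweight coefficient-based path.
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 32  // or -DEIGEN_GEMM_TO_COEFFBASED_THRESHOLD=32
#include <Eigen/Dense>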
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index d19d5bbd2..30878eda6 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
#if defined(__LP64__)
// 64-bit pointer operand constraint for inlined asm
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@@ -526,7 +526,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/
// Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 12828a7c3..50406400b 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -103,17 +103,18 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
#else
- template<typename Derived,typename ScalarExponent>
- inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
- const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
- pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
- return x.derived().pow(exponent);
- }
-
- template<typename Derived>
- inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
- pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
- return x.derived().pow(exponent);
+ template <typename Derived,typename ScalarExponent>
+ EIGEN_DEVICE_FUNC inline
+ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
+ EIGEN_COMMA ScalarExponent EIGEN_COMMA
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
+ pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
+ {
+ typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
+ return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
+ typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
}
#endif
@@ -156,21 +157,17 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
#else
- template<typename Scalar, typename Derived>
- inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
- const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
- pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
- {
- return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
- typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
- }
-
- template<typename Derived>
- inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
- pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
- {
- return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
- typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
+ template <typename Scalar, typename Derived>
+ EIGEN_DEVICE_FUNC inline
+ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+ const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
+ EIGEN_COMMA Scalar EIGEN_COMMA
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
+ pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
+ typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
+ return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
+ typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
}
#endif
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 06d196702..c437f1a92 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{
typedef traits<PlainObjectType> TraitsBase;
enum {
+ PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
+ ? PlainObjectType::ColsAtCompileTime
+ : PlainObjectType::RowsAtCompileTime,
+
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
- ? int(PlainObjectType::OuterStrideAtCompileTime)
+ ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
+ ? Dynamic
+ : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
@@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
- : IsVectorAtCompileTime ? this->size()
- : int(Flags)&RowMajorBit ? this->cols()
- : this->rows();
+ : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+ : IsVectorAtCompileTime ? (this->size() * innerStride())
+ : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
+ : (this->rows() * innerStride());
}
/** Constructor in the fixed-size case.
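Editor's note: a sketch of the fixed-size stride computation changed above (the data layout and values are illustrative). With an inner stride, the default outer stride of a fixed-size Map is now derived from the inner stride times the inner size, rather than the plain object's own outer stride:

#include <Eigen/Dense>
#include <iostream>

int main() {
  double data[24];
  for (int i = 0; i < 24; ++i) data[i] = i;
  // View every other double as a fixed-size 3x4 column-major matrix.
  Eigen::Map<Eigen::Matrix<double, 3, 4>, 0, Eigen::InnerStride<2> > m(data);
  std::cout << m.outerStride() << "\n";  // 6 == inner stride (2) * rows (3)
  return 0;
}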
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 7a6b999af..5ba5293a0 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
template<typename T>
struct real_impl<std::complex<T> >
{
@@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
template<typename T>
struct imag_impl<std::complex<T> >
{
@@ -512,7 +512,7 @@ namespace std_fallback {
template<typename Scalar>
struct expm1_impl {
- static inline Scalar run(const Scalar& x)
+ EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
@@ -549,7 +549,7 @@ namespace std_fallback {
template<typename Scalar>
struct log1p_impl {
- static inline Scalar run(const Scalar& x)
+ EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
@@ -778,7 +778,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x)
{
- #ifdef __CUDA_ARCH__
+ #ifdef EIGEN_CUDA_ARCH
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
@@ -793,7 +793,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x)
{
- #ifdef __CUDA_ARCH__
+ #ifdef EIGEN_CUDA_ARCH
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
@@ -808,7 +808,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x)
{
- #ifdef __CUDA_ARCH__
+ #ifdef EIGEN_CUDA_ARCH
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
@@ -874,7 +874,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -1059,6 +1059,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}
+EIGEN_DEVICE_FUNC
+inline bool abs2(bool x) { return x; }
+
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@@ -1085,7 +1088,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); }
@@ -1143,7 +1146,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); }
@@ -1164,7 +1167,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); }
@@ -1222,7 +1225,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); }
@@ -1232,17 +1235,25 @@ double log(const double &x) { return ::log(x); }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-typename NumTraits<T>::Real abs(const T &x) {
+typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
+abs(const T &x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
+abs(const T &x) {
+ return x;
+}
+
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }
@@ -1272,7 +1283,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); }
@@ -1292,7 +1303,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); }
@@ -1312,7 +1323,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); }
@@ -1332,7 +1343,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); }
@@ -1352,7 +1363,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); }
@@ -1367,12 +1378,23 @@ T acos(const T &x) {
return acos(x);
}
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acosh(const T &x) {
+ EIGEN_USING_STD_MATH(acosh);
+ return acosh(x);
+}
+#endif
+
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
EIGEN_ALWAYS_INLINE double acos(double x) { return cl::sycl::acos(x); }
+EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
+EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); }
@@ -1387,12 +1409,23 @@ T asin(const T &x) {
return asin(x);
}
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asinh(const T &x) {
+ EIGEN_USING_STD_MATH(asinh);
+ return asinh(x);
+}
+#endif
+
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
EIGEN_ALWAYS_INLINE double asin(double x) { return cl::sycl::asin(x); }
+EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
+EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); }
@@ -1407,12 +1440,23 @@ T atan(const T &x) {
return atan(x);
}
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atanh(const T &x) {
+ EIGEN_USING_STD_MATH(atanh);
+ return atanh(x);
+}
+#endif
+
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
EIGEN_ALWAYS_INLINE double atan(double x) { return cl::sycl::atan(x); }
+EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
+EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); }
@@ -1433,7 +1477,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); }
@@ -1453,7 +1497,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); }
@@ -1471,12 +1515,12 @@ T tanh(const T &x) {
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
-#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); }
@@ -1496,7 +1540,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) {
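Editor's note: a small sketch exercising the numext additions above, namely abs2 on bool and, when C++11 math is available, the acosh/asinh/atanh wrappers (the inputs and printed values are illustrative):

#include <Eigen/Core>
#include <iostream>

int main() {
  std::cout << Eigen::numext::abs2(true) << "\n";  // 1: bool is returned unchanged
#if EIGEN_HAS_CXX11_MATH
  std::cout << Eigen::numext::asinh(1.0) << "\n";  // ~0.881374
  std::cout << Eigen::numext::atanh(0.5) << "\n";  // ~0.549306
#endif
  return 0;
}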
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 200e57741..11435903b 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other);
-#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
- const Product<Derived,OtherDerived,LazyProduct>
- operator*(const MatrixBase<OtherDerived> &other) const
- { return this->lazyProduct(other); }
-#else
-
- template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;
-#endif
-
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
@@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
Derived& setIdentity();
EIGEN_DEVICE_FUNC
Derived& setIdentity(Index rows, Index cols);
+ EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
+ EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -305,7 +298,7 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); }
- NoAlias<Derived,Eigen::MatrixBase > noalias();
+ NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
// TODO forceAlignedAccess is temporarily disabled
// Need to find a nicer workaround.
@@ -437,8 +430,10 @@ template<typename Derived> class MatrixBase
///////// Jacobi module /////////
template<typename OtherScalar>
+ EIGEN_DEVICE_FUNC
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
template<typename OtherScalar>
+ EIGEN_DEVICE_FUNC
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
///////// SparseCore module /////////
diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index 33908010b..e94c8ee96 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -33,6 +33,7 @@ class NoAlias
public:
typedef typename ExpressionType::Scalar Scalar;
+ EIGEN_DEVICE_FUNC
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
template<typename OtherDerived>
@@ -98,7 +99,7 @@ class NoAlias
* \sa class NoAlias
*/
template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
{
return NoAlias<Derived, Eigen::MatrixBase >(derived());
}
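Editor's note: taken together, the EIGEN_DEVICE_FUNC annotations above (operator*, noalias(), the NoAlias constructor) allow small fixed-size products to be written inside CUDA device code. A hedged sketch compiled with nvcc; the kernel name and sizes are illustrative:

#include <Eigen/Dense>

__global__ void mul3x3(const Eigen::Matrix3f* a, const Eigen::Matrix3f* b,
                       Eigen::Matrix3f* c) {
  // On the device, product_size_category reports is_large = 0 (see the
  // GeneralProduct.h hunk above), so this uses the coefficient-based product.
  c->noalias() = (*a) * (*b);
}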
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index aebc0c259..92a9ae1ea 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -215,6 +215,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
EIGEN_DEVICE_FUNC
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+ static inline int digits10() { return NumTraits<Scalar>::digits10(); }
};
template<> struct NumTraits<std::string>
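Editor's note: a one-line sketch of the digits10() forwarding added above (the result depends on the scalar type; for double it is 15 on typical platforms):

#include <Eigen/Core>
#include <iostream>

int main() {
  // Forwards to NumTraits<double>::digits10().
  std::cout << Eigen::NumTraits<Eigen::ArrayXd>::digits10() << "\n";
  return 0;
}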
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 77f4f6066..1dc7e223a 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers.
*
+ * Here is an example using strides:
+ * \include Matrix_Map_stride.cpp
+ * Output: \verbinclude Matrix_Map_stride.out
+ *
* \see class Map
*/
//@{
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 583b7f59e..86966abdb 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -207,6 +207,12 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
static const bool value = true;
};
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+ const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+ static const bool value = true;
+};
+
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
struct assignment_from_xpr_op_product
{
@@ -845,7 +851,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
}
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
@@ -889,7 +895,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
}
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index 960a58597..a8daea511 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense>
{
- typedef Matrix<typename RhsType::Scalar,
+ typedef typename make_proper_matrix_type<typename RhsType::Scalar,
Decomposition::ColsAtCompileTime,
RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options,
Decomposition::MaxColsAtCompileTime,
- RhsType::MaxColsAtCompileTime> PlainObject;
+ RhsType::MaxColsAtCompileTime>::type PlainObject;
};
template<typename Decomposition, typename RhsType>
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index d2fe1e199..be04ed44d 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -170,7 +170,8 @@ MatrixBase<Derived>::stableNorm() const
enum {
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
- ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
+ ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+ && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 99439c8aa..7fa61969d 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
}
};
-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
- { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet4cf, Packet8f, false,false>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
- { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
@@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
}
};
-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
- { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cd, Packet4d, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
- { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 67db2f8ee..3e665730c 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
- { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
- { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
return pconj(internal::pmul(a, b));
}
};
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
- { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
-};
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
- { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index 9c2536509..57d1201f4 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -16,7 +16,7 @@ namespace Eigen {
namespace internal {
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
@@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
// Product
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
- Vectorizable = packet_traits<std::complex<T>>::HasMul
+ Vectorizable = packet_traits<std::complex<T> >::HasMul
};
typedef typename std::complex<T> result_type;
@@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
// Quotient
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
- Vectorizable = packet_traits<std::complex<T>>::HasDiv
+ Vectorizable = packet_traits<std::complex<T> >::HasDiv
};
typedef typename std::complex<T> result_type;
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index db9878796..bfda39df5 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -13,7 +13,7 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@@ -50,38 +50,45 @@ struct half;
namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16)
-
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
- EIGEN_DEVICE_FUNC __half() : x(0) {}
- explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
-
+#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
#endif
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
-struct half_base : public __half {
+struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
- EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
- EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+ EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+ EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
- #if !defined(EIGEN_HAS_CUDA_FP16)
- typedef half_impl::__half __half;
+ #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+ typedef half_impl::__half_raw __half_raw;
#endif
EIGEN_DEVICE_FUNC half() {}
- EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -140,62 +147,62 @@ struct half : public half_impl::half_base {
namespace half_impl {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
-__device__ half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
return __hadd(a, b);
}
-__device__ half operator * (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b);
}
-__device__ half operator - (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b);
}
-__device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
}
-__device__ half operator - (const half& a) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a);
}
-__device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b;
return a;
}
-__device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
a = a * b;
return a;
}
-__device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
a = a - b;
return a;
}
-__device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
a = a / b;
return a;
}
-__device__ bool operator == (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b);
}
-__device__ bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b);
}
-__device__ bool operator < (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b);
}
-__device__ bool operator <= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b);
}
-__device__ bool operator > (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b);
}
-__device__ bool operator >= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b);
}
@@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
- __half h;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+ __half_raw h;
h.x = x;
return h;
}
@@ -280,12 +287,13 @@ union FP32 {
float f;
};
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
- return __float2half(ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+ __half tmp_ff = __float2half(ff);
+ return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
- __half h;
+ __half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
@@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
- __half o;
+ __half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
@@ -335,8 +343,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
@@ -370,7 +378,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
@@ -386,7 +394,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hexp(a));
#else
return half(::expf(float(a)));
@@ -396,7 +404,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return half(::hlog(a));
#else
return half(::logf(float(a)));
@@ -409,7 +417,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
@@ -431,14 +439,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hceil(a));
#else
return half(::ceilf(float(a)));
@@ -446,7 +454,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -455,7 +463,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -493,9 +501,59 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
} // end namespace internal
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+ static const bool is_specialized = true;
+ static const bool is_signed = true;
+ static const bool is_integer = false;
+ static const bool is_exact = false;
+ static const bool has_infinity = true;
+ static const bool has_quiet_NaN = true;
+ static const bool has_signaling_NaN = true;
+ static const float_denorm_style has_denorm = denorm_present;
+ static const bool has_denorm_loss = false;
+ static const std::float_round_style round_style = std::round_to_nearest;
+ static const bool is_iec559 = false;
+ static const bool is_bounded = false;
+ static const bool is_modulo = false;
+ static const int digits = 11;
+ static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int radix = 2;
+ static const int min_exponent = -13;
+ static const int min_exponent10 = -4;
+ static const int max_exponent = 16;
+ static const int max_exponent10 = 4;
+ static const bool traps = true;
+ static const bool tinyness_before = false;
+
+ static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+ static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+ static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+ static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+ static Eigen::half round_error() { return Eigen::half(0.5); }
+ static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+ static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+}
+
+namespace Eigen {
+
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
+ enum {
+ IsSigned = true,
+ IsInteger = false,
+ IsComplex = false,
+ RequireInitialization = false
+ };
+
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
@@ -526,7 +584,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));
@@ -560,14 +618,18 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+ #if EIGEN_CUDACC_VER < 90000
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+ #else
+ return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+ #endif
}
#endif
-// ldg() has an overload for __half, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -575,7 +637,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
namespace Eigen {
namespace numext {
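With the std::numeric_limits<Eigen::half> specialization introduced above, generic code that queries numeric properties works for half the same way it does for float or double. A minimal host-side sketch (assuming only that <Eigen/Core> is on the include path; not part of this patch):

#include <Eigen/Core>
#include <iostream>
#include <limits>

int main() {
  typedef std::numeric_limits<Eigen::half> lim;
  // The values come straight from the raw bit patterns defined in the
  // specialization above, e.g. max() is the pattern 0x7bff, i.e. 65504.
  std::cout << "max      = " << static_cast<float>(lim::max()) << "\n";
  std::cout << "epsilon  = " << static_cast<float>(lim::epsilon()) << "\n";
  std::cout << "infinity = " << static_cast<float>(lim::infinity()) << "\n";
  return 0;
}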
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
index 987a5291c..ff6256ce0 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 8c46af09b..97a8abe59 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
@@ -196,7 +196,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const float4*)from);
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -204,7 +204,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const double2*)from);
#else
return make_double2(from[0], from[1]);
@@ -213,7 +213,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -221,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_double2(__ldg(from+0), __ldg(from+1));
#else
return make_double2(from[0], from[1]);
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index b9a125b42..ce48e4b31 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {
// Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
template<> struct is_arithmetic<half2> { enum { value = true }; };
@@ -54,7 +54,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half*
return __halves2half2(from[0], from[1]);
}
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[0]);
}
@@ -69,7 +69,7 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
+#if EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from);
#else
return __halves2half2(*(from+0), *(from+1));
@@ -78,7 +78,7 @@ template<>
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
+#if EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
return __halves2half2(*(from+0), *(from+1));
@@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result;
- result.x = a.x & 0x7FFF7FFF;
+ unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+ *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
return result;
}
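The pabs rewrite just above avoids touching half2::x directly: in newer CUDA toolkits that member is no longer a plain 32-bit integer, so the packet is reinterpreted as one 32-bit word and both sign bits are masked at once. The scalar analogue, as a hedged sketch (habs_bits is a hypothetical helper, not part of this patch):

// Clear the sign bit (bit 15) of one IEEE binary16 value held as raw bits.
static inline unsigned short habs_bits(unsigned short h) { return h & 0x7FFF; }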
@@ -116,7 +117,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
}
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
@@ -125,7 +126,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
}
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@@ -139,7 +140,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@@ -153,7 +154,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hneg2(a);
#else
float a1 = __low2float(a);
@@ -165,7 +166,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@@ -179,7 +180,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
@@ -225,7 +226,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
@@ -235,7 +236,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
@@ -247,7 +248,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
@@ -259,7 +260,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
@@ -284,7 +285,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index aa5fbce8e..30f870c3d 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -19,7 +19,7 @@ struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __float2half(a);
#else
return Eigen::half(a);
@@ -37,7 +37,7 @@ struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __float2half(static_cast<float>(a));
#else
return Eigen::half(static_cast<float>(a));
@@ -55,7 +55,7 @@ struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __half2float(a);
#else
return static_cast<float>(a);
@@ -69,7 +69,7 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
template <>
struct type_casting_traits<Eigen::half, float> {
diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h
new file mode 100644
index 000000000..4cfe34e05
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -0,0 +1,29 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_CONJ_HELPER_H
+#define EIGEN_ARCH_CONJ_HELPER_H
+
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
+ template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
+ { return padd(c, pmul(x,y)); } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
+ { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
+ }; \
+ \
+ template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
+ { return padd(c, pmul(x,y)); } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
+ { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
+ };
+
+#endif // EIGEN_ARCH_CONJ_HELPER_H
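The new EIGEN_MAKE_CONJ_HELPER_CPLX_REAL macro stamps out the pair of mixed real/complex conj_helper specializations that each architecture's Complex.h previously spelled out by hand. Every backend now instantiates it once per packet pairing, as the NEON, SSE and ZVector diffs below do:

EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)   // complex<float>  packets with float packets
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)   // complex<double> packets with double packets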
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 57e9b431f..ef50ba303 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for NEON
@@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for NEON
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 6bb05bb92..c48c61023 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -84,6 +84,98 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
return y;
}
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+ Packet4f x = _x;
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+ _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
+
+ /* natural logarithm computed for 4 simultaneous float
+ return NaN for x <= 0
+ */
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+ x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+ Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+
+ Packet4i ux = vreinterpretq_s32_f32(x);
+
+ Packet4i emm0 = vshrq_n_s32(ux, 23);
+
+ /* keep only the fractional part */
+ ux = vandq_s32(ux, p4i_inv_mant_mask);
+ ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
+ x = vreinterpretq_f32_s32(ux);
+
+ emm0 = vsubq_s32(emm0, p4i_0x7f);
+ Packet4f e = vcvtq_f32_s32(emm0);
+
+ e = vaddq_f32(e, p4f_1);
+
+ /* part2:
+ if( x < SQRTHF ) {
+ e -= 1;
+ x = x + x - 1.0;
+ } else { x = x - 1.0; }
+ */
+ Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
+ Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+ x = vsubq_f32(x, p4f_1);
+ e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
+ x = vaddq_f32(x, tmp);
+
+ Packet4f z = vmulq_f32(x,x);
+
+ Packet4f y = p4f_cephes_log_p0;
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p1);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p2);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p3);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p4);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p5);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p6);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p7);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p8);
+ y = vmulq_f32(y, x);
+
+ y = vmulq_f32(y, z);
+
+ tmp = vmulq_f32(e, p4f_cephes_log_q1);
+ y = vaddq_f32(y, tmp);
+
+
+ tmp = vmulq_f32(z, p4f_half);
+ y = vsubq_f32(y, tmp);
+
+ tmp = vmulq_f32(e, p4f_cephes_log_q2);
+ x = vaddq_f32(x, y);
+ x = vaddq_f32(x, tmp);
+ x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+ return x;
+}
+
} // end namespace internal
} // end namespace Eigen
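The NEON plog added above follows the usual Cephes-style scheme that its coefficient names suggest: split the argument into exponent and mantissa, recentre the mantissa around 1, and approximate log(1+f) with a degree-8 polynomial. As a sketch of the intent rather than a line-by-line transcription:

$$ x = 2^{e}\,m,\quad m\in[\tfrac12,1), \qquad \log x = e\,(q_2+q_1) + \log m $$
$$ \log(1+f) \approx f - \tfrac{f^2}{2} + f^3\bigl(p_8 + p_7 f + \dots + p_0 f^8\bigr) $$

Here f = m - 1 (or 2m - 1 with e decremented when m < 1/sqrt(2)), and ln 2 is carried as the split constant q_2 + q_1 = 0.693359375 - 2.12194440e-4 to reduce rounding error; non-positive arguments are forced to NaN through the invalid_mask OR at the end.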
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 84a56bdcc..6283b6cfa 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -51,14 +51,17 @@ typedef uint32x4_t Packet4ui;
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#if EIGEN_ARCH_ARM64
+ // __builtin_prefetch tends to do nothing on ARM64 compilers because the
+ // prefetch instructions there are too detailed for __builtin_prefetch to map
+ // meaningfully to them.
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
- #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM32
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
// by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR)
@@ -78,7 +81,7 @@ template<> struct packet_traits<float> : default_packet_traits
// FIXME check the Has*
HasSin = 0,
HasCos = 0,
- HasLog = 0,
+ HasLog = 1,
HasExp = 1,
HasSqrt = 0
};
@@ -113,7 +116,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from)
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
- const float32_t f[] = {0, 1, 2, 3};
+ const float f[] = {0, 1, 2, 3};
Packet4f countdown = vld1q_f32(f);
return vaddq_f32(pset1<Packet4f>(a), countdown);
}
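On AArch64 the prefetch macro above now emits prfm pldl1keep directly instead of relying on __builtin_prefetch. A hedged, self-contained version of the same idea (prefetch_l1 is a hypothetical wrapper, not part of this patch; GCC/Clang inline asm, AArch64 only):

static inline void prefetch_l1(const void* addr) {
  // Hint the core to pull the cache line containing addr into L1 and keep it there.
  __asm__ __volatile__("prfm pldl1keep, [%[a]]\n" :: [a] "r"(addr) :);
}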
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 5607fe0ab..23e717f28 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index d39d2d105..95aba428f 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -15,6 +15,10 @@ namespace Eigen {
namespace internal {
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+#endif
+
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -29,10 +33,14 @@ struct Packet2cf
{
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
union {
Packet4f v;
Packet1cd cd[2];
};
+#else
+ Packet4f v;
+#endif
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
@@ -89,63 +97,27 @@ template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type
/* Forward declaration */
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+/* complex<double> first */
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
- Packet2cf res;
- res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
- res.cd[1] = res.cd[0];
- return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
- std::complex<float> EIGEN_ALIGN16 af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return pload<Packet2cf>(af);
-}
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
{
return pload<Packet1cd>(from);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
- std::complex<float> EIGEN_ALIGN16 af[2];
- pstore<std::complex<float> >((std::complex<float> *) af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
-}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
{
pstore<std::complex<double> >(to, from);
}
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
- Packet2cf res;
- res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
- res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
- return res;
-}
-
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
Packet2d a_re, a_im, v1, v2;
@@ -163,45 +135,182 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
return Packet1cd(v1 + v2);
}
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
+{
+ std::complex<double> EIGEN_ALIGN16 res;
+ pstore<std::complex<double> >(&res, a);
+
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+ return pfirst(a);
+}
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+ return vecs[0];
+}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+ return pfirst(a);
+}
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+ static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+ {
+ // FIXME is it sure we never have to align a Packet1cd?
+ // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+ }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+ // TODO optimize it for AltiVec
+ Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+ Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+ return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+ return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+ Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+ kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+ kernel.packet[0].v = tmp;
+}
+
+/* complex<float> follows */
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+ std::complex<float> EIGEN_ALIGN16 res[2];
+ pstore<std::complex<float> >(res, a);
+
+ return res[0];
+}
+
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
- res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
- res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+ res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+ res.cd[1] = res.cd[0];
return res;
}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+ Packet2cf res;
+ if((std::ptrdiff_t(&from) % 16) == 0)
+ res.v = pload<Packet4f>((const float *)&from);
+ else
+ res.v = ploadu<Packet4f>((const float *)&from);
+ res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+ return res;
+}
+#endif
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+ std::complex<float> EIGEN_ALIGN16 af[2];
+ af[0] = from[0*stride];
+ af[1] = from[1*stride];
+ return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+ std::complex<float> EIGEN_ALIGN16 af[2];
+ pstore<std::complex<float> >((std::complex<float> *) af, from);
+ to[0*stride] = af[0];
+ to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
- std::complex<double> EIGEN_ALIGN16 res;
- pstore<std::complex<double> >(&res, a);
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+ Packet2cf res;
+ res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+ res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
return res;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
- std::complex<float> EIGEN_ALIGN16 res[2];
- pstore<std::complex<float> >(res, a);
- return res[0];
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+ Packet2cf res;
+ res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+ res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+ return res;
}
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
Packet2cf res;
@@ -210,10 +319,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
return res;
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
std::complex<float> res;
@@ -222,10 +327,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
return res;
}
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
- return vecs[0];
-}
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
PacketBlock<Packet2cf,2> transpose;
@@ -236,10 +337,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vec
return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
{
std::complex<float> res;
@@ -249,16 +346,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
}
template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
- static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
- {
- // FIXME is it sure we never have to align a Packet1cd?
- // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
- }
-};
-
-template<int Offset>
struct palign_impl<Offset,Packet2cf>
{
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
@@ -270,39 +357,143 @@ struct palign_impl<Offset,Packet2cf>
}
};
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+ // TODO optimize it for AltiVec
+ Packet2cf res;
+ res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
+ res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
+ return res;
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+ Packet2cf res;
+ res.cd[0] = pcplxflip(x.cd[0]);
+ res.cd[1] = pcplxflip(x.cd[1]);
+ return res;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+ Packet1cd tmp = kernel.packet[0].cd[1];
+ kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+ kernel.packet[1].cd[0] = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+ Packet2cf result;
+ const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+ result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+ return result;
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+ Packet4f a_re, a_im, prod, prod_im;
+
+ // Permute and multiply the real parts of a and b
+ a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+
+ // Get the imaginary parts of a
+ a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+
+ // multiply a_im * b and get the conjugate result
+ prod_im = a_im * b.v;
+ prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
+ // permute back to a proper order
+ prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
+
+ // multiply a_re * b, add prod_im
+ prod = pmadd<Packet4f>(a_re, b.v, prod_im);
+
+ return Packet2cf(prod);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+ Packet4f rev_a;
+ rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
+ return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+ Packet4f b;
+ b = vec_sld(a.v, a.v, 8);
+ b = padd<Packet4f>(a.v, b);
+ return pfirst<Packet2cf>(Packet2cf(b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+ Packet4f b1, b2;
+ b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
+ b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
+ b2 = vec_sld(b2, b2, 8);
+ b2 = padd<Packet4f>(b1, b2);
+
+ return Packet2cf(b2);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+ Packet4f b;
+ Packet2cf prod;
+ b = vec_sld(a.v, a.v, 8);
+ prod = pmul<Packet2cf>(a, Packet2cf(b));
+
+ return pfirst<Packet2cf>(prod);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+ static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+ {
+ if (Offset==1)
+ {
+ first.v = vec_sld(first.v, second.v, 8);
+ }
+ }
+};
+
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
@@ -336,56 +527,34 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for AltiVec
- Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
- Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
- return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
-}
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for AltiVec
- Packet2cf res;
- res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
- res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
- return res;
+ Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
+ Packet4f s = pmul<Packet4f>(b.v, b.v);
+ return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
}
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
{
- return Packet1cd(preverse(Packet2d(x.v)));
+ return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
}
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
- Packet2cf res;
- res.cd[0] = pcplxflip(x.cd[0]);
- res.cd[1] = pcplxflip(x.cd[1]);
- return res;
-}
-
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
- Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+ Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
kernel.packet[0].v = tmp;
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
- Packet1cd tmp = kernel.packet[0].cd[1];
- kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
- kernel.packet[1].cd[0] = tmp;
-}
-
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
Packet2cf result;
- const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
- result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+ result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
return result;
}
+#endif
} // end namespace internal
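A minimal scalar sketch of what the vectorized Packet2cf product and quotient above compute; ref_cmul and ref_cdiv are hypothetical names used only for illustration and are not part of Eigen or of this patch.

#include <complex>

// (ar + i*ai)*(br + i*bi): the vector pmul builds ar*(br,bi) and ai*(bi,br) with a
// sign flip on the first imaginary contribution, then adds the two halves with pmadd.
static inline std::complex<float> ref_cmul(std::complex<float> a, std::complex<float> b) {
  return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(),
                             a.real()*b.imag() + a.imag()*b.real());
}

// pdiv uses the conjugating conj_helper: a/b = a*conj(b) / |b|^2, where |b|^2 is
// formed lane-wise as s + swap(s) from s = b*b.
static inline std::complex<float> ref_cdiv(std::complex<float> a, std::complex<float> b) {
  const float n2 = b.real()*b.real() + b.imag()*b.imag();  // |b|^2
  const std::complex<float> num = ref_cmul(a, std::conj(b));
  return std::complex<float>(num.real()/n2, num.imag()/n2);
}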
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 5c7aa7256..ff33a975f 100644
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -20,6 +20,50 @@ namespace Eigen {
namespace internal {
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non-denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
+
+/* natural logarithm computed for 4 simultaneous floats;
+ returns NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+#endif
+
static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
@@ -93,40 +137,91 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& x)
+Packet4f pexp<Packet4f>(const Packet4f& _x)
{
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+/*
+ Packet4f x = _x;
+
+ Packet4f tmp, fx;
+ Packet4i emm0;
+
+ // clamp x
+ x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+ // express exp(x) as exp(g + n*log(2))
+ fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+ fx = pfloor(fx);
+
+ tmp = pmul(fx, p4f_cephes_exp_C1);
+ Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+ x = psub(x, tmp);
+ x = psub(x, z);
+
+ z = pmul(x,x);
+
+ Packet4f y = p4f_cephes_exp_p0;
+ y = pmadd(y, x, p4f_cephes_exp_p1);
+ y = pmadd(y, x, p4f_cephes_exp_p2);
+ y = pmadd(y, x, p4f_cephes_exp_p3);
+ y = pmadd(y, x, p4f_cephes_exp_p4);
+ y = pmadd(y, x, p4f_cephes_exp_p5);
+ y = pmadd(y, z, x);
+ y = padd(y, p4f_1);
+
+ // build 2^n
+ emm0 = vec_cts(fx, 0);
+ emm0 = emm0 + p4i_0x7f;
+ emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
+
+ // AltiVec's max & min operators silently drop NaNs. Check for NaNs in the
+ // inputs and return them unmodified.
+ Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
+ return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
+ isnumber_mask);*/
+ return _x;
+#else
Packet4f res;
- res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
- res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
+ res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
+ res.v4f[1] = pexp<Packet2d>(_x.v4f[1]);
return res;
+#endif
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d psqrt<Packet2d>(const Packet2d& x)
{
- return __builtin_s390_vfsqdb(x);
+ return vec_sqrt(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& x)
{
Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ res = vec_sqrt(x);
+#else
res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+#endif
return res;
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d prsqrt<Packet2d>(const Packet2d& x) {
- // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& x) {
Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
+#else
res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+#endif
return res;
}
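The disabled vector pexp above follows the classic cephes single-precision exponential; below is a scalar sketch of the same range reduction and polynomial, using the constants declared at the top of this file (ref_exp is a hypothetical name, illustrative only, and not part of the patch).

#include <cmath>

static inline float ref_exp(float x) {
  // clamp to the representable range, as p4f_exp_hi / p4f_exp_lo do
  x = std::fmax(std::fmin(x, 88.3762626647950f), -88.3762626647949f);
  // express exp(x) as 2^n * exp(g), with n = round(x * log2(e))
  const float n = std::floor(x * 1.44269504088896341f + 0.5f);
  // g = x - n*ln(2), with ln(2) split into C1 + C2 for extra precision
  const float g = x - n * 0.693359375f - n * (-2.12194440e-4f);
  // degree-5 polynomial approximation (coefficients cephes_exp_p0..p5 above)
  float y = 1.9875691500E-4f;
  y = y * g + 1.3981999507E-3f;
  y = y * g + 8.3334519073E-3f;
  y = y * g + 4.1665795894E-2f;
  y = y * g + 1.6666665459E-1f;
  y = y * g + 5.0000001201E-1f;
  y = y * g * g + g + 1.0f;
  // scale back by 2^n
  return std::ldexp(y, static_cast<int>(n));
}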
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 57b01fc63..0b37f4992 100755
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -17,7 +17,7 @@ namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
@@ -29,7 +29,7 @@ namespace internal {
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
typedef __vector int Packet4i;
@@ -41,9 +41,14 @@ typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
+// Z14 has builtin support for float vectors
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+typedef __vector float Packet4f;
+#else
typedef struct {
Packet2d v4f[2];
} Packet4f;
+#endif
typedef union {
int32_t i[4];
@@ -51,11 +56,15 @@ typedef union {
int64_t l[2];
uint64_t ul[2];
double d[2];
+ float f[4];
Packet4i v4i;
Packet4ui v4ui;
Packet2l v2l;
Packet2ul v2ul;
Packet2d v2d;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ Packet4f v4f;
+#endif
} Packet;
// We don't want to write the same code all the time, but we need to reuse the constants
@@ -80,7 +89,7 @@ typedef union {
Packet2l p2l_##NAME = pset1<Packet2l>(X)
// These constants are endian-agnostic
-//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
@@ -90,6 +99,21 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+ Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+ Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+ const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
+#endif
+
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
@@ -120,9 +144,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0
static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
-//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
-//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -169,7 +193,11 @@ template<> struct packet_traits<float> : default_packet_traits
HasSin = 0,
HasCos = 0,
HasLog = 0,
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ HasExp = 0,
+#else
HasExp = 1,
+#endif
HasSqrt = 1,
HasRsqrt = 1,
HasRound = 1,
@@ -258,31 +286,16 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
return s;
}
-/* Helper function to simulate a vec_splat_packet4f
- */
-template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
- Packet4f splat;
- switch (element) {
- case 0:
- splat.v4f[0] = vec_splat(from.v4f[0], 0);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 1:
- splat.v4f[0] = vec_splat(from.v4f[0], 1);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 2:
- splat.v4f[0] = vec_splat(from.v4f[1], 0);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 3:
- splat.v4f[0] = vec_splat(from.v4f[1], 1);
- splat.v4f[1] = splat.v4f[0];
- break;
- }
- return splat;
+ Packet vt;
+ vt.v4f = v;
+ s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
+ return s;
}
+#endif
+
template<int Offset>
struct palign_impl<Offset,Packet4i>
@@ -300,31 +313,6 @@ struct palign_impl<Offset,Packet4i>
}
};
-/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
- */
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
- {
- switch (Offset % 4) {
- case 1:
- first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
- first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
- break;
- case 2:
- first.v4f[0] = first.v4f[1];
- first.v4f[1] = second.v4f[0];
- break;
- case 3:
- first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
- first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
- break;
- }
- }
-};
-
-
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
@@ -344,16 +332,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
return vfrom->v4i;
}
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
- // FIXME: No intrinsic yet
- EIGEN_DEBUG_ALIGNED_LOAD
- Packet4f vfrom;
- vfrom.v4f[0] = vec_ld2f(&from[0]);
- vfrom.v4f[1] = vec_ld2f(&from[2]);
- return vfrom;
-}
-
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
// FIXME: No intrinsic yet
@@ -372,15 +350,6 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
vto->v4i = from;
}
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{
- // FIXME: No intrinsic yet
- EIGEN_DEBUG_ALIGNED_STORE
- vec_st2f(from.v4f[0], &to[0]);
- vec_st2f(from.v4f[1], &to[2]);
-}
-
-
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
// FIXME: No intrinsic yet
@@ -397,13 +366,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
return vec_splats(from);
}
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
-{
- Packet4f to;
- to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
- to.v4f[1] = to.v4f[0];
- return to;
-}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
@@ -417,17 +379,6 @@ pbroadcast4<Packet4i>(const int *a,
}
template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
- a3 = pload<Packet4f>(a);
- a0 = vec_splat_packet4f<0>(a3);
- a1 = vec_splat_packet4f<1>(a3);
- a2 = vec_splat_packet4f<2>(a3);
- a3 = vec_splat_packet4f<3>(a3);
-}
-
-template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
@@ -449,16 +400,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
return pload<Packet4i>(ai);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
- float EIGEN_ALIGN16 ai[4];
- ai[0] = from[0*stride];
- ai[1] = from[1*stride];
- ai[2] = from[2*stride];
- ai[3] = from[3*stride];
- return pload<Packet4f>(ai);
-}
-
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
double EIGEN_ALIGN16 af[2];
@@ -477,16 +418,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
to[3*stride] = ai[3];
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
- float EIGEN_ALIGN16 ai[4];
- pstore<float>((float *)ai, from);
- to[0*stride] = ai[0];
- to[1*stride] = ai[1];
- to[2*stride] = ai[2];
- to[3*stride] = ai[3];
-}
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
double EIGEN_ALIGN16 af[2];
@@ -496,160 +427,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
}
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] + b.v4f[0];
- c.v4f[1] = a.v4f[1] + b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] - b.v4f[0];
- c.v4f[1] = a.v4f[1] - b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] * b.v4f[0];
- c.v4f[1] = a.v4f[1] * b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] / b.v4f[0];
- c.v4f[1] = a.v4f[1] / b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
- Packet4f c;
- c.v4f[0] = -a.v4f[0];
- c.v4f[1] = -a.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
- Packet4f res;
- res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
- res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
- return res;
-}
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = vec_round(a.v4f[0]);
- res.v4f[1] = vec_round(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = vec_ceil(a.v4f[0]);
- res.v4f[1] = vec_ceil(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = vec_floor(a.v4f[0]);
- res.v4f[1] = vec_floor(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
@@ -659,14 +482,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
- Packet4f p = pload<Packet4f>(from);
- p.v4f[1] = vec_splat(p.v4f[0], 1);
- p.v4f[0] = vec_splat(p.v4f[0], 0);
- return p;
-}
-
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
Packet2d p = pload<Packet2d>(from);
@@ -674,15 +489,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
}
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
@@ -695,23 +507,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
}
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
- Packet4f rev;
- rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
- rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
- return rev;
-}
-
template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = pabs(a.v4f[0]);
- res.v4f[1] = pabs(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
@@ -730,13 +527,6 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
sum = padd<Packet2d>(a, b);
return pfirst(sum);
}
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet2d sum;
- sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
- double first = predux<Packet2d>(sum);
- return static_cast<float>(first);
-}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
@@ -777,21 +567,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
return sum;
}
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
- PacketBlock<Packet4f,4> transpose;
- transpose.packet[0] = vecs[0];
- transpose.packet[1] = vecs[1];
- transpose.packet[2] = vecs[2];
- transpose.packet[3] = vecs[3];
- ptranspose(transpose);
-
- Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
- sum = padd(sum, transpose.packet[2]);
- sum = padd(sum, transpose.packet[3]);
- return sum;
-}
-
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -806,12 +581,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
- // Return predux_mul<Packet2d> of the subvectors product
- return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
-}
-
// min
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
@@ -826,14 +595,6 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
- Packet2d b, res;
- b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
- res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
- return static_cast<float>(pfirst(res));
-}
-
// max
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
@@ -849,14 +610,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
- Packet2d b, res;
- b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
- res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
- return static_cast<float>(pfirst(res));
-}
-
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
@@ -877,6 +630,321 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
kernel.packet[1] = t1;
}
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+ return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+ Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+ Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+ return vec_sel(elsePacket, thenPacket, mask);
+}
+
+/* z13 has no vector float support, so we emulate it with double.
+ z14 has proper vector float support.
+*/
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+/* Helper function to simulate a vec_splat_packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
+{
+ Packet4f splat;
+ switch (element) {
+ case 0:
+ splat.v4f[0] = vec_splat(from.v4f[0], 0);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 1:
+ splat.v4f[0] = vec_splat(from.v4f[0], 1);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 2:
+ splat.v4f[0] = vec_splat(from.v4f[1], 0);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 3:
+ splat.v4f[0] = vec_splat(from.v4f[1], 1);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ }
+ return splat;
+}
+
+/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
+ */
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+ {
+ switch (Offset % 4) {
+ case 1:
+ first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
+ first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
+ break;
+ case 2:
+ first.v4f[0] = first.v4f[1];
+ first.v4f[1] = second.v4f[0];
+ break;
+ case 3:
+ first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
+ first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
+ break;
+ }
+ }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_LOAD
+ Packet4f vfrom;
+ vfrom.v4f[0] = vec_ld2f(&from[0]);
+ vfrom.v4f[1] = vec_ld2f(&from[2]);
+ return vfrom;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_STORE
+ vec_st2f(from.v4f[0], &to[0]);
+ vec_st2f(from.v4f[1], &to[2]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+ Packet4f to;
+ to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+ to.v4f[1] = to.v4f[0];
+ return to;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+ a3 = pload<Packet4f>(a);
+ a0 = vec_splat_packet4f<0>(a3);
+ a1 = vec_splat_packet4f<1>(a3);
+ a2 = vec_splat_packet4f<2>(a3);
+ a3 = vec_splat_packet4f<3>(a3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+ float EIGEN_ALIGN16 ai[4];
+ ai[0] = from[0*stride];
+ ai[1] = from[1*stride];
+ ai[2] = from[2*stride];
+ ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+ float EIGEN_ALIGN16 ai[4];
+ pstore<float>((float *)ai, from);
+ to[0*stride] = ai[0];
+ to[1*stride] = ai[1];
+ to[2*stride] = ai[2];
+ to[3*stride] = ai[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] + b.v4f[0];
+ c.v4f[1] = a.v4f[1] + b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] - b.v4f[0];
+ c.v4f[1] = a.v4f[1] - b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] * b.v4f[0];
+ c.v4f[1] = a.v4f[1] * b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] / b.v4f[0];
+ c.v4f[1] = a.v4f[1] / b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+ Packet4f c;
+ c.v4f[0] = -a.v4f[0];
+ c.v4f[1] = -a.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+ Packet4f res;
+ res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+ res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = vec_round(a.v4f[0]);
+ res.v4f[1] = vec_round(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = vec_ceil(a.v4f[0]);
+ res.v4f[1] = vec_ceil(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = vec_floor(a.v4f[0]);
+ res.v4f[1] = vec_floor(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+ Packet4f p = pload<Packet4f>(from);
+ p.v4f[1] = vec_splat(p.v4f[0], 1);
+ p.v4f[0] = vec_splat(p.v4f[0], 0);
+ return p;
+}
+
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+ Packet4f rev;
+ rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+ rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+ return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = pabs(a.v4f[0]);
+ res.v4f[1] = pabs(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+ Packet2d sum;
+ sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+ double first = predux<Packet2d>(sum);
+ return static_cast<float>(first);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+ PacketBlock<Packet4f,4> transpose;
+ transpose.packet[0] = vecs[0];
+ transpose.packet[1] = vecs[1];
+ transpose.packet[2] = vecs[2];
+ transpose.packet[3] = vecs[3];
+ ptranspose(transpose);
+
+ Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+ sum = padd(sum, transpose.packet[2]);
+ sum = padd(sum, transpose.packet[3]);
+ return sum;
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+ // Return predux_mul<Packet2d> of the subvectors product
+ return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+ Packet2d b, res;
+ b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+ res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+ return static_cast<float>(pfirst(res));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+ Packet2d b, res;
+ b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+ res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+ return static_cast<float>(pfirst(res));
+}
+
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
*/
EIGEN_DEVICE_FUNC inline void
@@ -915,12 +983,6 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
kernel.packet[3].v4f[1] = t3.packet[1];
}
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
- Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
- return vec_sel(elsePacket, thenPacket, mask);
-}
-
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
@@ -931,13 +993,197 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
return result;
}
+#else
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+ {
+ switch (Offset % 4) {
+ case 1:
+ first = vec_sld(first, second, 4); break;
+ case 2:
+ first = vec_sld(first, second, 8); break;
+ case 3:
+ first = vec_sld(first, second, 12); break;
+ }
+ }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_LOAD
+ Packet *vfrom;
+ vfrom = (Packet *) from;
+ return vfrom->v4f;
+}
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
- Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
- Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_STORE
+ Packet *vto;
+ vto = (Packet *) to;
+ vto->v4f = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+ return vec_splats(from);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+ a3 = pload<Packet4f>(a);
+ a0 = vec_splat(a3, 0);
+ a1 = vec_splat(a3, 1);
+ a2 = vec_splat(a3, 2);
+ a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+ float EIGEN_ALIGN16 af[4];
+ af[0] = from[0*stride];
+ af[1] = from[1*stride];
+ af[2] = from[2*stride];
+ af[3] = from[3*stride];
+ return pload<Packet4f>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+ float EIGEN_ALIGN16 af[4];
+ pstore<float>((float*)af, from);
+ to[0*stride] = af[0];
+ to[1*stride] = af[1];
+ to[2*stride] = af[2];
+ to[3*stride] = af[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f> (const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f> (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f> (const Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f> (const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+ Packet4f p = pload<Packet4f>(from);
+ return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+ return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+ Packet4f b, sum;
+ b = vec_sld(a, a, 8);
+ sum = padd<Packet4f>(a, b);
+ b = vec_sld(sum, sum, 4);
+ sum = padd<Packet4f>(sum, b);
+ return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+ Packet4f v[4], sum[4];
+
+ // It's easier and faster to transpose then add as columns
+ // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+ // Do the transpose, first set of moves
+ v[0] = vec_mergeh(vecs[0], vecs[2]);
+ v[1] = vec_mergel(vecs[0], vecs[2]);
+ v[2] = vec_mergeh(vecs[1], vecs[3]);
+ v[3] = vec_mergel(vecs[1], vecs[3]);
+ // Get the resulting vectors
+ sum[0] = vec_mergeh(v[0], v[2]);
+ sum[1] = vec_mergel(v[0], v[2]);
+ sum[2] = vec_mergeh(v[1], v[3]);
+ sum[3] = vec_mergel(v[1], v[3]);
+
+ // Now do the summation:
+ // Lines 0+1
+ sum[0] = padd<Packet4f>(sum[0], sum[1]);
+ // Lines 2+3
+ sum[1] = padd<Packet4f>(sum[2], sum[3]);
+ // Add the results
+ sum[0] = padd<Packet4f>(sum[0], sum[1]);
+
+ return sum[0];
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+ Packet4f prod;
+ prod = pmul(a, vec_sld(a, a, 8));
+ return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+ Packet4f b, res;
+ b = pmin<Packet4f>(a, vec_sld(a, a, 8));
+ res = pmin<Packet4f>(b, vec_sld(b, b, 4));
+ return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+ Packet4f b, res;
+ b = pmax<Packet4f>(a, vec_sld(a, a, 8));
+ res = pmax<Packet4f>(b, vec_sld(b, b, 4));
+ return pfirst(res);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+ Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+ Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+ Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+ Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+ kernel.packet[0] = vec_mergeh(t0, t2);
+ kernel.packet[1] = vec_mergel(t0, t2);
+ kernel.packet[2] = vec_mergeh(t1, t3);
+ kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
return vec_sel(elsePacket, thenPacket, mask);
}
+#endif
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f> (const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+
} // end namespace internal
} // end namespace Eigen
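Calling code does not need to know which Packet4f representation is in use: the z13 path stores two Packet2d halves while the z14 path is a native __vector float, and both expose the same packet primitives. A small usage sketch, assuming 16-byte-aligned pointers and a build where these internal (unstable-API) kernels are compiled in; axpy4 is a hypothetical helper, not part of Eigen.

#include <Eigen/Core>

// out[0..3] = alpha*x[0..3] + y[0..3], computed with one packet per operand.
static void axpy4(const float* x, const float* y, float alpha, float* out) {
  using namespace Eigen::internal;
  Packet4f px = pload<Packet4f>(x);      // aligned load of 4 floats
  Packet4f py = pload<Packet4f>(y);
  Packet4f pa = pset1<Packet4f>(alpha);  // broadcast alpha to all lanes
  pstore(out, pmadd(pa, px, py));        // fused multiply-add, aligned store
}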
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 4153b877c..1077d8eb0 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
{
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
// FIXME is there some kind of cuda::swap?
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
#else
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index 6a30466fb..b03be0269 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -44,16 +44,16 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
- m_interPacket(plset<Packet>(0)),
m_flip(numext::abs(high)<numext::abs(low))
{}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+ typedef typename NumTraits<Scalar>::Real RealScalar;
if(m_flip)
- return (i==0)? m_low : (m_high - (m_size1-i)*m_step);
+ return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
else
- return (i==m_size1)? m_high : (m_low + i*m_step);
+ return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
}
template<typename IndexType>
@@ -63,7 +63,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
if(m_flip)
{
- Packet pi = padd(pset1<Packet>(Scalar(i-m_size1)),m_interPacket);
+ Packet pi = plset<Packet>(Scalar(i-m_size1));
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if(i==0)
res = pinsertfirst(res, m_low);
@@ -71,7 +71,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
}
else
{
- Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket);
+ Packet pi = plset<Packet>(Scalar(i));
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if(i==m_size1-unpacket_traits<Packet>::size+1)
res = pinsertlast(res, m_high);
@@ -83,7 +83,6 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
const Scalar m_high;
const Index m_size1;
const Scalar m_step;
- const Packet m_interPacket;
const bool m_flip;
};
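For reference, the non-flipped scalar semantics of the functor above: step = (high-low)/(num_steps-1), entry i is low + i*step, and the last entry is pinned to exactly high (the flipped branch instead counts down from high when |high| < |low| to improve accuracy). A sketch with the hypothetical name ref_linspaced, illustrative only:

#include <vector>

static std::vector<float> ref_linspaced(float low, float high, int num_steps) {
  std::vector<float> v(num_steps);
  if (num_steps == 1) { v[0] = low; return v; }
  const float step = (high - low) / float(num_steps - 1);
  for (int i = 0; i < num_steps; ++i)
    v[i] = (i == num_steps - 1) ? high : low + float(i) * step;  // last entry pinned to 'high'
  return v;
}

The packet path produces the same values one vector at a time: plset<Packet>(i) yields {i, i+1, ...}, so pset1(low) + pset1(step) * plset(i) is one packet of the sequence.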
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 6440e1d09..ed4d3182b 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -427,7 +427,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
- if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+ // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+ // to determine the following heuristic.
+ // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+ // unless it has been specialized by the user or for a given architecture.
+ // Note that the condition rhs.rows()>0 was required because the lazy product is (was?) not happy with empty inputs.
+ // I'm not sure it is still required.
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::evalTo(dst, lhs, rhs);
else
{
@@ -439,7 +445,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
- if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::addTo(dst, lhs, rhs);
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
@@ -448,7 +454,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
- if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::subTo(dst, lhs, rhs);
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
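If the definition in GeneralProduct.h is guarded by #ifndef, as the comment above suggests, the switch point between the lazy coefficient-based product and the blocked GEMM kernel can be tuned per translation unit; the value 15 below is purely hypothetical.

#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 15  // hypothetical tuning value; the documented default is 20
#include <Eigen/Dense>
// Products with rhs.rows()+dst.rows()+dst.cols() below the threshold now take the lazy path.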
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index ad38bcf51..e436c50a4 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -269,10 +269,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
enum {
IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
- RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0
+ RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
+ SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
};
Index size = mat.cols();
+ if(SkipDiag)
+ size--;
Index depth = actualLhs.cols();
typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
@@ -283,10 +286,11 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index,
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
- IsRowMajor ? RowMajor : ColMajor, UpLo>
+ IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)>
::run(size, depth,
- &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
- mat.data(), mat.outerStride(), actualAlpha, blocking);
+ &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
+ &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
+ mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking);
}
};
@@ -294,6 +298,7 @@ template<typename MatrixType, unsigned int UpLo>
template<typename ProductType>
EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
{
+ EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
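A usage sketch of what the SkipDiag handling above enables, assuming the triangular-view product assignment path (operator+= on a triangularView) is available as in current Eigen; the function name is hypothetical. Unit-diagonal views remain rejected by the new static assertion.

#include <Eigen/Dense>

// Accumulates A*B into the strictly lower part of C only; the diagonal is untouched.
static void strict_lower_update(Eigen::MatrixXd& C, const Eigen::MatrixXd& A, const Eigen::MatrixXd& B) {
  C.triangularView<Eigen::StrictlyLower>() += A * B;
}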
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 5b7c15cca..9176a1382 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,Con
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \
- if (lhs==rhs) { \
+ if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} else { \
@@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
EIGTYPE beta(1); \
- BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
+ BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
} \
};
@@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
} \
};
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
+#else
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
+#endif
// TODO handle complex cases
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index 7a3bdbf20..b0f6b0d5b 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -46,7 +46,7 @@ namespace internal {
// gemm specialization
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
@@ -100,13 +100,20 @@ static void run(Index rows, Index cols, Index depth, \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
- BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}};
-GEMM_SPECIALIZATION(double, d, double, d)
-GEMM_SPECIALIZATION(float, f, float, s)
-GEMM_SPECIALIZATION(dcomplex, cd, double, z)
-GEMM_SPECIALIZATION(scomplex, cf, float, c)
+#ifdef EIGEN_USE_MKL
+GEMM_SPECIALIZATION(double, d, double, dgemm)
+GEMM_SPECIALIZATION(float, f, float, sgemm)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
+#else
+GEMM_SPECIALIZATION(double, d, double, dgemm_)
+GEMM_SPECIALIZATION(float, f, float, sgemm_)
+GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
index e3a5d5892..6e36c2b3c 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
-#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
@@ -113,14 +113,21 @@ static void run( \
x_ptr=x_tmp.data(); \
incx=1; \
} else x_ptr=rhs; \
- BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+ BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
-EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
-EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
-EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
-EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
+#else
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
index a45238d69..9a5318507 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -40,7 +40,7 @@ namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
-#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -81,13 +81,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
- BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -144,20 +144,26 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
- BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-EIGEN_BLAS_SYMM_L(double, double, d, d)
-EIGEN_BLAS_SYMM_L(float, float, f, s)
-EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
-EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
+#endif
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
-#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -197,13 +203,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _lhs; \
\
- BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -259,15 +265,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
- BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
} \
};
-EIGEN_BLAS_SYMM_R(double, double, d, d)
-EIGEN_BLAS_SYMM_R(float, float, f, s)
-EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
-EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
+#endif
} // end namespace internal
} // end namespace Eigen
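For reference, a hedged sketch of the user-level call that reaches the ?symm/?hemm wrappers above (again assuming EIGEN_USE_BLAS or EIGEN_USE_MKL is defined at build time):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd S = Eigen::MatrixXd::Random(300, 300);
  S = (S + S.transpose()).eval();                     // make S symmetric
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(300, 300);
  // Products taken through a selfadjoint view are the ones forwarded to ?symm (?hemm for complex scalars).
  Eigen::MatrixXd C = S.selfadjointView<Eigen::Lower>() * B;
  return C.size() > 0 ? 0 : 1;
}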
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
index 38f23accf..1238345e3 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
x_tmp=map_x.conjugate(); \
x_ptr=x_tmp.data(); \
} else x_ptr=_rhs; \
- BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+ BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+#else
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 6ec5a8a0b..539b6c0c6 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
- Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
+ // To work around an "error: member reference base type 'Matrix<...>
+ // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
+ // not a structure or union" compilation error in nvcc (tested V8.0.61),
+ // create a dummy internal::constructor_without_unaligned_array_assert
+ // object to pass to the Matrix constructor.
+ internal::constructor_without_unaligned_array_assert a;
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
- Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
+ internal::constructor_without_unaligned_array_assert a;
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
index aecded6bb..a25197ab0 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
// implements col-major += alpha * op(triangular) * op(general)
-#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
- BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
};
-EIGEN_BLAS_TRMM_L(double, double, d, d)
-EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMM_L(float, float, f, s)
-EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm)
+EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
+#endif
// implements col-major += alpha * op(general) * op(triangular)
-#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
- BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
};
-EIGEN_BLAS_TRMM_R(double, double, d, d)
-EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMM_R(float, float, f, s)
-EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm)
+EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
+#endif
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
index 07bf26ce5..3d47a2b94 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
// implements col-major: res += alpha * op(triangular) * vector
-#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
enum { \
@@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
- BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+ BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
-EIGEN_BLAS_TRMV_CM(double, double, d, d)
-EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMV_CM(float, float, f, s)
-EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_CM(double, double, d, d,)
+EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_CM(float, float, f, s,)
+EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
+EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
+#endif
// implements row-major: res += alpha * op(triangular) * vector
-#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
enum { \
@@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
- BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+ BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
-EIGEN_BLAS_TRMV_RM(double, double, d, d)
-EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMV_RM(float, float, f, s)
-EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double, double, d, d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
+#endif
} // end namespace internal
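The extra BLASPOSTFIX parameter exists because MKL exports unsuffixed symbols (dtrmv) while a Fortran BLAS exports suffixed ones (dtrmv_). Below is a small stand-alone sketch of the token pasting; dtrmv/dtrmv_ here are illustrative local stand-ins, not the real BLAS entry points.

#include <cstdio>

#define CALL(PREFIX, POSTFIX) PREFIX##trmv##POSTFIX()

void dtrmv()  { std::puts("MKL-style symbol: dtrmv");  }
void dtrmv_() { std::puts("Fortran symbol:   dtrmv_"); }

int main()
{
  CALL(d, _);   // expands to dtrmv_()
  CALL(d,  );   // empty postfix, expands to dtrmv()
  return 0;
}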
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
index 88c0fb794..f0775116a 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -38,7 +38,7 @@ namespace Eigen {
namespace internal {
// implements LeftSide op(triangular)^-1 * general
-#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@@ -80,18 +80,24 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
- BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
} \
};
-EIGEN_BLAS_TRSM_L(double, double, d)
-EIGEN_BLAS_TRSM_L(dcomplex, double, z)
-EIGEN_BLAS_TRSM_L(float, float, s)
-EIGEN_BLAS_TRSM_L(scomplex, float, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double, double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float, float, strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float, float, strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
+#endif
// implements RightSide general * op(triangular)^-1
-#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@@ -133,16 +139,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
- BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
/*std::cout << "TRMS_L specialization!\n";*/ \
} \
};
-EIGEN_BLAS_TRSM_R(double, double, d)
-EIGEN_BLAS_TRSM_R(dcomplex, double, z)
-EIGEN_BLAS_TRSM_R(float, float, s)
-EIGEN_BLAS_TRSM_R(scomplex, float, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double, double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float, float, strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float, float, strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
+#endif
} // end namespace internal
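Similarly, a hedged sketch of a triangular solve that exercises the ?trsm wrappers when a BLAS backend is enabled:

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(200, 200);
  A.diagonal().array() += 200.0;                      // keep the triangle well conditioned
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(200, 50);
  // In-place solves against a triangular view map onto the ?trsm specializations above.
  A.triangularView<Eigen::Upper>().solveInPlace(B);
  return 0;
}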
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 4431f2fc4..8ef0f3594 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -55,6 +55,7 @@
#endif
#if defined __NVCC__
+ #pragma diag_suppress boolean_controlling_expr_is_constant
// Disable the "statement is unreachable" message
#pragma diag_suppress code_is_unreachable
// Disable the "dynamic initialization in unreachable code" message
@@ -72,6 +73,7 @@
#pragma diag_suppress 2671
#pragma diag_suppress 2735
#pragma diag_suppress 2737
+ #pragma diag_suppress 2739
#endif
#endif // not EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
index 26b59669e..17963fad4 100755
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@@ -49,12 +49,17 @@
#define EIGEN_USE_LAPACKE
#endif
-#if defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
#define EIGEN_USE_MKL
#endif
+
#if defined EIGEN_USE_MKL
-# include <mkl.h>
+# if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
+# define MKL_DIRECT_CALL
+# define MKL_DIRECT_CALL_JUST_SET
+# endif
+# include <mkl.h>
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
# ifndef INTEL_MKL_VERSION
# undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
@@ -68,6 +73,9 @@
# undef EIGEN_USE_MKL_VML
# undef EIGEN_USE_LAPACKE_STRICT
# undef EIGEN_USE_LAPACKE
+# ifdef MKL_DIRECT_CALL_JUST_SET
+# undef MKL_DIRECT_CALL
+# endif
# endif
#endif
@@ -108,6 +116,10 @@
#endif
#endif
+#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
+#include "../../misc/blas.h"
+#endif
+
namespace Eigen {
typedef std::complex<double> dcomplex;
@@ -121,8 +133,5 @@ typedef int BlasIndex;
} // end namespace Eigen
-#if defined(EIGEN_USE_BLAS)
-#include "../../misc/blas.h"
-#endif
#endif // EIGEN_MKL_SUPPORT_H
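A hedged configuration sketch for the logic above: EIGEN_USE_MKL_ALL (a standard Eigen option) switches all backends to MKL and, with this patch, also defines MKL_DIRECT_CALL unless the user opts out, whereas EIGEN_USE_BLAS alone now pulls in Eigen's own blas.h prototypes only when MKL is not in use.

// Illustrative translation unit; only EIGEN_MKL_NO_DIRECT_CALL comes from this patch.
#define EIGEN_USE_MKL_ALL             // use MKL for the BLAS/LAPACK/VML backends
// #define EIGEN_MKL_NO_DIRECT_CALL   // uncomment to keep MKL_DIRECT_CALL undefined
#include <Eigen/Dense>

int main() { return 0; }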
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 14ec87da8..3a56b0e36 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -410,10 +410,20 @@
#endif
#endif
+// Does the compiler support type_traits?
+#ifndef EIGEN_HAS_TYPE_TRAITS
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700)
+#define EIGEN_HAS_TYPE_TRAITS 1
+#define EIGEN_INCLUDE_TYPE_TRAITS
+#else
+#define EIGEN_HAS_TYPE_TRAITS 0
+#endif
+#endif
+
// Does the compiler support variadic templates?
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
- && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
+ && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
@@ -427,9 +437,9 @@
// Does the compiler fully support const expressions? (as in c++14)
#ifndef EIGEN_HAS_CONSTEXPR
-#if defined(__CUDACC__)
+#if defined(EIGEN_CUDACC)
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
+#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
#define EIGEN_HAS_CONSTEXPR 1
#endif
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
@@ -669,7 +679,7 @@ namespace Eigen {
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
* vectorized and non-vectorized code.
*/
-#if (defined __CUDACC__)
+#if (defined EIGEN_CUDACC)
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
@@ -686,10 +696,10 @@ namespace Eigen {
#if defined(EIGEN_DONT_VECTORIZE)
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
#elif defined(EIGEN_VECTORIZE_AVX512)
- // 64 bytes static alignmeent is preferred only if really required
+ // 64 bytes static alignment is preferred only if really required
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
#elif defined(__AVX__)
- // 32 bytes static alignmeent is preferred only if really required
+ // 32 bytes static alignment is preferred only if really required
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
#else
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
@@ -837,7 +847,8 @@ namespace Eigen {
// just an empty macro !
#define EIGEN_EMPTY
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
+ // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -955,7 +966,7 @@ namespace Eigen {
const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
// Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
-#if EIGEN_COMP_MSVC_STRICT<=1600
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
#else
#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
@@ -990,7 +1001,7 @@ namespace Eigen {
# define EIGEN_TRY try
# define EIGEN_CATCH(X) catch (X)
#else
-# ifdef __CUDA_ARCH__
+# ifdef EIGEN_CUDA_ARCH
# define EIGEN_THROW_X(X) asm("trap;")
# define EIGEN_THROW asm("trap;")
# else
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 7d9053496..c455f92a1 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -493,7 +493,7 @@ template<typename T> struct smart_copy_helper<T,true> {
IntPtr size = IntPtr(end)-IntPtr(start);
if(size==0) return;
eigen_internal_assert(start!=0 && end!=0 && target!=0);
- memcpy(target, start, size);
+ std::memcpy(target, start, size);
}
};
@@ -696,7 +696,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
/** \class aligned_allocator
* \ingroup Core_Module
*
-* \brief STL compatible allocator to use with with 16 byte aligned types
+* \brief STL compatible allocator to use with types requiring a non-standard alignment.
+*
+* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+* By default, it will thus provide at least 16 bytes alignment and more in the following cases:
+* - 32 bytes alignment if AVX is enabled.
+* - 64 bytes alignment if AVX512 is enabled.
+*
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* \link TopicPreprocessorDirectivesPerformance there \endlink.
*
* Example:
* \code
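A short usage sketch for the allocator documented above (standard Eigen API; pre-C++11 compilers may additionally need <Eigen/StdVector> for fixed-size vectorizable element types):

#include <Eigen/Core>
#include <vector>

int main()
{
  // Elements get the same alignment guarantee as the heap storage of MatrixXd:
  // 16, 32 or 64 bytes depending on the enabled SIMD instruction set.
  std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > pts(100);
  pts[0].setZero();
  return 0;
}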
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 90eda6e70..0fa818008 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -11,7 +11,7 @@
#ifndef EIGEN_META_H
#define EIGEN_META_H
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
#include <cfloat>
#include <math_constants.h>
#endif
@@ -169,7 +169,7 @@ template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
@@ -433,10 +433,10 @@ struct meta_no { char a[2]; };
template <typename T>
struct has_ReturnType
{
- template <typename C> static meta_yes testFunctor(typename C::ReturnType const *);
- template <typename C> static meta_no testFunctor(...);
+ template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
+ template <typename C> static meta_no testFunctor(...);
- enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) };
+ enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
};
template<typename T> const T* return_ptr();
@@ -523,13 +523,13 @@ template<typename T, typename U> struct scalar_product_traits
namespace numext {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
using internal::device::numeric_limits;
#else
using std::numeric_limits;
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 983361a45..cb1678900 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -44,64 +44,64 @@
struct static_assertion<true>
{
enum {
- YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX,
- YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES,
- YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES,
- THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE,
- THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE,
- THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE,
- OUT_OF_RANGE_ACCESS,
- YOU_MADE_A_PROGRAMMING_MISTAKE,
- EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT,
- EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE,
- YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR,
- YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR,
- UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC,
- THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES,
- FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED,
- NUMERIC_TYPE_MUST_BE_REAL,
- COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED,
- WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED,
- THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE,
- INVALID_MATRIX_PRODUCT,
- INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS,
- INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION,
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY,
- THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES,
- THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES,
- INVALID_MATRIX_TEMPLATE_PARAMETERS,
- INVALID_MATRIXBASE_TEMPLATE_PARAMETERS,
- BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
- THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX,
- THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE,
- THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES,
- YOU_ALREADY_SPECIFIED_THIS_STRIDE,
- INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION,
- THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD,
- PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1,
- THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS,
- YOU_CANNOT_MIX_ARRAYS_AND_MATRICES,
- YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION,
- THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY,
- YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT,
- THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS,
- THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS,
- THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL,
- THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES,
- YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED,
- YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED,
- THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE,
- THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
- OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
- IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
- STORAGE_LAYOUT_DOES_NOT_MATCH,
- EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE,
- THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS,
- MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY,
- THIS_TYPE_IS_NOT_SUPPORTED,
- STORAGE_KIND_MUST_MATCH,
- STORAGE_INDEX_MUST_MATCH,
- CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY
+ YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1,
+ YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1,
+ YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1,
+ THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1,
+ THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1,
+ THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1,
+ OUT_OF_RANGE_ACCESS=1,
+ YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+ EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1,
+ EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+ YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1,
+ YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1,
+ UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1,
+ THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1,
+ FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1,
+ NUMERIC_TYPE_MUST_BE_REAL=1,
+ COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1,
+ WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1,
+ THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1,
+ INVALID_MATRIX_PRODUCT=1,
+ INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1,
+ INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1,
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1,
+ THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1,
+ THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1,
+ INVALID_MATRIX_TEMPLATE_PARAMETERS=1,
+ INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1,
+ BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1,
+ THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1,
+ THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1,
+ THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1,
+ YOU_ALREADY_SPECIFIED_THIS_STRIDE=1,
+ INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1,
+ THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1,
+ PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1,
+ THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1,
+ YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1,
+ YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1,
+ THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1,
+ YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1,
+ THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1,
+ THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1,
+ THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1,
+ THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1,
+ YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1,
+ YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1,
+ THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1,
+ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1,
+ OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1,
+ IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1,
+ STORAGE_LAYOUT_DOES_NOT_MATCH=1,
+ EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1,
+ THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1,
+ MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1,
+ THIS_TYPE_IS_NOT_SUPPORTED=1,
+ STORAGE_KIND_MUST_MATCH=1,
+ STORAGE_INDEX_MUST_MATCH=1,
+ CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1
};
};
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 4b337f29f..10328be0d 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -34,6 +34,18 @@ inline IndexDest convert_index(const IndexSrc& idx) {
return IndexDest(idx);
}
+// true if T can be considered as an integral index (i.e., an integral type or enum)
+template<typename T> struct is_valid_index_type
+{
+ enum { value =
+#if EIGEN_HAS_TYPE_TRAITS
+ internal::is_integral<T>::value || std::is_enum<T>::value
+#else
+ // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
+ internal::is_convertible<T,Index>::value
+#endif
+ };
+};
// promote_scalar_arg is a helper used in operations between an expression and a scalar, like:
// expression * scalar
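A hedged compile-time sketch of what the new is_valid_index_type trait reports, assuming a C++11 compiler where EIGEN_HAS_TYPE_TRAITS is 1 (this is internal API and may change):

#include <Eigen/Core>

enum MyId { FirstRow = 0 };

static_assert( Eigen::internal::is_valid_index_type<int>::value,   "integral types are valid indices");
static_assert( Eigen::internal::is_valid_index_type<MyId>::value,  "enums are valid indices");
static_assert(!Eigen::internal::is_valid_index_type<float>::value, "floating-point types are rejected");

int main() { return 0; }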
diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 0af3c1b08..83ee1be46 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const Quaterni
if (n != Scalar(0))
{
m_angle = Scalar(2)*atan2(n, abs(q.w()));
- if(q.w() < 0)
+ if(q.w() < Scalar(0))
n = -n;
m_axis = q.vec() / n;
}
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index f6ef1bcf6..c3fd8c3e0 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase<Derived, 3>
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename internal::traits<Derived>::Coefficients Coefficients;
+ typedef typename Coefficients::CoeffReturnType CoeffReturnType;
+ typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
+ Scalar&, CoeffReturnType>::type NonConstCoeffReturnType;
+
+
enum {
Flags = Eigen::internal::traits<Derived>::Flags
};
@@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase<Derived, 3>
/** \returns the \c x coefficient */
- EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
/** \returns the \c y coefficient */
- EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
/** \returns the \c z coefficient */
- EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
/** \returns the \c w coefficient */
- EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
- /** \returns a reference to the \c x coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
- /** \returns a reference to the \c y coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
- /** \returns a reference to the \c z coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
- /** \returns a reference to the \c w coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+ /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+ /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+ /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+ /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
/** \returns a read-only vector expression of the imaginary part (x,y,z) */
EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
@@ -423,7 +428,7 @@ typedef Map<Quaternion<double>, Aligned> QuaternionMapAlignedd;
// Generic Quaternion * Quaternion product
// This product can be specialized for a given architecture via the Arch template argument.
namespace internal {
-template<int Arch, class Derived1, class Derived2, typename Scalar, int _Options> struct quat_product
+template<int Arch, class Derived1, class Derived2, typename Scalar> struct quat_product
{
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
return Quaternion<Scalar>
@@ -446,8 +451,7 @@ QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) c
EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
return internal::quat_product<Architecture::Target, Derived, OtherDerived,
- typename internal::traits<Derived>::Scalar,
- EIGEN_PLAIN_ENUM_MIN(internal::traits<Derived>::Alignment, internal::traits<OtherDerived>::Alignment)>::run(*this, other);
+ typename internal::traits<Derived>::Scalar>::run(*this, other);
}
/** \sa operator*(Quaternion) */
@@ -672,7 +676,7 @@ EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar>
// Generic conjugate of a Quaternion
namespace internal {
-template<int Arch, class Derived, typename Scalar, int _Options> struct quat_conj
+template<int Arch, class Derived, typename Scalar> struct quat_conj
{
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived>& q){
return Quaternion<Scalar>(q.w(),-q.x(),-q.y(),-q.z());
@@ -691,8 +695,7 @@ EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar>
QuaternionBase<Derived>::conjugate() const
{
return internal::quat_conj<Architecture::Target, Derived,
- typename internal::traits<Derived>::Scalar,
- internal::traits<Derived>::Alignment>::run(*this);
+ typename internal::traits<Derived>::Scalar>::run(*this);
}
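A hedged sketch of what the new return types allow: coefficient access keeps working on quaternion expressions whose coefficients are not writable lvalues, such as a read-only Map, while plain quaternions still hand out writable references:

#include <Eigen/Geometry>

int main()
{
  double buf[4] = {0.0, 0.0, 0.0, 1.0};               // x, y, z, w
  Eigen::Map<const Eigen::Quaterniond> q(buf);        // read-only expression
  double w = q.w();                                    // CoeffReturnType: read-only access
  Eigen::Quaterniond p = Eigen::Quaterniond::Identity();
  p.x() = 0.5;                                         // NonConstCoeffReturnType: writable Scalar&
  return (w > 0.0 && p.x() > 0.0) ? 0 : 1;
}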
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index 1a86ff837..f68cab583 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -16,17 +16,23 @@ namespace Eigen {
namespace internal {
template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned16>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
{
+ enum {
+ AAlignment = traits<Derived>::Alignment,
+ BAlignment = traits<OtherDerived>::Alignment,
+ ResAlignment = traits<Quaternion<float> >::Alignment
+ };
static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
{
Quaternion<float> res;
const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
- __m128 a = _a.coeffs().template packet<Aligned16>(0);
- __m128 b = _b.coeffs().template packet<Aligned16>(0);
+ __m128 a = _a.coeffs().template packet<AAlignment>(0);
+ __m128 b = _b.coeffs().template packet<BAlignment>(0);
__m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
__m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
- pstore(&res.x(),
+ pstoret<float,Packet4f,ResAlignment>(
+ &res.x(),
_mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
_mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
vec4f_swizzle1(b,1,2,0,0))),
@@ -36,14 +42,17 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned16>
}
};
-template<class Derived, int Alignment>
-struct quat_conj<Architecture::SSE, Derived, float, Alignment>
+template<class Derived>
+struct quat_conj<Architecture::SSE, Derived, float>
{
+ enum {
+ ResAlignment = traits<Quaternion<float> >::Alignment
+ };
static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
{
Quaternion<float> res;
const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
- pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<Alignment>(0)));
+ pstoret<float,Packet4f,ResAlignment>(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
return res;
}
};
@@ -52,6 +61,9 @@ struct quat_conj<Architecture::SSE, Derived, float, Alignment>
template<typename VectorLhs,typename VectorRhs>
struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
{
+ enum {
+ ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
+ };
static inline typename plain_matrix_type<VectorLhs>::type
run(const VectorLhs& lhs, const VectorRhs& rhs)
{
@@ -60,7 +72,7 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
__m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
__m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
typename plain_matrix_type<VectorLhs>::type res;
- pstore(&res.x(),_mm_sub_ps(mul1,mul2));
+ pstoret<float,Packet4f,ResAlignment>(&res.x(),_mm_sub_ps(mul1,mul2));
return res;
}
};
@@ -68,9 +80,14 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
-template<class Derived, class OtherDerived, int Alignment>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, double>
{
+ enum {
+ BAlignment = traits<OtherDerived>::Alignment,
+ ResAlignment = traits<Quaternion<double> >::Alignment
+ };
+
static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
{
const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
@@ -78,8 +95,8 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
Quaternion<double> res;
const double* a = _a.coeffs().data();
- Packet2d b_xy = _b.coeffs().template packet<Alignment>(0);
- Packet2d b_zw = _b.coeffs().template packet<Alignment>(2);
+ Packet2d b_xy = _b.coeffs().template packet<BAlignment>(0);
+ Packet2d b_zw = _b.coeffs().template packet<BAlignment>(2);
Packet2d a_xx = pset1<Packet2d>(a[0]);
Packet2d a_yy = pset1<Packet2d>(a[1]);
Packet2d a_zz = pset1<Packet2d>(a[2]);
@@ -97,9 +114,9 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
- pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
+ pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
#else
- pstore(&res.x(), padd(t1, pxor(mask,preverse(t2))));
+ pstoret<double,Packet2d,ResAlignment>(&res.x(), padd(t1, pxor(mask,preverse(t2))));
#endif
/*
@@ -111,25 +128,28 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
- pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
+ pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
#else
- pstore(&res.z(), psub(t1, pxor(mask,preverse(t2))));
+ pstoret<double,Packet2d,ResAlignment>(&res.z(), psub(t1, pxor(mask,preverse(t2))));
#endif
return res;
}
};
-template<class Derived, int Alignment>
-struct quat_conj<Architecture::SSE, Derived, double, Alignment>
+template<class Derived>
+struct quat_conj<Architecture::SSE, Derived, double>
{
+ enum {
+ ResAlignment = traits<Quaternion<double> >::Alignment
+ };
static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
{
Quaternion<double> res;
const __m128d mask0 = _mm_setr_pd(-0.,-0.);
const __m128d mask2 = _mm_setr_pd(-0.,0.);
- pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<Alignment>(0)));
- pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<Alignment>(2)));
+ pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
+ pstoret<double,Packet2d,ResAlignment>(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<traits<Derived>::Alignment>(2)));
return res;
}
};
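A hedged sketch of the situation these changes address: the SSE quaternion product and conjugate no longer assume 16-byte-aligned operands, so maps over arbitrarily placed buffers are handled; loads use each operand's own alignment and the store uses the result's.

#include <Eigen/Geometry>

int main()
{
  float buf[9] = {0.f,0.f,0.f,1.f, 0.f, 0.f,0.f,0.f,1.f};
  Eigen::Map<const Eigen::Quaternionf> a(buf);         // identity quaternion at buf[0..3]
  Eigen::Map<const Eigen::Quaternionf> b(buf + 5);     // identity at buf[5..8], generally not 16-byte aligned
  Eigen::Quaternionf c = a * b;
  Eigen::Quaternionf d = b.conjugate();
  return (c.w() > 0.f && d.w() > 0.f) ? 0 : 1;
}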
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 358444aff..f66c846ef 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -152,13 +152,28 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
{
// Compute the inverse squared-norm of each column of mat
m_invdiag.resize(mat.cols());
- for(Index j=0; j<mat.outerSize(); ++j)
+ if(MatType::IsRowMajor)
{
- RealScalar sum = mat.innerVector(j).squaredNorm();
- if(sum>0)
- m_invdiag(j) = RealScalar(1)/sum;
- else
- m_invdiag(j) = RealScalar(1);
+ m_invdiag.setZero();
+ for(Index j=0; j<mat.outerSize(); ++j)
+ {
+ for(typename MatType::InnerIterator it(mat,j); it; ++it)
+ m_invdiag(it.index()) += numext::abs2(it.value());
+ }
+ for(Index j=0; j<mat.cols(); ++j)
+ if(numext::real(m_invdiag(j))>RealScalar(0))
+ m_invdiag(j) = RealScalar(1)/numext::real(m_invdiag(j));
+ }
+ else
+ {
+ for(Index j=0; j<mat.outerSize(); ++j)
+ {
+ RealScalar sum = mat.col(j).squaredNorm();
+ if(sum>RealScalar(0))
+ m_invdiag(j) = RealScalar(1)/sum;
+ else
+ m_invdiag(j) = RealScalar(1);
+ }
}
Base::m_isInitialized = true;
return *this;
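A hedged stand-alone sketch of the new row-major branch above: the squared column norms are accumulated in a single pass over the rows, then inverted, leaving empty columns at zero exactly as in the patch.

#include <Eigen/SparseCore>

Eigen::VectorXd inverseSquaredColumnNorms(const Eigen::SparseMatrix<double,Eigen::RowMajor>& mat)
{
  Eigen::VectorXd invdiag = Eigen::VectorXd::Zero(mat.cols());
  for(Eigen::Index j = 0; j < mat.outerSize(); ++j)    // outer direction = rows for a row-major matrix
    for(Eigen::SparseMatrix<double,Eigen::RowMajor>::InnerIterator it(mat, j); it; ++it)
      invdiag(it.index()) += Eigen::numext::abs2(it.value());   // it.index() is the column index
  for(Eigen::Index j = 0; j < mat.cols(); ++j)
    if(invdiag(j) > 0.0)
      invdiag(j) = 1.0 / invdiag(j);
  return invdiag;
}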
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index d25af8e90..af1228cb8 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -37,17 +37,20 @@ template<typename Scalar> class JacobiRotation
typedef typename NumTraits<Scalar>::Real RealScalar;
/** Default constructor without any initialization. */
+ EIGEN_DEVICE_FUNC
JacobiRotation() {}
/** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
+ EIGEN_DEVICE_FUNC
JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
- Scalar& c() { return m_c; }
- Scalar c() const { return m_c; }
- Scalar& s() { return m_s; }
- Scalar s() const { return m_s; }
+ EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
+ EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
+ EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
+ EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
/** Concatenates two planar rotation */
+ EIGEN_DEVICE_FUNC
JacobiRotation operator*(const JacobiRotation& other)
{
using numext::conj;
@@ -56,19 +59,26 @@ template<typename Scalar> class JacobiRotation
}
/** Returns the transposed transformation */
+ EIGEN_DEVICE_FUNC
JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
/** Returns the adjoint transformation */
+ EIGEN_DEVICE_FUNC
JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
template<typename Derived>
+ EIGEN_DEVICE_FUNC
bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
+ EIGEN_DEVICE_FUNC
bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
+ EIGEN_DEVICE_FUNC
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
protected:
+ EIGEN_DEVICE_FUNC
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
+ EIGEN_DEVICE_FUNC
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
Scalar m_c, m_s;
@@ -264,6 +274,7 @@ namespace internal {
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
*/
template<typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC
void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
}
@@ -298,132 +309,162 @@ inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiR
}
namespace internal {
-template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
-{
- typedef typename VectorX::Scalar Scalar;
- enum { PacketSize = packet_traits<Scalar>::size };
- typedef typename packet_traits<Scalar>::type Packet;
- eigen_assert(xpr_x.size() == xpr_y.size());
- Index size = xpr_x.size();
- Index incrx = xpr_x.derived().innerStride();
- Index incry = xpr_y.derived().innerStride();
- Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
- Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
-
- OtherScalar c = j.c();
- OtherScalar s = j.s();
- if (c==OtherScalar(1) && s==OtherScalar(0))
- return;
-
- /*** dynamic-size vectorized paths ***/
+template<typename Scalar, typename OtherScalar,
+ int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
+struct apply_rotation_in_the_plane_selector
+{
+ static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
+ {
+ for(Index i=0; i<size; ++i)
+ {
+ Scalar xi = *x;
+ Scalar yi = *y;
+ *x = c * xi + numext::conj(s) * yi;
+ *y = -s * xi + numext::conj(c) * yi;
+ x += incrx;
+ y += incry;
+ }
+ }
+};
- if(VectorX::SizeAtCompileTime == Dynamic &&
- (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
- ((incrx==1 && incry==1) || PacketSize == 1))
+template<typename Scalar, typename OtherScalar,
+ int SizeAtCompileTime, int MinAlignment>
+struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,true /* vectorizable */>
+{
+ static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
{
- // both vectors are sequentially stored in memory => vectorization
- enum { Peeling = 2 };
+ enum {
+ PacketSize = packet_traits<Scalar>::size,
+ OtherPacketSize = packet_traits<OtherScalar>::size
+ };
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<OtherScalar>::type OtherPacket;
+
+ /*** dynamic-size vectorized paths ***/
+ if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1))
+ {
+ // both vectors are sequentially stored in memory => vectorization
+ enum { Peeling = 2 };
- Index alignedStart = internal::first_default_aligned(y, size);
- Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
+ Index alignedStart = internal::first_default_aligned(y, size);
+ Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
- const Packet pc = pset1<Packet>(c);
- const Packet ps = pset1<Packet>(s);
- conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
+ const OtherPacket pc = pset1<OtherPacket>(c);
+ const OtherPacket ps = pset1<OtherPacket>(s);
+ conj_helper<OtherPacket,Packet,NumTraits<OtherScalar>::IsComplex,false> pcj;
+ conj_helper<OtherPacket,Packet,false,false> pm;
- for(Index i=0; i<alignedStart; ++i)
- {
- Scalar xi = x[i];
- Scalar yi = y[i];
- x[i] = c * xi + numext::conj(s) * yi;
- y[i] = -s * xi + numext::conj(c) * yi;
- }
+ for(Index i=0; i<alignedStart; ++i)
+ {
+ Scalar xi = x[i];
+ Scalar yi = y[i];
+ x[i] = c * xi + numext::conj(s) * yi;
+ y[i] = -s * xi + numext::conj(c) * yi;
+ }
- Scalar* EIGEN_RESTRICT px = x + alignedStart;
- Scalar* EIGEN_RESTRICT py = y + alignedStart;
+ Scalar* EIGEN_RESTRICT px = x + alignedStart;
+ Scalar* EIGEN_RESTRICT py = y + alignedStart;
- if(internal::first_default_aligned(x, size)==alignedStart)
- {
- for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+ if(internal::first_default_aligned(x, size)==alignedStart)
{
- Packet xi = pload<Packet>(px);
- Packet yi = pload<Packet>(py);
- pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
- px += PacketSize;
- py += PacketSize;
+ for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+ {
+ Packet xi = pload<Packet>(px);
+ Packet yi = pload<Packet>(py);
+ pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ px += PacketSize;
+ py += PacketSize;
+ }
}
- }
- else
- {
- Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
- for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
+ else
{
- Packet xi = ploadu<Packet>(px);
- Packet xi1 = ploadu<Packet>(px+PacketSize);
- Packet yi = pload <Packet>(py);
- Packet yi1 = pload <Packet>(py+PacketSize);
- pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1)));
- pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
- pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1)));
- px += Peeling*PacketSize;
- py += Peeling*PacketSize;
+ Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
+ for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
+ {
+ Packet xi = ploadu<Packet>(px);
+ Packet xi1 = ploadu<Packet>(px+PacketSize);
+ Packet yi = pload <Packet>(py);
+ Packet yi1 = pload <Packet>(py+PacketSize);
+ pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1)));
+ pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1)));
+ px += Peeling*PacketSize;
+ py += Peeling*PacketSize;
+ }
+ if(alignedEnd!=peelingEnd)
+ {
+ Packet xi = ploadu<Packet>(x+peelingEnd);
+ Packet yi = pload <Packet>(y+peelingEnd);
+ pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ }
}
- if(alignedEnd!=peelingEnd)
+
+ for(Index i=alignedEnd; i<size; ++i)
{
- Packet xi = ploadu<Packet>(x+peelingEnd);
- Packet yi = pload <Packet>(y+peelingEnd);
- pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
+ Scalar xi = x[i];
+ Scalar yi = y[i];
+ x[i] = c * xi + numext::conj(s) * yi;
+ y[i] = -s * xi + numext::conj(c) * yi;
}
}
- for(Index i=alignedEnd; i<size; ++i)
+ /*** fixed-size vectorized path ***/
+ else if(SizeAtCompileTime != Dynamic && MinAlignment>0) // FIXME should be compared to the required alignment
{
- Scalar xi = x[i];
- Scalar yi = y[i];
- x[i] = c * xi + numext::conj(s) * yi;
- y[i] = -s * xi + numext::conj(c) * yi;
+ const OtherPacket pc = pset1<OtherPacket>(c);
+ const OtherPacket ps = pset1<OtherPacket>(s);
+ conj_helper<OtherPacket,Packet,NumTraits<OtherScalar>::IsComplex,false> pcj;
+ conj_helper<OtherPacket,Packet,false,false> pm;
+ Scalar* EIGEN_RESTRICT px = x;
+ Scalar* EIGEN_RESTRICT py = y;
+ for(Index i=0; i<size; i+=PacketSize)
+ {
+ Packet xi = pload<Packet>(px);
+ Packet yi = pload<Packet>(py);
+ pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ px += PacketSize;
+ py += PacketSize;
+ }
}
- }
- /*** fixed-size vectorized path ***/
- else if(VectorX::SizeAtCompileTime != Dynamic &&
- (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
- (EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment)>0)) // FIXME should be compared to the required alignment
- {
- const Packet pc = pset1<Packet>(c);
- const Packet ps = pset1<Packet>(s);
- conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
- Scalar* EIGEN_RESTRICT px = x;
- Scalar* EIGEN_RESTRICT py = y;
- for(Index i=0; i<size; i+=PacketSize)
+ /*** non-vectorized path ***/
+ else
{
- Packet xi = pload<Packet>(px);
- Packet yi = pload<Packet>(py);
- pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
- px += PacketSize;
- py += PacketSize;
+ apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,false>::run(x,incrx,y,incry,size,c,s);
}
}
+};
- /*** non-vectorized path ***/
- else
- {
- for(Index i=0; i<size; ++i)
- {
- Scalar xi = *x;
- Scalar yi = *y;
- *x = c * xi + numext::conj(s) * yi;
- *y = -s * xi + numext::conj(c) * yi;
- x += incrx;
- y += incry;
- }
- }
+template<typename VectorX, typename VectorY, typename OtherScalar>
+void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
+{
+ typedef typename VectorX::Scalar Scalar;
+ const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit)
+ && (int(packet_traits<Scalar>::size) == int(packet_traits<OtherScalar>::size));
+
+ eigen_assert(xpr_x.size() == xpr_y.size());
+ Index size = xpr_x.size();
+ Index incrx = xpr_x.derived().innerStride();
+ Index incry = xpr_y.derived().innerStride();
+
+ Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
+ Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
+
+ OtherScalar c = j.c();
+ OtherScalar s = j.s();
+ if (c==OtherScalar(1) && s==OtherScalar(0))
+ return;
+
+ apply_rotation_in_the_plane_selector<
+ Scalar,OtherScalar,
+ VectorX::SizeAtCompileTime,
+ EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment),
+ Vectorizable>::run(x,incrx,y,incry,size,c,s);
}
} // end namespace internal
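The refactoring above routes every call through apply_rotation_in_the_plane_selector, but the per-coefficient update is unchanged: x <- c*x + conj(s)*y, y <- -s*x + conj(c)*y. A minimal usage sketch of the public API that ends up in this code path (standalone example, not part of the patch):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 6);
  Eigen::JacobiRotation<double> G;
  // Build a Givens rotation that annihilates A(1,0) against A(0,0).
  G.makeGivens(A(0, 0), A(1, 0));
  // Rows 0 and 1 are updated as x <- c*x + conj(s)*y, y <- -s*x + conj(c)*y,
  // which is exactly what internal::apply_rotation_in_the_plane vectorizes.
  A.applyOnTheLeft(0, 1, G.adjoint());
  std::cout << A(1, 0) << "\n";  // ~0 up to rounding
  return 0;
}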
diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 933cd564b..da85b4d6e 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -1004,7 +1004,7 @@ static IndexType find_ordering /* return the number of garbage collections */
COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
/* get pivot column from head of minimum degree list */
- while (head [min_score] == COLAMD_EMPTY && min_score < n_col)
+ while (min_score < n_col && head [min_score] == COLAMD_EMPTY)
{
min_score++ ;
}
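The reordered condition above relies on short-circuit evaluation: the bound must be tested before head[min_score] is read, otherwise the loop reads one element past the end when every bucket is empty. A standalone illustration of the pattern (hypothetical helper, not Eigen code):

#include <vector>

int first_nonempty(const std::vector<int>& head, int n_col)
{
  const int EMPTY = -1;            // plays the role of COLAMD_EMPTY
  int min_score = 0;
  // Bound first, access second: head[min_score] is never evaluated with min_score == n_col.
  while (min_score < n_col && head[min_score] == EMPTY)
    ++min_score;
  return min_score;                // == n_col when all buckets are empty
}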
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index d35395d04..5270eaca2 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -505,8 +505,8 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k);
}
- RealScalar threshold_helper = numext::abs2<Scalar>(m_colNormsUpdated.maxCoeff() * NumTraits<Scalar>::epsilon()) / RealScalar(rows);
- RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<Scalar>::epsilon());
+ RealScalar threshold_helper = numext::abs2<RealScalar>(m_colNormsUpdated.maxCoeff() * NumTraits<RealScalar>::epsilon()) / RealScalar(rows);
+ RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<RealScalar>::epsilon());
m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
m_maxpivot = RealScalar(0);
@@ -552,12 +552,12 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
// http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
// and used in LAPACK routines xGEQPF and xGEQP3.
// See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html
- if (m_colNormsUpdated.coeffRef(j) != 0) {
+ if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) {
RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);
- temp = temp < 0 ? 0 : temp;
- RealScalar temp2 = temp * numext::abs2<Scalar>(m_colNormsUpdated.coeffRef(j) /
- m_colNormsDirect.coeffRef(j));
+ temp = temp < RealScalar(0) ? RealScalar(0) : temp;
+ RealScalar temp2 = temp * numext::abs2<RealScalar>(m_colNormsUpdated.coeffRef(j) /
+ m_colNormsDirect.coeffRef(j));
if (temp2 <= norm_downdate_threshold) {
// The updated norm has become too inaccurate so re-compute the column
// norm directly.
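For context, the downdating rule touched above (LAPACK Working Note 176) cheaply updates each remaining column norm after a Householder step and falls back to a direct recomputation when the update has lost too much accuracy. A standalone sketch of that rule under assumed names (norm_updated, norm_direct and pivot_row_entry are illustrative, not Eigen members):

#include <cmath>
#include <limits>

template <typename RealScalar>
bool downdate_column_norm(RealScalar& norm_updated, RealScalar norm_direct, RealScalar pivot_row_entry)
{
  const RealScalar threshold = std::sqrt(std::numeric_limits<RealScalar>::epsilon());
  RealScalar temp = std::abs(pivot_row_entry) / norm_updated;
  temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);   // 1 - temp^2, written to limit cancellation
  if (temp < RealScalar(0)) temp = RealScalar(0);
  RealScalar ratio = norm_updated / norm_direct;
  if (temp * ratio * ratio <= threshold)
    return false;                        // too inaccurate: caller recomputes the norm directly
  norm_updated *= std::sqrt(temp);       // cheap downdate
  return true;
}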
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 25fca6f4d..0abd4c1bb 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -11,7 +11,7 @@
// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
// Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
-// Copyright (C) 2014-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,6 +22,11 @@
// #define EIGEN_BDCSVD_DEBUG_VERBOSE
// #define EIGEN_BDCSVD_SANITY_CHECKS
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+#undef eigen_internal_assert
+#define eigen_internal_assert(X) assert(X);
+#endif
+
namespace Eigen {
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
@@ -77,6 +82,7 @@ public:
typedef _MatrixType MatrixType;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+ typedef typename NumTraits<RealScalar>::Literal Literal;
enum {
RowsAtCompileTime = MatrixType::RowsAtCompileTime,
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -259,7 +265,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
//**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
RealScalar scale = matrix.cwiseAbs().maxCoeff();
- if(scale==RealScalar(0)) scale = RealScalar(1);
+ if(scale==Literal(0)) scale = Literal(1);
MatrixX copy;
if (m_isTranspose) copy = matrix.adjoint()/scale;
else copy = matrix/scale;
@@ -351,13 +357,13 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
Index k1=0, k2=0;
for(Index j=0; j<n; ++j)
{
- if( (A.col(j).head(n1).array()!=0).any() )
+ if( (A.col(j).head(n1).array()!=Literal(0)).any() )
{
A1.col(k1) = A.col(j).head(n1);
B1.row(k1) = B.row(j);
++k1;
}
- if( (A.col(j).tail(n2).array()!=0).any() )
+ if( (A.col(j).tail(n2).array()!=Literal(0)).any() )
{
A2.col(k2) = A.col(j).tail(n2);
B2.row(k2) = B.row(j);
@@ -449,11 +455,11 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
l = m_naiveU.row(1).segment(firstCol, k);
f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
}
- if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
+ if (m_compV) m_naiveV(firstRowW+k, firstColW) = Literal(1);
if (r0<considerZero)
{
- c0 = 1;
- s0 = 0;
+ c0 = Literal(1);
+ s0 = Literal(0);
}
else
{
@@ -574,7 +580,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
ArrayRef diag = m_workspace.head(n);
- diag(0) = 0;
+ diag(0) = Literal(0);
// Allocate space for singular values and vectors
singVals.resize(n);
@@ -590,7 +596,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
// but others are interleaved and we must ignore them at this stage.
// To this end, let's compute a permutation skipping them:
Index actual_n = n;
- while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
+ while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); }
Index m = 0; // size of the deflated problem
for(Index k=0;k<actual_n;++k)
if(abs(col0(k))>considerZero)
@@ -617,13 +623,11 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
std::cout << " shift: " << shifts.transpose() << "\n";
{
- Index actual_n = n;
- while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
std::cout << "\n\n mus: " << mus.head(actual_n).transpose() << "\n\n";
std::cout << " check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
+ assert((((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n) >= 0).all());
std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
- std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n";
- std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n";
+ assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all());
}
#endif
@@ -651,13 +655,13 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
#endif
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
- assert(U.allFinite());
- assert(V.allFinite());
- assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n);
- assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n);
assert(m_naiveU.allFinite());
assert(m_naiveV.allFinite());
assert(m_computed.allFinite());
+ assert(U.allFinite());
+ assert(V.allFinite());
+// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
+// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
#endif
// Because of deflation, the singular values might not be completely sorted.
@@ -672,6 +676,15 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
if(m_compV) V.col(i).swap(V.col(i+1));
}
}
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ {
+ bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all();
+ if(!singular_values_sorted)
+ std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n";
+ assert(singular_values_sorted);
+ }
+#endif
// Reverse order so that singular values are in increasing order
// Because of deflation, the zero singular values are already at the end
@@ -691,11 +704,13 @@ template <typename MatrixType>
typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift)
{
Index m = perm.size();
- RealScalar res = 1;
+ RealScalar res = Literal(1);
for(Index i=0; i<m; ++i)
{
Index j = perm(i);
- res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
+ // The following expression could be rewritten to involve only a single division,
+ // but this would make the expression more sensitive to overflow.
+ res += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu));
}
return res;
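The comment in the new expression is the whole point of the change: squaring col0(j) first can overflow even when the final term is tiny, while dividing first keeps every intermediate on the order of the result. A standalone illustration with plain doubles:

#include <iostream>

int main()
{
  double z = 1e200, a = 1e200, b = 1e200;     // z ~ col0(j), a ~ diagShifted(j)-mu, b ~ diag(j)+shift+mu
  double naive    = (z * z) / (a * b);        // z*z and a*b both overflow to inf -> inf/inf = nan
  double factored = (z / a) * (z / b);        // stays finite: 1.0
  std::cout << naive << " vs " << factored << "\n";
  return 0;
}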
@@ -707,19 +722,22 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
{
using std::abs;
using std::swap;
+ using std::sqrt;
Index n = col0.size();
Index actual_n = n;
- while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+ // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above
+ // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value.
+ while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n;
for (Index k = 0; k < n; ++k)
{
- if (col0(k) == 0 || actual_n==1)
+ if (col0(k) == Literal(0) || actual_n==1)
{
// if col0(k) == 0, then entry is deflated, so singular value is on diagonal
// if actual_n==1, then the deflated problem is already diagonalized
singVals(k) = k==0 ? col0(0) : diag(k);
- mus(k) = 0;
+ mus(k) = Literal(0);
shifts(k) = k==0 ? col0(0) : diag(k);
continue;
}
@@ -731,31 +749,36 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
right = (diag(actual_n-1) + col0.matrix().norm());
else
{
- // Skip deflated singular values
+ // Skip deflated singular values,
+ // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside.
+ // This should be equivalent to using perm[]
Index l = k+1;
- while(col0(l)==0) { ++l; eigen_internal_assert(l<actual_n); }
+ while(col0(l)==Literal(0)) { ++l; eigen_internal_assert(l<actual_n); }
right = diag(l);
}
// first decide whether it's closer to the left end or the right end
- RealScalar mid = left + (right-left) / 2;
- RealScalar fMid = secularEq(mid, col0, diag, perm, diag, 0);
+ RealScalar mid = left + (right-left) / Literal(2);
+ RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
- std::cout << right-left << "\n";
- std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right) << "\n";
- std::cout << " = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.2*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.3*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.4*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.49*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.5*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.51*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.6*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.7*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n";
+ std::cout << "right-left = " << right-left << "\n";
+// std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left)
+// << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right) << "\n";
+ std::cout << " = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.1) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.2) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.3) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.4) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.49) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.5) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.51) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.6) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.7) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.8) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.9) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n";
#endif
- RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right;
+ RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right;
// measure everything relative to shift
Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
@@ -785,26 +808,29 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
// rational interpolation: fit a function of the form a / mu + b through the two previous
// iterates and use its zero to compute the next iterate
- bool useBisection = fPrev*fCur>0;
- while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
+ bool useBisection = fPrev*fCur>Literal(0);
+ while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
{
++m_numIters;
// Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
- RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev);
+ RealScalar a = (fCur - fPrev) / (Literal(1)/muCur - Literal(1)/muPrev);
RealScalar b = fCur - a / muCur;
// And find mu such that f(mu)==0:
RealScalar muZero = -a/b;
RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert((std::isfinite)(fZero));
+#endif
muPrev = muCur;
fPrev = fCur;
muCur = muZero;
fCur = fZero;
-
- if (shift == left && (muCur < 0 || muCur > right - left)) useBisection = true;
- if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true;
+ if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
+ if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
if (abs(fCur)>abs(fPrev)) useBisection = true;
}
@@ -817,37 +843,59 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
RealScalar leftShifted, rightShifted;
if (shift == left)
{
- leftShifted = (std::numeric_limits<RealScalar>::min)();
+ // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)),
+ // the factor 2 is to be more conservative
+ leftShifted = numext::maxi<RealScalar>( (std::numeric_limits<RealScalar>::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+
+ // check that we did it right:
+ eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) );
// I don't understand why the case k==0 would be special there:
- // if (k == 0) rightShifted = right - left; else
- rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe
+ // if (k == 0) rightShifted = right - left; else
+ rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe
}
else
{
- leftShifted = -(right - left) * RealScalar(0.6);
- rightShifted = -(std::numeric_limits<RealScalar>::min)();
+ leftShifted = -(right - left) * RealScalar(0.51);
+ if(k+1<n)
+ rightShifted = -numext::maxi<RealScalar>( (std::numeric_limits<RealScalar>::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+ else
+ rightShifted = -(std::numeric_limits<RealScalar>::min)();
}
-
+
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
-#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
+#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS
RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
#endif
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(!(std::isfinite)(fLeft))
+ std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n";
+ assert((std::isfinite)(fLeft));
+
+ if(!(std::isfinite)(fRight))
+ std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
+// assert((std::isfinite)(fRight));
+#endif
+
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(!(fLeft * fRight<0))
{
- std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
- std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n";
+ std::cout << "f(leftShifted) using leftShifted=" << leftShifted << " ; diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; "
+ << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n";
+ std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; "
+ << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) << " == " << secularEq(right, col0, diag, perm, diag, 0) << "\n";
}
#endif
- eigen_internal_assert(fLeft * fRight < 0);
+ eigen_internal_assert(fLeft * fRight < Literal(0));
- while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
+ while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
{
- RealScalar midShifted = (leftShifted + rightShifted) / 2;
+ RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
- if (fLeft * fMid < 0)
+ eigen_internal_assert((numext::isfinite)(fMid));
+
+ if (fLeft * fMid < Literal(0))
{
rightShifted = midShifted;
}
@@ -858,13 +906,22 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
}
}
- muCur = (leftShifted + rightShifted) / 2;
+ muCur = (leftShifted + rightShifted) / Literal(2);
}
singVals[k] = shift + muCur;
shifts[k] = shift;
mus[k] = muCur;
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+ if(k+1<n)
+ std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. " << diag(k+1) << "\n";
+#endif
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert(k==0 || singVals[k]>=singVals[k-1]);
+ assert(singVals[k]>=diag(k));
+#endif
+
// perturb singular value slightly if it equals diagonal entry to avoid division by zero later
// (deflation is supposed to avoid this from happening)
// - this does not seem to be necessary anymore -
@@ -892,21 +949,49 @@ void BDCSVD<MatrixType>::perturbCol0
// The offset makes it possible to skip deflated entries while computing zhat
for (Index k = 0; k < n; ++k)
{
- if (col0(k) == 0) // deflated
- zhat(k) = 0;
+ if (col0(k) == Literal(0)) // deflated
+ zhat(k) = Literal(0);
else
{
// see equation (3.6)
RealScalar dk = diag(k);
RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(prod<0) {
+ std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n";
+ std::cout << "prod = " << "(" << singVals(last) << " + " << dk << ") * (" << mus(last) << " + (" << shifts(last) << " - " << dk << "))" << "\n";
+ std::cout << " = " << singVals(last) + dk << " * " << mus(last) + (shifts(last) - dk) << "\n";
+ }
+ assert(prod>=0);
+#endif
for(Index l = 0; l<m; ++l)
{
Index i = perm(l);
if(i!=k)
{
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(i>=k && (l==0 || l-1>=m))
+ {
+ std::cout << "Error in perturbCol0\n";
+ std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n";
+ std::cout << " " <<diag(i) << "\n";
+ Index j = (i<k /*|| l==0*/) ? i : perm(l-1);
+ std::cout << " " << "j=" << j << "\n";
+ }
+#endif
Index j = i<k ? i : perm(l-1);
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(!(dk!=Literal(0) || diag(i)!=Literal(0)))
+ {
+ std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n";
+ }
+ assert(dk!=Literal(0) || diag(i)!=Literal(0));
+#endif
prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert(prod>=0);
+#endif
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
@@ -918,7 +1003,10 @@ void BDCSVD<MatrixType>::perturbCol0
std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
#endif
RealScalar tmp = sqrt(prod);
- zhat(k) = col0(k) > 0 ? tmp : -tmp;
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert((std::isfinite)(tmp));
+#endif
+ zhat(k) = col0(k) > Literal(0) ? tmp : -tmp;
}
}
}
@@ -934,7 +1022,7 @@ void BDCSVD<MatrixType>::computeSingVecs
for (Index k = 0; k < n; ++k)
{
- if (zhat(k) == 0)
+ if (zhat(k) == Literal(0))
{
U.col(k) = VectorType::Unit(n+1, k);
if (m_compV) V.col(k) = VectorType::Unit(n, k);
@@ -947,7 +1035,7 @@ void BDCSVD<MatrixType>::computeSingVecs
Index i = perm(l);
U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
}
- U(n,k) = 0;
+ U(n,k) = Literal(0);
U.col(k).normalize();
if (m_compV)
@@ -958,7 +1046,7 @@ void BDCSVD<MatrixType>::computeSingVecs
Index i = perm(l);
V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
}
- V(0,k) = -1;
+ V(0,k) = Literal(-1);
V.col(k).normalize();
}
}
@@ -979,15 +1067,15 @@ void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index
Index start = firstCol + shift;
RealScalar c = m_computed(start, start);
RealScalar s = m_computed(start+i, start);
- RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
- if (r == 0)
+ RealScalar r = numext::hypot(c,s);
+ if (r == Literal(0))
{
- m_computed(start+i, start+i) = 0;
+ m_computed(start+i, start+i) = Literal(0);
return;
}
m_computed(start,start) = r;
- m_computed(start+i, start) = 0;
- m_computed(start+i, start+i) = 0;
+ m_computed(start+i, start) = Literal(0);
+ m_computed(start+i, start+i) = Literal(0);
JacobiRotation<RealScalar> J(c/r,-s/r);
if (m_compU) m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J);
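Replacing sqrt(abs2(c)+abs2(s)) with numext::hypot(c,s) above avoids the same class of overflow: the squares can exceed the representable range even though the hypotenuse itself is representable. A standalone illustration with the standard-library equivalent:

#include <cmath>
#include <iostream>

int main()
{
  double c = 1e200, s = 1e200;
  std::cout << std::sqrt(c*c + s*s) << "\n";  // c*c overflows -> inf
  std::cout << std::hypot(c, s)     << "\n";  // ~1.414e200, computed without overflow
  return 0;
}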
@@ -1020,16 +1108,16 @@ void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index fi
<< m_computed(firstColm + i+1, firstColm+i+1) << " "
<< m_computed(firstColm + i+2, firstColm+i+2) << "\n";
#endif
- if (r==0)
+ if (r==Literal(0))
{
m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
return;
}
c/=r;
s/=r;
- m_computed(firstColm + i, firstColm) = r;
+ m_computed(firstColm + i, firstColm) = r;
m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
- m_computed(firstColm + j, firstColm) = 0;
+ m_computed(firstColm + j, firstColm) = Literal(0);
JacobiRotation<RealScalar> J(c,-s);
if (m_compU) m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J);
@@ -1053,7 +1141,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
- RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
+ RealScalar epsilon_coarse = Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
assert(m_naiveU.allFinite());
@@ -1081,7 +1169,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << " (diag(" << i << ")=" << diag(i) << ")\n";
#endif
- col0(i) = 0;
+ col0(i) = Literal(0);
}
//condition 4.3
@@ -1101,6 +1189,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#endif
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "to be sorted: " << diag.transpose() << "\n\n";
+ std::cout << " : " << col0.transpose() << "\n\n";
#endif
{
// Check for total deflation
@@ -1191,7 +1280,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
{
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
- std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*diag(i) << "\n";
+ std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*/*diag(i)*/maxDiag << "\n";
#endif
eigen_internal_assert(abs(diag(i) - diag(i-1))<epsilon_coarse && " diagonal entries are not properly sorted");
deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
@@ -1210,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#endif
}//end deflation
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC
/** \svd_module
*
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 0b1460894..11ac847e1 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -159,6 +159,8 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
traits<MatrixType>::Flags & RowMajorBit> > Y)
{
typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename NumTraits<RealScalar>::Literal Literal;
enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
typedef InnerStride<int(StorageOrder) == int(ColMajor) ? 1 : Dynamic> ColInnerStride;
typedef InnerStride<int(StorageOrder) == int(ColMajor) ? Dynamic : 1> RowInnerStride;
@@ -263,7 +265,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
SubMatType A10( A.block(bs,0, brows-bs,bs) );
SubMatType A01( A.block(0,bs, bs,bcols-bs) );
Scalar tmp = A01(bs-1,0);
- A01(bs-1,0) = 1;
+ A01(bs-1,0) = Literal(1);
A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint();
A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01;
A01(bs-1,0) = tmp;
diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h
index 8a5cc91f2..e0295f2af 100644
--- a/Eigen/src/SparseCore/AmbiVector.h
+++ b/Eigen/src/SparseCore/AmbiVector.h
@@ -94,7 +94,7 @@ class AmbiVector
Index allocSize = m_allocatedElements * sizeof(ListEl);
allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
Scalar* newBuffer = new Scalar[allocSize];
- memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
+ std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
delete[] m_buffer;
m_buffer = newBuffer;
}
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 492eb0a29..9db119b67 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -17,7 +17,9 @@ namespace internal {
template<typename Lhs, typename Rhs, typename ResultType>
static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false)
{
- typedef typename remove_all<Lhs>::type::Scalar Scalar;
+ typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+ typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+ typedef typename remove_all<ResultType>::type::Scalar ResScalar;
// make sure to call innerSize/outerSize since we fake the storage order.
Index rows = lhs.innerSize();
@@ -25,7 +27,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
eigen_assert(lhs.outerSize() == rhs.innerSize());
ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0);
- ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0);
+ ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0);
ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
std::memset(mask,0,sizeof(bool)*rows);
@@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
Index nnz = 0;
for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
{
- Scalar y = rhsIt.value();
+ RhsScalar y = rhsIt.value();
Index k = rhsIt.index();
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
{
Index i = lhsIt.index();
- Scalar x = lhsIt.value();
+ LhsScalar x = lhsIt.value();
if(!mask[i])
{
mask[i] = true;
@@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
- RowMajorMatrix rhsRow = rhs;
- RowMajorMatrix resRow(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
- res = resRow;
+ typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRhs;
+ typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+ RowMajorRhs rhsRow = rhs;
+ RowMajorRes resRow(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<RowMajorRhs,Lhs,RowMajorRes>(rhsRow, lhs, resRow);
+ res = resRow;
}
};
@@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
- RowMajorMatrix lhsRow = lhs;
- RowMajorMatrix resRow(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
+ typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorLhs;
+ typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+ RowMajorLhs lhsRow = lhs;
+ RowMajorRes resRow(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorLhs,RowMajorRes>(rhs, lhsRow, resRow);
res = resRow;
}
};
@@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix lhsCol = lhs;
- ColMajorMatrix resCol(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+ typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+ ColMajorLhs lhsCol = lhs;
+ ColMajorRes resCol(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<ColMajorLhs,Rhs,ColMajorRes>(lhsCol, rhs, resCol);
res = resCol;
}
};
@@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix rhsCol = rhs;
- ColMajorMatrix resCol(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+ typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+ ColMajorRhs rhsCol = rhs;
+ ColMajorRes resCol(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorRhs,ColMajorRes>(lhs, rhsCol, resCol);
res = resCol;
}
};
@@ -263,7 +269,8 @@ namespace internal {
template<typename Lhs, typename Rhs, typename ResultType>
static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef typename remove_all<Lhs>::type::Scalar Scalar;
+ typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+ typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
Index cols = rhs.outerSize();
eigen_assert(lhs.outerSize() == rhs.innerSize());
@@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs,
{
for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
{
- Scalar y = rhsIt.value();
+ RhsScalar y = rhsIt.value();
Index k = rhsIt.index();
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
{
Index i = lhsIt.index();
- Scalar x = lhsIt.value();
+ LhsScalar x = lhsIt.value();
res.coeffRef(i,j) += x * y;
}
}
@@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMa
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix lhsCol(lhs);
- internal::sparse_sparse_to_dense_product_impl<ColMajorMatrix,Rhs,ResultType>(lhsCol, rhs, res);
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+ ColMajorLhs lhsCol(lhs);
+ internal::sparse_sparse_to_dense_product_impl<ColMajorLhs,Rhs,ResultType>(lhsCol, rhs, res);
}
};
@@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMa
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix rhsCol(rhs);
- internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorMatrix,ResultType>(lhs, rhsCol, res);
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+ ColMajorRhs rhsCol(rhs);
+ internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorRhs,ResultType>(lhs, rhsCol, res);
}
};
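The per-operand typedefs (LhsScalar, RhsScalar, ResScalar) let the two factors of a sparse product carry different scalar types, with the temporary buffer and the result using the promoted type. Assuming that is what this patch series enables end to end, a usage sketch would look like:

#include <Eigen/SparseCore>
#include <complex>

int main()
{
  typedef std::complex<double> cd;
  Eigen::SparseMatrix<double> A(3, 3);
  Eigen::SparseMatrix<cd>     B(3, 3);
  A.insert(0, 0) = 2.0;
  B.insert(0, 0) = cd(1.0, 1.0);
  // LhsScalar = double, RhsScalar = std::complex<double>, ResScalar = std::complex<double>
  Eigen::SparseMatrix<cd> C = A * B;
  return 0;
}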
diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
index 18352a847..113463258 100644
--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
// eval without temporary
dst.resize(src.rows(), src.cols());
dst.setZero();
- dst.reserve((std::max)(src.rows(),src.cols())*2);
+ dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
for (Index j=0; j<outerEvaluationSize; ++j)
{
dst.startVec(j);
@@ -107,7 +107,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
DstXprType temp(src.rows(), src.cols());
- temp.reserve((std::max)(src.rows(),src.cols())*2);
+ temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
for (Index j=0; j<outerEvaluationSize; ++j)
{
temp.startVec(j);
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 9e39be738..5ab64f1a8 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -47,6 +47,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
enum {
Mode = _Mode,
+ TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0),
RowsAtCompileTime = internal::traits<SparseSelfAdjointView>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<SparseSelfAdjointView>::ColsAtCompileTime
};
@@ -368,7 +369,7 @@ struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, Pr
// transpose everything
Transpose<Dest> dstT(dst);
- internal::sparse_selfadjoint_time_dense_product<RhsView::Mode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+ internal::sparse_selfadjoint_time_dense_product<RhsView::TransposeMode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
}
};
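The new TransposeMode flag swaps Upper and Lower, which is what the dense * selfadjoint-sparse product needs: it is evaluated by transposing everything, and the transposed view must read the opposite triangle of the stored matrix. A small sketch of a call that exercises this path (standalone example, not from the patch):

#include <Eigen/SparseCore>

int main()
{
  Eigen::SparseMatrix<double> A(3, 3);
  A.insert(0, 1) = 2.0;                 // only the upper triangle is stored
  A.insert(1, 1) = 1.0;
  Eigen::MatrixXd d = Eigen::MatrixXd::Identity(3, 3);
  // Evaluated through the transposed expression, so Upper must become Lower there.
  Eigen::MatrixXd r = d * A.selfadjointView<Eigen::Upper>();
  return 0;
}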
diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index 21c419002..88820a48f 100644
--- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -21,7 +21,8 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
{
// return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
- typedef typename remove_all<Lhs>::type::Scalar Scalar;
+ typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+ typedef typename remove_all<ResultType>::type::Scalar ResScalar;
typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex;
// make sure to call innerSize/outerSize since we fake the storage order.
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
eigen_assert(lhs.outerSize() == rhs.innerSize());
// allocate a temporary buffer
- AmbiVector<Scalar,StorageIndex> tempVector(rows);
+ AmbiVector<ResScalar,StorageIndex> tempVector(rows);
// mimics a resizeByInnerOuter:
if(ResultType::IsRowMajor)
@@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
{
// FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
tempVector.restart();
- Scalar x = rhsIt.value();
+ RhsScalar x = rhsIt.value();
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt)
{
tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
}
}
res.startVec(j);
- for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
+ for (typename AmbiVector<ResScalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
res.insertBackByOuterInner(j,it.index()) = it.value();
}
res.finalize();
@@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector;
template<typename Lhs, typename Rhs, typename ResultType>
struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
{
- typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
@@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
ColMajorMatrixLhs colLhs(lhs);
ColMajorMatrixRhs colRhs(rhs);
internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
@@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
+ typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
RowMajorMatrixLhs rowLhs(lhs);
sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs,Rhs,ResultType,RowMajor,RowMajor>(rowLhs,rhs,res,tolerance);
}
@@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
+ typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
RowMajorMatrixRhs rowRhs(rhs);
sparse_sparse_product_with_pruning_selector<Lhs,RowMajorMatrixRhs,ResultType,RowMajor,RowMajor,RowMajor>(lhs,rowRhs,res,tolerance);
}
@@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
ColMajorMatrixRhs colRhs(rhs);
internal::sparse_sparse_product_with_pruning_impl<Lhs,ColMajorMatrixRhs,ResultType>(lhs, colRhs, res, tolerance);
}
@@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
ColMajorMatrixLhs colLhs(lhs);
internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,Rhs,ResultType>(colLhs, rhs, res, tolerance);
}
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 2d4498b03..f7111fe2e 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -327,7 +327,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
m_isEtreeOk = true;
- m_R.resize(m, n);
+ m_R.resize(diagSize, n);
m_Q.resize(m, diagSize);
// Allocate space for nonzero elements : rough estimation
diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
index 8c7e79b03..3d8e24f5a 100755
--- a/Eigen/src/misc/lapacke.h
+++ b/Eigen/src/misc/lapacke.h
@@ -43,10 +43,6 @@
#include "lapacke_config.h"
#endif
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
#include <stdlib.h>
#ifndef lapack_int
@@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im );
#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
#ifndef LAPACKE_malloc
#define LAPACKE_malloc( size ) malloc( size )
#endif
diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h
index 22c1666c5..a7ec63adf 100644
--- a/Eigen/src/plugins/IndexedViewMethods.h
+++ b/Eigen/src/plugins/IndexedViewMethods.h
@@ -55,9 +55,7 @@ ivcSize(const Indices& indices) const {
template<typename RowIndices, typename ColIndices>
struct valid_indexed_view_overload {
- // Here we use is_convertible to Index instead of is_integral in order to treat enums as Index.
- // In c++11 we could use is_integral<T> && is_enum<T> if is_convertible appears to be too permissive.
- enum { value = !(internal::is_convertible<RowIndices,Index>::value && internal::is_convertible<ColIndices,Index>::value) };
+ enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
};
public:
@@ -146,7 +144,7 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col
template<typename Indices>
typename internal::enable_if<
- IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)),
+ IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >::type
operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
{
@@ -157,7 +155,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
template<typename Indices>
typename internal::enable_if<
- (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)),
+ (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >::type
operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
{
@@ -168,7 +166,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
template<typename Indices>
typename internal::enable_if<
- (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_integral<Indices>::value) && (!Symbolic::is_symbolic<Indices>::value),
+ (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!Symbolic::is_symbolic<Indices>::value),
VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >::type
operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
{
@@ -250,6 +248,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
*
* For 1D vectors and arrays, it is better to use the operator()(const Indices&) overload, which behaves the same way but takes a single parameter.
*
+ * See also this <a href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a> and its answer for an example of how to duplicate coefficients.
+ *
* \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
*/
template<typename RowIndices, typename ColIndices>