author     Benoit Steiner <benoit.steiner.goog@gmail.com>    2015-04-01 23:24:11 -0700
committer  Benoit Steiner <benoit.steiner.goog@gmail.com>    2015-04-01 23:24:11 -0700
commit     74e558cfa88ac2f6ac556069545260c6aebb9caa (patch)
tree       662d79e5430823e2d99d1613162c1f1149d0ffc9
parent     731d7b84b4676ed444f4ceb525b637b4bc2e8b54 (diff)
parent     03a0df20100d2b89b38a70d3b0b7a15a4a44b5de (diff)
Pulled latest updates from trunk
52 files changed, 1071 insertions, 610 deletions
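Among the changes pulled in below are an in-place reverse for row-/column-wise expressions (VectorwiseOp::reverseInPlace) and a DenseBase::reverseInPlace that swaps halves instead of evaluating a full temporary. A minimal usage sketch, assuming the post-merge API (illustrative only, not part of the diff):

```cpp
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXi m(2, 3);
  m << 1, 2, 3,
       4, 5, 6;

  m.rowwise().reverseInPlace();   // added in this merge: reverse each row in place
  std::cout << m << "\n\n";       // 3 2 1
                                  // 6 5 4

  m.reverseInPlace();             // reworked to swap halves rather than build a temporary
  std::cout << m << "\n";         // 4 5 6
                                  // 1 2 3
}
```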
diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers index 7fab9eed0..f5fdcd9e5 100644 --- a/Eigen/IterativeLinearSolvers +++ b/Eigen/IterativeLinearSolvers @@ -17,7 +17,7 @@ * * These iterative solvers are associated with some preconditioners: * - IdentityPreconditioner - not really useful - * - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices. + * - DiagonalPreconditioner - also called Jacobi preconditioner, work very well on diagonal dominant matrices. * - IncompleteLUT - incomplete LU factorization with dual thresholding * * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport. diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 009fd845d..c7dfedae4 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -300,9 +300,10 @@ template<typename Derived> bool DenseBase<Derived>::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index j = 0; j < cols(); ++j) for(Index i = 0; i < rows(); ++i) - if(!internal::isApprox(this->coeff(i, j), val, prec)) + if(!internal::isApprox(self.coeff(i, j), val, prec)) return false; return true; } @@ -484,9 +485,10 @@ DenseBase<Derived>::Zero() template<typename Derived> bool DenseBase<Derived>::isZero(const RealScalar& prec) const { + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index j = 0; j < cols(); ++j) for(Index i = 0; i < rows(); ++i) - if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec)) + if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec)) return false; return true; } @@ -719,18 +721,19 @@ template<typename Derived> bool MatrixBase<Derived>::isIdentity (const RealScalar& prec) const { + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index j = 0; j < cols(); ++j) { for(Index i = 0; i < rows(); ++i) { if(i == j) { - if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec)) + if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec)) return false; } else { - if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec)) + if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec)) return false; } } diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index ab41641f4..8fcc83a5a 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -35,22 +35,22 @@ void check_static_allocation_size() } template<typename T, int Size, typename Packet = typename packet_traits<T>::type, - bool Match = bool((Size%unpacket_traits<Packet>::size)==0), - bool TryHalf = bool(int(unpacket_traits<Packet>::size) > Size) + bool Match = bool((Size%unpacket_traits<Packet>::size)==0), + bool TryHalf = bool(int(unpacket_traits<Packet>::size) > 1) && bool(int(unpacket_traits<Packet>::size) > int(unpacket_traits<typename unpacket_traits<Packet>::half>::size)) > struct compute_default_alignment { enum { value = 0 }; }; -template<typename T, int Size, typename Packet> -struct compute_default_alignment<T, Size, Packet, true, false> // Match +template<typename T, int Size, typename Packet, bool TryHalf> +struct compute_default_alignment<T, Size, Packet, true, TryHalf> // Match { enum { value = sizeof(T) * unpacket_traits<Packet>::size }; }; 
template<typename T, int Size, typename Packet> -struct compute_default_alignment<T, Size, Packet, false, true> +struct compute_default_alignment<T, Size, Packet, false, true> // Try-half { // current packet too large, try with an half-packet enum { value = compute_default_alignment<T, Size, typename unpacket_traits<Packet>::half>::value }; diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 68e9c2660..6228f71bd 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -224,13 +224,13 @@ bool MatrixBase<Derived>::isOrthogonal template<typename Derived> bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const { - typename Derived::Nested nested(derived()); + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index i = 0; i < cols(); ++i) { - if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec)) + if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec)) return false; for(Index j = 0; j < i; ++j) - if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec)) + if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec)) return false; } return true; diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e1b233d82..3c240c272 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -328,6 +328,7 @@ struct hypot_impl p = _y; qp = _x / p; } + if(p==RealScalar(0)) return RealScalar(0); return p * sqrt(RealScalar(1) + qp*qp); } }; diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index d84e7776b..22b5e024b 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -409,7 +409,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, - CoeffReadCost = (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic + CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost + : (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + (InnerSize - 1) * NumTraits<Scalar>::AddCost, @@ -484,7 +485,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, { PacketScalar res; typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor, - Unroll ? InnerSize-1 : Dynamic, + Unroll ? 
InnerSize : Dynamic, LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl; PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res); @@ -527,7 +528,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res); - res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); + res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res); } }; @@ -537,12 +538,12 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res); - res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res); + res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res); } }; template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> +struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { @@ -551,7 +552,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> }; template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> +struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { @@ -560,13 +561,30 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> }; template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> +{ + static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + { + res = pset1<Packet>(0); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> +{ + static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + { + res = pset1<Packet>(0); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { - eigen_assert(innerDim>0 && "you are using a non initialized matrix"); - res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); - for(Index i = 1; i < innerDim; ++i) + res = pset1<Packet>(0); + for(Index i = 0; i < innerDim; ++i) res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); } }; @@ -576,9 +594,8 @@ struct etor_product_packet_impl<ColMajor, 
Dynamic, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { - eigen_assert(innerDim>0 && "you are using a non initialized matrix"); - res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col))); - for(Index i = 1; i < innerDim; ++i) + res = pset1<Packet>(0); + for(Index i = 0; i < innerDim; ++i) res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res); } }; @@ -678,8 +695,7 @@ public: //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, - Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit - //(int(MatrixFlags)&int(DiagFlags)&AlignedBit), + Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) }; diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 291300a4a..5237fbf1c 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -200,17 +200,82 @@ DenseBase<Derived>::reverse() const * In most cases it is probably better to simply use the reversed expression * of a matrix. However, when reversing the matrix data itself is really needed, * then this "in-place" version is probably the right choice because it provides - * the following additional features: + * the following additional benefits: * - less error prone: doing the same operation with .reverse() requires special care: * \code m = m.reverse().eval(); \endcode - * - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap) + * - this API enables reverse operations without the need for a temporary * - it allows future optimizations (cache friendliness, etc.) 
* - * \sa reverse() */ + * \sa VectorwiseOp::reverseInPlace(), reverse() */ template<typename Derived> inline void DenseBase<Derived>::reverseInPlace() { - derived() = derived().reverse().eval(); + if(cols()>rows()) + { + Index half = cols()/2; + leftCols(half).swap(rightCols(half).reverse()); + if((cols()%2)==1) + { + Index half2 = rows()/2; + col(half).head(half2).swap(col(half).tail(half2).reverse()); + } + } + else + { + Index half = rows()/2; + topRows(half).swap(bottomRows(half).reverse()); + if((rows()%2)==1) + { + Index half2 = cols()/2; + row(half).head(half2).swap(row(half).tail(half2).reverse()); + } + } +} + +namespace internal { + +template<int Direction> +struct vectorwise_reverse_inplace_impl; + +template<> +struct vectorwise_reverse_inplace_impl<Vertical> +{ + template<typename ExpressionType> + static void run(ExpressionType &xpr) + { + Index half = xpr.rows()/2; + xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse()); + } +}; + +template<> +struct vectorwise_reverse_inplace_impl<Horizontal> +{ + template<typename ExpressionType> + static void run(ExpressionType &xpr) + { + Index half = xpr.cols()/2; + xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse()); + } +}; + +} // end namespace internal + +/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this. + * + * In most cases it is probably better to simply use the reversed expression + * of a matrix. However, when reversing the matrix data itself is really needed, + * then this "in-place" version is probably the right choice because it provides + * the following additional benefits: + * - less error prone: doing the same operation with .reverse() requires special care: + * \code m = m.reverse().eval(); \endcode + * - this API enables reverse operations without the need for a temporary + * + * \sa DenseBase::reverseInPlace(), reverse() */ +template<typename ExpressionType, int Direction> +void VectorwiseOp<ExpressionType,Direction>::reverseInPlace() +{ + internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived()); } } // end namespace Eigen diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index dcb42821f..3880f7b78 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -38,13 +38,17 @@ public: template<int StoreMode, int LoadMode> void assignPacket(Index row, Index col) { - m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col)); + PacketScalar tmp = m_src.template packet<LoadMode>(row,col); + const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode>(row,col)); + m_dst.template writePacket<StoreMode>(row,col,tmp); } template<int StoreMode, int LoadMode> void assignPacket(Index index) { - m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index)); + PacketScalar tmp = m_src.template packet<LoadMode>(index); + const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode>(index)); + m_dst.template writePacket<StoreMode>(index,tmp); } // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael) diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index a15777a5e..ea3d8f4b1 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ 
b/Eigen/src/Core/VectorwiseOp.h @@ -562,6 +562,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp void normalize() { m_matrix = this->normalized(); } + + inline void reverseInPlace(); /////////// Geometry module /////////// diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 161b0aa93..d55ae6096 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -150,14 +150,6 @@ template<typename Scalar> struct swap_assign_op { swap(a,const_cast<Scalar&>(b)); #endif } - - template<int LhsAlignment, int RhsAlignment, typename Packet> - EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const - { - Packet tmp = internal::ploadt<Packet,RhsAlignment>(b); - internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a)); - internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp); - } }; template<typename Scalar> struct functor_traits<swap_assign_op<Scalar> > { diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d32377a00..428527820 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -249,10 +249,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n actual_lm = l2; max_mc = 576; } - Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; - + else if (mc==0) return; m = (m%mc)==0 ? mc : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); } diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index 5ab4525df..39a53c8f1 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -79,6 +79,14 @@ template <typename LhsScalar, typename RhsScalar> bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads) { + if (num_threads > 1) { + // We don't currently have lookup tables recorded for multithread performance, + // and we have confirmed experimentally that our single-thread-recorded LUTs are + // poor for multithread performance, and our LUTs don't currently contain + // any annotation about multithread status (FIXME - we need that). + // So for now, we just early-return here. + return false; + } return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 6b294e77f..7cedb4c97 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -213,7 +213,8 @@ #endif /// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android -#if defined(__ANDROID__) +// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain. 
+#if defined(__ANDROID__) || defined(ANDROID) #define EIGEN_OS_ANDROID 1 #else #define EIGEN_OS_ANDROID 0 diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index 167cd99ab..b866544b4 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -417,7 +417,7 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect { Scalar t0 = m_matT.coeff(i+1, i); Scalar t1 = m_matT.coeff(i, i+1); - Scalar maxval = numext::maxi(abs(p),numext::maxi(abs(t0),abs(t1))); + Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1))); t0 /= maxval; t1 /= maxval; Scalar p0 = p/maxval; @@ -608,7 +608,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors() } // Overflow control - Scalar t = numext::maxi(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n))); + Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n))); if ((eps * t) * t > Scalar(1)) m_matT.block(i, n-1, size-i, 2) /= t; diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ca75f2f50..677c7c0bb 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -240,10 +240,10 @@ namespace Eigen { m_S.coeffRef(i,j) = Scalar(0.0); m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint()); m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint()); + // update Q + if (m_computeQZ) + m_Q.applyOnTheRight(i-1,i,G); } - // update Q - if (m_computeQZ) - m_Q.applyOnTheRight(i-1,i,G); // kill T(i,i-1) if(m_T.coeff(i,i-1)!=Scalar(0)) { @@ -251,10 +251,10 @@ namespace Eigen { m_T.coeffRef(i,i-1) = Scalar(0.0); m_S.applyOnTheRight(i,i-1,G); m_T.topRows(i).applyOnTheRight(i,i-1,G); + // update Z + if (m_computeQZ) + m_Z.applyOnTheLeft(i,i-1,G.adjoint()); } - // update Z - if (m_computeQZ) - m_Z.applyOnTheLeft(i,i-1,G.adjoint()); } } } diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index e90ce77eb..a89d75958 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -161,8 +161,8 @@ class QuaternionBase : public RotationBase<Derived, 3> bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const { return coeffs().isApprox(other.coeffs(), prec); } - /** return the result vector of \a v through the rotation*/ - EIGEN_STRONG_INLINE Vector3 _transformVector(Vector3 v) const; + /** return the result vector of \a v through the rotation*/ + EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const; /** \returns \c *this with scalar type casted to \a NewScalarType * @@ -462,7 +462,7 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const Quaterni */ template <class Derived> EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3 -QuaternionBase<Derived>::_transformVector(Vector3 v) const +QuaternionBase<Derived>::_transformVector(const Vector3& v) const { // Note that this algorithm comes from the optimization by hand // of the conversion to a Matrix followed by a Matrix/Vector product. diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index fd7c8a4b2..9b141c8df 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -84,6 +84,8 @@ public: typedef Matrix<RealScalar, Dynamic, 1> VectorType; typedef Array<RealScalar, Dynamic, 1> ArrayXr; typedef Array<Index,1,Dynamic> ArrayXi; + typedef Ref<ArrayXr> ArrayRef; + typedef Ref<ArrayXi> IndicesRef; /** \brief Default Constructor. 
* @@ -159,21 +161,23 @@ private: void allocate(Index rows, Index cols, unsigned int computationOptions); void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift); void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V); - void computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, VectorType& singVals, ArrayXr& shifts, ArrayXr& mus); - void perturbCol0(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat); - void computeSingVecs(const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V); + void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals, ArrayRef shifts, ArrayRef mus); + void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat); + void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V); void deflation43(Index firstCol, Index shift, Index i, Index size); void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size); void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift); template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV> void copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naivev); - static void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1); - static RealScalar secularEq(RealScalar x, const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift); + void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1); + static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift); protected: MatrixXr m_naiveU, m_naiveV; MatrixXr m_computed; Index m_nRec; + ArrayXr m_workspace; + ArrayXi m_workspaceI; int m_algoswap; bool m_isTranspose, m_compU, m_compV; @@ -212,6 +216,9 @@ void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computati else m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 ); if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize); + + m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3); + m_workspaceI.resize(3*m_diagSize); }// end allocate template<typename MatrixType> @@ -223,6 +230,19 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign allocate(matrix.rows(), matrix.cols(), computationOptions); using std::abs; + //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return + if(matrix.cols() < m_algoswap) + { + // FIXME this line involves temporaries + JacobiSVD<MatrixType> jsvd(matrix,computationOptions); + if(computeU()) m_matrixU = jsvd.matrixU(); + if(computeV()) m_matrixV = jsvd.matrixV(); + m_singularValues = jsvd.singularValues(); + m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); + m_isInitialized = true; + return *this; + } + //**** step 0 - Copy the input matrix and apply scaling to reduce 
over/under-flows RealScalar scale = matrix.cwiseAbs().maxCoeff(); if(scale==RealScalar(0)) scale = RealScalar(1); @@ -231,11 +251,13 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign else copy = matrix/scale; //**** step 1 - Bidiagonalization + // FIXME this line involves temporaries internal::UpperBidiagonalization<MatrixX> bid(copy); //**** step 2 - Divide & Conquer m_naiveU.setZero(); m_naiveV.setZero(); + // FIXME this line involves a temporary matrix m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose(); m_computed.template bottomRows<1>().setZero(); divide(0, m_diagSize - 1, 0, 0, 0); @@ -257,6 +279,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign break; } } + #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE // std::cout << "m_naiveU\n" << m_naiveU << "\n\n"; // std::cout << "m_naiveV\n" << m_naiveV << "\n\n"; @@ -279,14 +302,14 @@ void BDCSVD<MatrixType>::copyUV(const HouseholderU &householderU, const Househol Index Ucols = m_computeThinU ? m_diagSize : householderU.cols(); m_matrixU = MatrixX::Identity(householderU.cols(), Ucols); m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize); - householderU.applyThisOnTheLeft(m_matrixU); + householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer } if (computeV()) { Index Vcols = m_computeThinV ? m_diagSize : householderV.cols(); m_matrixV = MatrixX::Identity(householderV.cols(), Vcols); m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize); - householderV.applyThisOnTheLeft(m_matrixV); + householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer } } @@ -307,7 +330,10 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co // If the matrices are large enough, let's exploit the sparse structure of A by // splitting it in half (wrt n1), and packing the non-zero columns. Index n2 = n - n1; - MatrixXr A1(n1,n), A2(n2,n), B1(n,n), B2(n,n); + Map<MatrixXr> A1(m_workspace.data() , n1, n); + Map<MatrixXr> A2(m_workspace.data()+ n1*n, n2, n); + Map<MatrixXr> B1(m_workspace.data()+ n*n, n, n); + Map<MatrixXr> B2(m_workspace.data()+2*n*n, n, n); Index k1=0, k2=0; for(Index j=0; j<n; ++j) { @@ -329,7 +355,11 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2); } else - A *= B; // FIXME this requires a temporary + { + Map<MatrixXr,Aligned> tmp(m_workspace.data(),n,n); + tmp.noalias() = A*B; + A = tmp; + } } // The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the @@ -360,7 +390,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, // matrices. if (n < m_algoswap) { - JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0)) ; + // FIXME this line involves temporaries + JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? 
ComputeFullV : 0)); if (m_compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU(); else @@ -438,7 +469,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, } else { - RealScalar q1 = (m_naiveU(0, firstCol + k)); + RealScalar q1 = m_naiveU(0, firstCol + k); // we shift Q1 to the right for (Index i = firstCol + k - 1; i >= firstCol; i--) m_naiveU(0, i + 1) = m_naiveU(0, i); @@ -491,8 +522,14 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, assert(VofSVD.allFinite()); #endif - if (m_compU) structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2); - else m_naiveU.middleCols(firstCol, n + 1) *= UofSVD; // FIXME this requires a temporary, and exploit that there are 2 rows at compile time + if (m_compU) + structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2); + else + { + Map<Matrix<RealScalar,2,Dynamic>,Aligned> tmp(m_workspace.data(),2,n+1); + tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD; + m_naiveU.middleCols(firstCol, n + 1) = tmp; + } if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2); @@ -517,10 +554,9 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, template <typename MatrixType> void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { - // TODO Get rid of these copies (?) - // FIXME at least preallocate them - ArrayXr col0 = m_computed.col(firstCol).segment(firstCol, n); - ArrayXr diag = m_computed.block(firstCol, firstCol, n, n).diagonal(); + ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); + m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal(); + ArrayRef diag = m_workspace.head(n); diag(0) = 0; // Allocate space for singular values and vectors @@ -539,13 +575,14 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec Index actual_n = n; while(actual_n>1 && diag(actual_n-1)==0) --actual_n; Index m = 0; // size of the deflated problem - ArrayXi perm(actual_n); for(Index k=0;k<actual_n;++k) if(col0(k)!=0) - perm(m++) = k; - perm.conservativeResize(m); + m_workspaceI(m++) = k; + Map<ArrayXi> perm(m_workspaceI.data(),m); - ArrayXr shifts(n), mus(n), zhat(n); + Map<ArrayXr> shifts(m_workspace.data()+1*n, n); + Map<ArrayXr> mus(m_workspace.data()+2*n, n); + Map<ArrayXr> zhat(m_workspace.data()+3*n, n); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "computeSVDofM using:\n"; @@ -622,8 +659,8 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end singVals.head(actual_n).reverseInPlace(); - U.leftCols(actual_n) = U.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary - if (m_compV) V.leftCols(actual_n) = V.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary + U.leftCols(actual_n).rowwise().reverseInPlace(); + if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace(); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) ); @@ -634,7 +671,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec } template <typename MatrixType> -typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayXr& col0, const ArrayXr& 
diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift) +typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift) { Index m = perm.size(); RealScalar res = 1; @@ -647,8 +684,8 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar } template <typename MatrixType> -void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, - VectorType& singVals, ArrayXr& shifts, ArrayXr& mus) +void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, + VectorType& singVals, ArrayRef shifts, ArrayRef mus) { using std::abs; using std::swap; @@ -703,7 +740,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right; // measure everything relative to shift - ArrayXr diagShifted = diag - shift; + Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n); + diagShifted = diag - shift; // initial guess RealScalar muPrev, muCur; @@ -730,7 +768,7 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia // rational interpolation: fit a function of the form a / mu + b through the two previous // iterates and use its zero to compute the next iterate bool useBisection = fPrev*fCur>0; - while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection) + while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection) { ++m_numIters; @@ -773,7 +811,10 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia } RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); + +#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift); +#endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(!(fLeft * fRight<0)) @@ -781,14 +822,13 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia #endif eigen_internal_assert(fLeft * fRight < 0); - while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) + while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted))) { RealScalar midShifted = (leftShifted + rightShifted) / 2; RealScalar fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); if (fLeft * fMid < 0) { rightShifted = midShifted; - fRight = fMid; } else { @@ -816,8 +856,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia // zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1) template <typename MatrixType> void BDCSVD<MatrixType>::perturbCol0 - (const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals, - const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat) + (const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals, + const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat) { using std::sqrt; Index n = col0.size(); @@ 
-865,8 +905,8 @@ void BDCSVD<MatrixType>::perturbCol0 // compute singular vectors template <typename MatrixType> void BDCSVD<MatrixType>::computeSingVecs - (const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals, - const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V) + (const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals, + const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V) { Index n = zhat.size(); Index m = perm.size(); @@ -991,7 +1031,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag; - RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); + RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert(m_naiveU.allFinite()); @@ -1047,7 +1087,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge. // First, compute the respective permutation. - Index *permutation = new Index[length]; // FIXME avoid repeated dynamic memory allocation + Index *permutation = m_workspaceI.data(); { permutation[0] = 0; Index p = 1; @@ -1084,8 +1124,8 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index } // Current index of each col, and current column of each index - Index *realInd = new Index[length]; // FIXME avoid repeated dynamic memory allocation - Index *realCol = new Index[length]; // FIXME avoid repeated dynamic memory allocation + Index *realInd = m_workspaceI.data()+length; + Index *realCol = m_workspaceI.data()+2*length; for(int pos = 0; pos< length; pos++) { @@ -1115,9 +1155,6 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index realInd[J] = realI; realInd[i] = pi; } - delete[] permutation; - delete[] realInd; - delete[] realCol; } #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n"; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index fcf01f518..a46a47104 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -425,12 +425,13 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, // If d!=0, then t/d cannot overflow because the magnitude of the // entries forming d are not too small compared to the ones forming t. 
RealScalar u = t / d; - rot1.s() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u)); - rot1.c() = rot1.s() * u; + RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u)); + rot1.s() = RealScalar(1) / tmp; + rot1.c() = u / tmp; } m.applyOnTheLeft(0,1,rot1); j_right->makeJacobi(m,0,1); - *j_left = rot1 * j_right->transpose(); + *j_left = rot1 * j_right->transpose(); } template<typename _MatrixType, int QRPreconditioner> @@ -680,6 +681,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon(); // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286) + // FIXME What about considerering any denormal numbers as zero, using: + // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)(); const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min(); // Scaling factor to reduce over/under-flows @@ -719,8 +722,9 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig // if this 2x2 sub-matrix is not diagonal already... // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't // keep us iterating forever. Similarly, small denormal numbers are considered zero. - RealScalar threshold = numext::maxi(considerAsZero, precision * numext::maxi(abs(m_workMatrix.coeff(p,p)), - abs(m_workMatrix.coeff(q,q)))); + RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, + precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)), + abs(m_workMatrix.coeff(q,q)))); // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791) if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold) { diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 244f1b50e..d25a161f7 100644 --- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -30,16 +30,16 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r std::memset(mask,0,sizeof(bool)*rows); + typename evaluator<Lhs>::type lhsEval(lhs); + typename evaluator<Rhs>::type rhsEval(rhs); + // estimate the number of non zero entries // given a rhs column containing Y non zeros, we assume that the respective Y columns // of the lhs differs in average of one non zeros, thus the number of non zeros for // the product of a rhs column with the lhs is X+Y where X is the average number of non zero // per column of the lhs. // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs) - Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros(); - - typename evaluator<Lhs>::type lhsEval(lhs); - typename evaluator<Rhs>::type rhsEval(rhs); + Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate(); res.setZero(); res.reserve(Index(estimated_nnz_prod)); diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 2b31716a3..778939791 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -90,7 +90,8 @@ class sparse_matrix_block_impl typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
public:
enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
- EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+ typedef SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> > Base;
+ _EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
protected:
typedef typename Base::IndexVector IndexVector;
enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
@@ -198,20 +199,9 @@ public: { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
inline const StorageIndex* innerNonZeroPtr() const
- { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
+ { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
inline StorageIndex* innerNonZeroPtr()
- { return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
-
- Index nonZeros() const
- {
- if(m_matrix.isCompressed())
- return ( (m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
- - (m_matrix.outerIndexPtr()[m_outerStart]));
- else if(m_outerSize.value()==0)
- return 0;
- else
- return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
- }
+ { return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); }
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
@@ -233,7 +223,7 @@ public: const Scalar& lastCoeff() const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
- eigen_assert(nonZeros()>0);
+ eigen_assert(Base::nonZeros()>0);
if(m_matrix.isCompressed())
return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
else
@@ -339,17 +329,6 @@ SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const }
-namespace internal {
-
-template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
- bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
- | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
- // revert to || as soon as not needed anymore.
- (BlockRows==1 && !XprType::IsRowMajor)>
-class GenericSparseBlockInnerIteratorImpl;
-
-}
-
/** Generic implementation of sparse Block expression.
* Real-only.
*/
@@ -415,8 +394,11 @@ public: Index blockCols() const { return m_blockCols.value(); }
protected:
- friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
+// friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
friend class ReverseInnerIterator;
+ friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
+
+ Index nonZeros() const { return Dynamic; }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
@@ -429,94 +411,6 @@ public: };
namespace internal {
- template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
- class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,false> : public Block<XprType, BlockRows, BlockCols, InnerPanel>::_MatrixTypeNested::InnerIterator
- {
- typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
- enum {
- IsRowMajor = BlockType::IsRowMajor
- };
- typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
- typedef typename BlockType::StorageIndex StorageIndex;
- typedef typename _MatrixTypeNested::InnerIterator Base;
- const BlockType& m_block;
- Index m_end;
- public:
-
- EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer)
- : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
- m_block(block),
- m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
- {
- while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
- Base::operator++();
- }
-
- inline Index index() const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
- inline Index outer() const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
- inline Index row() const { return Base::row() - m_block.m_startRow.value(); }
- inline Index col() const { return Base::col() - m_block.m_startCol.value(); }
-
- inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
- };
-
- // Row vector of a column-major sparse matrix or column of a row-major one.
- template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
- class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,true>
- {
- typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
- enum {
- IsRowMajor = BlockType::IsRowMajor
- };
- typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
- typedef typename BlockType::StorageIndex StorageIndex;
- typedef typename BlockType::Scalar Scalar;
- const BlockType& m_block;
- Index m_outerPos;
- Index m_innerIndex;
- Scalar m_value;
- Index m_end;
- public:
-
- explicit EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer = 0)
- :
- m_block(block),
- m_outerPos( (IsRowMajor ? block.m_startCol.value() : block.m_startRow.value()) - 1), // -1 so that operator++ finds the first non-zero entry
- m_innerIndex(IsRowMajor ? block.m_startRow.value() : block.m_startCol.value()),
- m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
- {
- EIGEN_UNUSED_VARIABLE(outer);
- eigen_assert(outer==0);
-
- ++(*this);
- }
-
- inline Index index() const { return m_outerPos - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
- inline Index outer() const { return 0; }
- inline Index row() const { return IsRowMajor ? 0 : index(); }
- inline Index col() const { return IsRowMajor ? index() : 0; }
-
- inline Scalar value() const { return m_value; }
-
- inline GenericSparseBlockInnerIteratorImpl& operator++()
- {
- // search next non-zero entry
- while(++m_outerPos<m_end)
- {
- typename XprType::InnerIterator it(m_block.m_matrix, m_outerPos);
- // search for the key m_innerIndex in the current outer-vector
- while(it && it.index() < m_innerIndex) ++it;
- if(it && it.index()==m_innerIndex)
- {
- m_value = it.value();
- break;
- }
- }
- return *this;
- }
-
- inline operator bool() const { return m_outerPos < m_end; }
- };
template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased >
@@ -548,9 +442,16 @@ struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBa explicit unary_evaluator(const XprType& op)
: m_argImpl(op.nestedExpression()), m_block(op)
{}
+
+ inline Index nonZerosEstimate() const {
+ Index nnz = m_block.nonZeros();
+ if(nnz<0)
+ return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size();
+ return nnz;
+ }
protected:
- typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+ typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
typename evaluator<ArgType>::nestedType m_argImpl;
const XprType &m_block;
@@ -595,6 +496,7 @@ public: : m_eval(aEval),
m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) - 1), // -1 so that operator++ finds the first non-zero entry
m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+ m_value(0),
m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
{
EIGEN_UNUSED_VARIABLE(outer);
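The SparseCore part of this merge moves the nonZeros() count up into SparseCompressedBase and gives evaluators a nonZerosEstimate() hook, so blocks and other expressions no longer carry their own counting logic. A small sketch of the user-visible counting, assuming Eigen's public sparse API (illustrative only, not part of the diff):

```cpp
#include <Eigen/Sparse>
#include <iostream>

int main()
{
  Eigen::SparseMatrix<double> A(4, 4);   // column-major by default
  A.insert(0, 0) = 1.0;
  A.insert(1, 2) = 2.0;
  A.insert(3, 3) = 3.0;
  A.makeCompressed();

  // Both counts are now answered by SparseCompressedBase::nonZeros().
  std::cout << A.nonZeros() << "\n";              // 3
  std::cout << A.leftCols(2).nonZeros() << "\n";  // 1 (only the (0,0) entry)
}
```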
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index a5ba45e04..0dbb94faf 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -35,6 +35,25 @@ class SparseCompressedBase class InnerIterator; class ReverseInnerIterator; + protected: + typedef typename Base::IndexVector IndexVector; + Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); } + const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); } + + public: + + /** \returns the number of non zero coefficients */ + inline Index nonZeros() const + { + if(isCompressed()) + return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0]; + else if(derived().outerSize()==0) + return 0; + else + return innerNonZeros().sum(); + + } + /** \returns a const pointer to the array of values. * This function is aimed at interoperability with other libraries. * \sa innerIndexPtr(), outerIndexPtr() */ @@ -165,6 +184,10 @@ struct evaluator<SparseCompressedBase<Derived> > evaluator() : m_matrix(0) {} explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + inline Index nonZerosEstimate() const { + return m_matrix->nonZeros(); + } + operator Derived&() { return m_matrix->const_cast_derived(); } operator const Derived&() const { return *m_matrix; } diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 3b4e9df59..f53427abf 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -121,6 +121,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate(); + } protected: const BinaryOp m_functor; @@ -198,6 +202,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate()); + } protected: const BinaryOp m_functor; @@ -243,7 +251,7 @@ public: EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); } EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; } - + protected: const LhsEvaluator &m_lhsEval; RhsIterator m_rhsIter; @@ -262,6 +270,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return m_rhsImpl.nonZerosEstimate(); + } protected: const BinaryOp m_functor; @@ -308,7 +320,7 @@ public: EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); } EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; } - + protected: LhsIterator m_lhsIter; const RhsEvaluator &m_rhsEval; @@ -327,6 +339,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return m_lhsImpl.nonZerosEstimate(); + } protected: const BinaryOp m_functor; diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index 63d8f329c..d484be876 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -30,6 +30,10 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased> }; explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {} + + inline Index nonZerosEstimate() const { + return 
m_argImpl.nonZerosEstimate(); + } protected: typedef typename evaluator<ArgType>::InnerIterator EvalIterator; diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index a6ff7d559..7c512d9fe 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -105,9 +105,6 @@ class SparseMapBase<Derived,ReadOnlyAccessors> return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); } - /** \returns the number of non zero coefficients */ - inline Index nonZeros() const { return m_nnz; } - inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr, ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0) : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr), diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4c3a47959..ef93cf80c 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -95,6 +95,7 @@ class SparseMatrix public: typedef SparseCompressedBase<SparseMatrix> Base; using Base::isCompressed; + using Base::nonZeros; _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) @@ -122,9 +123,6 @@ class SparseMatrix StorageIndex* m_outerIndex; StorageIndex* m_innerNonZeros; // optional, if null then the data is compressed Storage m_data; - - Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); } - const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); } public: @@ -252,14 +250,6 @@ class SparseMatrix memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex)); } - /** \returns the number of non zero coefficients */ - inline Index nonZeros() const - { - if(m_innerNonZeros) - return innerNonZeros().sum(); - return convert_index(Index(m_data.size())); - } - /** Preallocates \a reserveSize non zeros. * * Precondition: the matrix must be in compressed mode. */ diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 55b0ad9d2..d4ab8b908 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -149,9 +149,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived> /** \returns the number of coefficients, which is \a rows()*cols(). * \sa rows(), cols(). */ inline Index size() const { return rows() * cols(); } - /** \returns the number of nonzero coefficients which is in practice the number - * of stored coefficients. */ - inline Index nonZeros() const { return derived().nonZeros(); } /** \returns true if either the number of rows or the number of columns is equal to 1. 
* In other words, this function returns * \code rows()==1 || cols()==1 \endcode diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 3db01bf2d..48050077e 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -33,14 +33,6 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r // allocate a temporary buffer AmbiVector<Scalar,StorageIndex> tempVector(rows); - // estimate the number of non zero entries - // given a rhs column containing Y non zeros, we assume that the respective Y columns - // of the lhs differs in average of one non zeros, thus the number of non zeros for - // the product of a rhs column with the lhs is X+Y where X is the average number of non zero - // per column of the lhs. - // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs) - Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros(); - // mimics a resizeByInnerOuter: if(ResultType::IsRowMajor) res.resize(cols, rows); @@ -49,6 +41,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r typename evaluator<Lhs>::type lhsEval(lhs); typename evaluator<Rhs>::type rhsEval(rhs); + + // estimate the number of non zero entries + // given a rhs column containing Y non zeros, we assume that the respective Y columns + // of the lhs differs in average of one non zeros, thus the number of non zeros for + // the product of a rhs column with the lhs is X+Y where X is the average number of non zero + // per column of the lhs. + // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs) + Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate(); res.reserve(estimated_nnz_prod); double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols()); diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index 45d9c6700..d3fc7f102 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -40,15 +40,11 @@ namespace internal { }; } -// Implement nonZeros() for transpose. I'm not sure that's the best approach for that. -// Perhaps it should be implemented in Transpose<> itself. 
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse> : public internal::SparseTransposeImpl<MatrixType> { protected: typedef internal::SparseTransposeImpl<MatrixType> Base; - public: - inline Index nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; namespace internal { @@ -61,6 +57,10 @@ struct unary_evaluator<Transpose<ArgType>, IteratorBased> typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator; public: typedef Transpose<ArgType> XprType; + + inline Index nonZerosEstimate() const { + return m_argImpl.nonZerosEstimate(); + } class InnerIterator : public EvalIterator { diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h index b5fbcbdde..34ec07a13 100644 --- a/Eigen/src/SparseCore/SparseTriangularView.h +++ b/Eigen/src/SparseCore/SparseTriangularView.h @@ -50,13 +50,6 @@ protected: template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const; template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const; - - inline Index nonZeros() const { - // FIXME HACK number of nonZeros is required for product logic - // this returns only an upper bound (but should be OK for most purposes) - return derived().nestedExpression().nonZeros(); - } - }; @@ -191,6 +184,10 @@ public: explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()) {} + inline Index nonZerosEstimate() const { + return m_argImpl.nonZerosEstimate(); + } + class InnerIterator : public EvalIterator { typedef EvalIterator Base; diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 35bcec819..7b65f32bc 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -442,6 +442,10 @@ struct evaluator<SparseVector<_Scalar,_Options,_Index> > explicit evaluator(const SparseVectorType &mat) : m_matrix(mat) {} + inline Index nonZerosEstimate() const { + return m_matrix.nonZeros(); + } + operator SparseVectorType&() { return m_matrix.const_cast_derived(); } operator const SparseVectorType&() const { return m_matrix; } diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index efdc6d046..b9d5e48fb 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -302,6 +302,7 @@ class SuperLUBase : public SparseSolverBase<Derived> typedef Matrix<Scalar,Dynamic,1> Vector; typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType; typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType; + typedef Map<PermutationMatrix<Dynamic,Dynamic,int> > PermutationMap; typedef SparseMatrix<Scalar> LUMatrixType; public: @@ -459,10 +460,11 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> > typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; typedef typename Base::IntRowVectorType IntRowVectorType; - typedef typename Base::IntColVectorType IntColVectorType; + typedef typename Base::IntColVectorType IntColVectorType; + typedef typename Base::PermutationMap PermutationMap; typedef typename Base::LUMatrixType LUMatrixType; typedef TriangularView<LUMatrixType, Lower|UnitDiag> LMatrixType; - typedef TriangularView<LUMatrixType, Upper> UMatrixType; + typedef TriangularView<LUMatrixType, Upper> UMatrixType; public: using Base::_solve_impl; @@ -774,6 +776,8 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() 
const det *= m_u.valuePtr()[lastId]; } } + if(PermutationMap(m_p.data(),m_p.size()).determinant()*PermutationMap(m_q.data(),m_q.size()).determinant()<0) + det = -det; if(m_sluEqued!='N') return det/m_sluRscale.prod()/m_sluCscale.prod(); else diff --git a/cmake/FindMetis.cmake b/cmake/FindMetis.cmake index e0040d320..6a0ce790c 100644 --- a/cmake/FindMetis.cmake +++ b/cmake/FindMetis.cmake @@ -26,7 +26,7 @@ macro(_metis_check_version) string(REGEX MATCH "define[ \t]+METIS_VER_SUBMINOR[ \t]+([0-9]+)" _metis_subminor_version_match "${_metis_version_header}") set(METIS_SUBMINOR_VERSION "${CMAKE_MATCH_1}") if(NOT METIS_MAJOR_VERSION) - message(WARNING "Could not determine Metis version. Assuming version 4.0.0") + message(STATUS "Could not determine Metis version. Assuming version 4.0.0") set(METIS_VERSION 4.0.0) else() set(METIS_VERSION ${METIS_MAJOR_VERSION}.${METIS_MINOR_VERSION}.${METIS_SUBMINOR_VERSION}) diff --git a/doc/special_examples/CMakeLists.txt b/doc/special_examples/CMakeLists.txt index aab80a55d..101fbc5f9 100644 --- a/doc/special_examples/CMakeLists.txt +++ b/doc/special_examples/CMakeLists.txt @@ -10,9 +10,10 @@ if(QT4_FOUND) target_link_libraries(Tutorial_sparse_example ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${QT_QTCORE_LIBRARY} ${QT_QTGUI_LIBRARY}) add_custom_command( - TARGET Tutorial_sparse_example - POST_BUILD - COMMAND Tutorial_sparse_example ARGS ${CMAKE_CURRENT_BINARY_DIR}/../html/Tutorial_sparse_example.jpeg + TARGET Tutorial_sparse_example + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/../html/ + COMMAND Tutorial_sparse_example ARGS ${CMAKE_CURRENT_BINARY_DIR}/../html/Tutorial_sparse_example.jpeg ) add_dependencies(all_examples Tutorial_sparse_example) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 393c35b57..54ce7fb30 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -332,3 +332,8 @@ endif(EIGEN_TEST_NVCC) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests) add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON) + +option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF) +IF(EIGEN_TEST_BUILD_DOCUMENTATION) + add_dependencies(buildtests doc) +ENDIF() diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp index fbe7a9901..a5c0d37f9 100644 --- a/test/array_reverse.cpp +++ b/test/array_reverse.cpp @@ -24,7 +24,7 @@ template<typename MatrixType> void reverse(const MatrixType& m) // this test relies a lot on Random.h, and there's not much more that we can do // to test it, hence I consider that we will have tested Random.h - MatrixType m1 = MatrixType::Random(rows, cols); + MatrixType m1 = MatrixType::Random(rows, cols), m2; VectorType v1 = VectorType::Random(rows); MatrixType m1_r = m1.reverse(); @@ -96,6 +96,26 @@ template<typename MatrixType> void reverse(const MatrixType& m) m1.reverse()(r, c) = x; VERIFY_IS_APPROX(x, m1(rows - 1 - r, cols - 1 - c)); + + m2 = m1; + m2.reverseInPlace(); + VERIFY_IS_APPROX(m2,m1.reverse().eval()); + + m2 = m1; + m2.col(0).reverseInPlace(); + VERIFY_IS_APPROX(m2.col(0),m1.col(0).reverse().eval()); + + m2 = m1; + m2.row(0).reverseInPlace(); + VERIFY_IS_APPROX(m2.row(0),m1.row(0).reverse().eval()); + + m2 = m1; + m2.rowwise().reverseInPlace(); + VERIFY_IS_APPROX(m2,m1.rowwise().reverse().eval()); + + m2 = m1; + m2.colwise().reverseInPlace(); + VERIFY_IS_APPROX(m2,m1.colwise().reverse().eval()); /* m1.colwise().reverse()(r, c) 
= x; @@ -113,11 +133,11 @@ void test_array_reverse() CALL_SUBTEST_2( reverse(Matrix2f()) ); CALL_SUBTEST_3( reverse(Matrix4f()) ); CALL_SUBTEST_4( reverse(Matrix4d()) ); - CALL_SUBTEST_5( reverse(MatrixXcf(3, 3)) ); - CALL_SUBTEST_6( reverse(MatrixXi(6, 3)) ); - CALL_SUBTEST_7( reverse(MatrixXcd(20, 20)) ); + CALL_SUBTEST_5( reverse(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( reverse(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_7( reverse(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_8( reverse(Matrix<float, 100, 100>()) ); - CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(6,3)) ); + CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); } #ifdef EIGEN_TEST_PART_3 Vector4f x; x << 1, 2, 3, 4; diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index 0227ba577..cd6dc8cf0 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -17,6 +17,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m) typedef Matrix<Scalar, Rows, 1> VectorType; typedef Matrix<Scalar, 1, Cols> RowVectorType; typedef Matrix<Scalar, Rows, Rows> SquareMatrixType; + typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType; typedef DiagonalMatrix<Scalar, Rows> LeftDiagonalMatrix; typedef DiagonalMatrix<Scalar, Cols> RightDiagonalMatrix; typedef Matrix<Scalar, Rows==Dynamic?Dynamic:2*Rows, Cols==Dynamic?Dynamic:2*Cols> BigMatrix; @@ -64,6 +65,13 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * (m1+m2))(i,j)) , (v1+v2)(i) * (m1+m2)(i,j) ); VERIFY_IS_APPROX( ((m1 * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * m1(i,j) ); VERIFY_IS_APPROX( (((m1+m2) * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * (m1+m2)(i,j) ); + + if(rows>1) + { + DynMatrixType tmp = m1.topRows(rows/2), res; + VERIFY_IS_APPROX( (res = m1.topRows(rows/2) * rv1.asDiagonal()), tmp * rv1.asDiagonal() ); + VERIFY_IS_APPROX( (res = v1.head(rows/2).asDiagonal()*m1.topRows(rows/2)), v1.head(rows/2).asDiagonal()*tmp ); + } BigMatrix big; big.setZero(2*rows, 2*cols); @@ -93,6 +101,17 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() ); } +template<int> +void bug987() +{ + Matrix3Xd points = Matrix3Xd::Random(3, 3); + Vector2d diag = Vector2d::Random(); + Matrix2Xd tmp1 = points.topRows<2>(), res1, res2; + VERIFY_IS_APPROX( res1 = diag.asDiagonal() * points.topRows<2>(), res2 = diag.asDiagonal() * tmp1 ); + Matrix2d tmp2 = points.topLeftCorner<2,2>(); + VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() ); +} + void test_diagonalmatrices() { for(int i = 0; i < g_repeat; i++) { @@ -106,4 +125,5 @@ void test_diagonalmatrices() CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); } + CALL_SUBTEST_10( bug987<0>() ); } diff --git a/test/main.h b/test/main.h index ecf0c6924..3591b57a1 100644 --- 
a/test/main.h +++ b/test/main.h @@ -95,6 +95,9 @@ namespace Eigen { static std::vector<std::string> g_test_stack; + // level == 0 <=> abort if test fail + // level >= 1 <=> warning message to std::cerr if test fail + static int g_test_level = 0; static int g_repeat; static unsigned int g_seed; static bool g_has_set_repeat, g_has_set_seed; @@ -229,6 +232,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file, { if (!condition) { + if(Eigen::g_test_level>0) + std::cerr << "WARNING: "; std::cerr << "Test " << testname << " failed in " << file << " (" << line << ")" << std::endl << " " << condition_as_string << std::endl; std::cerr << "Stack:\n"; @@ -236,7 +241,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file, for(int i=test_stack_size-1; i>=0; --i) std::cerr << " - " << Eigen::g_test_stack[i] << "\n"; std::cerr << "\n"; - abort(); + if(Eigen::g_test_level==0) + abort(); } } diff --git a/test/product_extra.cpp b/test/product_extra.cpp index 1b4c6c33c..7c54b6977 100644 --- a/test/product_extra.cpp +++ b/test/product_extra.cpp @@ -113,6 +113,9 @@ void mat_mat_scalar_scalar_product() template <typename MatrixType> void zero_sized_objects(const MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + const int PacketSize = internal::packet_traits<Scalar>::size; + const int PacketSize1 = PacketSize>1 ? PacketSize-1 : 1; Index rows = m.rows(); Index cols = m.cols(); @@ -132,9 +135,41 @@ void zero_sized_objects(const MatrixType& m) res = b*a; VERIFY(res.rows()==0 && res.cols()==cols); } + + { + Matrix<Scalar,PacketSize,0> a; + Matrix<Scalar,0,1> b; + Matrix<Scalar,PacketSize,1> res; + VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) ); + VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) ); + } + + { + Matrix<Scalar,PacketSize1,0> a; + Matrix<Scalar,0,1> b; + Matrix<Scalar,PacketSize1,1> res; + VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) ); + VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) ); + } + + { + Matrix<Scalar,PacketSize,Dynamic> a(PacketSize,0); + Matrix<Scalar,Dynamic,1> b(0,1); + Matrix<Scalar,PacketSize,1> res; + VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) ); + VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) ); + } + + { + Matrix<Scalar,PacketSize1,Dynamic> a(PacketSize1,0); + Matrix<Scalar,Dynamic,1> b(0,1); + Matrix<Scalar,PacketSize1,1> res; + VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) ); + VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) ); + } } - +template<int> void bug_127() { // Bug 127 @@ -159,6 +194,7 @@ void bug_127() a*b; } +template<int> void unaligned_objects() { // Regression test for the bug reported here: @@ -188,6 +224,29 @@ void unaligned_objects() } } +template<typename T> +EIGEN_DONT_INLINE +Index test_compute_block_size(Index m, Index n, Index k) +{ + Index mc(m), nc(n), kc(k); + internal::computeProductBlockingSizes<T,T>(kc, mc, nc); + return kc+mc+nc; +} + +template<typename T> +Index compute_block_size() +{ + Index ret = 0; + ret += test_compute_block_size<T>(0,1,1); + ret += test_compute_block_size<T>(1,0,1); + ret += test_compute_block_size<T>(1,1,0); + ret += test_compute_block_size<T>(0,0,1); + ret += test_compute_block_size<T>(0,1,0); + ret += test_compute_block_size<T>(1,0,0); + ret += test_compute_block_size<T>(0,0,0); + return ret; +} + void test_product_extra() { for(int i = 0; i < g_repeat; i++) { @@ -198,6 +257,9 @@ void 
test_product_extra() CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); } - CALL_SUBTEST_5( bug_127() ); - CALL_SUBTEST_6( unaligned_objects() ); + CALL_SUBTEST_5( bug_127<0>() ); + CALL_SUBTEST_6( unaligned_objects<0>() ); + CALL_SUBTEST_7( compute_block_size<float>() ); + CALL_SUBTEST_7( compute_block_size<double>() ); + CALL_SUBTEST_7( compute_block_size<std::complex<double> >() ); } diff --git a/test/real_qz.cpp b/test/real_qz.cpp index 7d743a734..a1766c6d9 100644 --- a/test/real_qz.cpp +++ b/test/real_qz.cpp @@ -25,6 +25,22 @@ template<typename MatrixType> void real_qz(const MatrixType& m) MatrixType A = MatrixType::Random(dim,dim), B = MatrixType::Random(dim,dim); + + // Regression test for bug 985: Randomly set rows or columns to zero + Index k=internal::random<Index>(0, dim-1); + switch(internal::random<int>(0,10)) { + case 0: + A.row(k).setZero(); break; + case 1: + A.col(k).setZero(); break; + case 2: + B.row(k).setZero(); break; + case 3: + B.col(k).setZero(); break; + default: + break; + } + RealQZ<MatrixType> qz(A,B); VERIFY_IS_EQUAL(qz.info(), Success); diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp index 480a660fc..3bad3def7 100644 --- a/test/sparse_product.cpp +++ b/test/sparse_product.cpp @@ -67,6 +67,9 @@ template<typename SparseMatrixType> void sparse_product() VERIFY_IS_APPROX(m4 = m2*m3/s1, refMat4 = refMat2*refMat3/s1); VERIFY_IS_APPROX(m4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1); VERIFY_IS_APPROX(m4 = s2*m2*m3*s1, refMat4 = s2*refMat2*refMat3*s1); + VERIFY_IS_APPROX(m4 = (m2+m2)*m3, refMat4 = (refMat2+refMat2)*refMat3); + VERIFY_IS_APPROX(m4 = m2*m3.leftCols(cols/2), refMat4 = refMat2*refMat3.leftCols(cols/2)); + VERIFY_IS_APPROX(m4 = m2*(m3+m3).leftCols(cols/2), refMat4 = refMat2*(refMat3+refMat3).leftCols(cols/2)); VERIFY_IS_APPROX(m4=(m2*m3).pruned(0), refMat4=refMat2*refMat3); VERIFY_IS_APPROX(m4=(m2t.transpose()*m3).pruned(0), refMat4=refMat2t.transpose()*refMat3); @@ -194,7 +197,7 @@ template<typename SparseMatrixType> void sparse_product() VERIFY_IS_APPROX(d3=d1*m2.transpose(), refM3=d1*refM2.transpose()); } - // test self-adjoint and traingular-view products + // test self-adjoint and triangular-view products { DenseMatrix b = DenseMatrix::Random(rows, rows); DenseMatrix x = DenseMatrix::Random(rows, rows); diff --git a/test/svd_common.h b/test/svd_common.h index 4c172cf9d..b44b79124 100644 --- a/test/svd_common.h +++ b/test/svd_common.h @@ -49,18 +49,39 @@ void svd_compare_to_full(const MatrixType& m, unsigned int computationOptions, const SvdType& referenceSvd) { - typedef typename MatrixType::Index Index; + typedef typename MatrixType::RealScalar RealScalar; Index rows = m.rows(); Index cols = m.cols(); Index diagSize = (std::min)(rows, cols); + RealScalar prec = test_precision<RealScalar>(); SvdType svd(m, computationOptions); VERIFY_IS_APPROX(svd.singularValues(), referenceSvd.singularValues()); + + if(computationOptions & (ComputeFullV|ComputeThinV)) + { + VERIFY( (svd.matrixV().adjoint()*svd.matrixV()).isIdentity(prec) ); + VERIFY_IS_APPROX( svd.matrixV().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint(), + referenceSvd.matrixV().leftCols(diagSize) * referenceSvd.singularValues().asDiagonal() * referenceSvd.matrixV().leftCols(diagSize).adjoint()); + } + + 
if(computationOptions & (ComputeFullU|ComputeThinU)) + { + VERIFY( (svd.matrixU().adjoint()*svd.matrixU()).isIdentity(prec) ); + VERIFY_IS_APPROX( svd.matrixU().leftCols(diagSize) * svd.singularValues().cwiseAbs2().asDiagonal() * svd.matrixU().leftCols(diagSize).adjoint(), + referenceSvd.matrixU().leftCols(diagSize) * referenceSvd.singularValues().cwiseAbs2().asDiagonal() * referenceSvd.matrixU().leftCols(diagSize).adjoint()); + } + + // The following checks are not critical. + // For instance, with Divide & Conquer SVD, if only the factor 'V' is computed then a different matrix-matrix product implementation will be used + // and the resulting 'V' factor might be significantly different when the SVD decomposition is not unique, especially with single precision float. + ++g_test_level; if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU()); if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU().leftCols(diagSize)); - if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV()); + if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(svd.matrixV().cwiseAbs(), referenceSvd.matrixV().cwiseAbs()); if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV().leftCols(diagSize)); + --g_test_level; } // @@ -85,33 +106,48 @@ void svd_least_square(const MatrixType& m, unsigned int computationOptions) SvdType svd(m, computationOptions); if(internal::is_same<RealScalar,double>::value) svd.setThreshold(1e-8); - else if(internal::is_same<RealScalar,float>::value) svd.setThreshold(1e-4); - + else if(internal::is_same<RealScalar,float>::value) svd.setThreshold(2e-4); + SolutionType x = svd.solve(rhs); - - // evaluate normal equation which works also for least-squares solutions - if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size()) - { - // This test is not stable with single precision. - // This is probably because squaring m signicantly affects the precision. - VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs); - } - + RealScalar residual = (m*x-rhs).norm(); - // Check that there is no significantly better solution in the neighborhood of x + RealScalar rhs_norm = rhs.norm(); if(!test_isMuchSmallerThan(residual,rhs.norm())) { // ^^^ If the residual is very small, then we have an exact solution, so we are already good. + + // evaluate normal equation which works also for least-squares solutions + if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size()) + { + using std::sqrt; + // This test is not stable with single precision. + // This is probably because squaring m significantly affects the precision.
+ if(internal::is_same<RealScalar,float>::value) ++g_test_level; + + VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs); + + if(internal::is_same<RealScalar,float>::value) --g_test_level; + } + + // Check that there is no significantly better solution in the neighborhood of x for(Index k=0;k<x.rows();++k) { + using std::abs; + SolutionType y(x); y.row(k) = (1.+2*NumTraits<RealScalar>::epsilon())*x.row(k); RealScalar residual_y = (m*y-rhs).norm(); + VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y ); + if(internal::is_same<RealScalar,float>::value) ++g_test_level; VERIFY( test_isApprox(residual_y,residual) || residual < residual_y ); + if(internal::is_same<RealScalar,float>::value) --g_test_level; y.row(k) = (1.-2*NumTraits<RealScalar>::epsilon())*x.row(k); residual_y = (m*y-rhs).norm(); + VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y ); + if(internal::is_same<RealScalar,float>::value) ++g_test_level; VERIFY( test_isApprox(residual_y,residual) || residual < residual_y ); + if(internal::is_same<RealScalar,float>::value) --g_test_level; } } } diff --git a/test/swap.cpp b/test/swap.cpp index dc3610085..5d6f0e6af 100644 --- a/test/swap.cpp +++ b/test/swap.cpp @@ -82,8 +82,10 @@ template<typename MatrixType> void swap(const MatrixType& m) void test_swap() { + int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization CALL_SUBTEST_2( swap(Matrix4d()) ); // fixed size, possible vectorization - CALL_SUBTEST_3( swap(MatrixXd(3,3)) ); // dyn size, no vectorization - CALL_SUBTEST_4( swap(MatrixXf(30,30)) ); // dyn size, possible vectorization + CALL_SUBTEST_3( swap(MatrixXd(s,s)) ); // dyn size, no vectorization + CALL_SUBTEST_4( swap(MatrixXf(s,s)) ); // dyn size, possible vectorization + TEST_SET_BUT_UNUSED_VARIABLE(s) } diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index 6f7b72167..9c6f0bc8f 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -9,7 +9,17 @@ #include "main.h" -typedef Matrix<float,8,1> Vector8f; +typedef Matrix<float, 6,1> Vector6f; +typedef Matrix<float, 8,1> Vector8f; +typedef Matrix<float, 12,1> Vector12f; + +typedef Matrix<double, 5,1> Vector5d; +typedef Matrix<double, 6,1> Vector6d; +typedef Matrix<double, 7,1> Vector7d; +typedef Matrix<double, 8,1> Vector8d; +typedef Matrix<double, 9,1> Vector9d; +typedef Matrix<double,10,1> Vector10d; +typedef Matrix<double,12,1> Vector12d; struct TestNew1 { @@ -85,6 +95,9 @@ void unalignedassert() construct_at_boundary<Vector2f>(4); construct_at_boundary<Vector3f>(4); construct_at_boundary<Vector4f>(16); + construct_at_boundary<Vector6f>(4); + construct_at_boundary<Vector8f>(EIGEN_ALIGN_BYTES); + construct_at_boundary<Vector12f>(16); construct_at_boundary<Matrix2f>(16); construct_at_boundary<Matrix3f>(4); construct_at_boundary<Matrix4f>(EIGEN_ALIGN_BYTES); @@ -92,6 +105,13 @@ void unalignedassert() construct_at_boundary<Vector2d>(16); construct_at_boundary<Vector3d>(4); construct_at_boundary<Vector4d>(EIGEN_ALIGN_BYTES); + construct_at_boundary<Vector5d>(4); + construct_at_boundary<Vector6d>(16); + construct_at_boundary<Vector7d>(4); + construct_at_boundary<Vector8d>(EIGEN_ALIGN_BYTES); + construct_at_boundary<Vector9d>(4); + construct_at_boundary<Vector10d>(16); + construct_at_boundary<Vector12d>(EIGEN_ALIGN_BYTES); construct_at_boundary<Matrix2d>(EIGEN_ALIGN_BYTES); construct_at_boundary<Matrix3d>(4); construct_at_boundary<Matrix4d>(EIGEN_ALIGN_BYTES); 
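[Editor's note — illustration only, not part of the patch.] The svd_common.h hunk above replaces a brittle exact comparison with a residual-based argument: x is accepted as a least-squares solution if nudging any coordinate of x does not produce a noticeably smaller residual, with g_test_level temporarily raised so single-precision noise only warns instead of aborting. A self-contained sketch of that neighborhood check, assuming only the public JacobiSVD API (the sizes 8x3 and the perturbation factor are illustrative choices):

    #include <Eigen/Dense>
    #include <iostream>
    #include <limits>

    int main() {
      // Overdetermined system: 8 equations, 3 unknowns.
      Eigen::MatrixXd m = Eigen::MatrixXd::Random(8, 3);
      Eigen::VectorXd rhs = Eigen::VectorXd::Random(8);

      Eigen::JacobiSVD<Eigen::MatrixXd> svd(m, Eigen::ComputeThinU | Eigen::ComputeThinV);
      const Eigen::VectorXd x = svd.solve(rhs);
      const double residual = (m * x - rhs).norm();

      // Perturb each coordinate by a couple of ulps; none should beat the residual.
      const double eps = 2.0 * std::numeric_limits<double>::epsilon();
      for (int k = 0; k < x.rows(); ++k) {
        Eigen::VectorXd y = x;
        y(k) = (1.0 + eps) * x(k);
        const double residual_y = (m * y - rhs).norm();
        std::cout << "k=" << k << ": perturbed " << residual_y
                  << " vs optimal " << residual << std::endl;
      }
      return 0;
    }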
@@ -115,7 +135,14 @@ void unalignedassert() if(EIGEN_ALIGN_BYTES>=16) { VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12f>(8)); VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector6d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector10d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12d>(8)); VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8)); VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4i>(8)); } diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7bd8cc9d4..1ab4dc542 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -81,8 +81,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 1490ffb62..77b92ee9b 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -266,16 +266,16 @@ array<t, n> repeat(t v) { } template<std::size_t I, class Head, class Tail> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) { return get<I, type_list<Head, Tail> >::value; } template<std::size_t I, class Head, class Tail> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) { return get<I, type_list<Head, Tail> >::value; } template <class NList> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) { return arg_prod<NList>::value; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index ed1026be2..87e57cebb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1157,7 +1157,41 @@ in TensorFunctors.h for information on how to implement a reduction operator. ## Convolutions -TBD: convolve(const KernelDerived& kernel, const Dimensions& dims) +### <Operation> convolve(const Kernel& kernel, const Dimensions& dims) + +Returns a tensor that is the output of the convolution of the input tensor with the kernel, +along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor +which were part of the convolution will be reduced by the formula: +output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). +The dimension sizes for dimensions that were not part of the convolution will remain the same. 
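[Editor's note — illustration only, not part of the README addition.] A minimal 1-D instance of the size formula above, assuming the unsupported CXX11 Tensor module; the README's fuller 2-D example follows right after:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cstddef>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 1> input(7);
      Eigen::Tensor<float, 1> kernel(2);
      input.setRandom();
      kernel.setRandom();

      Eigen::array<ptrdiff_t, 1> dims{{0}};  // convolve along the first dimension
      Eigen::Tensor<float, 1> output(6);     // output_dim_size = 7 - 2 + 1 = 6
      output = input.convolve(kernel, dims);

      std::cout << "output size: " << output.dimension(0) << std::endl;  // prints 6
      return 0;
    }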
+Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the +convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is +for the last dimension). + + // Compute convolution along the second and third dimension. + Tensor<float, 4, DataLayout> input(3, 3, 7, 11); + Tensor<float, 2, DataLayout> kernel(2, 2); + Tensor<float, 4, DataLayout> output(3, 2, 6, 11); + input.setRandom(); + kernel.setRandom(); + + Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution. + output = input.convolve(kernel, dims); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 6; ++k) { + for (int l = 0; l < 11; ++l) { + const float result = output(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } ## Geometrical Operations diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 591fd2464..1db5f1232 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -21,8 +21,8 @@ namespace Eigen { */ namespace internal { - -template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper { +template <typename Index, typename InputDims, size_t NumKernelDims, int Layout> +class IndexMapper { public: IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, const array<Index, NumKernelDims>& indices) { @@ -38,13 +38,19 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM array<Index, NumDims> inputStrides; array<Index, NumDims> outputStrides; - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; - } else { - inputStrides[0] = 1; - outputStrides[0] = 1; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; } } @@ -52,13 +58,20 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM array<Index, NumDims> cudaOutputDimensions; array<Index, NumDims> tmp = dimensions; array<Index, NumDims> ordering; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; for (int i = 0; i < NumKernelDims; ++i) { - ordering[i] = indices[i]; + const Index index = i + offset; + ordering[index] = indices[i]; tmp[indices[i]] = -1; - cudaInputDimensions[i] = input_dims[ordering[i]]; - cudaOutputDimensions[i] = dimensions[ordering[i]]; + cudaInputDimensions[index] = input_dims[indices[i]]; + cudaOutputDimensions[index] = dimensions[indices[i]]; } - int written = NumKernelDims; + + int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
NumKernelDims + : 0; for (int i = 0; i < NumDims; ++i) { if (tmp[i] >= 0) { ordering[written] = i; @@ -73,61 +86,123 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM m_outputStrides[i] = outputStrides[ordering[i]]; } - for (int i = 0; i < NumDims; ++i) { - if (i > NumKernelDims) { - m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1]; - m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1]; - } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } } } } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { Index inputIndex = 0; - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; } - inputIndex += p * m_inputStrides[NumKernelDims]; return inputIndex; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { Index outputIndex = 0; - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; } - outputIndex += p * m_outputStrides[NumKernelDims]; return outputIndex; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { - return i * m_inputStrides[0]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { - return i * m_outputStrides[0]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { - return i * m_inputStrides[0] + j*m_inputStrides[1]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { - return i * m_outputStrides[0] + j * m_outputStrides[1]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { - return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { - return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; } private: @@ -237,35 +312,61 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Only column major tensors are supported for now. 
- EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - m_inputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } } m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i > 0) { - m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; - } else { - m_kernelStride[0] = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; } - m_indexStride[i] = m_inputStride[index]; - } - m_outputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } } } @@ -310,13 +411,24 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; Index indices[2] = {index, index+PacketSize-1}; Index startInputs[2] = {0, 0}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * 
m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } } startInputs[0] += indices[0]; startInputs[1] += indices[1]; @@ -344,10 +456,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { Index startInput = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } } startInput += index; return startInput; @@ -378,7 +498,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } - EIGEN_STRONG_INLINE void preloadKernel() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { // Don't make a local copy of the kernel unless we have to (i.e. it's an // expression that needs to be evaluated) const Scalar* in_place = m_kernelImpl.data(); @@ -431,11 +551,14 @@ struct GetKernelSize<Dynamic> { } }; - - - -template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize> -__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSize> +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { extern __shared__ float s[]; const int first_x = blockIdx.x * maxX; @@ -453,7 +576,7 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); - s[i + plane_kernel_offset] = eval.coeff(tensor_index); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); } __syncthreads(); @@ -476,9 +599,15 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In } }; - -template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY> -__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { +template <typename InputEvaluator, typename Index, typename InputDims, + int 
StaticKernelSizeX, int StaticKernelSizeY> +__global__ void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { extern __shared__ float s[]; const int first_x = blockIdx.x * maxX; @@ -538,9 +667,15 @@ __global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::In } }; - template <typename InputEvaluator, typename Index, typename InputDims> -__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { extern __shared__ float s[]; // Load inputs to shared memory @@ -622,8 +757,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -712,10 +845,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const int numX = dimensions()[m_indices[0]]; const int numP = dimensions().TotalSize() / numX; - int maxX; dim3 block_size; - if (m_indices[0] == 0) { + + const int single_stride_dim = + static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { // Maximum the reuse const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; maxX = (std::min<int>)(inner_dim, numX); @@ -747,7 +884,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const array<Index, 1> indices(m_indices[0]); const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); - internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); switch(kernel_size) { case 4: { LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); @@ -765,11 +903,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } case 2: { - const int kernel_size_x = m_kernelImpl.dimensions()[0]; - const int kernel_size_y = m_kernelImpl.dimensions()[1]; - - const int numX = dimensions()[m_indices[0]]; - const int numY = dimensions()[m_indices[1]]; + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; const int numP = dimensions().TotalSize() / (numX*numY); const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); @@ -798,9 +940,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array<Index, 2> indices(m_indices[0], m_indices[1]); - const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]); - internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]); + const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); switch (kernel_size_x) { case 4: { switch (kernel_size_y) { @@ -837,13 +981,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } case 3: { - const int kernel_size_x = m_kernelImpl.dimensions()[0]; - const int kernel_size_y = m_kernelImpl.dimensions()[1]; - const int kernel_size_z = m_kernelImpl.dimensions()[2]; - - const int numX = dimensions()[m_indices[0]]; - const int numY = dimensions()[m_indices[1]]; - const int numZ = dimensions()[m_indices[2]]; + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
1 : 1; + const int idxZ = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; const int numP = dimensions().TotalSize() / (numX*numY*numZ); const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); @@ -860,16 +1011,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr assert(shared_mem <= maxSharedMem); //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]); - const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); - internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } default: { - assert(false && "not supported yet"); + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 7a67c56b3..17f10c07b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -32,8 +32,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice { EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; Assign assign(m_expression, other); - static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; - internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); return *this; } @@ -44,8 +43,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice { Sum sum(m_expression, other); typedef TensorAssignOp<ExpressionType, const Sum> Assign; Assign assign(m_expression, sum); - static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; - internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); return *this; } @@ -56,8 +54,7 @@ template 
<typename ExpressionType, typename DeviceType> class TensorDevice { Difference difference(m_expression, other); typedef TensorAssignOp<ExpressionType, const Difference> Assign; Assign assign(m_expression, difference); - static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; - internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); return *this; } @@ -76,8 +73,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; Assign assign(m_expression, other); - static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; - internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); return *this; } @@ -88,8 +84,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool Sum sum(m_expression, other); typedef TensorAssignOp<ExpressionType, const Sum> Assign; Assign assign(m_expression, sum); - static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; - internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); return *this; } @@ -100,8 +95,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool Difference difference(m_expression, other); typedef TensorAssignOp<ExpressionType, const Difference> Assign; Assign assign(m_expression, difference); - static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; - internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); return *this; } @@ -122,7 +116,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; Assign assign(m_expression, other); - internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); return *this; } @@ -133,7 +127,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> Sum sum(m_expression, other); typedef TensorAssignOp<ExpressionType, const Sum> Assign; Assign assign(m_expression, sum); - internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); return *this; } @@ -144,14 +138,13 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> Difference difference(m_expression, other); typedef TensorAssignOp<ExpressionType, const Difference> Assign; Assign assign(m_expression, difference); - static const bool Vectorize = TensorEvaluator<const Assign, GpuDevice>::PacketAccess; - internal::TensorExecutor<const Assign, GpuDevice, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); return *this; } protected: const GpuDevice& m_device; - ExpressionType m_expression; + ExpressionType& 
m_expression; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 4c713af9f..89dffbdfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -145,39 +145,39 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0 Sizes() { } template <typename DenseIndex> - explicit Sizes(const array<DenseIndex, Base::count>& indices) { + explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - template <typename... DenseIndex> Sizes(DenseIndex... indices) { } - explicit Sizes(std::initializer_list<std::size_t> l) { + template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { } + explicit Sizes(std::initializer_list<std::size_t>) { // todo: add assertion } #else - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } #endif - template <typename T> Sizes& operator = (const T& other) { + template <typename T> Sizes& operator = (const T&) { // to do: check the size of other return *this; } template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this); + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this)); } template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this); + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this)); } }; @@ -343,7 +343,7 @@ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::s template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > { static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; }; -template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) { +template <std::size_t n, std::size_t V1, 
std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) { return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d084880de..9198c17ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -352,11 +352,12 @@ template<typename IfArgType, typename ThenArgType, typename ElseArgType, typenam struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> { typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType; + typedef typename XprType::Scalar Scalar; enum { IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* & - TensorEvaluator<IfArgType>::PacketAccess*/, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess & + internal::packet_traits<Scalar>::HasBlend, Layout = TensorEvaluator<IfArgType, Device>::Layout, CoordAccess = false, // to be implemented }; @@ -373,7 +374,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> } typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename internal::traits<XprType>::Packet PacketReturnType; typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; @@ -403,7 +403,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; internal::Selector<PacketSize> select; for (Index i = 0; i < PacketSize; ++i) { select.select[i] = m_condImpl.coeff(index+i); diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index 5ab97f86c..e17b5952c 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -14,15 +14,16 @@ using Eigen::Tensor; using Eigen::DefaultDevice; +template <int DataLayout> static void test_evals() { - Tensor<float, 2> input(3, 3); - Tensor<float, 1> kernel(2); + Tensor<float, 2, DataLayout> input(3, 3); + Tensor<float, 1, DataLayout> kernel(2); input.setRandom(); kernel.setRandom(); - Tensor<float, 2> result(2,3); + Tensor<float, 2, DataLayout> result(2,3); result.setZero(); Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}}; @@ -41,16 +42,16 @@ static void test_evals() VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 } - +template <int DataLayout> static void test_expr() { - Tensor<float, 2> input(3, 3); - Tensor<float, 2> kernel(2, 2); + Tensor<float, 2, DataLayout> input(3, 3); + Tensor<float, 2, DataLayout> kernel(2, 2); input.setRandom(); kernel.setRandom(); - Tensor<float, 2> result(2,2); - Eigen::array<ptrdiff_t, 2> dims{{0, 1}}; + Tensor<float, 2, DataLayout> result(2,2); + Eigen::array<ptrdiff_t, 2> dims({0, 1}); result = input.convolve(kernel, dims); VERIFY_IS_APPROX(result(0,0), 
input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + @@ -63,10 +64,10 @@ static void test_expr() input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); } - +template <int DataLayout> static void test_modes() { - Tensor<float, 1> input(3); - Tensor<float, 1> kernel(3); + Tensor<float, 1, DataLayout> input(3); + Tensor<float, 1, DataLayout> kernel(3); input(0) = 1.0f; input(1) = 2.0f; input(2) = 3.0f; @@ -74,13 +75,13 @@ static void test_modes() { kernel(1) = 1.0f; kernel(2) = 0.0f; - const Eigen::array<ptrdiff_t, 1> dims{{0}}; + const Eigen::array<ptrdiff_t, 1> dims({0}); Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding; // Emulate VALID mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(0, 0); - Tensor<float, 1> valid(1); + Tensor<float, 1, DataLayout> valid(1); valid = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(valid.dimension(0), 1); VERIFY_IS_APPROX(valid(0), 2.5f); @@ -88,7 +89,7 @@ static void test_modes() { // Emulate SAME mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(1, 1); - Tensor<float, 1> same(3); + Tensor<float, 1, DataLayout> same(3); same = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(same.dimension(0), 3); VERIFY_IS_APPROX(same(0), 1.0f); @@ -98,7 +99,7 @@ static void test_modes() { // Emulate FULL mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(2, 2); - Tensor<float, 1> full(5); + Tensor<float, 1, DataLayout> full(5); full = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(full.dimension(0), 5); VERIFY_IS_APPROX(full(0), 0.0f); @@ -108,18 +109,18 @@ static void test_modes() { VERIFY_IS_APPROX(full(4), 1.5f); } - +template <int DataLayout> static void test_strides() { - Tensor<float, 1> input(13); - Tensor<float, 1> kernel(3); + Tensor<float, 1, DataLayout> input(13); + Tensor<float, 1, DataLayout> kernel(3); input.setRandom(); kernel.setRandom(); - const Eigen::array<ptrdiff_t, 1> dims{{0}}; - const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}}; - const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}}; + const Eigen::array<ptrdiff_t, 1> dims({0}); + const Eigen::array<ptrdiff_t, 1> stride_of_3({3}); + const Eigen::array<ptrdiff_t, 1> stride_of_2({2}); - Tensor<float, 1> result; + Tensor<float, 1, DataLayout> result; result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); VERIFY_IS_EQUAL(result.dimension(0), 2); @@ -129,13 +130,14 @@ static void test_strides() { input(12)*kernel(2))); } - - - void test_cxx11_tensor_convolution() { - CALL_SUBTEST(test_evals()); - CALL_SUBTEST(test_expr()); - CALL_SUBTEST(test_modes()); - CALL_SUBTEST(test_strides()); + CALL_SUBTEST(test_evals<ColMajor>()); + CALL_SUBTEST(test_evals<RowMajor>()); + CALL_SUBTEST(test_expr<ColMajor>()); + CALL_SUBTEST(test_expr<RowMajor>()); + CALL_SUBTEST(test_modes<ColMajor>()); + CALL_SUBTEST(test_modes<RowMajor>()); + CALL_SUBTEST(test_strides<ColMajor>()); + CALL_SUBTEST(test_strides<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp index 8c1ca1bf8..78934165f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -117,11 +117,10 @@ void test_cuda_elementwise() } } - void test_cuda_reduction() { - Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113)); - Tensor<float, 2> out(Eigen::array<int, 2>(72,97)); + Tensor<float, 4> 
in1(72,53,97,113); + Tensor<float, 2> out(72,97); in1.setRandom(); std::size_t in1_bytes = in1.size() * sizeof(float); @@ -138,8 +137,8 @@ void test_cuda_reduction() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113)); - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97)); + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); array<int, 2> reduction_axis; reduction_axis[0] = 1; @@ -156,10 +155,10 @@ void test_cuda_reduction() for (int k = 0; k < 53; ++k) { for (int l = 0; l < 113; ++l) { expected = - std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l))); + std::max<float>(expected, in1(i, k, j, l)); } } - VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected); + VERIFY_IS_APPROX(out(i,j), expected); } } } @@ -170,7 +169,7 @@ static void test_cuda_contraction() // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31)); + Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1)); Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1)); @@ -196,12 +195,9 @@ static void test_cuda_contraction() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > - gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31)); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > - gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1)); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > - gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1)); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; MapXf m_left(t_left.data(), 300, 93); @@ -226,11 +222,12 @@ static void test_cuda_contraction() } } +template<int DataLayout> static void test_cuda_convolution_1d() { - Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); - Tensor<float, 1> kernel(Eigen::array<int, 1>(4)); - Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137)); + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 1, DataLayout> kernel(4); + Tensor<float, 4, DataLayout> out(74,34,11,137); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -252,9 +249,9 @@ static void test_cuda_convolution_1d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); - Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4)); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137)); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); 
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); Eigen::array<int, 1> dims(1); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); @@ -266,11 +263,9 @@ static void test_cuda_convolution_1d() for (int j = 0; j < 34; ++j) { for (int k = 0; k < 11; ++k) { for (int l = 0; l < 137; ++l) { - const float result = out(Eigen::array<int, 4>(i,j,k,l)); - const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) + - input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) + - input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) + - input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3)); + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + + input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); VERIFY_IS_APPROX(result, expected); } } @@ -278,12 +273,11 @@ static void test_cuda_convolution_1d() } } - -static void test_cuda_convolution_2d() +static void test_cuda_convolution_inner_dim_col_major_1d() { - Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); - Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4)); - Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137)); + Tensor<float, 4, ColMajor> input(74,9,11,7); + Tensor<float, 1, ColMajor> kernel(4); + Tensor<float, 4, ColMajor> out(71,9,11,7); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -305,46 +299,35 @@ static void test_cuda_convolution_2d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4)); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137)); + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); + Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); - Eigen::array<int, 2> dims(1,2); + Eigen::array<int, 1> dims(0); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 137; ++l) { - const float result = out(Eigen::array<int, 4>(i,j,k,l)); - const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) + - input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) + - input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) + - input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) + - input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) + - input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) + - input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) + - input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) + - input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) + - input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) + - 
input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) + - input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3)); - VERIFY_IS_APPROX(result, expected); + for (int i = 0; i < 71; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 7; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + + input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); } } } } } - -static void test_cuda_convolution_3d() +static void test_cuda_convolution_inner_dim_row_major_1d() { - Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17)); - Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2)); - Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17)); + Tensor<float, 4, RowMajor> input(7,9,11,74); + Tensor<float, 1, RowMajor> kernel(4); + Tensor<float, 4, RowMajor> out(7,9,11,71); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -366,139 +349,166 @@ static void test_cuda_convolution_3d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2)); - Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17)); + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); + Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); - Eigen::array<int, 3> dims(1,2,3); + Eigen::array<int, 1> dims(3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 136; ++l) { - for (int m = 0; m < 17; ++m) { - const float result = out(Eigen::array<int, 5>(i,j,k,l,m)); - const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) + - input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) + - input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) + - input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) + - input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) + - input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) + - input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) + - input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) + - input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) + - input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) + - input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) + - input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) + - input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) + - input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 
3>(1,0,1)) + - input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) + - input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) + - input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) + - input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) + - input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) + - input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) + - input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) + - input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) + - input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) + - input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1)); - VERIFY_IS_APPROX(result, expected); - } + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 71; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + + input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); + VERIFY_IS_APPROX(result, expected); } } } } } -static float* CudaCopyFloat(float* data, int size) { - const int nbytes = size * sizeof(float); - float* result = NULL; - if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) { - return NULL; - } else { - if (data != NULL) { - cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice); - } - return result; - } -} - -static void test_cuda_constant_broadcast() +template<int DataLayout> +static void test_cuda_convolution_2d() { + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 2, DataLayout> kernel(3,4); + Tensor<float, 4, DataLayout> out(74,35,8,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + cudaStream_t stream; assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Tensor<float, 1> t1(10); - for (int i = 0; i < 10; ++i) { - t1(i) = 10.0f * i; - } - float* t1_cuda = CudaCopyFloat(t1.data(), t1.size()); - Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10); - - Tensor<float, 1> t2(1); - t2 = t2.constant(20.0f); - float* t2_cuda = CudaCopyFloat(t2.data(), t2.size()); - Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); - float* t3_cuda = CudaCopyFloat(NULL, 10); - Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10); - - t3_gpu.device(gpu_device) = - t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10)); + Eigen::array<int, 2> dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, 
dims); - Eigen::Tensor<float, 1> t3(10); - cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float), - cudaMemcpyDeviceToHost); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 10; ++i) { - VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+2,k+0,l) * kernel(2,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1) + + input(i,j+2,k+1,l) * kernel(2,1) + + input(i,j+0,k+2,l) * kernel(0,2) + + input(i,j+1,k+2,l) * kernel(1,2) + + input(i,j+2,k+2,l) * kernel(2,2) + + input(i,j+0,k+3,l) * kernel(0,3) + + input(i,j+1,k+3,l) * kernel(1,3) + + input(i,j+2,k+3,l) * kernel(2,3); + VERIFY_IS_APPROX(result, expected); + } + } + } } } - -void test_cuda_cast() +template<int DataLayout> +static void test_cuda_convolution_3d() { - Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97)); - Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); - in.setRandom(); + Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); + Tensor<float, 3, DataLayout> kernel(3,4,2); + Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); - std::size_t in_bytes = in.size() * sizeof(double); + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); std::size_t out_bytes = out.size() * sizeof(float); - double* d_in; + float* d_input; + float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); cudaMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); cudaStream_t stream; assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); - gpu_out.device(gpu_device) = gpu_in.template cast<float>(); + Eigen::array<int, 3> dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 53; ++j) { - for (int k = 0; k < 97; ++k) { - VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k)))); + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 
17; ++m) { + const float result = out(i,j,k,l,m); + const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + + input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + + input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + + input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + + input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + + input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + + input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + + input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + + input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + + input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + + input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + + input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + + input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + + input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + + input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + + input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + + input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + + input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + + input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + + input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + + input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + + input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + + input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + + input(i,j+2,k+3,l+1,m) * kernel(2,3,1); + VERIFY_IS_APPROX(result, expected); + } + } } } } } - void test_cxx11_tensor_cuda() { CALL_SUBTEST(test_cuda_elementwise_small()); @@ -506,9 +516,12 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_reduction()); CALL_SUBTEST(test_cuda_contraction<ColMajor>()); CALL_SUBTEST(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_constant_broadcast()); - CALL_SUBTEST(test_cuda_cast()); + CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>()); } |
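For readers who want to check the convolution semantics exercised by the updated tests without a CUDA device, the following is a minimal host-side sketch. It assumes only the unsupported Eigen Tensor module; the sizes and variable names are illustrative and are not part of this commit.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <cmath>

int main() {
  // Sizes are passed straight to the constructors, the style the updated tests
  // use in place of Eigen::array<int, N>(...).
  Eigen::Tensor<float, 1> input(5);
  Eigen::Tensor<float, 1> kernel(3);
  input.setRandom();
  kernel.setRandom();

  // Convolve along dimension 0 and evaluate on the host (default device).
  const Eigen::array<ptrdiff_t, 1> dims({0});
  Eigen::Tensor<float, 1> result(3);
  result = input.convolve(kernel, dims);

  // Same check pattern as VERIFY_IS_APPROX in the tests above: output
  // coefficient i is the sum over k of input(i + k) * kernel(k).
  const float expected = input(1) * kernel(0) + input(2) * kernel(1) + input(3) * kernel(2);
  assert(std::abs(result(1) - expected) < 1e-4f);
  return 0;
}

On a GPU the same expression is assigned through device(), e.g. gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); note that with this commit the TensorDevice assignment operators no longer compute a Vectorize flag at the call site, so the default policy of internal::TensorExecutor applies instead.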