227 files changed, 32433 insertions, 5999 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 05d92babe..00287c9bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ project(Eigen) -cmake_minimum_required(VERSION 2.8.2) +cmake_minimum_required(VERSION 2.8.4) # guard against in-source builds @@ -448,6 +448,7 @@ if(cmake_generator_tolower MATCHES "makefile") message(STATUS "make check | Build and run the unit-tests. Read this page:") message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)") + message(STATUS "make uninstall| Removes files installed by make install") message(STATUS "--------------+--------------------------------------------------------------") else() message(STATUS "To build/run the unit tests, read this page:") @@ -483,3 +484,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake DESTINATION ${EIGEN_CONFIG_CMAKE_PATH} ) + +# Add uninstall target +add_custom_target ( uninstall + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) diff --git a/Eigen/Core b/Eigen/Core index dcb20bfd0..216428e97 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -125,9 +125,7 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif + // include files // This extern "C" works around a MINGW-w64 compilation issue @@ -187,6 +185,11 @@ #endif #endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include <vector_types.h> +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -302,9 +305,15 @@ using std::ptrdiff_t; #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" #endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" +#endif + #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/BinaryFunctors.h" diff --git a/Eigen/SparseCore b/Eigen/SparseCore index b68c8fa8a..48ed967b8 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -26,21 +26,17 @@ * This module depends on: Core. */ -namespace Eigen { - -/** The type used to identify a general sparse storage. */ -struct Sparse {}; - -} - #include "src/SparseCore/SparseUtil.h" #include "src/SparseCore/SparseMatrixBase.h" #include "src/SparseCore/SparseAssign.h" #include "src/SparseCore/CompressedStorage.h" #include "src/SparseCore/AmbiVector.h" +#include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" +#include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" +#include "src/SparseCore/SparseRef.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" #include "src/SparseCore/SparseCwiseBinaryOp.h" #include "src/SparseCore/SparseTranspose.h" diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 6ea383695..2ef37ca1c 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -87,7 +87,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? 
RowMajorBit : 0, - Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit + Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit // FIXME DirectAccessBit should not be handled by expressions }; }; diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index 98ebe3bf6..808f3977c 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -28,7 +28,6 @@ template<typename XprType> struct CommaInitializer { typedef typename XprType::Scalar Scalar; - typedef typename XprType::StorageIndex StorageIndex; EIGEN_DEVICE_FUNC inline CommaInitializer(XprType& xpr, const Scalar& s) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index eb35b44cb..9485080d3 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -111,6 +111,7 @@ struct evaluator_base typedef evaluator<ExpressionType> type; typedef evaluator<ExpressionType> nestedType; + // FIXME is it really usefull? typedef typename traits<ExpressionType>::StorageIndex StorageIndex; // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices. typedef traits<ExpressionType> ExpressionTraits; @@ -128,7 +129,6 @@ struct evaluator<PlainObjectBase<Derived> > : evaluator_base<Derived> { typedef PlainObjectBase<Derived> PlainObjectType; - typedef typename PlainObjectType::StorageIndex StorageIndex; typedef typename PlainObjectType::Scalar Scalar; typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; typedef typename PlainObjectType::PacketScalar PacketScalar; @@ -264,7 +264,6 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -343,7 +342,6 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> > : m_functor(n.functor()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -394,7 +392,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased > m_argImpl(op.nestedExpression()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -469,7 +466,6 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase m_rhsImpl(xpr.rhs()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -522,7 +518,6 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased> m_argImpl(op.nestedExpression()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -563,7 +558,6 @@ struct mapbase_evaluator : evaluator_base<Derived> { typedef Derived XprType; typedef typename XprType::PointerType PointerType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef 
typename XprType::PacketScalar PacketScalar; @@ -760,7 +754,6 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa m_startCol(block.startCol()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -865,7 +858,6 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > m_elseImpl(select.elseMatrix()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const @@ -898,7 +890,6 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> > : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> > { typedef Replicate<ArgType, RowFactor, ColFactor> XprType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; enum { @@ -981,7 +972,6 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> > : m_expr(expr) {} - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const @@ -1016,7 +1006,6 @@ struct evaluator_wrapper_base EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} - typedef typename ArgType::StorageIndex StorageIndex; typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; typedef typename ArgType::PacketScalar PacketScalar; @@ -1103,7 +1092,6 @@ struct unary_evaluator<Reverse<ArgType, Direction> > : evaluator_base<Reverse<ArgType, Direction> > { typedef Reverse<ArgType, Direction> XprType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -1219,9 +1207,10 @@ struct evaluator<Diagonal<ArgType, DiagIndex> > m_index(diagonal.index()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; + // FIXME having to check whether ArgType is sparse here i not very nice. 
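The FIXME just above concerns the conditional typedef that follows: for a sparse argument, coeff() on a Diagonal view may have to produce a zero that exists nowhere in memory, so the return type must be a plain Scalar rather than a reference. A minimal sketch of the same compile-time selection, using std::conditional as a stand-in for Eigen's internal::conditional (the Dense/Sparse tags and diagonal_coeff_return below are illustrative, not Eigen's types):

    #include <type_traits>

    struct Dense {};   // illustrative stand-ins for Eigen's storage-kind tags
    struct Sparse {};

    template <typename StorageKind, typename Scalar>
    struct diagonal_coeff_return {
      // Dense storage can hand out a const reference into the stored data;
      // sparse storage may have to return a conceptual zero by value.
      typedef typename std::conditional<
          !std::is_same<StorageKind, Sparse>::value,
          const Scalar&,
          Scalar>::type type;
    };

    static_assert(std::is_same<diagonal_coeff_return<Dense,  double>::type, const double&>::value, "");
    static_assert(std::is_same<diagonal_coeff_return<Sparse, double>::type, double>::value, "");

Eigen carries its own internal::conditional because it still supports C++03 compilers, where <type_traits> is unavailable.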
+ typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value, + typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const { diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index f37091000..56beaf3bc 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -326,6 +326,12 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar> dst.setZero(); dst.diagonal() = src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/) + { dst.diagonal() += src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/) + { dst.diagonal() -= src.diagonal(); } }; } // namespace internal diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 4eb01b1b3..81750722c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -11,7 +11,7 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H -namespace Eigen { +namespace Eigen { enum { Large = 2, @@ -252,12 +252,12 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true> bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), evalToDest ? dest.data() : static_dest.data()); - + if(!evalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN @@ -273,11 +273,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true> MappedDest(actualDestPtr, dest.size()) = dest; } + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; general_matrix_vector_product - <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhs.data(), actualRhs.innerStride(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), actualDestPtr, 1, compatibleAlpha); @@ -333,11 +335,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true> Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; } + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; general_matrix_vector_product - <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhsPtr, 1, + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), dest.data(), dest.innerStride(), actualAlpha); } @@ -410,7 +414,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const #ifdef 
EIGEN_DEBUG_PRODUCT internal::product_type<Derived,OtherDerived>::debug(); #endif - + return Product<Derived, OtherDerived>(derived(), other.derived()); } diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 81f4eef40..8759cd06c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -94,6 +95,8 @@ template<typename T> struct packet_traits : default_packet_traits }; }; +template<typename T> struct packet_traits<const T> : packet_traits<T> { }; + /** \internal \returns a + b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, @@ -356,7 +359,7 @@ pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */ template<typename Packet, int LoadMode> -inline Packet ploadt(const typename unpacket_traits<Packet>::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from) { if(LoadMode == Aligned) return pload<Packet>(from); @@ -367,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from) /** \internal copy the packet \a from to \a *to. * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */ template<typename Scalar, typename Packet, int LoadMode> -inline void pstoret(Scalar* to, const Packet& from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) { if(LoadMode == Aligned) pstore(to, from); @@ -375,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from) pstoreu(to, from); } +/** \internal \returns a packet version of \a *from. + * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the + * hardware if available to speedup the loading of data that won't be modified + * by the current computation. + */ +template<typename Packet, int LoadMode> +inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from) +{ + return ploadt<Packet, LoadMode>(from); +} + /** \internal default implementation of palign() allowing partial specialization */ template<int Offset,typename PacketType> struct palign_impl @@ -433,6 +447,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template <size_t N> struct Selector { + bool select[N]; +}; + +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? 
thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 3dafee9d7..8dca9796d 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -171,6 +171,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors> template<typename Derived> class MapBase<Derived, WriteAccessors> : public MapBase<Derived, ReadOnlyAccessors> { + typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase; public: typedef MapBase<Derived, ReadOnlyAccessors> Base; @@ -238,11 +239,13 @@ template<typename Derived> class MapBase<Derived, WriteAccessors> EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) { - Base::Base::operator=(other); + ReadOnlyMapBase::Base::operator=(other); return derived(); } - using Base::Base::operator=; + // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, + // see bugs 821 and 920. + using ReadOnlyMapBase::Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4c5fc1cae..16ad2dc7e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -14,7 +14,7 @@ namespace Eigen { // On WINCE, std::abs is defined for int only, so let's defined our own overloads: // This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too. -#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500 +#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 long abs(long x) { return (labs(x)); } double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } @@ -360,50 +360,31 @@ inline NewType cast(const OldType& x) } /**************************************************************************** -* Implementation of atanh2 * +* Implementation of logp1 * ****************************************************************************/ template<typename Scalar> -struct atanh2_impl +struct log1p_impl { - static inline Scalar run(const Scalar& x, const Scalar& r) + static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if (__cplusplus >= 201103L) && !defined(__CYGWIN__) + // Let's be conservative and enable the default C++11 implementation only if we are sure it exists + #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC) using std::log1p; - return log1p(2 * x / (r - x)) / 2; + return log1p(x); #else - using std::abs; + typedef typename NumTraits<Scalar>::Real RealScalar; using std::log; - using std::sqrt; - Scalar z = x / r; - if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon())) - return log((r + x) / (r - x)) / 2; - else - return z + z*z*z / 3; + Scalar x1p = RealScalar(1) + x; + return ( x1p == Scalar(1) ) ? 
x : x * ( log(x1p) / (x1p - RealScalar(1)) ); #endif } }; -template<typename RealScalar> -struct atanh2_impl<std::complex<RealScalar> > -{ - typedef std::complex<RealScalar> Scalar; - static inline Scalar run(const Scalar& x, const Scalar& r) - { - using std::log; - using std::norm; - using std::sqrt; - Scalar z = x / r; - if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon()) - return RealScalar(0.5) * log((r + x) / (r - x)); - else - return z + z*z*z / RealScalar(3); - } -}; - template<typename Scalar> -struct atanh2_retval +struct log1p_retval { typedef Scalar type; }; @@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& template<typename Scalar> EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); + return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } template<typename Scalar> diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index b2c9b56ed..7f9d135f7 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -210,23 +210,26 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct> template<typename Dst, typename Lhs, typename Rhs, typename Func> EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { + typename evaluator<Rhs>::type rhsEval(rhs); // FIXME make sure lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and rhs + // FIXME we should probably build an evaluator for dst const Index cols = dst.cols(); for (Index j=0; j<cols; ++j) - func(dst.col(j), rhs.coeff(0,j) * lhs); + func(dst.col(j), rhsEval.coeff(0,j) * lhs); } // Row major result template<typename Dst, typename Lhs, typename Rhs, typename Func> -EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { +EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +{ + typename evaluator<Lhs>::type lhsEval(lhs); // FIXME make sure rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and lhs + // FIXME we should probably build an evaluator for dst const Index rows = dst.rows(); for (Index i=0; i<rows; ++i) - func(dst.row(i), lhs.coeff(i,0) * rhs); + func(dst.row(i), lhsEval.coeff(i,0) * rhs); } template<typename Lhs, typename Rhs> diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 0f17e3a89..f97048bda 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic> typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? 
RowMajor : ColMajor, (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor> diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index e1316a73d..5d60ba149 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -213,18 +213,39 @@ MatrixBase<Derived>::adjoint() const namespace internal { template<typename MatrixType, - bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic> + bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic, + bool MatchPacketSize = + (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size)) + && (internal::evaluator<MatrixType>::Flags&PacketAccessBit) > struct inplace_transpose_selector; template<typename MatrixType> -struct inplace_transpose_selector<MatrixType,true> { // square matrix +struct inplace_transpose_selector<MatrixType,true,false> { // square matrix static void run(MatrixType& m) { m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose()); } }; +// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template<typename MatrixType> -struct inplace_transpose_selector<MatrixType,false> { // non square matrix +struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize + static void run(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet; + typedef typename MatrixType::Index Index; + const Index PacketSize = internal::packet_traits<Scalar>::size; + const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned; + PacketBlock<Packet> A; + for (Index i=0; i<PacketSize; ++i) + A.packet[i] = m.template packetByOuterInner<Alignment>(i,0); + internal::ptranspose(A); + for (Index i=0; i<PacketSize; ++i) + m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]); + } +}; + +template<typename MatrixType,bool MatchPacketSize> +struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix static void run(MatrixType& m) { if (m.rows()==m.cols()) m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose()); diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e2376bd1f..be66a502a 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -58,7 +58,8 @@ template<> struct packet_traits<float> : default_packet_traits HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0 + HasSqrt = 0, + HasBlend = 1 }; }; template<> struct packet_traits<double> : default_packet_traits @@ -72,7 +73,8 @@ template<> struct packet_traits<double> : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0 + HasExp = 0, + HasBlend = 1 }; }; @@ -133,7 +135,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co return pset1<Packet8i>(0); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const 
Packet8f& b, const Packet8f& c) { #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, @@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index fa02f57a1..6b68fc7a5 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,12 +18,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 000000000..e7305c01e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,75 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
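The pblend specializations just above complete the Selector/pblend primitive added to GenericPacketMath.h earlier in this changeset: a Selector<N> holds one boolean per lane, and pblend picks each result lane from thenPacket or elsePacket accordingly (the SSE/AVX paths build a comparison mask and use blendv-style instructions to stay branch-free). A scalar model of the contract, runnable without SIMD headers — pblend_model is an illustration, not Eigen's implementation:

    #include <cstddef>

    template <std::size_t N>
    struct Selector {          // mirrors internal::Selector from the diff
      bool select[N];
    };

    // Scalar model of pblend: lane i comes from thenPacket when
    // ifPacket.select[i] is true, otherwise from elsePacket.
    template <typename T, std::size_t N>
    void pblend_model(const Selector<N>& ifPacket, const T (&thenPacket)[N],
                      const T (&elsePacket)[N], T (&result)[N]) {
      for (std::size_t i = 0; i < N; ++i)
        result[i] = ifPacket.select[i] ? thenPacket[i] : elsePacket[i];
    }

    int main() {
      Selector<4> s = {{true, false, true, false}};
      float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40}, r[4];
      pblend_model(s, a, b, r);   // r is now {1, 20, 3, 40}
    }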
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_STRONG_INLINE +float4 plog<float4>(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 plog<double2>(const double2& a) +{ + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 pexp<float4>(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 pexp<double2>(const double2& a) +{ + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 psqrt<float4>(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 psqrt<double2>(const double2& a) +{ + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 prsqrt<float4>(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 prsqrt<double2>(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..19749c832 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,296 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic<float4> { enum { value = true }; }; +template<> struct is_arithmetic<double2> { enum { value = true }; }; + + +template<> struct packet_traits<float> : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; }; +template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) { + return *reinterpret_cast<const float4*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) { + return *reinterpret_cast<const double2*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) { + *reinterpret_cast<float4*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) { + *reinterpret_cast<double2*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +#ifdef __CUDA_ARCH__ +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { + return __ldg((const float4*)from); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { + return __ldg((const double2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { + return make_double2(__ldg(from+0), __ldg(from+1)); +} +#endif + +template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline 
double pfirst<double2>(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) { + return fmin(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { + return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { + return make_double2(abs(a.x), abs(a.y)); +} + + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<float4,4>& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<double2,2>& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h new file mode 100644 index 000000000..6bb05bb92 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -0,0 +1,91 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
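The NEON pexp that follows uses the classic Cephes scheme: split x as n*ln(2) + g with small g, evaluate a polynomial for exp(g), then reconstruct 2^n by writing n+127 straight into the exponent bits of a float (the vshlq_n_s32(mm, 23) step). A scalar sketch of that reduction and reconstruction — the polynomial here is a deliberately short stand-in for the cephes_exp_p0..p5 coefficients in the diff, and IEEE-754 binary32 is assumed:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Build 2^n by placing (n + 127) in the exponent field of an IEEE-754
    // binary32 float -- the scalar analogue of vaddq_s32(mm, p4i_0x7f)
    // followed by vshlq_n_s32(mm, 23) in the NEON kernel. Valid for
    // n in [-126, 127].
    static float pow2n(int n) {
      std::uint32_t bits = static_cast<std::uint32_t>(n + 127) << 23;
      float r;
      std::memcpy(&r, &bits, sizeof r);   // type-pun without aliasing UB
      return r;
    }

    static float exp_model(float x) {
      const float LOG2E = 1.44269504088896341f;   // 1/ln(2), as in the diff
      int n = static_cast<int>(std::floor(x * LOG2E + 0.5f));
      float g = x - static_cast<float>(n) * 0.693147180559945f;  // x - n*ln(2)
      // Deliberately short polynomial for exp(g); the diff uses the
      // degree-5 Cephes coefficients instead.
      float eg = 1.0f + g * (1.0f + g * (0.5f + g * (1.0f / 6.0f)));
      return eg * pow2n(n);
    }

    int main() {
      std::printf("%f vs std::exp: %f\n", exp_model(1.0f), std::exp(1.0f));
    }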
+ +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_NEON_H +#define EIGEN_MATH_FUNCTIONS_NEON_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp<Packet4f>(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6c5c669a1..559682cf7 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -20,12 +20,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator @@ -88,7 +88,7 @@ template<> struct packet_traits<float> : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, + HasExp = 1, HasSqrt = 0 }; }; @@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co 
return pset1<Packet4i>(0); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +#endif + +// No FMA instruction for int, so use MLA unconditionally. template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -492,6 +503,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 +#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) +// Bug 907: workaround missing declarations of the following two functions in the ADK +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f64 (float64x2_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_f64_u64 (uint64x2_t __a) +{ + return (float64x2_t) __a; +} +#endif + typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; @@ -536,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
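The pmadd overloads around this point switch from vmlaq_f32/vmlaq_f64 (separate multiply and add, two roundings) to vfmaq_f32/vfmaq_f64 (one rounding) when __ARM_FEATURE_FMA is available, which per the bug 936 notes above is both faster and more accurate. The rounding difference is easy to observe in portable C++ with std::fmaf; a small demonstration (compile with contraction disabled, e.g. -ffp-contract=off, so the compiler does not fuse the unfused line itself):

    #include <cmath>
    #include <cstdio>

    int main() {
      // a = 1 + 2^-12. Exactly, a*a = 1 + 2^-11 + 2^-24, whose last term
      // falls off the end of a float mantissa: the separate multiply rounds
      // it away, while fmaf keeps it (a single rounding at the end).
      float a = 1.0f + std::ldexp(1.0f, -12);
      float unfused = a * a - 1.0f;             // two roundings, like vmlaq
      float fused   = std::fmaf(a, a, -1.0f);   // one rounding, like vfmaq
      std::printf("unfused = %.9g, fused = %.9g\n", unfused, fused);
    }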
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } @@ -597,7 +627,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) { @@ -613,7 +648,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) } // Other reduction functions: // mul -template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif // min template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 414c4cb6a..565e448fe 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 28427c308..898cb9ab0 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif @@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits<double> : default_packet_traits @@ -123,7 +124,8 @@ template<> struct 
packet_traits<double> : default_packet_traits
     HasDiv  = 1,
     HasExp  = 1,
-    HasSqrt = 1
+    HasSqrt = 1,
+    HasBlend = 1
   };
 };
 #endif
@@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits
     // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size=4,
+
+    HasBlend = 1
   };
 };
@@ -227,7 +231,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 // for some weird reasons, it has to be overloaded for packets of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
 #endif
@@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128 false_mask = _mm_cmpeq_ps(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  const __m128d zero = _mm_setzero_pd();
+  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
+  __m128d false_mask = _mm_cmpeq_pd(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
+#endif
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 7b2ed6728..11e5f591d 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
 }
 
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
+  static bool m_cache_sizes_initialized = false;
+  static std::ptrdiff_t m_l1CacheSize = 32*1024;
+  static std::ptrdiff_t m_l2CacheSize = 256*1024;
+  static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
+
+  if(!m_cache_sizes_initialized)
   {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
+    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
+    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
+    m_cache_sizes_initialized = true;
   }
-
+
   if(action==SetAction)
   {
    // set the cpu cache size and cache all block sizes from a global cache size in bytes
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
+    m_l3CacheSize = *l3;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
+    *l3 = m_l3CacheSize;
  }
  else
  {
@@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
  *  - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes
  */
+#define CEIL(a, b) (((a)+(b)-1)/(b))
+
 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  EIGEN_UNUSED_VARIABLE(n);
   // Explanations:
   // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
   // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
@@ -81,47 +91,75 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
   // at the register level. For vectorization purposes, these small vertical panels are unpacked,
   // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
   // stay in L1 cache.
-  std::ptrdiff_t l1, l2;
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kdiv = KcFactor * 2 * Traits::nr
-         * Traits::RhsProgress * sizeof(RhsScalar),
-    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
-    mr_mask = (0xffffffff/mr)*mr
-  };
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      k_mask = (0xffffffff/8)*8,
+
+      mr = Traits::mr,
+      mr_mask = (0xffffffff/mr)*mr,
+
+      nr = Traits::nr,
+      nr_mask = (0xffffffff/nr)*nr
+    };
+    SizeType k_cache = (l1-ksub)/kdiv;
+    if (k_cache < k) {
+      k = k_cache & k_mask;
+      eigen_assert(k > 0);
+    }
-  manage_caching_sizes(GetAction, &l1, &l2);
+    SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    SizeType n_per_thread = CEIL(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_assert(n_cache >= static_cast<SizeType>(nr));
+      n = n_cache & nr_mask;
+      eigen_assert(n > 0);
+    } else {
+      n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
+    }
-// k = std::min<SizeType>(k, l1/kdiv);
-// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-// if(_m<m) m = _m & mr_mask;
-
-  // In unit tests we do not want to use extra large matrices,
-  // so we reduce the block size to check the blocking strategy is not flawed
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      SizeType m_per_thread = CEIL(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
+        m = m_cache & mr_mask;
+        eigen_assert(m > 0);
+      } else {
+        m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the block size to check the blocking strategy is not flawed
 #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-//   k = std::min<SizeType>(k,240);
-//   n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-//   m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
-
-  k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-  n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-  m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
+    k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
+    n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
+    m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
 #else
-  k = std::min<SizeType>(k,24);
-  n = std::min<SizeType>(n,384/sizeof(RhsScalar));
-  m = std::min<SizeType>(m,384/sizeof(RhsScalar));
+    k = std::min<SizeType>(k,24);
+    n = std::min<SizeType>(n,384/sizeof(RhsScalar));
+    m = std::min<SizeType>(m,384/sizeof(RhsScalar));
 #endif
+  }
 }
 
 template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_FUSE_CJMADD
-  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
 #else
 
   // FIXME (a bit overkill maybe ?)
@@ -146,8 +184,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
     gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
   }
 
-  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
+  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
+//   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
 #endif
 
 /* Vectorization logic
@@ -182,7 +220,7 @@ public:
     nr = 4,
 
     // register block size along the M direction (currently, this one cannot be modified)
-#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
     // we assume 16 registers
     mr = 3*LhsPacketSize,
 #else
@@ -248,7 +286,7 @@ public:
     // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling registers.
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
     c = pmadd(a,b,c);
 #else
@@ -290,7 +328,7 @@ public:
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     nr = 4,
-#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
     // we assume 16 registers
     mr = 3*LhsPacketSize,
 #else
@@ -353,7 +391,7 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
     c.v = pmadd(a.v,b,c.v);
 #else
@@ -637,7 +675,7 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
     c.v = pmadd(a,b.v,c.v);
 #else
@@ -667,7 +705,7 @@ protected:
  * |real |cplx | no vectorization yet, would require to pack A with duplication
  * |cplx |real | easy vectorization
  */
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
@@ -676,14 +714,15 @@ struct gebp_kernel
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
-
+
   typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
   typedef typename SwappedTraits::ResScalar SResScalar;
   typedef typename SwappedTraits::LhsPacket SLhsPacket;
   typedef typename SwappedTraits::RhsPacket SRhsPacket;
   typedef typename SwappedTraits::ResPacket SResPacket;
   typedef typename SwappedTraits::AccPacket SAccPacket;
-
+
+  typedef typename DataMapper::LinearMapper LinearMapper;
 
   enum {
     Vectorizable = Traits::Vectorizable,
@@ -693,14 +732,16 @@ struct gebp_kernel
   };
 
   EIGEN_DONT_INLINE
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+                  Index rows, Index depth, Index cols, ResScalar alpha,
                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
-  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
+  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+               Index rows, Index depth, Index cols, ResScalar alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
   Traits traits;
@@ -743,15 +784,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          ResScalar* r1 = &res[(j2+1)*resStride + i];
-          ResScalar* r2 = &res[(j2+2)*resStride + i];
-          ResScalar* r3 = &res[(j2+3)*resStride + i];
-
-          internal::prefetch(r0);
-          internal::prefetch(r1);
-          internal::prefetch(r2);
-          internal::prefetch(r3);
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(0);
+          r1.prefetch(0);
+          r2.prefetch(0);
+          r3.prefetch(0);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -760,31 +801,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 4");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            RhsPacket B_0, T0;
            LhsPacket A2;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            internal::prefetch(blA+(3*K+16)*LhsProgress); \
-            traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
-            traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
-            traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, T0); \
-            traits.madd(A1, B_0, C4, T0); \
-            traits.madd(A2, B_0, C8, B_0); \
-            traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C1, T0); \
-            traits.madd(A1, B_0, C5, T0); \
-            traits.madd(A2, B_0, C9, B_0); \
-            traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C2, T0); \
-            traits.madd(A1, B_0, C6, T0); \
-            traits.madd(A2, B_0, C10, B_0); \
-            traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C3 , T0); \
-            traits.madd(A1, B_0, C7, T0); \
-            traits.madd(A2, B_0, C11, B_0)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              internal::prefetch(blA+(3*K+16)*LhsProgress); \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+              traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, T0); \
+              traits.madd(A1, B_0, C4, T0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C1, T0); \
+              traits.madd(A1, B_0, C5, T0); \
+              traits.madd(A2, B_0, C9, B_0); \
+              traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C2, T0); \
+              traits.madd(A1, B_0, C6, T0); \
+              traits.madd(A2, B_0, C10, B_0); \
+              traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C3 , T0); \
+              traits.madd(A1, B_0, C7, T0); \
+              traits.madd(A2, B_0, C11, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+            } while(false)
 
            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
@@ -799,6 +845,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
@@ -813,48 +861,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
+
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
-          pstoreu(r0+2*Traits::ResPacketSize, R2);
-
-          R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
-          pstoreu(r1+0*Traits::ResPacketSize, R0);
-          pstoreu(r1+1*Traits::ResPacketSize, R1);
-          pstoreu(r1+2*Traits::ResPacketSize, R2);
-
-          R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize);
+          r1.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
-          pstoreu(r2+0*Traits::ResPacketSize, R0);
-          pstoreu(r2+1*Traits::ResPacketSize, R1);
-          pstoreu(r2+2*Traits::ResPacketSize, R2);
-
-          R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
-          pstoreu(r3+0*Traits::ResPacketSize, R0);
-          pstoreu(r3+1*Traits::ResPacketSize, R1);
-          pstoreu(r3+2*Traits::ResPacketSize, R2);
+          r3.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
-
+
        // Deal with remaining columns of the rhs
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
@@ -868,7 +916,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C4);
          traits.initAcc(C8);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(0);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
 
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
            RhsPacket B_0;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
-            traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
-            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, B_0); \
-            traits.madd(A1, B_0, C4, B_0); \
-            traits.madd(A2, B_0, C8, B_0)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, B_0); \
+              traits.madd(A1, B_0, C4, B_0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+            } while(false)
 
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
@@ -898,6 +952,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
          }
 
          // process remaining peeled loop
@@ -912,19 +968,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
-          traits.acc(C8 , alphav, R2);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
-          pstoreu(r0+2*Traits::ResPacketSize, R2);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
-
+
    //---------- Process 2 * LhsProgress rows at once ----------
    if(mr>=2*Traits::LhsProgress)
    {
@@ -946,15 +1002,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          ResScalar* r1 = &res[(j2+1)*resStride + i];
-          ResScalar* r2 = &res[(j2+2)*resStride + i];
-          ResScalar* r3 = &res[(j2+3)*resStride + i];
-
-          internal::prefetch(r0+prefetch_res_offset);
-          internal::prefetch(r1+prefetch_res_offset);
-          internal::prefetch(r2+prefetch_res_offset);
-          internal::prefetch(r3+prefetch_res_offset);
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -963,21 +1019,26 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 2pX4");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
            RhsPacket B_0, B1, B2, B3, T0;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
-            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
-            traits.madd(A0, B_0, C0, T0); \
-            traits.madd(A1, B_0, C4, B_0); \
-            traits.madd(A0, B1, C1, T0); \
-            traits.madd(A1, B1, C5, B1); \
-            traits.madd(A0, B2, C2, T0); \
-            traits.madd(A1, B2, C6, B2); \
-            traits.madd(A0, B3, C3, T0); \
-            traits.madd(A1, B3, C7, B3)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+              traits.madd(A0, B_0, C0, T0); \
+              traits.madd(A1, B_0, C4, B_0); \
+              traits.madd(A0, B1, C1, T0); \
+              traits.madd(A1, B1, C5, B1); \
+              traits.madd(A0, B2, C2, T0); \
+              traits.madd(A1, B2, C6, B2); \
+              traits.madd(A0, B3, C3, T0); \
+              traits.madd(A1, B3, C7, B3); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+            } while(false)
 
            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
@@ -992,6 +1053,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
@@ -1002,37 +1065,37 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blA += 2*Traits::LhsProgress;
          }
 #undef EIGEN_GEBGP_ONESTEP
-
+
          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
-          R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
+
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
-          pstoreu(r1+0*Traits::ResPacketSize, R2);
-          pstoreu(r1+1*Traits::ResPacketSize, R3);
-
-          R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
-          R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(0 * Traits::ResPacketSize, R2);
+          r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
-          pstoreu(r2+0*Traits::ResPacketSize, R0);
-          pstoreu(r2+1*Traits::ResPacketSize, R1);
-          pstoreu(r3+0*Traits::ResPacketSize, R2);
-          pstoreu(r3+1*Traits::ResPacketSize, R3);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(0 * Traits::ResPacketSize, R2);
+          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
-
+
        // Deal with remaining columns of the rhs
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
@@ -1045,8 +1108,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C0);
          traits.initAcc(C4);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          internal::prefetch(r0+prefetch_res_offset);
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(prefetch_res_offset);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -1054,15 +1117,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0, B1;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
-            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, B1); \
-            traits.madd(A1, B_0, C4, B_0)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, B1); \
+              traits.madd(A1, B_0, C4, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+            } while(false)
 
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
@@ -1075,6 +1143,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*RhsProgress;
            blA += pk*2*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }
 
          // process remaining peeled loop
@@ -1089,12 +1159,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
@@ -1120,15 +1190,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C2);
          traits.initAcc(C3);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          ResScalar* r1 = &res[(j2+1)*resStride + i];
-          ResScalar* r2 = &res[(j2+2)*resStride + i];
-          ResScalar* r3 = &res[(j2+3)*resStride + i];
-
-          internal::prefetch(r0+prefetch_res_offset);
-          internal::prefetch(r1+prefetch_res_offset);
-          internal::prefetch(r2+prefetch_res_offset);
-          internal::prefetch(r3+prefetch_res_offset);
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -1137,16 +1207,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 1pX4");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
            RhsPacket B_0, B1, B2, B3;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
-            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
-            traits.madd(A0, B_0, C0, B_0); \
-            traits.madd(A0, B1, C1, B1); \
-            traits.madd(A0, B2, C2, B2); \
-            traits.madd(A0, B3, C3, B3);
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+              traits.madd(A0, B_0, C0, B_0); \
+              traits.madd(A0, B1, C1, B1); \
+              traits.madd(A0, B2, C2, B2); \
+              traits.madd(A0, B3, C3, B3); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
+            } while(false)
 
            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
@@ -1161,6 +1236,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*4*RhsProgress;
            blA += pk*1*LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
@@ -1171,25 +1248,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blA += 1*LhsProgress;
          }
 #undef EIGEN_GEBGP_ONESTEP
-
+
          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
+
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C1, alphav, R1);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r1+0*Traits::ResPacketSize, R1);
-
-          R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C3, alphav, R1);
-          pstoreu(r2+0*Traits::ResPacketSize, R0);
-          pstoreu(r3+0*Traits::ResPacketSize, R1);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(0 * Traits::ResPacketSize, R1);
        }
-
+
        // Deal with remaining columns of the rhs
       for(Index j2=packet_cols4; j2<cols; j2++)
       {
@@ -1201,7 +1278,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          AccPacket C0;
          traits.initAcc(C0);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
+          LinearMapper r0 = res.getLinearMapper(i, j2);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -1209,14 +1286,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
-            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, B_0); \
-
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+            } while(false);
+
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
@@ -1228,6 +1310,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*RhsProgress;
            blA += pk*1*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }
 
          // process remaining peeled loop
@@ -1241,9 +1325,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
 #undef EIGEN_GEBGP_ONESTEP
          ResPacket R0;
          ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
        }
      }
    }
@@ -1259,7 +1343,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-
+
        if( (SwappedTraits::LhsProgress % 4)==0 )
        {
          // NOTE The following piece of code won't work for 512 bit registers
@@ -1268,32 +1352,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          straits.initAcc(C1);
          straits.initAcc(C2);
          straits.initAcc(C3);
-
+
          const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
          const Index endk  = (depth/spk)*spk;
          const Index endk4 = (depth/(spk*4))*(spk*4);
-
+
          Index k=0;
          for(; k<endk4; k+=4*spk)
          {
            SLhsPacket A0,A1;
            SRhsPacket B_0,B_1;
-
+
            straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
-
+
            straits.loadRhsQuad(blA+0*spk, B_0);
            straits.loadRhsQuad(blA+1*spk, B_1);
            straits.madd(A0,B_0,C0,B_0);
            straits.madd(A1,B_1,C1,B_1);
-
+
            straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
            straits.loadRhsQuad(blA+2*spk, B_0);
            straits.loadRhsQuad(blA+3*spk, B_1);
            straits.madd(A0,B_0,C2,B_0);
            straits.madd(A1,B_1,C3,B_1);
-
+
            blB += 4*SwappedTraits::LhsProgress;
            blA += 4*spk;
          }
@@ -1302,11 +1386,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          {
            SLhsPacket A0;
            SRhsPacket B_0;
-
+
            straits.loadLhsUnaligned(blB, A0);
            straits.loadRhsQuad(blA, B_0);
            straits.madd(A0,B_0,C0,B_0);
-
+
            blB += SwappedTraits::LhsProgress;
            blA += spk;
          }
@@ -1317,10 +1401,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
            typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
            typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
-
-            SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride);
+
+            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
-
+
            if(depth-endk>0)
            {
              // We have to handle the last row of the rhs which corresponds to a half-packet
@@ -1336,14 +1420,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            {
              straits.acc(predux4(C0), alphav, R);
            }
-            pscatter(&res[j2*resStride + i], R, resStride);
+            res.scatterPacket(i, j2, R);
          }
          else
          {
-            SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
+            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
            SResPacket alphav = pset1<SResPacket>(alpha);
            straits.acc(C0, alphav, R);
-            pscatter(&res[j2*resStride + i], R, resStride);
+            res.scatterPacket(i, j2, R);
          }
        }
        else // scalar path
        {
@@ -1355,25 +1439,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          {
            LhsScalar A0;
            RhsScalar B_0, B_1;
-
+
            A0 = blA[k];
-
+
            B_0 = blB[0];
            B_1 = blB[1];
-            MADD(cj,A0,B_0,C0, B_0);
-            MADD(cj,A0,B_1,C1, B_1);
+            CJMADD(cj,A0,B_0,C0, B_0);
+            CJMADD(cj,A0,B_1,C1, B_1);
 
            B_0 = blB[2];
            B_1 = blB[3];
-            MADD(cj,A0,B_0,C2, B_0);
-            MADD(cj,A0,B_1,C3, B_1);
+            CJMADD(cj,A0,B_0,C2, B_0);
+            CJMADD(cj,A0,B_1,C3, B_1);
 
            blB += 4;
          }
-          res[(j2+0)*resStride + i] += alpha*C0;
-          res[(j2+1)*resStride + i] += alpha*C1;
-          res[(j2+2)*resStride + i] += alpha*C2;
-          res[(j2+3)*resStride + i] += alpha*C3;
+          res(i, j2 + 0) += alpha * C0;
+          res(i, j2 + 1) += alpha * C1;
+          res(i, j2 + 2) += alpha * C2;
+          res(i, j2 + 3) += alpha * C3;
        }
      }
    }
@@ -1392,9 +1476,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
      {
        LhsScalar A0 = blA[k];
        RhsScalar B_0 = blB[k];
-        MADD(cj, A0, B_0, C0, B_0);
+        CJMADD(cj, A0, B_0, C0, B_0);
      }
-      res[(j2+0)*resStride + i] += alpha*C0;
+      res(i, j2) += alpha * C0;
    }
  }
 }
@@ -1417,15 +1501,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
 //
 //  32 33 34 35 ...
 //  36 37 38 39 ...
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
 {
-  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
@@ -1436,30 +1521,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride);
  Index count = 0;
-
+
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
-
+
  Index i=0;
-
+
  // Pack 3 packets
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;
-
+
      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
-        A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
-        B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
-        C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
+        C = lhs.loadPacket(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1473,12 +1557,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;
-
+
      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
-        A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
-        B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
@@ -1491,11 +1575,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;
-
+
      for(Index k=0; k<depth; k++)
      {
        Packet A;
-        A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
+        A = lhs.loadPacket(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
@@ -1508,11 +1592,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
    for(; i<peeled_mc0; i+=Pack2)
    {
      if(PanelMode) count += Pack2 * offset;
-
+
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(i+w, k));
-
+
      if(PanelMode) count += Pack2 * (stride-offset-depth);
    }
  }
@@ -1525,15 +1609,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
  }
 }
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
 {
-  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
@@ -1543,13 +1628,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride);
  Index count = 0;
-
+
 //   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
 //   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
 //   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
+
  int pack = Pack1;
  Index i = 0;
  while(pack>0)
@@ -1569,7 +1653,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
        for (Index m = 0; m < pack; m += PacketSize)
        {
          PacketBlock<Packet> kernel;
-          for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
+          for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
          ptranspose(kernel);
          for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
        }
@@ -1594,15 +1678,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
          for(;w<pack;++w) blockA[count++] = cj(lhs(i+w, k));
      }
-
+
      if(PanelMode) count += pack * (stride-offset-depth);
    }
-
+
    pack -= PacketSize;
    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
      pack = Pack2;
  }
-
+
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
@@ -1619,17 +1703,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
 //  4 5 6 7 16 17 18 19 25 28
 //  8 9 10 11 20 21 22 23 26 29
 //  .  .  .  .  .  .  .  .  .  .
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
 {
  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1685,27 +1770,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
 //     if(PanelMode) count += 8 * (stride-offset-depth);
 //   }
 // }
-
+
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
-      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
-      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
-      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
-      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
-
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
      Index k=0;
      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
      {
        for(; k<peeled_k; k+=PacketSize)
        {
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
-          kernel.packet[0] = ploadu<Packet>(&b0[k]);
-          kernel.packet[1] = ploadu<Packet>(&b1[k]);
-          kernel.packet[2] = ploadu<Packet>(&b2[k]);
-          kernel.packet[3] = ploadu<Packet>(&b3[k]);
+          kernel.packet[0] = dm0.loadPacket(k);
+          kernel.packet[1] = dm1.loadPacket(k);
+          kernel.packet[2] = dm2.loadPacket(k);
+          kernel.packet[3] = dm3.loadPacket(k);
          ptranspose(kernel);
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
@@ -1716,10 +1801,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
      }
      for(; k<depth; k++)
      {
-        blockB[count+0] = cj(b0[k]);
-        blockB[count+1] = cj(b1[k]);
-        blockB[count+2] = cj(b2[k]);
-        blockB[count+3] = cj(b3[k]);
+        blockB[count+0] = cj(dm0(k));
+        blockB[count+1] = cj(dm1(k));
+        blockB[count+2] = cj(dm2(k));
+        blockB[count+3] = cj(dm3(k));
        count += 4;
      }
      // skip what we have after
@@ -1731,10 +1816,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
-      blockB[count] = cj(b0[k]);
+      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
@@ -1742,17 +1827,18 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
 }
 
 // this version is optimized for row major matrices
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1762,7 +1848,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
-
+
 //   if(nr>=8)
 //   {
 //     for(Index j2=0; j2<packet_cols8; j2+=8)
@@ -1805,15 +1891,15 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
      for(Index k=0; k<depth; k++)
      {
        if (PacketSize==4) {
-          Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+          Packet A = rhs.loadPacket(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += PacketSize;
        } else {
-          const Scalar* b0 = &rhs[k*rhsStride + j2];
-          blockB[count+0] = cj(b0[0]);
-          blockB[count+1] = cj(b0[1]);
-          blockB[count+2] = cj(b0[2]);
-          blockB[count+3] = cj(b0[3]);
+          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+          blockB[count+0] = cj(dm0(0));
+          blockB[count+1] = cj(dm0(1));
+          blockB[count+2] = cj(dm0(2));
+          blockB[count+3] = cj(dm0(3));
          count += 4;
        }
      }
@@ -1825,10 +1911,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[j2];
    for(Index k=0; k<depth; k++)
    {
-      blockB[count] = cj(b0[k*rhsStride]);
+      blockB[count] = cj(rhs(k, j2));
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
@@ -1841,8 +1926,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
  * \sa setCpuCacheSizes */
 inline std::ptrdiff_t l1CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
 }
 
@@ -1850,8 +1935,8 @@ inline std::ptrdiff_t l1CacheSize()
  * \sa setCpuCacheSizes */
 inline std::ptrdiff_t l2CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
 }
 
@@ -1860,9 +1945,9 @@ inline std::ptrdiff_t l2CacheSize()
  * for the algorithms working per block.
 *
 * \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
 {
-  internal::manage_caching_sizes(SetAction, &l1, &l2);
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index b7e1867f0..fd9443cd2 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
 
 static void run(Index rows, Index cols, Index depth,
  const LhsScalar* _lhs, Index lhsStride,
  const RhsScalar* _rhs, Index rhsStride,
-  ResScalar* res, Index resStride,
+  ResScalar* _res, Index resStride,
  ResScalar alpha,
  level3_blocking<LhsScalar,RhsScalar>& blocking,
  GemmParallelInfo<Index>* info = 0)
 {
-  const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-  const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+  typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+  LhsMapper lhs(_lhs,lhsStride);
+  RhsMapper rhs(_rhs,rhsStride);
+  ResMapper res(_res, resStride);
 
  Index kc = blocking.kc();                   // cache block size along the K direction
  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
  Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
 #ifdef EIGEN_HAS_OPENMP
  if(info)
@@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
 
      // In order to reduce the chance that a thread has to wait for the other,
      // let's start by packing B'.
-      pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
+      pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
 
      // Pack A_k to A' in a parallel fashion:
      // each thread packs the sub block A_k,i to A'_i where i is the thread id.
@@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth,
      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
      while(info[tid].users!=0) {}
      info[tid].users += threads;
-
-      pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
+
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
 
      // Notify the other threads that the part A'_i is ready to go.
      info[tid].sync = k;
@@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
        // At this point we have to make sure that A'_i has been updated by the thread i,
        // we use testAndSetOrdered to mimic a volatile access.
        // However, no need to wait for the B' part which has been updated by the current thread!
-        if(shift>0)
-          while(info[i].sync!=k) {}
-        gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+        if (shift>0) {
+          while(info[i].sync!=k) {
+          }
+        }
+
+        gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
      }
 
      // Then keep going as usual with the remaining B'
@@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
        const Index actual_nc = (std::min)(j+nc,cols)-j;
 
        // pack B_k,j to B'
-        pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
+        pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
 
        // C_j += A' * B'
-        gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
+        gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
      }
 
      // Release all the sub blocks A'_i of A' for the current thread,
@@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth,
    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
 
    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    for(Index k2=0; k2<depth; k2+=kc)
+    for(Index i2=0; i2<rows; i2+=mc)
    {
-      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+      const Index actual_mc = (std::min)(i2+mc,rows)-i2;
 
-      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
-      // Note that this panel will be read as many times as the number of blocks in the rhs's
-      // horizontal panel which is, in practice, a very low number.
-      pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
-
-      // For each kc x nc block of the rhs's horizontal panel...
-      for(Index j2=0; j2<cols; j2+=nc)
+      for(Index k2=0; k2<depth; k2+=kc)
      {
-        const Index actual_nc = (std::min)(j2+nc,cols)-j2;
-
-        // We pack the rhs's block into a sequential chunk of memory (L2 caching)
-        // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
-        pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
-
-        // Everything is packed, we can now call the panel * block kernel:
-        gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
+        const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+        // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+        // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+        // Note that this panel will be read as many times as the number of blocks in the rhs's
+        // horizontal panel which is, in practice, a very low number.
+        pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+        // For each kc x nc block of the rhs's horizontal panel...
+        for(Index j2=0; j2<cols; j2+=nc)
+        {
+          const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+          // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+ pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); + + // Everything is packed, we can now call the panel * block kernel: + gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha); + } } } } @@ -287,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false) + gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/) { this->m_mc = ActualRows; this->m_nc = ActualCols; @@ -319,23 +331,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false) + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking) { this->m_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc); + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n); + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads); } - + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } @@ -445,8 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); - + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 7db3e3d38..e55994900 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -58,27 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) { - const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<LhsScalar,RhsScalar> Traits; + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction - computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc); + computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1); // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); - - gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; - gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb; for(Index k2=0; k2<depth; k2+=kc) @@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, { const Index actual_kc = (std::min)(k2+kc,depth)-k2; // note that the actual rhs is the transpose/adjoint of mat - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size); for(Index i2=0; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); // the selected actual_mc * size panel of res is split into three different parts: // 1 - before the diagonal => processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -129,13 +134,16 @@ struct tribb_kernel { typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits; typedef typename Traits::ResScalar ResScalar; - + enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper; + ResMapper res(_res, resStride); + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -146,7 +154,7 @@ struct tribb_kernel const RhsScalar* actual_b =
blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0); // selfadjoint micro block @@ -154,12 +162,12 @@ struct tribb_kernel Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1<actualBlockSize; ++j1) { - ResScalar* r = res + (j+j1)*resStride + i; + ResScalar* r = &res(i, j + j1); for(Index i1=UpLo==Lower ? j1 : 0; UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1) r[i1] += buffer(i1,j1); @@ -169,8 +177,8 @@ struct tribb_kernel if(UpLo==Lower) { Index i = j+actualBlockSize; - gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, - -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, + depth, actualBlockSize, alpha, -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 340c51394..7df6a6b1a 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,17 +48,17 @@ namespace internal { * // we currently fall back to the NoneAligned case * * The same reasoning applies for the transposed case. - * + * * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow * compared to unaligned loads on a 4 byte boundary.
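* To make the cases concrete (illustrative numbers, not taken from the source), using the alignment step computed below, (PacketSize - stride % PacketSize) & (PacketSize-1): with 4-float packets, a column stride of 8 floats gives a step of 0, hence AllAligned; a stride of 6 gives a step of 2 == PacketSize/2, hence EvenAligned; and a stride of 5 gives a step of 3, hence FirstAligned, where only a shifted starting row can be packet-aligned.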
* */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { -typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable @@ -78,17 +78,17 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { @@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ pstore(&res[j], \ padd(pload<ResPacket>(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \ + pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \ + pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; @@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co const Index ResPacketAlignedMask = ResPacketSize-1; // const Index PeelAlignedMask = ResPacketSize*peels-1; const Index size = rows; - + + const Index lhsStride = lhs.stride(); + // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. 
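These signatures are the heart of the refactor: every kernel that used to receive a raw pointer plus a stride (const LhsScalar* lhs, Index lhsStride) now receives a mapper object exposing stride(), firstAligned(), getSubMapper(), getVectorMapper() and alignment-tagged packet loads. Below is a minimal standalone sketch of that interface for a column-major layout -- a deliberate simplification for illustration, not Eigen's actual blas_data_mapper (the memcpy stands in for the pload/ploadu dispatch the real class performs on the Align tag):

#include <cstring>

enum AlignmentTag { Unaligned, Aligned }; // mirrors the diff's load<Packet, Alignment> tags

template <typename Scalar, typename Index>
class VectorMapper {
  const Scalar* m_data;
public:
  explicit VectorMapper(const Scalar* data) : m_data(data) {}
  Scalar operator()(Index i) const { return m_data[i]; } // lhs0(j) in the kernels
  template <typename Packet, int Align>
  Packet load(Index i) const {
    // stand-in for the aligned/unaligned packet-load dispatch on Align
    Packet p;
    std::memcpy(&p, m_data + i, sizeof(Packet));
    return p;
  }
};

template <typename Scalar, typename Index>
class ColMajorMapper {
  const Scalar* m_data;
  Index m_stride;
public:
  ColMajorMapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
  Index stride() const { return m_stride; }
  Scalar operator()(Index i, Index j) const { return m_data[i + j * m_stride]; }
  // sub-block at (i,j): replaces the old "&m(i,j) plus separate stride" convention
  ColMajorMapper getSubMapper(Index i, Index j) const {
    return ColMajorMapper(m_data + i + j * m_stride, m_stride);
  }
  // one column viewed as a vector, as used for lhs0..lhs3 in the GEMV kernels
  VectorMapper<Scalar, Index> getVectorMapper(Index i, Index j) const {
    return VectorMapper<Scalar, Index>(m_data + i + j * m_stride);
  }
};

The payoff, visible in every hunk below, is that a sub-block travels as m.getSubMapper(i,j) instead of a raw pointer plus stride, and a packet load such as lhs0.template load<LhsPacket, Aligned>(j) carries its alignment assumption in a template argument rather than in a pasted function name.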
Index alignedStart = internal::first_aligned(res,size); @@ -131,15 +135,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,size); + const Index lhsAlignmentOffset = lhs.firstAligned(size); // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) + if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; } else if(LhsPacketSize > 4) { @@ -149,7 +154,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); while (skipColumns<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) @@ -166,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // note that the skiped columns are processed later. } - eigen_internal_assert( (alignmentPattern==NoneAligned) + /* eigen_internal_assert( (alignmentPattern==NoneAligned) || (skipColumns + columnsAtOnce >= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -178,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1?3:1); + const Index offset3 = (FirstAligned && alignmentStep==1?1:3); Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)), + ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)), + ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0)); // this helps a lot generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -199,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // process initial unaligned coeffs for (Index j=0; j<alignedStart; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); - 
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } if (alignedSize>alignedStart) @@ -211,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -225,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; ResPacket T0, T1; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*ResPacketSize) { - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload<LhsPacket>(&lhs0[j]); - A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load<LhsPacket, Aligned>(j); + A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -254,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co } } for (; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } } @@ -268,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co /* process remaining coeffs (or all if there is no explicit vectorization) */ for (Index j=alignedSize; j<size; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] 
= cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); - res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } } @@ -282,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { for (Index k=start; k<end; ++k) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j<alignedStart; ++j) - res[j] += cj.pmul(lhs0[j], pfirst(ptmp0)); + res[j] += cj.pmul(lhs0(j), pfirst(ptmp0)); // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i]))); else for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i<size; ++i) - res[i] += cj.pmul(lhs0[i], pfirst(ptmp0)); + res[i] += cj.pmul(lhs0(i), pfirst(ptmp0)); } if (skipColumns) { @@ -326,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co * - alpha is always a complex (or converted to a complex) * - no vectorization */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; @@ -346,70 +351,75 @@ typedef typename packet_traits<ResScalar>::type _ResPacket; typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool 
ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); - + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload<RhsPacket>(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \ + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); } conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || + (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || + (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; } else if(LhsPacketSize > 4) { @@ -418,7 +428,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); while (skipRows<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) @@ -434,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co skipRows = (std::min)(skipRows,Index(rows)); // note that the skiped columns are processed later. } - eigen_internal_assert( alignmentPattern==NoneAligned + /* eigen_internal_assert( alignmentPattern==NoneAligned || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -447,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1?3:1); + const Index offset3 = (FirstAligned && alignmentStep==1?1:3); Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) @@ -457,8 +467,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0), + lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0); if (Vectorizable) { @@ -470,9 +480,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop get vectorized by the compiler ! 
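Both the column-major and the row-major kernels classify the lhs into one of four alignment patterns from the relation between the stride and the packet size. The helper below restates the alignmentStep/alignmentPattern arithmetic of these hunks in standalone form; the enum values match the kernels' names, while the function name and signature are illustrative:

#include <cstddef>

enum Pattern { AllAligned, EvenAligned, FirstAligned, NoneAligned };

// Column j of a column-major lhs starts at offset j*stride. If the stride is a
// multiple of the packet size, every column shares the same alignment; if it is
// off by half a packet, alignment alternates between columns; otherwise only a
// shifted starting row can be packet-aligned (the FirstAligned case).
inline Pattern classify_alignment(std::ptrdiff_t stride, std::ptrdiff_t packetSize) {
  std::ptrdiff_t step = packetSize > 1
      ? (packetSize - stride % packetSize) & (packetSize - 1)
      : 0;
  if (step == 0)              return AllAligned;
  if (step == packetSize / 2) return EvenAligned;
  return FirstAligned; // may later be demoted to NoneAligned, see below
}

Note the behavioral fix these hunks also carry: when firstAligned() reports an unusable offset, the pattern is now explicitly demoted with alignmentPattern = NoneAligned, where the old code only zeroed alignedStart and alignedSize.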
for (Index j=0; j<alignedStart; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } if (alignedSize>alignedStart) @@ -481,11 +491,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -499,39 +509,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co * than basic unaligned loads. */ LhsPacket A01, A02, A03, A11, A12, A13; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*RhsPacketSize) { - RhsPacket b = pload<RhsPacket>(&rhs[j]); - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload<RhsPacket>(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } tmp0 += predux(ptmp0); @@ -545,9 +555,9 @@ EIGEN_DONT_INLINE void 
general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } res[i*resIncr] += alpha*tmp0; res[(i+offset1)*resIncr] += alpha*tmp1; @@ -564,28 +574,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0); ResPacket ptmp0 = pset1<ResPacket>(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); else for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); res[i*resIncr] += alpha*tmp0; } if (skipRows) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 126cfbbff..2b90abf8f 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 4e507b6cf..4b6316d63 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -324,20 +324,26 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha) { Index size = rows; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? 
ColMajor : RowMajor> LhsTransposeMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); Scalar* blockB = allocatedBlockB; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; for(Index k2=0; k2<size; k2+=kc) { @@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t { const Index actual_mc = (std::min)(i2+mc,k2)-i2; // transposed packed copy - pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); + pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } // the block diagonal { @@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // symmetric packed copy pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } for(Index i2=k2+kc; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, k2), 
actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -414,26 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha) { Index size = cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; - Index kc = size; // cache block size along the K direction + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); Scalar* blockB = allocatedBlockB; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=0; k2<size; k2+=kc) @@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f for(Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,rows)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index c2d0817ea..60c99dcd2 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros @@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index depth = IsLower ? 
diagSize : _depth; Index cols = _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? k2>0 : k2<depth; @@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, k2 = k2+actual_kc-kc; } - pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is zero => skip it @@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i) triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k); } - pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); + pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth); - gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, + gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB, + actualPanelWidth, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); // GEBP with remaining micro panel @@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, { Index startTarget = IsLower ? 
actual_k2+k1+actualPanelWidth : actual_k2; - pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, + gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB, + lengthTarget, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for(Index i2=start; i2<end; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,end)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros @@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index depth = IsLower ? _depth : diagSize; Index cols = IsLower ? diagSize : _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? 
k2<depth : k2>0; @@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Scalar* geb = blockB+ts*ts; geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar)); - pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); + pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) @@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), rhsStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, } pack_rhs_panel(blockB+j2*actual_kc, - triangularBuffer.data(), triangularBuffer.outerStride(), + RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth, actual_kc, j2); } @@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, for (Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(mc,rows-i2); - pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); // triangular kernel if(ts>0) @@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; Index blockOffset = IsLower ? j2 : 0; - gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, actual_k2 + j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, alpha, @@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, blockOffset, blockOffset);// offsets } } - gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2), blockA, geb, actual_mc, actual_kc, rs, alpha, -1, -1, 0, 0); @@ -402,7 +413,7 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? 
rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 92d64e384..4d88a710b 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H #define EIGEN_TRIANGULARMATRIXVECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); - + typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + for (Index pi=0; pi<size; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, size-pi); @@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + for (Index pi=0; pi<diagSize; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi); @@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -231,7 +237,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -251,7 +257,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> else MappedDest(actualDestPtr, dest.size()) = dest; } - + internal::triangular_matrix_vector_product <Index,Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, @@ -311,7 +317,7 @@ template<int Mode> struct trmv_selector<Mode,RowMajor> #endif Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; } - + internal::triangular_matrix_vector_product <Index,Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 1f7afd187..f5de67c59 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju level3_blocking<Scalar,Scalar>& blocking) { Index cols = otherSize; - const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride); - blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride); + + typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper; + typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride); typedef gebp_traits<Scalar,Scalar> Traits; + enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower @@ -71,14 +75,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); conj_if<Conjugate> conj; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs; + gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; + gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs; // the goal here is to 
subdivide the Rhs panels such that we keep some cache // coherence when accessing the rhs elements - std::ptrdiff_t l1, l2; - manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr); @@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju Index blockBOffset = IsLower ? k1 : lengthTarget; // update the respective rows of B from other - pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset); + pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset); // GEBP if (lengthTarget>0) { Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; - pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju const Index actual_mc = (std::min)(mc,end-i2); if (actual_mc>0) { - pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj level3_blocking<Scalar,Scalar>& blocking) { Index rows = otherSize; - const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride); - blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride); + + typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper; + LhsMapper lhs(_other, otherStride); + RhsMapper rhs(_tri, triStride); typedef gebp_traits<Scalar,Scalar> Traits; enum { @@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); conj_if<Conjugate> conj; - gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel; + gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
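manage_caching_sizes now reports three cache levels, and the solver sizes its rhs panels from L2 via the subcols expression above. Restated as a standalone function -- only the formula is taken from the hunk; the parameter names are illustrative and nr would be the kernel's register-block width (e.g. 4):

#include <algorithm>
#include <cstddef>

typedef std::ptrdiff_t Index;

// cols/otherStride describe the panel of B being solved; l2 is the L2 size in
// bytes. Targets roughly a quarter of L2 for the active slice of B, since each
// column touches otherStride*scalarSize bytes.
Index rhs_panel_cols(Index cols, Index otherStride, std::ptrdiff_t l2,
                     Index scalarSize, Index nr) {
  Index subcols = cols > 0 ? l2 / (4 * scalarSize * otherStride) : 0;
  // round down to a multiple of nr, but never below one register block
  return (std::max)((subcols / nr) * nr, nr);
}

The same three-level cache query is what the Parallelizer.h hunk above threads through initParallel().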
for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2<size; @@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc; Scalar* geb = blockB+actual_kc*actual_kc; - if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal) @@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj if (panelLength>0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj // GEBP if(panelLength>0) { - gebp_kernel(&lhs(i2,absolute_j2), otherStride, + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, Scalar(-1), @@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj } // pack the just computed part of lhs to A - pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride, + pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride), actualPanelWidth, actual_mc, actual_kc, j2); } } if (rs>0) - gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0); } diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h index ce4d10088..b994759b2 100644 --- a/Eigen/src/Core/products/TriangularSolverVector.h +++ b/Eigen/src/Core/products/TriangularSolverVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H #define EIGEN_TRIANGULAR_SOLVER_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co >::run(size, _lhs, lhsStride, rhs); } }; - + // forward and backward substitution, row-major, rhs is a vector template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate> struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor> @@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con { typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; + typename internal::conditional< Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, @@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con Index startRow = IsLower ? pi : pi-actualPanelWidth; Index startCol = IsLower ? 
0 : pi; - general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run( actualPanelWidth, r, - &lhs.coeffRef(startRow,startCol), lhsStride, - rhs + startCol, 1, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1)); } @@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con Index s = IsLower ? pi : i+1; if (k>0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum(); - + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } @@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con { typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; typename internal::conditional<Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, const LhsMap& @@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con // let's directly call the low level product function because: // 1 - it is faster to compile // 2 - it is slightly faster at runtime - general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run( r, actualPanelWidth, - &lhs.coeffRef(endBlock,startBlock), lhsStride, - rhs+startBlock, 1, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), rhs+endBlock, 1, RhsScalar(-1)); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 9f9115c2a..3ec55fad2 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -18,13 +18,13 @@ namespace Eigen { namespace internal { // forward declarations -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> struct gebp_kernel; -template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> +template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> struct gemm_pack_rhs; -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> struct gemm_pack_lhs; template< @@ -34,7 +34,9 @@ template< int ResStorageOrder> struct general_matrix_matrix_product; -template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized> +template<typename Index, + typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized> struct general_matrix_vector_product; @@ -117,32 
+119,133 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); } }; + +template<typename Scalar, typename Index> +class BlasVectorMapper { + public: + EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template <typename Packet, int AlignmentType> + EIGEN_ALWAYS_INLINE Packet load(Index i) const { + return ploadt<Packet, AlignmentType>(m_data + i); + } + + template <typename Packet> + bool aligned(Index i) const { + return (size_t(m_data+i)%sizeof(Packet))==0; + } + + protected: + Scalar* m_data; +}; + +template<typename Scalar, typename Index, int AlignmentType> +class BlasLinearMapper { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt<Packet, AlignmentType>(m_data + i); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt<HalfPacket, AlignmentType>(m_data + i); + } + + EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + pstoret<Scalar, Packet, AlignmentType>(m_data + i, p); + } + + protected: + Scalar *m_data; +}; + // Lightweight helper class to access matrix coefficients. -// Yes, this is somehow redundant with Map<>, but this version is much much lighter, -// and so I hope better compilation performance (time and code quality). -template<typename Scalar, typename Index, int StorageOrder> -class blas_data_mapper -{ +template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned> +class blas_data_mapper { public: - blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j) - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper; + typedef BlasVectorMapper<Scalar, Index> VectorMapper; + + EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + + EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> + getSubMapper(Index i, Index j) const { + return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride); + } + + EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt<Packet, AlignmentType>(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt<HalfPacket, AlignmentType>(&operator()(i, j)); + } + + template<typename SubPacket> + EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const { + pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride); + } + + template<typename SubPacket> + EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride); + } + + const Index stride() const { return m_stride; } + + Index firstAligned(Index size) const { + if (size_t(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_aligned(m_data, size); + } + protected: - Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; }; // lightweight helper class to access matrix coefficients (const version) template<typename Scalar, typename Index, int StorageOrder> -class const_blas_data_mapper -{ +class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> { public: - const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - protected: - const Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {} + + EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const { + return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride); + } }; diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5c7d70af6..d1855b50b 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100; * \sa \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; +/** \ingroup flags + * + * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format, + * that is, the expression provides: + * \code + inline const Scalar* valuePtr() const; + inline const Index* innerIndexPtr() const; + inline const Index* outerIndexPtr() const; + inline const Index* innerNonZeroPtr() const; + \endcode + */ +const unsigned int CompressedAccessBit = 0x400; + // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit @@ -449,6 +462,9 @@ enum Action {GetAction, SetAction}; /** The type used to identify a dense storage. */ struct Dense {}; +/** The type used to identify a general sparse storage. */ +struct Sparse {}; + /** The type used to identify a permutation storage. 
*/ struct PermutationStorage {}; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 11b7e2887..07923848a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -73,7 +73,7 @@ /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) - #define EIGEN_COMP_MSVC_STRICT 1 + #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 #endif @@ -160,6 +160,12 @@ #define EIGEN_ARCH_ARM64 0 #endif +#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 + #define EIGEN_ARCH_ARM_OR_ARM64 1 +#else + #define EIGEN_ARCH_ARM_OR_ARM64 0 +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -376,10 +382,21 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support variadic templates? +#if __cplusplus > 199711L +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#endif + +// Does the compiler support const expressions? +#if (defined(__cplusplus) && __cplusplus >= 201402L) || \ + EIGEN_GNUC_AT_LEAST(4,9) +#define EIGEN_HAS_CONSTEXPR 1 +#endif + /** Allows disabling some optimizations which might affect the accuracy of the result. * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them. * They currently include: - * - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled. + * - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled. */ #ifndef EIGEN_FAST_MATH #define EIGEN_FAST_MATH 1 @@ -526,7 +543,7 @@ namespace Eigen { #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) - #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 + #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) @@ -540,7 +557,9 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM +#if (defined __CUDACC__) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif EIGEN_COMP_MSVC #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) @@ -592,7 +611,7 @@ namespace Eigen { // just an empty macro! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT +#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a54ccaedc..16f8cc1b0 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = *** Implementation of generic aligned realloc (when no realloc can be used)*** *****************************************************************************/ -void* aligned_malloc(std::size_t size); -void aligned_free(void *ptr); +EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); +EIGEN_DEVICE_FUNC void aligned_free(void *ptr); /** \internal * \brief Reallocates aligned memory. @@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) *****************************************************************************/ #ifdef EIGEN_NO_MALLOC -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)"); } #elif defined EIGEN_RUNTIME_NO_MALLOC -inline bool is_malloc_allowed_impl(bool update, bool new_value = false) +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) { static bool value = true; if (update == 1) value = new_value; return value; } -inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } -inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } +EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } #else -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ -inline void* aligned_malloc(size_t size) +EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size) } /** \internal Frees memory allocated with aligned_malloc. */ -inline void aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if !EIGEN_ALIGN std::free(ptr); @@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. 
*/ -template<bool Align> inline void* conditional_aligned_malloc(size_t size) +template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { return aligned_malloc(size); } -template<> inline void* conditional_aligned_malloc<false>(size_t size) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size) { check_that_malloc_is_allowed(); @@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size) } /** \internal Frees memory allocated with conditional_aligned_malloc */ -template<bool Align> inline void conditional_aligned_free(void *ptr) +template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { aligned_free(ptr); } -template<> inline void conditional_aligned_free<false>(void *ptr) +template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr) { std::free(ptr); } @@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new /** \internal Destructs the elements of an array. * The \a size parameter tells on how many objects to call the destructor of T. */ -template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) { // always destruct an array starting from the end. if(ptr) @@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size) /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) { size_t i; EIGEN_TRY @@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size) *****************************************************************************/ template<typename T> -EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) { if(size > size_t(-1) / sizeof(T)) throw_std_bad_alloc(); @@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * The default constructor of T is called. */ -template<typename T> inline T* aligned_new(size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) { check_size_for_overflow<T>(size); T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size)); @@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size) } } -template<typename T, bool Align> inline T* conditional_aligned_new(size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) { check_size_for_overflow<T>(size); T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size)); @@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size) /** \internal Deletes objects constructed with aligned_new * The \a size parameter tells on how many objects to call the destructor of T. 
*/ -template<typename T> inline void aligned_delete(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) { destruct_elements_of_array<T>(ptr, size); aligned_free(ptr); @@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size) /** \internal Deletes objects constructed with conditional_aligned_new * The \a size parameter tells on how many objects to call the destructor of T. */ -template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) { destruct_elements_of_array<T>(ptr, size); conditional_aligned_free<Align>(ptr); } -template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) { check_size_for_overflow<T>(new_size); check_size_for_overflow<T>(old_size); @@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt } -template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) { if(size==0) return 0; // short-cut. Also fixes Bug 884 @@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto( return result; } -template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) { if(NumTraits<T>::RequireInitialization) destruct_elements_of_array<T>(ptr, size); @@ -523,9 +523,8 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T * template<typename Scalar, typename Index> inline Index first_aligned(const Scalar* array, Index size) { - enum { PacketSize = packet_traits<Scalar>::size, - PacketAlignedMask = PacketSize-1 - }; + static const Index PacketSize = packet_traits<Scalar>::size; + static const Index PacketAlignedMask = PacketSize-1; if(PacketSize==1) { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 299e5cbc2..528ebe297 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -463,6 +463,21 @@ template<typename XprType, typename CastType> struct cast_return_type const XprType&,CastType>::type type; }; +template <typename A, typename B> struct promote_storage_type; + +template <typename A> struct promote_storage_type<A,A> +{ + typedef A ret; +}; +template <typename A> struct promote_storage_type<A, const A> +{ + typedef A ret; +}; +template <typename A> struct promote_storage_type<const A, A> +{ + typedef A ret; +}; + /** \internal Specify the "storage kind" of applying a coefficient-wise * binary operation between two expressions of kinds A and B respectively. 
* The template parameter Functor permits to specialize the resulting storage kind wrt to diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ae10ff91e..128ef9028 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -313,7 +313,7 @@ namespace Eigen { using std::abs; using std::sqrt; const Index dim=m_S.cols(); - if (abs(m_S.coeff(i+1,i)==Scalar(0))) + if (abs(m_S.coeff(i+1,i))==Scalar(0)) return; Index z = findSmallDiagEntry(i,i+1); if (z==i-1) diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 10f5fb174..51e61ba38 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -234,7 +234,7 @@ template<typename _MatrixType> class RealSchur typedef Matrix<Scalar,3,1> Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu, const Scalar& norm); + Index findSmallSubdiagEntry(Index iu); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -286,7 +286,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu, norm); + Index il = findSmallSubdiagEntry(iu); // Check for convergence if (il == iu) // One root found @@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template<typename MatrixType> -inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm) +inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (s == 0.0) - s = norm; - if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s) + if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s) break; res--; } @@ -457,9 +455,7 @@ inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const V const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2))); const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1))); if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs) - { break; - } } } diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 276e94c58..54842b5ff 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -78,6 +78,8 @@ struct traits<Transform<_Scalar,_Dim,_Mode,_Options> > }; }; +template<int Mode> struct transform_make_affine; + } // end namespace internal /** \geometry_module \ingroup Geometry_Module @@ -247,8 +249,7 @@ public: inline Transform() { check_template_params(); - if (int(Mode)==Affine) - makeAffine(); + internal::transform_make_affine<(int(Mode)==Affine) ? 
Affine : AffineCompact>::run(m_matrix); } inline Transform(const Transform& other) @@ -611,11 +612,7 @@ public: */ void makeAffine() { - if(int(Mode)!=int(AffineCompact)) - { - matrix().template block<1,Dim>(Dim,0).setZero(); - matrix().coeffRef(Dim,Dim) = Scalar(1); - } + internal::transform_make_affine<int(Mode)>::run(m_matrix); } /** \internal @@ -1103,6 +1100,24 @@ Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBas namespace internal { +template<int Mode> +struct transform_make_affine +{ + template<typename MatrixType> + static void run(MatrixType &mat) + { + static const int Dim = MatrixType::ColsAtCompileTime-1; + mat.template block<1,Dim>(Dim,0).setZero(); + mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1); + } +}; + +template<> +struct transform_make_affine<AffineCompact> +{ + template<typename MatrixType> static void run(MatrixType &) { } +}; + // selector needed to avoid taking the inverse of a 3x4 matrix template<typename TransformType, int Mode=TransformType::Mode> struct projective_transform_inverse diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 5f55efbe9..a715c7285 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -139,11 +139,7 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> > * \include BiCGSTAB_simple.cpp * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * \include BiCGSTAB_step_by_step.cpp - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ @@ -192,7 +188,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) + if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 2ccbf4fe6..d8bc13f5f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -113,8 +113,8 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> > * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper, in which case the full matrix entries will be considered. Default is Lower. * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() @@ -137,20 +137,7 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. 
- * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * cg.setMaxIterations(1); - * int i = 0; - * do { - * x = cg.solveWithGuess(b,x); - * std::cout << i << " : " << cg.error() << std::endl; - * ++i; - * } while (cg.info()!=Success && i<100); - * \endcode - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ @@ -196,6 +183,10 @@ public: template<typename Rhs,typename Dest> void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional<UpLo==(Lower|Upper), + Ref<const MatrixType>&, + SparseSelfAdjointView<const Ref<const MatrixType>, UpLo> + >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -205,8 +196,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj, - Base::m_preconditioner, m_iterations, m_error); + internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index cc99e00f9..2afd80f4d 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -37,7 +37,7 @@ public: /** Default constructor. */ IterativeSolverBase() - : mp_matrix(0) + : m_dummy(0,0), mp_matrix(m_dummy) { init(); } @@ -52,10 +52,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit IterativeSolverBase(const MatrixType& A) + template<typename SparseMatrixDerived> + explicit IterativeSolverBase(const SparseMatrixBase<SparseMatrixDerived>& A) { init(); - compute(A); + compute(A.derived()); } ~IterativeSolverBase() {} @@ -65,9 +66,11 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - Derived& analyzePattern(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& analyzePattern(const SparseMatrixBase<SparseMatrixDerived>& A) { - m_preconditioner.analyzePattern(A); + grab(A); + m_preconditioner.analyzePattern(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_info = Success; @@ -83,11 +86,12 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& factorize(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& factorize(const SparseMatrixBase<SparseMatrixDerived>& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); - mp_matrix = &A; - m_preconditioner.factorize(A); + grab(A); + m_preconditioner.factorize(mp_matrix); m_factorizationIsOk = true; m_info = Success; return derived(); @@ -103,10 +107,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. 
*/ - Derived& compute(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& compute(const SparseMatrixBase<SparseMatrixDerived>& A) { - mp_matrix = &A; - m_preconditioner.compute(A); + grab(A); + m_preconditioner.compute(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_factorizationIsOk = true; @@ -115,9 +120,10 @@ public: } /** \internal */ - StorageIndex rows() const { return mp_matrix ? mp_matrix->rows() : 0; } + Index rows() const { return mp_matrix.rows(); } + /** \internal */ - StorageIndex cols() const { return mp_matrix ? mp_matrix->cols() : 0; } + Index cols() const { return mp_matrix.cols(); } /** \returns the tolerance threshold used by the stopping criteria */ RealScalar tolerance() const { return m_tolerance; } @@ -135,13 +141,18 @@ public: /** \returns a read-only reference to the preconditioner. */ const Preconditioner& preconditioner() const { return m_preconditioner; } - /** \returns the max number of iterations */ + /** \returns the max number of iterations. + * It is either the value set by setMaxIterations() or, by default, + * twice the number of columns of the matrix. + */ int maxIterations() const { - return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations; + return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations; } - /** Sets the max number of iterations */ + /** Sets the max number of iterations. + * Default is twice the number of columns of the matrix. + */ Derived& setMaxIterations(int maxIters) { m_maxIterations = maxIters; @@ -210,7 +221,16 @@ protected: m_maxIterations = -1; m_tolerance = NumTraits<Scalar>::epsilon(); } - const MatrixType* mp_matrix; + + template<typename SparseMatrixDerived> + void grab(const SparseMatrixBase<SparseMatrixDerived> &A) + { + mp_matrix.~Ref<const MatrixType>(); + ::new (&mp_matrix) Ref<const MatrixType>(A); + } + + MatrixType m_dummy; + Ref<const MatrixType> mp_matrix; Preconditioner m_preconditioner; int m_maxIterations; diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 27acf4128..e20c9ba2a 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -43,7 +43,7 @@ namespace internal { typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; template<typename _MatrixType, int Options> @@ -52,7 +52,7 @@ namespace internal { typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; template<typename _MatrixType, int Options> @@ -61,7 +61,7 @@ namespace internal { typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) @@ -138,7 +138,6 @@ class PastixBase : public SparseSolverBase<Derived> typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::Index Index; typedef typename 
MatrixType::StorageIndex StorageIndex; typedef Matrix<Scalar,Dynamic,1> Vector; typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix; @@ -163,7 +162,7 @@ class PastixBase : public SparseSolverBase<Derived> * The statistics related to the different phases of factorization and solve are saved here as well * \sa analyzePattern() factorize() */ - Array<Index,IPARM_SIZE,1>& iparm() + Array<StorageIndex,IPARM_SIZE,1>& iparm() { return m_iparm; } @@ -243,8 +242,8 @@ class PastixBase : public SparseSolverBase<Derived> mutable int m_comm; // The MPI communicator identifier mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters - mutable Matrix<Index,Dynamic,1> m_perm; // Permutation vector - mutable Matrix<Index,Dynamic,1> m_invp; // Inverse permutation vector + mutable Matrix<StorageIndex,Dynamic,1> m_perm; // Permutation vector + mutable Matrix<StorageIndex,Dynamic,1> m_invp; // Inverse permutation vector mutable int m_size; // Size of the matrix }; @@ -410,7 +409,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> > typedef _MatrixType MatrixType; typedef PastixBase<PastixLU<MatrixType> > Base; typedef typename Base::ColSpMatrix ColSpMatrix; - typedef typename MatrixType::Index Index; + typedef typename MatrixType::StorageIndex StorageIndex; public: PastixLU() : Base() diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 6fade3755..0dee139ce 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -477,20 +477,10 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const // we store that back into our table: it can't hurt to correct our table. m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm; - // if the current biggest column is smaller than epsilon times the initial biggest column, - // terminate to avoid generating nan/inf values. - // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so) - // repetitions of the unit test, with the result of solve() filled with large values of the order - // of 1/(size*epsilon). - if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) - { + // Track the number of meaningful pivots but do not stop the decomposition to make + // sure that the initial matrix is properly reproduced. See bug 941. + if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) m_nonzero_pivots = k; - m_hCoeffs.tail(size-k).setZero(); - m_qr.bottomRightCorner(rows-k,cols-k) - .template triangularView<StrictlyLower>() - .setZero(); - break; - } // apply the transposition to the columns m_colsTranspositions.coeffRef(k) = biggest_col_index; @@ -519,7 +509,7 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const } m_colsPermutation.setIdentity(PermIndexType(cols)); - for(PermIndexType k = 0; k < m_nonzero_pivots; ++k) + for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k) m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k))); m_det_pq = (number_of_transpositions%2) ? -1 : 1; @@ -575,13 +565,15 @@ struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, interna } // end namespace internal -/** \returns the matrix Q as a sequence of householder transformations */ +/** \returns the matrix Q as a sequence of householder transformations. 
+ * You can extract only the meaningful part by using: + * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode */ template<typename MatrixType> typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType> ::householderQ() const { eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots); + return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()); } #ifndef __CUDACC__ diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 5fd18b787..58efa6024 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > if(m_isInitialized) SPQR_free(); MatrixType mat(matrix); + + /* Compute the default threshold as in MatLab, see: + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 + */ + RealScalar pivotThreshold = m_tolerance; + if(m_useDefaultThreshold) + { + RealScalar max2Norm = 0.0; + for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm()); + if(max2Norm==RealScalar(0)) + max2Norm = RealScalar(1); + pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon(); + } + cholmod_sparse A; A = viewAsCholmod(mat); Index col = matrix.cols(); - m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A, + m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); if (!m_cR) @@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > /** * Get the number of rows of the input matrix and the Q matrix */ - inline Index rows() const {return m_H->nrow; } + inline Index rows() const {return m_cR->nrow; } /** * Get the number of columns of the input matrix. 
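To make the default-threshold computation in compute() above concrete: it scales machine epsilon by the problem dimensions and by the largest column 2-norm of the input. A stand-alone illustration on a small dense matrix (the random input is invented for the example; SPQR evaluates the same formula on its sparse input):

#include <Eigen/Dense>
#include <iostream>
#include <limits>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 40);
  // Largest column 2-norm, guarded against an all-zero matrix as in compute().
  double max2Norm = A.colwise().norm().maxCoeff();
  if (max2Norm == 0.0) max2Norm = 1.0;
  // Davis' default rank-detection tolerance: 20 * (m + n) * max2Norm * eps.
  double tol = 20 * (A.rows() + A.cols()) * max2Norm
             * std::numeric_limits<double>::epsilon();
  std::cout << "default SPQR pivot threshold: " << tol << "\n";
  return 0;
}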
@@ -130,16 +145,25 @@ { eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()"); eigen_assert(b.cols()==1 && "This method is for vectors only"); - + // Compute Q^T * b - typename Dest::PlainObject y; + typename Dest::PlainObject y, y2; y = matrixQ().transpose() * b; - // Solves with the triangular matrix R + + // Solves with the triangular matrix R Index rk = this->rank(); - y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk)); - y.bottomRows(cols()-rk).setZero(); + y2 = y; + y.resize((std::max)(cols(),Index(y.rows())),y.cols()); + y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk)); + // Apply the column permutation - dest.topRows(cols()) = colsPermutation() * y.topRows(cols()); + // colsPermutation() performs a copy of the permutation, + // so let's apply it manually: + for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i); + for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero(); + +// y.bottomRows(y.rows()-rk).setZero(); +// dest = colsPermutation() * y.topRows(cols()); m_info = Success; } @@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > /// Set the fill-reducing ordering method to be used void setSPQROrdering(int ord) { m_ordering = ord;} /// Set the tolerance tol to treat columns with 2-norm <= tol as zero - void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; } + void setPivotThreshold(const RealScalar& tol) + { + m_useDefaultThreshold = false; + m_tolerance = tol; + } /** \returns a pointer to the SPQR workspace */ cholmod_common *cholmodCommon() const { return &m_cc; } @@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > mutable cholmod_dense *m_HTau; // The Householder coefficients mutable StorageIndex m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters + bool m_useDefaultThreshold; // Use default threshold template<typename ,typename > friend struct SPQR_QProduct; }; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 0f7e5b8fe..444187ae7 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -628,6 +628,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols; internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows; + MatrixType m_scaledMatrix; }; template<typename MatrixType, int QRPreconditioner> @@ -674,8 +675,9 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u : 0); m_workMatrix.resize(m_diagSize, m_diagSize); - if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); - if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); + if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols!=m_rows) m_scaledMatrix.resize(rows,cols); } template<typename MatrixType, int QRPreconditioner> @@ -698,7 +700,13 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig /*** step 1. 
The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ - if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale)) + if(m_rows!=m_cols) + { + m_scaledMatrix = matrix / scale; + m_qr_precond_morecols.run(*this, m_scaledMatrix); + m_qr_precond_morerows.run(*this, m_scaledMatrix); + } + else { m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale; if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows); diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index b148d6b1f..c47077e50 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -17,6 +17,27 @@ enum SimplicialCholeskyMode { SimplicialCholeskyLDLT }; +namespace internal { + template<typename CholMatrixType, typename InputMatrixType> + struct simplicial_cholesky_grab_input { + typedef CholMatrixType const * ConstCholMatrixPtr; + static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp) + { + tmp = input; + pmat = &tmp; + } + }; + + template<typename MatrixType> + struct simplicial_cholesky_grab_input<MatrixType,MatrixType> { + typedef MatrixType const * ConstMatrixPtr; + static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/) + { + pmat = &input; + } + }; +} // end namespace internal + /** \ingroup SparseCholesky_Module * \brief A direct sparse Cholesky factorization * @@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType; + typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix<Scalar,Dynamic,1> VectorType; public: @@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(matrix.rows()==matrix.cols()); Index size = matrix.cols(); - CholMatrixType ap(size,size); - ordering(matrix, ap); - analyzePattern_preordered(ap, DoLDLT); - factorize_preordered<DoLDLT>(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(matrix, pmat, tmp); + analyzePattern_preordered(*pmat, DoLDLT); + factorize_preordered<DoLDLT>(*pmat); } template<bool DoLDLT> @@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); - factorize_preordered<DoLDLT>(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + + if(m_P.size()==0 && (UpLo&Upper)==Upper) + { + // If there is no ordering, try to directly use the input matrix without any copy + internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp); + } + else + { + tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); + pmat = &tmp; + } + + factorize_preordered<DoLDLT>(*pmat); } template<bool DoLDLT> @@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ordering(a, ap); - analyzePattern_preordered(ap,doLDLT); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(a, pmat, tmp); + analyzePattern_preordered(*pmat,doLDLT); } void analyzePattern_preordered(const CholMatrixType& a, 
bool doLDLT); - void ordering(const MatrixType& a, CholMatrixType& ap); + void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap); /** keeps off-diagonal entries; drops diagonal entries */ struct keep_diag { @@ -603,26 +639,41 @@ public: }; template<typename Derived> -void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap) +void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap) { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); - // Note that amd compute the inverse permutation + pmat = &ap; + // Note that ordering methods compute the inverse permutation + if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value) { - CholMatrixType C; - C = a.template selfadjointView<UpLo>(); + { + CholMatrixType C; + C = a.template selfadjointView<UpLo>(); + + OrderingType ordering; + ordering(C,m_Pinv); + } + + if(m_Pinv.size()>0) m_P = m_Pinv.inverse(); + else m_P.resize(0); - OrderingType ordering; - ordering(C,m_Pinv); + ap.resize(size,size); + ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); } - - if(m_Pinv.size()>0) - m_P = m_Pinv.inverse(); else + { + m_Pinv.resize(0); m_P.resize(0); - - ap.resize(size,size); - ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); + if(UpLo==Lower || MatrixType::IsRowMajor) + { + // we have to transpose the lower part to the upper one + ap.resize(size,size); + ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>(); + } + else + internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap); + } } } // end namespace Eigen diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 302323ab4..a93189df2 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -126,7 +126,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& StorageIndex top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L - for(typename MatrixType::InnerIterator it(ap,k); it; ++it) + for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it) { StorageIndex i = it.index(); if(i <= k) diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 759fc08ff..8bde5d58e 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -73,7 +73,7 @@ class AmbiVector delete[] m_buffer; if (size<1000) { - Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar); + Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar); m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl)); m_buffer = new Scalar[allocSize]; } @@ -92,7 +92,7 @@ class AmbiVector Index copyElements = m_allocatedElements; m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size); Index allocSize = m_allocatedElements * sizeof(ListEl); - allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0); + allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index f98b42760..2d4f2bcf8 100644 --- 
a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -143,7 +143,7 @@ class CompressedStorage } /** Like at(), but the search is performed in the range [start,end) */ - inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const + inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const { if (start>=end) return defaultValue; diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h index 5e4580329..67718c85b 100644 --- a/Eigen/src/SparseCore/MappedSparseMatrix.h +++ b/Eigen/src/SparseCore/MappedSparseMatrix.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,9 +10,10 @@ #ifndef EIGEN_MAPPED_SPARSEMATRIX_H #define EIGEN_MAPPED_SPARSEMATRIX_H -namespace Eigen { +namespace Eigen { -/** \class MappedSparseMatrix +/** \deprecated Use Map<SparseMatrix<> > + * \class MappedSparseMatrix * * \brief Sparse matrix * @@ -25,179 +26,38 @@ namespace internal { template<typename _Scalar, int _Flags, typename _StorageIndex> struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> > {}; -} +} // end namespace internal template<typename _Scalar, int _Flags, typename _StorageIndex> class MappedSparseMatrix - : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > + : public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > { - public: - EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix) - enum { IsRowMajor = Base::IsRowMajor }; - - protected: - - StorageIndex m_outerSize; - StorageIndex m_innerSize; - StorageIndex m_nnz; - StorageIndex* m_outerIndex; - StorageIndex* m_innerIndices; - Scalar* m_values; + typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base; public: - - inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } - inline StorageIndex cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } - inline StorageIndex innerSize() const { return m_innerSize; } - inline StorageIndex outerSize() const { return m_outerSize; } - bool isCompressed() const { return true; } - - //---------------------------------------- - // direct access interface - inline const Scalar* valuePtr() const { return m_values; } - inline Scalar* valuePtr() { return m_values; } - - inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } - inline StorageIndex* innerIndexPtr() { return m_innerIndices; } - - inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } - inline StorageIndex* outerIndexPtr() { return m_outerIndex; } - //---------------------------------------- - - inline Scalar coeff(Index row, Index col) const - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? 
col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - if (start==end) - return Scalar(0); - else if (end>0 && inner==m_innerIndices[end-1]) - return m_values[end-1]; - // ^^ optimization: let's first check if it is the last coefficient - // (very common in high level algorithms) + typedef typename Base::StorageIndex StorageIndex; + typedef typename Base::Scalar Scalar; - const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); - const Index id = r-&m_innerIndices[0]; - return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); - } - - inline Scalar& coeffRef(Index row, Index col) - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); - eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); - StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner); - const Index id = r-&m_innerIndices[0]; - eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient"); - return m_values[id]; - } - - class InnerIterator; - class ReverseInnerIterator; - - /** \returns the number of non zero coefficients */ - inline StorageIndex nonZeros() const { return m_nnz; } - - inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr) - : m_outerSize(convert_index(IsRowMajor?rows:cols)), m_innerSize(convert_index(IsRowMajor?cols:rows)), m_nnz(convert_index(nnz)), - m_outerIndex(outerIndexPtr), m_innerIndices(innerIndexPtr), m_values(valuePtr) + inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr) {} /** Empty destructor */ inline ~MappedSparseMatrix() {} }; -template<typename Scalar, int _Flags, typename _StorageIndex> -class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::InnerIterator -{ - public: - InnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(convert_index(outer)), - m_id(mat.outerIndexPtr()[outer]), - m_start(m_id), - m_end(mat.outerIndexPtr()[outer+1]) - {} - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); } - - inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id]; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; - const StorageIndex m_end; -}; - -template<typename Scalar, int _Flags, typename _StorageIndex> -class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer+1]), - m_start(mat.outerIndexPtr()[outer]), - m_end(m_id) - {} - - inline ReverseInnerIterator& operator--() { m_id--; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); } - - inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id-1]; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; - const StorageIndex m_end; -}; - namespace internal { -template<typename _Scalar, int _Options, typename _Index> -struct evaluator<MappedSparseMatrix<_Scalar,_Options,_Index> > - : evaluator_base<MappedSparseMatrix<_Scalar,_Options,_Index> > +template<typename _Scalar, int _Options, typename _StorageIndex> +struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > + : evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > > { - typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType; - typedef typename MappedSparseMatrixType::InnerIterator InnerIterator; - typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = MappedSparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {} - - operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const MappedSparseMatrixType&() const { return *m_matrix; } + typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType; + typedef evaluator<SparseCompressedBase<XprType> > Base; - const MappedSparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 8db4bbb75..dfdf4314a 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -74,7 +74,7 @@ namespace internal { template<typename SparseMatrixType, int BlockRows, int BlockCols>
class sparse_matrix_block_impl
- : public SparseMatrixBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
+ : public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
{
typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
@@ -173,19 +173,24 @@ public: }
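// note the semantic change in this hunk: valuePtr() and innerIndexPtr() now
// alias the arrays of the whole underlying matrix instead of being pre-shifted
// by outerIndexPtr()[m_outerStart]; only outerIndexPtr() remains shifted, so
// the positions it yields index the underlying arrays directly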
inline const Scalar* valuePtr() const
- { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.valuePtr(); }
inline Scalar* valuePtr()
- { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.const_cast_derived().valuePtr(); }
inline const StorageIndex* innerIndexPtr() const
- { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.innerIndexPtr(); }
inline StorageIndex* innerIndexPtr()
- { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.const_cast_derived().innerIndexPtr(); }
inline const StorageIndex* outerIndexPtr() const
{ return m_matrix.outerIndexPtr() + m_outerStart; }
inline StorageIndex* outerIndexPtr()
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
+
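// like SparseMatrix, these return the null pointer 0 in compressed mode, so
// callers must check isCompressed() (added below) before dereferencing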
+ inline const StorageIndex* innerNonZeroPtr() const
+ { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
+ inline StorageIndex* innerNonZeroPtr()
+ { return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
StorageIndex nonZeros() const
{
@@ -197,6 +202,8 @@ public: else
return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
}
+
+ bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
const Scalar& lastCoeff() const
{
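The two new files that follow, SparseCompressedBase.h and SparseMap.h, factor the raw-array interface (valuePtr(), innerIndexPtr(), outerIndexPtr(), innerNonZeroPtr()) out of SparseMatrix so that maps, refs, blocks and transposes can share it, and MappedSparseMatrix is reduced to a deprecated alias of Map<SparseMatrix<> >. A minimal usage sketch of the Map<SparseMatrix<> > class introduced below, wrapping an existing compressed column storage; the data and variable names are our own illustration, not part of the patch:

#include <Eigen/SparseCore>
using namespace Eigen;

int main()
{
  // a 3x3 compressed column-major matrix with 2 nonzeros: (0,0)=1 and (2,1)=2
  int    outer[4]  = {0, 1, 2, 2};   // column start offsets, outer[3]==nnz
  int    inner[2]  = {0, 2};         // row index of each stored value
  double values[2] = {1.0, 2.0};

  // wrap the existing arrays without copying them
  Map<SparseMatrix<double> > view(3, 3, 2, outer, inner, values);
  return view.coeff(2, 1) == 2.0 ? 0 : 1;
}

Since SparseMatrix now advertises CompressedAccessBit (see the traits change further down), such a map should also bind to a Ref<const SparseMatrix<double> > without creating a temporary.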
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h new file mode 100644 index 000000000..a5ba45e04 --- /dev/null +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H +#define EIGEN_SPARSE_COMPRESSED_BASE_H + +namespace Eigen { + +template<typename Derived> class SparseCompressedBase; + +namespace internal { + +template<typename Derived> +struct traits<SparseCompressedBase<Derived> > : traits<Derived> +{}; + +} // end namespace internal + +template<typename Derived> +class SparseCompressedBase + : public SparseMatrixBase<Derived> +{ + public: + typedef SparseMatrixBase<Derived> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) + using Base::operator=; + using Base::IsRowMajor; + + class InnerIterator; + class ReverseInnerIterator; + + /** \returns a const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline const Scalar* valuePtr() const { return derived().valuePtr(); } + /** \returns a non-const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline Scalar* valuePtr() { return derived().valuePtr(); } + + /** \returns a const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); } + /** \returns a non-const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); } + + /** \returns a const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); } + /** \returns a non-const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); } + + /** \returns a const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); } + /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); } + + /** \returns whether \c *this is in compressed form. 
*/ + inline bool isCompressed() const { return innerNonZeroPtr()==0; } + +}; + +template<typename Derived> +class SparseCompressedBase<Derived>::InnerIterator +{ + public: + InnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_end = mat.outerIndexPtr()[outer+1]; + else + m_end = m_id + mat.innerNonZeroPtr()[outer]; + } + + inline InnerIterator& operator++() { m_id++; return *this; } + + inline const Scalar& value() const { return m_values[m_id]; } + inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); } + + inline StorageIndex index() const { return m_indices[m_id]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id < m_end); } + + protected: + const Scalar* m_values; + const StorageIndex* m_indices; + const Index m_outer; + Index m_id; + Index m_end; + private: + // If you get here, then you're not using the right InnerIterator type, e.g.: + // SparseMatrix<double,RowMajor> A; + // SparseMatrix<double>::InnerIterator it(A,0); + template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer); +}; + +template<typename Derived> +class SparseCompressedBase<Derived>::ReverseInnerIterator +{ + public: + ReverseInnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_id = mat.outerIndexPtr()[outer+1]; + else + m_id = m_start + mat.innerNonZeroPtr()[outer]; + } + + inline ReverseInnerIterator& operator--() { --m_id; return *this; } + + inline const Scalar& value() const { return m_values[m_id-1]; } + inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); } + + inline StorageIndex index() const { return m_indices[m_id-1]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id > m_start); } + + protected: + const Scalar* m_values; + const StorageIndex* m_indices; + const Index m_outer; + Index m_id; + const Index m_start; +}; + +namespace internal { + +template<typename Derived> +struct evaluator<SparseCompressedBase<Derived> > + : evaluator_base<Derived> +{ + typedef typename Derived::Scalar Scalar; + typedef typename Derived::InnerIterator InnerIterator; + typedef typename Derived::ReverseInnerIterator ReverseInnerIterator; + + enum { + CoeffReadCost = NumTraits<Scalar>::ReadCost, + Flags = Derived::Flags + }; + + evaluator() : m_matrix(0) {} + explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + + operator Derived&() { return m_matrix->const_cast_derived(); } + operator const Derived&() const { return *m_matrix; } + + typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType; + Scalar coeff(Index row, Index col) const + { return m_matrix->coeff(row,col); } + + Scalar& coeffRef(Index row, Index col) + { + eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? 
col : row; + + Index start = m_matrix->outerIndexPtr()[outer]; + Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; + eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) + - m_matrix->innerIndexPtr(); + eigen_assert((p<end) && (m_matrix->innerIndexPtr()[p]==inner) && "written coefficient does not exist"); + return m_matrix->const_cast_derived().valuePtr()[p]; + } + + const Derived *m_matrix; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_COMPRESSED_BASE_H diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h new file mode 100644 index 000000000..a6ff7d559 --- /dev/null +++ b/Eigen/src/SparseCore/SparseMap.h @@ -0,0 +1,239 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_MAP_H +#define EIGEN_SPARSE_MAP_H + +namespace Eigen { + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef traits<PlainObjectType> TraitsBase; + enum { + Flags = TraitsBase::Flags & (~NestByRefBit) + }; +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef traits<PlainObjectType> TraitsBase; + enum { + Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit)) + }; +}; + +} // end namespace internal + +template<typename Derived, + int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors +> class SparseMapBase; + +template<typename Derived> +class SparseMapBase<Derived,ReadOnlyAccessors> + : public SparseCompressedBase<Derived> +{ + public: + typedef SparseCompressedBase<Derived> Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + enum { IsRowMajor = Base::IsRowMajor }; + using Base::operator=; + protected: + + typedef typename internal::conditional< + bool(internal::is_lvalue<Derived>::value), + Scalar *, const Scalar *>::type ScalarPointer; + typedef typename internal::conditional< + bool(internal::is_lvalue<Derived>::value), + StorageIndex *, const StorageIndex *>::type IndexPointer; + + Index m_outerSize; + Index m_innerSize; + Index m_nnz; + IndexPointer m_outerIndex; + IndexPointer m_innerIndices; + ScalarPointer m_values; + IndexPointer m_innerNonZeros; + + public: + + inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + inline Index cols() const { return IsRowMajor ? 
m_innerSize : m_outerSize; } + inline Index innerSize() const { return m_innerSize; } + inline Index outerSize() const { return m_outerSize; } + + bool isCompressed() const { return m_innerNonZeros==0; } + + //---------------------------------------- + // direct access interface + inline const Scalar* valuePtr() const { return m_values; } + inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } + inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } + inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; } + //---------------------------------------- + + inline Scalar coeff(Index row, Index col) const + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = m_outerIndex[outer]; + Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer]; + if (start==end) + return Scalar(0); + else if (end>0 && inner==m_innerIndices[end-1]) + return m_values[end-1]; + // ^^ optimization: let's first check if it is the last coefficient + // (very common in high level algorithms) + + const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); + const Index id = r-&m_innerIndices[0]; + return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); + } + + /** \returns the number of non zero coefficients */ + inline Index nonZeros() const { return m_nnz; } + + inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr, + ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0) + : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr), + m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template<typename Derived> +class SparseMapBase<Derived,WriteAccessors> + : public SparseMapBase<Derived,ReadOnlyAccessors> +{ + typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase; + + public: + typedef SparseMapBase<Derived, ReadOnlyAccessors> Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + enum { IsRowMajor = Base::IsRowMajor }; + + using Base::operator=; + + public: + + //---------------------------------------- + // direct access interface + using Base::valuePtr; + using Base::innerIndexPtr; + using Base::outerIndexPtr; + using Base::innerNonZeroPtr; + inline Scalar* valuePtr() { return Base::m_values; } + inline StorageIndex* innerIndexPtr() { return Base::m_innerIndices; } + inline StorageIndex* outerIndexPtr() { return Base::m_outerIndex; } + inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; } + //---------------------------------------- + + inline Scalar& coeffRef(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = Base::m_outerIndex[outer]; + Index end = Base::isCompressed() ? 
Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer]; + eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); + eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); + Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner); + const Index id = r - &Base::m_innerIndices[0]; + eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient"); + return const_cast<Scalar*>(Base::m_values)[id]; + } + + inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, + Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + public: + typedef SparseMapBase<Map> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, + StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + public: + typedef SparseMapBase<Map> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr, + const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : 
Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_MAP_H diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index dd11fcafd..c049f9bcb 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -51,7 +51,7 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> > ColsAtCompileTime = Dynamic, MaxRowsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic, - Flags = _Options | NestByRefBit | LvalueBit, + Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -90,16 +90,20 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> template<typename _Scalar, int _Options, typename _Index> class SparseMatrix - : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> > + : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _Index> > { public: - EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) + typedef SparseCompressedBase<SparseMatrix> Base; + using Base::isCompressed; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) typedef MappedSparseMatrix<Scalar,Flags> Map; typedef Diagonal<SparseMatrix> DiagonalReturnType; typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType; + typedef typename Base::InnerIterator InnerIterator; + typedef typename Base::ReverseInnerIterator ReverseInnerIterator; using Base::IsRowMajor; @@ -124,9 +128,6 @@ class SparseMatrix public: - /** \returns whether \c *this is in compressed form. */ - inline bool isCompressed() const { return m_innerNonZeros==0; } - /** \returns the number of rows of the matrix */ inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } /** \returns the number of columns of the matrix */ @@ -180,7 +181,7 @@ class SparseMatrix /** \returns the value of the matrix at position \a i, \a j * This function returns Scalar(0) if the element is an explicit \em zero */ - inline const Scalar& coeff(Index row, Index col) const + inline Scalar coeff(Index row, Index col) const { eigen_assert(row>=0 && row<rows() && col>=0 && col<cols()); @@ -242,9 +243,6 @@ class SparseMatrix public: - class InnerIterator; - class ReverseInnerIterator; - /** Removes all non zeros but keep allocated memory */ inline void setZero() { @@ -874,77 +872,6 @@ private: }; }; -template<typename Scalar, int _Options, typename _Index> -class SparseMatrix<Scalar,_Options,_Index>::InnerIterator -{ - public: - InnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(convert_index(outer)), m_id(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_end = mat.m_outerIndex[outer+1]; - else - m_end = m_id + mat.m_innerNonZeros[outer]; - } - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline const Scalar& value() const { return m_values[m_id]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); } - - inline StorageIndex index() const { return m_indices[m_id]; } - inline StorageIndex outer() const { return m_outer; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end); } - - protected: - const Scalar* m_values; - const StorageIndex* m_indices; - const StorageIndex m_outer; - StorageIndex m_id; - StorageIndex m_end; - private: - // If you get here, then you're not using the right InnerIterator type, e.g.: - // SparseMatrix<double,RowMajor> A; - // SparseMatrix<double>::InnerIterator it(A,0); - template<typename T> InnerIterator(const SparseMatrixBase<T>&,Index outer); -}; - -template<typename Scalar, int _Options, typename _Index> -class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_id = mat.m_outerIndex[outer+1]; - else - m_id = m_start + mat.m_innerNonZeros[outer]; - } - - inline ReverseInnerIterator& operator--() { --m_id; return *this; } - - inline const Scalar& value() const { return m_values[m_id-1]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); } - - inline StorageIndex index() const { return m_indices[m_id-1]; } - inline StorageIndex outer() const { return m_outer; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id > m_start); } - - protected: - const Scalar* m_values; - const StorageIndex* m_indices; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; -}; - namespace internal { template<typename InputIterator, typename SparseMatrixType> @@ -1074,6 +1001,10 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit); if (needToTranspose) { @@ -1276,44 +1207,12 @@ namespace internal { template<typename _Scalar, int _Options, typename _Index> struct evaluator<SparseMatrix<_Scalar,_Options,_Index> > - : evaluator_base<SparseMatrix<_Scalar,_Options,_Index> > + : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > { - typedef _Scalar Scalar; + typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > Base; typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType; - typedef typename SparseMatrixType::InnerIterator InnerIterator; - typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = SparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {} - - operator SparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const SparseMatrixType&() const { return *m_matrix; } - - typedef typename DenseCoeffsBase<SparseMatrixType,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType; - CoeffReturnType coeff(Index row, Index col) const - { return m_matrix->coeff(row,col); } - - Scalar& coeffRef(Index row, Index col) - { - eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && 
col<m_matrix->cols()); - - const Index outer = SparseMatrixType::IsRowMajor ? row : col; - const Index inner = SparseMatrixType::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner); - eigen_assert((p<end) && (m_matrix->data().index(p)==inner) && "written coefficient does not exist"); - return m_matrix->const_cast_derived().data().value(p); - } - - const SparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const SparseMatrixType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h new file mode 100644 index 000000000..2ca039323 --- /dev/null +++ b/Eigen/src/SparseCore/SparseRef.h @@ -0,0 +1,192 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_REF_H +#define EIGEN_SPARSE_REF_H + +namespace Eigen { + +namespace internal { + +template<typename Derived> class SparseRefBase; + +template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType> +struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + enum { + Options = _Options, + Flags = traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit + }; + + template<typename Derived> struct match { + enum { + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch + }; + typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type; + }; + +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType> +struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > + : public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > +{ + enum { + Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + }; +}; + +template<typename Derived> +struct traits<SparseRefBase<Derived> > : public traits<Derived> {}; + +template<typename Derived> class SparseRefBase + : public SparseMapBase<Derived> +{ +public: + + typedef SparseMapBase<Derived> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) + + SparseRefBase() + : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) + {} + +protected: + + + template<typename Expression> + void construct(Expression& expr) + { + ::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), 
expr.valuePtr(), expr.innerNonZeroPtr()); + } +}; + +} // namespace internal + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > + : public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef internal::traits<Ref> Traits; + template<int OtherOptions> + inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr); + template<int OtherOptions> + inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr); + public: + + typedef internal::SparseRefBase<Ref> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template<int OtherOptions> + inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template<int OtherOptions> + inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template<typename Derived> + inline Ref(const SparseCompressedBase<Derived>& expr) + #else + template<typename Derived> + inline Ref(SparseCompressedBase<Derived>& expr) + #endif + { + EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.const_cast_derived()); + } +}; + +// this is the const ref version +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType; + typedef internal::traits<Ref> Traits; + public: + + typedef internal::SparseRefBase<Ref> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + template<typename Derived> + inline Ref(const SparseMatrixBase<Derived>& expr) + { + construct(expr.derived(), typename Traits::template match<Derived>::type()); + } + + inline Ref(const Ref& other) : Base(other) { + // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy + } + + template<typename OtherRef> + inline Ref(const RefBase<OtherRef>& other) { + construct(other.derived(), typename Traits::template match<OtherRef>::type()); + } + + protected: + + template<typename Expression> + void construct(const Expression& expr,internal::true_type) + { + Base::construct(expr); + } + + template<typename Expression> + void construct(const Expression& expr, internal::false_type) + { + m_object = expr; + Base::construct(m_object); + } + + protected: + TPlainObjectType m_object; +}; + + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, 
StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_REF_H diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index c74af46b3..84413c374 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -12,13 +12,41 @@ namespace Eigen { +namespace internal { + template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)> + class SparseTransposeImpl + : public SparseMatrixBase<Transpose<MatrixType> > + {}; + + template<typename MatrixType> + class SparseTransposeImpl<MatrixType,CompressedAccessBit> + : public SparseCompressedBase<Transpose<MatrixType> > + { + typedef SparseCompressedBase<Transpose<MatrixType> > Base; + public: + using Base::derived; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + + inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); } + inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); } + inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); } + inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); } + + inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); } + inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); } + inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); } + inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); } + }; +} + // Implement nonZeros() for transpose. I'm not sure that's the best approach for that. // Perhaps it should be implemented in Transpose<> itself. 
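// With CompressedAccessBit propagated, the transpose of a compressed sparse
// matrix now exposes the nested expression's raw arrays through the
// SparseTransposeImpl specialization above (the CSR view of a CSC matrix and
// vice versa). A sketch of what this is expected to enable, our illustration
// rather than part of the patch:
//   SparseMatrix<double> A(n,n);   // column-major, compressed
//   Ref<const SparseMatrix<double,RowMajor> > r(A.transpose());
// r should bind directly to A's arrays, with no temporary being created.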
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse> - : public SparseMatrixBase<Transpose<MatrixType> > + : public internal::SparseTransposeImpl<MatrixType> { protected: - typedef SparseMatrixBase<Transpose<MatrixType> > Base; + typedef internal::SparseTransposeImpl<MatrixType> Base; public: inline typename MatrixType::StorageIndex nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 5714150c2..ac2b4f10b 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -44,8 +44,7 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) // TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE -#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \ - typedef BaseClass Base; \ +#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \ typedef typename Eigen::internal::nested<Derived >::type Nested; \ @@ -61,7 +60,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) using Base::convert_index; #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >) + typedef Eigen::SparseMatrixBase<Derived > Base; \ + _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) const int CoherentAccessPattern = 0x1; const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 0c48fef3e..380ba25c0 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >, // Functions void initperfvalues() { - m_perfv.panel_size = 1; + m_perfv.panel_size = 16; m_perfv.relax = 1; m_perfv.maxsuper = 128; m_perfv.rowblk = 16; diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index f00bc3976..3a9b5fd74 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -627,8 +627,12 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b); + Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -651,6 +655,10 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &m_sluFerr[0], &m_sluBerr[0], &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; + m_info = info==0 ? 
Success : NumericalIssue; } @@ -938,8 +946,12 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b); + Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -962,6 +974,9 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest &recip_pivot_growth, &rcond, &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; m_info = info==0 ? Success : NumericalIssue; } diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 982aa2fca..47e8b6304 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -403,11 +403,22 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); int errorCode; + Scalar* x_ptr = 0; + Matrix<Scalar,Dynamic,1> x_tmp; + if(x.innerStride()!=1) + { + x_tmp.resize(x.rows()); + x_ptr = x_tmp.data(); + } for (int j=0; j<rhsCols; ++j) { + if(x.innerStride()==1) + x_ptr = &x.col(j).coeffRef(0); errorCode = umfpack_solve(UMFPACK_A, m_outerIndexPtr, m_innerIndexPtr, m_valuePtr, - &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0); + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0); + if(x.innerStride()!=1) + x.col(j) = x_tmp; if (errorCode!=0) return false; } diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index b299d9899..9444b450c 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -97,6 +97,7 @@ ENABLE_TESTING() add_subdirectory(libs/eigen3) add_subdirectory(libs/eigen2) +add_subdirectory(libs/tensors) add_subdirectory(libs/BLAS) add_subdirectory(libs/ublas) add_subdirectory(libs/gmm) diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt new file mode 100644 index 000000000..09d6d8e43 --- /dev/null +++ b/bench/btl/libs/tensors/CMakeLists.txt @@ -0,0 +1,44 @@ + + +if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR) + # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version + set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR}) + set(TENSOR_FOUND TRUE) +else() + find_package(Tensor) +endif() + +if (TENSOR_FOUND) + + include_directories(${TENSOR_INCLUDE_DIR}) + btl_add_bench(btl_tensor_linear main_linear.cpp) + btl_add_bench(btl_tensor_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + + option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF) + if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC) + btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp) + btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp) + + 
btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + endif() + + + if(NOT BTL_NOVEC) + btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF) + btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF) + btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF) + btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + + endif(NOT BTL_NOVEC) + +endif (TENSOR_FOUND) diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp new file mode 100644 index 000000000..e257f1e72 --- /dev/null +++ b/bench/btl/libs/tensors/main_linear.cpp @@ -0,0 +1,23 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp new file mode 100644 index 000000000..675fcfc6d --- /dev/null +++ b/bench/btl/libs/tensors/main_matmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp new file mode 100644 index 000000000..1af00c81b --- /dev/null +++ b/bench/btl/libs/tensors/main_vecmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh new file mode 100644 index 000000000..97b8e0f0b --- /dev/null +++ b/bench/btl/libs/tensors/tensor_interface.hh @@ -0,0 +1,105 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#ifndef TENSOR_INTERFACE_HH +#define TENSOR_INTERFACE_HH + +#include <unsupported/Eigen/CXX11/Tensor> +#include <vector> +#include "btl.hh" + +using namespace Eigen; + +template<class real> +class tensor_interface +{ +public : + typedef real real_type; + typedef typename Eigen::Tensor<real,2>::Index Index; + + typedef std::vector<real> stl_vector; + typedef std::vector<stl_vector> stl_matrix; + + typedef Eigen::Tensor<real,2> gene_matrix; + typedef Eigen::Tensor<real,1> gene_vector; + + + static inline std::string name( void ) + { + return EIGEN_MAKESTRING(BTL_PREFIX); + } + + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} + + static void free_vector(gene_vector & /*B*/) {} + + static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size())); + + for (unsigned int j=0; j<A_stl.size() ; j++){ + for (unsigned int i=0; i<A_stl[j].size() ; i++){ + A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i]; + } + } + } + + static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){ + B.resize(B_stl.size()); + + for (unsigned int i=0; i<B_stl.size() ; i++){ + B.coeffRef(i) = B_stl[i]; + } + } + + static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){ + for (unsigned int i=0; i<B_stl.size() ; i++){ + B_stl[i] = B.coeff(i); + } + } + + static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){ + int N=A_stl.size(); + + for (int j=0;j<N;j++){ + A_stl[j].resize(N); + for (int i=0;i<N;i++){ + A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j)); + } + } + } + + static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){ + typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ + typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ + Y += X.constant(coef) * X; + } + + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ + Y = X.constant(a)*X + Y.constant(b)*Y; + } + + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ + cible = source; + } + + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, 
gene_vector & cible, int /*N*/){ + cible = source; + } +}; + +#endif diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h new file mode 100644 index 000000000..525b9acda --- /dev/null +++ b/bench/tensors/tensor_benchmarks.h @@ -0,0 +1,305 @@ +#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ +#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ + +typedef int TensorIndex; +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "testing/base/public/benchmark.h" + +using Eigen::Tensor; +using Eigen::TensorMap; + + +// TODO(bsteiner): also templatize on the input type since we have users +// for int8 as well as floats. +template <typename Device> class BenchmarkSuite { + public: + BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) + : m_(m), k_(k), n_(n), device_(device) { + initialize(); + } + + BenchmarkSuite(const Device& device, size_t m) + : m_(m), k_(m), n_(m), device_(device) { + initialize(); + } + + ~BenchmarkSuite() { + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + } + + void memcpy(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void random(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = C.random(); + } + // Record the number of random numbers generated per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void slicing(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); + const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0)); + const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2)); + const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0)); + const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } + // Record the number of values copied from the rhs slice to the lhs slice + // each second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void shuffling(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(k_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + const 
Eigen::array<int, 2> shuffle(1, 0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } + // Record the number of values shuffled from A and copied to B each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void padding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_-3); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(k_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; + paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); + paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.pad(paddings); + } + // Record the number of values copied from the padded tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void striding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + const Eigen::array<TensorIndex, 2> strides(1, 2); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.stride(strides); + } + // Record the number of values copied from the strided tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void broadcasting(int num_iters) { + const Eigen::array<TensorIndex, 2> size_a(m_, 1); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_c(m_, n_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c); + +#if defined(__CUDACC__) + // nvcc doesn't support cxx11 + const Eigen::array<int, 2> broadcast(1, n_); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code.
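+    // In the branch below, Eigen::IndexList encodes the first broadcast factor
+    // as the compile-time constant type2index<1>, and set(1, n_) fills in the
+    // runtime-sized second factor, so the compiler statically knows that
+    // dimension 0 is never replicated.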
+ Eigen::IndexList<Eigen::type2index<1>, int> broadcast; + broadcast.set(1, n_); +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } + // Record the number of values broadcasted from A and copied to C each second + finalizeBenchmark(m_ * n_ * num_iters); + } + + void coeffWiseOp(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7); + } + // Record the number of FLOP executed per second (2 multiplications and + // 1 addition per value) + finalizeBenchmark(3 * m_ * m_ * num_iters); + } + + void algebraicFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + void transcendentalFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.exp() + B.log(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // Simple reduction + void reduction(int num_iters) { + const Eigen::array<TensorIndex, 2> input_size(k_, n_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size); + const Eigen::array<TensorIndex, 1> output_size(n_); + TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size); + + const Eigen::array<TensorIndex, 1> sum_along_dim(0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // do a contraction which is equivalent to a matrix multiplication + void contraction(int num_iters) { + const Eigen::array<TensorIndex, 2> sizeA(m_, k_); + const Eigen::array<TensorIndex, 2> sizeB(k_, n_); + const Eigen::array<TensorIndex, 2> sizeC(m_, n_); + + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<float, 2>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // 
additions for each value in the resulting tensor) + finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters); + } + + void convolution(int num_iters, int kernel_x, int kernel_y) { + const Eigen::array<TensorIndex, 2> input_sizes(m_, n_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes); + const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes); + const Eigen::array<TensorIndex, 2> result_sizes( + m_ - kernel_x + 1, n_ - kernel_y + 1); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes); + Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.convolve(B, dims); + } + // Record the number of FLOP executed per second (kernel_size + // multiplications and additions for each value in the resulting tensor) + finalizeBenchmark( + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + } + + private: + void initialize() { + a_ = (float *) device_.allocate(m_ * k_ * sizeof(float)); + b_ = (float *) device_.allocate(k_ * n_ * sizeof(float)); + c_ = (float *) device_.allocate(m_ * n_ * sizeof(float)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(float)); + device_.memset(b_, 23, k_ * n_ * sizeof(float)); + device_.memset(c_, 31, m_ * n_ * sizeof(float)); + + BenchmarkUseRealTime(); + } + + inline void finalizeBenchmark(int64 num_items) { +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) { + device_.synchronize(); + } +#endif + StopBenchmarkTiming(); + SetBenchmarkItemsProcessed(num_items); + } + + + size_t m_; + size_t k_; + size_t n_; + float* a_; + float* b_; + float* c_; + Device device_; +}; +#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc new file mode 100644 index 000000000..68653ba15 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -0,0 +1,156 @@ +#define EIGEN_USE_THREADS + +#include "base/sysinfo.h" +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" +#include "thread/threadpool.h" + +#ifdef __ANDROID__ +#define CREATE_THREAD_POOL(threads) \ +Eigen::ThreadPoolDevice device(threads); +#else +#define CREATE_THREAD_POOL(threads) \ +ThreadPool tp(threads); \ +tp.StartWorkers(); \ +Eigen::ThreadPoolDevice device(&tp, threads); +#endif + +// Simple functions +#define BM_FuncCPU(FUNC, THREADS) \ + static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + suite.FUNC(iters); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); + +BM_FuncCPU(memcpy, 4); +BM_FuncCPU(memcpy, 8); +BM_FuncCPU(memcpy, 12); + +BM_FuncCPU(random, 4); +BM_FuncCPU(random, 8); +BM_FuncCPU(random, 12); + +BM_FuncCPU(slicing, 4); +BM_FuncCPU(slicing, 8); +BM_FuncCPU(slicing, 12); + +BM_FuncCPU(shuffling, 4); +BM_FuncCPU(shuffling, 8); +BM_FuncCPU(shuffling, 12); + +BM_FuncCPU(padding, 4); +BM_FuncCPU(padding, 8); +BM_FuncCPU(padding, 12); + +BM_FuncCPU(striding, 4); +BM_FuncCPU(striding, 8); +BM_FuncCPU(striding, 12); + +BM_FuncCPU(broadcasting, 4); +BM_FuncCPU(broadcasting, 8); +BM_FuncCPU(broadcasting, 12); + 
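+// A rough sketch of what one of the registrations above expands to on a
+// non-Android build, e.g. BM_FuncCPU(memcpy, 4) (StrCat and BENCHMARK_RANGE
+// come from the strings and benchmark headers included at the top):
+//
+//   static void BM_memcpy_4T(int iters, int N) {
+//     StopBenchmarkTiming();
+//     ThreadPool tp(4);
+//     tp.StartWorkers();
+//     Eigen::ThreadPoolDevice device(&tp, 4);
+//     BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);
+//     suite.memcpy(iters);
+//     SetBenchmarkLabel(StrCat("using ", 4, " threads"));
+//   }
+//   BENCHMARK_RANGE(BM_memcpy_4T, 10, 5000);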
+BM_FuncCPU(coeffWiseOp, 4); +BM_FuncCPU(coeffWiseOp, 8); +BM_FuncCPU(coeffWiseOp, 12); + +BM_FuncCPU(algebraicFunc, 4); +BM_FuncCPU(algebraicFunc, 8); +BM_FuncCPU(algebraicFunc, 12); + +BM_FuncCPU(transcendentalFunc, 4); +BM_FuncCPU(transcendentalFunc, 8); +BM_FuncCPU(transcendentalFunc, 12); + +BM_FuncCPU(reduction, 4); +BM_FuncCPU(reduction, 8); +BM_FuncCPU(reduction, 12); + + +// Contractions +#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ + static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\ + StopBenchmarkTiming(); \ + if (THREADS == 1) { \ + Eigen::DefaultDevice device; \ + BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); + + +BM_FuncWithInputDimsCPU(contraction, N, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); + + +// Convolutions +#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \ + static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); + +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12); + +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12); + +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12); + 
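+// The items-processed count for these convolution benchmarks comes straight
+// from BenchmarkSuite::convolution: (m - kernel_x + 1) * (n - kernel_y + 1) *
+// kernel_x * kernel_y * 2 per iteration. For instance, a 7x64 kernel over an
+// N x N input records (N - 6) * (N - 63) * 7 * 64 * 2 * iters items.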
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc new file mode 100644 index 000000000..adea754ad --- /dev/null +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -0,0 +1,75 @@ +#define EIGEN_USE_GPU + +#include <cuda.h> +#include <cuda_runtime.h> +#include <iostream> +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" + + + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(memcpy); +BM_FuncGPU(random); +BM_FuncGPU(slicing); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); +BM_FuncGPU(broadcasting); +BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(reduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters, DIM1, DIM2); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index a9bc05137..d0efb4188 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -14,23 +14,18 @@ endif() add_custom_target(blas) -set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp) - -if(EIGEN_Fortran_COMPILER_WORKS) - -set(EigenBlas_SRCS ${EigenBlas_SRCS} - complexdots.f - srotm.f srotmg.f drotm.f drotmg.f - lsame.f dspmv.f ssbmv.f - chbmv.f sspmv.f - zhbmv.f chpmv.f dsbmv.f - zhpmv.f - dtbmv.f stbmv.f ctbmv.f ztbmv.f -) +set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp + f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c + f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c + f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c + f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c + f2c/ztbmv.c f2c/d_cnjg.c f2c/r_cnjg.c + ) + +if (EIGEN_Fortran_COMPILER_WORKS) + set(EigenBlas_SRCS ${EigenBlas_SRCS} fortran/complexdots.f) else() - -message(WARNING " No fortran compiler 
has been detected, the blas build will be incomplete.") - + set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c) endif() add_library(eigen_blas_static ${EigenBlas_SRCS}) diff --git a/blas/chbmv.f b/blas/chbmv.f deleted file mode 100644 index 1b1c330ea..000000000 --- a/blas/chbmv.f +++ /dev/null @@ -1,310 +0,0 @@ - SUBROUTINE CHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - COMPLEX ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - COMPLEX A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* CHBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - COMPLEX . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - COMPLEX array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. 
-* -* BETA - COMPLEX . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - COMPLEX array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ONE - PARAMETER (ONE= (1.0E+0,0.0E+0)) - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,MAX,MIN,REAL -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CHBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*REAL(A(1,J)) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*REAL(A(1,J)) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of CHBMV . -* - END diff --git a/blas/chpmv.f b/blas/chpmv.f deleted file mode 100644 index 158be5a7b..000000000 --- a/blas/chpmv.f +++ /dev/null @@ -1,272 +0,0 @@ - SUBROUTINE CHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - COMPLEX ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - COMPLEX AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* CHPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - COMPLEX . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - COMPLEX array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* X - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. 
-* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ONE - PARAMETER (ONE= (1.0E+0,0.0E+0)) - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,REAL -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CHPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*REAL(AP(KK)) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*REAL(AP(KK)) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of CHPMV . 
-* - END diff --git a/blas/common.h b/blas/common.h index c39cc63a8..5ecb153e2 100644 --- a/blas/common.h +++ b/blas/common.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -13,7 +13,6 @@ #include <Eigen/Core> #include <Eigen/Jacobi> -#include <iostream> #include <complex> #ifndef SCALAR diff --git a/blas/ctbmv.f b/blas/ctbmv.f deleted file mode 100644 index 5a879fa01..000000000 --- a/blas/ctbmv.f +++ /dev/null @@ -1,366 +0,0 @@ - SUBROUTINE CTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - COMPLEX A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* CTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, or x := conjg( A' )*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := conjg( A' )*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - COMPLEX array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. 
The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOCONJ,NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOCONJ = LSAME(TRANS,'T') - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x or x := conjg( A' )*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 110 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J)) - DO 100 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + CONJG(A(L+I,J))*X(I) - 100 CONTINUE - END IF - X(J) = TEMP - 110 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 140 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 120 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 120 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J)) - DO 130 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + CONJG(A(L+I,J))*X(IX) - IX = IX - INCX - 130 CONTINUE - END IF - X(JX) = TEMP - JX = JX - INCX - 140 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 170 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 150 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J)) - DO 160 I = J + 1,MIN(N,J+K) - TEMP = TEMP + CONJG(A(L+I,J))*X(I) - 160 CONTINUE - END IF - X(J) = TEMP - 170 CONTINUE - ELSE - JX = KX - DO 200 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 180 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 180 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J)) - DO 190 I = J + 1,MIN(N,J+K) - TEMP = TEMP + CONJG(A(L+I,J))*X(IX) - IX = IX + INCX - 190 CONTINUE - END IF - X(JX) = TEMP - JX = JX + INCX - 200 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of CTBMV . -* - END diff --git a/blas/drotm.f b/blas/drotm.f deleted file mode 100644 index 63a3b1134..000000000 --- a/blas/drotm.f +++ /dev/null @@ -1,147 +0,0 @@ - SUBROUTINE DROTM(N,DX,INCX,DY,INCY,DPARAM) -* .. Scalar Arguments .. - INTEGER INCX,INCY,N -* .. -* .. Array Arguments .. - DOUBLE PRECISION DPARAM(5),DX(*),DY(*) -* .. -* -* Purpose -* ======= -* -* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX -* -* (DX**T) , WHERE **T INDICATES TRANSPOSE. 
THE ELEMENTS OF DX ARE IN -* (DY**T) -* -* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE -* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. -* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 -* -* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) -* H=( ) ( ) ( ) ( ) -* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). -* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. -* -* Arguments -* ========= -* -* N (input) INTEGER -* number of elements in input vector(s) -* -* DX (input/output) DOUBLE PRECISION array, dimension N -* double precision vector with N elements -* -* INCX (input) INTEGER -* storage spacing between elements of DX -* -* DY (input/output) DOUBLE PRECISION array, dimension N -* double precision vector with N elements -* -* INCY (input) INTEGER -* storage spacing between elements of DY -* -* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 -* DPARAM(1)=DFLAG -* DPARAM(2)=DH11 -* DPARAM(3)=DH21 -* DPARAM(4)=DH12 -* DPARAM(5)=DH22 -* -* ===================================================================== -* -* .. Local Scalars .. - DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,TWO,W,Z,ZERO - INTEGER I,KX,KY,NSTEPS -* .. -* .. Data statements .. - DATA ZERO,TWO/0.D0,2.D0/ -* .. -* - DFLAG = DPARAM(1) - IF (N.LE.0 .OR. (DFLAG+TWO.EQ.ZERO)) GO TO 140 - IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70 -* - NSTEPS = N*INCX - IF (DFLAG) 50,10,30 - 10 CONTINUE - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DO 20 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W + Z*DH12 - DY(I) = W*DH21 + Z - 20 CONTINUE - GO TO 140 - 30 CONTINUE - DH11 = DPARAM(2) - DH22 = DPARAM(5) - DO 40 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W*DH11 + Z - DY(I) = -W + DH22*Z - 40 CONTINUE - GO TO 140 - 50 CONTINUE - DH11 = DPARAM(2) - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DH22 = DPARAM(5) - DO 60 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W*DH11 + Z*DH12 - DY(I) = W*DH21 + Z*DH22 - 60 CONTINUE - GO TO 140 - 70 CONTINUE - KX = 1 - KY = 1 - IF (INCX.LT.0) KX = 1 + (1-N)*INCX - IF (INCY.LT.0) KY = 1 + (1-N)*INCY -* - IF (DFLAG) 120,80,100 - 80 CONTINUE - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DO 90 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W + Z*DH12 - DY(KY) = W*DH21 + Z - KX = KX + INCX - KY = KY + INCY - 90 CONTINUE - GO TO 140 - 100 CONTINUE - DH11 = DPARAM(2) - DH22 = DPARAM(5) - DO 110 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W*DH11 + Z - DY(KY) = -W + DH22*Z - KX = KX + INCX - KY = KY + INCY - 110 CONTINUE - GO TO 140 - 120 CONTINUE - DH11 = DPARAM(2) - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DH22 = DPARAM(5) - DO 130 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W*DH11 + Z*DH12 - DY(KY) = W*DH21 + Z*DH22 - KX = KX + INCX - KY = KY + INCY - 130 CONTINUE - 140 CONTINUE - RETURN - END diff --git a/blas/drotmg.f b/blas/drotmg.f deleted file mode 100644 index 3ae647b08..000000000 --- a/blas/drotmg.f +++ /dev/null @@ -1,206 +0,0 @@ - SUBROUTINE DROTMG(DD1,DD2,DX1,DY1,DPARAM) -* .. Scalar Arguments .. - DOUBLE PRECISION DD1,DD2,DX1,DY1 -* .. -* .. Array Arguments .. - DOUBLE PRECISION DPARAM(5) -* .. -* -* Purpose -* ======= -* -* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS -* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* -* DY2)**T. -* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. 
-* -* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 -* -* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) -* H=( ) ( ) ( ) ( ) -* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). -* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 -* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE -* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) -* -* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE -* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE -* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. -* -* -* Arguments -* ========= -* -* DD1 (input/output) DOUBLE PRECISION -* -* DD2 (input/output) DOUBLE PRECISION -* -* DX1 (input/output) DOUBLE PRECISION -* -* DY1 (input) DOUBLE PRECISION -* -* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 -* DPARAM(1)=DFLAG -* DPARAM(2)=DH11 -* DPARAM(3)=DH21 -* DPARAM(4)=DH12 -* DPARAM(5)=DH22 -* -* ===================================================================== -* -* .. Local Scalars .. - DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,DP1,DP2,DQ1,DQ2,DTEMP, - + DU,GAM,GAMSQ,ONE,RGAMSQ,TWO,ZERO - INTEGER IGO -* .. -* .. Intrinsic Functions .. - INTRINSIC DABS -* .. -* .. Data statements .. -* - DATA ZERO,ONE,TWO/0.D0,1.D0,2.D0/ - DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/ -* .. - - IF (.NOT.DD1.LT.ZERO) GO TO 10 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 10 CONTINUE -* CASE-DD1-NONNEGATIVE - DP2 = DD2*DY1 - IF (.NOT.DP2.EQ.ZERO) GO TO 20 - DFLAG = -TWO - GO TO 260 -* REGULAR-CASE.. - 20 CONTINUE - DP1 = DD1*DX1 - DQ2 = DP2*DY1 - DQ1 = DP1*DX1 -* - IF (.NOT.DABS(DQ1).GT.DABS(DQ2)) GO TO 40 - DH21 = -DY1/DX1 - DH12 = DP2/DP1 -* - DU = ONE - DH12*DH21 -* - IF (.NOT.DU.LE.ZERO) GO TO 30 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 30 CONTINUE - DFLAG = ZERO - DD1 = DD1/DU - DD2 = DD2/DU - DX1 = DX1*DU -* GO SCALE-CHECK.. - GO TO 100 - 40 CONTINUE - IF (.NOT.DQ2.LT.ZERO) GO TO 50 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 50 CONTINUE - DFLAG = ONE - DH11 = DP1/DP2 - DH22 = DX1/DY1 - DU = ONE + DH11*DH22 - DTEMP = DD2/DU - DD2 = DD1/DU - DD1 = DTEMP - DX1 = DY1*DU -* GO SCALE-CHECK - GO TO 100 -* PROCEDURE..ZERO-H-D-AND-DX1.. - 60 CONTINUE - DFLAG = -ONE - DH11 = ZERO - DH12 = ZERO - DH21 = ZERO - DH22 = ZERO -* - DD1 = ZERO - DD2 = ZERO - DX1 = ZERO -* RETURN.. - GO TO 220 -* PROCEDURE..FIX-H.. - 70 CONTINUE - IF (.NOT.DFLAG.GE.ZERO) GO TO 90 -* - IF (.NOT.DFLAG.EQ.ZERO) GO TO 80 - DH11 = ONE - DH22 = ONE - DFLAG = -ONE - GO TO 90 - 80 CONTINUE - DH21 = -ONE - DH12 = ONE - DFLAG = -ONE - 90 CONTINUE - GO TO IGO(120,150,180,210) -* PROCEDURE..SCALE-CHECK - 100 CONTINUE - 110 CONTINUE - IF (.NOT.DD1.LE.RGAMSQ) GO TO 130 - IF (DD1.EQ.ZERO) GO TO 160 - ASSIGN 120 TO IGO -* FIX-H.. - GO TO 70 - 120 CONTINUE - DD1 = DD1*GAM**2 - DX1 = DX1/GAM - DH11 = DH11/GAM - DH12 = DH12/GAM - GO TO 110 - 130 CONTINUE - 140 CONTINUE - IF (.NOT.DD1.GE.GAMSQ) GO TO 160 - ASSIGN 150 TO IGO -* FIX-H.. - GO TO 70 - 150 CONTINUE - DD1 = DD1/GAM**2 - DX1 = DX1*GAM - DH11 = DH11*GAM - DH12 = DH12*GAM - GO TO 140 - 160 CONTINUE - 170 CONTINUE - IF (.NOT.DABS(DD2).LE.RGAMSQ) GO TO 190 - IF (DD2.EQ.ZERO) GO TO 220 - ASSIGN 180 TO IGO -* FIX-H.. - GO TO 70 - 180 CONTINUE - DD2 = DD2*GAM**2 - DH21 = DH21/GAM - DH22 = DH22/GAM - GO TO 170 - 190 CONTINUE - 200 CONTINUE - IF (.NOT.DABS(DD2).GE.GAMSQ) GO TO 220 - ASSIGN 210 TO IGO -* FIX-H.. 
- GO TO 70 - 210 CONTINUE - DD2 = DD2/GAM**2 - DH21 = DH21*GAM - DH22 = DH22*GAM - GO TO 200 - 220 CONTINUE - IF (DFLAG) 250,230,240 - 230 CONTINUE - DPARAM(3) = DH21 - DPARAM(4) = DH12 - GO TO 260 - 240 CONTINUE - DPARAM(2) = DH11 - DPARAM(5) = DH22 - GO TO 260 - 250 CONTINUE - DPARAM(2) = DH11 - DPARAM(3) = DH21 - DPARAM(4) = DH12 - DPARAM(5) = DH22 - 260 CONTINUE - DPARAM(1) = DFLAG - RETURN - END diff --git a/blas/dsbmv.f b/blas/dsbmv.f deleted file mode 100644 index 8c82d1fa1..000000000 --- a/blas/dsbmv.f +++ /dev/null @@ -1,304 +0,0 @@ - SUBROUTINE DSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE PRECISION ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* DSBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - DOUBLE PRECISION. -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). 
-* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - DOUBLE PRECISION. -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - DOUBLE PRECISION array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE,ZERO - PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DSBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*A(1,J) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*A(1,J) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of DSBMV . -* - END diff --git a/blas/dspmv.f b/blas/dspmv.f deleted file mode 100644 index f6e121e76..000000000 --- a/blas/dspmv.f +++ /dev/null @@ -1,265 +0,0 @@ - SUBROUTINE DSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE PRECISION ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* DSPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - DOUBLE PRECISION. -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - DOUBLE PRECISION array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - DOUBLE PRECISION. -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. 
-* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE,ZERO - PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DSPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*AP(KK) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*AP(KK) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of DSPMV . -* - END diff --git a/blas/dtbmv.f b/blas/dtbmv.f deleted file mode 100644 index a87ffdeae..000000000 --- a/blas/dtbmv.f +++ /dev/null @@ -1,335 +0,0 @@ - SUBROUTINE DTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION A(LDA,*),X(*) -* .. 
-* -* Purpose -* ======= -* -* DTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := A'*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. 
INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER (ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 100 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - X(J) = TEMP - 100 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 120 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 110 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 110 CONTINUE - X(JX) = TEMP - JX = JX - INCX - 120 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 140 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 130 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 130 CONTINUE - X(J) = TEMP - 140 CONTINUE - ELSE - JX = KX - DO 160 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 150 CONTINUE - X(JX) = TEMP - JX = JX + INCX - 160 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of DTBMV . -* - END diff --git a/blas/f2c/chbmv.c b/blas/f2c/chbmv.c new file mode 100644 index 000000000..f218fe3f5 --- /dev/null +++ b/blas/f2c/chbmv.c @@ -0,0 +1,487 @@ +/* chbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int chbmv_(char *uplo, integer *n, integer *k, complex * + alpha, complex *a, integer *lda, complex *x, integer *incx, complex * + beta, complex *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + real r__1; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + complex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CHBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. 
*/ + +/* A - COMPLEX array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - COMPLEX array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("CHBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && + beta->i == 0.f))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (beta->r != 1.f || beta->i != 0.f) { + if (*incy == 1) { + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0.f, y[i__2].i = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0.f, y[i__2].i = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0.f && alpha->i == 0.f) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. 
*/ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__2 = i__; + q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i = + q__3.r * x[i__2].i + q__3.i * x[i__2].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L50: */ + } + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + r__1 = a[i__3].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i = + alpha->r * x[i__4].i + alpha->i * x[i__4].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__3 = jy; + i__4 = jy; + i__2 = kplus1 + j * a_dim1; + r__1 = a[i__2].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
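Here the diagonal of column j sits in row 1 of the band array, again referenced through its real part only, and the stored sub-diagonal entries a( i, j ) for i = j + 1 .. min( n, j + k ) are used directly for y( i ) and conjugated when accumulated into TEMP2 for y( j ).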
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = j; + q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__3 = j; + i__4 = j; + i__2 = j * a_dim1 + 1; + r__1 = a[i__2].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__2 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L90: */ + } + i__3 = j; + i__4 = j; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = jx; + q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__3 = jy; + i__4 = jy; + i__2 = j * a_dim1 + 1; + r__1 = a[i__2].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L110: */ + } + i__3 = jy; + i__4 = jy; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of CHBMV . */ + +} /* chbmv_ */ + diff --git a/blas/f2c/chpmv.c b/blas/f2c/chpmv.c new file mode 100644 index 000000000..65bab1c7f --- /dev/null +++ b/blas/f2c/chpmv.c @@ -0,0 +1,438 @@ +/* chpmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int chpmv_(char *uplo, integer *n, complex *alpha, complex * + ap, complex *x, integer *incx, complex *beta, complex *y, integer * + incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + real r__1; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + complex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CHPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - COMPLEX array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* X - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. 
On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("CHPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && + beta->i == 0.f))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (beta->r != 1.f || beta->i != 0.f) { + if (*incy == 1) { + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0.f, y[i__2].i = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0.f, y[i__2].i = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0.f && alpha->i == 0.f) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
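Column j of the packed upper triangle occupies AP( KK ) .. AP( KK + J - 1 ), with the diagonal element a( j, j ) stored last at AP( KK + J - 1 ) and referenced through its real part only; after each column, KK advances by j.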
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = i__; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ++k; +/* L50: */ + } + i__2 = j; + i__3 = j; + i__4 = kk + j - 1; + r__1 = ap[i__4].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + i__3 = iy; + i__4 = iy; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = ix; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__2 = jy; + i__3 = jy; + i__4 = kk + j - 1; + r__1 = ap[i__4].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
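In the packed lower triangle the diagonal element a( j, j ) comes first, at AP( KK ), followed by the sub-diagonal entries a( j + 1, j ) .. a( n, j ), which are conjugated when accumulated into TEMP2; KK therefore advances by n - j + 1 per column.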
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__2 = j; + i__3 = j; + i__4 = kk; + r__1 = ap[i__4].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = i__; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ++k; +/* L90: */ + } + i__2 = j; + i__3 = j; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__2 = jy; + i__3 = jy; + i__4 = kk; + r__1 = ap[i__4].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + i__3 = iy; + i__4 = iy; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = ix; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L110: */ + } + i__2 = jy; + i__3 = jy; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of CHPMV . */ + +} /* chpmv_ */ + diff --git a/blas/f2c/complexdots.c b/blas/f2c/complexdots.c new file mode 100644 index 000000000..a856a231c --- /dev/null +++ b/blas/f2c/complexdots.c @@ -0,0 +1,84 @@ +/* This file has been modified to use the standard gfortran calling + convention, rather than the f2c calling convention. + + It does not require -ff2c when compiled with gfortran. +*/ + +/* complexdots.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +complex cdotc_(integer *n, complex *cx, integer + *incx, complex *cy, integer *incy) +{ + complex res; + extern /* Subroutine */ int cdotcw_(integer *, complex *, integer *, + complex *, integer *, complex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + cdotcw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* cdotc_ */ + +complex cdotu_(integer *n, complex *cx, integer + *incx, complex *cy, integer *incy) +{ + complex res; + extern /* Subroutine */ int cdotuw_(integer *, complex *, integer *, + complex *, integer *, complex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + cdotuw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* cdotu_ */ + +doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, + doublecomplex *cy, integer *incy) +{ + doublecomplex res; + extern /* Subroutine */ int zdotcw_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + zdotcw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* zdotc_ */ + +doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, + doublecomplex *cy, integer *incy) +{ + doublecomplex res; + extern /* Subroutine */ int zdotuw_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + zdotuw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* zdotu_ */ + diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c new file mode 100644 index 000000000..790fd581f --- /dev/null +++ b/blas/f2c/ctbmv.c @@ -0,0 +1,647 @@ +/* ctbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ctbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, complex *a, integer *lda, complex *x, integer *incx, + ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + complex q__1, q__2, q__3; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + complex temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical noconj, nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. 
*/ + +/* Purpose */ +/* ======= */ + +/* CTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := conjg( A' )*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. */ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - COMPLEX array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer an upper */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer a lower */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that when DIAG = 'U' or 'u' the elements of the array A */ +/* corresponding to the diagonal elements of the matrix are not */ +/* referenced, but are assumed to be unity. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. 
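Note that the routine references at most k + 1 rows of A, so a tightly packed band array with LDA = k + 1 is sufficient.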
*/ + +/* X - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. On exit, X is overwritten with the */ +/* transformed vector x. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("CTBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1); + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. 
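The product is formed in place, one column at a time: whenever x( j ) is nonzero it is scattered through the stored part of column j, and x( j ) itself is scaled by the diagonal entry unless DIAG = 'U'; columns with x( j ) = 0 are skipped, and no conjugation is involved in this branch.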
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + if (x[i__2].r != 0.f || x[i__2].i != 0.f) { + i__2 = j; + temp.r = x[i__2].r, temp.i = x[i__2].i; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + q__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + + q__2.i; + x[i__2].r = q__1.r, x[i__2].i = q__1.i; +/* L10: */ + } + if (nounit) { + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[ + i__3].i, q__1.i = x[i__2].r * a[i__3].i + + x[i__2].i * a[i__3].r; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + if (x[i__4].r != 0.f || x[i__4].i != 0.f) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = ix; + i__2 = ix; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + q__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + + q__2.i; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + ix += *incx; +/* L30: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__2 = kplus1 + j * a_dim1; + q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[ + i__2].i, q__1.i = x[i__4].r * a[i__2].i + + x[i__4].i * a[i__2].r; + x[i__3].r = q__1.r, x[i__3].i = q__1.i; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__1 = j; + if (x[i__1].r != 0.f || x[i__1].i != 0.f) { + i__1 = j; + temp.r = x[i__1].r, temp.i = x[i__1].i; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + i__1 = i__; + i__3 = i__; + i__2 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + q__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + + q__2.i; + x[i__1].r = q__1.r, x[i__1].i = q__1.i; +/* L50: */ + } + if (nounit) { + i__4 = j; + i__1 = j; + i__3 = j * a_dim1 + 1; + q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[ + i__3].i, q__1.i = x[i__1].r * a[i__3].i + + x[i__1].i * a[i__3].r; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__4 = jx; + if (x[i__4].r != 0.f || x[i__4].i != 0.f) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + i__4 = ix; + i__1 = ix; + i__2 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + q__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + + q__2.i; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + ix -= *incx; +/* L70: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__1 = j * a_dim1 + 1; + q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[ + i__1].i, q__1.i = x[i__4].r * a[i__1].i + + x[i__4].i * a[i__1].r; + x[i__3].r = q__1.r, x[i__3].i = 
q__1.i; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x or x := conjg( A' )*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__3 = j; + temp.r = x[i__3].r, temp.i = x[i__3].i; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + q__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = i__; + q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, q__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L90: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, + q__2.i = q__3.r * x[i__4].i + q__3.i * x[ + i__4].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L100: */ + } + } + i__3 = j; + x[i__3].r = temp.r, x[i__3].i = temp.i; +/* L110: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__3 = jx; + temp.r = x[i__3].r, temp.i = x[i__3].i; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + q__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = ix; + q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, q__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix -= *incx; +/* L120: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, + q__2.i = q__3.r * x[i__4].i + q__3.i * x[ + i__4].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix -= *incx; +/* L130: */ + } + } + i__3 = jx; + x[i__3].r = temp.r, x[i__3].i = temp.i; + jx -= *incx; +/* L140: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = j; + temp.r = x[i__4].r, temp.i = x[i__4].i; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + q__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; 
++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = i__; + q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, q__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L150: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[j * a_dim1 + 1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__1 = i__; + q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, + q__2.i = q__3.r * x[i__1].i + q__3.i * x[ + i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L160: */ + } + } + i__4 = j; + x[i__4].r = temp.r, x[i__4].i = temp.i; +/* L170: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + kx += *incx; + ix = kx; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + q__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = ix; + q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, q__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix += *incx; +/* L180: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[j * a_dim1 + 1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__1 = ix; + q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, + q__2.i = q__3.r * x[i__1].i + q__3.i * x[ + i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix += *incx; +/* L190: */ + } + } + i__4 = jx; + x[i__4].r = temp.r, x[i__4].i = temp.i; + jx += *incx; +/* L200: */ + } + } + } + } + + return 0; + +/* End of CTBMV . */ + +} /* ctbmv_ */ + diff --git a/blas/f2c/d_cnjg.c b/blas/f2c/d_cnjg.c new file mode 100644 index 000000000..623090c6b --- /dev/null +++ b/blas/f2c/d_cnjg.c @@ -0,0 +1,6 @@ +#include "datatypes.h" + +void d_cnjg(doublecomplex *r, doublecomplex *z) { + r->r = z->r; + r->i = -(z->i); +} diff --git a/blas/f2c/datatypes.h b/blas/f2c/datatypes.h new file mode 100644 index 000000000..63232b246 --- /dev/null +++ b/blas/f2c/datatypes.h @@ -0,0 +1,24 @@ +/* This contains a limited subset of the typedefs exposed by f2c + for use by the Eigen BLAS C-only implementation. +*/ + +#ifndef __EIGEN_DATATYPES_H__ +#define __EIGEN_DATATYPES_H__ + +typedef int integer; +typedef unsigned int uinteger; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +typedef int ftnlen; +typedef int logical; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (doublereal)abs(x) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (doublereal)min(a,b) +#define dmax(a,b) (doublereal)max(a,b) + +#endif diff --git a/blas/f2c/drotm.c b/blas/f2c/drotm.c new file mode 100644 index 000000000..17a779b74 --- /dev/null +++ b/blas/f2c/drotm.c @@ -0,0 +1,215 @@ +/* drotm.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int drotm_(integer *n, doublereal *dx, integer *incx, + doublereal *dy, integer *incy, doublereal *dparam) +{ + /* Initialized data */ + + static doublereal zero = 0.; + static doublereal two = 2.; + + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__; + doublereal w, z__; + integer kx, ky; + doublereal dh11, dh12, dh21, dh22, dflag; + integer nsteps; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */ + +/* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */ +/* (DY**T) */ + +/* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */ +/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */ +/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ + +/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ +/* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */ + +/* Arguments */ +/* ========= */ + +/* N (input) INTEGER */ +/* number of elements in input vector(s) */ + +/* DX (input/output) DOUBLE PRECISION array, dimension N */ +/* double precision vector with N elements */ + +/* INCX (input) INTEGER */ +/* storage spacing between elements of DX */ + +/* DY (input/output) DOUBLE PRECISION array, dimension N */ +/* double precision vector with N elements */ + +/* INCY (input) INTEGER */ +/* storage spacing between elements of DY */ + +/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */ +/* DPARAM(1)=DFLAG */ +/* DPARAM(2)=DH11 */ +/* DPARAM(3)=DH21 */ +/* DPARAM(4)=DH12 */ +/* DPARAM(5)=DH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --dparam; + --dy; + --dx; + + /* Function Body */ +/* .. */ + + dflag = dparam[1]; + if (*n <= 0 || dflag + two == zero) { + goto L140; + } + if (! (*incx == *incy && *incx > 0)) { + goto L70; + } + + nsteps = *n * *incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w + z__ * dh12; + dy[i__] = w * dh21 + z__; +/* L20: */ + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = *incx; + for (i__ = 1; i__1 < 0 ? 
i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__; + dy[i__] = -w + dh22 * z__; +/* L40: */ + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__ * dh12; + dy[i__] = w * dh21 + z__ * dh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (*incx < 0) { + kx = (1 - *n) * *incx + 1; + } + if (*incy < 0) { + ky = (1 - *n) * *incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w + z__ * dh12; + dy[ky] = w * dh21 + z__; + kx += *incx; + ky += *incy; +/* L90: */ + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__; + dy[ky] = -w + dh22 * z__; + kx += *incx; + ky += *incy; +/* L110: */ + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__ * dh12; + dy[ky] = w * dh21 + z__ * dh22; + kx += *incx; + ky += *incy; +/* L130: */ + } +L140: + return 0; +} /* drotm_ */ + diff --git a/blas/f2c/drotmg.c b/blas/f2c/drotmg.c new file mode 100644 index 000000000..a63eb1083 --- /dev/null +++ b/blas/f2c/drotmg.c @@ -0,0 +1,293 @@ +/* drotmg.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int drotmg_(doublereal *dd1, doublereal *dd2, doublereal * + dx1, doublereal *dy1, doublereal *dparam) +{ + /* Initialized data */ + + static doublereal zero = 0.; + static doublereal one = 1.; + static doublereal two = 2.; + static doublereal gam = 4096.; + static doublereal gamsq = 16777216.; + static doublereal rgamsq = 5.9604645e-8; + + /* Format strings */ + static char fmt_120[] = ""; + static char fmt_150[] = ""; + static char fmt_180[] = ""; + static char fmt_210[] = ""; + + /* System generated locals */ + doublereal d__1; + + /* Local variables */ + doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22; + integer igo; + doublereal dflag, dtemp; + + /* Assigned format variables */ + static char *igo_fmt; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ +/* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */ +/* DY1)**T. */ +/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ + +/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ +/* LOCATIONS 2-5 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */ +/* RESPECTIVELY. 
(VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */ +/* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */ + +/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ +/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ +/* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */ + + +/* Arguments */ +/* ========= */ + +/* DD1 (input/output) DOUBLE PRECISION */ + +/* DD2 (input/output) DOUBLE PRECISION */ + +/* DX1 (input/output) DOUBLE PRECISION */ + +/* DY1 (input) DOUBLE PRECISION */ + +/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */ +/* DPARAM(1)=DFLAG */ +/* DPARAM(2)=DH11 */ +/* DPARAM(3)=DH21 */ +/* DPARAM(4)=DH12 */ +/* DPARAM(5)=DH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Data statements .. */ + + /* Parameter adjustments */ + --dparam; + + /* Function Body */ +/* .. */ + if (! (*dd1 < zero)) { + goto L10; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L10: +/* CASE-DD1-NONNEGATIVE */ + dp2 = *dd2 * *dy1; + if (! (dp2 == zero)) { + goto L20; + } + dflag = -two; + goto L260; +/* REGULAR-CASE.. */ +L20: + dp1 = *dd1 * *dx1; + dq2 = dp2 * *dy1; + dq1 = dp1 * *dx1; + + if (! (abs(dq1) > abs(dq2))) { + goto L40; + } + dh21 = -(*dy1) / *dx1; + dh12 = dp2 / dp1; + + du = one - dh12 * dh21; + + if (! (du <= zero)) { + goto L30; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L30: + dflag = zero; + *dd1 /= du; + *dd2 /= du; + *dx1 *= du; +/* GO SCALE-CHECK.. */ + goto L100; +L40: + if (! (dq2 < zero)) { + goto L50; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L50: + dflag = one; + dh11 = dp1 / dp2; + dh22 = *dx1 / *dy1; + du = one + dh11 * dh22; + dtemp = *dd2 / du; + *dd2 = *dd1 / du; + *dd1 = dtemp; + *dx1 = *dy1 * du; +/* GO SCALE-CHECK */ + goto L100; +/* PROCEDURE..ZERO-H-D-AND-DX1.. */ +L60: + dflag = -one; + dh11 = zero; + dh12 = zero; + dh21 = zero; + dh22 = zero; + + *dd1 = zero; + *dd2 = zero; + *dx1 = zero; +/* RETURN.. */ + goto L220; +/* PROCEDURE..FIX-H.. */ +L70: + if (! (dflag >= zero)) { + goto L90; + } + + if (! (dflag == zero)) { + goto L80; + } + dh11 = one; + dh22 = one; + dflag = -one; + goto L90; +L80: + dh21 = -one; + dh12 = one; + dflag = -one; +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } +/* PROCEDURE..SCALE-CHECK */ +L100: +L110: + if (! (*dd1 <= rgamsq)) { + goto L130; + } + if (*dd1 == zero) { + goto L160; + } + igo = 0; + igo_fmt = fmt_120; +/* FIX-H.. */ + goto L70; +L120: +/* Computing 2nd power */ + d__1 = gam; + *dd1 *= d__1 * d__1; + *dx1 /= gam; + dh11 /= gam; + dh12 /= gam; + goto L110; +L130: +L140: + if (! (*dd1 >= gamsq)) { + goto L160; + } + igo = 1; + igo_fmt = fmt_150; +/* FIX-H.. */ + goto L70; +L150: +/* Computing 2nd power */ + d__1 = gam; + *dd1 /= d__1 * d__1; + *dx1 *= gam; + dh11 *= gam; + dh12 *= gam; + goto L140; +L160: +L170: + if (! (abs(*dd2) <= rgamsq)) { + goto L190; + } + if (*dd2 == zero) { + goto L220; + } + igo = 2; + igo_fmt = fmt_180; +/* FIX-H.. */ + goto L70; +L180: +/* Computing 2nd power */ + d__1 = gam; + *dd2 *= d__1 * d__1; + dh21 /= gam; + dh22 /= gam; + goto L170; +L190: +L200: + if (! (abs(*dd2) >= gamsq)) { + goto L220; + } + igo = 3; + igo_fmt = fmt_210; +/* FIX-H.. */ + goto L70; +L210: +/* Computing 2nd power */ + d__1 = gam; + *dd2 /= d__1 * d__1; + dh21 *= gam; + dh22 *= gam; + goto L200; +L220: + if (dflag < 0.) 
{ + goto L250; + } else if (dflag == 0) { + goto L230; + } else { + goto L240; + } +L230: + dparam[3] = dh21; + dparam[4] = dh12; + goto L260; +L240: + dparam[2] = dh11; + dparam[5] = dh22; + goto L260; +L250: + dparam[2] = dh11; + dparam[3] = dh21; + dparam[4] = dh12; + dparam[5] = dh22; +L260: + dparam[1] = dflag; + return 0; +} /* drotmg_ */ + diff --git a/blas/f2c/dsbmv.c b/blas/f2c/dsbmv.c new file mode 100644 index 000000000..c6b4b21d6 --- /dev/null +++ b/blas/f2c/dsbmv.c @@ -0,0 +1,366 @@ +/* dsbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dsbmv_(char *uplo, integer *n, integer *k, doublereal * + alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, + doublereal *beta, doublereal *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + doublereal temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DSBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - DOUBLE PRECISION. */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. 
*/ +/* The following program segment will transfer the upper */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - DOUBLE PRECISION. */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - DOUBLE PRECISION array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("DSBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0. && *beta == 1.)) { + return 0; + } + +/* Set up the start points in X and Y. 
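When INCX is negative the vector is traversed backwards, so its first logical element sits at position 1 - ( n - 1 )*INCX rather than at 1; KX and KY below record these start points for X and Y. 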
*/ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (*beta != 1.) { + if (*incy == 1) { + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. */ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L50: */ + } + y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * + temp2; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + y[j] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + y[j] += *alpha * temp2; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + y[jy] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of DSBMV . */ + +} /* dsbmv_ */ + diff --git a/blas/f2c/dspmv.c b/blas/f2c/dspmv.c new file mode 100644 index 000000000..0b4e92d5c --- /dev/null +++ b/blas/f2c/dspmv.c @@ -0,0 +1,316 @@ +/* dspmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dspmv_(char *uplo, integer *n, doublereal *alpha, + doublereal *ap, doublereal *x, integer *incx, doublereal *beta, + doublereal *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + doublereal temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DSPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - DOUBLE PRECISION. */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - DOUBLE PRECISION array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - DOUBLE PRECISION. */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. 
*/ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("DSPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0. && *beta == 1.)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (*beta != 1.) { + if (*incy == 1) { + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L50: */ + } + y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
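Here KK always indexes the packed position of the diagonal, AP( KK ) = a( j, j ); column j of the lower triangle occupies AP( KK ) through AP( KK + N - J ), which is why KK advances by N - J + 1 at the bottom of each loop. 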
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + y[j] += temp1 * ap[kk]; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L90: */ + } + y[j] += *alpha * temp2; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + y[jy] += temp1 * ap[kk]; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of DSPMV . */ + +} /* dspmv_ */ + diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c new file mode 100644 index 000000000..fdf73ebb5 --- /dev/null +++ b/blas/f2c/dtbmv.c @@ -0,0 +1,428 @@ +/* dtbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dtbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, doublereal *a, integer *lda, doublereal *x, integer *incx, + ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + doublereal temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := A'*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. 
*/ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer an upper */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer a lower */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that when DIAG = 'U' or 'u' the elements of the array A */ +/* corresponding to the diagonal elements of the matrix are not */ +/* referenced, but are assumed to be unity. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. On exit, X is overwritten with the */ +/* transformed vector x. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! 
lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("DTBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[j] != 0.) { + temp = x[j]; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L10: */ + } + if (nounit) { + x[j] *= a[kplus1 + j * a_dim1]; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[jx] != 0.) { + temp = x[jx]; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix += *incx; +/* L30: */ + } + if (nounit) { + x[jx] *= a[kplus1 + j * a_dim1]; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + if (x[j] != 0.) { + temp = x[j]; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L50: */ + } + if (nounit) { + x[j] *= a[j * a_dim1 + 1]; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + if (x[jx] != 0.) { + temp = x[jx]; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix -= *incx; +/* L70: */ + } + if (nounit) { + x[jx] *= a[j * a_dim1 + 1]; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x. 
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + temp = x[j]; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + x[j] = temp; +/* L100: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + temp = x[jx]; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix -= *incx; +/* L110: */ + } + x[jx] = temp; + jx -= *incx; +/* L120: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[j]; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L130: */ + } + x[j] = temp; +/* L140: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[jx]; + kx += *incx; + ix = kx; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; +/* L150: */ + } + x[jx] = temp; + jx += *incx; +/* L160: */ + } + } + } + } + + return 0; + +/* End of DTBMV . */ + +} /* dtbmv_ */ + diff --git a/blas/f2c/lsame.c b/blas/f2c/lsame.c new file mode 100644 index 000000000..46324d916 --- /dev/null +++ b/blas/f2c/lsame.c @@ -0,0 +1,117 @@ +/* lsame.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +logical lsame_(char *ca, char *cb, ftnlen ca_len, ftnlen cb_len) +{ + /* System generated locals */ + logical ret_val; + + /* Local variables */ + integer inta, intb, zcode; + + +/* -- LAPACK auxiliary routine (version 3.1) -- */ +/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ +/* November 2006 */ + +/* .. Scalar Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */ +/* case. */ + +/* Arguments */ +/* ========= */ + +/* CA (input) CHARACTER*1 */ + +/* CB (input) CHARACTER*1 */ +/* CA and CB specify the single characters to be compared. */ + +/* ===================================================================== */ + +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ + +/* Test if the characters are equal */ + + ret_val = *(unsigned char *)ca == *(unsigned char *)cb; + if (ret_val) { + return ret_val; + } + +/* Now test for equivalence if both characters are alphabetic. */ + + zcode = 'Z'; + +/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */ +/* machines, on which ICHAR returns a value with bit 8 set. 
*/ +/* ICHAR('A') on Prime machines returns 193 which is the same as */ +/* ICHAR('A') on an EBCDIC machine. */ + + inta = *(unsigned char *)ca; + intb = *(unsigned char *)cb; + + if (zcode == 90 || zcode == 122) { + +/* ASCII is assumed - ZCODE is the ASCII code of either lower or */ +/* upper case 'Z'. */ + + if (inta >= 97 && inta <= 122) { + inta += -32; + } + if (intb >= 97 && intb <= 122) { + intb += -32; + } + + } else if (zcode == 233 || zcode == 169) { + +/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ +/* upper case 'Z'. */ + + if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || + (inta >= 162 && inta <= 169)) { + inta += 64; + } + if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || + (intb >= 162 && intb <= 169)) { + intb += 64; + } + + } else if (zcode == 218 || zcode == 250) { + +/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */ +/* plus 128 of either lower or upper case 'Z'. */ + + if (inta >= 225 && inta <= 250) { + inta += -32; + } + if (intb >= 225 && intb <= 250) { + intb += -32; + } + } + ret_val = inta == intb; + +/* RETURN */ + +/* End of LSAME */ + + return ret_val; +} /* lsame_ */ + diff --git a/blas/f2c/r_cnjg.c b/blas/f2c/r_cnjg.c new file mode 100644 index 000000000..c08182f88 --- /dev/null +++ b/blas/f2c/r_cnjg.c @@ -0,0 +1,6 @@ +#include "datatypes.h" + +void r_cnjg(complex *r, complex *z) { + r->r = z->r; + r->i = -(z->i); +} diff --git a/blas/f2c/srotm.c b/blas/f2c/srotm.c new file mode 100644 index 000000000..bd5944a99 --- /dev/null +++ b/blas/f2c/srotm.c @@ -0,0 +1,216 @@ +/* srotm.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int srotm_(integer *n, real *sx, integer *incx, real *sy, + integer *incy, real *sparam) +{ + /* Initialized data */ + + static real zero = 0.f; + static real two = 2.f; + + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__; + real w, z__; + integer kx, ky; + real sh11, sh12, sh21, sh22, sflag; + integer nsteps; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */ + +/* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */ +/* (SY**T) */ + +/* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */ +/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */ +/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */ + +/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */ +/* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. 
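A minimal calling sketch (illustrative only; the data values are made up, and SFLAG = -2.E0 is the identity case from the table above): */
+/*        integer n = 2, inc = 1;                                     */
+/*        real sx[2] = {1.f, 2.f}, sy[2] = {3.f, 4.f};                */
+/*        real sparam[5] = {-2.f, 0.f, 0.f, 0.f, 0.f};                */
+/*        srotm_(&n, sx, &inc, sy, &inc, sparam);                     */
+/* With SFLAG = -2.E0 the call returns immediately, leaving SX and SY unchanged. 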
*/ + + +/* Arguments */ +/* ========= */ + +/* N (input) INTEGER */ +/* number of elements in input vector(s) */ + +/* SX (input/output) REAL array, dimension N */ +/* double precision vector with N elements */ + +/* INCX (input) INTEGER */ +/* storage spacing between elements of SX */ + +/* SY (input/output) REAL array, dimension N */ +/* double precision vector with N elements */ + +/* INCY (input) INTEGER */ +/* storage spacing between elements of SY */ + +/* SPARAM (input/output) REAL array, dimension 5 */ +/* SPARAM(1)=SFLAG */ +/* SPARAM(2)=SH11 */ +/* SPARAM(3)=SH21 */ +/* SPARAM(4)=SH12 */ +/* SPARAM(5)=SH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --sparam; + --sy; + --sx; + + /* Function Body */ +/* .. */ + + sflag = sparam[1]; + if (*n <= 0 || sflag + two == zero) { + goto L140; + } + if (! (*incx == *incy && *incx > 0)) { + goto L70; + } + + nsteps = *n * *incx; + if (sflag < 0.f) { + goto L50; + } else if (sflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + sh12 = sparam[4]; + sh21 = sparam[3]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w + z__ * sh12; + sy[i__] = w * sh21 + z__; +/* L20: */ + } + goto L140; +L30: + sh11 = sparam[2]; + sh22 = sparam[5]; + i__2 = nsteps; + i__1 = *incx; + for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w * sh11 + z__; + sy[i__] = -w + sh22 * z__; +/* L40: */ + } + goto L140; +L50: + sh11 = sparam[2]; + sh12 = sparam[4]; + sh21 = sparam[3]; + sh22 = sparam[5]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w * sh11 + z__ * sh12; + sy[i__] = w * sh21 + z__ * sh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (*incx < 0) { + kx = (1 - *n) * *incx + 1; + } + if (*incy < 0) { + ky = (1 - *n) * *incy + 1; + } + + if (sflag < 0.f) { + goto L120; + } else if (sflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + sh12 = sparam[4]; + sh21 = sparam[3]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w + z__ * sh12; + sy[ky] = w * sh21 + z__; + kx += *incx; + ky += *incy; +/* L90: */ + } + goto L140; +L100: + sh11 = sparam[2]; + sh22 = sparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w * sh11 + z__; + sy[ky] = -w + sh22 * z__; + kx += *incx; + ky += *incy; +/* L110: */ + } + goto L140; +L120: + sh11 = sparam[2]; + sh12 = sparam[4]; + sh21 = sparam[3]; + sh22 = sparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w * sh11 + z__ * sh12; + sy[ky] = w * sh21 + z__ * sh22; + kx += *incx; + ky += *incy; +/* L130: */ + } +L140: + return 0; +} /* srotm_ */ + diff --git a/blas/f2c/srotmg.c b/blas/f2c/srotmg.c new file mode 100644 index 000000000..75f789fe2 --- /dev/null +++ b/blas/f2c/srotmg.c @@ -0,0 +1,295 @@ +/* srotmg.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real + *sparam) +{ + /* Initialized data */ + + static real zero = 0.f; + static real one = 1.f; + static real two = 2.f; + static real gam = 4096.f; + static real gamsq = 16777200.f; + static real rgamsq = 5.96046e-8f; + + /* Format strings */ + static char fmt_120[] = ""; + static char fmt_150[] = ""; + static char fmt_180[] = ""; + static char fmt_210[] = ""; + + /* System generated locals */ + real r__1; + + /* Local variables */ + real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22; + integer igo; + real sflag, stemp; + + /* Assigned format variables */ + static char *igo_fmt; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ +/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */ +/* SY1)**T. */ +/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */ + +/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */ +/* LOCATIONS 2-5 OF SPARAM CONTAIN SH11, SH21, SH12, AND SH22 */ +/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */ +/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */ + +/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ +/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ +/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */ + + +/* Arguments */ +/* ========= */ + + +/* SD1 (input/output) REAL */ + +/* SD2 (input/output) REAL */ + +/* SX1 (input/output) REAL */ + +/* SY1 (input) REAL */ + + +/* SPARAM (input/output) REAL array, dimension 5 */ +/* SPARAM(1)=SFLAG */ +/* SPARAM(2)=SH11 */ +/* SPARAM(3)=SH21 */ +/* SPARAM(4)=SH12 */ +/* SPARAM(5)=SH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Data statements .. */ + + /* Parameter adjustments */ + --sparam; + + /* Function Body */ +/* .. */ + if (! (*sd1 < zero)) { + goto L10; + } +/* GO ZERO-H-D-AND-SX1.. */ + goto L60; +L10: +/* CASE-SD1-NONNEGATIVE */ + sp2 = *sd2 * *sy1; + if (! (sp2 == zero)) { + goto L20; + } + sflag = -two; + goto L260; +/* REGULAR-CASE.. */ +L20: + sp1 = *sd1 * *sx1; + sq2 = sp2 * *sy1; + sq1 = sp1 * *sx1; + + if (! (dabs(sq1) > dabs(sq2))) { + goto L40; + } + sh21 = -(*sy1) / *sx1; + sh12 = sp2 / sp1; + + su = one - sh12 * sh21; + + if (! (su <= zero)) { + goto L30; + } +/* GO ZERO-H-D-AND-SX1.. */ + goto L60; +L30: + sflag = zero; + *sd1 /= su; + *sd2 /= su; + *sx1 *= su; +/* GO SCALE-CHECK.. */ + goto L100; +L40: + if (! (sq2 < zero)) { + goto L50; + } +/* GO ZERO-H-D-AND-SX1.. 
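(this branch is taken when SQ2 = SD2*SY1**2 is negative, so the update would require a negative scale; H, SD1, SD2 and SX1 are therefore all zeroed at L60) 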
*/ + goto L60; +L50: + sflag = one; + sh11 = sp1 / sp2; + sh22 = *sx1 / *sy1; + su = one + sh11 * sh22; + stemp = *sd2 / su; + *sd2 = *sd1 / su; + *sd1 = stemp; + *sx1 = *sy1 * su; +/* GO SCALE-CHECK */ + goto L100; +/* PROCEDURE..ZERO-H-D-AND-SX1.. */ +L60: + sflag = -one; + sh11 = zero; + sh12 = zero; + sh21 = zero; + sh22 = zero; + + *sd1 = zero; + *sd2 = zero; + *sx1 = zero; +/* RETURN.. */ + goto L220; +/* PROCEDURE..FIX-H.. */ +L70: + if (! (sflag >= zero)) { + goto L90; + } + + if (! (sflag == zero)) { + goto L80; + } + sh11 = one; + sh22 = one; + sflag = -one; + goto L90; +L80: + sh21 = -one; + sh12 = one; + sflag = -one; +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } +/* PROCEDURE..SCALE-CHECK */ +L100: +L110: + if (! (*sd1 <= rgamsq)) { + goto L130; + } + if (*sd1 == zero) { + goto L160; + } + igo = 0; + igo_fmt = fmt_120; +/* FIX-H.. */ + goto L70; +L120: +/* Computing 2nd power */ + r__1 = gam; + *sd1 *= r__1 * r__1; + *sx1 /= gam; + sh11 /= gam; + sh12 /= gam; + goto L110; +L130: +L140: + if (! (*sd1 >= gamsq)) { + goto L160; + } + igo = 1; + igo_fmt = fmt_150; +/* FIX-H.. */ + goto L70; +L150: +/* Computing 2nd power */ + r__1 = gam; + *sd1 /= r__1 * r__1; + *sx1 *= gam; + sh11 *= gam; + sh12 *= gam; + goto L140; +L160: +L170: + if (! (dabs(*sd2) <= rgamsq)) { + goto L190; + } + if (*sd2 == zero) { + goto L220; + } + igo = 2; + igo_fmt = fmt_180; +/* FIX-H.. */ + goto L70; +L180: +/* Computing 2nd power */ + r__1 = gam; + *sd2 *= r__1 * r__1; + sh21 /= gam; + sh22 /= gam; + goto L170; +L190: +L200: + if (! (dabs(*sd2) >= gamsq)) { + goto L220; + } + igo = 3; + igo_fmt = fmt_210; +/* FIX-H.. */ + goto L70; +L210: +/* Computing 2nd power */ + r__1 = gam; + *sd2 /= r__1 * r__1; + sh21 *= gam; + sh22 *= gam; + goto L200; +L220: + if (sflag < 0.f) { + goto L250; + } else if (sflag == 0) { + goto L230; + } else { + goto L240; + } +L230: + sparam[3] = sh21; + sparam[4] = sh12; + goto L260; +L240: + sparam[2] = sh11; + sparam[5] = sh22; + goto L260; +L250: + sparam[2] = sh11; + sparam[3] = sh21; + sparam[4] = sh12; + sparam[5] = sh22; +L260: + sparam[1] = sflag; + return 0; +} /* srotmg_ */ + diff --git a/blas/f2c/ssbmv.c b/blas/f2c/ssbmv.c new file mode 100644 index 000000000..8599325f2 --- /dev/null +++ b/blas/f2c/ssbmv.c @@ -0,0 +1,368 @@ +/* ssbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ssbmv_(char *uplo, integer *n, integer *k, real *alpha, + real *a, integer *lda, real *x, integer *incx, real *beta, real *y, + integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + real temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. 
*/ + +/* Purpose */ +/* ======= */ + +/* SSBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - REAL . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - REAL array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - REAL array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - REAL . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - REAL array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. 
*/ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("SSBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (*beta != 1.f) { + if (*incy == 1) { + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.f) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. */ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L50: */ + } + y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * + temp2; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
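In lower band storage the diagonal a( j, j ) lives in row 1 of the array, i.e. A( 1, j ); with L = 1 - J the element a( i, j ) is then found at A( L + I, J ) = A( I - J + 1, J ). 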
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + y[j] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + y[j] += *alpha * temp2; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + y[jy] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of SSBMV . */ + +} /* ssbmv_ */ + diff --git a/blas/f2c/sspmv.c b/blas/f2c/sspmv.c new file mode 100644 index 000000000..47858ec6c --- /dev/null +++ b/blas/f2c/sspmv.c @@ -0,0 +1,316 @@ +/* sspmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int sspmv_(char *uplo, integer *n, real *alpha, real *ap, + real *x, integer *incx, real *beta, real *y, integer *incy, ftnlen + uplo_len) +{ + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + real temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* SSPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - REAL . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - REAL array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. 
*/ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Unchanged on exit. */ + +/* X - REAL array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - REAL . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - REAL array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("SSPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (*beta != 1.f) { + if (*incy == 1) { + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.f) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
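*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: with the upper triangle packed by columns, */
+/*     a( i, j ), i .le. j, is stored at AP( i + ( j - 1 )*j/2 ). The */
+/*     loops below never evaluate that formula; they keep kk at the */
+/*     start of packed column j, so kk = 1 + ( j - 1 )*j/2, and step k */
+/*     once per row, which is equivalent to: */
+
+/*           K = KK */
+/*           DO 10, I = 1, J */
+/*              AP( K ) holds a( I, J ) */
+/*              K = K + 1 */
+/*    10     CONTINUE */
+/*           KK = KK + J */
+
+/* (end of editor's note)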
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L50: */ + } + y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + y[j] += temp1 * ap[kk]; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L90: */ + } + y[j] += *alpha * temp2; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + y[jy] += temp1 * ap[kk]; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of SSPMV . */ + +} /* sspmv_ */ + diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c new file mode 100644 index 000000000..fcf9ce336 --- /dev/null +++ b/blas/f2c/stbmv.c @@ -0,0 +1,428 @@ +/* stbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int stbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, real *a, integer *lda, real *x, integer *incx, ftnlen + uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + real temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* STBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. 
*/
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := A'*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry with UPLO = 'U' or 'u', K specifies the number of */
+/*           super-diagonals of the matrix A. */
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - REAL array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - REAL array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office.
*/ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("STBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[j] != 0.f) { + temp = x[j]; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L10: */ + } + if (nounit) { + x[j] *= a[kplus1 + j * a_dim1]; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[jx] != 0.f) { + temp = x[jx]; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix += *incx; +/* L30: */ + } + if (nounit) { + x[jx] *= a[kplus1 + j * a_dim1]; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + if (x[j] != 0.f) { + temp = x[j]; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L50: */ + } + if (nounit) { + x[j] *= a[j * a_dim1 + 1]; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + if (x[jx] != 0.f) { + temp = x[jx]; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix -= *incx; +/* L70: */ + } + if (nounit) { + x[jx] *= a[j * a_dim1 + 1]; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x. 
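*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: the product is formed in place, so the */
+/*     traversal order matters. For the upper triangle, the new x( j ) */
+/*     reads only old x( i ) with i .le. j, so j must run from n down */
+/*     to 1 here; the non-transposed loops above ascend for the mirror */
+/*     reason. In band storage each step amounts to: */
+
+/*           TEMP = X( J ) */
+/*           IF( NOUNIT ) TEMP = TEMP*A( K + 1, J ) */
+/*           DO 10, I = J - 1, MAX( 1, J - K ), -1 */
+/*              TEMP = TEMP + A( K + 1 + I - J, J )*X( I ) */
+/*    10     CONTINUE */
+/*           X( J ) = TEMP */
+
+/* (end of editor's note)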
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + temp = x[j]; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + x[j] = temp; +/* L100: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + temp = x[jx]; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix -= *incx; +/* L110: */ + } + x[jx] = temp; + jx -= *incx; +/* L120: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[j]; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L130: */ + } + x[j] = temp; +/* L140: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[jx]; + kx += *incx; + ix = kx; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; +/* L150: */ + } + x[jx] = temp; + jx += *incx; +/* L160: */ + } + } + } + } + + return 0; + +/* End of STBMV . */ + +} /* stbmv_ */ + diff --git a/blas/f2c/zhbmv.c b/blas/f2c/zhbmv.c new file mode 100644 index 000000000..42da13dbb --- /dev/null +++ b/blas/f2c/zhbmv.c @@ -0,0 +1,488 @@ +/* zhbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int zhbmv_(char *uplo, integer *n, integer *k, doublecomplex + *alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer * + incx, doublecomplex *beta, doublecomplex *y, integer *incy, ftnlen + uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + doublecomplex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZHBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. 
*/ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX*16 . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - COMPLEX*16 array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX*16 . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX*16 array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. 
*/ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("ZHBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && + beta->i == 0.))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (beta->r != 1. || beta->i != 0.) { + if (*incy == 1) { + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0., y[i__2].i = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0., y[i__2].i = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0. && alpha->i == 0.) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. 
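*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: f2c lowers every complex operation into */
+/*     explicit arithmetic on the .r/.i fields, so each cluster of z__ */
+/*     temporaries below is one complex expression. In the notation of */
+/*     the Fortran original the inner loop is simply: */
+
+/*           Y( I )  = Y( I ) + TEMP1*A( L + I, J ) */
+/*           TEMP2   = TEMP2  + DCONJG( A( L + I, J ) )*X( I ) */
+
+/*     with d_cnjg as the f2c spelling of DCONJG. On the diagonal only */
+/*     the real part is read ( d__1 = a[...].r ), since the imaginary */
+/*     parts of the diagonal are assumed to be zero. */
+
+/* (end of editor's note)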
*/ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__2 = i__; + z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i = + z__3.r * x[i__2].i + z__3.i * x[i__2].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L50: */ + } + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + d__1 = a[i__3].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i = + alpha->r * x[i__4].i + alpha->i * x[i__4].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__3 = jy; + i__4 = jy; + i__2 = kplus1 + j * a_dim1; + d__1 = a[i__2].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = j; + z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__3 = j; + i__4 = j; + i__2 = j * a_dim1 + 1; + d__1 = a[i__2].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__2 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L90: */ + } + i__3 = j; + i__4 = j; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = jx; + z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__3 = jy; + i__4 = jy; + i__2 = j * a_dim1 + 1; + d__1 = a[i__2].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L110: */ + } + i__3 = jy; + i__4 = jy; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of ZHBMV . */ + +} /* zhbmv_ */ + diff --git a/blas/f2c/zhpmv.c b/blas/f2c/zhpmv.c new file mode 100644 index 000000000..fbe2f42b3 --- /dev/null +++ b/blas/f2c/zhpmv.c @@ -0,0 +1,438 @@ +/* zhpmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int zhpmv_(char *uplo, integer *n, doublecomplex *alpha, + doublecomplex *ap, doublecomplex *x, integer *incx, doublecomplex * + beta, doublecomplex *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + doublecomplex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZHPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX*16 . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - COMPLEX*16 array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* X - COMPLEX*16 array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX*16 . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX*16 array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). 
*/ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("ZHPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && + beta->i == 0.))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (beta->r != 1. || beta->i != 0.) { + if (*incy == 1) { + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0., y[i__2].i = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0., y[i__2].i = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0. && alpha->i == 0.) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
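*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: kk marks the start of packed column j, so */
+/*     k = kk, ..., kk + j - 2 walks a( 1, j ) through a( j - 1, j ) and */
+/*     ap[kk + j - 1] is the diagonal a( j, j ). As in ZHBMV, only the */
+/*     real part of the diagonal is used and the conjugated mirror term */
+/*     is accumulated as TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ). */
+
+/* (end of editor's note)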
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = i__; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ++k; +/* L50: */ + } + i__2 = j; + i__3 = j; + i__4 = kk + j - 1; + d__1 = ap[i__4].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + i__3 = iy; + i__4 = iy; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = ix; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__2 = jy; + i__3 = jy; + i__4 = kk + j - 1; + d__1 = ap[i__4].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__2 = j; + i__3 = j; + i__4 = kk; + d__1 = ap[i__4].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = i__; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ++k; +/* L90: */ + } + i__2 = j; + i__3 = j; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__2 = jy; + i__3 = jy; + i__4 = kk; + d__1 = ap[i__4].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + i__3 = iy; + i__4 = iy; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = ix; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L110: */ + } + i__2 = jy; + i__3 = jy; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of ZHPMV . */ + +} /* zhpmv_ */ + diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c new file mode 100644 index 000000000..4cdcd7f88 --- /dev/null +++ b/blas/f2c/ztbmv.c @@ -0,0 +1,647 @@ +/* ztbmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ztbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, doublecomplex *a, integer *lda, doublecomplex *x, integer + *incx, ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + doublecomplex z__1, z__2, z__3; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + doublecomplex temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical noconj, nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := conjg( A' )*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. */ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. 
*/
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX*16 array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+            ftnlen)1, (ftnlen)1)) {
+        info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
+            "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
+            ftnlen)1)) {
+        info = 2;
+    } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
+            "N", (ftnlen)1, (ftnlen)1)) {
+        info = 3;
+    } else if (*n < 0) {
+        info = 4;
+    } else if (*k < 0) {
+        info = 5;
+    } else if (*lda < *k + 1) {
+        info = 7;
+    } else if (*incx == 0) {
+        info = 9;
+    }
+    if (info != 0) {
+        xerbla_("ZTBMV ", &info, (ftnlen)6);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+        return 0;
+    }
+
+    noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be ( N - 1 )*INCX too small for descending loops.
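*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: when INCX < 0 the vector runs backwards */
+/*     through memory, so logical element i of x sits at */
+/*     KX + ( I - 1 )*INCX with KX = 1 - ( N - 1 )*INCX, while for */
+/*     INCX > 0 the same formula holds with KX = 1. The descending */
+/*     loops further down first add ( N - 1 )*INCX back onto kx so that */
+/*     it points at logical element n before stepping downwards. */
+
+/* (end of editor's note)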
*/ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + if (x[i__2].r != 0. || x[i__2].i != 0.) { + i__2 = j; + temp.r = x[i__2].r, temp.i = x[i__2].i; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + z__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + + z__2.i; + x[i__2].r = z__1.r, x[i__2].i = z__1.i; +/* L10: */ + } + if (nounit) { + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[ + i__3].i, z__1.i = x[i__2].r * a[i__3].i + + x[i__2].i * a[i__3].r; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + if (x[i__4].r != 0. || x[i__4].i != 0.) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = ix; + i__2 = ix; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + z__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + + z__2.i; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + ix += *incx; +/* L30: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__2 = kplus1 + j * a_dim1; + z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[ + i__2].i, z__1.i = x[i__4].r * a[i__2].i + + x[i__4].i * a[i__2].r; + x[i__3].r = z__1.r, x[i__3].i = z__1.i; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__1 = j; + if (x[i__1].r != 0. || x[i__1].i != 0.) { + i__1 = j; + temp.r = x[i__1].r, temp.i = x[i__1].i; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + i__1 = i__; + i__3 = i__; + i__2 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + z__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + + z__2.i; + x[i__1].r = z__1.r, x[i__1].i = z__1.i; +/* L50: */ + } + if (nounit) { + i__4 = j; + i__1 = j; + i__3 = j * a_dim1 + 1; + z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[ + i__3].i, z__1.i = x[i__1].r * a[i__3].i + + x[i__1].i * a[i__3].r; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__4 = jx; + if (x[i__4].r != 0. || x[i__4].i != 0.) 
{ + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + i__4 = ix; + i__1 = ix; + i__2 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + z__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + + z__2.i; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + ix -= *incx; +/* L70: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__1 = j * a_dim1 + 1; + z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[ + i__1].i, z__1.i = x[i__4].r * a[i__1].i + + x[i__4].i * a[i__1].r; + x[i__3].r = z__1.r, x[i__3].i = z__1.i; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x or x := conjg( A' )*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__3 = j; + temp.r = x[i__3].r, temp.i = x[i__3].i; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + z__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = i__; + z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, z__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L90: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, + z__2.i = z__3.r * x[i__4].i + z__3.i * x[ + i__4].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L100: */ + } + } + i__3 = j; + x[i__3].r = temp.r, x[i__3].i = temp.i; +/* L110: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__3 = jx; + temp.r = x[i__3].r, temp.i = x[i__3].i; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + z__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = ix; + z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, z__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix -= *incx; +/* L120: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, + z__2.i = 
z__3.r * x[i__4].i + z__3.i * x[ + i__4].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix -= *incx; +/* L130: */ + } + } + i__3 = jx; + x[i__3].r = temp.r, x[i__3].i = temp.i; + jx -= *incx; +/* L140: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = j; + temp.r = x[i__4].r, temp.i = x[i__4].i; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + z__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = i__; + z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, z__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L150: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[j * a_dim1 + 1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__1 = i__; + z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, + z__2.i = z__3.r * x[i__1].i + z__3.i * x[ + i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L160: */ + } + } + i__4 = j; + x[i__4].r = temp.r, x[i__4].i = temp.i; +/* L170: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + kx += *incx; + ix = kx; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + z__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = ix; + z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, z__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix += *incx; +/* L180: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[j * a_dim1 + 1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__1 = ix; + z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, + z__2.i = z__3.r * x[i__1].i + z__3.i * x[ + i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix += *incx; +/* L190: */ + } + } + i__4 = jx; + x[i__4].r = temp.r, x[i__4].i = temp.i; + jx += *incx; +/* L200: */ + } + } + } + } + + return 0; + +/* End of ZTBMV . 
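(a mechanical f2c-style translation of the reference Fortran ZTBMV, whose source blas/ztbmv.f is deleted further below)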
*/ + +} /* ztbmv_ */ + diff --git a/blas/complexdots.f b/blas/fortran/complexdots.f index a7da51d16..a7da51d16 100644 --- a/blas/complexdots.f +++ b/blas/fortran/complexdots.f diff --git a/blas/level2_impl.h b/blas/level2_impl.h index 233c7b753..e604fe611 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -9,6 +9,20 @@ #include "common.h" +template<typename Index, typename Scalar, int StorageOrder, bool ConjugateLhs, bool ConjugateRhs> +struct general_matrix_vector_product_wrapper +{ + static void run(Index rows, Index cols,const Scalar *lhs, Index lhsStride, const Scalar *rhs, Index rhsIncr, Scalar* res, Index resIncr, Scalar alpha) + { + typedef internal::const_blas_data_mapper<Scalar,Index,StorageOrder> LhsMapper; + typedef internal::const_blas_data_mapper<Scalar,Index,RowMajor> RhsMapper; + + internal::general_matrix_vector_product + <Index,Scalar,LhsMapper,StorageOrder,ConjugateLhs,Scalar,RhsMapper,ConjugateRhs>::run( + rows, cols, LhsMapper(lhs, lhsStride), RhsMapper(rhs, rhsIncr), res, resIncr, alpha); + } +}; + int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc) { typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar); @@ -20,9 +34,9 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca for(int k=0; k<4; ++k) func[k] = 0; - func[NOTR] = (internal::general_matrix_vector_product<int,Scalar,ColMajor,false,Scalar,false>::run); - func[TR ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,false,Scalar,false>::run); - func[ADJ ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,Conj, Scalar,false>::run); + func[NOTR] = (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run); + func[TR ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run); + func[ADJ ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run); init = true; } diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,true); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m); + 
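/* note: the two new trailing constructor arguments are presumably the thread count (1) and an l3-blocking flag */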
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7; - else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 9; + else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 9; else if(*ldc<std::max(1,*n)) info = 12; if(info) return xerbla_(SCALAR_SUFFIX_UP"HER2K",&info,6); diff --git a/blas/lsame.f b/blas/lsame.f deleted file mode 100644 index f53690268..000000000 --- a/blas/lsame.f +++ /dev/null @@ -1,85 +0,0 @@ - LOGICAL FUNCTION LSAME(CA,CB) -* -* -- LAPACK auxiliary routine (version 3.1) -- -* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. -* November 2006 -* -* .. Scalar Arguments .. - CHARACTER CA,CB -* .. -* -* Purpose -* ======= -* -* LSAME returns .TRUE. if CA is the same letter as CB regardless of -* case. -* -* Arguments -* ========= -* -* CA (input) CHARACTER*1 -* -* CB (input) CHARACTER*1 -* CA and CB specify the single characters to be compared. -* -* ===================================================================== -* -* .. Intrinsic Functions .. - INTRINSIC ICHAR -* .. -* .. Local Scalars .. - INTEGER INTA,INTB,ZCODE -* .. -* -* Test if the characters are equal -* - LSAME = CA .EQ. CB - IF (LSAME) RETURN -* -* Now test for equivalence if both characters are alphabetic. -* - ZCODE = ICHAR('Z') -* -* Use 'Z' rather than 'A' so that ASCII can be detected on Prime -* machines, on which ICHAR returns a value with bit 8 set. -* ICHAR('A') on Prime machines returns 193 which is the same as -* ICHAR('A') on an EBCDIC machine. -* - INTA = ICHAR(CA) - INTB = ICHAR(CB) -* - IF (ZCODE.EQ.90 .OR. ZCODE.EQ.122) THEN -* -* ASCII is assumed - ZCODE is the ASCII code of either lower or -* upper case 'Z'. -* - IF (INTA.GE.97 .AND. INTA.LE.122) INTA = INTA - 32 - IF (INTB.GE.97 .AND. INTB.LE.122) INTB = INTB - 32 -* - ELSE IF (ZCODE.EQ.233 .OR. ZCODE.EQ.169) THEN -* -* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or -* upper case 'Z'. -* - IF (INTA.GE.129 .AND. INTA.LE.137 .OR. - + INTA.GE.145 .AND. INTA.LE.153 .OR. - + INTA.GE.162 .AND. INTA.LE.169) INTA = INTA + 64 - IF (INTB.GE.129 .AND. INTB.LE.137 .OR. - + INTB.GE.145 .AND. INTB.LE.153 .OR. - + INTB.GE.162 .AND. INTB.LE.169) INTB = INTB + 64 -* - ELSE IF (ZCODE.EQ.218 .OR. ZCODE.EQ.250) THEN -* -* ASCII is assumed, on Prime machines - ZCODE is the ASCII code -* plus 128 of either lower or upper case 'Z'. -* - IF (INTA.GE.225 .AND. INTA.LE.250) INTA = INTA - 32 - IF (INTB.GE.225 .AND. INTB.LE.250) INTB = INTB - 32 - END IF - LSAME = INTA .EQ. INTB -* -* RETURN -* -* End of LSAME -* - END diff --git a/blas/srotm.f b/blas/srotm.f deleted file mode 100644 index fc5a59333..000000000 --- a/blas/srotm.f +++ /dev/null @@ -1,148 +0,0 @@ - SUBROUTINE SROTM(N,SX,INCX,SY,INCY,SPARAM) -* .. Scalar Arguments .. - INTEGER INCX,INCY,N -* .. -* .. Array Arguments .. - REAL SPARAM(5),SX(*),SY(*) -* .. 
-* -* Purpose -* ======= -* -* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX -* -* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN -* (DX**T) -* -* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE -* LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. -* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 -* -* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) -* H=( ) ( ) ( ) ( ) -* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). -* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. -* -* -* Arguments -* ========= -* -* N (input) INTEGER -* number of elements in input vector(s) -* -* SX (input/output) REAL array, dimension N -* double precision vector with N elements -* -* INCX (input) INTEGER -* storage spacing between elements of SX -* -* SY (input/output) REAL array, dimension N -* double precision vector with N elements -* -* INCY (input) INTEGER -* storage spacing between elements of SY -* -* SPARAM (input/output) REAL array, dimension 5 -* SPARAM(1)=SFLAG -* SPARAM(2)=SH11 -* SPARAM(3)=SH21 -* SPARAM(4)=SH12 -* SPARAM(5)=SH22 -* -* ===================================================================== -* -* .. Local Scalars .. - REAL SFLAG,SH11,SH12,SH21,SH22,TWO,W,Z,ZERO - INTEGER I,KX,KY,NSTEPS -* .. -* .. Data statements .. - DATA ZERO,TWO/0.E0,2.E0/ -* .. -* - SFLAG = SPARAM(1) - IF (N.LE.0 .OR. (SFLAG+TWO.EQ.ZERO)) GO TO 140 - IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70 -* - NSTEPS = N*INCX - IF (SFLAG) 50,10,30 - 10 CONTINUE - SH12 = SPARAM(4) - SH21 = SPARAM(3) - DO 20 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W + Z*SH12 - SY(I) = W*SH21 + Z - 20 CONTINUE - GO TO 140 - 30 CONTINUE - SH11 = SPARAM(2) - SH22 = SPARAM(5) - DO 40 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W*SH11 + Z - SY(I) = -W + SH22*Z - 40 CONTINUE - GO TO 140 - 50 CONTINUE - SH11 = SPARAM(2) - SH12 = SPARAM(4) - SH21 = SPARAM(3) - SH22 = SPARAM(5) - DO 60 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W*SH11 + Z*SH12 - SY(I) = W*SH21 + Z*SH22 - 60 CONTINUE - GO TO 140 - 70 CONTINUE - KX = 1 - KY = 1 - IF (INCX.LT.0) KX = 1 + (1-N)*INCX - IF (INCY.LT.0) KY = 1 + (1-N)*INCY -* - IF (SFLAG) 120,80,100 - 80 CONTINUE - SH12 = SPARAM(4) - SH21 = SPARAM(3) - DO 90 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W + Z*SH12 - SY(KY) = W*SH21 + Z - KX = KX + INCX - KY = KY + INCY - 90 CONTINUE - GO TO 140 - 100 CONTINUE - SH11 = SPARAM(2) - SH22 = SPARAM(5) - DO 110 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W*SH11 + Z - SY(KY) = -W + SH22*Z - KX = KX + INCX - KY = KY + INCY - 110 CONTINUE - GO TO 140 - 120 CONTINUE - SH11 = SPARAM(2) - SH12 = SPARAM(4) - SH21 = SPARAM(3) - SH22 = SPARAM(5) - DO 130 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W*SH11 + Z*SH12 - SY(KY) = W*SH21 + Z*SH22 - KX = KX + INCX - KY = KY + INCY - 130 CONTINUE - 140 CONTINUE - RETURN - END diff --git a/blas/srotmg.f b/blas/srotmg.f deleted file mode 100644 index 7b3bd4272..000000000 --- a/blas/srotmg.f +++ /dev/null @@ -1,208 +0,0 @@ - SUBROUTINE SROTMG(SD1,SD2,SX1,SY1,SPARAM) -* .. Scalar Arguments .. - REAL SD1,SD2,SX1,SY1 -* .. -* .. Array Arguments .. - REAL SPARAM(5) -* .. -* -* Purpose -* ======= -* -* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS -* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* -* SY2)**T. -* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. 
-* -* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 -* -* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) -* H=( ) ( ) ( ) ( ) -* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). -* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 -* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE -* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) -* -* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE -* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE -* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. -* -* -* Arguments -* ========= -* -* -* SD1 (input/output) REAL -* -* SD2 (input/output) REAL -* -* SX1 (input/output) REAL -* -* SY1 (input) REAL -* -* -* SPARAM (input/output) REAL array, dimension 5 -* SPARAM(1)=SFLAG -* SPARAM(2)=SH11 -* SPARAM(3)=SH21 -* SPARAM(4)=SH12 -* SPARAM(5)=SH22 -* -* ===================================================================== -* -* .. Local Scalars .. - REAL GAM,GAMSQ,ONE,RGAMSQ,SFLAG,SH11,SH12,SH21,SH22,SP1,SP2,SQ1, - + SQ2,STEMP,SU,TWO,ZERO - INTEGER IGO -* .. -* .. Intrinsic Functions .. - INTRINSIC ABS -* .. -* .. Data statements .. -* - DATA ZERO,ONE,TWO/0.E0,1.E0,2.E0/ - DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/ -* .. - - IF (.NOT.SD1.LT.ZERO) GO TO 10 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 10 CONTINUE -* CASE-SD1-NONNEGATIVE - SP2 = SD2*SY1 - IF (.NOT.SP2.EQ.ZERO) GO TO 20 - SFLAG = -TWO - GO TO 260 -* REGULAR-CASE.. - 20 CONTINUE - SP1 = SD1*SX1 - SQ2 = SP2*SY1 - SQ1 = SP1*SX1 -* - IF (.NOT.ABS(SQ1).GT.ABS(SQ2)) GO TO 40 - SH21 = -SY1/SX1 - SH12 = SP2/SP1 -* - SU = ONE - SH12*SH21 -* - IF (.NOT.SU.LE.ZERO) GO TO 30 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 30 CONTINUE - SFLAG = ZERO - SD1 = SD1/SU - SD2 = SD2/SU - SX1 = SX1*SU -* GO SCALE-CHECK.. - GO TO 100 - 40 CONTINUE - IF (.NOT.SQ2.LT.ZERO) GO TO 50 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 50 CONTINUE - SFLAG = ONE - SH11 = SP1/SP2 - SH22 = SX1/SY1 - SU = ONE + SH11*SH22 - STEMP = SD2/SU - SD2 = SD1/SU - SD1 = STEMP - SX1 = SY1*SU -* GO SCALE-CHECK - GO TO 100 -* PROCEDURE..ZERO-H-D-AND-SX1.. - 60 CONTINUE - SFLAG = -ONE - SH11 = ZERO - SH12 = ZERO - SH21 = ZERO - SH22 = ZERO -* - SD1 = ZERO - SD2 = ZERO - SX1 = ZERO -* RETURN.. - GO TO 220 -* PROCEDURE..FIX-H.. - 70 CONTINUE - IF (.NOT.SFLAG.GE.ZERO) GO TO 90 -* - IF (.NOT.SFLAG.EQ.ZERO) GO TO 80 - SH11 = ONE - SH22 = ONE - SFLAG = -ONE - GO TO 90 - 80 CONTINUE - SH21 = -ONE - SH12 = ONE - SFLAG = -ONE - 90 CONTINUE - GO TO IGO(120,150,180,210) -* PROCEDURE..SCALE-CHECK - 100 CONTINUE - 110 CONTINUE - IF (.NOT.SD1.LE.RGAMSQ) GO TO 130 - IF (SD1.EQ.ZERO) GO TO 160 - ASSIGN 120 TO IGO -* FIX-H.. - GO TO 70 - 120 CONTINUE - SD1 = SD1*GAM**2 - SX1 = SX1/GAM - SH11 = SH11/GAM - SH12 = SH12/GAM - GO TO 110 - 130 CONTINUE - 140 CONTINUE - IF (.NOT.SD1.GE.GAMSQ) GO TO 160 - ASSIGN 150 TO IGO -* FIX-H.. - GO TO 70 - 150 CONTINUE - SD1 = SD1/GAM**2 - SX1 = SX1*GAM - SH11 = SH11*GAM - SH12 = SH12*GAM - GO TO 140 - 160 CONTINUE - 170 CONTINUE - IF (.NOT.ABS(SD2).LE.RGAMSQ) GO TO 190 - IF (SD2.EQ.ZERO) GO TO 220 - ASSIGN 180 TO IGO -* FIX-H.. - GO TO 70 - 180 CONTINUE - SD2 = SD2*GAM**2 - SH21 = SH21/GAM - SH22 = SH22/GAM - GO TO 170 - 190 CONTINUE - 200 CONTINUE - IF (.NOT.ABS(SD2).GE.GAMSQ) GO TO 220 - ASSIGN 210 TO IGO -* FIX-H.. 
- GO TO 70 - 210 CONTINUE - SD2 = SD2/GAM**2 - SH21 = SH21*GAM - SH22 = SH22*GAM - GO TO 200 - 220 CONTINUE - IF (SFLAG) 250,230,240 - 230 CONTINUE - SPARAM(3) = SH21 - SPARAM(4) = SH12 - GO TO 260 - 240 CONTINUE - SPARAM(2) = SH11 - SPARAM(5) = SH22 - GO TO 260 - 250 CONTINUE - SPARAM(2) = SH11 - SPARAM(3) = SH21 - SPARAM(4) = SH12 - SPARAM(5) = SH22 - 260 CONTINUE - SPARAM(1) = SFLAG - RETURN - END diff --git a/blas/ssbmv.f b/blas/ssbmv.f deleted file mode 100644 index 16893a295..000000000 --- a/blas/ssbmv.f +++ /dev/null @@ -1,306 +0,0 @@ - SUBROUTINE SSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - REAL ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - REAL A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* SSBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - REAL . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - REAL array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - REAL array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. 
-* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - REAL . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - REAL array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ONE,ZERO - PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('SSBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*A(1,J) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*A(1,J) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of SSBMV . -* - END diff --git a/blas/sspmv.f b/blas/sspmv.f deleted file mode 100644 index 0b8449824..000000000 --- a/blas/sspmv.f +++ /dev/null @@ -1,265 +0,0 @@ - SUBROUTINE SSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - REAL ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - REAL AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* SSPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - REAL . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - REAL array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Unchanged on exit. -* -* X - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - REAL . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. 
-* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ONE,ZERO - PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('SSPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*AP(KK) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*AP(KK) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of SSPMV . -* - END diff --git a/blas/stbmv.f b/blas/stbmv.f deleted file mode 100644 index c0b8f1136..000000000 --- a/blas/stbmv.f +++ /dev/null @@ -1,335 +0,0 @@ - SUBROUTINE STBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - REAL A(LDA,*),X(*) -* .. 
-* -* Purpose -* ======= -* -* STBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := A'*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - REAL array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. 
-* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ZERO - PARAMETER (ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('STBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 100 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - X(J) = TEMP - 100 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 120 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 110 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 110 CONTINUE - X(JX) = TEMP - JX = JX - INCX - 120 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 140 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 130 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 130 CONTINUE - X(J) = TEMP - 140 CONTINUE - ELSE - JX = KX - DO 160 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 150 CONTINUE - X(JX) = TEMP - JX = JX + INCX - 160 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of STBMV . -* - END diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index 0422f79b7..8775b88cd 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -1,5 +1,5 @@ -#include <iostream> +#include <stdio.h> #if (defined __GNUC__) && (!defined __MINGW32__) #define EIGEN_WEAK_LINKING __attribute__ ((weak)) @@ -14,7 +14,7 @@ extern "C" EIGEN_WEAK_LINKING int xerbla_(const char * msg, int *info, int) { - std::cerr << "Eigen BLAS ERROR #" << *info << ": " << msg << "\n"; + printf("Eigen BLAS ERROR #%i: %s\n", *info, msg ); return 0; } diff --git a/blas/zhbmv.f b/blas/zhbmv.f deleted file mode 100644 index bca0da5fc..000000000 --- a/blas/zhbmv.f +++ /dev/null @@ -1,310 +0,0 @@ - SUBROUTINE ZHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE COMPLEX ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* ZHBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - COMPLEX*16 . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - COMPLEX*16 array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. 
-* The following program segment will transfer the upper -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX*16 array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX*16 . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - COMPLEX*16 array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ONE - PARAMETER (ONE= (1.0D+0,0.0D+0)) - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DBLE,DCONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZHBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. 
In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*DBLE(A(1,J)) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*DBLE(A(1,J)) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of ZHBMV . -* - END diff --git a/blas/zhpmv.f b/blas/zhpmv.f deleted file mode 100644 index b686108b3..000000000 --- a/blas/zhpmv.f +++ /dev/null @@ -1,272 +0,0 @@ - SUBROUTINE ZHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE COMPLEX ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* ZHPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - COMPLEX*16 . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - COMPLEX*16 array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). 
-* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* X - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX*16 . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ONE - PARAMETER (ONE= (1.0D+0,0.0D+0)) - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DBLE,DCONJG -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZHPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. 
-* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*DBLE(AP(KK)) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK)) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of ZHPMV . -* - END diff --git a/blas/ztbmv.f b/blas/ztbmv.f deleted file mode 100644 index 7c85c1b55..000000000 --- a/blas/ztbmv.f +++ /dev/null @@ -1,366 +0,0 @@ - SUBROUTINE ZTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* ZTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, or x := conjg( A' )*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := conjg( A' )*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - COMPLEX*16 array of DIMENSION ( LDA, n ). 
-* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOCONJ,NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DCONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOCONJ = LSAME(TRANS,'T') - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. 
This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x or x := conjg( A' )*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 110 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J)) - DO 100 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + DCONJG(A(L+I,J))*X(I) - 100 CONTINUE - END IF - X(J) = TEMP - 110 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 140 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 120 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 120 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J)) - DO 130 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + DCONJG(A(L+I,J))*X(IX) - IX = IX - INCX - 130 CONTINUE - END IF - X(JX) = TEMP - JX = JX - INCX - 140 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 170 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 150 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J)) - DO 160 I = J + 1,MIN(N,J+K) - TEMP = TEMP + DCONJG(A(L+I,J))*X(I) - 160 CONTINUE - END IF - X(J) = TEMP - 170 CONTINUE - ELSE - JX = KX - DO 200 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 180 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 180 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J)) - DO 190 I = J + 1,MIN(N,J+K) - TEMP = TEMP + DCONJG(A(L+I,J))*X(IX) - IX = IX + INCX - 190 CONTINUE - END IF - X(JX) = TEMP - JX = JX + INCX - 200 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of ZTBMV . 
-*
-      END
diff --git a/cmake/EigenUninstall.cmake b/cmake/EigenUninstall.cmake
new file mode 100644
index 000000000..4dae8c85c
--- /dev/null
+++ b/cmake/EigenUninstall.cmake
@@ -0,0 +1,40 @@
+################ CMake Uninstall Template #######################
+# CMake Template file for uninstallation of files
+# mentioned in 'install_manifest.txt'
+#
+# Used by uninstall target
+#################################################################
+
+set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
+
+if(EXISTS ${MANIFEST})
+  message(STATUS "============== Uninstalling Eigen ===================")
+
+  file(STRINGS ${MANIFEST} files)
+  foreach(file ${files})
+    if(EXISTS ${file})
+      message(STATUS "Removing file: '${file}'")
+
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E remove ${file}
+        OUTPUT_VARIABLE rm_out
+        RESULT_VARIABLE rm_retval
+      )
+
+      if(NOT "${rm_retval}" STREQUAL 0)
+        message(FATAL_ERROR "Failed to remove file: '${file}'.")
+      endif()
+    else()
+      message(STATUS "File '${file}' does not exist.")
+    endif()
+  endforeach(file)
+
+  message(STATUS "========== Finished Uninstalling Eigen ==============")
+else()
+  message(STATUS "Cannot find install manifest: '${MANIFEST}'")
+  message(STATUS "Probably make install has not been performed")
+  message(STATUS "  or install_manifest.txt has been deleted.")
+endif()
+
+
diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
index c4d021624..b5bdfa1f4 100644
--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -67,10 +67,10 @@ P.rightCols<cols>()   // P(:, end-cols+1:end)
P.rightCols(cols)     // P(:, end-cols+1:end)
P.topRows<rows>()     // P(1:rows, :)
P.topRows(rows)       // P(1:rows, :)
-P.middleRows<rows>(i) // P(:, i+1:i+rows)
-P.middleRows(i, rows) // P(:, i+1:i+rows)
-P.bottomRows<rows>()  // P(:, end-rows+1:end)
-P.bottomRows(rows)    // P(:, end-rows+1:end)
+P.middleRows<rows>(i) // P(i+1:i+rows, :)
+P.middleRows(i, rows) // P(i+1:i+rows, :)
+P.bottomRows<rows>()  // P(end-rows+1:end, :)
+P.bottomRows(rows)    // P(end-rows+1:end, :)
P.topLeftCorner(rows, cols)     // P(1:rows, 1:cols)
P.topRightCorner(rows, cols)    // P(1:rows, end-cols+1:end)
P.bottomLeftCorner(rows, cols)  // P(end-rows+1:end, 1:cols)
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index 4a33d0cc9..d04ac35c5 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -71,11 +71,10 @@ i.e either row major or column major. The default is column major. Most arithmet
<td> Constant or Random Insertion</td>
<td>
\code
-sm1.setZero();   // Set the matrix with zero elements
-sm1.setConstant(val); //Replace all the nonzero values with val
+sm1.setZero();
\endcode
</td>
-<td> The matrix sm1 should have been created before ???</td>
+<td>Remove all non-zero coefficients</td>
</tr>
</table>
diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt
index d2fea7bdc..c8795a344 100644
--- a/failtest/CMakeLists.txt
+++ b/failtest/CMakeLists.txt
@@ -41,6 +41,12 @@ ei_add_failtest("ref_5")
ei_add_failtest("swap_1")
ei_add_failtest("swap_2")

+ei_add_failtest("sparse_ref_1")
+ei_add_failtest("sparse_ref_2")
+ei_add_failtest("sparse_ref_3")
+ei_add_failtest("sparse_ref_4")
+ei_add_failtest("sparse_ref_5")
+
if (EIGEN_FAILTEST_FAILURE_COUNT)
  message(FATAL_ERROR "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED.
" diff --git a/failtest/sparse_ref_1.cpp b/failtest/sparse_ref_1.cpp new file mode 100644 index 000000000..d78d1f9b1 --- /dev/null +++ b/failtest/sparse_ref_1.cpp @@ -0,0 +1,18 @@ +#include "../Eigen/Sparse" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define CV_QUALIFIER const +#else +#define CV_QUALIFIER +#endif + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) { } + +int main() +{ + SparseMatrix<float> a(10,10); + CV_QUALIFIER SparseMatrix<float>& ac(a); + call_ref(ac); +} diff --git a/failtest/sparse_ref_2.cpp b/failtest/sparse_ref_2.cpp new file mode 100644 index 000000000..46c9440c2 --- /dev/null +++ b/failtest/sparse_ref_2.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) { } + +int main() +{ + SparseMatrix<float> A(10,10); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(A.row(3)); +#else + call_ref(A.col(3)); +#endif +} diff --git a/failtest/sparse_ref_3.cpp b/failtest/sparse_ref_3.cpp new file mode 100644 index 000000000..a9949b552 --- /dev/null +++ b/failtest/sparse_ref_3.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +void call_ref(Ref<SparseMatrix<float> > a) { } +#else +void call_ref(const Ref<const SparseMatrix<float> > &a) { } +#endif + +int main() +{ + SparseMatrix<float> a(10,10); + call_ref(a+a); +} diff --git a/failtest/sparse_ref_4.cpp b/failtest/sparse_ref_4.cpp new file mode 100644 index 000000000..57bb6a1fc --- /dev/null +++ b/failtest/sparse_ref_4.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) {} + +int main() +{ + SparseMatrix<float> A(10,10); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(A.transpose()); +#else + call_ref(A); +#endif +} diff --git a/failtest/sparse_ref_5.cpp b/failtest/sparse_ref_5.cpp new file mode 100644 index 000000000..4478f6f2f --- /dev/null +++ b/failtest/sparse_ref_5.cpp @@ -0,0 +1,16 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) { } + +int main() +{ + SparseMatrix<float> a(10,10); + SparseMatrixBase<SparseMatrix<float> > &ac(a); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(ac); +#else + call_ref(ac.derived()); +#endif +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f57d8ce36..168749634 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -228,6 +228,7 @@ ei_add_test(stddeque) ei_add_test(sparse_basic) ei_add_test(sparse_vector) ei_add_test(sparse_product) +ei_add_test(sparse_ref) ei_add_test(sparse_solvers) ei_add_test(sparse_permutations) ei_add_test(simplicial_cholesky) diff --git a/test/adjoint.cpp b/test/adjoint.cpp index ea36f7841..3b2a53c91 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -64,6 +64,7 @@ template<typename MatrixType> void adjoint(const MatrixType& m) typedef typename NumTraits<Scalar>::Real RealScalar; typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType; typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType; + const Index PacketSize = internal::packet_traits<Scalar>::size; Index rows = m.rows(); Index cols = m.cols(); @@ -108,6 +109,17 @@ template<typename MatrixType> void adjoint(const MatrixType& m) VERIFY_IS_APPROX(m3,m1.transpose()); m3.transposeInPlace(); VERIFY_IS_APPROX(m3,m1); + + if(PacketSize<m3.rows() && PacketSize<m3.cols()) + { + m3 = m1; + Index i = internal::random<Index>(0,m3.rows()-PacketSize); + Index j = 
internal::random<Index>(0,m3.cols()-PacketSize); + m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace(); + VERIFY_IS_APPROX( (m3.template block<PacketSize,PacketSize>(i,j)), (m1.template block<PacketSize,PacketSize>(i,j).transpose()) ); + m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace(); + VERIFY_IS_APPROX(m3,m1); + } // check inplace adjoint m3 = m1; @@ -129,9 +141,19 @@ void test_adjoint() CALL_SUBTEST_1( adjoint(Matrix<float, 1, 1>()) ); CALL_SUBTEST_2( adjoint(Matrix3d()) ); CALL_SUBTEST_3( adjoint(Matrix4f()) ); + CALL_SUBTEST_4( adjoint(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( adjoint(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( adjoint(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + + // Complement for 128 bits vectorization: + CALL_SUBTEST_8( adjoint(Matrix2d()) ); + CALL_SUBTEST_9( adjoint(Matrix<int,4,4>()) ); + + // 256 bits vectorization: + CALL_SUBTEST_10( adjoint(Matrix<float,8,8>()) ); + CALL_SUBTEST_11( adjoint(Matrix<double,4,4>()) ); + CALL_SUBTEST_12( adjoint(Matrix<int,8,8>()) ); } // test a large static matrix only once CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) ); diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 869051b31..019cc4d64 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -12,13 +12,15 @@ template<typename T> void test_conjugate_gradient_T() { - ConjugateGradient<SparseMatrix<T>, Lower> cg_colmajor_lower_diag; - ConjugateGradient<SparseMatrix<T>, Upper> cg_colmajor_upper_diag; + ConjugateGradient<SparseMatrix<T>, Lower > cg_colmajor_lower_diag; + ConjugateGradient<SparseMatrix<T>, Upper > cg_colmajor_upper_diag; + ConjugateGradient<SparseMatrix<T>, Lower|Upper> cg_colmajor_loup_diag; ConjugateGradient<SparseMatrix<T>, Lower, IdentityPreconditioner> cg_colmajor_lower_I; ConjugateGradient<SparseMatrix<T>, Upper, IdentityPreconditioner> cg_colmajor_upper_I; CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_loup_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_I) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) ); } diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index 149f1db2f..0227ba577 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -84,6 +84,13 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX(m1 * (rdm1 * s1), (m1 * rdm1) * s1); VERIFY_IS_APPROX(m1 * (s1 * rdm1), (m1 * rdm1) * s1); + + // Diagonal to dense + sq_m1.setRandom(); + sq_m2 = sq_m1; + VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() ); } void test_diagonalmatrices() diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index e189217eb..042dd0329 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -99,10 +99,16 @@ template<typename Scalar, int Mode, int Options> void transformations() Scalar a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI)); Scalar 
s0 = internal::random<Scalar>(), s1 = internal::random<Scalar>(); + + while(v0.norm() < test_precision<Scalar>()) v0 = Vector3::Random(); + while(v1.norm() < test_precision<Scalar>()) v1 = Vector3::Random(); VERIFY_IS_APPROX(v0, AngleAxisx(a, v0.normalized()) * v0); VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(M_PI), v0.unitOrthogonal()) * v0); - VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0)); + if(abs(cos(a)) > test_precision<Scalar>()) + { + VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0)); + } m = AngleAxisx(a, v0.normalized()).toRotationMatrix().adjoint(); VERIFY_IS_APPROX(Matrix3::Identity(), m * AngleAxisx(a, v0.normalized())); VERIFY_IS_APPROX(Matrix3::Identity(), AngleAxisx(a, v0.normalized()) * m); @@ -123,11 +129,18 @@ template<typename Scalar, int Mode, int Options> void transformations() // angle-axis conversion AngleAxisx aa = AngleAxisx(q1); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1); + + if(abs(aa.angle()) > NumTraits<Scalar>::dummy_precision()) + { + VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); + } aa.fromRotationMatrix(aa.toRotationMatrix()); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1); + if(abs(aa.angle()) > NumTraits<Scalar>::dummy_precision()) + { + VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); + } // AngleAxis VERIFY_IS_APPROX(AngleAxisx(a,v1.normalized()).toRotationMatrix(), @@ -347,7 +360,9 @@ template<typename Scalar, int Mode, int Options> void transformations() // test transform inversion t0.setIdentity(); t0.translate(v0); - t0.linear().setRandom(); + do { + t0.linear().setRandom(); + } while(t0.linear().jacobiSvd().singularValues()(2)<test_precision<Scalar>()); Matrix4 t044 = Matrix4::Zero(); t044(3,3) = 1; t044.block(0,0,t0.matrix().rows(),4) = t0.matrix(); diff --git a/test/main.h b/test/main.h index a1ccd28b5..ab0b9187e 100644 --- a/test/main.h +++ b/test/main.h @@ -47,8 +47,8 @@ // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
-#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses +//#define min(A,B) please_protect_your_min_with_parentheses +//#define max(A,B) please_protect_your_max_with_parentheses #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h @@ -237,6 +237,7 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) +#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(test_isApprox(a, b)) #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b)) #define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b)) diff --git a/test/nullary.cpp b/test/nullary.cpp index a19c674e4..8344855df 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -80,7 +80,9 @@ void testVectorType(const VectorType& base) Matrix<Scalar,1,Dynamic> col_vector(size); row_vector.setLinSpaced(size,low,high); col_vector.setLinSpaced(size,low,high); - VERIFY( row_vector.isApprox(col_vector.transpose(), NumTraits<Scalar>::epsilon())); + // when using the extended precision (e.g., FPU) the relative error might exceed 1 bit + // when computing the squared sum in isApprox, thus the 2x factor. + VERIFY( row_vector.isApprox(col_vector.transpose(), Scalar(2)*NumTraits<Scalar>::epsilon())); Matrix<Scalar,Dynamic,1> size_changer(size+50); size_changer.setLinSpaced(size,low,high); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index ee0502f69..49f601907 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -261,6 +261,22 @@ template<typename Scalar> void packetmath() VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose"); } } + + if (internal::packet_traits<Scalar>::HasBlend) { + Packet thenPacket = internal::pload<Packet>(data1); + Packet elsePacket = internal::pload<Packet>(data2); + EIGEN_ALIGN_DEFAULT internal::Selector<PacketSize> selector; + for (int i = 0; i < PacketSize; ++i) { + selector.select[i] = i; + } + + Packet blend = internal::pblend(selector, thenPacket, elsePacket); + EIGEN_ALIGN_DEFAULT Scalar result[size]; + internal::pstore(result, blend); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); + } + } } template<typename Scalar> void packetmath_real() diff --git a/test/product_large.cpp b/test/product_large.cpp index 11531aa1d..ffb8b7bf2 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -39,15 +39,16 @@ void test_product_large() // check the functions to setup blocking sizes compile and do not segfault // FIXME check they do what they are supposed to do !! 
std::ptrdiff_t l1 = internal::random<int>(10000,20000);
-    std::ptrdiff_t l2 = internal::random<int>(1000000,2000000);
-    setCpuCacheSizes(l1,l2);
+    std::ptrdiff_t l2 = internal::random<int>(100000,200000);
+    std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
+    setCpuCacheSizes(l1,l2,l3);
VERIFY(l1==l1CacheSize());
VERIFY(l2==l2CacheSize());
std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
// only makes sure it compiles fine
-    internal::computeProductBlockingSizes<float,float>(k1,m1,n1);
+    internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1);
}
{
diff --git a/test/product_small.cpp b/test/product_small.cpp
index 8b132abb6..091955a0f 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -9,6 +9,7 @@
#define EIGEN_NO_STATIC_ASSERT
#include "product.h"
+#include <Eigen/LU>
// regression test for bug 447
void product1x1()
@@ -46,5 +47,14 @@ void test_product_small()
Vector3f v = Vector3f::Random();
VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
}
+
+  {
+    // regression test for pull-request #93
+    Eigen::Matrix<double, 1, 1> A; A.setRandom();
+    Eigen::Matrix<double, 18, 1> B; B.setRandom();
+    Eigen::Matrix<double, 1, 18> C; C.setRandom();
+    VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
+    VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
+  }
#endif
}
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index a26fd5dcd..8fd759c93 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -408,6 +408,26 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
m.setFromTriplets(triplets.begin(), triplets.end());
VERIFY_IS_APPROX(m, refMat);
}
+
+  // test Map
+  {
+    DenseMatrix refMat2(rows, cols), refMat3(rows, cols);
+    SparseMatrixType m2(rows, cols), m3(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    initSparse<Scalar>(density, refMat3, m3);
+    {
+      Map<SparseMatrixType> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      Map<SparseMatrixType> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+    {
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+  }
// test triangularView
{
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
new file mode 100644
index 000000000..e7380ba21
--- /dev/null
+++ b/test/sparse_ref.cpp
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#undef EIGEN_DEFAULT_TO_ROW_MAJOR +#endif + +static long int nb_temporaries; + +inline void on_temporary_creation() { + // here's a great place to set a breakpoint when debugging failures in this test! + nb_temporaries++; +} + +#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); } + +#include "main.h" +#include <Eigen/SparseCore> + +#define VERIFY_EVALUATION_COUNT(XPR,N) {\ + nb_temporaries = 0; \ + XPR; \ + if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \ + VERIFY( (#XPR) && nb_temporaries==N ); \ + } + +template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&) +{ + // verify that ref-to-const don't have LvalueBit + typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType; + VERIFY( !(internal::traits<Ref<ConstPlainObjectType> >::Flags & LvalueBit) ); + VERIFY( !(internal::traits<Ref<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) ); + VERIFY( !(Ref<ConstPlainObjectType>::Flags & LvalueBit) ); + VERIFY( !(Ref<ConstPlainObjectType, Aligned>::Flags & LvalueBit) ); +} + +template<typename B> +EIGEN_DONT_INLINE void call_ref_1(Ref<SparseMatrix<float> > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +template<typename B> +EIGEN_DONT_INLINE void call_ref_2(const Ref<const SparseMatrix<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +void call_ref() +{ +// SparseVector<std::complex<float> > ca = VectorXcf::Random(10).sparseView(); +// SparseVector<float> a = VectorXf::Random(10).sparseView(); + SparseMatrix<float> A = MatrixXf::Random(10,10).sparseView(0.5,1); + SparseMatrix<float,RowMajor> B = MatrixXf::Random(10,10).sparseView(0.5,1); + const SparseMatrix<float>& Ac(A); + Block<SparseMatrix<float> > Ab(A,0,1, 3,3); + const Block<SparseMatrix<float> > Abc(A,0,1,3,3); + + + VERIFY_EVALUATION_COUNT( call_ref_1(A, A), 0); +// VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(A, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B, B), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); + + Ref<SparseMatrix<float> > Ar(A); + VERIFY_IS_APPROX(Ar+Ar, A+A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, A), 0); + + Ref<SparseMatrix<float,RowMajor> > Br(B); + VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Br, Br), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Br.transpose(), Br.transpose()), 0); + + Ref<const SparseMatrix<float> > Arc(A); +// VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) +} + +void test_sparse_ref() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( check_const_correctness(SparseMatrix<float>()) ); + CALL_SUBTEST_1( 
check_const_correctness(SparseMatrix<double,RowMajor>()) ); + CALL_SUBTEST_2( call_ref() ); + } +} diff --git a/test/sparse_solver.h b/test/sparse_solver.h index afda26b93..42b365eaa 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -32,7 +32,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, x = solver.solve(b); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n"; return; } VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!"); @@ -75,7 +75,8 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, xm = solver.solve(bm); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving with a Map failed\n"; + exit(0); return; } VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!"); @@ -194,7 +195,10 @@ int generate_sparse_spd_problem(Solver& , typename Solver::MatrixType& A, typena dA = dM * dM.adjoint(); halfA.resize(size,size); - halfA.template selfadjointView<Solver::UpLo>().rankUpdate(M); + if(Solver::UpLo==(Lower|Upper)) + halfA = A; + else + halfA.template selfadjointView<Solver::UpLo>().rankUpdate(M); return size; } diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index 4dc4ab224..292f09564 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -21,20 +22,26 @@ * module. Note that at this stage, you should not need to include * this module directly. * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. + * * \code * #include <Eigen/CXX11/Core> * \endcode */ -#include <array> +#include <vector> +// Emulate the cxx11 functionality that we need if the compiler doesn't support it. +#if __cplusplus <= 199711L +#include "src/Core/util/EmulateCXX11Meta.h" +#else +#include <array> #include "src/Core/util/CXX11Workarounds.h" #include "src/Core/util/CXX11Meta.h" +#endif #include <Eigen/src/Core/util/ReenableStupidWarnings.h> #endif // EIGEN_CXX11_CORE_MODULE -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 049ce5596..34107ae71 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // Copyright (C) 2013 Christian Seiler <christian@iwakd.de> // // This Source Code Form is subject to the terms of the Mozilla @@ -10,9 +11,10 @@ #ifndef EIGEN_CXX11_TENSOR_MODULE #define EIGEN_CXX11_TENSOR_MODULE -#include <unsupported/Eigen/CXX11/Core> +#include "Eigen/src/Core/util/StaticAssert.h" +#include "unsupported/Eigen/CXX11/Core" -#include <Eigen/src/Core/util/DisableStupidWarnings.h> +#include "Eigen/src/Core/util/DisableStupidWarnings.h" /** \defgroup CXX11_Tensor_Module Tensor Module * @@ -26,14 +28,69 @@ #include <cstddef> #include <cstring> +#include <stdint.h> -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" +#if __cplusplus > 199711 +#include <random> +#endif -#include <Eigen/src/Core/util/ReenableStupidWarnings.h> +#ifdef EIGEN_USE_THREADS +#include <future> +#endif -#endif // EIGEN_CXX11_TENSOR_MODULE +#ifdef EIGEN_USE_GPU +#include <cuda_runtime.h> +#if defined(__CUDACC__) +#include <curand_kernel.h> +#endif +#endif + +#include "Eigen/Core" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" +#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include 
"unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" + +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 0e274b801..3a08628be 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -317,7 +317,7 @@ constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts template<typename Array, int... n> constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>) { - return {{std_array_get<sizeof...(n) - n - 1>(arr)...}}; + return {{array_get<sizeof...(n) - n - 1>(arr)...}}; } template<typename T, std::size_t N> @@ -335,9 +335,9 @@ constexpr inline std::array<T, N> array_reverse(std::array<T, N> arr) // an infinite loop) template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1> struct h_array_reduce { - constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr))) + constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr))) { - return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr)); + return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr)); } }; @@ -346,7 +346,7 @@ struct h_array_reduce<Reducer, T, N, 0> { constexpr static inline T run(std::array<T, N> arr) { - return std_array_get<0>(arr); + return array_get<0>(arr); } }; @@ -370,12 +370,20 @@ constexpr inline auto array_prod(std::array<T, N> arr) -> decltype(array_reduce< return array_reduce<product_op, T, N>(arr); } +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + /* zip an array */ template<typename Op, typename A, typename B, std::size_t N, int... n> constexpr inline std::array<decltype(Op::run(A(), B())),N> h_array_zip(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) { - return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(std_array_get<n>(a), std_array_get<n>(b))... }}; + return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }}; } template<typename Op, typename A, typename B, std::size_t N> @@ -387,9 +395,9 @@ constexpr inline std::array<decltype(Op::run(A(), B())),N> array_zip(std::array< /* zip an array and reduce the result */ template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... 
n> -constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...)) +constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...)) { - return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...); + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...); } template<typename Reducer, typename Op, typename A, typename B, std::size_t N> @@ -403,7 +411,7 @@ constexpr inline auto array_zip_and_reduce(std::array<A, N> a, std::array<B, N> template<typename Op, typename A, std::size_t N, int... n> constexpr inline std::array<decltype(Op::run(A())),N> h_array_apply(std::array<A, N> a, numeric_list<int, n...>) { - return std::array<decltype(Op::run(A())),N>{{ Op::run(std_array_get<n>(a))... }}; + return std::array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }}; } template<typename Op, typename A, std::size_t N> @@ -415,9 +423,9 @@ constexpr inline std::array<decltype(Op::run(A())),N> array_apply(std::array<A, /* apply stuff to an array and reduce */ template<typename Reducer, typename Op, typename A, std::size_t N, int... n> -constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...)) +constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...)) { - return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...); + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...); } template<typename Reducer, typename Op, typename A, std::size_t N> @@ -497,7 +505,3 @@ InstType instantiate_by_c_array(ArrType* arr) } // end namespace Eigen #endif // EIGEN_CXX11META_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index d71a67590..a590cf4e1 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -17,9 +17,6 @@ #error Intel Compiler only supports required C++ features since version 13.1. // note that most stuff in principle works with 13.0 but when combining // some features, at some point 13.0 will just fail with an internal assertion -#elif defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 1)) -// note that it _should_ work with 3.1 but it was only tested with 3.2 -#error Clang C++ Compiler (clang++) only supports required C++ features since version 3.1. 
#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) // G++ < 4.6 by default will continue processing the source files - even if we use #error to make // it error out. For this reason, we use the pragma to make sure G++ aborts at the first error @@ -42,32 +39,46 @@ namespace Eigen { +// Use std::array as Eigen array +template <typename T, std::size_t N> using array = std::array<T, N>; + namespace internal { /* std::get is only constexpr in C++14, not yet in C++11 * - libstdc++ from version 4.7 onwards has it nevertheless, * so use that * - libstdc++ older versions: use _M_instance directly - * - libc++ from version 3.4 onwards has it IF compiled with - * -std=c++1y - * - libc++ older versions or -std=c++11: use __elems_ directly + * - libc++ all versions so far: use __elems_ directly * - all other libs: use std::get to be portable, but * this may not be constexpr */ #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 #define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11) +#elif defined(_LIBCPP_VERSION) #define STD_GET_ARR_HACK a.__elems_[I] #else #define STD_GET_ARR_HACK std::template get<I, T, N>(a) #endif -template<std::size_t I, class T, std::size_t N> constexpr inline T& std_array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; } -template<std::size_t I, class T, std::size_t N> constexpr inline T&& std_array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; } -template<std::size_t I, class T, std::size_t N> constexpr inline T const& std_array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; } + +template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; } #undef STD_GET_ARR_HACK +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const std::array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<std::array<T,N> > { + static const size_t value = N; +}; + /* Suppose you have a template of the form * template<typename T> struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h new file mode 100644 index 000000000..494f95690 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -0,0 +1,435 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_EMULATE_CXX11_META_H +#define EIGEN_EMULATE_CXX11_META_H + + + +namespace Eigen { + +// The array class is only available starting with cxx11. Emulate our own here +// if needed +template <typename T, size_t n> class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + + static const std::size_t size = n; + + T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + explicit EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list<T> l) { + eigen_assert(l.size() == n); + std::copy(l.begin(), l.end(), values); + } +#endif +}; + + +namespace internal { + +/** \internal + * \file CXX11/Core/util/EmulateCXX11Meta.h + * This file emulates a subset of the functionality provided by CXXMeta.h for + * compilers that don't yet support cxx11 such as nvcc. 
+ */ + +struct empty_list { static const std::size_t count = 0; }; + +template<typename T, typename Tail=empty_list> struct type_list { + typedef T HeadType; + typedef Tail TailType; + static const T head; + static const Tail tail; + static const std::size_t count = 1 + Tail::count; +}; + +struct null_type { }; + +template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, + typename T4 = null_type, typename T5 = null_type, typename T6 = null_type, + typename T7 = null_type, typename T8 = null_type> +struct make_type_list { + typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult; + + typedef type_list<T1, tailresult> type; +}; + +template<> struct make_type_list<> { + typedef empty_list type; +}; + + +template <std::size_t index, class TList> struct get_type; + +template <class Head, class Tail> +struct get_type<0, type_list<Head, Tail> > +{ + typedef Head type; +}; + +template <std::size_t i, class Head, class Tail> +struct get_type<i, type_list<Head, Tail> > +{ + typedef typename get_type<i-1, Tail>::type type; +}; + + +/* numeric list */ +template <typename T, T n> +struct type2val { + typedef T type; + static const T value = n; +}; + + +template<typename T, size_t n, T V> struct gen_numeric_list_repeated; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> { + typedef typename make_type_list<type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V> >::type type; +}; + + +template <std::size_t index, class NList> struct get; + +template <std::size_t i> +struct get<i, empty_list> +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <std::size_t i, class Head> +struct get<i, type_list<Head, empty_list> > +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <class Head> +struct get<0, type_list<Head, empty_list> > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template <class Head, class Tail> +struct get<0, type_list<Head, Tail> > +{ + typedef typename Head::type type; + 
static const type value = Head::value; +}; + +template <std::size_t i, class Head, class Tail> +struct get<i, type_list<Head, Tail> > +{ + typedef typename Tail::HeadType::type type; + static const type value = get<i-1, Tail>::value; +}; + + +template <class NList> struct arg_prod { + static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value; +}; +template <> struct arg_prod<empty_list> { + static const int value = 1; +}; + + +template<int n, typename t> +array<t, n> repeat(t v) { + array<t, n> array; + array.fill(v); + return array; +} + +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} + +template <class NList> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod<NList>::value; +}; + +template<std::size_t n, typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) { + t prod = 1; + for (size_t i = 0; i < n; ++i) { prod *= a[i]; } + return prod; +} +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) { + return 0; +} + +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) { + return a[I]; +} +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) { + return a[I]; +} + +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) { + return a[I]; +} +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) { + return a[I]; +} + +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<array<T,N>& > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N>& > { + static const size_t value = N; +}; + +struct sum_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a + b; } +}; +struct product_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a * b; } +}; + +struct logical_and_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a && b; } +}; +struct logical_or_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a || b; } +}; + +struct equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a == b; } +}; +struct not_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a != b; } +}; +struct lesser_op { + 
template<typename A, typename B> static inline bool run(A a, B b) { return a < b; } +}; +struct lesser_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; } +}; + +struct greater_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a > b; } +}; +struct greater_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; } +}; + +struct not_op { + template<typename A> static inline bool run(A a) { return !a; } +}; +struct negation_op { + template<typename A> static inline bool run(A a) { return -a; } +}; +struct greater_equal_zero_op { + template<typename A> static inline bool run(A a) { return a >= 0; } +}; + + +template<typename Reducer, typename Op, typename A, std::size_t N> +struct ArrayApplyAndReduce { + static inline bool run(const array<A, N>& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A> +struct ArrayApplyAndReduce<Reducer, Op, A, 1> { + static inline bool run(const array<A, 1>& a) { + return Op::run(a[0]); + } +}; + +template<typename Reducer, typename Op, typename A, std::size_t N> +inline bool array_apply_and_reduce(const array<A, N>& a) { + return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a); +} + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +struct ArrayZipAndReduce { + static inline bool run(const array<A, N>& a, const array<B, N>& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A, typename B> +struct ArrayZipAndReduce<Reducer, Op, A, B, 1> { + static inline bool run(const array<A, 1>& a, const array<B, 1>& b) { + return Op::run(a[0], b[0]); + } +}; + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) { + return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b); +} + +} // end namespace internal + +} // end namespace Eigen + + + +#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md new file mode 100644 index 000000000..6a4d52cc3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -0,0 +1,1446 @@ +# Eigen Tensors + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +[TOC] + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace ```::Eigen.``` + + +### Class Tensor<data_type, rank> + +This is the class to use to create a tensor and allocate memory for it. The +class is templatized with the tensor datatype, such as float or int, and the +tensor rank. The rank is the number of dimensions, for example rank 2 is a +matrix. + +Tensors of this class are resizable. For example, if you assign a tensor of a +different size to a Tensor, that tensor is resized to match its new value. + +#### Constructor Tensor<data_type, rank>(size0, size1, ...) + +Constructor for a Tensor. 
The constructor must be passed ```rank``` integers
+indicating the sizes of the instance along each of the ```rank```
+dimensions.
+
+    // Create a tensor of rank 3 of sizes 2, 3, 4.  This tensor owns
+    // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+    Tensor<float, 3> t_3d(2, 3, 4);
+
+    // Resize t_3d by assigning a tensor of different sizes, but same rank.
+    t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicit list of parameters.  The array type to use is
+```Eigen::array<Eigen::Index>```.  The array can be constructed automatically
+from an initializer list.
+
+    // Create a tensor of strings of rank 2 with sizes 5, 7.
+    Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time.  Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler.  FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough, the
+tensor data is held on the stack and does not cause heap allocation and free.
+
+    // Create a 4 x 3 tensor of floats.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code.  It lets you view any piece of allocated
+memory as a Tensor.  Instances of this class do not own the memory where the
+data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a TensorMap.  The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes.  The storage has to be
+large enough to hold all the data.
+
+    // Map a tensor of ints on top of stack-allocated storage.
+    int storage[128];  // 2 x 4 x 2 x 8 = 128
+    TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
+
+    // The same storage can be viewed as a different tensor.
+    // You can also pass the sizes as an array.
+    TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
+
+    // You can also map fixed-size tensors.  Here we get a 1d view of
+    // the 2d fixed-size tensor.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+    TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
+
+
+### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### <data_type> tensor(index0, index1...)
+
+Return the element at position ```(index0, index1...)``` in tensor
+```tensor```.  You must pass as many parameters as the rank of ```tensor```.
+The expression can be used as an l-value to set the value of the element at the
+specified position.  The value returned is of the datatype of the tensor.
+
+    // Set the value of the element at position (0, 1, 0).
+    Tensor<float, 3> t_3d(2, 3, 4);
+    t_3d(0, 1, 0) = 12.0f;
+
+    // Initialize all elements to random values.
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 4; ++k) {
+          t_3d(i, j, k) = ...some random value...;
+        }
+      }
+    }
+
+    // Print elements of a tensor.
+    for (int i = 0; i < 2; ++i) {
+      LOG(INFO) << t_3d(i, 0, 0);
+    }
+
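+Putting the pieces above together, here is a minimal end-to-end sketch:
+create a tensor, write one coefficient, and read it back. The include and
+```using``` lines are one possible way to pull in the module; everything else
+follows the constructors and accessors described above.
+
+    #include <unsupported/Eigen/CXX11/Tensor>
+    using Eigen::Tensor;
+
+    // A rank-2 tensor with 3 x 4 = 12 coefficients, all set to 0.
+    Tensor<float, 2> t(3, 4);
+    t.setConstant(0.0f);
+
+    // operator() is an l-value: write a coefficient, then read it back.
+    t(1, 2) = 7.0f;
+    float x = t(1, 2);  // x == 7.0f
+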
Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3> + TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +```swap_layout()``` method. Note that this will also reverse the order of the +dimensions. + + Tensor<float, 2, ColMajor> col_major(2, 4); + Tensor<float, 2, RowMajor> row_major(2, 4); + + Tensor<float, 2> col_major_result = col_major; // ok, layouts match + Tensor<float, 2> col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array<int, 2> shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor<float, 3> t1(2, 3, 4); + ...set some values in t1... + Tensor<float, 3> t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor<float, 3> t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression ```t1 + t2``` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors +```t1``` and ```t2```. This is a small C++ object that knows how to add +```t1``` and ```t2```. It is only when the value of the expression is assigned +to the tensor ```t3``` that the addition is actually performed. Technically, +this happens through the overloading of ```operator=()``` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. + +Of course, the tensor operators do nest, and the expression ```t1 + t2 * +0.3f``` is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ ```auto``` keyword +does not have its intuitive meaning. Consider these 2 lines of code: + + Tensor<float, 3> t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor ```t3``` and it will contain the +result of the addition of ```t1``` and ```t2```. In the second line, ```t4``` +is actually the tree of tensor operators that will compute the addition of +```t1``` and ```t2```. 
In fact, ```t4``` is *not* a tensor and you cannot get
+the values of its elements:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    cout << t3(0, 0, 0);  // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+    auto t4 = t1 + t2;
+    cout << t4(0, 0, 0);  // Compilation error!
+
+When you use ```auto``` you do not get a Tensor as a result but instead a
+non-evaluated expression. So only use ```auto``` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use auto in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of a set of tensor computations you have to assign the
+result to a Tensor that will be capable of holding onto them. This can be
+either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
+piece of memory. All the following will work:
+
+    auto t4 = t1 + t2;
+
+    Tensor<float, 3> result = t4;  // Could also be: result(t4);
+    cout << result(0, 0, 0);
+
+    TensorMap<Tensor<float, 4>> result(<a float* with enough space>, <size0>, ...) = t4;
+    cout << result(0, 0, 0);
+
+    TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+    cout << result(0, 0, 0);
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations. As long as you keep the expression as an
+operation, no computation is performed.
+
+    // One way to compute exp((t1 + t2) * 0.2f);
+    auto t3 = t1 + t2;
+    auto t4 = t3 * 0.2f;
+    auto t5 = t4.exp();
+    Tensor<float, 3> result = t5;
+
+    // Another way, exactly as efficient as the previous one:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+### Controlling When Expressions Are Evaluated
+
+There are several ways to control when expressions are evaluated:
+* Assignment to a Tensor, TensorFixedSize, or TensorMap.
+* Use of the eval() method.
+* Assignment to a TensorRef.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a Tensor. In
+the example below, the ```auto``` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor ```result``` causes the evaluation of all the
+operations.
+
+    auto t3 = t1 + t2;             // t3 is an Operation.
+    auto t4 = t3 * 0.2f;           // t4 is an Operation.
+    auto t5 = t4.exp();            // t5 is an Operation.
+    Tensor<float, 3> result = t5;  // The operations are evaluated.
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a TensorFixedSize instead of a Tensor, which is a bit more
+efficient.
+
+    // We know that the result is a 4x4x2 tensor!
+    TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
+
+Similarly, assigning an expression to a TensorMap causes its evaluation. Like
+tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
+have the rank and sizes of the expression that is assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time. This is done by inserting a call to the ```eval()``` method of the
+expression Operation.
+
+    // The previous example could have been written:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+    // If you want to compute (t1 + t2) once ahead of time you can write:
+    Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+
+Semantically, calling ```eval()``` is equivalent to materializing the value of
+the expression in a temporary Tensor of the right size. The code above in
+effect does:
+
+    // .eval() knows the size!
+    TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
+    Tensor<float, 3> result = (tmp * 0.2f).exp();
+
+Note that the return value of ```eval()``` is itself an Operation, so the
+following code does not do what you may think:
+
+    // Here t3 is an evaluation Operation. t3 has not been evaluated yet.
+    auto t3 = (t1 + t2).eval();
+
+    // You can use t3 in another expression. Still no evaluation.
+    auto t4 = (t3 * 0.2f).exp();
+
+    // The value is evaluated when you assign the Operation to a Tensor, using
+    // an intermediate tensor to represent t3.
+    Tensor<float, 3> result = t4;
+
+While in the examples above calling ```eval()``` does not make a difference in
+performance, in other cases it can make a huge difference. In the expression
+below the ```broadcast()``` expression causes the ```X.maximum()``` expression
+to be evaluated many times:
+
+    Tensor<...> X ...;
+    Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+                     * beta).exp();
+
+Inserting a call to ```eval()``` between the ```maximum()``` and
+```reshape()``` calls guarantees that maximum() is only computed once and
+greatly speeds up execution:
+
+    Tensor<...> Y =
+      ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+       * beta).exp();
+
+In the other example below, the tensor ```Y``` is used both in the expression
+and in its assignment. This is an aliasing problem and if the evaluation is not
+done in the right order Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+    Tensor<...> Y ...;
+    Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+
+Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()```
+expressions ensures that the sum is computed before any updates to ```Y``` are
+done.
+
+    Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+Note that an eval around the full right hand side expression is not needed
+because the generated code has to compute the i-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of ```Y```
+then you would need to force an eval for correctness by adding an ```eval()```
+call for the right hand side:
+
+    Y.shuffle(...) =
+      (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a TensorRef.
+
+A TensorRef is a small wrapper class for any Eigen Operation. It provides
+overloads for the ```()``` operator that let you access individual values in
+the expression. TensorRef is convenient, because the Operations themselves do
+not provide a way to access individual elements.
+
+    // Create a TensorRef for the expression. The expression is not
+    // evaluated yet.
+    TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
+
+    // Use "ref" to access individual elements. The expression is evaluated
+    // on the fly.
+    float at_0 = ref(0, 0, 0);
+    cout << ref(0, 1, 0);
+
+Only use TensorRef when you need a subset of the values of the expression.
+TensorRef only computes the values you access. However note that if you are
+going to access all the values it will be much faster to materialize the
+results in a Tensor first.
+
+In some cases, if the full Tensor result would be very large, you may save
+memory by accessing it as a TensorRef. But not always. So don't count on it.
+
+
+### Controlling How Expressions Are Evaluated
+
+The tensor library provides several implementations of the various operations
+such as contractions and convolutions. The implementations are optimized for
+different environments: single threaded on CPU, multi-threaded on CPU, or on a
+GPU using CUDA. Additional implementations may be added later.
+
+You can choose which implementation to use with the ```device()``` call. If
+you do not choose an implementation explicitly the default implementation that
+uses a single thread on the CPU is used.
+
+The default implementation has been optimized for recent Intel CPUs, taking
+advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the
+library on ARM CPUs. Note that you need to pass compiler-dependent flags
+to enable the use of SSE, AVX, and other instructions.
+
+For example, the following code adds two tensors using the default
+single-threaded CPU implementation:
+
+    Tensor<float, 2> a(30, 40);
+    Tensor<float, 2> b(30, 40);
+    Tensor<float, 2> c = a + b;
+
+To choose a different implementation you have to insert a ```device()``` call
+before the assignment of the result. For technical C++ reasons this requires
+that the Tensor for the result be declared on its own. This means that you
+have to know the size of the result.
+
+    Eigen::Tensor<float, 2> c(30, 40);
+    c.device(...) = a + b;
+
+The call to ```device()``` must be the last call on the left of the operator=.
+
+You must pass to the ```device()``` call an Eigen device object. There are
+presently three devices you can use: DefaultDevice, ThreadPoolDevice and
+GpuDevice.
+
+
+#### Evaluating With the DefaultDevice
+
+This is exactly the same as not inserting a ```device()``` call.
+
+    DefaultDevice my_device;
+    c.device(my_device) = a + b;
+
+#### Evaluating with a Thread Pool
+
+    // Create the Eigen ThreadPoolDevice.
+    Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */);
+
+    // Now just use the device when evaluating expressions.
+    Eigen::Tensor<float, 2> c(30, 50);
+    c.device(my_device) = a.contract(b, dot_product_dims);
+
+
+#### Evaluating On GPU
+
+This is presently a bit more complicated than just using a thread pool device.
+You need to create a GPU device but you also need to explicitly allocate the
+memory for tensors with CUDA.
+
+
+## API Reference
+
+### Datatypes
+
+In the documentation of the tensor methods and Operations we mention datatypes
+that are tensor-type specific:
+
+#### <Tensor-Type>::Dimensions
+
+Acts like an array of ints. Has an ```int size``` attribute, and can be
+indexed like an array to access individual values. Used to represent the
+dimensions of a tensor. See ```dimensions()```.
+
+#### <Tensor-Type>::Index
+
+Acts like an ```int```. Used for indexing tensors along their dimensions. See
+```operator()```, ```dimension()```, and ```size()```.
+
+#### <Tensor-Type>::Scalar
+
+Represents the datatype of individual tensor elements. For example, for a
+```Tensor<float>```, ```Scalar``` is the type ```float```. See
+```setConstant()```.
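+
+As a quick illustration, here is a minimal sketch (not part of the original
+reference, and assuming only the ```Tensor``` class and the three datatypes
+described above) that uses them together:
+
+    Eigen::Tensor<float, 2> t(3, 4);
+    // Dimensions acts like an array of ints.
+    const Eigen::Tensor<float, 2>::Dimensions& d = t.dimensions();
+    // Index acts like an int.
+    Eigen::Tensor<float, 2>::Index n = t.size();  // 12
+    // Scalar is the element type, here float.
+    Eigen::Tensor<float, 2>::Scalar first = t(0, 0);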
+ +#### <Operation> + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### int NumDimensions + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### Dimensions dimensions() + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the dimensions() result is <Tensor-Type>::Dimensions. + + Eigen::Tensor<float, 2> a(3, 4); + const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use ```auto``` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### Index dimension(Index n) + +Returns the n-th dimension of the tensor. The actual type of the +```dimension()``` result is ```<Tensor-Type>::Index```, but you can +always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### Index size() + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the ```size()``` result is +```<Tensor-Type>::Index```, but you can always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide ```dimensions()``` directly, +e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides ```dimensions()``` and ```dimension()``` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors and Copies + +TODO. + + Tensor(...) + TensorFixedSize(...) + TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... 
otherDimensions)
+    TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+    TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+    Self& operator=(const Self& other)
+    Self& operator=(const OtherDerived& other)
+
+
+## Contents Initialization
+
+When a new Tensor or a new TensorFixedSize is created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized. Similarly,
+when a new TensorMap is created on top of non-initialized memory, its contents
+are not initialized.
+
+You can use one of the methods below to initialize the tensor memory. These
+have an immediate effect on the tensor and return the tensor itself as a
+result. These are not tensor Operations which delay evaluation.
+
+### <Tensor-Type> setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value ```val```. ```Scalar```
+is the type of data stored in the tensor. You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+    a.setConstant(12.3f);
+    cout << "Constant: " << endl << a << endl << endl;
+    =>
+    Constant:
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+
+Note that ```setConstant()``` can be used on any tensor where the element type
+has a copy constructor and an ```operator=()```:
+
+    Eigen::Tensor<string, 2> a(2, 3);
+    a.setConstant("yolo");
+    cout << "String tensor: " << endl << a << endl << endl;
+    =>
+    String tensor:
+    yolo yolo yolo
+    yolo yolo yolo
+
+
+### <Tensor-Type> setZero()
+
+Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```.
+Returns the tensor itself in case you want to chain another call.
+
+    a.setZero();
+    cout << "Zeros: " << endl << a << endl << endl;
+    =>
+    Zeros:
+    0 0 0 0
+    0 0 0 0
+    0 0 0 0
+
+
+### <Tensor-Type> setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times. The
+most deeply nested lists must contain P scalars of the Tensor type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a ```TensorFixedSize<float, Sizes<2, 3>>``` the initializer
+list must contain 2 lists of 3 floats each.
+
+```setValues()``` returns the tensor itself in case you want to chain another
+call.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed. This is valid at each level of nesting. For example the following
+code only sets the values of the first row of the tensor.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setConstant(1000);
+    a.setValues({{10, 20, 30}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    10 20 30
+    1000 1000 1000
+
+### <Tensor-Type> setRandom()
+
+Fills the tensor with random values. Returns the tensor itself in case you
+want to chain another call.
+
+    a.setRandom();
+    cout << "Random: " << endl << a << endl << endl;
+    =>
+    Random:
+    0.680375 0.59688 -0.329554 0.10794
+    -0.211234 0.823295 0.536459 -0.0452059
+    0.566198 -0.604897 -0.444451 0.257742
+
+You can customize ```setRandom()``` by providing your own random number
+generator as a template argument:
+
+    a.setRandom<MyRandomGenerator>();
+
+Here, ```MyRandomGenerator``` must be a struct with the following member
+functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar```
+and ```<Tensor-Type>::Index```.
+
+See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example.
+
+    // Custom number generator for use with setRandom().
+    struct MyRandomGenerator {
+      // Default and copy constructors. Both are needed
+      MyRandomGenerator() { }
+      MyRandomGenerator(const MyRandomGenerator& ) { }
+
+      // Return a random value to be used. "element_location" is the
+      // location of the entry to set in the tensor, it can typically
+      // be ignored.
+      Scalar operator()(Eigen::DenseIndex element_location,
+                        Eigen::DenseIndex /*unused*/ = 0) const {
+        return <randomly generated value of type Scalar>;
+      }
+
+      // Same as above but generates several numbers at a time.
+      typename internal::packet_traits<Scalar>::type packetOp(
+          Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+        return <a packet of randomly generated values>;
+      }
+    };
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+* UniformRandomGenerator
+* NormalRandomGenerator
+
+
+## Data Access
+
+TODO
+
+    const Scalar& operator()(const array<Index, NumIndices>& indices)
+    const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator()(const array<Index, NumIndices>& indices)
+    Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator[](Index index)
+    ??? mention coeff() and coeffRef() ???
+
+### Scalar* data()
+### const Scalar* data() const
+
+Returns a pointer to the storage for the tensor. The pointer is const if the
+tensor was const. This allows direct access to the data. The layout of the
+data depends on the tensor layout: RowMajor or ColMajor.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    float* a_data = a.data();
+    a_data[0] = 123.45f;
+    cout << "a(0, 0): " << a(0, 0);
+    => a(0, 0): 123.45
+
+
+## Tensor Operations
+
+All the methods documented below return non-evaluated tensor ```Operations```.
+These can be chained: you can apply another Tensor Operation to the value
+returned by the method.
+
+The chain of Operations is evaluated lazily, typically when it is assigned to a
+tensor. See "Controlling When Expressions Are Evaluated" for more details about
+their evaluation.
+
+### <Operation> constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value ```val```.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+    Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    cout << "c" << endl << c << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    3 3 3
+    3 3 3
+
+    c
+    0.6 0.6 0.6
+    0.6 0.6 0.6
+
+### <Operation> random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
+
+This is useful, for example, to add random values to an existing tensor.
+The generation of random values can be customized in the same manner
+as for ```setRandom()```.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.random();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    1.68038 1.5662 1.82329
+    0.788766 1.59688 0.395103
+
+
+## Unary Element Wise Operations
+
+All these operations take a single input tensor as argument and return a tensor
+of the same type and dimensions as the tensor to which they are applied. The
+requested operations are applied to each element independently.
+
+### <Operation> operator-()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the opposite values of the original tensor.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = -a;
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    -1 -1 -1
+    -1 -1 -1
+
+### <Operation> sqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the square roots of the original tensor.
+
+### <Operation> rsqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse square roots of the original tensor.
+
+### <Operation> square()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the squares of the original tensor values.
+
+### <Operation> inverse()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse of the original tensor values.
+
+### <Operation> exp()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the exponential of the original tensor.
+
+### <Operation> log()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the natural logarithms of the original tensor.
+
+### <Operation> abs()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the absolute values of the original tensor.
+
+### <Operation> pow(Scalar exponent)
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the coefficients of the original tensor to the power of the
+exponent.
+
+The type of the exponent, Scalar, is always the same as the type of the
+tensor coefficients. For example, only integer exponents can be used in
+conjunction with tensors of integer values.
+
+You can use cast() to lift this restriction.
For example, this computes
+cubic roots of an int Tensor:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 8}, {27, 64, 125}});
+    Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0 1 8
+    27 64 125
+
+    b
+    0 1 2
+    3 4 5
+
+### <Operation> operator * (Scalar scale)
+TODO
+
+### <Operation> cwiseMax(Scalar threshold)
+TODO
+
+### <Operation> cwiseMin(Scalar threshold)
+TODO
+
+### <Operation> unaryExpr(const CustomUnaryOp& func)
+TODO
+
+
+## Binary Element Wise Operations
+
+These operations take two input tensors as arguments. The 2 input tensors should
+be of the same type and dimensions. The result is a tensor of the same
+dimensions as the tensors to which they are applied, and unless otherwise
+specified it is also of the same type. The requested operations are applied to
+each pair of elements independently.
+
+### <Operation> operator+(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise sums of the inputs.
+
+### <Operation> operator-(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise differences of the inputs.
+
+### <Operation> operator*(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise products of the inputs.
+
+### <Operation> operator/(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise quotients of the inputs.
+
+This operator is not supported for integer types.
+
+### <Operation> cwiseMax(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise maximums of the inputs.
+
+### <Operation> cwiseMin(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise minimums of the inputs.
+
+### <Operation> Logical operators
+
+The following logical operators are supported as well:
+
+* operator&&(const OtherDerived& other)
+
+* operator||(const OtherDerived& other)
+
+* operator<(const OtherDerived& other)
+
+* operator<=(const OtherDerived& other)
+
+* operator>(const OtherDerived& other)
+
+* operator>=(const OtherDerived& other)
+
+* operator==(const OtherDerived& other)
+
+* operator!=(const OtherDerived& other)
+
+They all return a tensor of boolean values.
+
+
+## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor))
+
+Selection is a coefficient-wise ternary operator that is the tensor equivalent
+to the if-then-else operation.
+
+    Tensor<bool, 3> if_tensor = ...;
+    Tensor<float, 3> then_tensor = ...;
+    Tensor<float, 3> else_tensor = ...;
+    Tensor<float, 3> result = if_tensor.select(then_tensor, else_tensor);
+
+The 3 arguments must be of the same dimensions, which will also be the dimension
+of the result. The 'if' tensor must be of type boolean, the 'then' and the
+'else' tensor must be of the same type, which will also be the type of the
+result.
+
+Each coefficient in the result is equal to the corresponding coefficient in the
+'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
+resulting coefficient will come from the 'else' tensor.
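+
+Here is a small worked sketch of ```select()``` (the values are made up for
+illustration; ```setValues()``` and ```setConstant()``` are described in the
+Contents Initialization section above):
+
+    Eigen::Tensor<bool, 1> cond(3);
+    cond.setValues({true, false, true});
+    Eigen::Tensor<int, 1> ones(3);
+    Eigen::Tensor<int, 1> zeros(3);
+    ones.setConstant(1);
+    zeros.setConstant(0);
+    // Picks from 'ones' where cond is true, from 'zeros' otherwise.
+    Eigen::Tensor<int, 1> result = cond.select(ones, zeros);
+    // result contains: 1 0 1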
+
+
+## Contractions
+
+TODO
+    contract(const OtherDerived& other, const Dimensions& dims)
+
+
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor. The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor. You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as ```maximum()``` and ```sum()``` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+```<TensorType>::Dimensions``` which can always be specified as an array of
+ints. These are called the "reduction dimensions." The values are the indices
+of the dimensions of the input tensor over which the reduction is done. The
+parameter can have at most as many elements as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+    // Create a tensor of 2 dimensions: 2, 3
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    // Reduce it along the second dimension (1)...
+    Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+    // ...using the "maximum" operator.
+    // The result is a tensor with one dimension. The size of
+    // that dimension is the same as the first (non-reduced) dimension of a.
+    Eigen::Tensor<int, 1> b = a.maximum(dims);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    6 5 4
+
+    b
+    3
+    6
+
+Example: Reduction along two dimensions.
+
+    Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // The tensor a has 3 dimensions. We reduce along the
+    // first 2, resulting in a tensor with a single dimension
+    // of size 4 (the last dimension of a.)
+    // Note that we pass the array of reduction dimensions
+    // directly to the maximum() call.
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+        a.maximum(Eigen::array<int, 2>({0, 1}));
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    20
+    21
+    22
+    23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions. The result is a
+one-dimensional tensor with a single value.
+
+    Eigen::Tensor<float, 3> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // Reduce along all dimensions using the sum() operator.
+    Eigen::Tensor<float, 1> b = a.sum();
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    276
+
+
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
+
+Reduce a tensor using the sum() operator.
The resulting values
+are the sum of the reduced values.
+
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
+
+Reduce a tensor using the mean() operator. The resulting values
+are the mean of the reduced values.
+
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
+
+Reduce a tensor using the maximum() operator. The resulting values are the
+largest of the reduced values.
+
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
+
+Reduce a tensor using the minimum() operator. The resulting values
+are the smallest of the reduced values.
+
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
+
+Reduce a tensor using the prod() operator. The resulting values
+are the product of the reduced values.
+
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
+
+Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
+in TensorFunctors.h for information on how to implement a reduction operator.
+
+
+## Convolutions
+
+TBD: convolve(const KernelDerived& kernel, const Dimensions& dims)
+
+
+## Geometrical Operations
+
+These operations return a Tensor with different dimensions than the original
+Tensor. They can be used to access slices of tensors, view them with different
+dimensions, or pad tensors with additional data.
+
+### <Operation> reshape(const Dimensions& new_dims)
+
+Returns a view of the input tensor that has been reshaped to the specified
+new dimensions. The argument new_dims is an array of Index values. The
+rank of the resulting tensor is equal to the number of elements in new_dims.
+
+The product of all the sizes in the new dimension array must be equal to
+the number of elements in the input tensor.
+
+    // Increase the rank of the input tensor by introducing a new dimension
+    // of size 1.
+    Tensor<float, 2> input(7, 11);
+    array<int, 3> three_dims{{7, 11, 1}};
+    Tensor<float, 3> result = input.reshape(three_dims);
+
+    // Decrease the rank of the input tensor by merging 2 dimensions.
+    array<int, 1> one_dim{{7 * 11}};
+    Tensor<float, 1> result = input.reshape(one_dim);
+
+This operation does not move any data in the input tensor, so the resulting
+contents of a reshaped Tensor depend on the data layout of the original Tensor.
+
+For example this is what happens when you ```reshape()``` a 2D ColMajor tensor
+to one dimension:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0
+    300
+    100
+    400
+    200
+    500
+
+This is what happens when the 2D Tensor is RowMajor:
+
+    Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0
+    100
+    200
+    300
+    400
+    500
+
+The reshape operation is an lvalue. In other words, it can be used on the left
+side of the assignment operator.
+
+The previous example can be rewritten as follows:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
+    b.reshape(two_dim) = a;
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0
+    300
+    100
+    400
+    200
+    500
+
+Note that "b" itself was not reshaped; instead, the assignment is done to the
+reshape view of b.
+
+
+### <Operation> shuffle(const Shuffle& shuffle)
+
+Returns a copy of the input tensor whose dimensions have been
+reordered according to the specified permutation. The argument shuffle
+is an array of Index values. Its size is the rank of the input
+tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
+dimension of the output tensor equals the size of the shuffle[i]-th
+dimension of the input tensor. For example:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output = input.shuffle({1, 2, 0});
+
+    eigen_assert(output.dimension(0) == 30);
+    eigen_assert(output.dimension(1) == 50);
+    eigen_assert(output.dimension(2) == 20);
+
+Indices into the output tensor are shuffled accordingly to formulate
+indices into the input tensor. For example, one can assert in the above
+code snippet that:
+
+    eigen_assert(output(3, 7, 11) == input(11, 3, 7));
+
+In general, one can assert that
+
+    eigen_assert(output(..., indices[shuffle[i]], ...) ==
+                 input(..., indices[i], ...))
+
+The shuffle operation results in an lvalue, which means that it can be assigned
+to. In other words, it can be used on the left side of the assignment operator.
+
+Let's rewrite the previous example to take advantage of this feature:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(30, 50, 20);
+    output.shuffle({2, 0, 1}) = input;
+
+
+### <Operation> stride(const Strides& strides)
+
+Returns a view of the input tensor that strides (skips stride-1
+elements) along each of the dimensions. The argument strides is an
+array of Index values. The dimensions of the resulting tensor are
+ceil(input_dimensions[i] / strides[i]).
+
+For example this is what happens when you ```stride()``` a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
+    Eigen::Tensor<int, 2> b = a.stride(strides);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0 200
+    900 1100
+
+It is possible to assign a tensor to a stride:
+
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(40, 90, 200);
+    output.stride({2, 3, 4}) = input;
+
+
+### <Operation> slice(const StartIndices& startIndices,
+                      const Sizes& sizes)
+
+TBD
+
+
+### <Operation> chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+    Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    cout << "row_3" << endl << row_3 << endl;
+    =>
+    row_3
+    600 700 800
+    cout << "col_2" << endl << col_2 << endl;
+    =>
+    col_2
+    100 400 700 1000
+
+It is possible to assign values to a tensor chip since the chip operation is an
+lvalue. For example:
+
+    Eigen::Tensor<int, 1> a(3);
+    a.setValues({100, 200, 300});
+    Eigen::Tensor<int, 2> b(2, 3);
+    b.setZero();
+    b.chip(0, 0) = a;
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    100
+    200
+    300
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    100 200 300
+    0 0 0
+
+
+### <Operation> reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions. The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions. This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you ```reverse()``` the first dimension
+of a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<bool, 2> reverse({true, false});
+    Eigen::Tensor<int, 2> b = a.reverse(reverse);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    b
+    900 1000 1100
+    600 700 800
+    300 400 500
+    0 100 200
+
+
+### <Operation> broadcast(const Broadcast& broadcast)
+
+TODO
+
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### <Operation> pad(const PaddingDimensions& padding)
+
+TODO
+
+### <Operation> extract_patches(const PatchDims& patch_dims)
+
+TODO
+
+### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols,
+                                      const Index row_stride, const Index col_stride,
+                                      const PaddingType padding_type)
+
+TODO
+
+
+## Special Operations
+
+### <Operation> cast<T>()
+
+Returns a tensor of type T with the same dimensions as the original tensor.
+The returned tensor contains the values of the original tensor converted to
+type T.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    Eigen::Tensor<int, 2> b = a.cast<int>();
+
+This can be useful for example if you need to do element-wise division of
+Tensors of integers. This is not currently supported by the Tensor library
+but you can easily cast the tensors to floats to do the division:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 2}, {3, 4, 5}});
+    Eigen::Tensor<int, 2> b =
+        (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+    b
+    0 0 1
+    1 2 2
+
+
+### <Operation> eval()
+
+TODO
+
+
+## Representation of scalar values
+
+Scalar values are often represented by tensors of size 1 and rank 1. It would be
+more logical and user friendly to use tensors of rank 0 instead. For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 1d tensor. In the
+future these operations might be updated to return 0d tensors instead.
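+
+In the meantime, here is a minimal sketch (under the current rank-1 convention
+just described) of how to read such a "scalar" result back; the
+```maximum()``` reduction is documented in the Reduction Operations section:
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    // Reducing along all dimensions yields a rank-1 tensor of size 1.
+    Eigen::Tensor<float, 1> m = a.maximum();
+    float value = m(0);  // extract the single value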
+ +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* TensorVarDims are only partially supported +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 70ca1433f..037219f23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // Copyright (C) 2013 Christian Seiler <christian@iwakd.de> // // This Source Code Form is subject to the terms of the Mozilla @@ -55,70 +56,46 @@ namespace Eigen { * change dramatically.</dd> * </dl> * - * \ref TopicStorageOrders + * \ref TopicStorageOrders */ -template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> -class Tensor; -namespace internal { template<typename Scalar_, std::size_t NumIndices_, int Options_> -struct traits<Tensor<Scalar_, NumIndices_, Options_>> +class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> > { - typedef Scalar_ Scalar; - typedef Dense StorageKind; - typedef DenseIndex Index; - enum { - Options = Options_ - }; -}; - -template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> -struct tensor_index_linearization_helper -{ - constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const& dimensions) - { - return std_array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + - std_array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * - tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); - } -}; - -template<typename Index, std::size_t NumIndices, bool RowMajor> -struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> -{ - constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const&) - { - return std_array_get<RowMajor ? 
0 : NumIndices - 1>(indices); - } -}; -} // end namespace internal - -template<typename Scalar_, std::size_t NumIndices_, int Options_> -class Tensor -{ - static_assert(NumIndices_ >= 1, "A tensor must have at least one index."); - public: typedef Tensor<Scalar_, NumIndices_, Options_> Self; + typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; typedef typename internal::traits<Self>::StorageKind StorageKind; typedef typename internal::traits<Self>::Index Index; - typedef typename internal::traits<Self>::Scalar Scalar; - typedef typename internal::packet_traits<Scalar>::type PacketScalar; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; typedef typename NumTraits<Scalar>::Real RealScalar; - typedef Self DenseType; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::PacketReturnType PacketReturnType; + + enum { + IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + }; - constexpr static int Options = Options_; - constexpr static std::size_t NumIndices = NumIndices_; + static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; + typedef DSizes<Index, NumIndices_> Dimensions; protected: TensorStorage<Scalar, NumIndices, Dynamic, Options> m_storage; public: - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE std::array<Index, NumIndices> dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); } - EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED // work, because that uses base().coeffRef() - and we don't yet @@ -126,146 +103,254 @@ class Tensor inline Self& base() { return *this; } inline const Self& base() const { return *this; } - void setZero() - { - // FIXME: until we have implemented packet access and the - // expression engine w.r.t. nullary ops, use this - // as a kludge. Only works with POD types, but for - // any standard usage, this shouldn't be a problem - memset((void *)data(), 0, size() * sizeof(Scalar)); - } - - inline Self& operator=(Self const& other) - { - m_storage = other.m_storage; - return *this; - } - +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeff(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline const Scalar& coeff(const std::array<Index, NumIndices>& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline const Scalar& coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeffRef(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline Scalar& coeffRef(const std::array<Index, NumIndices>& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return this->operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array<Index, 5>(i0, i1, i2, i3, i4)); } +#endif - inline const Scalar& operator()(const std::array<Index, NumIndices>& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const { eigen_assert(checkIndexRange(indices)); return coeff(indices); } - inline const Scalar& operator()(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return coeff(index); } - inline const Scalar& operator[](Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); return coeff(index); } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4)); } +#endif - inline Scalar& operator()(const std::array<Index, NumIndices>& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) { eigen_assert(checkIndexRange(indices)); return coeffRef(indices); } - inline Scalar& operator()(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { eigen_assert(index >= 0 && index < size()); return coeffRef(index); } - inline Scalar& operator[](Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) return coeffRef(index); } - inline Tensor() + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() : m_storage() { } - inline Tensor(const Self& other) - : m_storage(other.m_storage) - { - } - - inline Tensor(Self&& other) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) : m_storage(other.m_storage) { } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage() + : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}}) + { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#else
+    inline explicit Tensor(Index dim1)
+      : m_storage(dim1, array<Index, 1>(dim1))
+    {
+      EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    inline explicit Tensor(Index dim1, Index dim2)
+      : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
    {
-      static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to construct a tensor must be equal to the rank of the tensor.");
-      resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+      EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
    }
+    inline explicit Tensor(Index dim1, Index dim2, Index dim3)
+      : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
+    {
+      EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+      : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
+    {
+      EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+      : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
+    {
+      EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#endif

-    inline Tensor(std::array<Index, NumIndices> dimensions)
-      : m_storage(internal::array_prod(dimensions), dimensions)
+    inline explicit Tensor(const array<Index, NumIndices>& dimensions)
+      : m_storage(internal::array_prod(dimensions), dimensions)
    {
      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
    }

+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
+    {
+      typedef TensorAssignOp<Tensor, const Tensor> Assign;
+      Assign assign(*this, other);
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
    template<typename... IndexTypes> void resize(Index firstDimension, IndexTypes...
otherDimensions) { - static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to resize a tensor must be equal to the rank of the tensor."); - resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}}); + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}}); } +#endif - void resize(const std::array<Index, NumIndices>& dimensions) + EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions) { std::size_t i; Index size = Index(1); @@ -282,8 +367,17 @@ class Tensor #endif } + EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) { + array<Index, NumIndices> dims; + for (std::size_t i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + protected: - bool checkIndexRange(const std::array<Index, NumIndices>& indices) const + + bool checkIndexRange(const array<Index, NumIndices>& indices) const { using internal::array_apply_and_reduce; using internal::array_zip_and_reduce; @@ -298,16 +392,16 @@ class Tensor array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions()); } - inline Index linearizedIndex(const std::array<Index, NumIndices>& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const { - return internal::tensor_index_linearization_helper<Index, NumIndices, NumIndices - 1, Options&RowMajor>::run(indices, m_storage.dimensions()); + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } } }; } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 000000000..a4f73b2a1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,164 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression.
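+ *
+ * A TensorAssignOp is not meant to be created directly in user code: it is built
+ * internally, e.g. by Tensor::operator= above. A minimal sketch of the user-level
+ * expression that ends up wrapped in one:
+ * \code
+ * Tensor<float, 2> a(3, 4), b(3, 4);
+ * b.setRandom();
+ * a = b * 2.0f; // wrapped in a TensorAssignOp, run by internal::TensorExecutor
+ * \endcode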
+ */ +namespace internal { +template<typename LhsXprType, typename RhsXprType> +struct traits<TensorAssignOp<LhsXprType, RhsXprType> > +{ + typedef typename LhsXprType::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename traits<LhsXprType>::StorageKind StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions; + static const int Layout = internal::traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename LhsXprType, typename RhsXprType> +struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorAssignOp<LhsXprType, RhsXprType>& type; +}; + +template<typename LhsXprType, typename RhsXprType> +struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> +{ + typedef TensorAssignOp<LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename LhsXprType, typename RhsXprType> +class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorAssignOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename LhsXprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested; + typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr; + const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr; +}; + + +template<typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> +{ + typedef TensorAssignOp<LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == 
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + // The dimensions of the lhs and the rhs tensors should be equal to prevent + // overflows and ensure the result is fully initialized. + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet<LoadMode>(index); + } + + private: + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + +} + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 000000000..e08ac6aa1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,573 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangeably in expressions.
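+ *
+ * A short sketch of the expression interface (assuming a and b are float tensors
+ * with matching dimensions):
+ * \code
+ * Tensor<float, 2> c = (a + b).sqrt(); // lazy cwise expression, evaluated on assignment
+ * \endcode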
+ */ + +template<typename Derived> +class TensorBase<Derived, ReadOnlyAccessors> +{ + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const<Scalar>::type CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived> + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op<Scalar>(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator<Scalar>()); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived> + random() const { + return nullaryExpr(RandomGenerator()); + } + + // Generic unary operation support. + template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived> + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> + pow(Scalar exponent) const { + return 
unaryExpr(internal::scalar_pow_op<Scalar>(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::scalar_add_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((std::numeric_limits<Scalar>::is_signed || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_sub_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> + operator/ (Scalar rhs) const { + // EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template <typename NewType> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> + cast() const { + return unaryExpr(internal::scalar_cast_op<Scalar, NewType>()); + } + + // Generic binary operation support. + template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func); + } + + // Coefficient-wise binary operators. 
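+    // A sketch of how these compose with the nullary operators above, assuming
+    // a and b have matching dimensions:
+    //   c = a * b + a.constant(1.0f); // coefficient-wise multiply-add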
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived> + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived> + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + // Comparisons and tests. 
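+    // These yield boolean-valued expressions, typically consumed by select()
+    // below; e.g. (a > b).select(a, b) picks the larger of each coefficient pair.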
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less_equal<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater_equal<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::equal_to<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::not_equal_to<Scalar>()); + } + + // Coefficient-wise ternary operators. + template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived> + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair<Index> DimensionPair; + + template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp<const Dimensions, const Derived, const OtherDerived> + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims); + } + + // Convolutions. + template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived> + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims); + } + + // Reductions. 
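+    // A reduction collapses the listed dimensions: reducing a rank-3 tensor over
+    // one dimension yields a rank-2 expression. A sketch:
+    //   array<Index, 1> dims{{1}};
+    //   Tensor<float, 2> s = t.sum(dims); // t is a rank-3 tensor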
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + sum() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + mean() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + prod() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + maximum() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + minimum() const { + 
array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>()); + } + + template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<Reducer, const Dims, const Derived> + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer); + } + + template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp<const Broadcast, const Derived> + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast); + } + + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp<Axis, const Derived, const OtherDerived> + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis); + } + + template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp<const PatchDims, const Derived> + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches() const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, PADDING_SAME); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, padding_type); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const Index stride, const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, stride, stride, padding_type); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride = 1, const Index col_stride = 1) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + PADDING_SAME); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + padding_type); + } + + // Morphing operators. 
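+    // Morphing operators reinterpret or rearrange the coefficients lazily, without
+    // copying them. A sketch, viewing 12 coefficients as a 2x6 tensor:
+    //   array<Index, 2> shape{{2, 6}};
+    //   auto r = t.reshape(shape);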
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp<const Derived> + swap_layout() const { + return TensorLayoutSwapOp<const Derived>(derived()); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp<const NewDimensions, const Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp<const StartIndices, const Sizes, const Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); + } + template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<DimId, const Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<Dynamic, const Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); + } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp<const ReverseDimensions, const Derived> + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); + } + template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp<const PaddingDimensions, const Derived> + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp<const Shuffle, const Derived> + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp<const Strides, const Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, const Derived>(derived(), strides); + } + + // Force the evaluation of the expression. 
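+    // eval() materializes the subexpression into a temporary buffer, which can pay
+    // off when the same subexpression is reused several times, e.g. (sketch):
+    //   auto e = (a * b).eval();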
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp<const Derived> eval() const { + return TensorForcedEvalOp<const Derived>(derived()); + } + + protected: + template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor; + template <typename Scalar, int Options> friend class TensorVarDim; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +template<typename Derived> +class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> { + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<Scalar>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor; + template <typename Scalar, int Options> friend class TensorVarDim; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random<RandomGenerator>(); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) { + TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice()); + internal::initialize_tensor<Derived, NumDimensions>(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp<Derived> + swap_layout() const { + return TensorLayoutSwapOp<Derived>(derived()); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp<const NewDimensions, Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp<const StartIndices, const Sizes, Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, 
sizes); + } + template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<DimId, Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, Derived>(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<Dynamic, Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp<const Shuffle, Derived> + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp<const Strides, Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, Derived>(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template <typename DeviceType> + TensorDevice<Derived, DeviceType> device(const DeviceType& device) { + return TensorDevice<Derived, DeviceType>(device, derived()); + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 000000000..5790e19d6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,341 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. 
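+ *
+ * Each entry of the broadcast factors gives the number of times the corresponding
+ * input dimension is replicated. A minimal sketch:
+ * \code
+ * Tensor<float, 2> t(2, 3);
+ * array<int, 2> bcast{{1, 4}};
+ * Tensor<float, 2> r = t.broadcast(bcast); // r has dimensions 2 x 12
+ * \endcode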
+ * + * + */ +namespace internal { +template<typename Broadcast, typename XprType> +struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Broadcast, typename XprType> +struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> +{ + typedef const TensorBroadcastingOp<Broadcast, XprType>& type; +}; + +template<typename Broadcast, typename XprType> +struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type> +{ + typedef TensorBroadcastingOp<Broadcast, XprType> type; +}; + +} // end namespace internal + + + +template<typename Broadcast, typename XprType> +class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template<typename Broadcast, typename ArgType, typename Device> +struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> +{ + typedef TensorBroadcastingOp<Broadcast, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 
1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_inputStrides[NumDims-1] = 1; + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + + // TODO: attempt to speed this up. The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return packetColMajor<LoadMode>(index); + } else { + return packetRowMajor<LoadMode>(index); + } + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at 
compile time. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq<Broadcast>()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq<InputDimensions>()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + return m_impl.template packet<Unaligned>(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeffColMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims-1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on.
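+    // If the whole packet fits inside the innermost input dimension it can be
+    // loaded directly; otherwise the coefficients straddle a broadcast boundary
+    // and are gathered one at a time through coeffRowMajor().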
+ if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + return m_impl.template packet<Unaligned>(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeffRowMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 000000000..dc9586cbc --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,363 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorChippingOp + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template<DenseIndex DimId, typename XprType> +struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex DimId, typename XprType> +struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense> +{ + typedef const TensorChippingOp<DimId, XprType>& type; +}; + +template<DenseIndex DimId, typename XprType> +struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type> +{ + typedef TensorChippingOp<DimId, XprType> type; +}; + +template <DenseIndex DimId> +struct DimensionId +{ + DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId<Dynamic> +{ + DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + +} // end namespace internal + + + +template<DenseIndex DimId, typename XprType> +class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorChippingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign; + Assign assign(*this, other); + static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess; + internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign; + Assign assign(*this, other); + static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess; + internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId<DimId> m_dim; +}; + + +// Eval as rvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + { + // We could also support the case where NumInputDims==1 if needed. 
+ EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet<LoadMode>(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + packetSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet<LoadMode>(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path.
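+        // The packet spans two blocks of the chipped input, so the coefficients
+        // are fetched one by one via coeff(), which goes through srcCoeff().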
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (m_dim.actualDim() == NumDims && result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator<ArgType, Device> m_impl; + const internal::DimensionId<DimId> m_dim; + const Device& m_device; +}; + + +// Eval as lvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> + : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base; + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < packetSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + packetSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket<StoreMode>(inputIndex, x); + } else { + // Cross the stride boundary. Fall back to the slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 000000000..a1dec76d1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,258 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
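// [Editorial note, not part of the original patch] The promoted Scalar above is what
// mixed-operand concatenations evaluate to; with same-typed operands the promotion is
// a no-op. Assuming the TensorBase::concatenate() hook that accompanies this file,
// usage looks like:
//   Eigen::Tensor<float, 2> a(2, 3), b(3, 3);
//   Eigen::Tensor<float, 2> c = a.concatenate(b, 0);  // dimensions (5, 3)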
+ typedef typename promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<LhsXprType>::NumDimensions; + static const int Layout = traits<LhsXprType>::Layout; + enum { Flags = 0 }; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type> +{ + typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + +template<typename Axis, typename LhsXprType, typename RhsXprType> +class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> +{ + public: + typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar; + typedef typename internal::traits<TensorConcatenationOp>::Packet Packet; + typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorConcatenationOp>::Index Index; + typedef typename internal::nested<TensorConcatenationOp>::type Nested; + typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType, + typename RhsXprType::PacketReturnType>::ret PacketReturnType; + typedef typename NumTraits<Scalar>::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC Axis axis() const { return m_axis; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> +{ + typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; + static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; 
+ typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int i = 1; i < NumDims; ++i) { + m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1]; + m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int i = NumDims - 2; i >= 0; --i) { + m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1]; + m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). 
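// [Editorial walk-through, not part of the original patch] Concatenating a 2x3 lhs
// with a 3x3 rhs along axis 0 (ColMajor) gives a 5x3 output with m_outputStrides =
// {1, 5}. For index 8 the decomposition below yields subs = {3, 1}; since
// subs[0] = 3 >= 2 the coefficient comes from the rhs at subs[0] = 3 - 2 = 1, and
// right_index = 1 + (1 % 3) * 3 = 4, i.e. rhs(1, 1).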
+ array<Index, NumDims> subs; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_leftStrides; + array<Index, NumDims> m_rightStrides; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; + const Axis m_axis; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 000000000..f7254a24d --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,992 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. 
+ */ +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, bool inner_dim_contiguous> +class BaseTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair<Index>(linidx[0], linidx[1]); + } + + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper; + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionSubMapper { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template <typename PacketT, int AlignmentType> + EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + 
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template <typename Packet> + bool aligned(Index /*i*/) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size = (Tensor::PacketAccess ? packet_traits<Scalar>::size : 1), + bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned> +class TensorContractionInputMapper + : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> { + + public: + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> Base; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet<Alignment>(index); + } + + const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
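// [Editorial note, not part of the original patch] computeIndexPair(i, j,
// packet_size - 1) returns the source offsets of rows i and i + packet_size - 1;
// they differ by exactly packet_size - 1 precisely when the whole strip is
// contiguous in memory, which is the condition checked below before issuing a
// single vectorized load.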
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet<Alignment>(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload<Packet>(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits<HalfPacket>::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload<HalfPacket>(data); + } +}; + + + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> { + + public: + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> Base; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload<typename packet_traits<Scalar>::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + + +template <size_t n> struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + + // From NumDims below. + static const int NumDimensions = max_n_1<traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size; + static const int Layout = traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type> +{ + typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type; +}; + +template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_> +struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; + + // From NumDims below. 
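// [Editorial note, not part of the original patch] For example, contracting one
// index pair between a rank-3 and a rank-2 tensor leaves 3 + 2 - 2*1 = 3 output
// dimensions. max_n_1 only matters for a full contraction, where the raw count
// would be 0; the result is then represented as a 1d tensor of size 1 (the scalar
// case handled in the evaluator below).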
+ static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size; +}; + +} // end namespace internal + +template<typename Indices, typename LhsXprType, typename RhsXprType> +class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorContractionOp>::Packet Packet; + typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType, + typename RhsXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template<bool cond> struct Cond {}; + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond<true>, const T1& first, const T2&) { + return first; +} + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond<false>, const T1&, const T2& second) { + return second; +} + + +template<typename Derived> +struct TensorContractionEvaluatorBase +{ + typedef typename internal::traits<Derived>::Indices Indices; + typedef typename internal::traits<Derived>::LeftArgType LeftArgType; + typedef typename internal::traits<Derived>::RightArgType RightArgType; + typedef typename internal::traits<Derived>::Device Device; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
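// [Editorial note, not part of the original patch] The swap is sound because a
// RowMajor result is the transpose of the ColMajor one and (A*B)^T = B^T * A^T:
// evaluating B^T * A^T with the ColMajor machinery yields exactly the RowMajor C.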
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + typedef DSizes<Index, NumDims> Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == + static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert((internal::array_size<contract_t>::value > 0) && "Must contract on some indices"); + + + DSizes<Index, LDims> eval_left_dims; + DSizes<Index, RDims> eval_right_dims; + array<IndexPair<Index>, ContractDims> eval_op_indices; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
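// [Editorial illustration, not part of the original patch] A user pair
// (first, second) = (a, b) becomes (LDims - 1 - b, RDims - 1 - a) below: the
// .first/.second swap mirrors the LHS/RHS swap above, and the "dim - 1 - x"
// reindexes into the reversed dimension order.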
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + } + } + + array<Index, LDims> lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + + array<Index, RDims> rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; + } + + m_i_strides[0] = 1; + m_j_strides[0] = 1; + m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) { + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) { + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); + + if (i+1 < internal::array_size<contract_t>::value) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // Scalar case. 
We represent the result as a 1d tensor of size 1. + if (LDims + RDims == 2 * ContractDims) { + m_dimensions[0] = 1; + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + std::swap(m_dimensions[i], m_dimensions[j]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size 
at least rows * sizeof(Scalar)) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<Packet, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorEvaluator<EvalLeftArgType, Device> m_leftImpl; + TensorEvaluator<EvalRightArgType, Device> m_rightImpl; + const Device& m_device; + Scalar* m_result; +}; + + +// evaluator for default device +template<typename Indices, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> : + public TensorContractionEvaluatorBase< + TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > { + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + // Could we use NumDimensions here? + typedef DSizes<Index, NumDims> Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; + 
internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; + + internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType; + + // Sizes of the blocks to load in cache. See the Goto paper for details. + BlockingType blocking(m, n, k, 1, true); + const Index kc = blocking.kc(); + const Index mc = (std::min)(m, blocking.mc()); + const Index nc = (std::min)(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2<m; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,m)-i2; + for (Index k2 = 0; k2 < k; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = (std::min)(k2 + kc, k) - k2; + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = (std::min)(j2 + nc, n) - j2; + pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); + } + } + } + + this->m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 000000000..588770bb4 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1384 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com> +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
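[Editorial aside, not part of the patch] The evalGemm loop nest above follows the classic Goto blocking scheme: pack a kc-deep panel of the left operand once, then sweep it against successive right-hand blocks with the gebp micro-kernel. Below is a minimal standalone sketch of the same loop structure in plain C++, with a naive inner kernel standing in for gebp and the packing elided; mc/kc/nc play the roles of blocking.mc()/kc()/nc() above, and column-major storage is assumed.

#include <algorithm>
#include <vector>

// Column-major C(m x n) += A(m x k) * B(k x n), blocked as in evalGemm above:
// i2 walks row panels, k2 walks the contraction depth, j2 walks column blocks.
void blocked_gemm(const std::vector<double>& A, const std::vector<double>& B,
                  std::vector<double>& C, int m, int n, int k,
                  int mc, int kc, int nc) {
  for (int i2 = 0; i2 < m; i2 += mc) {
    const int actual_mc = std::min(i2 + mc, m) - i2;
    for (int k2 = 0; k2 < k; k2 += kc) {
      const int actual_kc = std::min(k2 + kc, k) - k2;   // pack_lhs would copy this panel
      for (int j2 = 0; j2 < n; j2 += nc) {
        const int actual_nc = std::min(j2 + nc, n) - j2; // pack_rhs + gebp per block
        for (int j = 0; j < actual_nc; ++j)
          for (int p = 0; p < actual_kc; ++p)
            for (int i = 0; i < actual_mc; ++i)
              C[(i2 + i) + (j2 + j) * m] +=
                  A[(i2 + i) + (k2 + p) * m] * B[(k2 + p) + (j2 + j) * k];
      }
    }
  }
}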
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool needs_edge_check> +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code.) 
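// [Editorial note, not part of the original patch] The 576 strides above are
// 72 * 8: each padded block holds 72 scalars (LHS: 8 rows of 9; RHS: 64 elements
// plus 8 trailing pads), and eight such blocks are written per prefetch step,
// one per unrolled k value.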
+ + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = Scalar(0); \ + lhs_pf1 = Scalar(0); \ + lhs_pf2 = Scalar(0); \ + lhs_pf3 = Scalar(0); \ + lhs_pf4 = Scalar(0); \ + lhs_pf5 = Scalar(0); \ + lhs_pf6 = Scalar(0); \ + lhs_pf7 = Scalar(0); \ + \ + rhs_pf0 = Scalar(0); \ + rhs_pf1 = Scalar(0); \ + rhs_pf2 = Scalar(0); \ + rhs_pf3 = Scalar(0); \ + rhs_pf4 = Scalar(0); \ + rhs_pf5 = Scalar(0); \ + rhs_pf6 = Scalar(0); \ + rhs_pf7 = Scalar(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if 
(rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense, + // the code is a bit faster with this here than at the bottom of the loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (i.e. width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation.
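The reduction announced above is a warp-shuffle butterfly. As a standalone sketch (illustration only, assuming the pre-CUDA-9 __shfl_xor intrinsic that this file already relies on), summing one value across an 8-lane group looks like:

__device__ float butterfly_sum8(float v) {
  v += __shfl_xor(v, 1); // exchange with the lane whose id differs in bit 0
  v += __shfl_xor(v, 2); // same for bit 1; groups of 4 now agree
  v += __shfl_xor(v, 4); // same for bit 2; all 8 lanes hold the full sum
  return v;
}

Applying this with masks 1, 2 and 4 to each of the 64 accumulators is exactly what the reduceMatrix(1); reduceMatrix(2); reduceMatrix(4); sequence below does.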
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction; now each thread of index (_, y, z) + // contains in its registers the values that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There are two ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can I trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(512) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + 
reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1<float4>(0); + rhs_pf0 = internal::pset1<float4>(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... 
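The shuffle juggling above is a two-element exchange between lanes 4 apart. As an isolated sketch (illustration only; the helper name is hypothetical):

__device__ float4 exchange_halves(float4 v, int lane_in_8) {
  // Lower lanes give away .y/.w and receive the partner's .x/.z (and vice
  // versa), so each float2 later written to rhs_shmem2 pairs one element
  // from each lane of the exchange.
  float a = (lane_in_8 < 4) ? v.y : v.x;
  float b = (lane_in_8 < 4) ? v.w : v.z;
  a = __shfl_xor(a, 4);
  b = __shfl_xor(b, 4);
  if (lane_in_8 < 4) { v.y = a; v.w = b; } else { v.x = a; v.z = b; }
  return v;
}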
+ + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1<float4>(0); + lhs_pf1 = internal::pset1<float4>(0); + lhs_pf2 = internal::pset1<float4>(0); + lhs_pf3 = internal::pset1<float4>(0); + + rhs_pf0 = internal::pset1<float4>(0); + rhs_pf1 = internal::pset1<float4>(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + 
lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = 
rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + bool check_lhs64 = (base_m + 63) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, 
rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, true, true, Unaligned>(buffer); + } + 
else { + evalTyped<true, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, false, true, Unaligned>(buffer); + } + else { + evalTyped<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, true, true, Unaligned>(buffer); + } + else { + evalTyped<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, false, true, Unaligned>(buffer); + } + else { + evalTyped<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, 4, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, 4, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same<LhsScalar, float>::value && + internal::is_same<RhsScalar, float>::value) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } else { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 000000000..8b87f1045 --- 
/dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,382 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { +namespace internal { + +template<typename LhsScalar, typename LhsMapper, typename Index> +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index> +struct packRhsAndKernelArg { + const std::vector<LhsScalar*>* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + std::vector<Promise>* kernel_promises; + const std::vector<Future>* lhs_futures; + const bool need_to_pack; +}; + +} // end namespace internal + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
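For reference, the identity that makes this swap sound: a row-major array read through column-major strides is the transpose of the intended matrix, and

  C = A * B   is equivalent to   C^T = B^T * A^T,

so running the column-major code path on the swapped operands produces C^T in memory, which, reinterpreted back as row-major, is exactly the C we wanted.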
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> LhsPacker; + typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> 
RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> GebpKernel; + + typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg; + typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + LhsPacker pack_lhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned, segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = (std::min)(num_threads, m_blocks); + std::vector<LhsScalar *> blockAs; + blockAs.reserve(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. 
con: overhead + std::vector<RhsScalar *> blockBs; + blockBs.reserve(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_futures starts with all null futures + std::vector<Future> lhs_futures(num_threads); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_promises = num_threads * n_blocks; + std::vector<Promise> kernel_promises(num_kernel_promises); + std::vector<Future> kernel_futures(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + kernel_futures[i] = kernel_promises[i].get_future(); + } + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + for (int i = 0; i < n_blocks; ++i) { + int future_id = (blockAId * n_blocks + i); + wait_until_ready(&kernel_futures[future_id]); + kernel_promises[future_id] = Promise(); + kernel_futures[future_id] = kernel_promises[future_id].get_future(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + lhs_futures[blockAId] = + this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + int blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + int future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(&kernel_futures[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_promises, // kernel_promises + &lhs_futures, // lhs_futures + need_to_pack, // need_to_pack + }; + + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg); + } + } + } + + // Make sure all the kernels are done. 
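The loop above recycles promise/future pairs as reusable completion slots: every slot starts out satisfied, a consumer waits on the old future before installing a fresh pair, and the producer signals by fulfilling the promise. A self-contained sketch of the pattern (illustration only, assuming Promise and Future are std::promise<void> and std::future<void>, which is what the standard thread-pool device uses):

#include <cstddef>
#include <future>
#include <vector>

int main() {
  std::vector<std::promise<void> > promises(4);
  std::vector<std::future<void> > futures(4);
  for (std::size_t i = 0; i < promises.size(); ++i) {
    promises[i].set_value();                // mark every slot "done" initially
    futures[i] = promises[i].get_future();
  }
  futures[0].wait();                        // returns at once: already satisfied
  promises[0] = std::promise<void>();       // recycle the slot for a new round
  futures[0] = promises[0].get_future();
  // ... hand promises[0] to a worker; it calls set_value() when finished ...
  promises[0].set_value();
  futures[0].wait();                        // consumer blocks until the signal
  return 0;
}

The wait loop that follows is the shutdown case of the same pattern: block on every kernel future before tearing the block buffers down.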
+ for (int i = 0; i < kernel_futures.size(); ++i) { + wait_until_ready(&kernel_futures[i]); + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + +#undef CEIL_DIV + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. + */ + template <typename packLArg, typename LhsPacker> + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template <typename packRKArg, typename RhsPacker, typename GebpKernel> + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + + wait_until_ready(&(*arg.lhs_futures)[blockAId]); + const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0); + + const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + (*arg.kernel_promises)[set_idx].set_value(); + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 000000000..591fd2464 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,912 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
+ * + * + */ +namespace internal { + + +template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, + const array<Index, NumKernelDims>& indices) { + + array<Index, NumDims> dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array<Index, NumDims> inputStrides; + array<Index, NumDims> outputStrides; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } else { + inputStrides[0] = 1; + outputStrides[0] = 1; + } + } + + array<Index, NumDims> cudaInputDimensions; + array<Index, NumDims> cudaOutputDimensions; + array<Index, NumDims> tmp = dimensions; + array<Index, NumDims> ordering; + for (int i = 0; i < NumKernelDims; ++i) { + ordering[i] = indices[i]; + tmp[indices[i]] = -1; + cudaInputDimensions[i] = input_dims[ordering[i]]; + cudaOutputDimensions[i] = dimensions[ordering[i]]; + } + int written = NumKernelDims; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1]; + m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + return i * m_inputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + return i * m_outputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + return i * m_inputStrides[0] + j*m_inputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + return i * m_outputStrides[0] + j * m_outputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + return i * m_inputStrides[0] + j*m_inputStrides[1] + 
k*m_inputStrides[2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2]; + } + + private: + static const size_t NumDims = internal::array_size<InputDims>::value; + array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_cudaInputStrides; + array<Index, NumDims> m_cudaOutputStrides; +}; + + + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename InputXprType::Scalar, + typename KernelXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, + typename traits<KernelXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<InputXprType>::Index, + typename traits<KernelXprType>::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<InputXprType>::NumDimensions; + static const int Layout = traits<InputXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> +{ + typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> +{ + typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; +}; + +} // end namespace internal + + + +template<typename Indices, typename InputXprType, typename KernelXprType> +class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType, + typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType, + typename KernelXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename InputXprType::Nested>::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename KernelXprType::Nested>::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, + Layout = TensorEvaluator<InputArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. 
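+    // The dimension arithmetic below implements a "valid" (un-padded)
+    // convolution: result_dim = input_dim - kernel_dim + 1, e.g. an input
+    // dimension of 20 convolved with a kernel dimension of 5 yields 16.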
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; + } + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1<PacketReturnType>(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_DEFAULT Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload<PacketReturnType>(data); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + 
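+    // Worked example (column-major): with output dimensions (16, 24, 3) the
+    // output strides are (1, 16, 384), so index 500 = 1*384 + 7*16 + 4
+    // decomposes into coordinates (4, 7, 1). The loop below performs this
+    // decomposition and re-linearizes the coordinates with the input strides
+    // to locate the first input coefficient of the convolution window.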
for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template <typename Packet> + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); + } + } + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array<Index, NumDims> m_inputStride; + array<Index, NumDims> m_outputStride; + + array<Index, NumKernelDims> m_indexStride; + array<Index, NumKernelDims> m_kernelStride; + TensorEvaluator<InputArgType, Device> m_inputImpl; + TensorEvaluator<KernelArgType, Device> m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template <int StaticKernelSize> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize<Dynamic> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + + + + +template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize> +__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + + +template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY> +__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + + +template <typename InputEvaluator, typename Index, typename InputDims> +__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template<typename Indices, typename InputArgType, typename KernelArgType> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. 
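+    // Note on the kernels above: per convolved dimension, a block computing
+    // N outputs must stage N + kernel_size - 1 inputs in shared memory; e.g.
+    // maxX = 32 outputs with kernelSizeX = 5 loads 36 inputs, each of which
+    // is reused by up to 5 adjacent output positions.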
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor<const EvalTo, GpuDevice, TensorEvaluator<KernelArgType, GpuDevice>::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; + + const int maxSharedMem = sharedMemPerBlock(); + const int maxThreadsPerBlock = maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + + int maxX; + dim3 block_size; + if (m_indices[0] == 0) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = (std::min<int>)(inner_dim, numX); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = (std::min)(maxThreadsPerBlock, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside 
the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = (std::min<int>)(inner_dim, numP); + maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = (std::min)(warpSize, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, min<int>(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 1> indices(m_indices[0]); + const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = (std::min<int>)(inner_dim, numX); + const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = (std::min)(1024, maxX); + block_size.y = (std::min<int>)(1024/block_size.x, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks 
= ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, min<int>(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 2> indices(m_indices[0], m_indices[1]); + const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]); + internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + const int kernel_size_z = m_kernelImpl.dimensions()[2]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numZ = dimensions()[m_indices[2]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = (std::min)(32, maxX); + block_size.y = (std::min)(32, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem 
= (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]); + const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); + internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + assert(false && "not supported yet"); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; + TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 000000000..649bdb308 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: thread pools. + * Todo: operator +=, -=, *= and so on. 
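+ *
+ * The thread-pool case is covered by the ThreadPoolDevice specialization
+ * below; a usage sketch (with EIGEN_USE_THREADS defined, assuming a pool
+ * backed by four cores):
+ *
+ *   Eigen::ThreadPoolDevice my_pool(4);
+ *   C.device(my_pool) = A + B;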
+ */ + +template <typename ExpressionType, typename DeviceType> class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; + internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; + internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + + +#ifdef EIGEN_USE_THREADS +template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> { + public: + TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; + internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; + internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> +{ + public: + TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum 
sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + return *this; + } + + protected: + const GpuDevice& m_device; + ExpressionType m_expression; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h new file mode 100644 index 000000000..efd207507 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -0,0 +1,190 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return 1; + } +}; + + +// Multiple cpu cores +// We should really use a thread pool here but first we need to find a portable thread pool library. +#ifdef EIGEN_USE_THREADS + +typedef std::future<void> Future; +typedef std::promise<void> Promise; + +static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) { + f->wait(); +} +static EIGEN_STRONG_INLINE void get_when_ready(Future* f) { + f->get(); +} + + +struct ThreadPoolDevice { + ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const { + return std::async(std::launch::async, f, args...); + } + template <class Function, class... Args> + EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... 
args) const { + std::async(std::launch::async, f, args...); + } + + private: + size_t num_threads_; +}; + +#endif + + +// GPU offloading +#ifdef EIGEN_USE_GPU +static cudaDeviceProp m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess); + m_devicePropInitialized = true; + } +} + +static inline int getNumCudaMultiProcessors() { + initializeDeviceProp(); + return m_deviceProperties.multiProcessorCount; +} +static inline int maxCudaThreadsPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerBlock; +} +static inline int maxCudaThreadsPerMultiProcessor() { + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerMultiProcessor; +} +static inline int sharedMemPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.sharedMemPerBlock; +} + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + assert(status == cudaSuccess); +} + +struct GpuDevice { + // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. + GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } + + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { +#ifndef __CUDA_ARCH__ + void* result; + assert(cudaMalloc(&result, num_bytes) == cudaSuccess); + assert(result != NULL); + return result; +#else + assert(false && "The default device should be used instead to generate kernel code"); + return NULL; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { +#ifndef __CUDA_ARCH__ + assert(buffer != NULL); + assert(cudaFree(buffer) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + cudaStreamSynchronize(*stream_); + } + + private: + // TODO: multigpu. + const cudaStream_t* stream_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) 
\ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); + +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 000000000..2ad52b2f9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,380 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Can't use std::pair on cuda devices +template <typename Index> struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; + +// Boilerplate code +namespace internal { + +template<std::size_t n, typename Dimension> struct dget { + static const std::size_t value = get<n, Dimension>::value; +}; + + +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + dget<RowMajor ? n : (NumIndices - n - 1), Dimensions>::value * + fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions&) + { + return array_get<RowMajor ? 0 : NumIndices - 1>(indices); + } +}; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... 
Indices> +struct Sizes : internal::numeric_list<std::size_t, Indices...> { + typedef internal::numeric_list<std::size_t, Indices...> Base; + static const std::size_t total_size = internal::arg_prod(Indices...); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { + return internal::arg_prod(Indices...); + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex...) { } + explicit Sizes(std::initializer_list<std::size_t> /*l*/) { + // todo: add assertion + } +#endif + + template <typename T> Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + +template <typename std::size_t... Indices> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) { + return Sizes<Indices...>::total_size; +} + +#else + +template <std::size_t n> +struct non_zero_size { + typedef internal::type2val<std::size_t, n> type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes { + typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod<Base>::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + return internal::arg_prod<Base>::value; + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& indices) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex... 
indices) { }
+  explicit Sizes(std::initializer_list<std::size_t> l) {
+    // todo: add assertion
+  }
+#else
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+  }
+#endif
+
+  template <typename T> Sizes& operator = (const T& other) {
+    // todo: check the size of other
+    return *this;
+  }
+
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
+  }
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
+  }
+};
+
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+  return Sizes<V1, V2, V3, V4, V5>::total_size;
+}
+
+#endif
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct tensor_index_linearization_helper
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
+  {
+    return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+        array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+        tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+  }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
+  {
+    return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+  }
+};
+} // end namespace internal
+
+
+
+// Dynamic size
+template <typename DenseIndex, std::size_t NumDims>
+struct DSizes : array<DenseIndex, NumDims> {
+  typedef array<DenseIndex, NumDims> Base;
+  static const std::size_t count = NumDims;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+    return NumDims;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
+    return internal::array_prod(*static_cast<const Base*>(this));
+  }
+
+  EIGEN_DEVICE_FUNC DSizes() {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = 0;
+    }
+  }
+  EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+  template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes...
otherDimensions) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}}; + } +#else + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } +#endif + + EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) { + *static_cast<Base*>(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + + + + +// Boilerplate +namespace internal { +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * + tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&) + { + return array_get<RowMajor ? 0 : NumIndices - 1>(indices); + } +}; +} // end namespace internal + + +namespace internal { + +template <typename DenseIndex, std::size_t NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +template <typename DenseIndex, std::size_t NumDims> struct array_size<DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <std::size_t n, typename std::size_t... 
Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<Indices...>&) { + return get<n, internal::numeric_list<std::size_t, Indices...> >::value; +} +#else +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) { + return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value; +} + +#endif + + +template <typename Dims1, typename Dims2, size_t n> +struct sizes_match_up_to_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<n>(dims1) == array_get<n>(dims2)) & + sizes_match_up_to_dim<Dims1, Dims2, n-1>::run(dims1, dims2); + } +}; +template <typename Dims1, typename Dims2> +struct sizes_match_up_to_dim<Dims1, Dims2, 0> { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<0>(dims1) == array_get<0>(dims2)); + } +}; + +} // end namespace internal + + +template <typename Dims1, typename Dims2> +bool dimensions_match(Dims1& dims1, Dims2& dims2) { + if (internal::array_size<Dims1>::value != internal::array_size<Dims2>::value) { + return false; + } + return internal::sizes_match_up_to_dim<Dims1, Dims2, internal::array_size<Dims1>::value-1>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 000000000..883e6cab1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorEvalToOp + * \ingroup CXX11_Tensor_Module + * + * \brief Expression node that evaluates its argument into a user-provided buffer. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorEvalToOp<XprType> > +{
+ typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorEvalToOp<XprType>, Eigen::Dense> +{ + typedef const TensorEvalToOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> >::type> +{ + typedef TensorEvalToOp<XprType> type; +}; + +} // end namespace internal + + + + +template<typename XprType> +class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorEvalToOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested; + typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + CoeffReturnType* m_buffer; +}; + + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> +{ + typedef TensorEvalToOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? 
Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<Packet, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 000000000..d084880de --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,427 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + */ + +// Generic evaluator +template<typename Derived, typename Device> +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? 
+ internal::traits<Derived>::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast<Scalar*>(m.data())), m_dims(m.dimensions()), m_device(device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); + return m_data[index]; + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt<Packet, LoadMode>(m_data + index); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const Packet& x) + { + return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + protected: + Scalar* m_data; + Dimensions m_dims; + const Device& m_device; +}; + + +// Default evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const Derived, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? 
+ internal::traits<Derived>::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) + : m_data(m.data()), m_dims(m.dimensions()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else + return m_data[index]; +#endif + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt_ro<Packet, LoadMode>(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { + eigen_assert(m_data); + const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else + return m_data[index]; +#endif + } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } + + protected: + const Scalar* m_data; + Dimensions m_dims; +}; + + + + +// -------------------- CwiseNullaryOp -------------------- + +template<typename NullaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> +{ + typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const NullaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + + +// -------------------- CwiseUnaryOp -------------------- + +template<typename UnaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> +{ + typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & 
internal::functor_traits<UnaryOp>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const UnaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess & + internal::functor_traits<BinaryOp>::PacketAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. 
+ return m_leftImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const BinaryOp m_functor; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + + +// -------------------- SelectOp -------------------- + +template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device> +struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> +{ + typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* & + TensorEvaluator<IfArgType>::PacketAccess*/, + Layout = TensorEvaluator<IfArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + internal::Selector<PacketSize> select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet<LoadMode>(index), + m_elseImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + TensorEvaluator<IfArgType, Device> m_condImpl; + TensorEvaluator<ThenArgType, Device> m_thenImpl; + TensorEvaluator<ElseArgType, Device> m_elseImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 000000000..05ac9bd2f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launching the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +template <typename Device, typename Expression> +struct IsVectorizable { + static const bool value = TensorEvaluator<Expression, Device>::PacketAccess; +};
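For reference, a minimal sketch of how client code can drive this executor by hand, mirroring the TensorAssignOp pattern that TensorFixedSize::operator= uses later in this patch. It assumes a C++11 compiler, the module header added by this series, and the cwise operator+ provided by TensorBase; the rank-2 float tensors and the helper name assign_sum are illustrative only, not part of the patch:

// Hand-rolled equivalent of `c = a + b` on the default single-threaded device.
// Assumes the CXX11 Tensor module from this patch is reachable via this header.
#include <unsupported/Eigen/CXX11/Tensor>

void assign_sum(Eigen::Tensor<float, 2>& c,
                const Eigen::Tensor<float, 2>& a,
                const Eigen::Tensor<float, 2>& b) {
  // c must already have the same dimensions as a and b.
  // Wrap lhs and rhs in an assignment expression node...
  typedef Eigen::TensorAssignOp<Eigen::Tensor<float, 2>, const decltype(a + b)> Assign;
  Assign assign(c, a + b);
  // ...and let the executor evaluate it on the default device.
  Eigen::internal::TensorExecutor<const Assign, Eigen::DefaultDevice>::run(assign, Eigen::DefaultDevice());
}

+ +// Default strategy: the expression is evaluated with a single cpu thread.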
+template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = IsVectorizable<Device, Expression>::value> +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator<Expression, Device> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + +template<typename Expression> +class TensorExecutor<Expression, DefaultDevice, true> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; + const Index VectorizedSize = (size / PacketSize) * PacketSize; + + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template <typename Evaluator, typename Index, bool Vectorizable = Evaluator::PacketAccess> +struct EvalRange { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +struct EvalRange<Evaluator, Index, true> { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + if (last - first > PacketSize) { + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + evaluator.evalPacket(i); + } + } + + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template<typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + + static const int PacketSize = Vectorizable ? 
unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; + + int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + std::vector<Future> results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize)); + } + + if (numblocks * blocksize < size) { + EvalRange<Evaluator, Index>::run(evaluator, numblocks * blocksize, size); + } + + for (int i = 0; i < numblocks; ++i) { + get_when_ready(&results[i]); + } + + } + evaluator.cleanup(); + } +}; +#endif + + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) +EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Use the scalar path + for (Index i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) +EigenMetaKernel_Vectorizable(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Use the vector path + const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template <typename Expression> +struct IsVectorizable<GpuDevice, Expression> { + static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned; +}; + +template<typename Expression> +class TensorExecutor<Expression, GpuDevice, false> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); + } +}; + +template<typename Expression> +class TensorExecutor<Expression, GpuDevice, true> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size =
maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 000000000..b66b3ec2c --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,300 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operator to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to an lhs and an rhs expression. + * + */ +namespace internal { +template<typename NullaryOp, typename XprType> +struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> > + : traits<XprType> +{ + typedef typename XprType::Packet Packet; + typedef traits<XprType> XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +} // end namespace internal + + + +template<typename NullaryOp, typename XprType> +class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + + + +namespace internal { +template<typename UnaryOp, typename XprType> +struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > + :
traits<XprType> +{ + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename UnaryOp, typename XprType> +struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type; +}; + +template<typename UnaryOp, typename XprType> +struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type> +{ + typedef TensorCwiseUnaryOp<UnaryOp, XprType> type; +}; + +} // end namespace internal + + + +template<typename UnaryOp, typename XprType> +class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename result_of< + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits<LhsXprType> XprTraits; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type< + typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + + +namespace internal { +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > + : traits<ThenXprType> +{ + typedef typename traits<ThenXprType>::Scalar Scalar; + typedef traits<ThenXprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind, + typename traits<ElseXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<ElseXprType>::Index, + typename traits<ThenXprType>::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense> +{ + typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type> +{ + typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type; +}; + +} // end namespace internal + + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorSelectOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType, + typename ElseXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename ThenXprType::PacketReturnType, + typename ElseXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested; + typedef 
typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index; + + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + const IfXprType& ifExpression() const { return m_condition; } + + const ThenXprType& thenExpression() const { return m_then; } + + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 000000000..94b3f957b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,253 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed-size version of the tensor class. + * + * The fixed-size equivalent of + * Eigen::Tensor<float, 3> t(3, 5, 7); + * is + * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t; + */ + +template<typename Scalar_, typename Dimensions_, int Options_> +class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> > +{ + public: + typedef TensorFixedSize<Scalar_, Dimensions_, Options_> Self; + typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_ALIGN), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = Options_ & RowMajor ?
RowMajor : ColMajor, + CoordAccess = true, + }; + + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage<Scalar, NumIndices, Dimensions::total_size, Options, Dimensions> m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES + inline TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. + typedef TensorAssignOp<Self, const TensorFixedSize> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
+ typedef TensorAssignOp<Self, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 000000000..41a36cb75 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,151 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEvalOp + * \ingroup CXX11_Tensor_Module + * + * \brief Expression node that forces the evaluation of its argument into a temporary buffer. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorForcedEvalOp<XprType> > +{
+ typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> +{ + typedef const TensorForcedEvalOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> +{ + typedef TensorForcedEvalOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> +{ + typedef TensorForcedEvalOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + const Index numValues = m_impl.dimensions().TotalSize(); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. 
+ if (!internal::is_arithmetic<CoeffReturnType>::value) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } + typedef TensorEvalToOp<const ArgType> EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<ArgType, Device>::PacketAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<Packet, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 000000000..7bec2b10a --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> class Tensor; +template<typename Scalar_, typename Dimensions, int Options_ = 0> class TensorFixedSize; +template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap; +template<typename PlainObjectType> class TensorRef; +template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase; + +template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp; +template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp; +template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp; +template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp; +template<typename Op, typename Dims, typename XprType> class TensorReductionOp; +template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp; +template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp; +template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp; +template<typename PatchDim, typename XprType> class TensorPatchOp; +template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp; +template<typename Broadcast, typename XprType> class TensorBroadcastingOp; +template<DenseIndex DimId, typename XprType> class TensorChippingOp; +template<typename NewDimensions, typename XprType> class TensorReshapingOp; +template<typename XprType> class TensorLayoutSwapOp; +template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp; +template<typename 
ReverseDimensions, typename XprType> class TensorReverseOp; +template<typename PaddingDimensions, typename XprType> class TensorPaddingOp; +template<typename Shuffle, typename XprType> class TensorShufflingOp; +template<typename Strides, typename XprType> class TensorStridingOp; +template<typename LeftXprType, typename RightXprType> class TensorAssignOp; + +template<typename XprType> class TensorEvalToOp; +template<typename XprType> class TensorForcedEvalOp; + +template<typename ExpressionType, typename DeviceType> class TensorDevice; +template<typename Derived, typename Device> struct TensorEvaluator; + +namespace internal { +template<typename Expression, typename Device, bool Vectorizable> class TensorExecutor; +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 000000000..38586d067 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,338 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + +// Standard reduction functors +template <typename T> struct SumReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) += t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum + predux(vaccum); + } +}; + +template <typename T> struct MeanReducer +{ + static const bool PacketAccess = true; + MeanReducer() : scalarCount_(0), packetCount_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + (*accum) += t; + scalarCount_++; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd<Packet>(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / scalarCount_; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return 
pdiv(vaccum, pset1<Packet>(packetCount_)); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits<Packet>::size); + } + + protected: + int scalarCount_; + int packetCount_; +}; + +template <typename T> struct MaxReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return -(std::numeric_limits<T>::max)(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(-(std::numeric_limits<T>::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (std::max)(saccum, predux_max(vaccum)); + } +}; + +template <typename T> struct MinReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return (std::numeric_limits<T>::max)(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>((std::numeric_limits<T>::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (std::min)(saccum, predux_min(vaccum)); + } +}; + + +template <typename T> struct ProdReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) *= t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(1); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(1); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum * predux_mul(vaccum); + } +}; + +#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || 
!defined(__CUDA_ARCH__) +// We're not compiling a cuda kernel +template <typename T> struct UniformRandomGenerator { + + static const bool PacketAccess = true; + + template<typename Index> + T operator()(Index, Index = 0) const { + return random<T>(); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = random<T>(); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } +}; + +#else + +// We're compiling a cuda kernel +template <typename T> struct UniformRandomGenerator; + +template <> struct UniformRandomGenerator<float> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + + template<typename Index> EIGEN_DEVICE_FUNC + float operator()(Index, Index = 0) const { + return curand_uniform(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + float4 packetOp(Index, Index = 0) const { + return curand_uniform4(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> struct UniformRandomGenerator<double> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double operator()(Index, Index = 0) const { + return curand_uniform_double(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double2 packetOp(Index, Index = 0) const { + return curand_uniform2_double(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 +// We're not compiling a cuda kernel +template <typename T> struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + NormalRandomGenerator() : m_distribution(0, 1) {} + NormalRandomGenerator(const NormalRandomGenerator& other) : m_distribution(other.m_distribution) { } + + template<typename Index> + T operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_distribution(m_generator); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } + + mutable std::normal_distribution<T> m_distribution; + mutable std::default_random_engine m_generator; +}; + +#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) + +// We're compiling a cuda kernel +template <typename T> struct NormalRandomGenerator; + +template <> struct NormalRandomGenerator<float> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + + template<typename Index> EIGEN_DEVICE_FUNC + float operator()(Index, Index = 0) const { + return curand_normal(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + float4 packetOp(Index, Index = 0) const { + return curand_normal4(&m_state); + } + + 
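+ // Note: m_state is declared mutable below because operator() and packetOp are const yet must advance the Philox counter state on every draw.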
private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> struct NormalRandomGenerator<double> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double operator()(Index, Index = 0) const { + return curand_normal_double(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double2 packetOp(Index, Index = 0) const { + return curand_normal2_double(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 000000000..a9d0f6c39 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,53 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +namespace internal { +template<> +struct significant_decimals_impl<std::string> + : significant_decimals_default_impl<std::string, true> +{}; +} + + +template <typename T> +std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) { + // Evaluate the expression if needed + TensorForcedEvalOp<const T> eval = expr.eval(); + TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + typedef typename internal::remove_const<typename T::Scalar>::type Scalar; + typedef typename T::Index Index; + typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions; + const Index total_size = internal::array_prod(tensor.dimensions()); + + // Print the tensor as a 1d vector or a 2d matrix. + if (internal::array_size<Dimensions>::value == 1) { + Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); + os << array; + } else { + const Index first_dim = tensor.dimensions()[0]; + static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout; + Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 000000000..bf0e7edfb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,382 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has at least 3 dimensions ordered as follows: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing) after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. + */ +namespace internal { +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense> +{ + typedef const TensorImagePatchOp<Rows, Cols, XprType>& type; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type> +{ + typedef TensorImagePatchOp<Rows, Cols, XprType> type; +}; + +} // end namespace internal + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + PaddingType padding_type) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_padding_type(padding_type) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + 
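+ // Note: this expression class only stores the patch parameters; all of the indexing logic lives in the TensorEvaluator specialization below.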
protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const PaddingType m_padding_type; +}; + + +// Eval as rvalue +template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> +struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> +{ + typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = NumDims == 5, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + // Only column major tensors are supported for now. + EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // We only support the same strides for both dimensions and square patches. + eigen_assert(m_row_strides == m_col_strides); + + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = ceil((m_inputRows - op.patch_rows() + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_inputCols - op.patch_cols() + 1.f) / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + case PADDING_SAME: + m_outputRows = ceil(m_inputRows / static_cast<float>(m_row_strides)); + m_outputCols = ceil(m_inputCols / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Dimensions for result of extraction. + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + + // Strides for moving the patch in various dimensions. + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + + // Strides for navigating through the input tensor. + m_rowInputStride = input_dims[0]; + m_colInputStride = input_dims[0] * input_dims[1]; + m_patchInputStride = input_dims[0] * input_dims[1] * input_dims[2]; + + // Fast representations of different variables. 
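+ // Each TensorIntDivisor precomputes a multiplier/shift pair (see TensorIntDiv.h) so that the per-coefficient divisions in coeff() and packet() avoid hardware integer division.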
+ m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + // Number of patches in the height dimension. + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + m_fastDimZero = internal::TensorIntDivisor<Index>(m_dimensions[0]); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + + // Calculate the col index in the original input tensor. + const Index inputCol = colIndex * m_col_strides + colOffset - m_colPaddingLeft; + if (inputCol < 0 || inputCol >= m_inputCols) { + return Scalar(0); + } + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + + // Calculate row index in the original input tensor. + const Index inputRow = rowIndex * m_row_strides + rowOffset - m_rowPaddingTop; + if (inputRow < 0 || inputRow >= m_inputRows) { + return Scalar(0); + } + + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastDimZero, + (indices[1] - patchIndex * m_patchStride) / m_fastDimZero}; + + const Index patch2DIndex = (NumDims == 4) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + // all zeros + return internal::pset1<PacketReturnType>(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate row indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + // all zeros + return internal::pset1<PacketReturnType>(Scalar(0)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Location of the first element of the patch. + // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches + const Index patchIndex = coords[3]; + + array<Index, NumDims-1> inputCoords; + inputCoords[0] = coords[0]; // depth + inputCoords[1] = patchIndex / m_inputCols + coords[1] - m_rowPaddingTop; + inputCoords[2] = patchIndex - patchIndex / m_inputCols * m_inputCols + coords[2] - m_colPaddingLeft; + inputCoords[3] = coords[4]; // batch + // If the computed coordinates are outside the original image perimeter, return 0. 
+ if (inputCoords[1] < 0 || inputCoords[1] >= m_inputRows || + inputCoords[2] < 0 || inputCoords[2] >= m_inputCols) { + return Scalar(0); + } + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex = + inputCoords[3] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[1] * m_rowInputStride + + inputCoords[0]; + return m_impl.coeff(inputIndex); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + internal::TensorIntDivisor<Index> m_fastOtherStride; + internal::TensorIntDivisor<Index> m_fastPatchStride; + internal::TensorIntDivisor<Index> m_fastColStride; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastDimZero; + + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 000000000..eed0a9f05 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,419 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +#ifdef EIGEN_HAS_CONSTEXPR + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a C++11-compliant compiler. If your compiler + * is older, you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. 
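+ * + * A minimal sketch of the intended usage (shown in isolation; the surrounding tensor expression is omitted): + * Eigen::IndexList<Eigen::type2index<0>, int> dims; + * dims.set(1, 2); // index 0 is fixed to 0 at compile time, index 1 is set at runtime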
+ * + * \sa Tensor + */ + +template <DenseIndex n> +struct type2index { + static const DenseIndex value = n; + constexpr operator DenseIndex() const { return n; } + void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +namespace internal { +template <typename T> +void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template <DenseIndex n> +void update_value(type2index<n>& val, DenseIndex new_val) { + val.set(new_val); +} + +template <typename T> +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx>& > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx>& > { + static constexpr bool value = true; +}; + +template <DenseIndex Idx> +struct tuple_coeff { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + return std::get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + if (i == Idx) { + update_value(std::get<Idx>(t), value); + } else { + tuple_coeff<Idx-1>::set(i, t, value); + } + } + + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) { + return ((i == Idx) & is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value) || + tuple_coeff<Idx-1>::value_known_statically(i, t); + } + + template <typename... T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + tuple_coeff<Idx-1>::values_up_to_known_statically(t); + } + + template <typename... T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + is_compile_time_constant<typename std::tuple_element<Idx-1, std::tuple<T...> >::type>::value && + std::get<Idx>(t) > std::get<Idx-1>(t) && + tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t); + } +}; + +template <> +struct tuple_coeff<0> { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return std::get<0>(t) * (i == 0); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + eigen_assert (i == 0); + update_value(std::get<0>(t), value); + } + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>&) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value & (i == 0); + } + + template <typename... T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>&) { + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value; + } + + template <typename... 
T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>&) { + return true; + } +}; +} // namespace internal + + +template<typename FirstType, typename... OtherTypes> +struct IndexList : std::tuple<FirstType, OtherTypes...> { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value); + } + + constexpr IndexList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { } + constexpr IndexList() : std::tuple<FirstType, OtherTypes...>() { } + + constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this); + } + constexpr bool all_values_known_statically() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this); + } + + constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this); + } +}; + + +template<typename FirstType, typename... OtherTypes> +constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) { + return std::make_tuple(val1, other_vals...); +} + + +namespace internal { + +template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { + size_t result = 1; + for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { + result *= sizes[i]; + } + return result; +}; + +template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; +template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; + +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} + +template <typename T> +struct index_known_statically { + constexpr bool operator() (DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename T> +struct all_indices_known_statically { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename Tx> +struct index_statically_eq { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_eq<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] == value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_eq<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] == value); + } +}; + +template <typename T> +struct index_statically_ne { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] != value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] != value); + } +}; + + +template <typename T> +struct index_statically_gt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_statically_gt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] > value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_gt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] > value); + } +}; + +template <typename T> +struct index_statically_lt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_lt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] < value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_lt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] < value); + } +}; + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +// No C++11 support +template <typename T> +struct index_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct all_indices_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct index_statically_eq { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_ne { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_gt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_lt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 000000000..4303e3536 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,70 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + +#include <initializer_list> + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template <typename Derived, int N> +struct Initializer { + typedef std::initializer_list< + typename Initializer<Derived, N - 1>::InitList> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - N] = i++; + Initializer<Derived, N - 1>::run(tensor, indices, v); + } + } +}; + +template <typename Derived> +struct Initializer<Derived, 1> { + typedef std::initializer_list<typename traits<Derived>::Scalar> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template <typename Derived, int N> +void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor, + const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) { + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices; + Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 000000000..2714117ab --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,86 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. 
+ * (at http://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +template <typename T> +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 1 <= divider <= 2^31-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; + eigen_assert(divider > 0); + eigen_assert(divider <= (1<<(N-1)) - 1); + + // Fast floor(log2(divider)), computed from the number of leading zeros. +#ifndef __CUDA_ARCH__ + const int leading_zeros = __builtin_clz(divider); +#else + const int leading_zeros = __clz(divider); +#endif + const int log_div = N - (leading_zeros+1); + + multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator <= 2^32-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; + eigen_assert(numerator >= 0); + eigen_assert(numerator <= (1ull<<N) - 1); + + uint32_t t1 = (multiplier * numerator) >> 32; + uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + uint64_t multiplier; + int32_t shift1; + int32_t shift2; +}; + + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) { + return divisor.divide(numerator); +} + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 000000000..c119b30e2 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. 
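+ * The swap itself moves no data: the evaluator forwards linear indices to the underlying expression unchanged, so only the interpretation of the coefficients and the reported dimensions change.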
+ * + * \example: + * Tensor<float, 2, ColMajor> input(2, 4); + * Tensor<float, 2, RowMajor> output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array<int, 2> shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor; +}; + +template<typename XprType> +struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type> +{ + typedef TensorLayoutSwapOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorLayoutSwapOp<ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? 
RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template<typename ArgType, typename Device> + struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device> + : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base; + typedef TensorLayoutSwapOp<ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 000000000..2cb2bc7a6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,291 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. 
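+ * + * A minimal usage sketch (the buffer contents are illustrative); the data is not copied, so the mapped memory must outlive the map: + * float data[6] = {0, 1, 2, 3, 4, 5}; + * Eigen::TensorMap<Eigen::Tensor<float, 2> > t(data, 2, 3); + * float v = t(1, 2); // reads data[1 + 2 * 2] = data[5] in the default column-major layout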
+ * + */ + +template<typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> > +{ + public: + typedef TensorMap<PlainObjectType, Options_> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue<PlainObjectType>::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const int Options = Options_; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = ((int(Options_)&Aligned)==Aligned), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = PlainObjectType::Layout, + CoordAccess = true, + }; + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT((2 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT((3 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT((4 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT((5 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + template <typename Dimensions> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar* data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes...
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes...
otherIndices) + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp<Self, const Self> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Self& operator=(const OtherDerived& other) + { + typedef TensorAssignOp<Self, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + private: + Scalar* m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 000000000..a93f48ccb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,600 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0.
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template<typename NewDimensions, typename XprType> +struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<NewDimensions>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename NewDimensions, typename XprType> +struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReshapingOp<NewDimensions, XprType>& type; +}; + +template<typename NewDimensions, typename XprType> +struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type> +{ + typedef TensorReshapingOp<NewDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename NewDimensions, typename XprType> +class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +// Eval as rvalue +template<typename NewDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReshapingOp<NewDimensions, 
ArgType>, Device> +{ + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + NewDimensions m_dimensions; +}; + + +// Eval as lvalue +template<typename NewDimensions, typename ArgType, typename Device> + struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device> + : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> + +{ + typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base; + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. 
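+ * + * A usage sketch (illustrative; it assumes the slice() helper that TensorBase exposes to build this expression): + * \code + * Tensor<float, 2> input(10, 10); + * Eigen::array<int, 2> offsets = {{2, 3}}; + * Eigen::array<int, 2> extents = {{4, 5}}; + * Tensor<float, 2> block = input.slice(offsets, extents); // 4x5, block(0,0) == input(2,3) + * \endcode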
+ * + * + */ +namespace internal { +template<typename StartIndices, typename Sizes, typename XprType> +struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<StartIndices>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> +{ + typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type> +{ + typedef TensorSlicingOp<StartIndices, Sizes, XprType> type; +}; + +} // end namespace internal + + + +template<typename StartIndices, typename Sizes, typename XprType> +class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) + { + typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Eval as rvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + // Alignment can't be guaranteed at 
compile time since it depends on the + // slice offsets and sizes. + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + m_outputStrides[0] = 1; + m_fastOutputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + m_outputStrides[NumDims-1] = 1; + m_fastOutputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (internal::is_arithmetic<Scalar>::value && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. 
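+ // "contiguous_values" is the length of the longest run of coefficients that is laid out consecutively in both the input and the slice: the product of the innermost dimensions up to and including the first one whose extent differs from the input's. + // The copy path is only taken when each run is long enough for memcpy to beat coefficient-by-coefficient evaluation; the threshold below is a heuristic.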
+ if (contiguous_values > 2 * m_device.numThreads()) { + Scalar* src = m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords[i] = coords[i] + this->m_offsets[i]; + } + return m_impl.coeff(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + Scalar* result = m_impl.data(); + if (result) { + Index offset = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] *
m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); + } + return inputIndex; + } + + array<Index, NumDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; +}; + + +// Eval as lvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> + : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base; + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * 
this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<StoreMode>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords) + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords[i] = coords[i] + this->m_offsets[i]; + } + return this->m_impl.coeffRef(inputCoords); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 000000000..2a7dd45c0 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,361 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only 0-padding is supported.
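+ * + * A usage sketch (illustrative; it assumes the pad() helper on TensorBase, each pair giving the number of zeros inserted before and after the corresponding dimension): + * \code + * Tensor<float, 2> input(3, 4); + * Eigen::array<std::pair<int, int>, 2> paddings; + * paddings[0] = std::make_pair(1, 2); + * paddings[1] = std::make_pair(0, 1); + * Tensor<float, 2> padded = input.pad(paddings); // 6x5, zero outside the original block + * \endcode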
+ * + */ +namespace internal { +template<typename PaddingDimensions, typename XprType> +struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename PaddingDimensions, typename XprType> +struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorPaddingOp<PaddingDimensions, XprType>& type; +}; + +template<typename PaddingDimensions, typename XprType> +struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type> +{ + typedef TensorPaddingOp<PaddingDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename PaddingDimensions, typename XprType> +class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims) + : m_xpr(expr), m_padding_dims(padding_dims) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; +}; + + +// Eval as rvalue +template<typename PaddingDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device> +{ + typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<PaddingDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()) + { + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = 
m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[0].first); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (index < m_padding[NumDims-1].first || + index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[NumDims-1].first); + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + Index inputIndex; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + const Index idx = coords[0]; + if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { + return Scalar(0); + } + inputIndex = idx - m_padding[0].first; + for (int i = 1; i < NumDims; ++i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } else { + const Index idx = coords[NumDims-1]; + if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return Scalar(0); + } + inputIndex = idx - m_padding[NumDims-1].first; + for (int i = NumDims - 2; i >= 0; --i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * 
m_inputStrides[i]; + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones. + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones. + inputIndex += (index - m_padding[0].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + + for (int i = 0; i < NumDims - 1; ++i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones.
const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones. + inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + array<Index, NumDims+1> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + PaddingDimensions m_padding; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 000000000..8a42ab6b1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,248 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class.
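+ * + * A usage sketch (illustrative; it assumes the extract_patches() helper on TensorBase). Every patch of the requested size is enumerated along one extra, last dimension: + * \code + * Tensor<float, 2> input(5, 5); + * Eigen::array<ptrdiff_t, 2> patch_dims = {{2, 2}}; + * Tensor<float, 3> patches = input.extract_patches(patch_dims); + * // patches is 2x2x16: (5-2+1)*(5-2+1) == 16 patches of size 2x2 + * \endcode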
+ * + * + */ +namespace internal { +template<typename PatchDim, typename XprType> +struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<typename PatchDim, typename XprType> +struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense> +{ + typedef const TensorPatchOp<PatchDim, XprType>& type; +}; + +template<typename PatchDim, typename XprType> +struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type> +{ + typedef TensorPatchOp<PatchDim, XprType> type; +}; + +} // end namespace internal + + + +template<typename PatchDim, typename XprType> +class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template<typename PatchDim, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> +{ + typedef TensorPatchOp<PatchDim, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + // Only column major tensors are supported for now. 
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + Index num_patches = 1; + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[NumDims - 1]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[NumDims - 1]; + + Index inputIndex = 0; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index indices[2] = {index, index + packetSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[NumDims - 1], + indices[1] / m_outputStrides[NumDims - 1]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[NumDims - 1], + indices[1] - patchIndices[1] * m_outputStrides[NumDims - 1]}; + + Index inputIndices[2] = {0, 0}; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == 
packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Index of the patch (i.e. the location of its first element) along the last dimension. + Index patchIndex = coords[NumDims - 1]; + + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + array<Index, NumDims-1> inputCoords; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + inputCoords[i] = coords[i] + patchIdx; + } + inputCoords[0] = (patchIndex + coords[0]); + return m_impl.coeff(inputCoords); + } + else { + Index inputIndex = 0; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + inputIndex += (patchIndex + coords[0]); + return m_impl.coeff(inputIndex); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims-1> m_inputStrides; + array<Index, NumDims-1> m_patchStrides; + + TensorEvaluator<ArgType, Device> m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 000000000..de5747905 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,426 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class.
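+ * + * A usage sketch (illustrative; it assumes the sum() helper on TensorBase, which builds a TensorReductionOp with a sum reducer): + * \code + * Tensor<float, 3> input(2, 3, 4); + * Eigen::array<int, 1> dims = {{1}}; + * Tensor<float, 2> summed = input.sum(dims); // 2x4: dimension 1 is reduced away + * \endcode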
+ * + */ + +namespace internal { +template<typename Op, typename Dims, typename XprType> +struct traits<TensorReductionOp<Op, Dims, XprType> > + : traits<XprType> +{ + typedef typename traits<XprType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; +}; + +template<typename Op, typename Dims, typename XprType> +struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense> +{ + typedef const TensorReductionOp<Op, Dims, XprType>& type; +}; + +template<typename Op, typename Dims, typename XprType> +struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type> +{ + typedef TensorReductionOp<Op, Dims, XprType> type; +}; + + +template <typename ReducedDims, int NumTensorDims, int Layout> +struct are_inner_most_dims { + static const bool value = false; +}; +template <typename ReducedDims, int NumTensorDims, int Layout> +struct preserve_inner_most_dims { + static const bool value = false; +}; + +#ifdef EIGEN_HAS_CONSTEXPR +template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_eq<ReducedDims>()(0, 0) && + index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1); +}; +template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value) && + index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_gt<ReducedDims>()(0, 0); +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); +}; +#endif + + +template <int DimIndex, typename Self, typename Op> +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; +template <typename Self, typename Op> +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; + +template <typename Self, typename Op, bool Vectorizable = 
(Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template <typename Self, typename Op> +struct InnerMostDimReducer<Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, p); + } +}; + +template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template <int DimIndex, typename Self, typename Op> +struct InnerMostDimPreserver<DimIndex, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; + +template <typename Self, typename Op> +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum); + } + } +}; + +} // end namespace internal + + +template <typename Op, typename Dims, typename XprType> +class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename 
Eigen::internal::nested<TensorReductionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template<typename Op, typename Dims, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> +{ + typedef TensorReductionOp<Op, Dims, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumReducedDims = internal::array_size<Dims>::value; + static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims; + typedef DSizes<Index, NumOutputDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self; + static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; + + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()) + { + EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Bitmap indicating if an input dimension is reduced or not. + array<bool, NumInputDims> reduced; + for (int i = 0; i < NumInputDims; ++i) { + reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedDims[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + m_dimensions[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + + // Precompute output strides.
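+ // (Illustrative note: with ColMajor layout, m_outputStrides[i] computed + // below is the product of the first i output dimensions, e.g. output + // dimensions [4, 5, 6] give strides [1, 4, 20]; with RowMajor layout the + // product accumulates from the trailing dimension instead. The example + // values are for illustration only.)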
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + // Precompute input strides. + array<Index, NumInputDims> input_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + outputIndex = 0; + reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + ++outputIndex; + } + } + + // Special case for full reductions + if (NumInputDims == NumReducedDims) { + m_dimensions[0] = 1; + m_preservedStrides[0] = internal::array_prod(input_dims); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Op reducer(m_reducer); + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), + num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < packetSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
0 : NumOutputDims - 1; + // TBD: extend this to the n innermost dimensions that we preserve. + if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>(); + internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + template <int, typename, typename> friend struct internal::GenericDimReducer; + template <typename, typename, bool> friend struct internal::InnerMostDimReducer; + template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumOutputDims - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + array<Index, NumOutputDims> m_outputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + array<Index, NumOutputDims> m_preservedStrides; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator<ArgType, Device> m_impl; + + // Operation to apply for computing the reduction. + Op m_reducer; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 000000000..0a87e67eb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,429 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra.
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +namespace Eigen { + +namespace internal { + +template <typename Dimensions, typename Scalar> +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) { } + virtual ~TensorLazyBaseEvaluator() { } + + virtual const Dimensions& dimensions() const = 0; + virtual const Scalar* data() const = 0; + + virtual const Scalar coeff(DenseIndex index) const = 0; + virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assignment. + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + +// Scratch storage for the read-only evaluator's coeffRef() below; the +// returned reference must never actually be written through. +static char dummy[8]; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> { + public: + // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { + m_impl.cleanup(); + } + + virtual const Dimensions& dimensions() const { + return m_dims; + } + virtual const Scalar* data() const { + return m_impl.data(); + } + + virtual const Scalar coeff(DenseIndex index) const { + return m_impl.coeff(index); + } + virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of an rvalue"); + return *reinterpret_cast<Scalar*>(dummy); + } + + protected: + TensorEvaluator<Expr, Device> m_impl; + Dimensions m_dims; +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> { + public: + typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluatorWritable() { + } + + virtual Scalar& coeffRef(DenseIndex index) { + return this->m_impl.coeffRef(index); + } +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type { + public: + typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluator() { + } +}; + +} // namespace internal + + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A
reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). + * + */ +template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> > +{ + public: + typedef TensorRef<PlainObjectType> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + Layout = PlainObjectType::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { + unrefEvaluator(); + } + + TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator = (const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index index) const + { + return m_evaluator->coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... 
otherIndices) + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } +#else + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1) + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3) + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeffRef(indices); + } +#endif + + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + return m_evaluator->coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return m_evaluator->coeffRef(index); + } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + +
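// Lazily-built evaluator, shared by every copy of this TensorRef and + // reference-counted; unrefEvaluator() above deletes it once the count + // drops to zero. +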
internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator; +}; + + +// evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + Layout = TensorRef<Derived>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&) + : m_ref(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_ref.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return m_ref.coeffRef(index); + } + + Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef<Derived> m_ref; +}; + + +// evaluator for lvalues +template<typename Derived, typename Device> +struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator<const TensorRef<Derived>, Device> Base; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_ref.coeffRef(index); + } +}; + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 000000000..ad21e966b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,207 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. 
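+ * + * Reversing flips the coefficient order along each selected dimension. + * Minimal usage sketch (names illustrative, not part of this patch, and + * assuming the reverse() helper exposed on TensorBase): + * + *   Eigen::Tensor<float, 2> t(3, 4); + *   Eigen::array<bool, 2> rev; + *   rev[0] = true;   // reverse dimension 0 + *   rev[1] = false;  // keep dimension 1 as is + *   Eigen::Tensor<float, 2> r = t.reverse(rev);  // r(i, j) == t(2 - i, j)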
+ * + */ +namespace internal { +template<typename ReverseDimensions, typename XprType> +struct traits<TensorReverseOp<ReverseDimensions, + XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename ReverseDimensions, typename XprType> +struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReverseOp<ReverseDimensions, XprType>& type; +}; + +template<typename ReverseDimensions, typename XprType> +struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1, + typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type> +{ + typedef TensorReverseOp<ReverseDimensions, XprType> type; +}; + +} // end namespace internal + + + + +template<typename ReverseDimensions, typename XprType> +class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions, + XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReverseOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind + StorageKind; + typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, + const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC + const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + + +// Eval as rvalue +template<typename ReverseDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> +{ + typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<ReverseDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()) + { + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; + } + } else { + m_strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename 
XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[NumDims-1]) { + inputIndex += (m_dimensions[NumDims-1] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, Device> m_impl; + ReverseDimensions m_reverse; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 000000000..1012ecd69 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,259 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
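+ * + * Shuffling permutes the dimensions of its input: output dimension i is + * input dimension shuffle[i]. Minimal usage sketch (names illustrative, + * not part of this patch, and assuming the shuffle() helper exposed on + * TensorBase): + * + *   Eigen::Tensor<float, 3> t(2, 3, 4); + *   Eigen::array<int, 3> shuffle; + *   shuffle[0] = 2; shuffle[1] = 0; shuffle[2] = 1; + *   Eigen::Tensor<float, 3> s = t.shuffle(shuffle);  // dimensions [4, 2, 3] + *   // s(i, j, k) == t(j, k, i)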
+ * + * + */ +namespace internal { +template<typename Shuffle, typename XprType> +struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Shuffle, typename XprType> +struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense> +{ + typedef const TensorShufflingOp<Shuffle, XprType>& type; +}; + +template<typename Shuffle, typename XprType> +struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type> +{ + typedef TensorShufflingOp<Shuffle, XprType> type; +}; + +} // end namespace internal + + + +template<typename Shuffle, typename XprType> +class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle) + : m_xpr(expr), m_shuffle(shuffle) {} + + EIGEN_DEVICE_FUNC + const Shuffle& shuffle() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other) + { + typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + + +// Eval as rvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + 
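// (Note: the constructor below permutes the input strides so that stepping + // along output dimension i advances the flat input index by the stride of + // input dimension shuffle[i]; srcCoeff() then re-linearizes each output + // index through these permuted strides.) +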
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shuffle(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = input_dims[shuffle[i]]; + } + + array<Index, NumDims> inputStrides; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[shuffle[i]]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +// Eval as lvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> + : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base; + + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = 
internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index a34600ee6..1b227e8c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -30,39 +30,70 @@ namespace Eigen { * * \sa Tensor */ -template<typename T, std::size_t NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage; +template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage; + + +// Pure fixed-size storage +template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename FixedDimensions> +class TensorStorage +{ + private: + EIGEN_ALIGN_DEFAULT T m_data[Size]; + FixedDimensions m_dimensions; + + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStorage() { + EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + + // pure-dynamic, but without specification of all dimensions explicitly -template<typename T, std::size_t NumIndices_, int Options_> +template<typename T, DenseIndex NumIndices_, int Options_> class TensorStorage<T, NumIndices_, Dynamic, Options_, void> : public TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> { - typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_; + typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_; + public: - TensorStorage() = default; - TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default; - TensorStorage(TensorStorage<T, NumIndices_, Dynamic, Options_, void>&&) = default; + 
TensorStorage() { } + TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>& other) : Base_(other) { } + TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} - TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {} - TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default; + TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {} + + // TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default; }; // pure dynamic -template<typename T, std::size_t NumIndices_, int Options_> +template<typename T, DenseIndex NumIndices_, int Options_> class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> { T *m_data; - std::array<DenseIndex, NumIndices_> m_dimensions; + DSizes<DenseIndex, NumIndices_> m_dimensions; typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Self_; public: - TensorStorage() : m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {} + TensorStorage() : m_data(0), m_dimensions() {} TensorStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {} - TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions) - : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) - { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } - TensorStorage(const Self_& other) + TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) + : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) + { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } + TensorStorage(const Self_& other) : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions))) , m_dimensions(other.m_dimensions) { @@ -76,32 +107,19 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu } return *this; } - TensorStorage(Self_&& other) - : m_data(std::move(other.m_data)), m_dimensions(std::move(other.m_dimensions)) - { - other.m_data = nullptr; - } - Self_& operator=(Self_&& other) - { - using std::swap; - swap(m_data, other.m_data); - swap(m_dimensions, other.m_dimensions); - return *this; - } + ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - std::array<DenseIndex, NumIndices_> dimensions(void) const {return m_dimensions;} - void conservativeResize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions) - { - m_data = internal::conditional_aligned_realloc_new_auto<T,(Options_&DontAlign)==0>(m_data, size, internal::array_prod(m_dimensions)); - m_dimensions = nbDimensions; - } - void resize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const 
{return m_dimensions;} + + EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array<DenseIndex, NumIndices_>& nbDimensions) { - if(size != internal::array_prod(m_dimensions)) + const DenseIndex currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) { - internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); + internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz); if (size) m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size); else @@ -110,16 +128,13 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu } m_dimensions = nbDimensions; } - const T *data() const { return m_data; } - T *data() { return m_data; } -}; -// TODO: implement fixed-size stuff + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 000000000..00cb8e373 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. 
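+ * + * Striding keeps every strides[i]-th coefficient along dimension i, so the + * output extent of dimension i is ceil(input_dim[i] / strides[i]). Minimal + * usage sketch (names illustrative, not part of this patch, and assuming + * the stride() helper exposed on TensorBase): + * + *   Eigen::Tensor<float, 2> t(6, 8); + *   Eigen::array<int, 2> strides; + *   strides[0] = 2; strides[1] = 4; + *   Eigen::Tensor<float, 2> s = t.stride(strides);  // dimensions [3, 2] + *   // s(i, j) == t(2 * i, 4 * j)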
+ * + * + */ +namespace internal { +template<typename Strides, typename XprType> +struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Strides, typename XprType> +struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense> +{ + typedef const TensorStridingOp<Strides, XprType>& type; +}; + +template<typename Strides, typename XprType> +struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type> +{ + typedef TensorStridingOp<Strides, XprType> type; +}; + +} // end namespace internal + + + +template<typename Strides, typename XprType> +class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorStridingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other) + { + typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + + +// Eval as rvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + // Integer ceil(dim / stride); exact for any Index width, unlike the + // float-based ceilf, which loses precision for large dimensions. + m_dimensions[i] = (m_dimensions[i] + op.strides()[i] - 1) / op.strides()[i]; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; + } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_inputStrides[i+1] *= op.strides()[i+1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { +
values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims-1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +// Eval as lvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> + : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef TensorEvaluator<const XprType, Device> Base; + // typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + // typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= 
idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<Unaligned>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT Scalar values[packetSize]; + internal::pstore<Scalar, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 000000000..a844a4d68 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,256 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +namespace Eigen { +namespace internal { + + +template<typename Scalar, int Options> +class compute_tensor_flags +{ + enum { + is_dynamic_size_storage = 1, + + aligned_bit = + ( + ((Options&DontAlign)==0) && ( +#if EIGEN_ALIGN_STATICALLY + (!is_dynamic_size_storage) +#else + 0 +#endif + || +#if EIGEN_ALIGN + is_dynamic_size_storage +#else + 0 +#endif + ) + ) ? AlignedBit : 0, + packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit | aligned_bit}; +}; + + +template<typename Scalar_, std::size_t NumIndices_, int Options_> +struct traits<Tensor<Scalar_, NumIndices_, Options_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; + static const int NumDimensions = NumIndices_; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit, + }; +}; + + +template<typename Scalar_, typename Dimensions, int Options_> +struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; + static const int NumDimensions = array_size<Dimensions>::value; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit, + }; +}; + + +template<typename PlainObjectType, int Options_> +struct traits<TensorMap<PlainObjectType, Options_> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = Options_, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), + }; +}; + +template<typename PlainObjectType> +struct traits<TensorRef<PlainObjectType> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = BaseTraits::Options, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; +}; + + +template<typename _Scalar, std::size_t NumIndices_, int Options> +struct eval<Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options>& type; +}; + +template<typename _Scalar, std::size_t NumIndices_, int Options> +struct eval<const Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options> +struct eval<TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options> +struct eval<const TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType> +struct eval<TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template<typename PlainObjectType> +struct eval<const TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + + +template <typename Scalar_, std::size_t NumIndices_, int Options_> +struct nested<Tensor<Scalar_, NumIndices_, Options_> > +{ + typedef const Tensor<Scalar_, NumIndices_, Options_>& type; +}; + +template <typename Scalar_, std::size_t NumIndices_, int Options_> +struct nested<const Tensor<Scalar_, NumIndices_, Options_> > +{ + typedef const Tensor<Scalar_, NumIndices_, Options_>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options> +struct nested<TensorFixedSize<Scalar_, Dimensions, Options> > +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options> +struct nested<const TensorFixedSize<Scalar_, Dimensions, Options> > +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + + +template <typename PlainObjectType, int Options> +struct nested<TensorMap<PlainObjectType, Options> > +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType, int Options> +struct nested<const TensorMap<PlainObjectType, Options> > +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType> +struct nested<TensorRef<PlainObjectType> > +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template <typename PlainObjectType> +struct nested<const TensorRef<PlainObjectType> > +{ + typedef const TensorRef<PlainObjectType>& 
type;
+};
+
+} // end namespace internal
+
+// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
+// R, B), and convolve it with a set of filters, which can also be presented as
+// a tensor (D, K, K, M), where M is the number of filters, K is the filter
+// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images), hence the two Ks in the tensor dimension. They also take in
+// a few additional parameters:
+// Stride (S): The convolution stride is the offset between locations where we
+//             apply the filters. A larger stride means that the output will be
+//             spatially smaller.
+// Padding (P): The padding we apply to the input tensor along the R and C
+//              dimensions. This is usually used to make sure that the spatial
+//              dimensions of the output match our intention.
+//
+// Two types of padding are often used:
+//   SAME: The amount of padding is computed so that the output will have size
+//         ceil(R/S) and ceil(C/S).
+//   VALID: no padding is carried out.
+// When we do padding, the padded values at the padded locations are usually
+// zero.
+//
+// The output dimensions for convolution, when given all the parameters above,
+// are as follows:
+// When Padding = SAME: the output size is (B, R', C', M), where
+//   R' = ceil(float(R) / float(S))
+//   C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function. The input tensor is padded with 0 as
+// needed. The numbers of padded rows and columns are computed as:
+//   Pr = ((R' - 1) * S + K - R) / 2
+//   Pc = ((C' - 1) * S + K - C) / 2
+// When the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
+// This is where the name SAME comes from: the output has the same size as the input.
+// When Padding = VALID: the output size is computed as
+//   R' = ceil(float(R - K + 1) / float(S))
+//   C' = ceil(float(C - K + 1) / float(S))
+// and the numbers of padded rows and columns are computed in the same way as in
+// the SAME case.
+// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
+// Pc=0.
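// To make the SAME/VALID arithmetic above concrete, here is a minimal
// standalone C++ sketch of the output-size computation. The helper
// computeConvOutputDims and its result struct are illustrative only (they are
// not part of Eigen); the body simply transcribes the formulas above.

#include <cmath>

struct ConvOutputDims {
  int rows, cols;          // R', C'
  int pad_rows, pad_cols;  // Pr, Pc
};

// R, C: input rows/cols; K: (square) filter size; S: stride.
inline ConvOutputDims computeConvOutputDims(int R, int C, int K, int S,
                                            bool same_padding) {
  ConvOutputDims out;
  if (same_padding) {
    // SAME: output is ceil(R/S) x ceil(C/S); the input is zero-padded as needed.
    out.rows = static_cast<int>(std::ceil(float(R) / float(S)));
    out.cols = static_cast<int>(std::ceil(float(C) / float(S)));
    out.pad_rows = ((out.rows - 1) * S + K - R) / 2;
    out.pad_cols = ((out.cols - 1) * S + K - C) / 2;
  } else {
    // VALID: no padding; only fully overlapping filter positions contribute.
    out.rows = static_cast<int>(std::ceil(float(R - K + 1) / float(S)));
    out.cols = static_cast<int>(std::ceil(float(C - K + 1) / float(S)));
    out.pad_rows = 0;
    out.pad_cols = 0;
  }
  return out;
}

// With R = C = 5, K = 3, S = 1: SAME gives a 5x5 output with Pr = Pc = 1,
// while VALID gives a 3x3 output with no padding.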
+typedef enum { + PADDING_VALID = 1, + PADDING_SAME = 2, +} PaddingType; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport index 4ed545174..6ca1b1217 100644 --- a/unsupported/Eigen/OpenGLSupport +++ b/unsupported/Eigen/OpenGLSupport @@ -178,11 +178,11 @@ template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Affine>& t) template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Projective>& t) { glLoadMatrix(t.matrix()); } template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompact>& t) { glLoadMatrix(Transform<Scalar,3,Affine>(t).matrix()); } -static void glRotate(const Rotation2D<float>& rot) +inline void glRotate(const Rotation2D<float>& rot) { glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f); } -static void glRotate(const Rotation2D<double>& rot) +inline void glRotate(const Rotation2D<double>& rot) { glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0); } @@ -246,18 +246,18 @@ EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev) #ifdef GL_VERSION_2_0 -static void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); } -static void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); } +inline void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); } +inline void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); } -static void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); } -static void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); } +inline void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); } +inline void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); } -static void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); } -static void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); } +inline void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); } +inline void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); } -static void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); } -static void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); } -static void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); } +inline void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); } +inline void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); } +inline void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); } EIGEN_GL_FUNC1_DECLARATION (glUniform,GLint,const) @@ -294,9 +294,9 @@ EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,3,Matrix #ifdef GL_VERSION_3_0 -static void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); } -static void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); } -static void glUniform4uiv_ei (GLint loc, const unsigned int* v) { glUniform4uiv(loc,1,v); } +inline void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); } +inline void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); } +inline void glUniform4uiv_ei (GLint loc, const unsigned int* v) { glUniform4uiv(loc,1,v); } EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei) 
EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei) @@ -305,9 +305,9 @@ EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei) #endif #ifdef GL_ARB_gpu_shader_fp64 -static void glUniform2dv_ei (GLint loc, const double* v) { glUniform2dv(loc,1,v); } -static void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); } -static void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); } +inline void glUniform2dv_ei (GLint loc, const double* v) { glUniform2dv(loc,1,v); } +inline void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); } +inline void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); } EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 2,2dv_ei) EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 3,3dv_ei) diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h index 0e1b7d977..52eb65a2f 100644 --- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h @@ -150,7 +150,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> > m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - dgmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner); + dgmres(mp_matrix, b.col(j), xj, Base::m_preconditioner); } m_info = failed ? NumericalIssue : m_error <= Base::m_tolerance ? Success diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index 60f5f662c..6e847e110 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -250,21 +250,8 @@ struct traits<GMRES<_MatrixType,_Preconditioner> > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * solver.setMaxIterations(1); - * int i = 0; - * do { - * x = solver.solveWithGuess(b,x); - * std::cout << i << " : " << solver.error() << std::endl; - * ++i; - * } while (solver.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. - * + * One can control the start using the solveWithGuess() method. + * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, typename _Preconditioner> @@ -327,7 +314,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::gmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) + if(!internal::gmres(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index eccdce24c..2845b9cfd 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -165,8 +165,8 @@ namespace Eigen { * The vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. 
+ * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
+ *               Upper, or Lower|Upper, in which case the full matrix entries will be considered. Default is Lower.
 * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
 *
 * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
@@ -189,20 +189,7 @@ namespace Eigen {
 * \endcode
 *
 * By default the iterations start with x=0 as an initial guess of the solution.
- * One can control the start using the solveWithGuess() method. Here is a step by
- * step execution example starting with a random guess and printing the evolution
- * of the estimated error:
- * * \code
- * x = VectorXd::Random(n);
- * mr.setMaxIterations(1);
- * int i = 0;
- * do {
- *   x = mr.solveWithGuess(b,x);
- *   std::cout << i << " : " << mr.error() << std::endl;
- *   ++i;
- * } while (mr.info()!=Success && i<100);
- * \endcode
- * Note that such a step by step excution is slightly slower.
+ * One can control the start using the solveWithGuess() method.
 *
 * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
 */
@@ -250,6 +237,11 @@ namespace Eigen {
     template<typename Rhs,typename Dest>
     void _solve_with_guess_impl(const Rhs& b, Dest& x) const
     {
+      typedef typename internal::conditional<UpLo==(Lower|Upper),
+                                             Ref<const MatrixType>&,
+                                             SparseSelfAdjointView<const Ref<const MatrixType>, UpLo>
+                                            >::type MatrixWrapperType;
+
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
@@ -259,7 +251,7 @@ namespace Eigen {
         m_error = Base::m_tolerance;
         typename Dest::ColXpr xj(x,j);
-        internal::minres(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
+        internal::minres(MatrixWrapperType(mp_matrix), b.col(j), xj,
                          Base::m_preconditioner, m_iterations, m_error);
       }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 42b60b9b1..22bfdc4ac 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -53,15 +53,20 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   result(1,0) = Scalar(0);
   result(1,1) = logA11;
-  if (A(0,0) == A(1,1)) {
+  Scalar y = A(1,1) - A(0,0);
+  if (y==Scalar(0))
+  {
     result(0,1) = A(0,1) / A(0,0);
-  } else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) {
-    result(0,1) = A(0,1) * (logA11 - logA00) / (A(1,1) - A(0,0));
-  } else {
+  }
+  else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+  {
+    result(0,1) = A(0,1) * (logA11 - logA00) / y;
+  }
+  else
+  {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
     int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    Scalar y = A(1,1) - A(0,0), x = A(1,1) + A(0,0);
-    result(0,1) = A(0,1) * (Scalar(2) * numext::atanh2(y,x) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
   }
 }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index ee665c18e..1e5a59c55 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -299,7 +299,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
   int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::atanh2(curr - prev, curr + prev) + ComplexScalar(0, M_PI*unwindingNumber);
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
@@ -311,7 +311,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev
   using std::log;
   using std::sinh;
-  RealScalar w = numext::atanh2(curr - prev, curr + prev);
+  RealScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2);
   return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev);
 }
diff --git a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
index 2bb8bc84a..40ba65b7e 100644
--- a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
+++ b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
@@ -56,7 +56,7 @@ T poly_eval( const Polynomials& poly, const T& x )
     for( DenseIndex i=1; i<poly.size(); ++i ){
       val = val*inv_x + poly[i]; }
-    return std::pow(x,(T)(poly.size()-1)) * val;
+    return numext::pow(x,(T)(poly.size()-1)) * val;
   }
 }
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 97849a25a..7b6751f00 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -50,7 +50,7 @@ if(MPFR_FOUND)
   include_directories(${MPFR_INCLUDES} ./mpreal)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
   set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-  ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+#  ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -76,8 +76,9 @@ if(NOT EIGEN_TEST_NO_OPENGL)
   find_package(GLUT)
   find_package(GLEW)
   if(OPENGL_FOUND AND GLUT_FOUND AND GLEW_FOUND)
+    include_directories(${OPENGL_INCLUDE_DIR} ${GLUT_INCLUDE_DIR} ${GLEW_INCLUDE_DIRS})
     ei_add_property(EIGEN_TESTED_BACKENDS "OpenGL, ")
-    set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES})
+    set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES} ${OPENGL_LIBRARIES})
     ei_add_test(openglsupport "" "${EIGEN_GL_LIB}" )
   else()
     ei_add_property(EIGEN_MISSING_BACKENDS "OpenGL, ")
@@ -94,12 +95,50 @@ ei_add_test(minres)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
-option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." OFF)
+option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." ON)
 if(EIGEN_TEST_CXX11)
-  # FIXME: add C++11 compiler switch in some portable way
-  # (MSVC doesn't need any for example, so this will
-  # clash there)
+  # It should be safe to always run these tests as there is some fallback code for
+  # older compilers that don't support C++11.
  ei_add_test(cxx11_meta "-std=c++0x")
  ei_add_test(cxx11_tensor_simple "-std=c++0x")
- ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
+# ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
+ ei_add_test(cxx11_tensor_assign "-std=c++0x")
+ ei_add_test(cxx11_tensor_dimension "-std=c++0x")
+ ei_add_test(cxx11_tensor_index_list "-std=c++0x")
+ ei_add_test(cxx11_tensor_comparisons "-std=c++0x")
+ ei_add_test(cxx11_tensor_contraction "-std=c++0x")
+ ei_add_test(cxx11_tensor_convolution "-std=c++0x")
+ ei_add_test(cxx11_tensor_expr "-std=c++0x")
+ ei_add_test(cxx11_tensor_forced_eval "-std=c++0x")
+ ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
+ ei_add_test(cxx11_tensor_const "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_complex "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
+ ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
+ ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
+ ei_add_test(cxx11_tensor_map "-std=c++0x")
+ ei_add_test(cxx11_tensor_broadcasting "-std=c++0x")
+ ei_add_test(cxx11_tensor_chipping "-std=c++0x")
+ ei_add_test(cxx11_tensor_concatenation "-std=c++0x")
+ ei_add_test(cxx11_tensor_morphing "-std=c++0x")
+ ei_add_test(cxx11_tensor_padding "-std=c++0x")
+ ei_add_test(cxx11_tensor_patch "-std=c++0x")
+ ei_add_test(cxx11_tensor_image_patch "-std=c++0x")
+ ei_add_test(cxx11_tensor_reduction "-std=c++0x")
+ ei_add_test(cxx11_tensor_shuffling "-std=c++0x")
+ ei_add_test(cxx11_tensor_striding "-std=c++0x")
+ ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
+ ei_add_test(cxx11_tensor_ref "-std=c++0x")
+ ei_add_test(cxx11_tensor_random "-std=c++0x")
+ ei_add_test(cxx11_tensor_casts "-std=c++0x")
+ ei_add_test(cxx11_tensor_reverse "-std=c++0x")
+ ei_add_test(cxx11_tensor_layout_swap "-std=c++0x")
+ ei_add_test(cxx11_tensor_io "-std=c++0x")
+
+  # These tests need nvcc
+# ei_add_test(cxx11_tensor_device "-std=c++0x")
+# ei_add_test(cxx11_tensor_cuda "-std=c++0x")
+# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x")
+
 endif()
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
new file mode 100644
index 000000000..d16aaf847
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
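// The tests below exercise tensor assignment in both directions: copying a
// Tensor into a TensorMap that wraps user-owned raw storage, and copying a
// TensorMap back into a (possibly auto-resized) Tensor, in both ColMajor and
// RowMajor layouts, along with compound assignment and initializer-list
// setValues().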
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor<int, 1> vec1(6); + Tensor<int, 1, RowMajor> vec2(6); + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap<Tensor<int, 1>> vec3(col_major, 6); + TensorMap<Tensor<int, 1, RowMajor>> vec4(row_major, 6); + + vec3 = vec1; + vec4 = vec2; + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); + + vec1.setZero(); + vec2.setZero(); + vec1 = vec3; + vec2 = vec4; + + VERIFY_IS_EQUAL(vec1(0), 4); + VERIFY_IS_EQUAL(vec1(1), 8); + VERIFY_IS_EQUAL(vec1(2), 15); + VERIFY_IS_EQUAL(vec1(3), 16); + VERIFY_IS_EQUAL(vec1(4), 23); + VERIFY_IS_EQUAL(vec1(5), 42); + + VERIFY_IS_EQUAL(vec2(0), 0); + VERIFY_IS_EQUAL(vec2(1), 1); + VERIFY_IS_EQUAL(vec2(2), 2); + VERIFY_IS_EQUAL(vec2(3), 3); + VERIFY_IS_EQUAL(vec2(4), 4); + VERIFY_IS_EQUAL(vec2(5), 5); +} + +static void test_2d() +{ + Tensor<int, 2> mat1(2,3); + Tensor<int, 2, RowMajor> mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap<Tensor<int, 2>> mat3(row_major, 2, 3); + TensorMap<Tensor<int, 2, RowMajor>> mat4(col_major, 2, 3); + + mat3 = mat1; + mat4 = mat2; + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + VERIFY_IS_EQUAL(mat1(0,0), 0); + VERIFY_IS_EQUAL(mat1(0,1), 1); + VERIFY_IS_EQUAL(mat1(0,2), 2); + VERIFY_IS_EQUAL(mat1(1,0), 3); + VERIFY_IS_EQUAL(mat1(1,1), 4); + VERIFY_IS_EQUAL(mat1(1,2), 5); + + VERIFY_IS_EQUAL(mat2(0,0), 0); + VERIFY_IS_EQUAL(mat2(0,1), 1); + VERIFY_IS_EQUAL(mat2(0,2), 2); + VERIFY_IS_EQUAL(mat2(1,0), 3); + VERIFY_IS_EQUAL(mat2(1,1), 4); + VERIFY_IS_EQUAL(mat2(1,2), 5); +} + +static void test_3d() +{ + Tensor<int, 3> mat1(2,3,7); + Tensor<int, 3, RowMajor> mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + int col_major[2*3*7]; + int row_major[2*3*7]; + memset(col_major, 0, 2*3*7*sizeof(int)); + memset(row_major, 0, 2*3*7*sizeof(int)); + TensorMap<Tensor<int, 3>> mat3(col_major, 2, 3, 7); + TensorMap<Tensor<int, 3, RowMajor>> mat4(row_major, 2, 3, 7); + + mat3 = mat1; + mat4 = mat2; + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; 
++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat1(i,j,k), val); + VERIFY_IS_EQUAL(mat2(i,j,k), val); + val++; + } + } + } +} + +static void test_same_type() +{ + Tensor<int, 1> orig_tensor(5); + Tensor<int, 1> dest_tensor(5); + orig_tensor.setRandom(); + dest_tensor.setRandom(); + int* orig_data = orig_tensor.data(); + int* dest_data = dest_tensor.data(); + dest_tensor = orig_tensor; + VERIFY_IS_EQUAL(orig_tensor.data(), orig_data); + VERIFY_IS_EQUAL(dest_tensor.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i)); + } + + TensorFixedSize<int, Sizes<5> > orig_array; + TensorFixedSize<int, Sizes<5> > dest_array; + orig_array.setRandom(); + dest_array.setRandom(); + orig_data = orig_array.data(); + dest_data = dest_array.data(); + dest_array = orig_array; + VERIFY_IS_EQUAL(orig_array.data(), orig_data); + VERIFY_IS_EQUAL(dest_array.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_array(i), orig_array(i)); + } + + int orig[5] = {1, 2, 3, 4, 5}; + int dest[5] = {6, 7, 8, 9, 10}; + TensorMap<Tensor<int, 1> > orig_map(orig, 5); + TensorMap<Tensor<int, 1> > dest_map(dest, 5); + orig_data = orig_map.data(); + dest_data = dest_map.data(); + dest_map = orig_map; + VERIFY_IS_EQUAL(orig_map.data(), orig_data); + VERIFY_IS_EQUAL(dest_map.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest[i], i+1); + } +} + +static void test_auto_resize() +{ + Tensor<int, 1> tensor1; + Tensor<int, 1> tensor2(3); + Tensor<int, 1> tensor3(5); + Tensor<int, 1> tensor4(7); + + Tensor<int, 1> new_tensor(5); + new_tensor.setRandom(); + + tensor1 = tensor2 = tensor3 = tensor4 = new_tensor; + + VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0)); + for (int i = 0; i < new_tensor.dimension(0); ++i) { + VERIFY_IS_EQUAL(tensor1(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor2(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor3(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor4(i), new_tensor(i)); + } +} + + +static void test_compound_assign() +{ + Tensor<int, 1> start_tensor(10); + Tensor<int, 1> offset_tensor(10); + start_tensor.setRandom(); + offset_tensor.setRandom(); + + Tensor<int, 1> tensor = start_tensor; + tensor += offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i)); + } + + tensor = start_tensor; + tensor -= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i)); + } + + tensor = start_tensor; + tensor *= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i)); + } + + tensor = start_tensor; + tensor /= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i)); + } +} + +static void test_std_initializers_tensor() { +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + Tensor<int, 1> a(3); + a.setValues({0, 1, 2}); + VERIFY_IS_EQUAL(a(0), 0); + VERIFY_IS_EQUAL(a(1), 1); + VERIFY_IS_EQUAL(a(2), 2); + + // It fills the top-left slice. 
+ a.setValues({10, 20}); + VERIFY_IS_EQUAL(a(0), 10); + VERIFY_IS_EQUAL(a(1), 20); + VERIFY_IS_EQUAL(a(2), 2); + + // Chaining. + Tensor<int, 1> a2(3); + a2 = a.setValues({100, 200, 300}); + VERIFY_IS_EQUAL(a(0), 100); + VERIFY_IS_EQUAL(a(1), 200); + VERIFY_IS_EQUAL(a(2), 300); + VERIFY_IS_EQUAL(a2(0), 100); + VERIFY_IS_EQUAL(a2(1), 200); + VERIFY_IS_EQUAL(a2(2), 300); + + Tensor<int, 2> b(2, 3); + b.setValues({{0, 1, 2}, {3, 4, 5}}); + VERIFY_IS_EQUAL(b(0, 0), 0); + VERIFY_IS_EQUAL(b(0, 1), 1); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 3); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + // It fills the top-left slice. + b.setValues({{10, 20}, {30}}); + VERIFY_IS_EQUAL(b(0, 0), 10); + VERIFY_IS_EQUAL(b(0, 1), 20); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 30); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + Eigen::Tensor<int, 3> c(3, 2, 4); + c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}}, + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{20, 21, 22, 23}, {24, 25, 26, 27}}}); + VERIFY_IS_EQUAL(c(0, 0, 0), 0); + VERIFY_IS_EQUAL(c(0, 0, 1), 1); + VERIFY_IS_EQUAL(c(0, 0, 2), 2); + VERIFY_IS_EQUAL(c(0, 0, 3), 3); + VERIFY_IS_EQUAL(c(0, 1, 0), 4); + VERIFY_IS_EQUAL(c(0, 1, 1), 5); + VERIFY_IS_EQUAL(c(0, 1, 2), 6); + VERIFY_IS_EQUAL(c(0, 1, 3), 7); + VERIFY_IS_EQUAL(c(1, 0, 0), 10); + VERIFY_IS_EQUAL(c(1, 0, 1), 11); + VERIFY_IS_EQUAL(c(1, 0, 2), 12); + VERIFY_IS_EQUAL(c(1, 0, 3), 13); + VERIFY_IS_EQUAL(c(1, 1, 0), 14); + VERIFY_IS_EQUAL(c(1, 1, 1), 15); + VERIFY_IS_EQUAL(c(1, 1, 2), 16); + VERIFY_IS_EQUAL(c(1, 1, 3), 17); + VERIFY_IS_EQUAL(c(2, 0, 0), 20); + VERIFY_IS_EQUAL(c(2, 0, 1), 21); + VERIFY_IS_EQUAL(c(2, 0, 2), 22); + VERIFY_IS_EQUAL(c(2, 0, 3), 23); + VERIFY_IS_EQUAL(c(2, 1, 0), 24); + VERIFY_IS_EQUAL(c(2, 1, 1), 25); + VERIFY_IS_EQUAL(c(2, 1, 2), 26); + VERIFY_IS_EQUAL(c(2, 1, 3), 27); +#endif // EIGEN_HAS_VARIADIC_TEMPLATES +} + +void test_cxx11_tensor_assign() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_same_type()); + CALL_SUBTEST(test_auto_resize()); + CALL_SUBTEST(test_compound_assign()); + CALL_SUBTEST(test_std_initializers_tensor()); +} diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp new file mode 100644 index 000000000..2ddf47234 --- /dev/null +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -0,0 +1,194 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
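// Broadcasting replicates a tensor along each dimension: broadcasting a
// tensor of dimensions (d0, d1, d2) by factors (b0, b1, b2) produces an
// expression of dimensions (d0*b0, d1*b1, d2*b2) whose coefficient (i, j, k)
// equals the input coefficient (i % d0, j % d1, k % d2). The checks below
// verify exactly this index mapping, for both layouts and for compile-time
// (IndexList) as well as runtime broadcast factors.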
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <int DataLayout> +static void test_simple_broadcasting() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 1; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 1; + + Tensor<float, 4, DataLayout> no_broadcast; + no_broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2); + VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3); + VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l)); + } + } + } + } + + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 1; + broadcasts[3] = 4; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 4); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 28); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 28; ++l) { + VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l)); + } + } + } + } +} + + +template <int DataLayout> +static void test_vectorized_broadcasting() +{ + Tensor<float, 3, DataLayout> tensor(8,3,5); + tensor.setRandom(); + array<ptrdiff_t, 3> broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; + + Tensor<float, 3, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +template <int DataLayout> +static void test_static_broadcasting() +{ + Tensor<float, 3, DataLayout> tensor(8,3,5); + tensor.setRandom(); + +#ifdef EIGEN_HAS_CONSTEXPR + Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; +#else + Eigen::array<int, 3> broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; +#endif + + Tensor<float, 3, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int 
k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +template <int DataLayout> +static void test_fixed_size_broadcasting() +{ + // Need to add a [] operator to the Size class for this to work +#if 0 + Tensor<float, 1, DataLayout> t1(10); + t1.setRandom(); + TensorFixedSize<float, Sizes<1>, DataLayout> t2; + t2 = t2.constant(20.0f); + + Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + } + + TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}}); + Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); + } +#endif +} + + +void test_cxx11_tensor_broadcasting() +{ + CALL_SUBTEST(test_simple_broadcasting<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting<RowMajor>()); + CALL_SUBTEST(test_vectorized_broadcasting<ColMajor>()); + CALL_SUBTEST(test_vectorized_broadcasting<RowMajor>()); + CALL_SUBTEST(test_static_broadcasting<ColMajor>()); + CALL_SUBTEST(test_static_broadcasting<RowMajor>()); + CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>()); + CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp new file mode 100644 index 000000000..4f7ff7067 --- /dev/null +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::array; + +static void test_simple_cast() +{ + Tensor<float, 2> ftensor(20,30); + ftensor.setRandom(); + Tensor<char, 2> chartensor(20,30); + chartensor.setRandom(); + Tensor<std::complex<float>, 2> cplextensor(20,30); + cplextensor.setRandom(); + + chartensor = ftensor.cast<char>(); + cplextensor = ftensor.cast<std::complex<float>>(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j))); + VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float>>(ftensor(i,j))); + } + } +} + + +void test_cxx11_tensor_casts() +{ + CALL_SUBTEST(test_simple_cast()); +} diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp new file mode 100644 index 000000000..d83417872 --- /dev/null +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -0,0 +1,397 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
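// Chipping fixes one dimension of a tensor at a given offset, yielding an
// expression of rank N-1. For example, chip<2>(2) on a (2,3,5,7,11) tensor
// selects the slice with the third index fixed to 2, i.e. a (2,3,7,11) view.
// Both the compile-time form chip<Dim>(offset) and the runtime form
// chip(offset, dim) are exercised below, as rvalues, inside expressions, and
// as lvalues that are assigned into.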
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<int DataLayout> +static void test_simple_chip() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor<float, 4, DataLayout> chip1; + chip1 = tensor.template chip<0>(1); + + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip2 = tensor.template chip<1>(1); + VERIFY_IS_EQUAL(chip2.dimension(0), 2); + VERIFY_IS_EQUAL(chip2.dimension(1), 5); + VERIFY_IS_EQUAL(chip2.dimension(2), 7); + VERIFY_IS_EQUAL(chip2.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip3 = tensor.template chip<2>(2); + VERIFY_IS_EQUAL(chip3.dimension(0), 2); + VERIFY_IS_EQUAL(chip3.dimension(1), 3); + VERIFY_IS_EQUAL(chip3.dimension(2), 7); + VERIFY_IS_EQUAL(chip3.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip4(tensor.template chip<3>(5)); + VERIFY_IS_EQUAL(chip4.dimension(0), 2); + VERIFY_IS_EQUAL(chip4.dimension(1), 3); + VERIFY_IS_EQUAL(chip4.dimension(2), 5); + VERIFY_IS_EQUAL(chip4.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip5(tensor.template chip<4>(7)); + VERIFY_IS_EQUAL(chip5.dimension(0), 2); + VERIFY_IS_EQUAL(chip5.dimension(1), 3); + VERIFY_IS_EQUAL(chip5.dimension(2), 5); + VERIFY_IS_EQUAL(chip5.dimension(3), 7); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7)); + } + } + } + } +} + +template<int DataLayout> +static void test_dynamic_chip() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor<float, 4, DataLayout> chip1; + chip1 = tensor.chip(1, 0); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1); + VERIFY_IS_EQUAL(chip2.dimension(0), 2); + VERIFY_IS_EQUAL(chip2.dimension(1), 5); + VERIFY_IS_EQUAL(chip2.dimension(2), 7); + VERIFY_IS_EQUAL(chip2.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2); + VERIFY_IS_EQUAL(chip3.dimension(0), 2); + VERIFY_IS_EQUAL(chip3.dimension(1), 
3); + VERIFY_IS_EQUAL(chip3.dimension(2), 7); + VERIFY_IS_EQUAL(chip3.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3)); + VERIFY_IS_EQUAL(chip4.dimension(0), 2); + VERIFY_IS_EQUAL(chip4.dimension(1), 3); + VERIFY_IS_EQUAL(chip4.dimension(2), 5); + VERIFY_IS_EQUAL(chip4.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4)); + VERIFY_IS_EQUAL(chip5.dimension(0), 2); + VERIFY_IS_EQUAL(chip5.dimension(1), 3); + VERIFY_IS_EQUAL(chip5.dimension(2), 5); + VERIFY_IS_EQUAL(chip5.dimension(3), 7); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7)); + } + } + } + } +} + +template<int DataLayout> +static void test_chip_in_expr() { + Tensor<float, 5, DataLayout> input1(2,3,5,7,11); + input1.setRandom(); + Tensor<float, 4, DataLayout> input2(3,5,7,11); + input2.setRandom(); + + Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + float expected = input1(0,i,j,k,l) + input2(i,j,k,l); + VERIFY_IS_EQUAL(result(i,j,k,l), expected); + } + } + } + } + + Tensor<float, 3, DataLayout> input3(3,7,11); + input3.setRandom(); + Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 11; ++k) { + float expected = input1(0,i,2,j,k) + input3(i,j,k); + VERIFY_IS_EQUAL(result2(i,j,k), expected); + } + } + } +} + +template<int DataLayout> +static void test_chip_as_lvalue() +{ + Tensor<float, 5, DataLayout> input1(2,3,5,7,11); + input1.setRandom(); + + Tensor<float, 4, DataLayout> input2(3,5,7,11); + input2.setRandom(); + Tensor<float, 5, DataLayout> tensor = input1; + tensor.template chip<0>(1) = input2; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input3(2,5,7,11); + input3.setRandom(); + tensor = input1; + tensor.template chip<1>(1) = input3; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (j != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input4(2,3,7,11); + input4.setRandom(); + tensor = input1; + tensor.template chip<2>(3) = input4; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (k != 3) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), 
input4(i,j,l,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input5(2,3,5,11); + input5.setRandom(); + tensor = input1; + tensor.template chip<3>(4) = input5; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (l != 4) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input6(2,3,5,7); + input6.setRandom(); + tensor = input1; + tensor.template chip<4>(5) = input6; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (m != 5) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); + } + } + } + } + } + } + + Tensor<float, 5, DataLayout> input7(2,3,5,7,11); + input7.setRandom(); + tensor = input1; + tensor.chip(0, 0) = input7.chip(0, 0); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 0) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); + } + } + } + } + } + } +} + + +template<int DataLayout> +static void test_chip_raw_data() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4; + auto chip = Evaluator4(tensor.template chip<4>(3), DefaultDevice()); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + int chip_index; + if (DataLayout == ColMajor) { + chip_index = i + 2 * (j + 3 * (k + 5 * l)); + } else { + chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i))); + } + VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3)); + } + } + } + } + + typedef TensorEvaluator<decltype(tensor.template chip<0>(0)), DefaultDevice> Evaluator0; + auto chip0 = Evaluator0(tensor.template chip<0>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1; + auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2; + auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3; + auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0)); +} + +void test_cxx11_tensor_chipping() +{ + CALL_SUBTEST(test_simple_chip<ColMajor>()); + CALL_SUBTEST(test_simple_chip<RowMajor>()); + CALL_SUBTEST(test_dynamic_chip<ColMajor>()); + CALL_SUBTEST(test_dynamic_chip<RowMajor>()); + CALL_SUBTEST(test_chip_in_expr<ColMajor>()); + CALL_SUBTEST(test_chip_in_expr<RowMajor>()); + CALL_SUBTEST(test_chip_as_lvalue<ColMajor>()); + CALL_SUBTEST(test_chip_as_lvalue<RowMajor>()); + CALL_SUBTEST(test_chip_raw_data<ColMajor>()); + CALL_SUBTEST(test_chip_raw_data<RowMajor>()); +} diff --git 
a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
new file mode 100644
index 000000000..186f56ac3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_orderings()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<bool, 3> lt(2,3,7);
+  Tensor<bool, 3> le(2,3,7);
+  Tensor<bool, 3> gt(2,3,7);
+  Tensor<bool, 3> ge(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  lt = mat1 < mat2;
+  le = mat1 <= mat2;
+  gt = mat1 > mat2;
+  ge = mat1 >= mat2;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k));
+        VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k));
+        VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k));
+        VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_equality()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        // Make roughly half of the entries equal so that both outcomes of the
+        // element-wise comparisons are exercised.
+        if (internal::random<bool>()) {
+          mat2(i,j,k) = mat1(i,j,k);
+        }
+      }
+    }
+  }
+
+  Tensor<bool, 3> eq(2,3,7);
+  Tensor<bool, 3> ne(2,3,7);
+  eq = (mat1 == mat2);
+  ne = (mat1 != mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k));
+        VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_comparisons()
+{
+  CALL_SUBTEST(test_orderings());
+  CALL_SUBTEST(test_equality());
+}
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
new file mode 100644
index 000000000..9fdf33c16
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_dimension_failures()
+{
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(3, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  // Okay; other dimensions are equal.
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+
+  // Dimension mismatches.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2));
+
+  // Axis >= NumDims or < 0.
+ VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3)); + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1)); +} + +template<int DataLayout> +static void test_static_dimension_failure() +{ + Tensor<int, 2, DataLayout> left(2, 3); + Tensor<int, 3, DataLayout> right(2, 3, 1); + +#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE + // Technically compatible, but we static assert that the inputs have same + // NumDims. + Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0); +#endif + + // This can be worked around in this case. + Tensor<int, 3, DataLayout> concatenation = left + .reshape(Tensor<int, 3>::Dimensions{{2, 3, 1}}) + .concatenate(right, 0); + Tensor<int, 2, DataLayout> alternative = left + .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0); +} + +template<int DataLayout> +static void test_simple_concatenation() +{ + Tensor<int, 3, DataLayout> left(2, 3, 1); + Tensor<int, 3, DataLayout> right(2, 3, 1); + left.setRandom(); + right.setRandom(); + + Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0); + VERIFY_IS_EQUAL(concatenation.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0)); + } + } + + concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0)); + } + } + + concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0)); + } + } +} + + +// TODO(phli): Add test once we have a real vectorized implementation. +// static void test_vectorized_concatenation() {} + + +void test_cxx11_tensor_concatenation() +{ + CALL_SUBTEST(test_dimension_failures<ColMajor>()); + CALL_SUBTEST(test_dimension_failures<RowMajor>()); + CALL_SUBTEST(test_static_dimension_failure<ColMajor>()); + CALL_SUBTEST(test_static_dimension_failure<RowMajor>()); + CALL_SUBTEST(test_simple_concatenation<ColMajor>()); + CALL_SUBTEST(test_simple_concatenation<RowMajor>()); + // CALL_SUBTEST(test_vectorized_concatenation()); +} diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp new file mode 100644 index 000000000..0ffb02afd --- /dev/null +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
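+
+// A TensorMap instantiated over const data is read-only; assigning from it
+// must yield the same values as the tensor it maps.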
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> +using Eigen::Tensor; + + + + +static void test_simple_assign() +{ + Tensor<int, 3> random(2,3,7); + random.setRandom(); + + TensorMap<Tensor<const int, 3> > constant(random.data(), 2, 3, 7); + Tensor<int, 3> result(2,3,7); + result = constant; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k)); + } + } + } +} + +void test_cxx11_tensor_const() +{ + CALL_SUBTEST(test_simple_assign()); +} diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp new file mode 100644 index 000000000..9599607c6 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contract_cuda.cpp @@ -0,0 +1,121 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +static void test_cuda_contraction(int m_size, int k_size, int n_size) +{ + std::cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl; + // the output has m_size * n_size elements, which for the sizes used by the + // caller can far exceed 30 * 1024, the number of threads resident on a + // 15 SM GK110 GPU + Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size)); + Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size)); + Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size)); + Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size)); + Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size)); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size)); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size)); + + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); +
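// Compare the device result element-wise against the host reference
+ // computed above; anything off by 1e-4 or more is reported as a mismatch. +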
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] + << " vs " << t_result_gpu.data()[i] << std::endl; + assert(false); + } + } + + cudaFree((void*)d_t_left); + cudaFree((void*)d_t_right); + cudaFree((void*)d_t_result); +} + + +void test_cxx11_tensor_cuda() +{ + std::cout<<"Calling contraction tests"<<std::endl; + CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128)); + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128)); + } + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k)); + } + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128)); + } + + int m_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025 }; + int n_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025 }; + + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i < 15; i++) + for (int j = 0; j < 15; j++) + for (int k = 0; k < 17; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); + } +} diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp new file mode 100644 index 000000000..2bcae90b8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -0,0 +1,480 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
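+
+// A contraction is specified as DimPair(l, r) entries: dimension l of the
+// left tensor is summed against dimension r of the right one. With rank-2
+// tensors and DimPair(1, 0) this reduces to an ordinary matrix product (see
+// test_expr() below).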
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::DefaultDevice; +using Eigen::Tensor; + +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +static void test_evals() +{ + Tensor<float, 2, DataLayout> mat1(2, 3); + Tensor<float, 2, DataLayout> mat2(2, 3); + Tensor<float, 2, DataLayout> mat3(3, 2); + + mat1.setRandom(); + mat2.setRandom(); + mat3.setRandom(); + + Tensor<float, 2, DataLayout> mat4(3,3); + mat4.setZero(); + Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator; + Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice()); + eval.evalTo(mat4.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 3); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2)); + + Tensor<float, 2, DataLayout> mat5(2,2); + mat5.setZero(); + Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2; + Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice()); + eval2.evalTo(mat5.data()); + EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval2.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2)); + VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2)); + + Tensor<float, 2, DataLayout> mat6(2,2); + mat6.setZero(); + Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3; + Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice()); + eval3.evalTo(mat6.data()); + EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval3.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval3.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1)); + VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1)); +} + +template<int DataLayout> +static void test_scalar() +{ + Tensor<float, 1, DataLayout> vec1({6}); + Tensor<float, 1, DataLayout> vec2({6}); + + vec1.setRandom(); + vec2.setRandom(); + + Tensor<float, 1, DataLayout> scalar(1); + scalar.setZero(); + Eigen::array<DimPair, 1> dims({{DimPair(0, 
0)}}); + typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator; + Evaluator eval(vec1.contract(vec2, dims), DefaultDevice()); + eval.evalTo(scalar.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + + float expected = 0.0f; + for (int i = 0; i < 6; ++i) { + expected += vec1(i) * vec2(i); + } + VERIFY_IS_APPROX(scalar(0), expected); +} + +template<int DataLayout> +static void test_multidims() +{ + Tensor<float, 3, DataLayout> mat1(2, 2, 2); + Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 3, DataLayout> mat3(2, 2, 2); + mat3.setZero(); + Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator; + Evaluator eval(mat1.contract(mat2, dims), DefaultDevice()); + eval.evalTo(mat3.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 2); + VERIFY_IS_EQUAL(eval.dimensions()[2], 2); + + VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) + + mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) + + mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) + + mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) + + mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1)); + VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) + + mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) + + mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) + + mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) + + mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); +} + +template<int DataLayout> +static void test_holes() { + Tensor<float, 4, DataLayout> t1(2, 5, 7, 3); + Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3); + t1.setRandom(); + t2.setRandom(); + + Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(3, 4)}}); + Tensor<float, 5, DataLayout> result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 7); + VERIFY_IS_EQUAL(result.dimension(3), 11); + VERIFY_IS_EQUAL(result.dimension(4), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 5; ++l) { + for (int m = 0; m < 5; ++m) { + VERIFY_IS_APPROX(result(i, j, k, l, m), + t1(0, i, j, 0) * t2(0, k, l, m, 0) + + t1(1, i, j, 0) * t2(1, k, l, m, 0) + + t1(0, i, j, 1) * t2(0, k, l, m, 1) + + t1(1, i, j, 1) * t2(1, k, l, m, 1) + + t1(0, i, j, 2) * t2(0, k, l, m, 2) + + t1(1, i, j, 2) * t2(1, k, l, m, 2)); + } + } + } + } + } +} + +template<int DataLayout> +static void test_full_redux() +{ + Tensor<float, 2, DataLayout> t1(2, 2); + Tensor<float, 3, DataLayout> t2(2, 2, 2); + t1.setRandom(); + t2.setRandom(); + + Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 
1)}}); + Tensor<float, 1, DataLayout> result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) + + t1(0, 1) * t2(0, 1, 0) + t1(1, 1) * t2(1, 1, 0)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) + t1(1, 0) * t2(1, 0, 1) + + t1(0, 1) * t2(0, 1, 1) + t1(1, 1) * t2(1, 1, 1)); + + dims[0] = DimPair(1, 0); + dims[1] = DimPair(2, 1); + result = t2.contract(t1, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(0, 1, 0) + + t1(0, 1) * t2(0, 0, 1) + t1(1, 1) * t2(0, 1, 1)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) + t1(1, 0) * t2(1, 1, 0) + + t1(0, 1) * t2(1, 0, 1) + t1(1, 1) * t2(1, 1, 1)); +} + +template<int DataLayout> +static void test_contraction_of_contraction() +{ + Tensor<float, 2, DataLayout> t1(2, 2); + Tensor<float, 2, DataLayout> t2(2, 2); + Tensor<float, 2, DataLayout> t3(2, 2); + Tensor<float, 2, DataLayout> t4(2, 2); + t1.setRandom(); + t2.setRandom(); + t3.setRandom(); + t4.setRandom(); + + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + auto contract1 = t1.contract(t2, dims); + auto diff = t3 - contract1; + auto contract2 = t1.contract(t4, dims); + Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 2); + + Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> + m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2), + m4(t4.data(), 2, 2); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> + expected = (m1 * m4) * (m3 - m1 * m2); + + VERIFY_IS_APPROX(result(0, 0), expected(0, 0)); + VERIFY_IS_APPROX(result(0, 1), expected(0, 1)); + VERIFY_IS_APPROX(result(1, 0), expected(1, 0)); + VERIFY_IS_APPROX(result(1, 1), expected(1, 1)); +} + +template<int DataLayout> +static void test_expr() +{ + Tensor<float, 2, DataLayout> mat1(2, 3); + Tensor<float, 2, DataLayout> mat2(3, 2); + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 2, DataLayout> mat3(2,2); + + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1)); + VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); +} + +template<int DataLayout> +static void test_out_of_order_contraction() +{ + Tensor<float, 3, DataLayout> mat1(2, 2, 2); + Tensor<float, 3, DataLayout> mat2(2, 2, 2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 2, DataLayout> mat3(2, 2); + + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(0, 2)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) + + mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + + Eigen::array<DimPair, 2> dims2({{DimPair(0, 2), 
DimPair(2, 0)}}); + mat3 = mat1.contract(mat2, dims2); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) + + mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + +} + +template<int DataLayout> +static void test_consistency() +{ + // this does something like testing (A*B)^T = (B^T * A^T) + + Tensor<float, 3, DataLayout> mat1(4, 3, 5); + Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4); + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5); + Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5); + + // contract on dimensions of size 4 and 3 + Eigen::array<DimPair, 2> dims1({{DimPair(0, 4), DimPair(1, 0)}}); + Eigen::array<DimPair, 2> dims2({{DimPair(4, 0), DimPair(0, 1)}}); + + mat3 = mat1.contract(mat2, dims1); + mat4 = mat2.contract(mat1, dims2); + + // check that these are equal except for ordering of dimensions + if (DataLayout == ColMajor) { + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]); + } + } + } else { + // Row major + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]); + } + } + } +} + +template<int DataLayout> +static void test_large_contraction() +{ + Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); + Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // Add a little offset so that the results won't be close to zero. 
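+ // (This contraction collapses dims (2,3) of t_left against dims (0,1) of
+ // t_right, summing over 8 * 31 = 248 shared coefficients: exactly the
+ // (30*50 = 1500) x 248 times 248 x (7*20*10 = 1400) product mapped below.)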
+ t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + +template<int DataLayout> +static void test_matrix_vector() +{ + Tensor<float, 2, DataLayout> t_left(30, 50); + Tensor<float, 1, DataLayout> t_right(50); + Tensor<float, 1, DataLayout> t_result(30); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 30, 50); + MapXf m_right(t_right.data(), 50, 1); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}}; + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); + } +} + + +template<int DataLayout> +static void test_tensor_vector() +{ + Tensor<float, 3, DataLayout> t_left(7, 13, 17); + Tensor<float, 2, DataLayout> t_right(1, 7); + + t_left.setRandom(); + t_right.setRandom(); + + typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair; + Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}}; + Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 7, 13*17); + MapXf m_right(t_right.data(), 1, 7); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose(); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); + } +} + + +template<int DataLayout> +static void test_small_blocking_factors() +{ + Tensor<float, 4, DataLayout> t_left(30, 5, 3, 31); + Tensor<float, 5, DataLayout> t_right(3, 31, 7, 20, 1); + t_left.setRandom(); + t_right.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + // Force the cache sizes, which results in smaller blocking factors. 
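+ // setCpuCacheSizes(l1, l2, l3) overrides the detected per-level cache sizes
+ // (in bytes); values this small force tiny GEMM blocking factors.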
+ Eigen::setCpuCacheSizes(896, 1920, 2944); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Tensor<float, 5, DataLayout> t_result; + t_result = t_left.contract(t_right, dims); + + // compute result using a simple eigen matrix product + Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_left(t_left.data(), 150, 93); + Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_contraction() +{ + CALL_SUBTEST(test_evals<ColMajor>()); + CALL_SUBTEST(test_evals<RowMajor>()); + CALL_SUBTEST(test_scalar<ColMajor>()); + CALL_SUBTEST(test_scalar<RowMajor>()); + CALL_SUBTEST(test_multidims<ColMajor>()); + CALL_SUBTEST(test_multidims<RowMajor>()); + CALL_SUBTEST(test_holes<ColMajor>()); + CALL_SUBTEST(test_holes<RowMajor>()); + CALL_SUBTEST(test_full_redux<ColMajor>()); + CALL_SUBTEST(test_full_redux<RowMajor>()); + CALL_SUBTEST(test_contraction_of_contraction<ColMajor>()); + CALL_SUBTEST(test_contraction_of_contraction<RowMajor>()); + CALL_SUBTEST(test_expr<ColMajor>()); + CALL_SUBTEST(test_expr<RowMajor>()); + CALL_SUBTEST(test_out_of_order_contraction<ColMajor>()); + CALL_SUBTEST(test_out_of_order_contraction<RowMajor>()); + CALL_SUBTEST(test_consistency<ColMajor>()); + CALL_SUBTEST(test_consistency<RowMajor>()); + CALL_SUBTEST(test_large_contraction<ColMajor>()); + CALL_SUBTEST(test_large_contraction<RowMajor>()); + CALL_SUBTEST(test_matrix_vector<ColMajor>()); + CALL_SUBTEST(test_matrix_vector<RowMajor>()); + CALL_SUBTEST(test_tensor_vector<ColMajor>()); + CALL_SUBTEST(test_tensor_vector<RowMajor>()); + CALL_SUBTEST(test_small_blocking_factors<ColMajor>()); + CALL_SUBTEST(test_small_blocking_factors<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp new file mode 100644 index 000000000..4672db463 --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -0,0 +1,141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
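+
+// convolve(kernel, dims) slides the kernel along the listed dimensions and
+// accumulates products without flipping the kernel (a correlation, in signal
+// processing terms); each convolved dimension shrinks to
+// input_size - kernel_size + 1.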
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::DefaultDevice; + +static void test_evals() +{ + Tensor<float, 2> input(3, 3); + Tensor<float, 1> kernel(2); + + input.setRandom(); + kernel.setRandom(); + + Tensor<float, 2> result(2,3); + result.setZero(); + Eigen::array<Tensor<float, 2>::Index, 1> dims3({0}); + + typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator; + Evaluator eval(input.convolve(kernel, dims3), DefaultDevice()); + eval.evalTo(result.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 +} + + +static void test_expr() +{ + Tensor<float, 2> input(3, 3); + Tensor<float, 2> kernel(2, 2); + input.setRandom(); + kernel.setRandom(); + + Tensor<float, 2> result(2,2); + Eigen::array<ptrdiff_t, 2> dims({0, 1}); + result = input.convolve(kernel, dims); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); +} + + +static void test_modes() { + Tensor<float, 1> input(3); + Tensor<float, 1> kernel(3); + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + const Eigen::array<ptrdiff_t, 1> dims{{0}}; + Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(0, 0); + Tensor<float, 1> valid(1); + valid = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor<float, 1> same(3); + same = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
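+ // Padding by (2, 2) grows the length-3 input to 7, and the length-3 kernel
+ // then yields 7 - 3 + 1 = 5 outputs: numpy's "full" length N + K - 1.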
+ padding[0] = std::make_pair(2, 2); + Tensor<float, 1> full(5); + full = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); +} + + +static void test_strides() { + Tensor<float, 1> input(13); + Tensor<float, 1> kernel(3); + input.setRandom(); + kernel.setRandom(); + + const Eigen::array<ptrdiff_t, 1> dims{{0}}; + const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}}; + const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}}; + + Tensor<float, 1> result; + result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + + + + +void test_cxx11_tensor_convolution() +{ + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_modes()); + CALL_SUBTEST(test_strides()); +} diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp new file mode 100644 index 000000000..8c1ca1bf8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -0,0 +1,514 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// TODO(mdevin): Free the cuda memory. 
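+// (Every subtest below cudaMalloc()s device buffers that are never freed.)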
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +void test_cuda_elementwise_small() { + Tensor<float, 1> in1(Eigen::array<int, 1>(2)); + Tensor<float, 1> in2(Eigen::array<int, 1>(2)); + Tensor<float, 1> out(Eigen::array<int, 1>(2)); + in1.setRandom(); + in2.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( + d_in1, Eigen::array<int, 1>(2)); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2( + d_in2, Eigen::array<int, 1>(2)); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out( + d_out, Eigen::array<int, 1>(2)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX( + out(Eigen::array<int, 1>(i)), + in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i))); + } +} + +void test_cuda_elementwise() +{ + Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); + in1.setRandom(); + in2.setRandom(); + in3.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t in3_bytes = in3.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_in3; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_in3), in3_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) 
{ + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k))); + } + } + } +} + + +void test_cuda_reduction() +{ + Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113)); + Tensor<float, 2> out(Eigen::array<int, 2>(72,97)); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113)); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97)); + + array<int, 2> reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = 0; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = + std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l))); + } + } + VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected); + } + } +} + +template<int DataLayout> +static void test_cuda_contraction() +{ + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31)); + Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1)); + Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > + gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31)); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > + gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1)); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1)); + + typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; + MapXf m_left(t_left.data(), 300, 93); + MapXf m_right(t_right.data(), 93, 140); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims; 
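+ // Contract dims (2,3) of the left tensor against dims (0,1) of the right
+ // one; this matches the 300 x 93 times 93 x 140 host-side product computed
+ // through the maps above.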
+ dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); + + m_result = m_left * m_right; + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + + cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + assert(false); + } + } +} + +static void test_cuda_convolution_1d() +{ + Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); + Tensor<float, 1> kernel(Eigen::array<int, 1>(4)); + Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); + Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4)); + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137)); + + Eigen::array<int, 1> dims(1); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 34; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(Eigen::array<int, 4>(i,j,k,l)); + const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) + + input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) + + input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) + + input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + + +static void test_cuda_convolution_2d() +{ + Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); + Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4)); + Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == 
cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4)); + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137)); + + Eigen::array<int, 2> dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(Eigen::array<int, 4>(i,j,k,l)); + const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) + + input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) + + input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) + + input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) + + input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) + + input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) + + input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) + + input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) + + input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) + + input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) + + input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) + + input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + + +static void test_cuda_convolution_3d() +{ + Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17)); + Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2)); + Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2)); + Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17)); + + Eigen::array<int, 3> dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + 
for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(Eigen::array<int, 5>(i,j,k,l,m)); + const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) + + input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) + + input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) + + input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) + + input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) + + input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) + + input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) + + input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) + + input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) + + input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) + + input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) + + input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) + + input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) + + input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) + + input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) + + input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) + + input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) + + input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) + + input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) + + input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) + + input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) + + input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) + + input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) + + input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + } +} + +static float* CudaCopyFloat(float* data, int size) { + const int nbytes = size * sizeof(float); + float* result = NULL; + if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) { + return NULL; + } else { + if (data != NULL) { + cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice); + } + return result; + } +} + +static void test_cuda_constant_broadcast() +{ + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Tensor<float, 1> t1(10); + for (int i = 0; i < 10; ++i) { + t1(i) = 10.0f * i; + } + float* t1_cuda = CudaCopyFloat(t1.data(), t1.size()); + Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10); + + Tensor<float, 1> t2(1); + t2 = t2.constant(20.0f); + float* t2_cuda = CudaCopyFloat(t2.data(), t2.size()); + Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1); + + float* t3_cuda = CudaCopyFloat(NULL, 10); + Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10); + + t3_gpu.device(gpu_device) = + t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10)); + + Eigen::Tensor<float, 1> t3(10); + cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < 10; ++i) { + 
VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + } +} + + +void test_cuda_cast() +{ + Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); + in.setRandom(); + + std::size_t in_bytes = in.size() * sizeof(double); + std::size_t out_bytes = out.size() * sizeof(float); + + double* d_in; + float* d_out; + cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in.template cast<float>(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k)))); + } + } + } +} + + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST(test_cuda_elementwise_small()); + CALL_SUBTEST(test_cuda_elementwise()); + CALL_SUBTEST(test_cuda_reduction()); + CALL_SUBTEST(test_cuda_contraction<ColMajor>()); + CALL_SUBTEST(test_cuda_contraction<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_1d()); + CALL_SUBTEST(test_cuda_convolution_2d()); + CALL_SUBTEST(test_cuda_convolution_3d()); + CALL_SUBTEST(test_cuda_constant_broadcast()); + CALL_SUBTEST(test_cuda_cast()); +} diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp new file mode 100644 index 000000000..f2d7e4ce6 --- /dev/null +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -0,0 +1,391 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
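+
+// The expressions below are evaluated twice, once through a CPUContext and
+// once through a GPUContext; both devices must produce the same results.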
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_device +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +// Context for evaluation on cpu +struct CPUContext { + CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { + kernel_1d_(0) = 3.14f; + kernel_1d_(1) = 2.7f; + + kernel_2d_(0,0) = 3.14f; + kernel_2d_(1,0) = 2.7f; + kernel_2d_(0,1) = 0.2f; + kernel_2d_(1,1) = 7.0f; + + kernel_3d_(0,0,0) = 3.14f; + kernel_3d_(0,1,0) = 2.7f; + kernel_3d_(0,0,1) = 0.2f; + kernel_3d_(0,1,1) = 7.0f; + kernel_3d_(1,0,0) = -1.0f; + kernel_3d_(1,1,0) = -0.3f; + kernel_3d_(1,0,1) = -0.7f; + kernel_3d_(1,1,1) = -0.5f; + } + + const Eigen::DefaultDevice& device() const { return cpu_device_; } + + const Eigen::Tensor<float, 3>& in1() const { return in1_; } + const Eigen::Tensor<float, 3>& in2() const { return in2_; } + Eigen::Tensor<float, 3>& out() { return out_; } + const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; } + const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; } + const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; } + + private: + const Eigen::Tensor<float, 3>& in1_; + const Eigen::Tensor<float, 3>& in2_; + Eigen::Tensor<float, 3>& out_; + + Eigen::Tensor<float, 1> kernel_1d_; + Eigen::Tensor<float, 2> kernel_2d_; + Eigen::Tensor<float, 3> kernel_3d_; + + Eigen::DefaultDevice cpu_device_; +}; + + +// Context for evaluation on GPU +struct GPUContext { + GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { + assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + float kernel_1d_val[] = {3.14f, 2.7f}; + assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; + assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; + assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaStreamCreate(&stream_) == cudaSuccess); + } + ~GPUContext() { + assert(cudaFree(kernel_1d_) == cudaSuccess); + assert(cudaFree(kernel_2d_) == cudaSuccess); + assert(cudaFree(kernel_3d_) == cudaSuccess); + assert(cudaStreamDestroy(stream_) == cudaSuccess); + } + + const Eigen::GpuDevice& device() const { return gpu_device_; } + + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; } + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; } + Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; } + Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); } + Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); } + 
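// Like the 1d and 2d accessors above, kernel3d() wraps the raw device
+ // buffer allocated in the constructor in a TensorMap of matching rank. +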
Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); } + + private: + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_; + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_; + Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_; + + float* kernel_1d_; + float* kernel_2d_; + float* kernel_3d_; + + cudaStream_t stream_; + Eigen::GpuDevice gpu_device_; +}; + + +// The actual expression to evaluate +template <typename Context> +static void test_contextual_eval(Context* context) +{ + context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); +} + +template <typename Context> +static void test_forced_contextual_eval(Context* context) +{ + context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); +} + +template <typename Context> +static void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + +template <typename Context> +static void test_contraction(Context* context) +{ + Eigen::array<std::pair<int, int>, 2> dims; + dims[0] = std::make_pair(1, 1); + dims[1] = std::make_pair(2, 2); + + Eigen::array<int, 2> shape(40, 50*70); + + Eigen::DSizes<int, 2> indices(0,0); + Eigen::DSizes<int, 2> sizes(40,40); + + context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); +} + + +template <typename Context> +static void test_1d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(40,49,70); + + Eigen::array<int, 1> dims(1); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); +} + +template <typename Context> +static void test_2d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(40,49,69); + + Eigen::array<int, 2> dims(1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); +} + +template <typename Context> +static void test_3d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(39,49,69); + + Eigen::array<int, 3> dims(0,1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); +} + + +static void test_cpu() { + Eigen::Tensor<float, 3> in1(40,50,70); + Eigen::Tensor<float, 3> in2(40,50,70); + Eigen::Tensor<float, 3> out(40,50,70); + + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); + + CPUContext context(in1, in2, out); + test_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_forced_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); + } + } + } + + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } 
+ } + + test_contraction(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(i,j,0); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(i, k, l) * in2(j, k, l); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + + (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + + (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } +} + +static void test_gpu() { + Eigen::Tensor<float, 3> in1(40,50,70); + Eigen::Tensor<float, 3> in2(40,50,70); + Eigen::Tensor<float, 3> out(40,50,70); + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70); + + GPUContext context(gpu_in1, gpu_in2, gpu_out); + test_contextual_eval(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_forced_contextual_eval(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); + } + } + } + + test_compound_assignment(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_contraction(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) 
{ + for (int j = 0; j < 40; ++j) { + const float result = out(i,j,0); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(i, k, l) * in2(j, k, l); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + + in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); + VERIFY_IS_APPROX(expected, result); + } + } + } +} + + +void test_cxx11_tensor_device() +{ + CALL_SUBTEST(test_cpu()); + CALL_SUBTEST(test_gpu()); +} diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp new file mode 100644 index 000000000..0cc4e86f7 --- /dev/null +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
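+
+// A side-by-side sketch of the two dimension flavors exercised in this file
+// (illustration only; both expose the same query interface):
+//
+//   Eigen::DSizes<int, 3> dyn(2, 3, 7);   // extents chosen at run time
+//   Eigen::Sizes<2, 3, 7> stat;           // extents fixed at compile time
+//   // dyn.TotalSize() == stat.TotalSize() == 42
+//   // Eigen::dimensions_match(dyn, stat) holds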
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + + +static void test_dynamic_size() +{ + Eigen::DSizes<int, 3> dimensions(2,3,7); + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); + VERIFY_IS_EQUAL((int)dimensions[0], 2); + VERIFY_IS_EQUAL((int)dimensions[1], 3); + VERIFY_IS_EQUAL((int)dimensions[2], 7); +} + +static void test_fixed_size() +{ + Eigen::Sizes<2,3,7> dimensions; + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); +} + + +static void test_match() +{ + Eigen::DSizes<int, 3> dyn(2,3,7); + Eigen::Sizes<2,3,7> stat; + VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true); +} + + +void test_cxx11_tensor_dimension() +{ + CALL_SUBTEST(test_dynamic_size()); + CALL_SUBTEST(test_fixed_size()); + CALL_SUBTEST(test_match()); +} diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp new file mode 100644 index 000000000..695565e9b --- /dev/null +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -0,0 +1,314 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor<float, 1> vec1({6}); + Tensor<float, 1, RowMajor> vec2({6}); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap<Tensor<float, 1>> vec3(data3, 6); + vec3 = vec1.sqrt(); + float data4[6]; + TensorMap<Tensor<float, 1, RowMajor>> vec4(data4, 6); + vec4 = vec2.square(); + float data5[6]; + TensorMap<Tensor<float, 1, RowMajor>> vec5(data5, 6); + vec5 = vec2.cube(); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), 0.0f); + VERIFY_IS_APPROX(vec4(1), 1.0f); + VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f); + VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f); + VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f); + VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f); + + VERIFY_IS_APPROX(vec5(0), 0.0f); + VERIFY_IS_APPROX(vec5(1), 1.0f); + VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f); + VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f); + VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f); + VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap<Tensor<float, 2>> mat1(data1, 2, 3); + float data2[6]; + TensorMap<Tensor<float, 2, RowMajor>> mat2(data2, 2, 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + Tensor<float, 2> mat3(2,3); + Tensor<float, 2, RowMajor> mat4(2,3); + mat3 = mat1.abs(); + mat4 = mat2.abs(); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3, RowMajor> mat2(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + Tensor<float, 3> mat3(2,3,7); + mat3 = mat1 + mat1; + Tensor<float, 3, RowMajor> mat4(2,3,7); + mat4 = mat2 * 3.14f; + Tensor<float, 3> mat5(2,3,7); + mat5 = mat1.inverse().log(); + Tensor<float, 3, RowMajor> mat6(2,3,7); + mat6 = mat2.pow(0.5f) * 3.14f; + Tensor<float, 3> mat7(2,3,7); + mat7 = mat1.cwiseMax(mat5 * 2.0f).exp(); + Tensor<float, 3, RowMajor> mat8(2,3,7); + mat8 = (-mat2).exp() * 3.14f; + Tensor<float, 3, RowMajor> mat9(2,3,7); + mat9 = mat2 + 3.14f; + Tensor<float, 3, RowMajor> mat10(2,3,7); + mat10 = mat2 - 3.14f; + Tensor<float, 3, RowMajor> 
mat11(2,3,7); + mat11 = mat2 / 3.14f; + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), val + val); + VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f); + VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val)); + VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); + VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f))); + VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f); + VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f); + VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f); + VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f); + val += 1.0; + } + } + } +} + +static void test_constants() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.constant(3.14f); + mat3 = mat1.cwiseMax(7.3f).exp(); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), 3.14f); + VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f))); + val += 1.0; + } + } + } +} + +static void test_boolean() +{ + Tensor<int, 1> vec(6); + std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data()); + + // Test ||. + Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4); + VERIFY_IS_EQUAL(bool1[0], true); + VERIFY_IS_EQUAL(bool1[1], false); + VERIFY_IS_EQUAL(bool1[2], false); + VERIFY_IS_EQUAL(bool1[3], false); + VERIFY_IS_EQUAL(bool1[4], false); + VERIFY_IS_EQUAL(bool1[5], true); + + // Test &&, including cast of operand vec. + Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4); + VERIFY_IS_EQUAL(bool2[0], false); + VERIFY_IS_EQUAL(bool2[1], true); + VERIFY_IS_EQUAL(bool2[2], true); + VERIFY_IS_EQUAL(bool2[3], true); + VERIFY_IS_EQUAL(bool2[4], false); + VERIFY_IS_EQUAL(bool2[5], false); + + // Compilation tests: + // Test Tensor<bool> against results of cast or comparison; verifies that + // CoeffReturnType is set to match Op return type of bool for Unary and Binary + // Ops. + Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2; + bool3 = vec < vec.constant(4) && bool2; +} + +static void test_functors() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.inverse().unaryExpr(&asinf); + mat3 = mat1.unaryExpr(&tanhf); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k))); + VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k))); + val += 1.0; + } + } + } +} + +static void test_type_casting() +{ + Tensor<bool, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<double, 3> mat3(2,3,7); + mat1.setRandom(); + mat2.setRandom(); + + mat3 = mat1.template cast<double>(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 
1.0 : 0.0); + } + } + } + + mat3 = mat2.template cast<double>(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), static_cast<double>(mat2(i,j,k))); + } + } + } +} + +static void test_select() +{ + Tensor<float, 3> selector(2,3,7); + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> result(2,3,7); + + selector.setRandom(); + mat1.setRandom(); + mat2.setRandom(); + result = (selector > selector.constant(0.5f)).select(mat1, mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? mat1(i,j,k) : mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_expr() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_constants()); + CALL_SUBTEST(test_boolean()); + CALL_SUBTEST(test_functors()); + CALL_SUBTEST(test_type_casting()); + CALL_SUBTEST(test_select()); +} diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp new file mode 100644 index 000000000..8a27f5ad8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_1d() +{ + TensorFixedSize<float, Sizes<6> > vec1; + TensorFixedSize<float, Sizes<6>, RowMajor> vec2; + + VERIFY_IS_EQUAL((vec1.size()), 6); + // VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec1.dimension(0)), 6); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap<TensorFixedSize<float, Sizes<6> > > vec3(data3, 6); + vec3 = vec1.sqrt(); + float data4[6]; + TensorMap<TensorFixedSize<float, Sizes<6>, RowMajor> > vec4(data4, 6); + vec4 = vec2.sqrt(); + + VERIFY_IS_EQUAL((vec3.size()), 6); + VERIFY_IS_EQUAL(vec3.rank(), 1); + // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec3.dimension(0)), 6); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); + VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); + VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); + VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); + VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); + VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap<TensorFixedSize<float, Sizes<2, 3> >> mat1(data1,2,3); + float data2[6]; + TensorMap<TensorFixedSize<float, Sizes<2, 3>, 
RowMajor>> mat2(data2,2,3); + + VERIFY_IS_EQUAL((mat1.size()), 2*3); + VERIFY_IS_EQUAL(mat1.rank(), 2); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + TensorFixedSize<float, Sizes<2, 3>> mat3; + TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4; + mat3 = mat1.abs(); + mat4 = mat2.abs(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + TensorFixedSize<float, Sizes<2, 3, 7> > mat1; + TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat2; + + VERIFY_IS_EQUAL((mat1.size()), 2*3*7); + VERIFY_IS_EQUAL(mat1.rank(), 3); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); + + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + TensorFixedSize<float, Sizes<2, 3, 7> > mat3; + mat3 = mat1.sqrt(); + TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat4; + mat4 = mat2.sqrt(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3*7); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat3.dimension(2)), 7); + + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val)); + VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val)); + val += 1.0; + } + } + } +} + + +static void test_array() +{ + TensorFixedSize<float, Sizes<2, 3, 7> > mat1; + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + + TensorFixedSize<float, Sizes<2, 3, 7> > mat3; + mat3 = mat1.pow(3.5f); + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f)); + val += 1.0; + } + } + } +} + +void test_cxx11_tensor_fixed_size() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_array()); +} diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp new file mode 100644 index 000000000..ad9de867d --- /dev/null +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
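+
+// The idiom under test, sketched with the names used in test_const below:
+// .eval() materializes a subexpression into a temporary, which is what lets
+// the result of a reduction be reshaped and broadcast inside a larger
+// expression instead of participating as an unevaluated node.
+//
+//   Eigen::array<int, 1> depth_dim{{0}};
+//   auto col_max = input_tensor.maximum(depth_dim).eval();   // computed once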
+ +#include "main.h" + +#include <Eigen/Core> +#include <Eigen/CXX11/Tensor> + +using Eigen::MatrixXf; +using Eigen::Tensor; + +static void test_simple() +{ + MatrixXf m1(3,3); + MatrixXf m2(3,3); + m1.setRandom(); + m2.setRandom(); + + TensorMap<Tensor<float, 2>> mat1(m1.data(), 3,3); + TensorMap<Tensor<float, 2>> mat2(m2.data(), 3,3); + + Tensor<float, 2> mat3(3,3); + mat3 = mat1; + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + + mat3 = mat3.contract(mat2, dims).eval(); + + VERIFY_IS_APPROX(mat3(0, 0), (m1*m2).eval()(0,0)); + VERIFY_IS_APPROX(mat3(0, 1), (m1*m2).eval()(0,1)); + VERIFY_IS_APPROX(mat3(0, 2), (m1*m2).eval()(0,2)); + VERIFY_IS_APPROX(mat3(1, 0), (m1*m2).eval()(1,0)); + VERIFY_IS_APPROX(mat3(1, 1), (m1*m2).eval()(1,1)); + VERIFY_IS_APPROX(mat3(1, 2), (m1*m2).eval()(1,2)); + VERIFY_IS_APPROX(mat3(2, 0), (m1*m2).eval()(2,0)); + VERIFY_IS_APPROX(mat3(2, 1), (m1*m2).eval()(2,1)); + VERIFY_IS_APPROX(mat3(2, 2), (m1*m2).eval()(2,2)); +} + + +static void test_const() +{ + MatrixXf input(3,3); + input.setRandom(); + MatrixXf output = input; + output.rowwise() -= input.colwise().maxCoeff(); + + Eigen::array<int, 1> depth_dim; + depth_dim[0] = 0; + Tensor<float, 2>::Dimensions dims2d; + dims2d[0] = 1; + dims2d[1] = 3; + Eigen::array<int, 2> bcast; + bcast[0] = 3; + bcast[1] = 1; + const TensorMap<Tensor<const float, 2>> input_tensor(input.data(), 3, 3); + Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(output(i, j), output_tensor(i, j)); + } + } +} + + +void test_cxx11_tensor_forced_eval() +{ + CALL_SUBTEST(test_simple()); + CALL_SUBTEST(test_const()); +} diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp new file mode 100644 index 000000000..26854f5a4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -0,0 +1,476 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + Tensor<float, 5> single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7); + + for (int i = 0; i < tensor.size(); ++i) { + if (tensor.data()[i] != single_pixel_patch.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor<float, 5> entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected = tensor(d, r-1+i, c-2+j, b); + } + if (entire_image_patch(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + Tensor<float, 5> twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 7); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + int row_padding = 0; + int col_padding = 0; + int stride = 1; + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) { + expected = tensor(d, row_offset, col_offset, b); + } + if (twod_patch(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies VALID padding (no padding) with incrementing values. +static void test_patch_padding_valid() +{ + int input_depth = 3; + int input_rows = 3; + int input_cols = 3; + int input_batches = 1; + int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. 
+ Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // No padding is carried out. + int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r + i - row_padding; + int col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies VALID padding (no padding) with the same value. +static void test_patch_padding_valid_same_value() +{ + int input_depth = 1; + int input_rows = 5; + int input_cols = 5; + int input_batches = 2; + int ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. + Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + tensor = tensor.constant(11.0f); + Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // No padding is carried out. 
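+  // (Every input value is 11.0f here, so this case checks that extraction is
+  // value-independent; (5-3)/2+1 = 2 positions per spatial dimension gives
+  // the 4 patches asserted above.)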
+ int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r + i - row_padding; + int col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies SAME padding. +static void test_patch_padding_same() +{ + int input_depth = 3; + int input_rows = 4; + int input_cols = 2; + int input_batches = 1; + int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. + Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // Based on the calculation described in TensorTraits.h, padding happens to be + // 0. 
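+  // (SAME padding pads only as much as needed for ceil(input/stride) output
+  // positions; with input_rows=4, input_cols=2, ksize=2 and stride=2 the
+  // strided windows already tile the input exactly, hence zero padding.)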
+ int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +static void test_patch_no_extra_dim() +{ + Tensor<float, 3> tensor(2,3,5); + tensor.setRandom(); + + Tensor<float, 4> single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + + for (int i = 0; i < tensor.size(); ++i) { + if (tensor.data()[i] != single_pixel_patch.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor<float, 4> entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected = tensor(d, r-1+i, c-2+j); + } + if (entire_image_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected); + } + } + } + } + } + + Tensor<float, 4> twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ int row_padding = 0; + int col_padding = 0; + int stride = 1; + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) { + expected = tensor(d, row_offset, col_offset); + } + if (twod_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected); + } + } + } + } + } +} + + +static void test_imagenet_patches() +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + Tensor<float, 4> l_in(3, 128, 128, 128); + l_in.setRandom(); + Tensor<float, 5> l_out = l_in.extract_image_patches(11, 11); + VERIFY_IS_EQUAL(l_out.dimension(0), 3); + VERIFY_IS_EQUAL(l_out.dimension(1), 11); + VERIFY_IS_EQUAL(l_out.dimension(2), 11); + VERIFY_IS_EQUAL(l_out.dimension(3), 128*128); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 128; ++i) { + for (int j = 0; j < 128; ++j) { + int patchId = i+128*j; + for (int c = 0; c < 11; ++c) { + for (int r = 0; r < 11; ++r) { + for (int d = 0; d < 3; ++d) { + float expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in(d, r-5+i, c-5+j, b); + } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(64, 64, 64, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(9, 9); + VERIFY_IS_EQUAL(l_out.dimension(0), 64); + VERIFY_IS_EQUAL(l_out.dimension(1), 9); + VERIFY_IS_EQUAL(l_out.dimension(2), 9); + VERIFY_IS_EQUAL(l_out.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 64; ++j) { + int patchId = i+64*j; + for (int c = 0; c < 9; ++c) { + for (int r = 0; r < 9; ++r) { + for (int d = 0; d < 64; ++d) { + float expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in(d, r-4+i, c-4+j, b); + } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(128, 16, 16, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(7, 7); + VERIFY_IS_EQUAL(l_out.dimension(0), 128); + VERIFY_IS_EQUAL(l_out.dimension(1), 7); + VERIFY_IS_EQUAL(l_out.dimension(2), 7); + VERIFY_IS_EQUAL(l_out.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + int patchId = i+16*j; + for (int c = 0; c < 7; ++c) { + for (int r = 0; r < 7; ++r) { + for (int d = 0; d < 128; ++d) { + float expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in(d, r-3+i, c-3+j, b); + } + if 
(l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(384, 13, 13, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(3, 3); + VERIFY_IS_EQUAL(l_out.dimension(0), 384); + VERIFY_IS_EQUAL(l_out.dimension(1), 3); + VERIFY_IS_EQUAL(l_out.dimension(2), 3); + VERIFY_IS_EQUAL(l_out.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 13; ++i) { + for (int j = 0; j < 13; ++j) { + int patchId = i+13*j; + for (int c = 0; c < 3; ++c) { + for (int r = 0; r < 3; ++r) { + for (int d = 0; d < 384; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in(d, r-1+i, c-1+j, b); + } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +void test_cxx11_tensor_image_patch() +{ + CALL_SUBTEST(test_simple_patch()); + CALL_SUBTEST(test_patch_no_extra_dim()); + CALL_SUBTEST(test_patch_padding_valid()); + CALL_SUBTEST(test_patch_padding_valid_same_value()); + CALL_SUBTEST(test_patch_padding_same()); + CALL_SUBTEST(test_imagenet_patches()); +} diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp new file mode 100644 index 000000000..c4d4f244f --- /dev/null +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -0,0 +1,268 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
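+
+// The machinery under test, sketched: an IndexList can mix compile-time axes
+// (type2index<N>, fixed in the type) with run-time axes (plain int, assigned
+// through set()), and reductions accept either form. This needs constexpr
+// support, hence the guard below.
+//
+//   Eigen::IndexList<Eigen::type2index<0>, int> axes;
+//   axes.set(1, 2);                        // run-time slot: also reduce axis 2
+//   Eigen::Tensor<float, 2> r = t.sum(axes);   // t being some rank-4 tensor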
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +#ifdef EIGEN_HAS_CONSTEXPR + +static void test_static_index_list() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + constexpr auto reduction_axis = make_index_list(0, 1, 2); + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); + + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + + Tensor<float, 1> result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + + +static void test_type2index_list() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + tensor += tensor.constant(10.0f); + + typedef Eigen::IndexList<Eigen::type2index<0>> Dims0; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>> Dims1; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>> Dims2; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4; + +#if 0 + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + 
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const Dims0 reduction_axis0; + Tensor<float, 4> result0 = tensor.sum(reduction_axis0); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + VERIFY_IS_APPROX(result0(j,k,l,m), expected); + } + } + } + } + + const Dims1 reduction_axis1; + Tensor<float, 3> result1 = tensor.sum(reduction_axis1); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + float expected = 0.0f; + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + VERIFY_IS_APPROX(result1(k,l,m), expected); + } + } + } + + const Dims2 reduction_axis2; + Tensor<float, 2> result2 = tensor.sum(reduction_axis2); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + float expected = 0.0f; + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + VERIFY_IS_APPROX(result2(l,m), expected); + } + } + + const Dims3 reduction_axis3; + Tensor<float, 1> result3 = tensor.sum(reduction_axis3); + for (int m = 0; m < 11; ++m) { + float expected = 0.0f; + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + } + VERIFY_IS_APPROX(result3(m), expected); + } + + const Dims4 reduction_axis4; + Tensor<float, 1> result4 = tensor.sum(reduction_axis4); + float expected = 0.0f; + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + } + } + VERIFY_IS_APPROX(result4(0), expected); +} + + +static void test_dynamic_index_list() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + int dim1 = 2; + int dim2 = 1; + int dim3 = 0; + + auto reduction_axis = make_index_list(dim1, dim2, dim3); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0); + + Tensor<float, 1> result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + +static void test_mixed_index_list() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + int dim2 = 1; + int dim4 = 3; + + auto reduction_axis = make_index_list(0, dim2, 2, dim4); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); + 
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3); + + typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices; + ReductionIndices reduction_indices; + reduction_indices.set(1, 1); + reduction_indices.set(3, 3); + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + typedef IndexList<type2index<0>, type2index<1>, type2index<2>, type2index<3>> ReductionList; + ReductionList reduction_list; + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + Tensor<float, 1> result1 = tensor.sum(reduction_axis); + Tensor<float, 1> result2 = tensor.sum(reduction_indices); + Tensor<float, 1> result3 = tensor.sum(reduction_list); + + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + expected += tensor(i,j,k,l); + } + } + } + } + VERIFY_IS_APPROX(result1(0), expected); + VERIFY_IS_APPROX(result2(0), expected); + VERIFY_IS_APPROX(result3(0), expected); +} + +#endif + +void test_cxx11_tensor_index_list() +{ +#ifdef EIGEN_HAS_CONSTEXPR + CALL_SUBTEST(test_static_index_list()); + CALL_SUBTEST(test_type2index_list()); + CALL_SUBTEST(test_dynamic_index_list()); + CALL_SUBTEST(test_mixed_index_list()); +#endif +} diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp new file mode 100644 index 000000000..a510dc695 --- /dev/null +++ b/unsupported/test/cxx11_tensor_intdiv.cpp @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + + +static void test_signed_32bit() +{ + for (int32_t i = 1; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<int32_t> div(i); + + for (int32_t j = 0; j < 25000; ++j) { + const int32_t fast_div = j / div; + const int32_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_unsigned_32bit() +{ + for (uint32_t i = 1; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<uint32_t> div(i); + + for (uint32_t j = 0; j < 25000; ++j) { + const uint32_t fast_div = j / div; + const uint32_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_signed_64bit() +{ + for (int64_t i = 2; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<int64_t> div(i); + + for (int64_t j = 0; j < 25000; ++j) { + const int64_t fast_div = j / div; + const int64_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_unsigned_64bit() +{ + for (uint64_t i = 2; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<uint64_t> div(i); + + for (uint64_t j = 0; j < 25000; ++j) { + const uint64_t fast_div = j / div; + const uint64_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +void test_cxx11_tensor_intdiv() +{ + CALL_SUBTEST(test_signed_32bit()); + CALL_SUBTEST(test_unsigned_32bit()); + CALL_SUBTEST(test_signed_64bit()); + CALL_SUBTEST(test_unsigned_64bit()); +} diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp new file mode 100644 index 000000000..8bbcf7089 --- /dev/null +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -0,0 +1,114 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
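+
+// What the checks below pin down (sketch): tensors print through the usual
+// stream operator, rank 1 as one coefficient per line and rank 2 as a
+// row-per-line grid, matching dense Matrix formatting.
+//
+//   Eigen::Tensor<int, 1> t(3);
+//   t(0) = 0; t(1) = 1; t(2) = 2;
+//   std::cout << t;   // prints 0\n1\n2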
+ +#include "main.h" +#include <sstream> +#include <string> +#include <Eigen/CXX11/Tensor> + + +template<int DataLayout> +static void test_output_1d() +{ + Tensor<int, 1, DataLayout> tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + std::stringstream os; + os << tensor; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_2d() +{ + Tensor<int, 2, DataLayout> tensor(5, 3); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 3; ++j) { + tensor(i, j) = i*j; + } + } + + std::stringstream os; + os << tensor; + + std::string expected("0 0 0\n0 1 2\n0 2 4\n0 3 6\n0 4 8"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_expr() +{ + Tensor<int, 1, DataLayout> tensor1(5); + Tensor<int, 1, DataLayout> tensor2(5); + for (int i = 0; i < 5; ++i) { + tensor1(i) = i; + tensor2(i) = 7; + } + + std::stringstream os; + os << tensor1 + tensor2; + + std::string expected(" 7\n 8\n 9\n10\n11"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_string() +{ + Tensor<std::string, 2, DataLayout> tensor(5, 3); + tensor.setConstant(std::string("foo")); + + std::cout << tensor << std::endl; + + std::stringstream os; + os << tensor; + + std::string expected("foo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_const() +{ + Tensor<int, 1, DataLayout> tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + TensorMap<Tensor<const int, 1, DataLayout> > tensor_map(tensor.data(), 5); + + std::stringstream os; + os << tensor_map; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +void test_cxx11_tensor_io() +{ + CALL_SUBTEST(test_output_1d<ColMajor>()); + CALL_SUBTEST(test_output_1d<RowMajor>()); + CALL_SUBTEST(test_output_2d<ColMajor>()); + CALL_SUBTEST(test_output_2d<RowMajor>()); + CALL_SUBTEST(test_output_expr<ColMajor>()); + CALL_SUBTEST(test_output_expr<RowMajor>()); + CALL_SUBTEST(test_output_string<ColMajor>()); + CALL_SUBTEST(test_output_string<RowMajor>()); + CALL_SUBTEST(test_output_const<ColMajor>()); + CALL_SUBTEST(test_output_const<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp new file mode 100644 index 000000000..ae297a9da --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap.cpp @@ -0,0 +1,61 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_swap() +{ + Tensor<float, 3, ColMajor> tensor(2,3,7); + tensor.setRandom(); + + Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +static void test_swap_as_lvalue() +{ + Tensor<float, 3, ColMajor> tensor(2,3,7); + tensor.setRandom(); + + Tensor<float, 3, RowMajor> tensor2(7,3,2); + tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +void test_cxx11_tensor_layout_swap() +{ + CALL_SUBTEST(test_simple_swap()); + CALL_SUBTEST(test_swap_as_lvalue()); +} diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp new file mode 100644 index 000000000..071f5b406 --- /dev/null +++ b/unsupported/test/cxx11_tensor_lvalue.cpp @@ -0,0 +1,42 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_compound_assignment() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> mat3(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + mat3 = mat1; + mat3 += mat2; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) + mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_lvalue() +{ + CALL_SUBTEST(test_compound_assignment()); +} diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp new file mode 100644 index 000000000..9cf2eb150 --- /dev/null +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor<int, 1> vec1(6); + Tensor<int, 1, RowMajor> vec2(6); + + TensorMap<Tensor<const int, 1>> vec3(vec1.data(), 6); + TensorMap<Tensor<const int, 1, RowMajor>> vec4(vec2.data(), 6); + + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + VERIFY_IS_EQUAL(vec1.rank(), 1); + VERIFY_IS_EQUAL(vec1.size(), 6); + VERIFY_IS_EQUAL(vec1.dimension(0), 6); + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); +} + +static void test_2d() +{ + Tensor<int, 2> mat1(2,3); + Tensor<int, 2, RowMajor> mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + TensorMap<Tensor<const int, 2>> mat3(mat1.data(), 2, 3); + TensorMap<Tensor<const int, 2, RowMajor>> mat4(mat2.data(), 2, 3); + + VERIFY_IS_EQUAL(mat3.rank(), 2); + VERIFY_IS_EQUAL(mat3.size(), 6); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + + VERIFY_IS_EQUAL(mat4.rank(), 2); + VERIFY_IS_EQUAL(mat4.size(), 6); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); +} + +static void test_3d() +{ + Tensor<int, 3> mat1(2,3,7); + Tensor<int, 3, RowMajor> mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + TensorMap<Tensor<const int, 3>> mat3(mat1.data(), 2, 3, 7); + TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{{2, 3, 7}}); + + VERIFY_IS_EQUAL(mat3.rank(), 3); + VERIFY_IS_EQUAL(mat3.size(), 2*3*7); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + VERIFY_IS_EQUAL(mat3.dimension(2), 7); + + VERIFY_IS_EQUAL(mat4.rank(), 3); + VERIFY_IS_EQUAL(mat4.size(), 2*3*7); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + VERIFY_IS_EQUAL(mat4.dimension(2), 7); + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } +} + + +void test_cxx11_tensor_map() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp new file mode 100644 index 000000000..7fd7a283a --- /dev/null +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -0,0 +1,342 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_reshape() +{ + Tensor<float, 5> tensor1(2,3,1,7,1); + tensor1.setRandom(); + + Tensor<float, 3> tensor2(2,3,7); + Tensor<float, 2> tensor3(6,7); + Tensor<float, 2> tensor4(2,21); + + Tensor<float, 3>::Dimensions dim1{{2,3,7}}; + tensor2 = tensor1.reshape(dim1); + Tensor<float, 2>::Dimensions dim2{{6,7}}; + tensor3 = tensor1.reshape(dim2); + Tensor<float, 2>::Dimensions dim3{{2,21}}; + tensor4 = tensor1.reshape(dim1).reshape(dim3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); + } + } + } +} + + +static void test_reshape_in_expr() { + MatrixXf m1(2,3*5*7*11); + MatrixXf m2(3*5*7*11,13); + m1.setRandom(); + m2.setRandom(); + MatrixXf m3 = m1 * m2; + + TensorMap<Tensor<float, 5>> tensor1(m1.data(), 2,3,5,7,11); + TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13); + Tensor<float, 2>::Dimensions newDims1{{2,3*5*7*11}}; + Tensor<float, 2>::Dimensions newDims2{{3*5*7*11,13}}; + typedef Tensor<float, 1>::DimensionPair DimPair; + array<DimPair, 1> contract_along{{DimPair(1, 0)}}; + Tensor<float, 2> tensor3(2,13); + tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); + + Map<MatrixXf> res(tensor3.data(), 2, 13); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 13; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } +} + + +static void test_reshape_as_lvalue() +{ + Tensor<float, 3> tensor(2,3,7); + tensor.setRandom(); + + Tensor<float, 2> tensor2d(6,7); + Tensor<float, 3>::Dimensions dim{{2,3,7}}; + tensor2d.reshape(dim) = tensor; + + float scratch[2*3*1*7*1]; + TensorMap<Tensor<float, 5>> tensor5d(scratch, 2,3,1,7,1); + tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); + VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); + } + } + } +} + +template<int DataLayout> +static void test_simple_slice() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor<float, 5, DataLayout> slice1(1,1,1,1,1); + Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); + slice1 = tensor.slice(indices, sizes); + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + + Tensor<float, 5, DataLayout> slice2(1,1,2,2,3); + Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); + slice2 = tensor.slice(indices2, sizes2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } +} + +// TODO(andydavis) Add RowMajor support when TensorContract supports RowMajor. 
+static void test_slice_in_expr() { + MatrixXf m1(7,7); + MatrixXf m2(3,3); + m1.setRandom(); + m2.setRandom(); + + MatrixXf m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1); + + TensorMap<Tensor<float, 2>> tensor1(m1.data(), 7, 7); + TensorMap<Tensor<float, 2>> tensor2(m2.data(), 3, 3); + Tensor<float, 2> tensor3(3,1); + typedef Tensor<float, 1>::DimensionPair DimPair; + array<DimPair, 1> contract_along{{DimPair(1, 0)}}; + + Eigen::DSizes<ptrdiff_t, 2> indices1(1,2); + Eigen::DSizes<ptrdiff_t, 2> sizes1(3,3); + Eigen::DSizes<ptrdiff_t, 2> indices2(0,2); + Eigen::DSizes<ptrdiff_t, 2> sizes2(3,1); + tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along); + + Map<MatrixXf> res(tensor3.data(), 3, 1); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 1; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } + + // Take an arbitrary slice of an arbitrarily sized tensor. + TensorMap<Tensor<const float, 2>> tensor4(m1.data(), 7, 7); + Tensor<float, 1> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35)); + for (int i = 0; i < 35; ++i) { + VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i])); + } +} + +template<int DataLayout> +static void test_slice_as_lvalue() +{ + Tensor<float, 3, DataLayout> tensor1(2,2,7); + tensor1.setRandom(); + Tensor<float, 3, DataLayout> tensor2(2,2,7); + tensor2.setRandom(); + Tensor<float, 3, DataLayout> tensor3(4,3,5); + tensor3.setRandom(); + Tensor<float, 3, DataLayout> tensor4(4,3,2); + tensor4.setRandom(); + Tensor<float, 3, DataLayout> tensor5(10,13,12); + tensor5.setRandom(); + + Tensor<float, 3, DataLayout> result(4,5,7); + Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7); + Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0); + result.slice(first_slice, sizes12) = tensor1; + Eigen::DSizes<ptrdiff_t, 3> second_slice(2,0,0); + result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2; + + Eigen::DSizes<ptrdiff_t, 3> sizes3(4,3,5); + Eigen::DSizes<ptrdiff_t, 3> third_slice(0,2,0); + result.slice(third_slice, sizes3) = tensor3; + + Eigen::DSizes<ptrdiff_t, 3> sizes4(4,3,2); + Eigen::DSizes<ptrdiff_t, 3> fourth_slice(0,2,5); + result.slice(fourth_slice, sizes4) = tensor4; + + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(result(i,j,k), tensor1(i,j,k)); + VERIFY_IS_EQUAL(result(i+2,j,k), tensor2(i,j,k)); + } + } + } + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor3(i,j-2,k)); + } + for (int k = 5; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor4(i,j-2,k-5)); + } + } + } + + Eigen::DSizes<ptrdiff_t, 3> sizes5(4,5,7); + Eigen::DSizes<ptrdiff_t, 3> fifth_slice(0,0,0); + result.slice(fifth_slice, sizes5) = tensor5.slice(fifth_slice, sizes5); + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor5(i,j,k)); + } + } + } +} + +template<int DataLayout> +static void test_slice_raw_data() +{ + Tensor<float, 4, DataLayout> tensor(3,5,7,11); + tensor.setRandom(); + + Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4); + Eigen::DSizes<ptrdiff_t, 4> extents(1,1,1,1); + typedef TensorEvaluator<decltype(tensor.slice(offsets, extents)), DefaultDevice> SliceEvaluator; + auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul); + 
VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4)); + + if (DataLayout == ColMajor) { + extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + } else { + extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,1,2); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5)); + } + + extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1); + auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0)); + + if (DataLayout == ColMajor) { + offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4); + extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + } + } + } else { + offsets = Eigen::DSizes<ptrdiff_t, 4>(1,2,3,0); + extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,2,11); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22ul); + for (int l = 0; l < 11; ++l) { + for (int k = 0; k < 2; ++k) { + VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l)); + } + } + } + + if (DataLayout == ColMajor) { + offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4); + extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 2; ++l) { + int slice_index = i + 3 * (j + 5 * (k + 7 * l)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + } + } + } + } + } else { + offsets = Eigen::DSizes<ptrdiff_t, 4>(1,0,0,0); + extents = Eigen::DSizes<ptrdiff_t, 4>(2,5,7,11); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770ul); + for (int l = 0; l < 11; ++l) { + for (int k = 0; k < 7; ++k) { + for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 2; ++i) { + int slice_index = l + 11 * (k + 7 * (j + 5 * i)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l)); + } + } + } + } + + } + + offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,0); + extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,11); + auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3ul*5*7*11); + VERIFY_IS_EQUAL(slice6.data(), tensor.data()); +} + + +static void test_composition() +{ + Eigen::Tensor<float, 2> matrix(7, 11); + matrix.setRandom(); + + const DSizes<ptrdiff_t, 3> newDims{{1, 1, 11}}; + Eigen::Tensor<float, 3> tensor = + matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims); + + VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11ul); + VERIFY_IS_EQUAL(tensor.dimension(0), 1); + VERIFY_IS_EQUAL(tensor.dimension(1), 1); + VERIFY_IS_EQUAL(tensor.dimension(2), 
11); + for (int i = 0; i < 11; ++i) { + VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i)); + } +} + + +void test_cxx11_tensor_morphing() +{ + CALL_SUBTEST(test_simple_reshape()); + CALL_SUBTEST(test_reshape_in_expr()); + CALL_SUBTEST(test_reshape_as_lvalue()); + + CALL_SUBTEST(test_simple_slice<ColMajor>()); + CALL_SUBTEST(test_simple_slice<RowMajor>()); + CALL_SUBTEST(test_slice_in_expr()); + CALL_SUBTEST(test_slice_as_lvalue<ColMajor>()); + CALL_SUBTEST(test_slice_as_lvalue<RowMajor>()); + CALL_SUBTEST(test_slice_raw_data<ColMajor>()); + CALL_SUBTEST(test_slice_raw_data<RowMajor>()); + + CALL_SUBTEST(test_composition()); +} diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp new file mode 100644 index 000000000..24b2bcb58 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::TensorMap; + + + +static void test_additions() +{ + Tensor<std::complex<float>, 1> data1(3); + Tensor<std::complex<float>, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = std::complex<float>(i, -i); + data2(i) = std::complex<float>(i, 7 * i); + } + + Tensor<std::complex<float>, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + VERIFY_IS_EQUAL(sum(i), std::complex<float>(2*i, 6*i)); + } +} + + +static void test_abs() +{ + Tensor<std::complex<float>, 1> data1(3); + Tensor<std::complex<double>, 1> data2(3); + data1.setRandom(); + data2.setRandom(); + + Tensor<float, 1> abs1 = data1.abs(); + Tensor<double, 1> abs2 = data2.abs(); + for (int i = 0; i < 3; ++i) { + VERIFY_IS_APPROX(abs1(i), std::abs(data1(i))); + VERIFY_IS_APPROX(abs2(i), std::abs(data2(i))); + } +} + + +static void test_contractions() +{ + Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31); + Tensor<std::complex<float>, 5> t_right(8, 31, 7, 20, 10); + Tensor<std::complex<float>, 5> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map<Matrix<std::complex<float>, Dynamic, Dynamic>> MapXcf; + MapXcf m_left(t_left.data(), 1500, 248); + MapXcf m_right(t_right.data(), 248, 1400); + Matrix<std::complex<float>, Dynamic, Dynamic> m_result(1500, 1400); + + // This contraction should be equivalent to a regular matrix multiplication + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_of_complex() +{ + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_abs()); + CALL_SUBTEST(test_contractions()); +} diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp new file mode 100644 index 000000000..f179a0c21 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_const_values.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_assign() +{ + float data1[6]; + TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3); + float data2[6]; + const TensorMap<Tensor<float, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + + Tensor<float, 2> rslt1; + rslt1 = mat1; + Tensor<float, 2> rslt2; + rslt2 = mat2; + + Tensor<float, 2> rslt3 = mat1; + Tensor<float, 2> rslt4 = mat2; + + Tensor<float, 2> rslt5(mat1); + Tensor<float, 2> rslt6(mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(rslt1(i,j), static_cast<float>(i + 2*j)); + VERIFY_IS_APPROX(rslt2(i,j), static_cast<float>(-i - 2*j)); + VERIFY_IS_APPROX(rslt3(i,j), static_cast<float>(i + 2*j)); + VERIFY_IS_APPROX(rslt4(i,j), static_cast<float>(-i - 2*j)); + VERIFY_IS_APPROX(rslt5(i,j), static_cast<float>(i + 2*j)); + VERIFY_IS_APPROX(rslt6(i,j), static_cast<float>(-i - 2*j)); + } + } +} + + +static void test_plus() +{ + float data1[6]; + TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3); + float data2[6]; + TensorMap<Tensor<float, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + + Tensor<float, 2> sum1; + sum1 = mat1 + mat2; + Tensor<float, 2> sum2; + sum2 = mat2 + mat1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(sum1(i,j), 0.0f); + VERIFY_IS_APPROX(sum2(i,j), 0.0f); + } + } +} + + +static void test_plus_equal() +{ + float data1[6]; + TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3); + float data2[6]; + TensorMap<Tensor<float, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + mat2 += mat1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(mat2(i,j), 0.0f); + } + } +} + + +void test_cxx11_tensor_of_const_values() +{ + CALL_SUBTEST(test_assign()); + CALL_SUBTEST(test_plus()); + CALL_SUBTEST(test_plus_equal()); +} diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp new file mode 100644 index 000000000..8d05d154e --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_strings.cpp @@ -0,0 +1,152 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
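+
+// Tensor is not limited to arithmetic scalars: the tests below
+// instantiate Tensor<std::string, ...>, for which operator+ performs
+// per-coefficient string concatenation. A minimal sketch of the
+// behavior test_additions verifies:
+//
+//   Tensor<std::string, 1> a(3), b(3);
+//   a.setConstant(std::string("abc"));
+//   b.setConstant(std::string("0"));
+//   Tensor<std::string, 1> c = a + b;  // every entry is "abc0"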
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::TensorMap; + +static void test_assign() +{ + std::string data1[6]; + TensorMap<Tensor<std::string, 2>> mat1(data1, 2, 3); + std::string data2[6]; + const TensorMap<Tensor<const std::string, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + std::ostringstream s1; + s1 << "abc" << i*3; + data1[i] = s1.str(); + std::ostringstream s2; + s2 << "def" << i*5; + data2[i] = s2.str(); + } + + Tensor<std::string, 2> rslt1; + rslt1 = mat1; + Tensor<std::string, 2> rslt2; + rslt2 = mat2; + + Tensor<std::string, 2> rslt3 = mat1; + Tensor<std::string, 2> rslt4 = mat2; + + Tensor<std::string, 2> rslt5(mat1); + Tensor<std::string, 2> rslt6(mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(rslt1(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt2(i,j), data2[i+2*j]); + VERIFY_IS_EQUAL(rslt3(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt4(i,j), data2[i+2*j]); + VERIFY_IS_EQUAL(rslt5(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt6(i,j), data2[i+2*j]); + } + } +} + + +static void test_concat() +{ + Tensor<std::string, 2> t1(2, 3); + Tensor<std::string, 2> t2(2, 3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + std::ostringstream s1; + s1 << "abc" << i + j*2; + t1(i, j) = s1.str(); + std::ostringstream s2; + s2 << "def" << i*5 + j*32; + t2(i, j) = s2.str(); + } + } + + Tensor<std::string, 2> result = t1.concatenate(t2, 1); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 6); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(result(i, j), t1(i, j)); + VERIFY_IS_EQUAL(result(i, j+3), t2(i, j)); + } + } +} + + +static void test_slices() +{ + Tensor<std::string, 2> data(2, 6); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + std::ostringstream s1; + s1 << "abc" << i + j*2; + data(i, j) = s1.str(); + } + } + + const Eigen::DSizes<ptrdiff_t, 2> half_size{{2, 3}}; + const Eigen::DSizes<ptrdiff_t, 2> first_half{{0, 0}}; + const Eigen::DSizes<ptrdiff_t, 2> second_half{{0, 3}}; + + Tensor<std::string, 2> t1 = data.slice(first_half, half_size); + Tensor<std::string, 2> t2 = data.slice(second_half, half_size); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(data(i, j), t1(i, j)); + VERIFY_IS_EQUAL(data(i, j+3), t2(i, j)); + } + } +} + + +static void test_additions() +{ + Tensor<std::string, 1> data1(3); + Tensor<std::string, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = "abc"; + std::ostringstream s1; + s1 << i; + data2(i) = s1.str(); + } + + Tensor<std::string, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + std::ostringstream concat; + concat << "abc" << i; + std::string expected = concat.str(); + VERIFY_IS_EQUAL(sum(i), expected); + } +} + + +static void test_initialization() +{ + Tensor<std::string, 2> a(2, 3); + a.setConstant(std::string("foo")); + for (int i = 0; i < 2*3; ++i) { + VERIFY_IS_EQUAL(a(i), std::string("foo")); + } +} + + +void test_cxx11_tensor_of_strings() +{ + // Beware: none of this is likely to ever work on a GPU. 
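+  // (std::string manages dynamically allocated host memory, which has
+  // no counterpart in GPU device code.)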
+ CALL_SUBTEST(test_assign()); + CALL_SUBTEST(test_concat()); + CALL_SUBTEST(test_slices()); + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_initialization()); +} diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp new file mode 100644 index 000000000..ffa19896e --- /dev/null +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -0,0 +1,93 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<int DataLayout> +static void test_simple_padding() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + Tensor<float, 4, DataLayout> padded; + padded = tensor.pad(paddings); + + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); + } + } + } + } + } +} + +template<int DataLayout> +static void test_padded_expr() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + Eigen::DSizes<ptrdiff_t, 2> reshape_dims; + reshape_dims[0] = 12; + reshape_dims[1] = 84; + + Tensor<float, 2, DataLayout> result; + result = tensor.pad(paddings).reshape(reshape_dims); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + const float result_value = DataLayout == ColMajor ? + result(i+2*j,k+12*l) : result(j+6*i,l+7*k); + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(result_value, 0.0f); + } + } + } + } + } +} + +void test_cxx11_tensor_padding() +{ + CALL_SUBTEST(test_simple_padding<ColMajor>()); + CALL_SUBTEST(test_simple_padding<RowMajor>()); + CALL_SUBTEST(test_padded_expr<ColMajor>()); + CALL_SUBTEST(test_padded_expr<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp new file mode 100644 index 000000000..0ee7b46d4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -0,0 +1,120 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
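+
+// extract_patches(patch_dims) gathers every patch of the requested
+// size that fits inside the tensor: the patch extents become the
+// leading dimensions of the result, and one trailing dimension indexes
+// the patch positions. Sketch matching the 2x2 case checked below:
+//
+//   Tensor<float, 4> t(2,3,5,7);
+//   array<ptrdiff_t, 4> dims{{1,2,2,1}};
+//   Tensor<float, 5> p = t.extract_patches(dims);
+//   // p.dimension(4) == 2*2*4*7, i.e. (2-1+1)*(3-2+1)*(5-2+1)*(7-1+1)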
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + Tensor<float, 5> no_patch; + no_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + Tensor<float, 5> single_patch; + single_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + Tensor<float, 5> twod_patch; + twod_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + Tensor<float, 5> threed_patch; + threed_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } + } + } + } + } + } + } +} + + +void test_cxx11_tensor_patch() +{ + CALL_SUBTEST(test_simple_patch()); + // CALL_SUBTEST(test_expr_shuffling()); +} diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp new file mode 100644 index 000000000..8276ae822 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
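+
+// setRandom() takes the generator as a template argument. Besides the
+// built-in uniform and normal generators, any copy-constructible
+// functor that maps a coefficient location to a value can be used;
+// MyGenerator below is a minimal instance of that contract (its
+// packetOp() only matters on the vectorized code path). Usage:
+//
+//   Tensor<int, 1> v(6);
+//   v.setRandom<MyGenerator>();  // v(i) == 3*i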
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +static void test_default() +{ + Tensor<float, 1> vec(6); + vec.setRandom(); + + // Fixme: we should check that the generated numbers follow a uniform + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + +static void test_normal() +{ + Tensor<float, 1> vec(6); + vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>(); + + // Fixme: we should check that the generated numbers follow a gaussian + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + + +struct MyGenerator { + MyGenerator() { } + MyGenerator(const MyGenerator&) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const { + return 3 * element_location; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits<int>::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + const int packetSize = internal::packet_traits<int>::size; + EIGEN_ALIGN_DEFAULT int values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = 3 * (packet_location + i); + } + return internal::pload<typename internal::packet_traits<int>::type>(values); + } +}; + + +static void test_custom() +{ + Tensor<int, 1> vec(6); + vec.setRandom<MyGenerator>(); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(vec(i), 3*i); + } +} + +void test_cxx11_tensor_random() +{ + CALL_SUBTEST(test_default()); + CALL_SUBTEST(test_normal()); + CALL_SUBTEST(test_custom()); +} diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp new file mode 100644 index 000000000..0269853a9 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -0,0 +1,420 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
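+
+// A reduction collapses the listed axes and keeps the remaining ones
+// (the axis-free overloads reduce everything, yielding a rank-1 tensor
+// with a single coefficient in this API). Sketch of the sum() case
+// verified first below:
+//
+//   Tensor<float, 4> t(2,3,5,7);
+//   array<ptrdiff_t, 2> axes;
+//   axes[0] = 1; axes[1] = 3;
+//   Tensor<float, 2> s = t.sum(axes);  // s is 2x5
+//   // s(i,j) accumulates t(i,k,j,l) over all k,l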
+ +#include "main.h" +#include <limits> +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <int DataLayout> +static void test_simple_reductions() { + Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7); + tensor.setRandom(); + array<ptrdiff_t, 2> reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), sum); + } + } + + { + Tensor<float, 1, DataLayout> sum1 = tensor.sum(); + VERIFY_IS_EQUAL(sum1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> sum2 = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(sum2.dimension(0), 1); + + VERIFY_IS_APPROX(sum1(0), sum2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float prod = 1.0f; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + prod *= tensor(k, i, l, j); + } + } + VERIFY_IS_APPROX(result(i, j), prod); + } + } + + { + Tensor<float, 1, DataLayout> prod1 = tensor.prod(); + VERIFY_IS_EQUAL(prod1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> prod2 = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(prod2.dimension(0), 1); + + VERIFY_IS_APPROX(prod1(0), prod2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float max_val = std::numeric_limits<float>::lowest(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + max_val = (std::max)(max_val, tensor(k, i, l, j)); + } + } + VERIFY_IS_APPROX(result(i, j), max_val); + } + } + + { + Tensor<float, 1, DataLayout> max1 = tensor.maximum(); + VERIFY_IS_EQUAL(max1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> max2 = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(max2.dimension(0), 1); + + VERIFY_IS_APPROX(max1(0), max2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float min_val = (std::numeric_limits<float>::max)(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + min_val = (std::min)(min_val, tensor(k, l, i, j)); + } + } + VERIFY_IS_APPROX(result(i, j), min_val); + } + } + + { + Tensor<float, 1, DataLayout> min1 = tensor.minimum(); + VERIFY_IS_EQUAL(min1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> min2 = 
tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(min2.dimension(0), 1); + + VERIFY_IS_APPROX(min1(0), min2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float sum = 0.0f; + int count = 0; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + sum += tensor(k, l, i, j); + ++count; + } + } + VERIFY_IS_APPROX(result(i, j), sum / count); + } + } + + { + Tensor<float, 1, DataLayout> mean1 = tensor.mean(); + VERIFY_IS_EQUAL(mean1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> mean2 = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(mean2.dimension(0), 1); + + VERIFY_IS_APPROX(mean1(0), mean2(0)); + } +} + +template <int DataLayout> +static void test_full_reductions() { + Tensor<float, 2, DataLayout> tensor(2, 3); + tensor.setRandom(); + array<ptrdiff_t, 2> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + + Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + float sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sum); + + result = tensor.square().sum(reduction_axis).sqrt(); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j) * tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sqrtf(sum)); +} + +struct UserReducer { + static const bool PacketAccess = false; + UserReducer(float offset) : offset_(offset) {} + void reduce(const float val, float* accum) { *accum += val * val; } + float initialize() const { return 0; } + float finalize(const float accum) const { return 1.0f / (accum + offset_); } + + private: + const float offset_; +}; + +template <int DataLayout> +static void test_user_defined_reductions() { + Tensor<float, 2, DataLayout> tensor(5, 7); + tensor.setRandom(); + array<ptrdiff_t, 1> reduction_axis; + reduction_axis[0] = 1; + + UserReducer reducer(10.0f); + Tensor<float, 1, DataLayout> result = tensor.reduce(reduction_axis, reducer); + VERIFY_IS_EQUAL(result.dimension(0), 5); + for (int i = 0; i < 5; ++i) { + float expected = 10.0f; + for (int j = 0; j < 7; ++j) { + expected += tensor(i, j) * tensor(i, j); + } + expected = 1.0f / expected; + VERIFY_IS_APPROX(result(i), expected); + } +} + +template <int DataLayout> +static void test_tensor_maps() { + int inputs[2 * 3 * 5 * 7]; + TensorMap<Tensor<int, 4, DataLayout> > tensor_map(inputs, 2, 3, 5, 7); + TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const(inputs, 2, 3, 5, + 7); + const TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const_const( + inputs, 2, 3, 5, 7); + + tensor_map.setRandom(); + array<ptrdiff_t, 2> reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor<int, 2, DataLayout> result = tensor_map.sum(reduction_axis); + Tensor<int, 2, DataLayout> result2 = tensor_map_const.sum(reduction_axis); + Tensor<int, 2, DataLayout> result3 = + tensor_map_const_const.sum(reduction_axis); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + int sum = 0; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor_map(i, k, j, l); + } + } + 
VERIFY_IS_EQUAL(result(i, j), sum);
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_static_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 97);
+  in.setRandom();
+
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+#else
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<3> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_last_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(97, 113);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+#else
+  // This triggers the use of packets for ColMajor.
+  Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 97; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 72; ++l) {
+          expected = (std::max)(expected, in(l, k, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_first_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 2;
+  reduction_axis[1] = 3;
+#else
+  // This triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 97; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, j, k, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_reduce_middle_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 113);
+  in.setRandom();
+
+// Reduce on the two middle dimensions.
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 2;
+#else
+  // This triggers the use of packets for RowMajor.
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2>> reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 113; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 97; ++l) { + expected = (std::max)(expected, in(i, k, l, j)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + +void test_cxx11_tensor_reduction() { + CALL_SUBTEST(test_simple_reductions<ColMajor>()); + CALL_SUBTEST(test_simple_reductions<RowMajor>()); + CALL_SUBTEST(test_full_reductions<ColMajor>()); + CALL_SUBTEST(test_full_reductions<RowMajor>()); + CALL_SUBTEST(test_user_defined_reductions<ColMajor>()); + CALL_SUBTEST(test_user_defined_reductions<RowMajor>()); + CALL_SUBTEST(test_tensor_maps<ColMajor>()); + CALL_SUBTEST(test_tensor_maps<RowMajor>()); + CALL_SUBTEST(test_static_dims<ColMajor>()); + CALL_SUBTEST(test_static_dims<RowMajor>()); + CALL_SUBTEST(test_innermost_last_dims<ColMajor>()); + CALL_SUBTEST(test_innermost_last_dims<RowMajor>()); + CALL_SUBTEST(test_innermost_first_dims<ColMajor>()); + CALL_SUBTEST(test_innermost_first_dims<RowMajor>()); + CALL_SUBTEST(test_reduce_middle_dims<ColMajor>()); + CALL_SUBTEST(test_reduce_middle_dims<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp new file mode 100644 index 000000000..aa369f278 --- /dev/null +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -0,0 +1,208 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
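+
+// TensorRef binds either to a concrete tensor, aliasing its storage,
+// or to an expression, exposing the expression's coefficients. Both
+// behaviors are exercised below; a minimal sketch:
+//
+//   Tensor<int, 1> a(6), b(6);
+//   TensorRef<Tensor<int, 1>> r1(a);      // r1.data() == a.data()
+//   TensorRef<Tensor<int, 1>> r2(a + b);  // r2(i) == a(i) + b(i)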
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_simple_lvalue_ref() +{ + Tensor<int, 1> input(6); + input.setRandom(); + + TensorRef<Tensor<int, 1>> ref3(input); + TensorRef<Tensor<int, 1>> ref4 = input; + + VERIFY_IS_EQUAL(ref3.data(), input.data()); + VERIFY_IS_EQUAL(ref4.data(), input.data()); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(ref3(i), input(i)); + VERIFY_IS_EQUAL(ref4(i), input(i)); + } + + for (int i = 0; i < 6; ++i) { + ref3.coeffRef(i) = i; + } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(input(i), i); + } + for (int i = 0; i < 6; ++i) { + ref4.coeffRef(i) = -i * 2; + } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(input(i), -i*2); + } +} + + +static void test_simple_rvalue_ref() +{ + Tensor<int, 1> input1(6); + input1.setRandom(); + Tensor<int, 1> input2(6); + input2.setRandom(); + + TensorRef<Tensor<int, 1>> ref3(input1 + input2); + TensorRef<Tensor<int, 1>> ref4 = input1 + input2; + + VERIFY_IS_NOT_EQUAL(ref3.data(), input1.data()); + VERIFY_IS_NOT_EQUAL(ref4.data(), input1.data()); + VERIFY_IS_NOT_EQUAL(ref3.data(), input2.data()); + VERIFY_IS_NOT_EQUAL(ref4.data(), input2.data()); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(ref3(i), input1(i) + input2(i)); + VERIFY_IS_EQUAL(ref4(i), input1(i) + input2(i)); + } +} + + +static void test_multiple_dims() +{ + Tensor<float, 3> input(3,5,7); + input.setRandom(); + + TensorRef<Tensor<float, 3>> ref(input); + VERIFY_IS_EQUAL(ref.data(), input.data()); + VERIFY_IS_EQUAL(ref.dimension(0), 3); + VERIFY_IS_EQUAL(ref.dimension(1), 5); + VERIFY_IS_EQUAL(ref.dimension(2), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(ref(i,j,k), input(i,j,k)); + } + } + } +} + + +static void test_slice() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + + Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); + TensorRef<Tensor<float, 5>> slice = tensor.slice(indices, sizes); + VERIFY_IS_EQUAL(slice(0,0,0,0,0), tensor(1,2,3,4,5)); + + Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); + slice = tensor.slice(indices2, sizes2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + + Eigen::DSizes<ptrdiff_t, 5> indices3(0,0,0,0,0); + Eigen::DSizes<ptrdiff_t, 5> sizes3(2,3,1,1,1); + slice = tensor.slice(indices3, sizes3); + VERIFY_IS_EQUAL(slice.data(), tensor.data()); +} + + +static void test_ref_of_ref() +{ + Tensor<float, 3> input(3,5,7); + input.setRandom(); + + TensorRef<Tensor<float, 3>> ref(input); + TensorRef<Tensor<float, 3>> ref_of_ref(ref); + TensorRef<Tensor<float, 3>> ref_of_ref2; + ref_of_ref2 = ref; + + VERIFY_IS_EQUAL(ref_of_ref.data(), input.data()); + VERIFY_IS_EQUAL(ref_of_ref.dimension(0), 3); + VERIFY_IS_EQUAL(ref_of_ref.dimension(1), 5); + VERIFY_IS_EQUAL(ref_of_ref.dimension(2), 7); + + VERIFY_IS_EQUAL(ref_of_ref2.data(), input.data()); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(0), 3); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(1), 5); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(2), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(ref_of_ref(i,j,k), input(i,j,k)); + VERIFY_IS_EQUAL(ref_of_ref2(i,j,k), input(i,j,k)); + } + } + } +} + + +static void test_ref_in_expr() +{ + 
Tensor<float, 3> input(3,5,7); + input.setRandom(); + TensorRef<Tensor<float, 3>> input_ref(input); + + Tensor<float, 3> result(3,5,7); + result.setRandom(); + TensorRef<Tensor<float, 3>> result_ref(result); + + Tensor<float, 3> bias(3,5,7); + bias.setRandom(); + + result_ref = input_ref + bias; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result_ref(i,j,k), input(i,j,k) + bias(i,j,k)); + VERIFY_IS_NOT_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k)); + } + } + } + + result = result_ref; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k)); + } + } + } +} + + +static void test_coeff_ref() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + Tensor<float, 5> original = tensor; + + TensorRef<Tensor<float, 4>> slice = tensor.chip(7, 4); + slice.coeffRef(0, 0, 0, 0) = 1.0f; + slice.coeffRef(1, 0, 0, 0) += 2.0f; + + VERIFY_IS_EQUAL(tensor(0,0,0,0,7), 1.0f); + VERIFY_IS_EQUAL(tensor(1,0,0,0,7), original(1,0,0,0,7) + 2.0f); +} + + +void test_cxx11_tensor_ref() +{ + CALL_SUBTEST(test_simple_lvalue_ref()); + CALL_SUBTEST(test_simple_rvalue_ref()); + CALL_SUBTEST(test_multiple_dims()); + CALL_SUBTEST(test_slice()); + CALL_SUBTEST(test_ref_of_ref()); + CALL_SUBTEST(test_ref_in_expr()); + CALL_SUBTEST(test_coeff_ref()); +} diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp new file mode 100644 index 000000000..4c0be35da --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com and +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
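+
+// reverse() takes one boolean per dimension and flips the indexing
+// along each dimension flagged true; the sizes are unchanged. Sketch
+// matching the first case below:
+//
+//   Tensor<float, 4> t(2,3,5,7);
+//   array<bool, 4> rev{{false, true, true, false}};
+//   Tensor<float, 4> r = t.reverse(rev);
+//   // t(i,j,k,l) == r(i, 2-j, 4-k, l)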
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::array; + +template <int DataLayout> +static void test_simple_reverse() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + Tensor<float, 4, DataLayout> reversed_tensor; + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + } + } + } + } +} + + +template <int DataLayout> +static void test_expr_reverse() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + + Tensor<float, 4, DataLayout> expected; + expected = tensor.reverse(dim_rev); + + Tensor<float, 4, DataLayout> result(2,3,5,7); + + array<ptrdiff_t, 4> src_slice_dim{{2,3,1,7}}; + array<ptrdiff_t, 4> src_slice_start{{0,0,0,0}}; + array<ptrdiff_t, 4> dst_slice_dim{{2,3,1,7}}; + array<ptrdiff_t, 4> dst_slice_start{{0,0,0,0}}; + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 3); + VERIFY_IS_EQUAL(result.dimension(2), 5); + VERIFY_IS_EQUAL(result.dimension(3), 7); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[2] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < 
expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_reverse() +{ + CALL_SUBTEST(test_simple_reverse<ColMajor>()); + CALL_SUBTEST(test_simple_reverse<RowMajor>()); + CALL_SUBTEST(test_expr_reverse<ColMajor>()); + CALL_SUBTEST(test_expr_reverse<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp new file mode 100644 index 000000000..2f7fd9e50 --- /dev/null +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -0,0 +1,187 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::array; + +template <int DataLayout> +static void test_simple_shuffling() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor<float, 4, DataLayout> no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 5); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor<float, 4, DataLayout> shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + +template <int DataLayout> +static void test_expr_shuffling() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<ptrdiff_t, 4> shuffles; + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor<float, 4, DataLayout> expected; + expected = tensor.shuffle(shuffles); + + Tensor<float, 4, DataLayout> result(5,7,3,2); + + array<int, 4> src_slice_dim{{2,3,1,7}}; + array<int, 4> src_slice_start{{0,0,0,0}}; + array<int, 4> dst_slice_dim{{1,7,3,2}}; + array<int, 4> dst_slice_start{{0,0,0,0}}; + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).shuffle(shuffles); + src_slice_start[2] += 1; + dst_slice_start[0] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 3); + VERIFY_IS_EQUAL(result.dimension(3), 2); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) 
{ + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[0] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.shuffle(shuffles).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[0] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +template <int DataLayout> +static void test_shuffling_as_value() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> shuffles; + shuffles[2] = 0; + shuffles[3] = 1; + shuffles[1] = 2; + shuffles[0] = 3; + Tensor<float, 4, DataLayout> shuffle(5,7,3,2); + shuffle.shuffle(shuffles) = tensor; + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } + + array<ptrdiff_t, 4> no_shuffle; + no_shuffle[0] = 0; + no_shuffle[1] = 1; + no_shuffle[2] = 2; + no_shuffle[3] = 3; + Tensor<float, 4, DataLayout> shuffle2(5,7,3,2); + shuffle2.shuffle(shuffles) = tensor.shuffle(no_shuffle); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 2; ++l) { + VERIFY_IS_EQUAL(shuffle2(i,j,k,l), shuffle(i,j,k,l)); + } + } + } + } +} + +void test_cxx11_tensor_shuffling() +{ + CALL_SUBTEST(test_simple_shuffling<ColMajor>()); + CALL_SUBTEST(test_simple_shuffling<RowMajor>()); + CALL_SUBTEST(test_expr_shuffling<ColMajor>()); + CALL_SUBTEST(test_expr_shuffling<RowMajor>()); + CALL_SUBTEST(test_shuffling_as_value<ColMajor>()); + CALL_SUBTEST(test_shuffling_as_value<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index ea512c9cc..23855fca0 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -32,6 +32,7 @@ static void test_1d() vec1(5) = 42; vec2(5) = 5; vec3(5) = 0; vec4.setZero(); + VERIFY_IS_EQUAL((vec1.rank()), 1); VERIFY_IS_EQUAL((vec1.size()), 6); VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); @@ -99,10 +100,12 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; + VERIFY_IS_EQUAL((mat1.rank()), 2); VERIFY_IS_EQUAL((mat1.size()), 6); VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3); + VERIFY_IS_EQUAL((mat2.rank()), 2); VERIFY_IS_EQUAL((mat2.size()), 6); VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3); @@ -163,7 +166,7 @@ static void test_3d() VERIFY_IS_EQUAL((epsilon(0,2,1)), -1); VERIFY_IS_EQUAL((epsilon(1,0,2)), -1); - std::array<Eigen::DenseIndex, 3> dims{{2,3,4}}; + array<Eigen::DenseIndex, 3> dims{{2,3,4}}; Tensor<int, 3> t1(dims); Tensor<int, 3, RowMajor> t2(dims); @@ -244,7 +247,7 @@ static void test_simple_assign() epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1; epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1; - Tensor<int, 3> e2(2,3,1); + Tensor<int, 3> e2(3,3,3); e2.setZero(); VERIFY_IS_EQUAL((e2(1,2,0)), 0); @@ -257,12 +260,38 @@ static void test_simple_assign() VERIFY_IS_EQUAL((e2(1,0,2)), -1); } +static 
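+// The resize() checks below pin down the expected allocation behaviour: +// resizing to a shape with the same total element count (2*3*7 == 3*2*7 == 42) +// keeps the existing buffer, while growing to 3*5*7 forces a reallocation, +// which is why the data() pointer is compared before and after.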
void test_resize() +{ + Tensor<int, 3> epsilon; + epsilon.resize(2,3,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 2); + VERIFY_IS_EQUAL(epsilon.dimension(1), 3); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + + const int* old_data = epsilon.data(); + epsilon.resize(3,2,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 2); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + VERIFY_IS_EQUAL(epsilon.data(), old_data); + + epsilon.resize(3,5,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 5); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 3ul*5*7); + VERIFY_IS_NOT_EQUAL(epsilon.data(), old_data); +} + void test_cxx11_tensor_simple() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_simple_assign()); + CALL_SUBTEST(test_resize()); } /* diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp new file mode 100644 index 000000000..935b908cc --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -0,0 +1,119 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<int DataLayout> +static void test_simple_striding() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + + Tensor<float, 4, DataLayout> no_stride; + no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + Tensor<float, 4, DataLayout> stride; + stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < 1; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } +} + + +template<int DataLayout> +static void test_striding_as_lvalue() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + + Tensor<float, 4, DataLayout> result(3, 12, 10, 21); + result.stride(strides) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l)); + } + } + } + } + + array<ptrdiff_t, 4> no_strides; + no_strides[0] = 1; + 
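+ // the all-unit strides set up below make tensor.stride(no_strides) an + // identity view, so the following assignment exercises a strided lvalue + // fed from a strided rvalue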
no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; + Tensor<float, 4, DataLayout> result2(3, 12, 10, 21); + result2.stride(strides) = tensor.stride(no_strides); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result2(2*i,4*j,2*k,3*l)); + } + } + } + } +} + + +void test_cxx11_tensor_striding() +{ + CALL_SUBTEST(test_simple_striding<ColMajor>()); + CALL_SUBTEST(test_simple_striding<RowMajor>()); + CALL_SUBTEST(test_striding_as_lvalue<ColMajor>()); + CALL_SUBTEST(test_striding_as_lvalue<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp new file mode 100644 index 000000000..6fe65c7f9 --- /dev/null +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -0,0 +1,270 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + + +#include "main.h" +#include <iostream> +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + + +static void test_multithread_elementwise() +{ + Tensor<float, 3> in1(2,3,7); + Tensor<float, 3> in2(2,3,7); + Tensor<float, 3> out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11)); + out.device(thread_pool_device) = in1 + in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + + +static void test_multithread_compound_assignment() +{ + Tensor<float, 3> in1(2,3,7); + Tensor<float, 3> in2(2,3,7); + Tensor<float, 3> out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11)); + out.device(thread_pool_device) = in1; + out.device(thread_pool_device) += in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + +template<int DataLayout> +static void test_multithread_contraction() +{ + Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31); + Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 1147); + MapXf m_right(t_right.data(), 1147, 1400); + Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + Eigen::ThreadPoolDevice thread_pool_device(4); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " 
vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + +template<int DataLayout> +static void test_contraction_corner_cases() +{ + Tensor<float, 2, DataLayout> t_left(32, 500); + Tensor<float, 2, DataLayout> t_right(32, 28*28); + Tensor<float, 2, DataLayout> t_result(500, 28*28); + + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result = t_result.constant(NAN); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}}; + + typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 32, 500); + MapXf m_right(t_right.data(), 32, 28*28); + Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28); + + Eigen::ThreadPoolDevice thread_pool_device(12); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left.transpose() * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_result.resize (1, 28*28); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 500); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (500, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 500); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (1, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + +template<int DataLayout> +static void 
test_multithread_contraction_agrees_with_singlethread() { + int contract_size = internal::random<int>(1, 5000); + + Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80), + contract_size, + internal::random<int>(1, 100)); + + Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25), + internal::random<int>(1, 37), + contract_size, + internal::random<int>(1, 51)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for better precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}}); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(2, 11)); + + Tensor<float, 5, DataLayout> st_result; + st_result = left.contract(right, dims); + + Tensor<float, 5, DataLayout> tp_result(st_result.dimensions()); + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // only run the relative-precision check when the absolute difference is + // non-negligible: for nearly identical small values VERIFY_IS_APPROX can + // fail spuriously due to numerical precision issues + if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + + +static void test_memcpy() { + + for (int i = 0; i < 5; ++i) { + const int num_threads = internal::random<int>(3, 11); + Eigen::ThreadPoolDevice thread_pool_device(num_threads); + + const int size = internal::random<int>(13, 7632); + Tensor<float, 1> t1(size); + t1.setRandom(); + std::vector<float> result(size); + thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float)); + for (int j = 0; j < size; j++) { + VERIFY_IS_EQUAL(t1(j), result[j]); + } + } +} + + +static void test_multithread_random() +{ + Eigen::ThreadPoolDevice device(2); + Tensor<float, 1> t(1 << 20); + t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>(); +} + + +void test_cxx11_tensor_thread_pool() +{ + CALL_SUBTEST(test_multithread_elementwise()); + CALL_SUBTEST(test_multithread_compound_assignment()); + + CALL_SUBTEST(test_multithread_contraction<ColMajor>()); + CALL_SUBTEST(test_multithread_contraction<RowMajor>()); + + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); + + // Exercise various cases that have been problematic in the past.
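+ // (degenerate outputs: the corner-case test resizes results down to + // single-row and single-column shapes such as (1, 28*28) and (1, 4))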
+ CALL_SUBTEST(test_contraction_corner_cases<ColMajor>()); + CALL_SUBTEST(test_contraction_corner_cases<RowMajor>()); + + CALL_SUBTEST(test_memcpy()); + + CALL_SUBTEST(test_multithread_random()); +} diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index 81b762c37..8b300b78a 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -21,6 +21,7 @@ template<typename T> void test_minres_T() // Diagonal preconditioner MINRES<SparseMatrix<T>, Lower, DiagonalPreconditioner<T> > minres_colmajor_lower_diag; MINRES<SparseMatrix<T>, Upper, DiagonalPreconditioner<T> > minres_colmajor_upper_diag; + MINRES<SparseMatrix<T>, Lower|Upper, DiagonalPreconditioner<T> > minres_colmajor_uplo_diag; // call tests for SPD matrix CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) ); @@ -28,6 +29,7 @@ template<typename T> void test_minres_T() CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag) ); // TO DO: symmetric semi-definite matrix // TO DO: symmetric indefinite matrix
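For reference, a minimal sketch of how the newly tested Lower|Upper instantiation is driven outside the test harness; the solver calls (compute(), solve(), iterations(), error()) are the standard Eigen iterative-solver interface, while the tridiagonal SPD matrix assembled here is purely illustrative:

#include <iostream>
#include <vector>
#include <Eigen/Sparse>
#include <unsupported/Eigen/IterativeSolvers> // provides MINRES

int main()
{
  using namespace Eigen;
  const int n = 100;

  // assemble an illustrative SPD tridiagonal matrix
  std::vector<Triplet<double> > entries;
  for (int i = 0; i < n; ++i) {
    entries.push_back(Triplet<double>(i, i, 2.0));
    if (i + 1 < n) {
      entries.push_back(Triplet<double>(i, i + 1, -1.0));
      entries.push_back(Triplet<double>(i + 1, i, -1.0));
    }
  }
  SparseMatrix<double> A(n, n);
  A.setFromTriplets(entries.begin(), entries.end());
  VectorXd b = VectorXd::Random(n);

  // Lower|Upper tells MINRES to use both triangular parts of A directly,
  // matching the minres_colmajor_uplo_diag configuration added above
  MINRES<SparseMatrix<double>, Lower|Upper, DiagonalPreconditioner<double> > solver;
  solver.compute(A);
  VectorXd x = solver.solve(b);

  std::cout << "#iterations: " << solver.iterations()
            << ", estimated error: " << solver.error() << std::endl;
  return 0;
}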