author | Konstantinos Margaritis <markos@freevec.org> | 2017-10-12 22:23:13 +0300
committer | Konstantinos Margaritis <markos@freevec.org> | 2017-10-12 22:23:13 +0300
commit | df7644aec36bbd2305c55e95f0b6cd1e84a27839 (patch)
tree | b2313480082bf12cbb30c84ee2cc112c380588d2
parent | 98e52cc770ac26fbd29aaa7583443009d7937084 (diff)
parent | 0e85a677e36956f13a3d85047d088d773b39b69b (diff)
Merged eigen/eigen into default
36 files changed, 509 insertions, 266 deletions
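The first hunk below (Eigen/Core) and the matching one in test/main.h deal with nvcc 9 dropping the composite `__CUDACC_VER__` macro. A minimal sketch of the replacement logic, assuming the same encoding the patch uses (the trailing zero default is taken from the test/main.h hunk near the end of this diff):

```cpp
// nvcc 9 dropped the composite __CUDACC_VER__, so the patch rebuilds the same
// major*10000 + minor*100 encoding from the per-component macros.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#elif defined(__CUDACC_VER__)
#define EIGEN_CUDACC_VER __CUDACC_VER__
#else
#define EIGEN_CUDACC_VER 0
#endif
// e.g. nvcc 9.2 reports __CUDACC_VER_MAJOR__ == 9 and __CUDACC_VER_MINOR__ == 2,
// so EIGEN_CUDACC_VER == 90200 and tests like `EIGEN_CUDACC_VER >= 90000`
// select the CUDA 9 code paths guarded throughout this merge.
```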
diff --git a/Eigen/Core b/Eigen/Core index f6fe4b0ec..c66359b79 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -22,6 +22,7 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif +// Starting with CUDA 9 the composite __CUDACC_VER__ is not available. #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) #elif defined(__CUDACC_VER__) diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc index c6571f129..4f07df02a 100644 --- a/Eigen/QtAlignedMalloc +++ b/Eigen/QtAlignedMalloc @@ -27,7 +27,7 @@ void qFree(void *ptr) void *qRealloc(void *ptr, std::size_t size) { void* newPtr = Eigen::internal::aligned_malloc(size); - memcpy(newPtr, ptr, size); + std::memcpy(newPtr, ptr, size); Eigen::internal::aligned_free(ptr); return newPtr; } diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 483277fe6..694f7cbde 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -396,6 +396,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false> */ template<typename Derived> template<typename OtherDerived> +EIGEN_DEVICE_FUNC inline const Product<Derived, OtherDerived> MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const { diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 7ca6a9280..c437f1a92 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -114,7 +114,7 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? internal::traits<Map>::OuterStrideAtCompileTime + : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) : (this->rows() * innerStride()); diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 41fae5096..e94c8ee96 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -99,7 +99,7 @@ class NoAlias * \sa class NoAlias */ template<typename Derived> -NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias() +NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias() { return NoAlias<Derived, Eigen::MatrixBase >(derived()); } diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 8cedd65ad..ee24e615a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -50,38 +50,45 @@ struct half; namespace half_impl { #if !defined(EIGEN_HAS_CUDA_FP16) - -// Make our own __half definition that is similar to CUDA's. -struct __half { - EIGEN_DEVICE_FUNC __half() : x(0) {} - explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} +// Make our own __half_raw definition that is similar to CUDA's. 
+struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; - +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; #endif -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); -struct half_base : public __half { +struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif }; } // namespace half_impl // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; + #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) + typedef half_impl::__half_raw __half_raw; #endif EIGEN_DEVICE_FUNC half() {} - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { // these in hardware. If we need more performance on older/other CPUs, they are // also possible to vectorize directly. 
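As the comment above notes, the conversion helpers reworked in the next hunks emulate float/half conversion in software on targets without hardware support. For reference, a self-contained sketch of the decode direction (illustrative only, not Eigen's exact implementation):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Decode a raw binary16 bit pattern to float in software.
float half_bits_to_float(uint16_t h) {
  uint32_t sign = (uint32_t)(h & 0x8000) << 16;   // sign moves to bit 31
  uint32_t exp  = (h >> 10) & 0x1F;               // 5 exponent bits
  uint32_t man  = h & 0x3FF;                      // 10 significand bits
  uint32_t bits;
  if (exp == 0x1F) {                              // infinity or NaN
    bits = sign | 0x7F800000u | (man << 13);
  } else if (exp != 0) {                          // normal number
    bits = sign | ((exp - 15 + 127) << 23) | (man << 13);
  } else if (man == 0) {                          // signed zero
    bits = sign;
  } else {                                        // subnormal: renormalize
    uint32_t e = 127 - 15 + 1;
    while ((man & 0x400) == 0) { man <<= 1; --e; }
    bits = sign | (e << 23) | ((man & 0x3FF) << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof f);               // bit copy, no aliasing UB
  return f;
}

int main() {
  std::printf("%g %g %g %g\n",
              half_bits_to_float(0x3C00),   // 1
              half_bits_to_float(0xC000),   // -2
              half_bits_to_float(0x7BFF),   // 65504, largest finite half
              half_bits_to_float(0x0001));  // 5.96046e-08, smallest subnormal
}
```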
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; h.x = x; return h; } @@ -280,12 +287,13 @@ union FP32 { float f; }; -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - return __float2half(ff); + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; #elif defined(EIGEN_HAS_FP16_C) - __half h; + __half_raw h; h.x = _cvtss_sh(ff, 0); return h; @@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { const FP32 f16max = { (127 + 16) << 23 }; const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o; + __half_raw o; o.x = static_cast<unsigned short>(0x0u); unsigned int sign = f.u & sign_mask; @@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); @@ -512,8 +520,8 @@ struct numeric_limits<Eigen::half> { static const bool is_bounded = false; static const bool is_modulo = false; static const int digits = 11; - static const int digits10 = 2; - //static const int max_digits10 = ; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html static const int radix = 2; static const int min_exponent = -13; static const int min_exponent10 = -4; @@ -612,11 +620,15 @@ struct hash<Eigen::half> { // Add the missing shfl_xor intrinsic #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + #if EIGEN_CUDACC_VER < 90000 return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); + #else + return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width)); + #endif } #endif -// ldg() has an overload for __half, but we also need one for Eigen::half. +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. 
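The numeric_limits<Eigen::half> hunk above bumps digits10 from 2 to 3 and fills in max_digits10 = 5. Both values follow from the 11-bit binary16 significand via the standard formulas, as this worked check shows:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const int digits = 11;  // binary16: 10 stored significand bits + implicit leading 1
  // Decimal digits guaranteed to survive a decimal -> half -> decimal round trip:
  const int digits10 = (int)std::floor((digits - 1) * std::log10(2.0));   // = 3
  // Decimal digits needed so that a half -> decimal -> half round trip is lossless:
  const int max_digits10 = (int)std::ceil(digits * std::log10(2.0)) + 1;  // = 5
  std::printf("digits10 = %d, max_digits10 = %d\n", digits10, max_digits10);
}
```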
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ba6a7f920..ce48e4b31 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { half2 result; - result.x = a.x & 0x7FFF7FFF; + unsigned temp = *(reinterpret_cast<const unsigned*>(&(a))); + *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF; return result; } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index b63ea2697..f5b071a9c 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -410,6 +410,16 @@ #endif #endif +// Does the compiler support type_traits? +#ifndef EIGEN_HAS_TYPE_TRAITS +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) +#define EIGEN_HAS_TYPE_TRAITS 1 +#define EIGEN_INCLUDE_TYPE_TRAITS +#else +#define EIGEN_HAS_TYPE_TRAITS 0 +#endif +#endif + // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 7d9053496..c455f92a1 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -493,7 +493,7 @@ template<typename T> struct smart_copy_helper<T,true> { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - memcpy(target, start, size); + std::memcpy(target, start, size); } }; @@ -696,7 +696,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b) /** \class aligned_allocator * \ingroup Core_Module * -* \brief STL compatible allocator to use with with 16 byte aligned types +* \brief STL compatible allocator to use with types requiring a non-standard alignment. +* +* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd. +* By default, it will thus provide at least 16 bytes alignment and more in the following cases: +* - 32 bytes alignment if AVX is enabled. +* - 64 bytes alignment if AVX512 is enabled. +* +* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 4b337f29f..10328be0d 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -34,6 +34,18 @@ inline IndexDest convert_index(const IndexSrc& idx) { return IndexDest(idx); } +// true if T can be considered as an integral index (i.e., an integral type or an enum) +template<typename T> struct is_valid_index_type +{ + enum { value = +#if EIGEN_HAS_TYPE_TRAITS + internal::is_integral<T>::value || std::is_enum<T>::value +#else + // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
+ internal::is_convertible<T,Index>::value +#endif + }; +}; // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 75595517a..af1228cb8 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -309,61 +309,119 @@ inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiR } namespace internal { -template<typename VectorX, typename VectorY, typename OtherScalar> -void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j) + +template<typename Scalar, typename OtherScalar, + int SizeAtCompileTime, int MinAlignment, bool Vectorizable> +struct apply_rotation_in_the_plane_selector { - typedef typename VectorX::Scalar Scalar; - enum { - PacketSize = packet_traits<Scalar>::size, - OtherPacketSize = packet_traits<OtherScalar>::size - }; - typedef typename packet_traits<Scalar>::type Packet; - typedef typename packet_traits<OtherScalar>::type OtherPacket; - eigen_assert(xpr_x.size() == xpr_y.size()); - Index size = xpr_x.size(); - Index incrx = xpr_x.derived().innerStride(); - Index incry = xpr_y.derived().innerStride(); + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + for(Index i=0; i<size; ++i) + { + Scalar xi = *x; + Scalar yi = *y; + *x = c * xi + numext::conj(s) * yi; + *y = -s * xi + numext::conj(c) * yi; + x += incrx; + y += incry; + } + } +}; - Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0); - Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0); - - OtherScalar c = j.c(); - OtherScalar s = j.s(); - if (c==OtherScalar(1) && s==OtherScalar(0)) - return; +template<typename Scalar, typename OtherScalar, + int SizeAtCompileTime, int MinAlignment> +struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,true /* vectorizable */> +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + enum { + PacketSize = packet_traits<Scalar>::size, + OtherPacketSize = packet_traits<OtherScalar>::size + }; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<OtherScalar>::type OtherPacket; + + /*** dynamic-size vectorized paths ***/ + if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1)) + { + // both vectors are sequentially stored in memory => vectorization + enum { Peeling = 2 }; - /*** dynamic-size vectorized paths ***/ + Index alignedStart = internal::first_default_aligned(y, size); + Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - if(VectorX::SizeAtCompileTime == Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - (PacketSize == OtherPacketSize) && - ((incrx==1 && incry==1) || PacketSize == 1)) - { - // both vectors are sequentially stored in memory => vectorization - enum { Peeling = 2 }; + const OtherPacket pc = pset1<OtherPacket>(c); + const OtherPacket ps = pset1<OtherPacket>(s); + conj_helper<OtherPacket,Packet,NumTraits<OtherScalar>::IsComplex,false> pcj; + conj_helper<OtherPacket,Packet,false,false> pm; - Index alignedStart = internal::first_default_aligned(y, size); - Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; + for(Index i=0; i<alignedStart; ++i) + { + Scalar xi = x[i]; + Scalar yi = y[i]; + x[i] = c * xi + 
numext::conj(s) * yi; + y[i] = -s * xi + numext::conj(c) * yi; + } - const OtherPacket pc = pset1<OtherPacket>(c); - const OtherPacket ps = pset1<OtherPacket>(s); - conj_helper<OtherPacket,Packet,NumTraits<OtherScalar>::IsComplex,false> pcj; - conj_helper<OtherPacket,Packet,false,false> pm; + Scalar* EIGEN_RESTRICT px = x + alignedStart; + Scalar* EIGEN_RESTRICT py = y + alignedStart; - for(Index i=0; i<alignedStart; ++i) - { - Scalar xi = x[i]; - Scalar yi = y[i]; - x[i] = c * xi + numext::conj(s) * yi; - y[i] = -s * xi + numext::conj(c) * yi; - } + if(internal::first_default_aligned(x, size)==alignedStart) + { + for(Index i=alignedStart; i<alignedEnd; i+=PacketSize) + { + Packet xi = pload<Packet>(px); + Packet yi = pload<Packet>(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + else + { + Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); + for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize) + { + Packet xi = ploadu<Packet>(px); + Packet xi1 = ploadu<Packet>(px+PacketSize); + Packet yi = pload <Packet>(py); + Packet yi1 = pload <Packet>(py+PacketSize); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); + px += Peeling*PacketSize; + py += Peeling*PacketSize; + } + if(alignedEnd!=peelingEnd) + { + Packet xi = ploadu<Packet>(x+peelingEnd); + Packet yi = pload <Packet>(y+peelingEnd); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + } + } - Scalar* EIGEN_RESTRICT px = x + alignedStart; - Scalar* EIGEN_RESTRICT py = y + alignedStart; + for(Index i=alignedEnd; i<size; ++i) + { + Scalar xi = x[i]; + Scalar yi = y[i]; + x[i] = c * xi + numext::conj(s) * yi; + y[i] = -s * xi + numext::conj(c) * yi; + } + } - if(internal::first_default_aligned(x, size)==alignedStart) + /*** fixed-size vectorized path ***/ + else if(SizeAtCompileTime != Dynamic && MinAlignment>0) // FIXME should be compared to the required alignment { - for(Index i=alignedStart; i<alignedEnd; i+=PacketSize) + const OtherPacket pc = pset1<OtherPacket>(c); + const OtherPacket ps = pset1<OtherPacket>(s); + conj_helper<OtherPacket,Packet,NumTraits<OtherPacket>::IsComplex,false> pcj; + conj_helper<OtherPacket,Packet,false,false> pm; + Scalar* EIGEN_RESTRICT px = x; + Scalar* EIGEN_RESTRICT py = y; + for(Index i=0; i<size; i+=PacketSize) { Packet xi = pload<Packet>(px); Packet yi = pload<Packet>(py); @@ -373,76 +431,40 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x py += PacketSize; } } - else - { - Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); - for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize) - { - Packet xi = ploadu<Packet>(px); - Packet xi1 = ploadu<Packet>(px+PacketSize); - Packet yi = pload <Packet>(py); - Packet yi1 = pload <Packet>(py+PacketSize); - pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); - px += Peeling*PacketSize; - py += Peeling*PacketSize; - } - if(alignedEnd!=peelingEnd) - { - Packet xi = 
ploadu<Packet>(x+peelingEnd); - Packet yi = pload <Packet>(y+peelingEnd); - pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - } - } - for(Index i=alignedEnd; i<size; ++i) + /*** non-vectorized path ***/ + else { - Scalar xi = x[i]; - Scalar yi = y[i]; - x[i] = c * xi + numext::conj(s) * yi; - y[i] = -s * xi + numext::conj(c) * yi; + apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,false>::run(x,incrx,y,incry,size,c,s); } } +}; - /*** fixed-size vectorized path ***/ - else if(VectorX::SizeAtCompileTime != Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - (PacketSize == OtherPacketSize) && - (EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment)>0)) // FIXME should be compared to the required alignment - { - const OtherPacket pc = pset1<OtherPacket>(c); - const OtherPacket ps = pset1<OtherPacket>(s); - conj_helper<OtherPacket,Packet,NumTraits<OtherPacket>::IsComplex,false> pcj; - conj_helper<OtherPacket,Packet,false,false> pm; - Scalar* EIGEN_RESTRICT px = x; - Scalar* EIGEN_RESTRICT py = y; - for(Index i=0; i<size; i+=PacketSize) - { - Packet xi = pload<Packet>(px); - Packet yi = pload<Packet>(py); - pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } +template<typename VectorX, typename VectorY, typename OtherScalar> +void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j) +{ + typedef typename VectorX::Scalar Scalar; + const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit) + && (int(packet_traits<Scalar>::size) == int(packet_traits<OtherScalar>::size)); - /*** non-vectorized path ***/ - else - { - for(Index i=0; i<size; ++i) - { - Scalar xi = *x; - Scalar yi = *y; - *x = c * xi + numext::conj(s) * yi; - *y = -s * xi + numext::conj(c) * yi; - x += incrx; - y += incry; - } - } + eigen_assert(xpr_x.size() == xpr_y.size()); + Index size = xpr_x.size(); + Index incrx = xpr_x.derived().innerStride(); + Index incry = xpr_y.derived().innerStride(); + + Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0); + Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0); + + OtherScalar c = j.c(); + OtherScalar s = j.s(); + if (c==OtherScalar(1) && s==OtherScalar(0)) + return; + + apply_rotation_in_the_plane_selector< + Scalar,OtherScalar, + VectorX::SizeAtCompileTime, + EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment), + Vectorizable>::run(x,incrx,y,incry,size,c,s); } } // end namespace internal diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 8a5cc91f2..e0295f2af 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -94,7 +94,7 @@ class AmbiVector Index allocSize = m_allocatedElements * sizeof(ListEl); allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; - memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); + std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; m_buffer = newBuffer; } diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 492eb0a29..9db119b67 100644 --- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ 
b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -17,7 +17,9 @@ namespace internal { template<typename Lhs, typename Rhs, typename ResultType> static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false) { - typedef typename remove_all<Lhs>::type::Scalar Scalar; + typedef typename remove_all<Lhs>::type::Scalar LhsScalar; + typedef typename remove_all<Rhs>::type::Scalar RhsScalar; + typedef typename remove_all<ResultType>::type::Scalar ResScalar; // make sure to call innerSize/outerSize since we fake the storage order. Index rows = lhs.innerSize(); @@ -25,7 +27,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0); + ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0); ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0); std::memset(mask,0,sizeof(bool)*rows); @@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r Index nnz = 0; for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); if(!mask[i]) { mask[i] = true; @@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C { static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix; - RowMajorMatrix rhsRow = rhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow); - res = resRow; + typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRhs; + typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes; + RowMajorRhs rhsRow = rhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl<RowMajorRhs,Lhs,RowMajorRes>(rhsRow, lhs, resRow); + res = resRow; } }; @@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R { static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix; - RowMajorMatrix lhsRow = lhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow); + typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorLhs; + typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes; + RowMajorLhs lhsRow = lhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorLhs,RowMajorRes>(rhs, lhsRow, resRow); res = resRow; } }; @@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C { static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef 
SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix; - ColMajorMatrix lhsCol = lhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol); + typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs; + typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes; + ColMajorLhs lhsCol = lhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl<ColMajorLhs,Rhs,ColMajorRes>(lhsCol, rhs, resCol); res = resCol; } }; @@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R { static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix; - ColMajorMatrix rhsCol = rhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol); + typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs; + typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes; + ColMajorRhs rhsCol = rhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorRhs,ColMajorRes>(lhs, rhsCol, resCol); res = resCol; } }; @@ -263,7 +269,8 @@ namespace internal { template<typename Lhs, typename Rhs, typename ResultType> static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef typename remove_all<Lhs>::type::Scalar Scalar; + typedef typename remove_all<Lhs>::type::Scalar LhsScalar; + typedef typename remove_all<Rhs>::type::Scalar RhsScalar; Index cols = rhs.outerSize(); eigen_assert(lhs.outerSize() == rhs.innerSize()); @@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, { for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); res.coeffRef(i,j) += x * y; } } @@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMa { static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix; - ColMajorMatrix lhsCol(lhs); - internal::sparse_sparse_to_dense_product_impl<ColMajorMatrix,Rhs,ResultType>(lhsCol, rhs, res); + typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs; + ColMajorLhs lhsCol(lhs); + internal::sparse_sparse_to_dense_product_impl<ColMajorLhs,Rhs,ResultType>(lhsCol, rhs, res); } }; @@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMa { static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix; - ColMajorMatrix rhsCol(rhs); - internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorMatrix,ResultType>(lhs, rhsCol, res); + typedef 
SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs; + ColMajorRhs rhsCol(rhs); + internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorRhs,ResultType>(lhs, rhsCol, res); } }; diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 21c419002..88820a48f 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -21,7 +21,8 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res); - typedef typename remove_all<Lhs>::type::Scalar Scalar; + typedef typename remove_all<Rhs>::type::Scalar RhsScalar; + typedef typename remove_all<ResultType>::type::Scalar ResScalar; typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex; // make sure to call innerSize/outerSize since we fake the storage order. @@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); // allocate a temporary buffer - AmbiVector<Scalar,StorageIndex> tempVector(rows); + AmbiVector<ResScalar,StorageIndex> tempVector(rows); // mimics a resizeByInnerOuter: if(ResultType::IsRowMajor) @@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index()) tempVector.restart(); - Scalar x = rhsIt.value(); + RhsScalar x = rhsIt.value(); for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt) { tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x; } } res.startVec(j); - for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it) + for (typename AmbiVector<ResScalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it) res.insertBackByOuterInner(j,it.index()) = it.value(); } res.finalize(); @@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector; template<typename Lhs, typename Rhs, typename ResultType> struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor> { - typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar; typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) @@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) { - typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs; - typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs; + typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs; + typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs; ColMajorMatrixLhs colLhs(lhs); ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance); @@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& 
res, const RealScalar& tolerance) { - typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs; + typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs; RowMajorMatrixLhs rowLhs(lhs); sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs,Rhs,ResultType,RowMajor,RowMajor>(rowLhs,rhs,res,tolerance); } @@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) { - typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs; + typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs; RowMajorMatrixRhs rowRhs(rhs); sparse_sparse_product_with_pruning_selector<Lhs,RowMajorMatrixRhs,ResultType,RowMajor,RowMajor,RowMajor>(lhs,rowRhs,res,tolerance); } @@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) { - typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs; + typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs; ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl<Lhs,ColMajorMatrixRhs,ResultType>(lhs, colRhs, res, tolerance); } @@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) { - typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs; + typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs; ColMajorMatrixLhs colLhs(lhs); internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,Rhs,ResultType>(colLhs, rhs, res, tolerance); } diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 22c1666c5..a7ec63adf 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -55,9 +55,7 @@ ivcSize(const Indices& indices) const { template<typename RowIndices, typename ColIndices> struct valid_indexed_view_overload { - // Here we use is_convertible to Index instead of is_integral in order to treat enums as Index. - // In c++11 we could use is_integral<T> && is_enum<T> if is_convertible appears to be too permissive. 
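The comment removed here suggested is_integral plus is_enum as the stricter C++11 test; that is exactly what the new internal::is_valid_index_type trait from the XprHelper.h hunk above implements. In spirit (a simplified sketch, not Eigen's exact code, which keeps a pre-C++11 is_convertible fallback):

```cpp
#include <type_traits>

// Simplified equivalent of the new trait (Eigen's version lives in
// namespace internal and uses its own is_integral helper):
template <typename T>
struct is_valid_index_type {
  static const bool value = std::is_integral<T>::value || std::is_enum<T>::value;
};

enum Axis { X = 0, Y = 1 };
static_assert(is_valid_index_type<int>::value,     "integers are valid indices");
static_assert(is_valid_index_type<Axis>::value,    "enums now count as indices too");
static_assert(!is_valid_index_type<double>::value, "floating point is rejected");
```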
- enum { value = !(internal::is_convertible<RowIndices,Index>::value && internal::is_convertible<ColIndices,Index>::value) }; + enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) }; }; public: @@ -146,7 +144,7 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col template<typename Indices> typename internal::enable_if< - IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)), + IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)), IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -157,7 +155,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST template<typename Indices> typename internal::enable_if< - (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)), + (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)), IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -168,7 +166,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST template<typename Indices> typename internal::enable_if< - (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_integral<Indices>::value) && (!Symbolic::is_symbolic<Indices>::value), + (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!Symbolic::is_symbolic<Indices>::value), VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -250,6 +248,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST * * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter. * + * See also this <a href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a> and its answer for an example of how to duplicate coefficients. + * * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index) */ template<typename RowIndices, typename ColIndices> diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index 0ff13477d..ce66c2c78 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -20,9 +20,6 @@ #include <math_constants.h> #include <cuda.h> -#if EIGEN_CUDACC_VER >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include "cuda_common.h" diff --git a/test/half_float.cpp b/test/half_float.cpp index 7b6208873..7734f82cc 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -20,7 +20,7 @@ using Eigen::half; void test_conversion() { - using Eigen::half_impl::__half; + using Eigen::half_impl::__half_raw; // Conversion from float. VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); @@ -37,9 +37,9 @@ void test_conversion() VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); // Verify round-to-nearest-even behavior. 
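Round-to-nearest-even is the IEEE 754 default tie-breaking rule, which the checks below exercise at 16 bits. The same rule can be observed at 32 bits with nothing but standard C++ (a hypothetical demo, not part of the test suite):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float a = 1.0f;                        // significand ends in 0 (even)
  float b = std::nextafter(a, 2.0f);     // next float up, significand odd
  float c = std::nextafter(b, 2.0f);     // significand even again
  // The midpoints are exactly representable in double, so converting them
  // back to float is an exact tie that must round to the even neighbor:
  std::printf("%d\n", (float)(0.5 * ((double)a + (double)b)) == a);  // 1: ties to a
  std::printf("%d\n", (float)(0.5 * ((double)b + (double)c)) == c);  // 1: ties to c
}
```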
- float val1 = float(half(__half(0x3c00))); - float val2 = float(half(__half(0x3c01))); - float val3 = float(half(__half(0x3c02))); + float val1 = float(half(__half_raw(0x3c00))); + float val2 = float(half(__half_raw(0x3c01))); + float val3 = float(half(__half_raw(0x3c02))); VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00); VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02); @@ -55,21 +55,21 @@ void test_conversion() VERIFY_IS_EQUAL(half(true).x, 0x3c00); // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f); - VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f); // Denormals. - VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f); + VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f); // NaNs and infinities. VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half(0xfc00))))); - VERIFY((numext::isnan)(float(half(__half(0xfc01))))); - VERIFY((numext::isinf)(float(half(__half(0x7c00))))); - VERIFY((numext::isnan)(float(half(__half(0x7c01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0xfc00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0xfc01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0x7c00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0x7c01))))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 @@ -79,12 +79,12 @@ void test_conversion() #endif // Exactly same checks as above, just directly on the half representation. 
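The magic constants in these checks follow from the binary16 layout (1 sign bit, 5 exponent bits, 10 significand bits): an all-ones exponent encodes infinity when the significand is zero, and NaN otherwise. A small classifier in that spirit (a hypothetical helper, not Eigen code):

```cpp
#include <cstdint>
#include <cstdio>

// Classify a binary16 bit pattern the way the tests expect.
const char* classify_half(uint16_t h) {
  uint16_t exp = (h >> 10) & 0x1F;  // 5 exponent bits
  uint16_t man = h & 0x3FF;         // 10 significand bits
  if (exp == 0x1F) return man ? "nan" : "inf";
  return "finite";
}

int main() {
  std::printf("%s %s %s %s\n",
              classify_half(0x7C00),   // +inf
              classify_half(0xFC00),   // -inf
              classify_half(0x7C01),   // nan
              classify_half(0x7BFF));  // 65504, largest finite half
}
```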
- VERIFY(!(numext::isinf)(half(__half(0x7bff)))); - VERIFY(!(numext::isnan)(half(__half(0x0000)))); - VERIFY((numext::isinf)(half(__half(0xfc00)))); - VERIFY((numext::isnan)(half(__half(0xfc01)))); - VERIFY((numext::isinf)(half(__half(0x7c00)))); - VERIFY((numext::isnan)(half(__half(0x7c01)))); + VERIFY(!(numext::isinf)(half(__half_raw(0x7bff)))); + VERIFY(!(numext::isnan)(half(__half_raw(0x0000)))); + VERIFY((numext::isinf)(half(__half_raw(0xfc00)))); + VERIFY((numext::isnan)(half(__half_raw(0xfc01)))); + VERIFY((numext::isinf)(half(__half_raw(0x7c00)))); + VERIFY((numext::isnan)(half(__half_raw(0x7c01)))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 7245cf378..8b3082cea 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -366,6 +366,11 @@ void check_indexed_view() VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) ); } + // Check compilation of enums as index type: + enum { X=0, Y=1 }; + a(X) = 1; + A(X,Y) = 1; + } void test_indexed_view() diff --git a/test/main.h b/test/main.h index bd5325196..429c44f81 100644 --- a/test/main.h +++ b/test/main.h @@ -50,6 +50,19 @@ #endif #endif +// Same for cuda_fp16.h +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +#define EIGEN_TEST_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) +#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__ +#else +#define EIGEN_TEST_CUDACC_VER 0 +#endif + +#if EIGEN_TEST_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp index 197586741..f47170b72 100644 --- a/test/sparse_product.cpp +++ b/test/sparse_product.cpp @@ -371,6 +371,88 @@ void bug_942() VERIFY_IS_APPROX( ( d.asDiagonal()*cmA ).eval().coeff(0,0), res ); } +template<typename Real> +void test_mixing_types() +{ + typedef std::complex<Real> Cplx; + typedef SparseMatrix<Real> SpMatReal; + typedef SparseMatrix<Cplx> SpMatCplx; + typedef SparseMatrix<Cplx,RowMajor> SpRowMatCplx; + typedef Matrix<Real,Dynamic,Dynamic> DenseMatReal; + typedef Matrix<Cplx,Dynamic,Dynamic> DenseMatCplx; + + Index n = internal::random<Index>(1,100); + double density = (std::max)(8./(n*n), 0.2); + + SpMatReal sR1(n,n); + SpMatCplx sC1(n,n), sC2(n,n), sC3(n,n); + SpRowMatCplx sCR(n,n); + DenseMatReal dR1(n,n); + DenseMatCplx dC1(n,n), dC2(n,n), dC3(n,n); + + initSparse<Real>(density, dR1, sR1); + initSparse<Cplx>(density, dC1, sC1); + initSparse<Cplx>(density, dC2, sC2); + + VERIFY_IS_APPROX( sC2 = (sR1 * sC1), dC3 = dR1.template cast<Cplx>() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1 * sR1), dC3 = dC1 * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1), dC3 = dR1.template cast<Cplx>().transpose() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()), dC3 = dR1.template cast<Cplx>() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast<Cplx>().transpose() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()), dC3 
= dC1.transpose() * dR1.template cast<Cplx>().transpose() ); + + VERIFY_IS_APPROX( sCR = (sR1 * sC1), dC3 = dR1.template cast<Cplx>() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1), dC3 = dC1 * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1), dC3 = dR1.template cast<Cplx>().transpose() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()), dC3 = dR1.template cast<Cplx>() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast<Cplx>().transpose() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() ); + + + VERIFY_IS_APPROX( sC2 = (sR1 * sC1).pruned(), dC3 = dR1.template cast<Cplx>() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1 * sR1).pruned(), dC3 = dC1 * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()).pruned(), dC3 = dC1 * dR1.template cast<Cplx>().transpose() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() ); + + VERIFY_IS_APPROX( sCR = (sR1 * sC1).pruned(), dC3 = dR1.template cast<Cplx>() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1).pruned(), dC3 = dC1 * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()).pruned(), dC3 = dC1 * dR1.template cast<Cplx>().transpose() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() ); + + + VERIFY_IS_APPROX( dC2 = (sR1 * sC1), dC3 = dR1.template cast<Cplx>() * dC1 ); + VERIFY_IS_APPROX( dC2 = (sC1 * sR1), dC3 = dC1 * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1), dC3 = dR1.template cast<Cplx>().transpose() * dC1 ); + VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( dC2 = (sR1 * sC1.transpose()), dC3 = dR1.template cast<Cplx>() * dC1.transpose() ); + VERIFY_IS_APPROX( dC2 = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast<Cplx>().transpose() ); + VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() ); + + + 
VERIFY_IS_APPROX( dC2 = dR1 * sC1, dC3 = dR1.template cast<Cplx>() * sC1 ); + VERIFY_IS_APPROX( dC2 = sR1 * dC1, dC3 = sR1.template cast<Cplx>() * dC1 ); + VERIFY_IS_APPROX( dC2 = dC1 * sR1, dC3 = dC1 * sR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( dC2 = sC1 * dR1, dC3 = sC1 * dR1.template cast<Cplx>() ); + + VERIFY_IS_APPROX( dC2 = dR1.row(0) * sC1, dC3 = dR1.template cast<Cplx>().row(0) * sC1 ); + VERIFY_IS_APPROX( dC2 = sR1 * dC1.col(0), dC3 = sR1.template cast<Cplx>() * dC1.col(0) ); + VERIFY_IS_APPROX( dC2 = dC1.row(0) * sR1, dC3 = dC1.row(0) * sR1.template cast<Cplx>() ); + VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast<Cplx>().col(0) ); +} + void test_sparse_product() { for(int i = 0; i < g_repeat; i++) { @@ -381,5 +463,7 @@ void test_sparse_product() CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, RowMajor > >()) ); CALL_SUBTEST_3( (sparse_product<SparseMatrix<float,ColMajor,long int> >()) ); CALL_SUBTEST_4( (sparse_product_regression_test<SparseMatrix<double,RowMajor>, Matrix<double, Dynamic, Dynamic, RowMajor> >()) ); + + CALL_SUBTEST_5( (test_mixing_types<float>()) ); } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 428b18499..903bc51cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // the sum across all big k blocks of the product of little k block of index (x, y) // with block of index (y, z). To compute the final output, we need to reduce // the 8 threads over y by summation. +#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif #define reduceRow(i, mask) \ shuffleInc(i, 0, mask); \ @@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh x1 = rhs_pf0.x; x2 = rhs_pf0.z; } + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 x1 = __shfl_xor(x1, 4); x2 = __shfl_xor(x2, 4); + #else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); + #endif if((threadIdx.x%8) < 4) { rhs_pf0.y = x1; rhs_pf0.w = x2; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 83c449cf1..b148dae39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,8 +174,10 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini<double>(threads, GenericNumTraits<int>::highest()); + return numext::mini(max_threads, numext::maxi<int>(1, threads)); } // taskSize assesses parallel task size. 
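The TensorCostModel hunk above fixes undefined behavior: a double holding a value beyond INT_MAX must be clamped before the cast to int. A reduced sketch of the fix (with made-up cycle constants; the real kStartupCycles/kPerThreadCycles differ):

```cpp
#include <algorithm>
#include <cstdio>
#include <limits>

// Reduced model of TensorCostModel::numThreads with simplified constants.
int num_threads(double total_cost, int max_threads) {
  double threads = total_cost / 100000.0 + 0.9;
  // Converting a double that exceeds INT_MAX to int is undefined behavior,
  // so clamp while the value is still a double -- this is the fix above.
  threads = std::min(threads, (double)std::numeric_limits<int>::max());
  return std::min(max_threads, std::max(1, (int)threads));
}

int main() {
  std::printf("%d\n", num_threads(1e30, 16));  // prints 16 instead of invoking UB
}
```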
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 974eb7deb..ebcbd6f41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) else { assert(0 && "Wordsize not supported"); } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } // We extend atomicExch to support extra data types @@ -104,9 +104,9 @@ template <> __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { #if EIGEN_CUDA_ARCH >= 300 atomicAdd(output, accum); -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num // Let the last block reset the semaphore atomicInc(semaphore, gridDim.x + 1); } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; // Initialize the output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - if (num_coeffs % 2 != 0) { - half last = input.m_impl.coeff(num_coeffs-1); - *scratch = __halves2half2(last, reducer.initialize()); - } else { - *scratch = reducer.template initializePacket<half2>(); + if (gridDim.x == 1) { + if (first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } } __syncthreads(); } @@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); + #else + int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(scratch, accum, reducer); } - __syncthreads(); - - if (gridDim.x == 1 && first_index == 0) { - half tmp = __low2half(*scratch); - reducer.reduce(__high2half(*scratch), &tmp); - *output = tmp; + if (gridDim.x == 1) { + __syncthreads(); + if (first_index == 0) { + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; + } } } @@ -425,7 +437,11 @@ __global__ 
void InnerReductionKernel(Reducer reducer, const Self input, Index nu #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } #ifdef EIGEN_HAS_CUDA_FP16 @@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); + #else + int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize); + int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp1), &reduced_val1); + reducer.reducePacket(*(half2*)(&temp2), &reduced_val2); + #endif } half val1 = __low2half(reduced_val1); diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h index a5d034d71..e43cdb7fb 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h +++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -341,7 +341,7 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d) // set from a vector of Euler angles template<class System, class Other> - struct eulerangles_assign_impl<System,Other,4,1> + struct eulerangles_assign_impl<System,Other,3,1> { typedef typename Other::Scalar Scalar; static void run(EulerAngles<Scalar, System>& e, const Other& vec) diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp index 79ee72847..500fb2d17 100644 --- a/unsupported/test/EulerAngles.cpp +++ b/unsupported/test/EulerAngles.cpp @@ -278,6 +278,9 @@ void test_EulerAngles() EulerAnglesXYZd onesEd(1, 1, 1); EulerAnglesXYZf onesEf = onesEd.cast<float>(); VERIFY_IS_APPROX(onesEd, onesEf.cast<double>()); + + // Simple Construction from Vector3 test + VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones())); CALL_SUBTEST_1( eulerangles_manual<float>() ); CALL_SUBTEST_2( eulerangles_manual<double>() ); diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 0dfd6cfe1..0e8b8125d 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -12,12 +12,15 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; template <int Layout> diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 83a740e7a..dabf9e45f 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ 
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 0dfd6cfe1..0e8b8125d 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -12,12 +12,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 template <int Layout>
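The same include reordering is applied to every CUDA test file in this changeset, starting with cxx11_tensor_argmax_cuda.cu above and continuing below. EIGEN_CUDACC_VER is only defined once an Eigen header has been included (see the Eigen/Core hunk at the top of this changeset), and an undefined identifier evaluates to 0 inside #if, so the old guard compared 0 >= 70500 and silently skipped <cuda_fp16.h>. Moving the check after the Tensor include and adding defined EIGEN_CUDACC_VER fixes that and lets the file preprocess cleanly under a non-nvcc compiler. Schematically, the corrected pattern is:

    // Corrected include order, as applied in each hunk of this changeset.
    #include "main.h"                          // test harness
    #include <unsupported/Eigen/CXX11/Tensor>  // defines EIGEN_CUDACC_VER under nvcc

    #if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
    #include <cuda_fp16.h>                     // half-precision types exist since CUDA 7.5
    #endif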
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index 83a740e7a..dabf9e45f 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 void test_cuda_conversion() {
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
index cbff5a9b2..d25e1bee1 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -11,12 +11,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_complex
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
index 9133fce5a..4f0f621b4 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -11,12 +11,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 template<typename T>
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 0b2f3f0f4..c68287e34 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -14,12 +14,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 typedef Tensor<float, 1>::DimensionPair DimPair;
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index ad8c9662f..d9059a2dc 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -12,12 +12,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index ae21f492a..d5bfeeb39 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 using Eigen::RowMajor;
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 0ba7657b8..c9f3ae1ae 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 template<typename>
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index 94d5f4e5a..9d08605fc 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 void test_cuda_random_uniform()
 {
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index fd09d013b..d6ce04f1c 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -12,12 +12,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
 #define EIGEN_USE_GPU
 
-#if dEIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 template<typename Type, int DataLayout>
 static void test_full_reductions() {
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
index 46571cfea..e99724b91 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 typedef Tensor<float, 1>::DimensionPair DimPair;