From c6953f799b01d36f4236b64f351cc1446e0abe17 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 13 Oct 2020 21:48:31 +0000 Subject: Add packet generic ops `predux_fmin`, `predux_fmin_nan`, `predux_fmax`, and `predux_fmax_nan` that implement reductions with `PropagateNaN`, and `PropagateNumbers` semantics. Add (slow) generic implementations for most reductions. --- Eigen/src/Core/GenericPacketMath.h | 380 +++++++++++++++++++++---------------- 1 file changed, 214 insertions(+), 166 deletions(-) (limited to 'Eigen/src/Core/GenericPacketMath.h') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 32cedd0b1..a734d99b7 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -215,74 +215,29 @@ pmul(const bool& a, const bool& b) { return a && b; } template EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) { return a/b; } -/** \internal \returns the min of \a a and \a b (coeff-wise). - If \a a or \b b is NaN, the return value is implementation defined. */ +/** \internal \returns one bits */ template EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); } +ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} -/** \internal \returns the max of \a a and \a b (coeff-wise) - If \a a or \b b is NaN, the return value is implementation defined. */ +/** \internal \returns zero bits */ template EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } +pzero(const Packet& /*a*/) { Packet b; memset((void*)&b, 0, sizeof(b)); return b;} -/** \internal \returns the absolute value of \a a */ +/** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pabs(const Packet& a) { using std::abs; return abs(a); } -template<> EIGEN_DEVICE_FUNC inline unsigned int -pabs(const unsigned int& a) { return a; } -template<> EIGEN_DEVICE_FUNC inline unsigned long -pabs(const unsigned long& a) { return a; } -template<> EIGEN_DEVICE_FUNC inline unsigned long long -pabs(const unsigned long long& a) { return a; } +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } -/** \internal \returns the phase angle of \a a */ +/** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -parg(const Packet& a) { using numext::arg; return arg(a); } - - -/** \internal \returns \a a logically shifted by N bits to the right */ -template EIGEN_DEVICE_FUNC inline int -parithmetic_shift_right(const int& a) { return a >> N; } -template EIGEN_DEVICE_FUNC inline long int -parithmetic_shift_right(const long int& a) { return a >> N; } - -/** \internal \returns \a a arithmetically shifted by N bits to the right */ -template EIGEN_DEVICE_FUNC inline int -plogical_shift_right(const int& a) { return static_cast(static_cast(a) >> N); } -template EIGEN_DEVICE_FUNC inline long int -plogical_shift_right(const long int& a) { return static_cast(static_cast(a) >> N); } - -/** \internal \returns \a a shifted by N bits to the left */ -template EIGEN_DEVICE_FUNC inline int -plogical_shift_left(const int& a) { return a << N; } -template EIGEN_DEVICE_FUNC inline long int -plogical_shift_left(const long int& a) { return a << N; } - -/** \internal \returns the significant and exponent of the underlying floating point numbers - * See https://en.cppreference.com/w/cpp/numeric/math/frexp - */ -template -EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) { - int exp; - EIGEN_USING_STD(frexp); - Packet result = frexp(a, &exp); - exponent = static_cast(exp); - return result; -} +pcmp_lt(const Packet& a, const Packet& b) { return a EIGEN_DEVICE_FUNC inline Packet -pldexp(const Packet &a, const Packet &exponent) { - EIGEN_USING_STD(ldexp) - return ldexp(a, static_cast(exponent)); -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } -/** \internal \returns zero bits */ +/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& /*a*/) { Packet b; memset((void*)&b, 0, sizeof(b)); return b;} - +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); } template<> EIGEN_DEVICE_FUNC inline float pzero(const float& a) { EIGEN_UNUSED_VARIABLE(a) return 0.f; @@ -293,10 +248,6 @@ template<> EIGEN_DEVICE_FUNC inline double pzero(const double& a) { return 0.; } -/** \internal \returns one bits */ -template EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} - template EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { RealScalar b; @@ -341,22 +292,6 @@ pxor(const Packet& a, const Packet& b) { template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return pand(a, pxor(ptrue(b), b)); } -/** \internal \returns a <= b as a bit mask */ -template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } - -/** \internal \returns a < b as a bit mask */ -template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b) { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } - -/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ -template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); } - /** \internal \returns \a or \b for each field in packet according to \mask */ template EIGEN_DEVICE_FUNC inline Packet pselect(const Packet& mask, const Packet& a, const Packet& b) { @@ -378,6 +313,119 @@ template<> EIGEN_DEVICE_FUNC inline bool pselect( return cond ? a : b; } +/** \internal \returns the min or of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, the result is implementation defined. */ +template +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + return op(a,b); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, NaN is returned. */ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), b), + a); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If both \a a and \a b are NaN, NaN is returned. + Equivalent to std::fmin(a, b). */ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), a), + b); + } +}; + +/** \internal \returns the min of \a a and \a b (coeff-wise). + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmin(const Packet& a, const Packet& b) { return numext::mini(a,b); } + +/** \internal \returns the min of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. */ +template EIGEN_DEVICE_FUNC inline Packet +pmin(const Packet& a, const Packet& b) { return pminmax_impl::run(a,b, pmin); } + +/** \internal \returns the max of \a a and \a b (coeff-wise) + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } + +/** \internal \returns the max of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. */ +template EIGEN_DEVICE_FUNC inline Packet +pmax(const Packet& a, const Packet& b) { return pminmax_impl::run(a,b, pmax); } + +/** \internal \returns the absolute value of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pabs(const Packet& a) { return numext::abs(a); } +template<> EIGEN_DEVICE_FUNC inline unsigned int +pabs(const unsigned int& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long +pabs(const unsigned long& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long long +pabs(const unsigned long long& a) { return a; } + +/** \internal \returns the phase angle of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +parg(const Packet& a) { using numext::arg; return arg(a); } + + +/** \internal \returns \a a logically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +parithmetic_shift_right(const int& a) { return a >> N; } +template EIGEN_DEVICE_FUNC inline long int +parithmetic_shift_right(const long int& a) { return a >> N; } + +/** \internal \returns \a a arithmetically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_right(const int& a) { return static_cast(static_cast(a) >> N); } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_right(const long int& a) { return static_cast(static_cast(a) >> N); } + +/** \internal \returns \a a shifted by N bits to the left */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_left(const int& a) { return a << N; } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_left(const long int& a) { return a << N; } + +/** \internal \returns the significant and exponent of the underlying floating point numbers + * See https://en.cppreference.com/w/cpp/numeric/math/frexp + */ +template +EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) { + int exp; + EIGEN_USING_STD(frexp); + Packet result = frexp(a, &exp); + exponent = static_cast(exp); + return result; +} + +/** \internal \returns a * 2^exponent + * See https://en.cppreference.com/w/cpp/numeric/math/ldexp + */ +template EIGEN_DEVICE_FUNC inline Packet +pldexp(const Packet &a, const Packet &exponent) { + EIGEN_USING_STD(ldexp) + return ldexp(a, static_cast(exponent)); +} + /** \internal \returns the min of \a a and \a b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } @@ -507,57 +555,6 @@ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a #endif } -/** \internal \returns the first element of a packet */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type pfirst(const Packet& a) -{ return a; } - -/** \internal \returns the sum of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) -{ return a; } - -/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. - * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} - * For packet-size smaller or equal to 4, this boils down to a noop. - */ -template EIGEN_DEVICE_FUNC inline -typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_half_dowto4(const Packet& a) -{ return a; } - -/** \internal \returns the product of the elements of \a a */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) -{ return a; } - -/** \internal \returns the min of the elements of \a a */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) -{ return a; } - -/** \internal \returns the max of the elements of \a a */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) -{ return a; } - -/** \internal \returns true if all coeffs of \a a means "true" - * It is supposed to be called on values returned by pcmp_*. - */ -// not needed yet -// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) -// { return bool(a); } - -/** \internal \returns true if any coeffs of \a a means "true" - * It is supposed to be called on values returned by pcmp_*. - */ -template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) -{ - // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. - // It is expected that "true" is either: - // - Scalar(1) - // - bits full of ones (NaN for floats), - // - or first bit equals to 1 (1 for ints, smallest denormal for floats). - // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. - typedef typename unpacket_traits::type Scalar; - return numext::not_equal_strict(predux(a), Scalar(0)); -} - /** \internal \returns the reversed elements of \a a*/ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } @@ -656,53 +653,104 @@ Packet print(const Packet& a) { using numext::rint; return rint(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } +/** \internal \returns the first element of a packet */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +pfirst(const Packet& a) +{ return a; } -/** \internal \returns the max of \a a and \a b (coeff-wise) - If both \a a and \a b are NaN, NaN is returned. - Equivalent to std::fmax(a, b). */ -template EIGEN_DEVICE_FUNC inline Packet -pfmax(const Packet& a, const Packet& b) { - Packet not_nan_mask_a = pcmp_eq(a, a); - Packet not_nan_mask_b = pcmp_eq(b, b); - return pselect(not_nan_mask_a, - pselect(not_nan_mask_b, pmax(a, b), a), - b); +/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. + * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} + * For packet-size smaller or equal to 4, this boils down to a noop. + */ +template +EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type +predux_half_dowto4(const Packet& a) +{ return a; } + +// Slow generic implementation of Packet reduction. +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_helper(const Packet& a, Op op) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + Scalar elements[n]; + pstoreu(elements, a); + for(size_t k = n / 2; k > 0; k /= 2) { + for(size_t i = 0; i < k; ++i) { + elements[i] = op(elements[i], elements[i + k]); + } + } + return elements[0]; } -/** \internal \returns the min of \a a and \a b (coeff-wise) - If both \a a and \a b are NaN, NaN is returned. - Equivalent to std::fmin(a, b). */ -template EIGEN_DEVICE_FUNC inline Packet -pfmin(const Packet& a, const Packet& b) { - Packet not_nan_mask_a = pcmp_eq(a, a); - Packet not_nan_mask_b = pcmp_eq(b, b); - return pselect(not_nan_mask_a, - pselect(not_nan_mask_b, pmin(a, b), a), - b); +/** \internal \returns the sum of the elements of \a a*/ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux(const Packet& a) +{ + return predux_helper(a, padd::type>); } -/** \internal \returns the max of \a a and \a b (coeff-wise) - If either \a a or \a b are NaN, NaN is returned. */ -template EIGEN_DEVICE_FUNC inline Packet -pfmax_nan(const Packet& a, const Packet& b) { - Packet not_nan_mask_a = pcmp_eq(a, a); - Packet not_nan_mask_b = pcmp_eq(b, b); - return pselect(not_nan_mask_a, - pselect(not_nan_mask_b, pmax(a, b), b), - a); +/** \internal \returns the product of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_mul(const Packet& a) +{ + return predux_helper(a, pmul::type>); } -/** \internal \returns the min of \a a and \a b (coeff-wise) - If either \a a or \a b are NaN, NaN is returned. */ -template EIGEN_DEVICE_FUNC inline Packet -pfmin_nan(const Packet& a, const Packet& b) { - Packet not_nan_mask_a = pcmp_eq(a, a); - Packet not_nan_mask_b = pcmp_eq(b, b); - return pselect(not_nan_mask_a, - pselect(not_nan_mask_b, pmin(a, b), b), - a); +/** \internal \returns the min of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_min(const Packet& a) +{ + return predux_helper(a, pmin::type>); } +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_min(const Packet& a) +{ + return predux_helper(a, pmin::type>); +} + +/** \internal \returns the max of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_max(const Packet& a) +{ + return predux_helper(a, pmax::type>); +} + +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_max(const Packet& a) +{ + return predux_helper(a, pmax::type>); +} + +/** \internal \returns true if all coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +// not needed yet +// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) +// { return bool(a); } + +/** \internal \returns true if any coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) +{ + // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. + // It is expected that "true" is either: + // - Scalar(1) + // - bits full of ones (NaN for floats), + // - or first bit equals to 1 (1 for ints, smallest denormal for floats). + // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. + typedef typename unpacket_traits::type Scalar; + return numext::not_equal_strict(predux(a), Scalar(0)); +} /*************************************************************************** * The following functions might not have to be overwritten for vectorized types -- cgit v1.2.3