From 748c4c4599918ef27b61bade7cea91c4ea8845e1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 11 Apr 2016 13:11:04 -0700
Subject: More accurate cost estimates for exp, log, tanh, and sqrt.

---
 Eigen/src/Core/functors/UnaryFunctors.h | 88 ++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 12 deletions(-)

(limited to 'Eigen')

diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 7ba0abedc..22ceb4c1c 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -234,9 +234,33 @@ template<typename Scalar> struct scalar_exp_op {
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_exp_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
+template <typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExp,
+    // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+    // Haswell can issue 2 add/mul/madd per cycle.
+    Cost =
+    (sizeof(Scalar) == 4
+     // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other
+     ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost)
+     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
+     : (14 * NumTraits<Scalar>::AddCost +
+        6 * NumTraits<Scalar>::MulCost +
+        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)),
+#else
+    Cost =
+    (sizeof(Scalar) == 4
+     // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other
+     ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost)
+     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
+     : (23 * NumTraits<Scalar>::AddCost +
+        12 * NumTraits<Scalar>::MulCost +
+        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+#endif
+  };
+};
 
 /** \internal
   *
@@ -250,9 +274,24 @@ template<typename Scalar> struct scalar_log_op {
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_log_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+template <typename Scalar>
+struct functor_traits<scalar_log_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog,
+    Cost =
+    (PacketAccess
+     // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+     // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle.
+     ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost)
+#else
+     // 8 pmadd, 6 pmul, 8 padd/psub, 20 other
+     ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost)
+#endif
+     // Measured cost of std::log.
+     : sizeof(Scalar)==4 ? 40 : 85)
+  };
+};
 
 /** \internal
   *
@@ -280,10 +319,19 @@ template<typename Scalar> struct scalar_sqrt_op {
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_sqrt_op<Scalar> >
-{ enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
+template <typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar> > {
+  enum {
+#if EIGEN_FAST_MATH
+    // The following numbers are based on the AVX implementation.
+    Cost = (sizeof(Scalar) == 8 ? 28
+                                // 4 pmul, 1 pmadd, 3 other
+                                : (3 * NumTraits<Scalar>::AddCost +
+                                   5 * NumTraits<Scalar>::MulCost)),
+#else
+    // The following numbers are based on min VSQRT throughput on Haswell.
+    Cost = (sizeof(Scalar) == 8 ? 28 : 14),
+#endif
     PacketAccess = packet_traits<Scalar>::HasSqrt
   };
 };
@@ -574,8 +622,24 @@ template <typename Scalar>
 struct functor_traits<scalar_tanh_op<Scalar> >
 {
   enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasTanh
+    PacketAccess = packet_traits<Scalar>::HasTanh,
+    Cost =
+    (PacketAccess
+     // The following numbers are based on the AVX implementation,
+#ifdef EIGEN_VECTORIZE_FMA
+     // Haswell can issue 2 add/mul/madd per cycle.
+     // 9 pmadd, 2 pmul, 1 div, 2 other
+     ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
+        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+#else
+     ? (11 * NumTraits<Scalar>::AddCost +
+        11 * NumTraits<Scalar>::MulCost +
+        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+#endif
+     // This number assumes a naive implementation of tanh
+     : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost +
+        2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost +
+        functor_traits<scalar_exp_op<Scalar> >::Cost))
   };
 };
 
-- 
cgit v1.2.3