From 383d1cc2ed76d1757a45cd0b2d6559dee7e2ee1b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 20 Nov 2015 11:09:46 -0800
Subject: Added proper support for fast 64bit integer division on CUDA

---
 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 31 +++++++++--------------
 1 file changed, 12 insertions(+), 19 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 058fb2c42..81c661269 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -61,13 +61,8 @@ namespace {
 
   template <typename T>
   struct DividerTraits {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
     typedef typename UnsignedTraits<T>::type type;
     static const int N = sizeof(T) * 8;
-#else
-    typedef uint32_t type;
-    static const int N = 32;
-#endif
   };
 
   template <typename T>
@@ -79,40 +74,38 @@ namespace {
 #endif
   }
 
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
 #if defined(__CUDA_ARCH__)
-  template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
     return __umul64hi(a, b);
-  }
-#else
-  template <typename T>
-  EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
+#elif defined(__SIZEOF_INT128__)
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
 #else
-    EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return (a * b) >> 32;
+    return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
 #endif
   }
-#endif
 
   template <int N, typename T>
   struct DividerHelper {
-    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) {
+    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
       EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
       return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
     }
   };
 
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
   template <typename T>
   struct DividerHelper<64, T> {
-    static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
       return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
+#else
+      const uint64_t shift = 1ULL << log_div;
+      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      return static_cast<uint64_t>(result);
+#endif
     }
   };
-#endif
 
 }
 
-- 
cgit v1.2.3