diff options
Diffstat (limited to 'Eigen/src/Core/arch/CUDA')
-rw-r--r-- | Eigen/src/Core/arch/CUDA/Half.h | 497 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/MathFunctions.h | 115 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMath.h | 63 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 192 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/TypeCasting.h | 112 |
5 files changed, 975 insertions, 4 deletions
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h new file mode 100644 index 000000000..281b8e4c6 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -0,0 +1,497 @@ +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// class Eigen::half (inheriting from CUDA's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. +// +// +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef EIGEN_HALF_CUDA_H +#define EIGEN_HALF_CUDA_H + +#if __cplusplus > 199711L +#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() +#else +#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() +#endif + + +#if !defined(EIGEN_HAS_CUDA_FP16) + +// Make our own __half definition that is similar to CUDA's. +struct __half { + unsigned short x; +}; + +#endif + +namespace Eigen { + +namespace internal { + +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); + +} // end namespace internal + +// Class definition. +struct half : public __half { + EIGEN_DEVICE_FUNC half() {} + + EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {} + + explicit EIGEN_DEVICE_FUNC half(bool b) + : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + explicit EIGEN_DEVICE_FUNC half(int i) + : __half(internal::float_to_half_rtne(static_cast<float>(i))) {} + explicit EIGEN_DEVICE_FUNC half(long l) + : __half(internal::float_to_half_rtne(static_cast<float>(l))) {} + explicit EIGEN_DEVICE_FUNC half(long long ll) + : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : __half(internal::float_to_half_rtne(f)) {} + explicit EIGEN_DEVICE_FUNC half(double d) + : __half(internal::float_to_half_rtne(static_cast<float>(d))) {} + + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { + // +0.0 and -0.0 become false, everything else becomes true. + return (x & 0x7fff) != 0; + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { + return static_cast<signed char>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { + return static_cast<unsigned char>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { + return static_cast<short>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { + return static_cast<unsigned short>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { + return static_cast<int>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { + return static_cast<unsigned int>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { + return static_cast<long>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { + return static_cast<unsigned long>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { + return static_cast<long long>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { + return static_cast<unsigned long long>(internal::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { + return internal::half_to_float(*this); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { + return static_cast<double>(internal::half_to_float(*this)); + } + + EIGEN_DEVICE_FUNC half& operator=(const half& other) { + x = other.x; + return *this; + } +}; + +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. + +__device__ half operator + (const half& a, const half& b) { + return __hadd(a, b); +} +__device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +__device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +__device__ half operator / (const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} +__device__ half operator - (const half& a) { + return __hneg(a); +} +__device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +__device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +__device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +__device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +__device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +__device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +__device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +__device__ bool operator <= (const half& a, const half& b) { + return __hle(a, b); +} +__device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} +__device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} + +#else // Emulate support for half floats + +// Definitions for CPUs and older CUDA, mostly working through conversion +// to/from fp32. + +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return float(a) == float(b); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return float(a) != float(b); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} + +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return Eigen::half(static_cast<float>(a) / static_cast<float>(b)); +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. + +namespace internal { + +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { + __half h; + h.x = x; + return h; +} + +union FP32 { + unsigned int u; + float f; +}; + +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(ff); + +#elif defined(EIGEN_HAS_FP16_C) + __half h; + h.x = _cvtss_sh(ff, 0); + return h; + +#else + FP32 f; f.f = ff; + + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + __half o = { 0 }; + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast<unsigned short>(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast<unsigned short>(f.u >> 13); + } + } + + o.x |= static_cast<unsigned short>(sign >> 16); + return o; +#endif +} + +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __half2float(h); + +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); + +#else + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +} // end namespace internal + +// Traits. + +namespace internal { + +template<> struct is_arithmetic<half> { enum { value = true }; }; + +} // end namespace internal + +template<> struct NumTraits<Eigen::half> + : GenericNumTraits<Eigen::half> +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return internal::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-3f); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { + return internal::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { + return internal::raw_uint16_to_half(0xfbff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { + return internal::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return internal::raw_uint16_to_half(0x7c01); + } +}; + +// Infinity/NaN checks. + +namespace numext { + +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { + return (a.x & 0x7fff) == 0x7c00; +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hisnan(a); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { + return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { + Eigen::half result; + result.x = a.x & 0x7FFF; + return result; +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { + return Eigen::half(::expf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { + return Eigen::half(::logf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { + return Eigen::half(::sqrtf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { + return Eigen::half(::floorf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { + return Eigen::half(::ceilf(float(a))); +} + +} // end namespace numext + +} // end namespace Eigen + +// Standard mathematical functions and trancendentals. +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { + Eigen::half result; + result.x = a.x & 0x7FFF; + return result; +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { + return Eigen::half(::expf(float(a))); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { + return Eigen::half(::logf(float(a))); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { + return Eigen::half(::sqrtf(float(a))); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { + return Eigen::half(::floorf(float(a))); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { + return Eigen::half(::ceilf(float(a))); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) { + return (Eigen::numext::isnan)(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) { + return (Eigen::numext::isinf)(a); +} +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a) { + return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); +} + + +namespace std { + +#if __cplusplus > 199711L +template <> +struct hash<Eigen::half> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast<std::size_t>(a.x); + } +}; +#endif + +} // end namespace std + + +// Add the missing shfl_xor intrinsic +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); +} +#endif + +// ldg() has an overload for __half, but we also need one for Eigen::half. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320 +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::internal::raw_uint16_to_half( + __ldg(reinterpret_cast<const unsigned short*>(ptr))); +} +#endif + + +#endif // EIGEN_HALF_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 3bea88bea..317499b29 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -66,6 +66,121 @@ double2 prsqrt<double2>(const double2& a) return make_double2(rsqrt(a.x), rsqrt(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plgamma<float4>(const float4& a) +{ + return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plgamma<double2>(const double2& a) +{ + return make_double2(lgamma(a.x), lgamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pdigamma<float4>(const float4& a) +{ + using numext::digamma; + return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pdigamma<double2>(const double2& a) +{ + using numext::digamma; + return make_double2(digamma(a.x), digamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pzeta<float4>(const float4& x, const float4& q) +{ + using numext::zeta; + return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pzeta<double2>(const double2& x, const double2& q) +{ + using numext::zeta; + return make_double2(zeta(x.x, q.x), zeta(x.y, q.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 ppolygamma<float4>(const float4& n, const float4& x) +{ + using numext::polygamma; + return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 ppolygamma<double2>(const double2& n, const double2& x) +{ + using numext::polygamma; + return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perf<float4>(const float4& a) +{ + return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perf<double2>(const double2& a) +{ + return make_double2(erf(a.x), erf(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perfc<float4>(const float4& a) +{ + return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perfc<double2>(const double2& a) +{ + return make_double2(erfc(a.x), erfc(a.y)); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma<float4>(const float4& a, const float4& x) +{ + using numext::igamma; + return make_float4( + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigamma<double2>(const double2& a, const double2& x) +{ + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac<float4>(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac<double2>(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} + #endif } // end namespace internal diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 0d2c2fef0..932df1092 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -21,7 +21,6 @@ namespace internal { template<> struct is_arithmetic<float4> { enum { value = true }; }; template<> struct is_arithmetic<double2> { enum { value = true }; }; - template<> struct packet_traits<float> : default_packet_traits { typedef float4 type; @@ -39,6 +38,14 @@ template<> struct packet_traits<float> : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIgamma = 1, + HasIGammac = 1, HasBlend = 0, }; @@ -59,6 +66,12 @@ template<> struct packet_traits<double> : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, HasBlend = 0, }; @@ -177,25 +190,39 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to to[1] = from.y; } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return __ldg((const double2*)from); +#else + return make_double2(from[0], from[1]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return make_double2(__ldg(from+0), __ldg(from+1)); -} +#else + return make_double2(from[0], from[1]); #endif +} template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) { return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); @@ -251,6 +278,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) return a.x * a.y; } +template<size_t offset> +struct protate_impl<offset, float4> +{ + static float4 run(const float4& a) { + if (offset == 0) { + return make_float4(a.x, a.y, a.z, a.w); + } + if (offset == 1) { + return make_float4(a.w, a.x, a.y, a.z); + } + if (offset == 2) { + return make_float4(a.z, a.w, a.x, a.y); + } + return make_float4(a.y, a.z, a.w, a.x); + } +}; + +template<size_t offset> +struct protate_impl<offset, double2> +{ + static double2 run(const double2& a) { + if (offset == 0) { + return make_double2(a.x, a.y); + } + return make_double2(a.y, a.x); + } +}; + + template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); } @@ -258,7 +314,6 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { return make_double2(fabs(a.x), fabs(a.y)); } - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4,4>& kernel) { double tmp = kernel.packet[0].y; diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h new file mode 100644 index 000000000..61d532e4d --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -0,0 +1,192 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H +#define EIGEN_PACKET_MATH_HALF_CUDA_H + +#if defined(EIGEN_HAS_CUDA_FP16) + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + +// Most of the following operations require arch >= 5.3 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +namespace Eigen { +namespace internal { + +template<> struct is_arithmetic<half2> { enum { value = true }; }; + +template<> struct packet_traits<half> : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasDiv = 1 + }; +}; + + +template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) { + return __half2half2(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) { + return *reinterpret_cast<const half2*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) { + *reinterpret_cast<half2*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) { + return __ldg((const half2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) { + return __halves2half2(__ldg(from+0), __ldg(from+1)); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) { + return __low2half(a); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) { + half2 result; + result.x = a.x & 0x7FFF7FFF; + return result; +} + + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<half2,2>& kernel) { + half a1 = __low2half(kernel.packet[0]); + half a2 = __high2half(kernel.packet[0]); + half b1 = __low2half(kernel.packet[1]); + half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) { + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { + return __hadd2(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { + return __hsub2(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { + return __hneg2(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { + return __hmul2(a, b); +} + + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { + return __hfma2(a, b, c); + } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) { + return __hadd(__low2half(a), __high2half(a)); +} + +template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) { + half first = __low2half(a); + half second = __high2half(a); + return __hgt(first, second) ? first : second; +} + +template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) { + half first = __low2half(a); + half second = __high2half(a); + return __hlt(first, second) ? first : second; +} + +template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) { + return __hmul(__low2half(a), __high2half(a)); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif +#endif +#endif +#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h new file mode 100644 index 000000000..396b38eaf --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -0,0 +1,112 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_CUDA_H +#define EIGEN_TYPE_CASTING_CUDA_H + +namespace Eigen { + +namespace internal { + +#if defined(EIGEN_HAS_CUDA_FP16) + +template<> +struct scalar_cast_op<float, half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(a); + #else + return half(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<float, half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<int, half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(static_cast<float>(a)); + #else + return half(static_cast<float>(a)); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<int, half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<half, float> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __half2float(a); + #else + return static_cast<float>(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<half, float> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +template <> +struct type_casting_traits<half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + +template <> +struct type_casting_traits<float, half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) { + // Simply discard the second half of the input + return __float22half2_rn(make_float2(a.x, a.y)); +} + +#endif +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_CUDA_H |