diff options
Diffstat (limited to 'Eigen/src/Core/arch/CUDA/PacketMath.h')
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMath.h | 63 |
1 files changed, 59 insertions, 4 deletions
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 0d2c2fef0..932df1092 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -21,7 +21,6 @@ namespace internal { template<> struct is_arithmetic<float4> { enum { value = true }; }; template<> struct is_arithmetic<double2> { enum { value = true }; }; - template<> struct packet_traits<float> : default_packet_traits { typedef float4 type; @@ -39,6 +38,14 @@ template<> struct packet_traits<float> : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIgamma = 1, + HasIGammac = 1, HasBlend = 0, }; @@ -59,6 +66,12 @@ template<> struct packet_traits<double> : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, HasBlend = 0, }; @@ -177,25 +190,39 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to to[1] = from.y; } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return __ldg((const double2*)from); +#else + return make_double2(from[0], from[1]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 return make_double2(__ldg(from+0), __ldg(from+1)); -} +#else + return make_double2(from[0], from[1]); #endif +} template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) { return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); @@ -251,6 +278,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) return a.x * a.y; } +template<size_t offset> +struct protate_impl<offset, float4> +{ + static float4 run(const float4& a) { + if (offset == 0) { + return make_float4(a.x, a.y, a.z, a.w); + } + if (offset == 1) { + return make_float4(a.w, a.x, a.y, a.z); + } + if (offset == 2) { + return make_float4(a.z, a.w, a.x, a.y); + } + return make_float4(a.y, a.z, a.w, a.x); + } +}; + +template<size_t offset> +struct protate_impl<offset, double2> +{ + static double2 run(const double2& a) { + if (offset == 0) { + return make_double2(a.x, a.y); + } + return make_double2(a.y, a.x); + } +}; + + template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); } @@ -258,7 +314,6 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { return make_double2(fabs(a.x), fabs(a.y)); } - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4,4>& kernel) { double tmp = kernel.packet[0].y; |