aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/CUDA/PacketMath.h
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen/src/Core/arch/CUDA/PacketMath.h')
-rw-r--r--Eigen/src/Core/arch/CUDA/PacketMath.h63
1 files changed, 59 insertions, 4 deletions
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 0d2c2fef0..932df1092 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -21,7 +21,6 @@ namespace internal {
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
-
template<> struct packet_traits<float> : default_packet_traits
{
typedef float4 type;
@@ -39,6 +38,14 @@ template<> struct packet_traits<float> : default_packet_traits
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasLGamma = 1,
+ HasDiGamma = 1,
+ HasZeta = 1,
+ HasPolygamma = 1,
+ HasErf = 1,
+ HasErfc = 1,
+ HasIgamma = 1,
+ HasIGammac = 1,
HasBlend = 0,
};
@@ -59,6 +66,12 @@ template<> struct packet_traits<double> : default_packet_traits
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
+ HasLGamma = 1,
+ HasDiGamma = 1,
+ HasErf = 1,
+ HasErfc = 1,
+ HasIGamma = 1,
+ HasIGammac = 1,
HasBlend = 0,
};
@@ -177,25 +190,39 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
to[1] = from.y;
}
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
return __ldg((const float4*)from);
+#else
+ return make_float4(from[0], from[1], from[2], from[3]);
+#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
return __ldg((const double2*)from);
+#else
+ return make_double2(from[0], from[1]);
+#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+#else
+ return make_float4(from[0], from[1], from[2], from[3]);
+#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
return make_double2(__ldg(from+0), __ldg(from+1));
-}
+#else
+ return make_double2(from[0], from[1]);
#endif
+}
template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
@@ -251,6 +278,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a)
return a.x * a.y;
}
+template<size_t offset>
+struct protate_impl<offset, float4>
+{
+ static float4 run(const float4& a) {
+ if (offset == 0) {
+ return make_float4(a.x, a.y, a.z, a.w);
+ }
+ if (offset == 1) {
+ return make_float4(a.w, a.x, a.y, a.z);
+ }
+ if (offset == 2) {
+ return make_float4(a.z, a.w, a.x, a.y);
+ }
+ return make_float4(a.y, a.z, a.w, a.x);
+ }
+};
+
+template<size_t offset>
+struct protate_impl<offset, double2>
+{
+ static double2 run(const double2& a) {
+ if (offset == 0) {
+ return make_double2(a.x, a.y);
+ }
+ return make_double2(a.y, a.x);
+ }
+};
+
+
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
}
@@ -258,7 +314,6 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
return make_double2(fabs(a.x), fabs(a.y));
}
-
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y;