aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-06-01 11:12:07 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-06-01 11:12:07 -0700
commitd27b0ad4c8b6637aad7ef47902742626785b6a76 (patch)
tree665613bc0b1bdddcbdb68787b4e5852174e32839 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parentcd221a62eef7251f7e6815c0cb6dfc383f4ac49d (diff)
Added support for mean reductions on fp16
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h40
1 files changed, 32 insertions, 8 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 7368768cf..f9ce24373 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -230,13 +230,15 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
#endif
-template <typename Self, typename Op>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher {
- template <typename OutputType>
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
assert(false && "Should only be called on floats and half floats");
}
+};
+template <typename Self, typename Op, bool PacketAccess>
+struct FullReductionLauncher<Self, Op, float, PacketAccess> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
typedef typename Self::CoeffReturnType Scalar;
@@ -254,8 +256,18 @@ struct FullReductionLauncher {
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
}
+};
#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
@@ -279,8 +291,8 @@ struct FullReductionLauncher {
1, 1, 0, device, reducer, output, scratch);
}
}
-#endif
};
+#endif
template <typename Self, typename Op, bool Vectorizable>
@@ -306,7 +318,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
return;
}
- FullReductionLauncher<Self, Op>::run(self, reducer, device, output, num_coeffs);
+ FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
@@ -473,14 +485,16 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#endif
-template <typename Self, typename Op>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher {
- template <typename OutputType>
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce floats and half floats on a gpu device");
return true;
}
+};
+template <typename Self, typename Op, bool PacketAccess>
+struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
@@ -509,8 +523,18 @@ struct InnerReductionLauncher {
return false;
}
+};
#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
@@ -543,8 +567,8 @@ struct InnerReductionLauncher {
return false;
}
-#endif
};
+#endif
template <typename Self, typename Op>
@@ -574,7 +598,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}
- return InnerReductionLauncher<Self, Op>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
}
};