aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h67
-rw-r--r--unsupported/test/cxx11_tensor_reduction_cuda.cu26
2 files changed, 64 insertions, 29 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index d3894e625..4e2e416e6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,6 +67,23 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
#endif
}
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom<double>(double* address, double val) {
+ unsigned long long int* address_as_ull = (unsigned long long int*)address;
+ unsigned long long int old = *address_as_ull;
+ unsigned long long int assumed;
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
+ } while (assumed != old);
+ return __longlong_as_double(old);
+}
#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
@@ -130,7 +147,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
unsigned int block = atomicCAS(semaphore, 0u, 1u);
if (block == 0) {
// We're the first block to run, initialize the output value
- atomicExch(output, reducer.initialize());
+ atomicExchCustom(output, reducer.initialize());
__threadfence();
atomicExch(semaphore, 2u);
}
@@ -263,17 +280,22 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
assert(false && "Should only be called on floats and half floats");
}
};
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
@@ -330,15 +352,17 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats and half floats.
+ // so reduce the scope of the optimized version of the code to the simple cases
+ // of doubles, floats and half floats
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#elif __CUDA_ARCH__ >= 300
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
#else
static const bool HasOptimizedImplementation = false;
#endif
@@ -362,6 +386,7 @@ template <int NumPerThread, typename Self,
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
+ typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
@@ -391,13 +416,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
- float reduced_val = reducer.initialize();
+ Type reduced_val = reducer.initialize();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
- const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
}
break;
@@ -523,7 +548,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#endif
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce floats and half floats on a gpu device");
@@ -531,9 +556,15 @@ struct InnerReductionLauncher {
}
};
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -551,7 +582,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
@@ -618,10 +649,12 @@ struct InnerReducer<Self, Op, GpuDevice> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#elif __CUDA_ARCH__ >= 300
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
#else
static const bool HasOptimizedImplementation = false;
#endif
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index cad0c08e0..6d8f01c02 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -16,7 +16,7 @@
#include <unsupported/Eigen/CXX11/Tensor>
-template<int DataLayout>
+template<typename Type, int DataLayout>
static void test_full_reductions() {
Eigen::CudaStreamDevice stream;
@@ -25,24 +25,24 @@ static void test_full_reductions() {
const int num_rows = internal::random<int>(1024, 5*1024);
const int num_cols = internal::random<int>(1024, 5*1024);
- Tensor<float, 2, DataLayout> in(num_rows, num_cols);
+ Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
in.setRandom();
- Tensor<float, 0, DataLayout> full_redux;
+ Tensor<Type, 0, DataLayout> full_redux;
full_redux = in.sum();
- std::size_t in_bytes = in.size() * sizeof(float);
- std::size_t out_bytes = full_redux.size() * sizeof(float);
- float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
- float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
+ std::size_t in_bytes = in.size() * sizeof(Type);
+ std::size_t out_bytes = full_redux.size() * sizeof(Type);
+ Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+ Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
- TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
- TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
+ TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+ TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
out_gpu.device(gpu_device) = in_gpu.sum();
- Tensor<float, 0, DataLayout> full_redux_gpu;
+ Tensor<Type, 0, DataLayout> full_redux_gpu;
gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
gpu_device.synchronize();
@@ -54,6 +54,8 @@ static void test_full_reductions() {
}
void test_cxx11_tensor_reduction_cuda() {
- CALL_SUBTEST_1(test_full_reductions<ColMajor>());
- CALL_SUBTEST_2(test_full_reductions<RowMajor>());
+ CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+ CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+ CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+ CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
}