author | Igor Babuschkin <igor@babuschk.in> | 2016-07-01 19:08:26 +0100
---|---|---
committer | Igor Babuschkin <igor@babuschk.in> | 2016-07-01 19:08:26 +0100
commit | eeb0d880ee6dac1b28ac820566017f641f45fcf9 (patch) |
tree | c717ff06c46ede3b13405fa03cdada34311fb149 |
parent | d161b8f03acc023078457bfed2236e709c094392 (diff) |
Enable efficient Tensor reduction for doubles
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 67 |
-rw-r--r-- | unsupported/test/cxx11_tensor_reduction_cuda.cu | 26 |
2 files changed, 64 insertions, 29 deletions
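The device-side core of the patch is an `atomicExch` that also works for `double`: CUDA provides no `double` overload of `atomicExch`, so the patch emulates the exchange with an `atomicCAS` loop on the value's 64-bit pattern. Below is a minimal standalone CUDA sketch of that same idiom, outside of Eigen; the names `atomicExchDouble` and `exchange_kernel` are illustrative only (the patch itself calls the helper `atomicExchCustom`).

```cuda
#include <cuda_runtime.h>

// Standalone illustration of the CAS-based exchange idiom used by the patch.
// CUDA has no double overload of atomicExch, so the 64-bit pattern is swapped
// through atomicCAS on an unsigned long long view of the same address.
__device__ double atomicExchDouble(double* address, double val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;
  do {
    assumed = old;
    // Try to replace the value we last observed; if another thread wrote in
    // the meantime, atomicCAS returns the newer value and we retry.
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
  } while (assumed != old);
  return __longlong_as_double(old);  // previous value, as atomicExch would return
}

// Example kernel: one thread swaps a new value into *slot and records the old one.
__global__ void exchange_kernel(double* slot, double new_val, double* old_val) {
  *old_val = atomicExchDouble(slot, new_val);
}
```

The patch uses this helper in `FullReductionKernel` so the first block can initialize the double-precision output atomically, exactly as it already did for floats.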
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index d3894e625..4e2e416e6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,6 +67,23 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 #endif
 }
 
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom<double>(double* address, double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull;
+  unsigned long long int assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
+  } while (assumed != old);
+  return __longlong_as_double(old);
+}
 
 #ifdef EIGEN_HAS_CUDA_FP16
 template <template <typename T> class R>
@@ -130,7 +147,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     unsigned int block = atomicCAS(semaphore, 0u, 1u);
     if (block == 0) {
       // We're the first block to run, initialize the output value
-      atomicExch(output, reducer.initialize());
+      atomicExchCustom(output, reducer.initialize());
       __threadfence();
       atomicExch(semaphore, 2u);
     }
@@ -263,17 +280,22 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 
 #endif
 
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
     assert(false && "Should only be called on floats and half floats");
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
     typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
@@ -330,15 +352,17 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats and half floats.
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #elif __CUDA_ARCH__ >= 300
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
 #else
   static const bool HasOptimizedImplementation = false;
 #endif
@@ -362,6 +386,7 @@ template <int NumPerThread, typename Self,
 __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                      typename Self::CoeffReturnType* output) {
 #if __CUDA_ARCH__ >= 300
+  typedef typename Self::CoeffReturnType Type;
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
   eigen_assert(gridDim.y == 1);
@@ -391,13 +416,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     const Index col_block = i % input_col_blocks;
     const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
 
-    float reduced_val = reducer.initialize();
+    Type reduced_val = reducer.initialize();
 
     for (Index j = 0; j < NumPerThread; j += unroll_times) {
       const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
       if (last_col >= num_coeffs_to_reduce) {
         for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
-          const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+          const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
           reducer.reduce(val, &reduced_val);
         }
         break;
@@ -523,7 +548,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 
 #endif
 
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce floats and half floats on a gpu device");
@@ -531,9 +556,15 @@ struct InnerReductionLauncher {
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -551,7 +582,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
       const int max_blocks = device.getNumCudaMultiProcessors() *
                              device.maxCudaThreadsPerMultiProcessor() / 1024;
       const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                          num_blocks, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
@@ -618,10 +649,12 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #elif __CUDA_ARCH__ >= 300
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
 #else
   static const bool HasOptimizedImplementation = false;
 #endif
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index cad0c08e0..6d8f01c02 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -16,7 +16,7 @@
 
 #include <unsupported/Eigen/CXX11/Tensor>
 
 
-template<int DataLayout>
+template<typename Type, int DataLayout>
 static void test_full_reductions() {
   Eigen::CudaStreamDevice stream;
@@ -25,24 +25,24 @@ static void test_full_reductions() {
 
   const int num_rows = internal::random<int>(1024, 5*1024);
   const int num_cols = internal::random<int>(1024, 5*1024);
 
-  Tensor<float, 2, DataLayout> in(num_rows, num_cols);
+  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
   in.setRandom();
 
-  Tensor<float, 0, DataLayout> full_redux;
+  Tensor<Type, 0, DataLayout> full_redux;
   full_redux = in.sum();
 
-  std::size_t in_bytes = in.size() * sizeof(float);
-  std::size_t out_bytes = full_redux.size() * sizeof(float);
-  float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
-  float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
+  std::size_t in_bytes = in.size() * sizeof(Type);
+  std::size_t out_bytes = full_redux.size() * sizeof(Type);
+  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
   gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
 
-  TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
-  TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
+  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
 
   out_gpu.device(gpu_device) = in_gpu.sum();
 
-  Tensor<float, 0, DataLayout> full_redux_gpu;
+  Tensor<Type, 0, DataLayout> full_redux_gpu;
   gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
   gpu_device.synchronize();
 
@@ -54,6 +54,8 @@ static void test_full_reductions() {
 }
 
 void test_cxx11_tensor_reduction_cuda() {
-  CALL_SUBTEST_1(test_full_reductions<ColMajor>());
-  CALL_SUBTEST_2(test_full_reductions<RowMajor>());
+  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
 }
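With `FullReducer` and `InnerReducer` now reporting an optimized implementation for `double`, a full reduction of a double tensor on a `GpuDevice` takes the fast CUDA path. The sketch below mirrors the new test from the host side; it assumes a `.cu` translation unit built with nvcc against the unsupported Tensor module, and the function name `sum_doubles_on_gpu` and the tensor sizes are illustrative, not part of the patch.

```cpp
#define EIGEN_USE_GPU
#include <cstddef>
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
using Eigen::TensorMap;

// Host-side sketch mirroring the new test: sum a 2-D double tensor on the GPU.
void sum_doubles_on_gpu() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Tensor<double, 2> in(2048, 2048);
  in.setRandom();
  Tensor<double, 0> result;

  const std::size_t in_bytes  = in.size() * sizeof(double);
  const std::size_t out_bytes = sizeof(double);
  double* gpu_in  = static_cast<double*>(gpu_device.allocate(in_bytes));
  double* gpu_out = static_cast<double*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in, in.data(), in_bytes);

  TensorMap<Tensor<double, 2> > in_gpu(gpu_in, 2048, 2048);
  TensorMap<Tensor<double, 0> > out_gpu(gpu_out);

  // FullReducer now reports HasOptimizedImplementation for double, so this
  // expression dispatches to FullReductionKernel rather than the generic path.
  out_gpu.device(gpu_device) = in_gpu.sum();

  gpu_device.memcpyDeviceToHost(result.data(), gpu_out, out_bytes);
  gpu_device.synchronize();

  gpu_device.deallocate(gpu_in);
  gpu_device.deallocate(gpu_out);
}
```

The `Enabled = void` template parameter plus `internal::enable_if` in the patch is the standard SFINAE idiom that lets one partial specialization cover both `float` and `double` outputs while leaving all other scalar types on the generic assert-only fallback.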