From 780623261eedd996404795dfb7928e680408adb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 09:07:14 -0800 Subject: Re-enabled the optimized reduction CUDA code. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fd7064459..cea32d05f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -527,7 +527,6 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. -#if 0 else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -563,7 +562,6 @@ struct TensorEvaluator, Device> return false; } } -#endif return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 558d0c83d..198b3604c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,7 +126,7 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; @@ -222,7 +222,7 @@ struct InnerReducer { const int num_per_thread = 128; const int num_blocks = 32; - LAUNCH_CUDA_KERNEL((InnerReductionKernel), + LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; 
@@ -279,7 +279,7 @@ struct OuterReducer { device.maxCudaThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini(max_blocks, dyn_blocks); - LAUNCH_CUDA_KERNEL((OuterReductionKernel), + LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; -- cgit v1.2.3