From aa33446dace833fbf06632e586c80119b3d8ac11 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Jun 2016 08:22:27 -0700 Subject: Improved support for vectorization of 16-bit floats --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 86 ++++++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5 ++ .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 8 +- 3 files changed, 95 insertions(+), 4 deletions(-) (limited to 'unsupported/Eigen') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3dd32e9d1..bf52e490f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -84,6 +84,14 @@ struct functor_traits > { }; +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + // Standard reduction functors template struct SumReducer { @@ -119,6 +127,15 @@ template struct SumReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasAdd + }; +}; + + template struct MeanReducer { static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; @@ -162,6 +179,15 @@ template struct MeanReducer DenseIndex packetCount_; }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasAdd + }; +}; + + template struct MaxReducer { static const bool PacketAccess = packet_traits::HasMax; @@ -195,6 +221,15 @@ template struct MaxReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasMax + }; +}; + + template struct MinReducer { static const bool PacketAccess = packet_traits::HasMin; @@ -228,6 +263,14 @@ template struct MinReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasMin + }; +}; + template struct ProdReducer { @@ -263,6 +306,14 @@ template struct ProdReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::type::HasMul + }; +}; + struct AndReducer { @@ -280,6 +331,15 @@ struct AndReducer } }; +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + struct OrReducer { static const bool PacketAccess = false; static const bool IsStateful = false; @@ -295,6 +355,15 @@ struct OrReducer { } }; +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + // Argmin/Argmax reducers template struct ArgMaxTupleReducer { @@ -312,6 +381,15 @@ template struct ArgMaxTupleReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + + template struct ArgMinTupleReducer { static const bool PacketAccess = false; @@ -328,6 +406,14 @@ template struct ArgMinTupleReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + // Random number generation namespace { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b1645d56f..82a905a65 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -54,6 +54,11 @@ struct PacketType { // For CUDA packet types when using a GpuDevice #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <> + struct PacketType { + typedef half2 type; + static const int size = 2; + }; template <> struct PacketType { typedef float4 type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index e82530955..1b4fdd03f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -331,7 +331,7 @@ struct FullReducer { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same::value || - (internal::is_same::value && Op::PacketAccess)); + (internal::is_same::value && reducer_traits::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same::value; @@ -346,7 +346,7 @@ struct FullReducer { return; } - FullReductionLauncher::run(self, reducer, device, output, num_coeffs); + FullReductionLauncher::PacketAccess>::run(self, reducer, device, output, num_coeffs); } }; @@ -608,7 +608,7 @@ struct InnerReducer { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same::value || - (internal::is_same::value && Op::PacketAccess)); + (internal::is_same::value && reducer_traits::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same::value; @@ -627,7 +627,7 @@ struct InnerReducer { return true; } - return InnerReductionLauncher::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + return InnerReductionLauncher::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); } }; -- cgit v1.2.3