diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-31 10:33:40 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-31 10:33:40 -0700 |
commit | 5aeb3687c4dd2d86909d5f3ba0428ac883dfcf06 (patch) | |
tree | 575fd3301621dd2748743986aed380582e40ce5a /unsupported | |
parent | b6e306f189498b89b555f49c548bd3a0dddf2594 (diff) |
Only enable optimized reductions of fp16 if the reduction functor supports them
Diffstat (limited to 'unsupported')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 10 |
1 file changed, 2 insertions, 8 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 4f2dfcb7a..7368768cf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -291,7 +291,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value); + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same<typename Self::CoeffReturnType, float>::value; @@ -475,12 +475,6 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, template <typename Self, typename Op> struct InnerReductionLauncher { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. 
- static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; - template <typename OutputType> static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats and half floats on a gpu device"); @@ -561,7 +555,7 @@ struct InnerReducer<Self, Op, GpuDevice> { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value); + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same<typename Self::CoeffReturnType, float>::value; |