From 46f88fc454e78484ebdf9d58990d0489c1103cf4 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 11 Sep 2018 10:08:10 -0700 Subject: Use numerically stable tree reduction in TensorReduction. --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 68 +++++++-------- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 96 ++++++++++++++++++---- .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 10 +-- 3 files changed, 117 insertions(+), 57 deletions(-) (limited to 'unsupported/Eigen') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index cd666c173..9fd276075 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -58,16 +58,15 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; // Standard reduction functors template struct SumReducer { - static const bool PacketAccess = packet_traits::HasAdd; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_sum_op sum_op; *accum = sum_op(*accum, t); @@ -103,16 +102,14 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd + PacketAccess = PacketType::HasAdd, + IsStateful = false, + IsExactlyAssociative = NumTraits::IsInteger }; }; - template struct MeanReducer { - static const bool PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && !NumTraits::IsInteger; - static const bool IsStateful = true; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) { } @@ -161,7 +158,9 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd + PacketAccess = PacketType::HasAdd && !NumTraits::IsInteger, + IsStateful = true, + IsExactlyAssociative = NumTraits::IsInteger }; }; @@ -194,9 +193,6 @@ struct MinMaxBottomValue { template struct MaxReducer { - static const bool PacketAccess = packet_traits::HasMax; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t > *accum) { *accum = t; } } @@ -228,16 +224,15 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMax + PacketAccess = PacketType::HasMax, + IsStateful = false, + IsExactlyAssociative = true }; }; template struct MinReducer { - static const bool PacketAccess = packet_traits::HasMin; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t < *accum) { *accum = t; } } @@ -269,16 +264,15 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMin + PacketAccess = PacketType::HasMin, + IsStateful = false, + IsExactlyAssociative = true }; }; template struct ProdReducer { - static const bool PacketAccess = packet_traits::HasMul; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_product_op prod_op; (*accum) = prod_op(*accum, t); @@ -314,16 +308,15 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::MulCost, - PacketAccess = PacketType::HasMul + PacketAccess = PacketType::HasMul, + IsStateful = false, + IsExactlyAssociative = true }; }; struct AndReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } @@ -339,15 +332,14 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; struct OrReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } @@ -363,7 +355,9 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; @@ -371,9 +365,6 @@ struct reducer_traits { // Argmin/Argmax reducers template struct ArgMaxTupleReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t.second > accum->second) { *accum = t; } } @@ -389,16 +380,15 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; template struct ArgMinTupleReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { if (t.second < accum->second) { *accum = t; } } @@ -414,7 +404,9 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 01d3863da..d7a04a525 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -165,7 +165,9 @@ struct GenericDimReducer<-1, Self, Op> { } }; -template +template struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { typename Self::CoeffReturnType accum = reducer.initialize(); @@ -177,23 +179,88 @@ struct InnerMostDimReducer { }; template -struct InnerMostDimReducer { +struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - const int packetSize = internal::unpacket_traits::size; + const typename Self::Index packetSize = internal::unpacket_traits::size; const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType p = reducer.template initializePacket(); + typename Self::PacketReturnType paccum = reducer.template initializePacket(); for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum); } typename Self::CoeffReturnType accum = reducer.initialize(); for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); } - return reducer.finalizeBoth(accum, p); + return reducer.finalizeBoth(accum, paccum); } }; -template +static const int kLeafSize = 1024; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > kLeafSize) { + const typename Self::Index half = numValuesToReduce / 2; + reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); + reducer.reduce( + reduce(self, firstIndex + half, numValuesToReduce - half, reducer), + &accum); + } else { + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + } + return reducer.finalize(accum); + } +}; + +#if !defined(EIGEN_USE_GPU) || !defined(__CUDACC__) +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + const typename Self::Index packetSize = + internal::unpacket_traits::size; + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > packetSize * kLeafSize) { + // Make sure the split point is aligned on a packet boundary. + const typename Self::Index split = + packetSize * + divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)), + packetSize); + const typename Self::Index num_left = + numext::mini(split - firstIndex, numValuesToReduce); + reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); + if (num_left < numValuesToReduce) { + reducer.reduce( + reduce(self, split, numValuesToReduce - num_left, reducer), &accum); + } + return reducer.finalize(accum); + } else { + const typename Self::Index VectorizedSize = + (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType paccum = + reducer.template initializePacket(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j), &paccum); + } + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; + ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, paccum); + } + } +}; +#endif + +template struct InnerMostDimPreserver { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { eigen_assert(false && "should never be called"); @@ -228,7 +295,7 @@ struct InnerMostDimPreserver<-1, Self, Op, true> { }; // Default full reducer -template +template struct FullReducer { static const bool HasOptimizedImplementation = false; @@ -242,7 +309,7 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template + bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, @@ -255,8 +322,8 @@ struct FullReducerShard { // Multithreaded full reducer template struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; + static const Index PacketSize = unpacket_traits::size; // launch one reducer per thread and accumulate the result. @@ -394,6 +461,7 @@ class TensorReductionOp : public TensorBase class MakePointer_, typename Device> struct TensorEvaluator, Device> { + typedef internal::reducer_traits ReducerTraits; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -407,11 +475,11 @@ struct TensorEvaluator, static const bool InputPacketAccess = TensorEvaluator::PacketAccess; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; + static const Index PacketSize = PacketType::size; enum { IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -696,7 +764,7 @@ struct TensorEvaluator, private: template friend struct internal::GenericDimReducer; - template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimReducer; template friend struct internal::InnerMostDimPreserver; template friend struct internal::FullReducer; #ifdef EIGEN_USE_THREADS diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index cd20df505..7504c1598 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -376,12 +376,12 @@ struct FullReducer { // so reduce the scope of the optimized version of the code to the simple cases // of doubles, floats and half floats #ifdef EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Op::IsStateful && + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && (internal::is_same::value || internal::is_same::value || (internal::is_same::value && reducer_traits::PacketAccess)); #else // EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Op::IsStateful && + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && (internal::is_same::value || internal::is_same::value); #endif // EIGEN_HAS_GPU_FP16 @@ -697,12 +697,12 @@ struct InnerReducer { // so reduce the scope of the optimized version of the code to the simple case // of floats and half floats. #ifdef EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Op::IsStateful && + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && (internal::is_same::value || internal::is_same::value || (internal::is_same::value && reducer_traits::PacketAccess)); #else // EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Op::IsStateful && + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && (internal::is_same::value || internal::is_same::value); #endif // EIGEN_HAS_GPU_FP16 @@ -759,7 +759,7 @@ struct OuterReducer { // Unfortunately nvidia doesn't support well exotic types such as complex, // so reduce the scope of the optimized version of the code to the simple case // of floats. - static const bool HasOptimizedImplementation = !Op::IsStateful && + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && (internal::is_same::value || internal::is_same::value); template -- cgit v1.2.3