diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-03-08 16:02:00 -0800 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-03-08 16:02:00 -0800 |
commit | 6d6413f76832a094d0835770af2adfaabba24738 (patch) | |
tree | 26af25675b883deab50c70f8a6f69b3e50c93523 | |
parent | 5a427a94a9c04f5cc32c185c9eebe10e40956d5e (diff) |
Simplified the full reduction code
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 143 |
1 file changed, 71 insertions, 72 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 4f2801e53..875155243 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -221,121 +221,120 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers -template <typename Eval, typename Op, bool Vectorizable = (Eval::InputPacketAccess & Op::PacketAccess)> +template <typename Self, typename Op, + bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> struct FullReducerShard { - static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { - - shard->saccum = reducer.initialize(); - for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) { - reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); - } - } - - typename Eval::CoeffReturnType saccum; -}; - -template <typename Eval, typename Op> -struct FullReducerShard<Eval, Op, true> { - static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { - - const int packetSize = internal::unpacket_traits<typename Eval::PacketReturnType>::size; - const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - - shard->paccum = reducer.template initializePacket<typename Eval::PacketReturnType>(); - for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(eval.m_impl.template packet<Unaligned>(firstIndex + j), &shard->paccum); - } - shard->saccum = reducer.initialize(); - for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) { - reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); - } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index 
numValuesToReduce, Op& reducer, + typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce( + self, firstIndex, numValuesToReduce, reducer); } - - typename Eval::PacketReturnType paccum; - typename Eval::CoeffReturnType saccum; }; - template <typename Self, typename Op> struct FullReducer<Self, Op, ThreadPoolDevice, false> { static const bool HasOptimizedImplementation = !Op::IsStateful; + static const int PacketSize = + unpacket_traits<typename Self::PacketReturnType>::size; // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); - const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads()); - const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; - eigen_assert(num_coeffs >= numblocks * blocksize); - - std::vector<Notification*> results; - results.reserve(numblocks); - std::vector<FullReducerShard<Self, Op, false> > shards; - shards.resize(numblocks); - for (Index i = 0; i < numblocks; ++i) { - results.push_back(device.enqueue(&FullReducerShard<Self, Op, false>::run, self, i*blocksize, blocksize, reducer, &shards[i])); - } - - FullReducerShard<Self, Op, false> finalShard; - if (numblocks * blocksize < num_coeffs) { - FullReducerShard<Self, Op, false>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const int num_threads = device.numThreads(); + if (num_threads == 1) { + *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer); + return; } else { - finalShard.saccum = reducer.initialize(); - } - - for (Index i = 0; i < numblocks; ++i) { - wait_until_ready(results[i]); - delete results[i]; - } + const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + std::vector<Notification*> results; + results.reserve(numblocks); + std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + results.push_back( + device.enqueue(&FullReducerShard<Self, Op, false>::run, self, + i * blocksize, blocksize, reducer, &shards[i])); + } - for (Index i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i].saccum, &finalShard.saccum); + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer<Self, Op, false>::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); + } else { + finalShard = reducer.initialize(); + } + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); } - *output = reducer.finalize(finalShard.saccum); } }; template <typename Self, typename Op> struct FullReducer<Self, Op, ThreadPoolDevice, true> { static const bool HasOptimizedImplementation = !Op::IsStateful; + static const int PacketSize = + unpacket_traits<typename Self::PacketReturnType>::size; // launch one reducer per thread and accumulate the result. 
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); - const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const int num_threads = device.numThreads(); + if (num_threads == 1) { + *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); std::vector<Notification*> results; results.reserve(numblocks); - std::vector<FullReducerShard<Self, Op, true> > shards; - shards.resize(numblocks); + std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { - results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run, self, i*blocksize, blocksize, reducer, &shards[i])); + results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run, + self, i * blocksize, blocksize, reducer, + &shards[i])); } - - FullReducerShard<Self, Op, true> finalShard; + typename Self::CoeffReturnType finalShard; if (numblocks * blocksize < num_coeffs) { - FullReducerShard<Self, Op, true>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + finalShard = InnerMostDimReducer<Self, Op, true>::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); } else { - finalShard.paccum = reducer.template initializePacket<typename Self::PacketReturnType>(); - finalShard.saccum = 
reducer.initialize(); + finalShard = reducer.initialize(); } for (Index i = 0; i < numblocks; ++i) { wait_until_ready(results[i]); delete results[i]; } - for (Index i = 0; i < numblocks; ++i) { - reducer.reducePacket(shards[i].paccum, &finalShard.paccum); - reducer.reduce(shards[i].saccum, &finalShard.saccum); + reducer.reduce(shards[i], &finalShard); } - - *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum); + *output = reducer.finalize(finalShard); } }; + #endif |