Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 114
1 files changed, 62 insertions, 52 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f50f839fc..d6e2ab1a2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -11,7 +11,7 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 
 #ifdef EIGEN_USE_THREADS
-#include <future>
+#include <future>"
 #endif
 
 namespace Eigen {
@@ -28,45 +28,49 @@ namespace internal {
 
 // Default strategy: the expression is evaluated with a single cpu thread.
 template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = TensorEvaluator<Expression, Device>::PacketAccess>
-struct TensorExecutor
+class TensorExecutor
 {
+ public:
   typedef typename Expression::Index Index;
   EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device())
   {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    for (Index i = 0; i < size; ++i) {
-      evaluator.evalScalar(i);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+      for (Index i = 0; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
     }
-
     evaluator.cleanup();
   }
 };
 
 
 template<typename Expression>
-struct TensorExecutor<Expression, DefaultDevice, true>
+class TensorExecutor<Expression, DefaultDevice, true>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
   {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-    const int VectorizedSize = (size / PacketSize) * PacketSize;
-
-    for (Index i = 0; i < VectorizedSize; i += PacketSize) {
-      evaluator.evalPacket(i);
-    }
-    for (Index i = VectorizedSize; i < size; ++i) {
-      evaluator.evalScalar(i);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+      static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      const int VectorizedSize = (size / PacketSize) * PacketSize;
+
+      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+      for (Index i = VectorizedSize; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
     }
-
     evaluator.cleanup();
   }
 };
@@ -107,38 +111,40 @@ struct EvalRange<Evaluator, Index, true> {
 };
 
 template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const ThreadPoolDevice& device)
   {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
     Evaluator evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-
-    const Index size = evaluator.dimensions().TotalSize();
-
-    static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
-    int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
-    const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-    const Index numblocks = size / blocksize;
-
-    Index i = 0;
-    vector<std::future<void> > results;
-    results.reserve(numblocks);
-    for (int i = 0; i < numblocks; ++i) {
-      results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
-    }
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+
+      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+
+      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
+      const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+      const Index numblocks = size / blocksize;
+
+      Index i = 0;
+      vector<std::future<void> > results;
+      results.reserve(numblocks);
+      for (int i = 0; i < numblocks; ++i) {
+        results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+      }
 
-    for (int i = 0; i < numblocks; ++i) {
-      results[i].get();
-    }
+      for (int i = 0; i < numblocks; ++i) {
+        results[i].get();
+      }
 
-    if (numblocks * blocksize < size) {
-      EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+      if (numblocks * blocksize < size) {
+        EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+      }
     }
-
     evaluator.cleanup();
   }
 };
@@ -157,19 +163,23 @@ __global__ void EigenMetaKernel(Evaluator eval, unsigned int size) {
 }
 
 template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, GpuDevice, Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const GpuDevice& device)
   {
     TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-    const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
-    const int block_size = maxCudaThreadsPerBlock();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
-    eigen_assert(cudaGetLastError() == cudaSuccess);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+      const int block_size = maxCudaThreadsPerBlock();
+
+      const Index size = evaluator.dimensions().TotalSize();
+      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+      assert(cudaGetLastError() == cudaSuccess);
+    }
    evaluator.cleanup();
   }
 };
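The central change in this patch is the evaluator contract: evalSubExprsIfNeeded() now takes a destination pointer and returns whether the executor still has to run the per-coefficient assignment loop, so every specialization wraps its loop in if (needs_assign) and unconditionally calls cleanup(). The following standalone sketch illustrates that pattern with a mock evaluator; MockEvaluator, totalSize and the free run() function are illustrative assumptions, not Eigen's actual classes or API.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct MockEvaluator {
  std::vector<float> data;

  explicit MockEvaluator(std::size_t n) : data(n, 1.0f) {}

  // Returns true when the caller still has to assign coefficient by coefficient.
  // A destination-aware evaluator could write its result into dest and return false.
  bool evalSubExprsIfNeeded(float* dest) {
    if (dest == NULL) return true;
    std::copy(data.begin(), data.end(), dest);
    return false;
  }

  std::size_t totalSize() const { return data.size(); }
  float evalScalar(std::size_t i) const { return data[i] * 2.0f; }
  void cleanup() {}
};

// Mirrors the shape of the single-threaded executor in the patch.
void run(MockEvaluator& eval, float* out) {
  const bool needs_assign = eval.evalSubExprsIfNeeded(NULL);  // the executor passes NULL
  if (needs_assign) {
    for (std::size_t i = 0; i < eval.totalSize(); ++i) {
      out[i] = eval.evalScalar(i);
    }
  }
  eval.cleanup();  // cleanup() runs whether or not the loop was needed
}

int main() {
  MockEvaluator eval(4);
  std::vector<float> out(4, 0.0f);
  run(eval, out.data());
  std::cout << out[0] << " " << out[3] << std::endl;  // prints "2 2"
  return 0;
}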
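The ThreadPoolDevice specialization keeps its existing strategy, now nested inside the needs_assign check: round the per-thread block size up to a multiple of the packet size, evaluate full blocks through std::async, wait on the futures, then evaluate the remaining tail in the calling thread. Below is a minimal standalone sketch of that partitioning; kPacketSize, evalRange and parallelEval are assumed stand-ins for unpacket_traits<...>::size, EvalRange<Evaluator, Index>::run and the executor itself, and the integer ceiling replaces the patch's std::ceil on floats.

#include <algorithm>
#include <functional>
#include <future>
#include <iostream>
#include <numeric>
#include <vector>

static const int kPacketSize = 4;  // stand-in for unpacket_traits<PacketReturnType>::size

// Stand-in for EvalRange<Evaluator, Index>::run: evaluate coefficients [first, last).
void evalRange(std::vector<float>& out, int first, int last) {
  for (int i = first; i < last; ++i) {
    out[i] = 2.0f * static_cast<float>(i);
  }
}

void parallelEval(std::vector<float>& out, int num_threads) {
  const int size = static_cast<int>(out.size());

  // Integer form of the patch's arithmetic: ceil(size / num_threads), rounded down
  // to a packet multiple but never below a single packet.
  const int blocksz = (size + num_threads - 1) / num_threads + kPacketSize - 1;
  const int blocksize = std::max(kPacketSize, blocksz - (blocksz % kPacketSize));
  const int numblocks = size / blocksize;

  // Launch one asynchronous task per full block and wait for all of them.
  std::vector<std::future<void> > results;
  results.reserve(numblocks);
  for (int i = 0; i < numblocks; ++i) {
    results.push_back(std::async(std::launch::async, &evalRange, std::ref(out),
                                 i * blocksize, (i + 1) * blocksize));
  }
  for (int i = 0; i < numblocks; ++i) {
    results[i].get();
  }

  // The remainder that does not fill a whole block is evaluated in the calling thread.
  if (numblocks * blocksize < size) {
    evalRange(out, numblocks * blocksize, size);
  }
}

int main() {
  std::vector<float> out(1000);
  parallelEval(out, 4);
  std::cout << std::accumulate(out.begin(), out.end(), 0.0f) << std::endl;  // 999000
  return 0;
}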