author    Benoit Steiner <benoit.steiner.goog@gmail.com>    2014-08-13 08:22:05 -0700
committer Benoit Steiner <benoit.steiner.goog@gmail.com>    2014-08-13 08:22:05 -0700
commit    439feca139a093292923e14c085352e5dd2239a2 (patch)
tree      a247b2285e4d5472eb68f93535630dbc2f1bc67e /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent    647622281e5409e617854d35450afc0cd3a4dd49 (diff)
Reworked the TensorExecutor code to support in-place evaluation.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  |  114
1 file changed, 62 insertions(+), 52 deletions(-)
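To make the new contract concrete before reading the diff: evalSubExprsIfNeeded() now takes a destination pointer and reports whether the caller still has to run the assignment loop; when a sub-expression can be materialized directly into the destination, it returns false and the executor skips the loop. A minimal standalone sketch, using a hypothetical EvaluatorSketch class that only mirrors the calls visible in the diff:

    // Hedged sketch of the in-place evaluation contract; EvaluatorSketch
    // is illustrative, not one of Eigen's actual evaluator classes.
    #include <cstddef>

    struct EvaluatorSketch {
      const float* src;
      float* result;
      std::size_t n;

      // Returns true if the caller must still evaluate element by element;
      // returns false if the result was already written into `dest`.
      bool evalSubExprsIfNeeded(float* dest) {
        if (dest == NULL) return true;  // no destination: caller assigns
        for (std::size_t i = 0; i < n; ++i) dest[i] = src[i];  // in place
        return false;
      }
      void evalScalar(std::size_t i) { result[i] = src[i]; }
      void cleanup() {}
    };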
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f50f839fc..d6e2ab1a2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -11,7 +11,7 @@
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#ifdef EIGEN_USE_THREADS
-#include <future>
+#include <future>"
#endif
namespace Eigen {
@@ -28,45 +28,49 @@ namespace internal {
// Default strategy: the expression is evaluated with a single cpu thread.
template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = TensorEvaluator<Expression, Device>::PacketAccess>
-struct TensorExecutor
+class TensorExecutor
{
+ public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr, const Device& device = Device())
{
TensorEvaluator<Expression, Device> evaluator(expr, device);
- evaluator.evalSubExprsIfNeeded();
-
- const Index size = evaluator.dimensions().TotalSize();
- for (Index i = 0; i < size; ++i) {
- evaluator.evalScalar(i);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = evaluator.dimensions().TotalSize();
+ for (Index i = 0; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
}
-
evaluator.cleanup();
}
};
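For orientation, a hedged usage sketch: user code never invokes TensorExecutor directly; assigning a tensor expression routes through run() on the chosen device. The snippet assumes the unsupported Tensor module headers and only illustrates the dispatch, not the internal expression types.

    #include <unsupported/Eigen/CXX11/Tensor>

    void default_device_example() {
      Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
      a.setRandom();
      // This assignment builds an expression tree and hands it to
      // TensorExecutor<..., DefaultDevice, ...>::run, which now calls
      // evalSubExprsIfNeeded(NULL) before the coefficient loop.
      b = a + a.constant(1.0f);
    }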
template<typename Expression>
-struct TensorExecutor<Expression, DefaultDevice, true>
+class TensorExecutor<Expression, DefaultDevice, true>
{
+ public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
{
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
- evaluator.evalSubExprsIfNeeded();
-
- const Index size = evaluator.dimensions().TotalSize();
- static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
- const int VectorizedSize = (size / PacketSize) * PacketSize;
-
- for (Index i = 0; i < VectorizedSize; i += PacketSize) {
- evaluator.evalPacket(i);
- }
- for (Index i = VectorizedSize; i < size; ++i) {
- evaluator.evalScalar(i);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = evaluator.dimensions().TotalSize();
+ static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+ const int VectorizedSize = (size / PacketSize) * PacketSize;
+
+ for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ for (Index i = VectorizedSize; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
}
-
evaluator.cleanup();
}
};
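The vectorized specialization splits the range into a packet-aligned head and a scalar tail; the arithmetic is easy to check in isolation (the PacketSize value below is illustrative):

    #include <cstdio>

    int main() {
      const long size = 70;       // total coefficients (illustrative)
      const int PacketSize = 8;   // illustrative SIMD packet width
      // Largest multiple of PacketSize that does not exceed size.
      const long VectorizedSize = (size / PacketSize) * PacketSize;  // 64
      std::printf("packets over [0, %ld), scalars over [%ld, %ld)\n",
                  VectorizedSize, VectorizedSize, size);
      return 0;
    }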
@@ -107,38 +111,40 @@ struct EvalRange<Evaluator, Index, true> {
};
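Only the closing brace of EvalRange appears in this hunk; based on the call sites below (run(&evaluator, first, last)), a plausible sketch of the non-vectorized range runner looks like this (the body is an assumption, not the verbatim Eigen code):

    // Hypothetical reconstruction of the scalar EvalRange; the real header
    // also provides the vectorized specialization named in the hunk header.
    template <typename Evaluator, typename Index>
    struct EvalRangeSketch {
      static void run(Evaluator* evaluator, Index first, Index last) {
        for (Index i = first; i < last; ++i) {
          evaluator->evalScalar(i);  // one coefficient at a time
        }
      }
    };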
template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
{
+ public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
{
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
Evaluator evaluator(expr, device);
- evaluator.evalSubExprsIfNeeded();
-
- const Index size = evaluator.dimensions().TotalSize();
-
- static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
- int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
- const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
- const Index numblocks = size / blocksize;
-
- Index i = 0;
- vector<std::future<void> > results;
- results.reserve(numblocks);
- for (int i = 0; i < numblocks; ++i) {
- results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
- }
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = evaluator.dimensions().TotalSize();
+
+ static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+
+ int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
+ const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index numblocks = size / blocksize;
+
+ Index i = 0;
+ vector<std::future<void> > results;
+ results.reserve(numblocks);
+ for (int i = 0; i < numblocks; ++i) {
+ results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+ }
- for (int i = 0; i < numblocks; ++i) {
- results[i].get();
- }
+ for (int i = 0; i < numblocks; ++i) {
+ results[i].get();
+ }
- if (numblocks * blocksize < size) {
- EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+ if (numblocks * blocksize < size) {
+ EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+ }
}
-
evaluator.cleanup();
}
};
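The block partitioning rounds the per-thread share up to a multiple of PacketSize, fans the blocks out with std::async, joins the futures, and handles any leftover tail inline. The same logic, extracted into a standalone sketch (process_range is a stand-in for EvalRange::run):

    #include <algorithm>
    #include <cmath>
    #include <future>
    #include <vector>

    static void process_range(int first, int last) {
      // Stand-in for EvalRange<Evaluator, Index>::run over [first, last).
    }

    int main() {
      const int size = 1000, num_threads = 4, PacketSize = 8;
      int blocksz = static_cast<int>(
          std::ceil(static_cast<float>(size) / num_threads)) + PacketSize - 1;
      // Round down to a multiple of PacketSize, but never below PacketSize.
      const int blocksize = std::max(PacketSize, blocksz - (blocksz % PacketSize));
      const int numblocks = size / blocksize;

      std::vector<std::future<void> > results;
      results.reserve(numblocks);
      for (int i = 0; i < numblocks; ++i)
        results.push_back(std::async(std::launch::async, process_range,
                                     i * blocksize, (i + 1) * blocksize));
      for (int i = 0; i < numblocks; ++i)
        results[i].get();                  // join all worker blocks
      if (numblocks * blocksize < size)    // leftover tail, run inline
        process_range(numblocks * blocksize, size);
      return 0;
    }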
@@ -157,19 +163,23 @@ __global__ void EigenMetaKernel(Evaluator eval, unsigned int size) {
}
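Only the closing brace of EigenMetaKernel is visible in this hunk. Judging from the launch site below, it is a kernel templated on the evaluator and given the flat coefficient count; a plausible grid-stride sketch (the body is an assumption):

    // Hypothetical body in the spirit of EigenMetaKernel; the real kernel
    // may differ, e.g. by dispatching to packet evaluation.
    template <typename Evaluator>
    __global__ void MetaKernelSketch(Evaluator eval, unsigned int size) {
      const unsigned int first = blockIdx.x * blockDim.x + threadIdx.x;
      const unsigned int step = blockDim.x * gridDim.x;
      for (unsigned int i = first; i < size; i += step) {
        eval.evalScalar(i);  // one coefficient per grid-stride step
      }
    }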
template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, GpuDevice, Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable>
{
+ public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const GpuDevice& device)
{
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
- evaluator.evalSubExprsIfNeeded();
- const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
- const int block_size = maxCudaThreadsPerBlock();
-
- const Index size = evaluator.dimensions().TotalSize();
- EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
- eigen_assert(cudaGetLastError() == cudaSuccess);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+ const int block_size = maxCudaThreadsPerBlock();
+
+ const Index size = evaluator.dimensions().TotalSize();
+ EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+ assert(cudaGetLastError() == cudaSuccess);
+ }
evaluator.cleanup();
}
};
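The grid dimensions above come from Eigen's wrappers over CUDA device properties; the same numbers can be derived directly from the CUDA runtime API, as in this sketch:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, 0);  // device 0, illustrative
      const int block_size = prop.maxThreadsPerBlock;
      const int num_blocks = prop.multiProcessorCount *
                             prop.maxThreadsPerMultiProcessor / block_size;
      std::printf("launch config: <<<%d, %d>>>\n", num_blocks, block_size);
      return 0;
    }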