path: root/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  244
1 file changed, 244 insertions(+), 0 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 000000000..05ac9bd2f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,244 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+namespace Eigen {
+
+/** \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launching the evaluation of the expression on
+ * the specified computing device.
+ */
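+// A rough usage sketch (not part of this header; it assumes the Tensor class
+// and the device() API defined elsewhere in this module, and a hypothetical
+// GpuDevice instance `my_gpu`). User code never calls TensorExecutor
+// directly; an assignment such as
+//
+//   Eigen::Tensor<float, 2> a(30, 40), b(30, 40);
+//   a = b * 2.0f;                 // run on the DefaultDevice
+//   a.device(my_gpu) = b * 2.0f;  // run on a GpuDevice
+//
+// builds an expression tree whose assignment operator forwards to
+// TensorExecutor<Expression, Device>::run(expr, device).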
+namespace internal {
+
+template <typename Device, typename Expression>
+struct IsVectorizable {
+ static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
+};
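+// With the default device the trait reduces to the evaluator's PacketAccess
+// flag; since that flag is typically the AND of the flags of all the nodes in
+// the expression tree, a single packet-less node disables vectorization for
+// the whole expression. The GpuDevice specialization further below
+// additionally requires IsAligned.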
+
+// Default strategy: the expression is evaluated with a single cpu thread.
+template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = IsVectorizable<Device, Expression>::value>
+class TensorExecutor
+{
+ public:
+ typedef typename Expression::Index Index;
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Expression& expr, const Device& device = Device())
+ {
+ TensorEvaluator<Expression, Device> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+ for (Index i = 0; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+
+template<typename Expression>
+class TensorExecutor<Expression, DefaultDevice, true>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
+ {
+ TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+ static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+ const Index VectorizedSize = (size / PacketSize) * PacketSize;
+
+ for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ for (Index i = VectorizedSize; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
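+// Worked example of the split above, assuming size == 10 and PacketSize == 4:
+// VectorizedSize == (10 / 4) * 4 == 8, so evalPacket runs at i == 0 and
+// i == 4, and evalScalar handles the remainder at i == 8 and i == 9.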
+
+
+
+// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+#ifdef EIGEN_USE_THREADS
+template <typename Evaluator, typename Index, bool Vectorizable = Evaluator::PacketAccess>
+struct EvalRange {
+ static void run(Evaluator evaluator, const Index first, const Index last) {
+ eigen_assert(last > first);
+ for (Index i = first; i < last; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename Index>
+struct EvalRange<Evaluator, Index, true> {
+ static void run(Evaluator evaluator, const Index first, const Index last) {
+ eigen_assert(last > first);
+
+ Index i = first;
+ static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    if (last - first >= PacketSize) {
+ eigen_assert(first % PacketSize == 0);
+ Index lastPacket = last - (last % PacketSize);
+ for (; i < lastPacket; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ }
+
+ for (; i < last; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+};
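+// Worked example, assuming first == 8, last == 21 and PacketSize == 4:
+// last - first == 13 >= 4, lastPacket == 21 - (21 % 4) == 20, so evalPacket
+// runs at i == 8, 12 and 16, and evalScalar finishes index 20. Note the
+// assert: the caller must hand EvalRange a packet-aligned `first`, which the
+// thread pool executor below guarantees by rounding its block size up to a
+// multiple of PacketSize.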
+
+template<typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const ThreadPoolDevice& device)
+ {
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ Evaluator evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+
+ static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+
+      const Index blocksz = static_cast<Index>(std::ceil(static_cast<float>(size)/device.numThreads())) + PacketSize - 1;
+ const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index numblocks = size / blocksize;
+
+      std::vector<Future> results;
+      results.reserve(numblocks);
+      for (Index i = 0; i < numblocks; ++i) {
+ results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize));
+ }
+
+ if (numblocks * blocksize < size) {
+ EvalRange<Evaluator, Index>::run(evaluator, numblocks * blocksize, size);
+ }
+
+      for (Index i = 0; i < numblocks; ++i) {
+ get_when_ready(&results[i]);
+ }
+
+ }
+ evaluator.cleanup();
+ }
+};
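+// Worked example of the partitioning, assuming size == 1000, PacketSize == 8
+// and device.numThreads() == 4: blocksz == ceil(1000 / 4.0) + 7 == 257,
+// blocksize == max(8, 257 - 257 % 8) == 256, numblocks == 1000 / 256 == 3.
+// Three blocks of 256 elements are enqueued on the pool, the 232-element
+// tail [768, 1000) is evaluated inline on the calling thread, and only then
+// are the futures waited on.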
+#endif
+
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <typename Evaluator, typename Index>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) {
+
+ const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index step_size = blockDim.x * gridDim.x;
+
+ // Use the scalar path
+ for (Index i = first_index; i < size; i += step_size) {
+ eval.evalScalar(i);
+ }
+}
+
+template <typename Evaluator, typename Index>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel_Vectorizable(Evaluator eval, Index size) {
+
+ const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index step_size = blockDim.x * gridDim.x;
+
+ // Use the vector path
+ const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const Index vectorized_step_size = step_size * PacketSize;
+ const Index vectorized_size = (size / PacketSize) * PacketSize;
+ for (Index i = first_index * PacketSize; i < vectorized_size;
+ i += vectorized_step_size) {
+ eval.evalPacket(i);
+ }
+ for (Index i = vectorized_size + first_index; i < size; i += step_size) {
+ eval.evalScalar(i);
+ }
+}
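+// Worked example of the vectorized grid-stride loop, assuming a grid of
+// 2 blocks of 2 threads (step_size == 4), PacketSize == 4 and size == 35:
+// thread 0 starts at i == 0 and strides by 16, so it evaluates the packets
+// at 0 and 16, while thread 3 evaluates those at 12 and 28. With
+// vectorized_size == 32, the scalar loop then mops up indices 32, 33 and 34,
+// again one per thread.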
+
+template <typename Expression>
+struct IsVectorizable<GpuDevice, Expression> {
+ static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned;
+};
+
+template<typename Expression>
+class TensorExecutor<Expression, GpuDevice, false>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const GpuDevice& device)
+ {
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+ const int block_size = maxCudaThreadsPerBlock();
+ const Index size = array_prod(evaluator.dimensions());
+ LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+ }
+ evaluator.cleanup();
+ }
+};
+
+template<typename Expression>
+class TensorExecutor<Expression, GpuDevice, true>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const GpuDevice& device)
+ {
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+ const int block_size = maxCudaThreadsPerBlock();
+ const Index size = array_prod(evaluator.dimensions());
+ LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+ }
+ evaluator.cleanup();
+ }
+};
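+// Worked example of the launch configuration above, assuming a device with
+// 16 multiprocessors, 2048 resident threads per multiprocessor and a
+// 1024-thread block limit: num_blocks == 16 * 2048 / 1024 == 32 blocks of
+// 1024 threads, i.e. just enough blocks to fill every multiprocessor, with
+// the grid-stride loops in the kernels covering any remaining indices.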
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H