Rename Index to StorageIndex + use Eigen::Array and Eigen::Map when possible

author: Eugene Zhulenev <ezhulenev@google.com> 2018-07-27 12:45:17 -0700
committer: Eugene Zhulenev <ezhulenev@google.com> 2018-07-27 12:45:17 -0700
commit: 966c2a7bb62a8b5b9ecd349730ffcd3b5719837d (patch)
tree: 83e61bb77a5340f529c336afaa69cc78d654d599 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent: 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 (diff)
1 files changed, 94 insertions, 91 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 024de3696..ac5afd891 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -36,15 +36,16 @@ template <typename Expression, typename Device, bool Vectorizable,
           bool Tileable>
 class TensorExecutor {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
+
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
-      const Index size = array_prod(evaluator.dimensions());
-      for (Index i = 0; i < size; ++i) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -56,35 +57,36 @@ class TensorExecutor {
  * Process all the data with a single cpu thread, using vectorized instructions.
  */
 template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable*/ false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+                     /*Tileable*/ false> {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
-  {
+  static inline void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<
           Expression, DefaultDevice>::PacketReturnType>::size;
 
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      const StorageIndex UnrolledSize =
+          (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const Index VectorizedSize = (size / PacketSize) * PacketSize;
-      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
-      for (Index i = VectorizedSize; i < size; ++i) {
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -97,42 +99,41 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable
  * sizing a block to fit L1 cache we get better cache performance.
  */
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> {
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tileable*/ true> {
  public:
-  typedef typename Expression::Index Index;
+  using Scalar = typename traits<Expression>::Scalar;
+  using ScalarNoConst = typename remove_const<Scalar>::type;
+
+  using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+  using StorageIndex = typename traits<Expression>::Index;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
 
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const DefaultDevice& device = DefaultDevice()) {
-    using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
-
-    using Index = typename traits<Expression>::Index;
-    const int NumDims = traits<Expression>::NumDimensions;
-
-    using Scalar = typename traits<Expression>::Scalar;
-    using ScalarNoConst = typename remove_const<Scalar>::type;
-
     using TensorBlock =
-        TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
-    using TensorBlockMapper =
-        TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
+                                                NumDims, Evaluator::Layout>;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    Index total_size = array_prod(evaluator.dimensions());
+    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
 
     if (total_size < cache_size) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
       internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                               false>::run(expr, device);
+                               /*Tileable*/ false>::run(expr, device);
       return;
     }
 
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
       // Size tensor blocks to fit in cache (or requested target block size).
-      size_t block_total_size = numext::mini(cache_size, total_size);
+      Index block_total_size = numext::mini(cache_size, total_size);
       TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
       // Query expression tree for desired block size/shape.
       std::vector<TensorOpResourceRequirements> resources;
@@ -146,8 +147,8 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true>
       Scalar* data = static_cast<Scalar*>(
           device.allocate(block_total_size * sizeof(Scalar)));
 
-      const Index total_block_count = block_mapper.total_block_count();
-      for (Index i = 0; i < total_block_count; ++i) {
+      const StorageIndex total_block_count = block_mapper.total_block_count();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
         TensorBlock block = block_mapper.GetBlockForIndex(i, data);
         evaluator.evalBlock(&block);
       }
@@ -162,37 +163,38 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true>
  * executed on a single core.
  */
 #ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex first,
+                  const StorageIndex last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
-    for (Index i = first; i < last; ++i) {
+    for (StorageIndex i = first; i < last; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
-    return size;
-  }
+  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
 };
 
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
-  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+  static const int PacketSize =
+      unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex first,
+                  const StorageIndex last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
-    Index i = first;
+    StorageIndex i = first;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
-      Index last_chunk_offset = last - 4 * PacketSize;
+      StorageIndex last_chunk_offset = last - 4 * PacketSize;
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
@@ -206,7 +208,7 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
     }
   }
 
-  static Index alignBlockSize(Index size) {
+  static StorageIndex alignBlockSize(StorageIndex size) {
     // Align block size to packet size and account for unrolling in run above.
     if (size >= 16 * PacketSize) {
       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -219,24 +221,24 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
 template <typename Expression, bool Vectorizable, bool Tileable>
 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   static inline void run(const Expression& expr,
                          const ThreadPoolDevice& device) {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
 
     Evaluator evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
-      const Index PacketSize =
+      const StorageIndex PacketSize =
           Vectorizable
               ? unpacket_traits<typename Evaluator::PacketReturnType>::size
               : 1;
-      const Index size = array_prod(evaluator.dimensions());
+      const StorageIndex size = array_prod(evaluator.dimensions());
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                          EvalRange::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
+                         [&evaluator](StorageIndex first, StorageIndex last) {
                            EvalRange::run(&evaluator, first, last);
                          });
     }
@@ -247,24 +249,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
 template <typename Expression, bool Vectorizable>
 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
  public:
-  typedef typename Expression::Index Index;
+  using Scalar = typename traits<Expression>::Scalar;
+  using ScalarNoConst = typename remove_const<Scalar>::type;
 
-  static inline void run(const Expression& expr,
-                         const ThreadPoolDevice& device) {
-    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef typename internal::remove_const<
-        typename traits<Expression>::Scalar>::type Scalar;
-    typedef typename traits<Expression>::Index Index;
+  using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
+  using StorageIndex = typename traits<Expression>::Index;
 
-    static const int NumDims = traits<Expression>::NumDimensions;
+  static const int NumDims = traits<Expression>::NumDimensions;
 
-    typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
-    typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
-        TensorBlockMapper;
+  static inline void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
+    using TensorBlock =
+        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper =
+        TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    StorageIndex total_size = array_prod(evaluator.dimensions());
+    StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
     if (total_size < cache_size) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
@@ -276,7 +278,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
       TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
-      size_t block_total_size = 0;
+      Index block_total_size = 0;
       // Query expression tree for desired block size/shape.
       std::vector<internal::TensorOpResourceRequirements> resources;
       evaluator.getResourceRequirements(&resources);
@@ -296,15 +298,16 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
       void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
       device.parallelFor(
           block_mapper.total_block_count(), cost * block_size,
-          [=, &device, &evaluator, &block_mapper](Index first, Index last) {
+          [=, &device, &evaluator, &block_mapper](StorageIndex first,
+                                                  StorageIndex last) {
             // currentThreadId() returns -1 if called from a thread not in the
-            // threadpool, such as the main thread dispatching Eigen
+            // thread pool, such as the main thread dispatching Eigen
             // expressions.
             const int thread_idx = device.currentThreadId();
             eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
             Scalar* thread_buf = reinterpret_cast<Scalar*>(
                 static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
-            for (Index i = first; i < last; ++i) {
+            for (StorageIndex i = first; i < last; ++i) {
               auto block = block_mapper.GetBlockForIndex(i, thread_buf);
               evaluator.evalBlock(&block);
             }
@@ -324,51 +327,51 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
 template <typename Expression, bool Vectorizable, bool Tileable>
 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
   static void run(const Expression& expr, const GpuDevice& device);
 };
 
 
 #if defined(EIGEN_GPUCC)
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    for (Index i = first; i < last; i += step_size) {
+  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+    for (StorageIndex i = first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const Index vectorized_size = (last / PacketSize) * PacketSize;
-    const Index vectorized_step_size = step_size * PacketSize;
+  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
 
     // Use the vector path
-    for (Index i = first * PacketSize; i < vectorized_size;
+    for (StorageIndex i = first * PacketSize; i < vectorized_size;
          i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (Index i = vectorized_size + first; i < last; i += step_size) {
+    for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
 
-  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index step_size = blockDim.x * gridDim.x;
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
 
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
 }
 
 /*static*/
@@ -382,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
     const int block_size = device.maxGpuThreadsPerBlock();
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
-    const Index size = array_prod(evaluator.dimensions());
+    const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
 
     LAUNCH_GPU_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
         num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
author	Eugene Zhulenev <ezhulenev@google.com>	2018-07-27 12:45:17 -0700
committer	Eugene Zhulenev <ezhulenev@google.com>	2018-07-27 12:45:17 -0700
commit	966c2a7bb62a8b5b9ecd349730ffcd3b5719837d (patch)
tree	83e61bb77a5340f529c336afaa69cc78d654d599 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent	6913221c43c6ad41b1fbfc0d263d2764abd11ad2 (diff)